# Bigram
This notebook presents the solution to an assignment for the NLP course at UnB. It implements a bigram language model. For more details, click [here](https://github.com/thiagodepaulo/nlp/blob/main/aula_2/exercicio2.md) (in Portuguese).

## Install required libraries

In [6]:
!pip install tiktoken==0.8.0

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp39-cp39-win_amd64.whl (884 kB)
Collecting regex>=2022.1.18
  Downloading regex-2024.11.6-cp39-cp39-win_amd64.whl (274 kB)
Installing collected packages: regex, tiktoken
  Attempting uninstall: regex
    Found existing installation: regex 2021.8.3
    Uninstalling regex-2021.8.3:
      Successfully uninstalled regex-2021.8.3
Successfully installed regex-2024.11.6 tiktoken-0.8.0


In [8]:
!pip install torch==2.5.1

Collecting torch
  Downloading torch-2.5.1-cp39-cp39-win_amd64.whl (203.0 MB)
Collecting sympy==1.13.1
  Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
Collecting typing-extensions>=4.8.0
  Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions, sympy, torch
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.10.0.2
    Uninstalling typing-extensions-3.10.0.2:
      Successfully uninstalled typing-extensions-3.10.0.2
  Attempting uninstall: sympy
    Found existing installation: sympy 1.9
    Uninstalling sympy-1.9:
      Successfully uninstalled sympy-1.9
Successfully installed sympy-1.13.1 torch-2.5.1 typing-extensions-4.12.2


## Imports

In [26]:
import tiktoken
import json
import torch
import math

from typing import List
from typing import Set

from util.file_utils import get_file_names
from util.file_utils import train_test_split

from I03_bigram.bigram import encode
from I03_bigram.bigram import decode_single_token
from I03_bigram.bigram import compute_bigram_frequency
from I03_bigram.bigram import decode_bigrams
from I03_bigram.bigram import decode_bigram_freq

## Configurations

In [12]:
# Settings a reader may want to tune
corpus_folder = "corpus"            # directory the corpus files are read from
end_token = "<|endoftext|>"         # special token placed around every loaded text
tokenizer_name = "cl100k_base"      # name of the tiktoken encoding to load

# Shared objects used by the cells below
tokenizer = tiktoken.get_encoding(tokenizer_name)
bigrams_dict = dict()               # bigram dictionary, empty until the training section runs
vocabulary: Set[str] = None         # remains None until the vocabulary is extracted

## Corpus initialization

Read the file names from the corpus folder and split them into training and test sets.

In [13]:
# List the files of the corpus folder (sorted by name) and split them into a training set and a test set.
# NOTE(review): no seed is passed to train_test_split; if it shuffles randomly the split
# will differ between runs — confirm against util.file_utils.
file_names = sorted(get_file_names(corpus_folder))
print("Test function 'train_test_split':")
train_set, test_set = train_test_split(file_names, test_size=0.2)

# Preview the first names of each collection next to its total size.
n_samples = 5
for label, names in (("Files set", file_names), ("Train Set", train_set), ("Test Set", test_set)):
    print(f"{label} (samples): {names[:n_samples]}... ({n_samples} of {len(names)})")

Test function 'train_test_split':
Files set (samples): ['10000.json', '100008.json', '100013.json', '100022.json', '100042.json']... (5 of 10000)
Train Set (samples): ['47166.json', '8267.json', '23001.json', '18868.json', '121822.json']... (5 of 8000)
Test Set (samples): ['7519.json', '1311.json', '8340.json', '50686.json', '25274.json']... (5 of 2000)


### Training set (text load)
Read the content of the files from the corpus (training set) and organize them into a list of texts, adding a special token at the beginning and at the end of each text.

In [15]:
# Read every file of the training set and keep its 'text' attribute,
# wrapped in the special token on both sides.
texts = []
for filename in train_set:
    path = f"{corpus_folder}/{filename}"
    with open(path, "r", encoding='utf-8') as file:
        data = json.load(file)
    text = data.get("text", "")  # a file without a 'text' key contributes an empty text
    texts.append("".join((end_token, text, end_token)))

print("Total of text loaded:", len(texts))

Total of text loaded: 8000


## Training

### Vocabulary extraction
Initialize the vocabulary from the training set.

In [24]:
    # Tokenize every training text, collecting the vocabulary (set of token strings)
    # and the bigram frequencies along the way.
    texts_tokens = []
    vocabulary = set()  # starts empty (not None) so the cell also works when 'texts' is empty
    for txt in texts:
        cod_tokens = encode(txt)
        txt_tokens = decode_single_token(cod_tokens)
        # In-place update: avoids copying the whole vocabulary once per text,
        # which is what building a new set with union() did.
        vocabulary.update(txt_tokens)
        # NOTE(review): the result is reassigned on every iteration, so the final value is
        # whatever the last call returns; presumably the helper accumulates counts across
        # calls — confirm against I03_bigram.bigram.
        bigrams_dict = compute_bigram_frequency(cod_tokens)
        texts_tokens.append(txt_tokens)

    # Show a sample of the bigrams, first as token ids and then decoded
    print("Vocalubary size:", len(vocabulary))
    print('Bigrams:')
    print(list(bigrams_dict.keys())[:5], '...')
    decoded_bigrams_list = decode_bigrams(list(bigrams_dict.keys()))
    print(list(decoded_bigrams_list)[:5], '...')

Vocalubary size: 48016
Bigrams:
[(100257, 45767), (45767, 1776), (1776, 9769), (9769, 70), (70, 2194)] ...
[('<|endoftext|>', 'Um'), ('Um', ' sl'), (' sl', 'ô'), ('ô', 'g'), ('g', 'ane')] ...


In [42]:
# Preview the bigram frequencies in their stored order: as token ids, then decoded
print('Bigrams frenquecies:')
bigram_list = list(bigrams_dict.items())
print(bigram_list[:5], '...')
tkn_freq = list(decode_bigram_freq(bigrams_dict).items())
print(tkn_freq[:5], '...')

# Same preview with the most frequent bigrams first
print('Sorted bigrams frenquecies (descending):')
bigram_list = list(bigrams_dict.items())
bigram_list.sort(key=lambda entry: entry[1], reverse=True)
print(bigram_list[:5], '...')
# 'tkn_freq' ends up as a list of (decoded bigram, frequency) pairs, highest frequency first
tkn_freq = list(decode_bigram_freq(bigrams_dict).items())
tkn_freq.sort(key=lambda entry: entry[1], reverse=True)
print(tkn_freq[:5], '...', '\n')

Bigrams frenquecies:
[((100257, 45767), 118), ((45767, 1776), 2), ((1776, 9769), 2), ((9769, 70), 4), ((70, 2194), 2)] ...
[(('<|endoftext|>', 'Um'), 118), (('Um', ' sl'), 2), ((' sl', 'ô'), 2), (('ô', 'g'), 4), (('g', 'ane'), 2)] ...
Sorted bigrams frenquecies (descending):
[((409, 220), 136352), ((991, 220), 66582), ((220, 1049), 59526), ((13, 362), 57300), ((11, 297), 57276)] ...
[((' de', ' '), 136352), ((' em', ' '), 66582), ((' ', '200'), 59526), (('.', ' A'), 57300), ((',', ' o'), 57276)] ... 



Get the two tokens of the most frequent bigram.

In [47]:
# 'tkn_freq' is sorted by descending frequency, so its first entry is the most frequent
# bigram; each entry has the form ((first token, second token), frequency).
bigram_tk_A, bigram_tk_B = tkn_freq[0][0]
print(f"The most frequently token (A): '{bigram_tk_A}'")
# The label used to read "(A)" for the second token as well; it now says "(B)".
print(f"The second most frequently token (B): '{bigram_tk_B}'")

The most frequently token (A): ' de'
The second most frequently token (A): ' '


Sort the vocabulary and move the special token to the beginning of the vocabulary.

In [36]:
# Sorted vocabulary; the special token is then relocated to index 0.
sort_voc = sorted(vocabulary)
print(f"Is '{end_token}' into the Vocabulary? {end_token in sort_voc} \n  ({sort_voc[:5]} ...)")
# Pull the special token out of its sorted position and re-insert it at the front
# (raises ValueError when the token is not in the vocabulary).
sort_voc.insert(0, sort_voc.pop(sort_voc.index(end_token)))
print(f"Is '{end_token}' into the Vocabulary? {end_token in sort_voc} \n  ({sort_voc[:5]} ...)")

Is '<|endoftext|>' into the Vocabulary? True 
  ([' ', ' !', ' !!', ' !=', ' "'] ...)
Is '<|endoftext|>' into the Vocabulary? True 
  (['<|endoftext|>', ' ', ' !', ' !!', ' !='] ...)


### Token mappings
Create dictionaries to map each token to an integer (<code>stoi</code>) and each integer back to its token (<code>itos</code>). They must have the same size as the vocabulary.

In [40]:
# Two-way lookup between tokens and consecutive integers, numbered in the order of 'sort_voc'.
# For simplicity the 'end_token' is expected to be the first element of both dictionaries.
stoi = dict(zip(sort_voc, range(len(sort_voc))))  # token (string) -> integer
itos = {index: token for token, index in stoi.items()}  # integer -> token (string)

# Preview the first entries of each dictionary
for label, mapping in (("stoi", stoi), ("itos", itos)):
    print(f"Dicionary: '{label}'")
    print("  ", list(mapping.items())[:7], '...')

# All three sizes are expected to match
print(f"\nVocabulary size: {len(sort_voc)}")
print(f"stoi: {len(stoi)}")
print(f"itos: {len(itos)}", "\n")

Dicionary: 'stoi'
   [('<|endoftext|>', 0), (' ', 1), (' !', 2), (' !!', 3), (' !=', 4), (' "', 5), (' ""', 6)] ...
Dicionary: 'itos'
   [(0, '<|endoftext|>'), (1, ' '), (2, ' !'), (3, ' !!'), (4, ' !='), (5, ' "'), (6, ' ""')] ...

Vocabulary size: 48016
stoi: 48016
itos: 48016 



### Frequency table

In [41]:
# Create the table of frequencies for bigrams: N[r, c] counts how many times the token
# with index c immediately follows the token with index r in the training texts.
# NOTE: the table is dense, so it needs len(stoi)**2 * 4 bytes (int32); with a vocabulary
# of 48,016 tokens that is about 9.2 GB.
print("Frequency table:")
total_tokens = len(stoi)
N = torch.zeros((total_tokens, total_tokens), dtype=torch.int32)
for text_tkn in texts_tokens:
    if len(text_tkn) < 2:
        continue  # a text with fewer than two tokens contains no bigram
    # Map the tokens to their indices once, then count every (row, col) pair of the text
    # in a single vectorised call instead of one tensor update per bigram.
    token_ids = torch.tensor([stoi[tk] for tk in text_tkn], dtype=torch.long)
    rows, cols = token_ids[:-1], token_ids[1:]
    increments = torch.ones(rows.shape[0], dtype=N.dtype)
    # accumulate=True adds up repeated (row, col) pairs instead of overwriting them
    N.index_put_((rows, cols), increments, accumulate=True)

print(N[0:15,0:15], "...")
print(f"'{bigram_tk_A}' = ", stoi[bigram_tk_A])
print(f"'{bigram_tk_B}' = ", stoi[bigram_tk_B])
print(f"N[{stoi[bigram_tk_A]}, {stoi[bigram_tk_B]}] =", N[stoi[bigram_tk_A], stoi[bigram_tk_B]].item(), "\n")

Frequency table:
tensor([[ 3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
   

Compute the probability table of bigrams.

In [48]:
# Compute the table of probabilities with add-one smoothing: every count is incremented
# by 1 and each row is divided by its own sum, so every row adds up to 1.
#
# The rows are processed in chunks and written into a preallocated float32 table. Doing
# it in one expression, (N + 1).float(), needs two extra full-size temporaries on top of
# N and ran out of memory. The dense result itself still takes N.shape[0] * N.shape[1] * 4
# bytes, which is unavoidable for a dense table.
rows_per_chunk = 1024
table_probabilities = torch.empty(N.shape, dtype=torch.float32)
for start in range(0, N.shape[0], rows_per_chunk):
    stop = start + rows_per_chunk
    smoothed = (N[start:stop] + 1).float()
    smoothed /= smoothed.sum(1, keepdim=True)
    table_probabilities[start:stop] = smoothed

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 9222145024 bytes.