## Build language model to predict the next word for autocomplete 

### Implement count matrix

In [1]:
#Calculate the ngram count dictionary
n_gram_counts = {
    ('i', 'am', 'happy'): 2,
    ('am', 'happy', 'because'): 1
}

#Get the count of n-gram tuple
print(f"Count of n-gram {('i', 'am', 'happy')}: {n_gram_counts[('i', 'am', 'happy')]}")
      
#Check if n-gram is present in dictionary
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

# update the count in the word count dictionary
n_gram_counts[('i', 'am', 'learning')] = 1
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

Count of n-gram ('i', 'am', 'happy'): 2
n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [2]:
#Create n_gram with concatenating a tuple for prefix and last word
prefix = ('i', 'am', 'happy')
word = 'because'

n_gram = prefix + (word,)
print(n_gram)

('i', 'am', 'happy', 'because')


In [3]:
#Define a function to create trigram counts
import numpy as np
import pandas as pd
from collections import defaultdict

def calculate_trigram_count_matrix(corpus):
    
    #Initialize empty matrix for storing bigrams and vocab
    bigrams = []
    vocabulary = []
    #Initialize empty dictionary to store trigram count
    count_matrix_d = defaultdict(dict)
    
    #Iterate through corpus for single pass
    for i in range(len(corpus) - 3 + 1):
        
        #Get the trigram and bigram
        trigram = tuple(corpus[i:i + 3])
        bigram = trigram[:-1]
        
        #Check for bigram in list
        if bigram not in bigrams:
            bigrams.append(bigram)
        
        #Get the last word of trigram
        last_word = trigram[-1]
        #Check of last word in vocabulary
        if last_word not in vocabulary:
            vocabulary.append(last_word)
        
        #Add the bigram and lastword in count dictionary
        if (bigram, last_word) not in count_matrix_d:
            count_matrix_d[bigram, last_word] = 1
        else:
            count_matrix_d[bigram, last_word] += 1
        
    #Convert the dictionary to numpy array
    count_matrix = np.zeros((len(bigrams), len(vocabulary)))

    #Fill the matrix from dictionary
    for trigram_key, trigram_count in count_matrix_d.items():
        count_matrix[bigrams.index(trigram_key[0]), vocabulary.index(trigram_key[1])] = trigram_count

    #Convert the matrix to dataframe
    count_matrix = pd.DataFrame(count_matrix, index = bigrams, columns = vocabulary)

    return bigrams, vocabulary, count_matrix

In [4]:
#Create a sample corpus
corpus = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']

#Get the bigrams, vocabulary and count_matrix
bigrams, vocabulary, count_matrix = calculate_trigram_count_matrix(corpus)

print(count_matrix)

                  happy  because    i   am  learning    .
(i, am)             1.0      0.0  0.0  0.0       1.0  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


In [5]:
#Convert the count matrix to probability matrix

#Get the sum of rows of Dataframe
row_sums = count_matrix.sum(axis = 1)

#Get the probability matrix
prob_matrix = count_matrix / row_sums[:, None]

print(prob_matrix)

                  happy  because    i   am  learning    .
(i, am)             0.5      0.0  0.0  0.0       0.5  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


In [6]:
#Find the probability of trigram

trigram = ('i', 'am', 'happy')

#Get the bigram and word
bigram, word = trigram[:-1], trigram[-1]

#Get the probability of trigram
trigram_probability = prob_matrix[word][bigram]

print(f'"Probability of {word} following {bigram} is {trigram_probability}"')

"Probability of happy following ('i', 'am') is 0.5"


In [7]:
#Check if word starts with given word
# lists all words in vocabulary starting with a given prefix
vocabulary = ['i', 'am', 'happy', 'because', 'learning', '.', 'have', 'you', 'seen','it', '?']
starts_with = 'ha'

print(f'words in vocabulary starting with prefix: {starts_with}\n')
for word in vocabulary:
    if word.startswith(starts_with):
        print(word)

words in vocabulary starting with prefix: ha

happy
have


### Language Model Evaluation

In [8]:
#Divide the data into training, validation and test data
import random

def training_validation_test_split(data, train_percentage, val_percentage):
    
    #Fix randomness
    random.seed(42)
    #Shuffle data
    random.shuffle(data)
    
    #Get the training data
    train_size = int(len(data) * train_percentage / 100)
    train_data = data[:train_size]
    
    #Get the validation data
    val_size = int(len(data) * val_percentage / 100)
    val_data = data[train_size: train_size + val_size]
    
    #Get the test data
    test_data = data[train_size + val_size:]
    
    return train_data, val_data, test_data

In [9]:
#Test the function
data = [x for x in range (0, 100)]

train_data, validation_data, test_data = training_validation_test_split(data, 80, 10)
print("split 80/10/10:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n", 
      f"test data:{test_data}\n")

train_data, validation_data, test_data = training_validation_test_split(data, 98, 1)
print("split 98/1/1:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n", 
      f"test data:{test_data}\n")

split 80/10/10:
 train data:[42, 41, 91, 9, 65, 50, 1, 70, 15, 78, 73, 10, 55, 56, 72, 45, 48, 92, 76, 37, 30, 21, 32, 96, 80, 49, 83, 26, 87, 33, 8, 47, 59, 63, 74, 44, 98, 52, 85, 12, 36, 23, 39, 40, 18, 66, 61, 60, 7, 34, 99, 46, 2, 51, 16, 38, 58, 68, 22, 62, 24, 5, 6, 67, 82, 19, 79, 43, 90, 20, 0, 95, 57, 93, 53, 89, 25, 71, 84, 77]
 validation data:[64, 29, 27, 88, 97, 4, 54, 75, 11, 69]
 test data:[86, 13, 17, 28, 31, 35, 94, 3, 14, 81]

split 98/1/1:
 train data:[39, 23, 13, 78, 19, 99, 41, 0, 45, 84, 93, 73, 38, 58, 57, 66, 7, 17, 25, 52, 8, 21, 59, 94, 64, 34, 88, 83, 75, 63, 15, 60, 62, 67, 53, 18, 14, 2, 4, 55, 98, 96, 12, 36, 76, 79, 5, 24, 70, 74, 81, 61, 91, 46, 48, 85, 22, 90, 32, 6, 80, 50, 1, 43, 27, 37, 77, 40, 86, 30, 42, 35, 68, 28, 51, 69, 49, 95, 97, 71, 82, 33, 26, 11, 3, 65, 16, 89, 10, 20, 54, 56, 92, 87, 47, 44, 31, 9]
 validation data:[72]
 test data:[29]



In [10]:
#Find the perplexity
p = 10 ** (-250)
M = 100
perplexity = p ** (-1 / M)
print(perplexity)

316.22776601683796
