#### We will build autocomplete system in this notebook. We will use n-gram as language model to predict the next word in the sentence. We will use twitter data as corpus.

In [1]:
#Import the libraries
import math
import random
import pandas as pd
import numpy as np
import nltk
nltk.data.path.append('.')

## Load and process data

In [2]:
#Load the data
with open('en_US.twitter.txt', 'r', encoding = 'utf8') as f:
    data = f.read()
    
print("Number of letters:", len(data))
print("First 100 letters of the data")
print("-------")
display(data[0:100])

Number of letters: 3335477
First 100 letters of the data
-------


'How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way '

#### Preprocess the data

In [3]:
# Split the data into sentence (Split by \n)
def split_data_to_sentences(data):
    
    sentences = data.split('\n')
    
    #Remove the extra spaces to remove empty line
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if s]
    return sentences

#Test the function
temp_data ='I like NLP.\n I like Machine Learning.\n I like Data Science.\n '
print(temp_data)

print(split_data_to_sentences(temp_data))

I like NLP.
 I like Machine Learning.
 I like Data Science.
 
['I like NLP.', 'I like Machine Learning.', 'I like Data Science.']


In [4]:
#Split the sentence into tokens (Use nltk's word tokenizer)
def split_sentences_to_tokens(sentences):
    
    tokenized_sentences = []
    
    for sentence in sentences:
        
        #Convert to lowercase
        sentence = sentence.lower()
        
        #Tokenize the sentence
        sentence = nltk.word_tokenize(sentence)
        
        tokenized_sentences.append(sentence)
    
    return tokenized_sentences

#Test the function
temp_sentences = ['I like NLP.', 'I like Machine Learning.', 'I like Data Science.']
print(temp_sentences)

print(split_sentences_to_tokens(temp_sentences))

['I like NLP.', 'I like Machine Learning.', 'I like Data Science.']
[['i', 'like', 'nlp', '.'], ['i', 'like', 'machine', 'learning', '.'], ['i', 'like', 'data', 'science', '.']]


In [5]:
#Define a function that creates tokens directly from data
def tokenize_data(data):
    
    #Get the sentences from data
    sentences = split_data_to_sentences(data)
    
    #Get the tokenized sentences from sentences
    tokenized_sentences = split_sentences_to_tokens(sentences)
    
    return tokenized_sentences

#Test the function
temp_data = 'I like NLP.\n I like Machine Learning.\n I like Data Science.\n '
print(tokenize_data(temp_data))

[['i', 'like', 'nlp', '.'], ['i', 'like', 'machine', 'learning', '.'], ['i', 'like', 'data', 'science', '.']]


In [6]:
#Tokenize our data
tokenized_data = tokenize_data(data)
print(tokenized_data[0])

['how', 'are', 'you', '?', 'btw', 'thanks', 'for', 'the', 'rt', '.', 'you', 'gon', 'na', 'be', 'in', 'dc', 'anytime', 'soon', '?', 'love', 'to', 'see', 'you', '.', 'been', 'way', ',', 'way', 'too', 'long', '.']


#### Split the data into train and test

In [7]:
#Split data in 80-20 ratio

def train_test_split(tokenized_sentences, percentage):
    #tokenized_data = tokenize_data(data)
    random.seed(42)
    random.shuffle(tokenized_sentences)

    train_size = int(len(tokenized_sentences) * percentage / 100)
    train_data = tokenized_sentences[:train_size]
    test_data = tokenized_sentences[train_size:]
    
    return train_data, test_data

temp_data = 'I like NLP.\n I like Machine Learning.\n I like Data Science.\n '
temp_tokenized_sentences = tokenize_data(temp_data)
temp_train_data, temp_test_data = train_test_split(temp_tokenized_sentences, 67)
print("{} data are split into {} train and {} test set".format(
    len(temp_tokenized_sentences), len(temp_train_data), len(temp_test_data)))

print("First training sample:")
print(temp_train_data[0])
      
print("First test sample")
print(temp_test_data[0])

3 data are split into 2 train and 1 test set
First training sample:
['i', 'like', 'machine', 'learning', '.']
First test sample
['i', 'like', 'data', 'science', '.']


In [8]:
#Split our data into train and test
train_data, test_data = train_test_split(tokenized_data, 80)
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

print("First training sample:")
print(train_data[0])
      
print("First test sample")
print(test_data[0])

47961 data are split into 38368 train and 9593 test set
First training sample:
['what', 'little', 'i', 'have', 'goes', 'toward', 'my', 'commute', '.', '$', '4', 'gasoline', '.']
First test sample
['at', 'least', 'it', 'was', 'ervin', 'santana', "'s", 'own', 'wild', 'pitch', 'that', 'kept', 'him', 'from', 'throwing', 'a', 'scorelss', 'no-hitter', '.', 'pretty', 'unique', 'i', 'must', 'say', '.']


In [9]:
#Rather than using all the words, we will use only words with a threshold frequency to make efficient computation

#Define the function to word counts
def word_count(tokenized_sentences):
    
    word_counts = {}
    
    for sentence in tokenized_sentences:
        
        for word in sentence:
            if word in word_counts:
                word_counts[word] += 1
            else:
                word_counts[word] = 1
    return word_counts

#Test the code
# test your code
temp_tokenized_sentences = [['i', 'like', 'nlp', '.'], ['i', 'like', 'machine', 'learning', '.'], ['i', 'like', 'data', 'science', '.']]
print(word_count(temp_tokenized_sentences))

{'i': 3, 'like': 3, 'nlp': 1, '.': 3, 'machine': 1, 'learning': 1, 'data': 1, 'science': 1}


### Create a vocabulary 

In [10]:
#Define a function to create a vocabulary with words having frequency greater than threshold
def create_vocab(tokenized_sentences, threshold):
    
    #Create a list to store the vocab
    vocab = []
    
    #Get the word counts of all the tokens
    word_counts = word_count(tokenized_sentences)
    
    #Iterate over the dictionary and check the occurence of all words
    for word, count in word_counts.items():
        
        if count >= threshold:
            vocab.append(word)
    return vocab

#Test the function
temp_tokenized_sentences = [['i', 'like', 'nlp', '.'], ['i', 'love', 'machine', 'learning', '.'], ['i', 'like', 'data', 'science', '.']]
print(create_vocab(temp_tokenized_sentences, 2))
print(create_vocab(temp_tokenized_sentences, 3))

['i', 'like', '.']
['i', '.']


#### Handling out of vocabulory words (frequency than threshold)

In [11]:
# Replace the words with frequency less than threshold with unknown tag

def replace_oov_word(tokenized_sentences, vocab, unknown_tag = '<unk>'):
    
    #Get the set of vocabulary
    vocab = set(vocab)
    
    #Initiate a list to store final sentences
    replaced_tokenized_sentences = []
    
    for sentence in tokenized_sentences:
        
        replaced_sentence = []
        
        for word in sentence:
            
            if word in vocab:
                replaced_sentence.append(word)
            else:
                replaced_sentence.append(unknown_tag)
        
        replaced_tokenized_sentences.append(replaced_sentence)
    
    return replaced_tokenized_sentences

#Test the code
temp_tokenized_sentences = [['i', 'like', 'nlp', '.'], ['i', 'love', 'machine', 'learning', '.'], ['i', 'like', 'data', 'science', '.']]
temp_vocab = create_vocab(temp_tokenized_sentences, 2)

print(replace_oov_word(temp_tokenized_sentences, temp_vocab, '<unk>'))

[['i', 'like', '<unk>', '.'], ['i', '<unk>', '<unk>', '<unk>', '.'], ['i', 'like', '<unk>', '<unk>', '.']]


### Create the preprocessed train and test data

In [12]:
#Define the function to preprocess the train and test data
def preprocess_data(train_data, test_data, threshold):
    
    #Get the vocab
    vocab = create_vocab(train_data, threshold)
    
    #Create the replaced train and test data
    train_data_replaced = replace_oov_word(train_data, vocab, '<unk>')
    test_data_replaced = replace_oov_word(test_data, vocab, '<unk>')
    
    return train_data_replaced, test_data_replaced, vocab

#Test the function
temp_tokenized_sentences = [['i', 'like', 'nlp', '.'], ['i', 'love', 'machine', 'learning', '.'], ['i', 'like', 'data', 'science', '.']]
temp_train_data, temp_test_data = train_test_split(temp_tokenized_sentences, 67)
temp_train_data_replaced, temp_test_data_replaced, temp_vocab = preprocess_data(temp_train_data, temp_test_data, 2)
print("Preprocessed train data: ", temp_train_data_replaced)
print("Preprocessed test data: ", temp_test_data_replaced)

Preprocessed train data:  [['i', '<unk>', '<unk>', '<unk>', '.'], ['i', '<unk>', '<unk>', '.']]
Preprocessed test data:  [['i', '<unk>', '<unk>', '<unk>', '.']]


In [13]:
#Preprocess our data
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, test_data, threshold = 2)
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['what', 'little', 'i', 'have', 'goes', 'toward', 'my', 'commute', '.', '$', '4', 'gasoline', '.']

First preprocessed test sample:
['at', 'least', 'it', 'was', '<unk>', 'santana', "'s", 'own', 'wild', 'pitch', 'that', 'kept', 'him', 'from', 'throwing', 'a', '<unk>', 'no-hitter', '.', 'pretty', 'unique', 'i', 'must', 'say', '.']

First 10 vocabulary:
['what', 'little', 'i', 'have', 'goes', 'toward', 'my', 'commute', '.', '$']

Size of vocabulary: 14859


## Develop n-gram based language model

In [14]:
#Define a function that computes the n-grams count
def count_n_grams(data, n, start_token = '<s>', end_token = '<e>'):
    
    #Initialize dictionary to store the counts
    n_grams = {}
    
    for sentence in data:
        
        #Append start and end tokens
        sentence = [start_token] * n + sentence + [end_token]
        
        #Convert to tuple to store as key
        sentence = tuple(sentence)
        
        if n == 1:
            l = len(sentence)
        else:
            l = len(sentence) - 1
            
        for i in range(l):
            
            #Get n-gram
            n_gram = sentence[i: i + n]
            
            #Increase the count in dictionary
            if n_gram in n_grams:
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
            
    return n_grams

#Test the function
temp_sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]

print("Uni-gram:")
print(count_n_grams(temp_sentences, 1))
print("Bi-gram:")
print(count_n_grams(temp_sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


In [15]:
#Estimate the probability of a n-gram word
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocab_size, k = 1.0):
    
    #Convert the previous n_gram list to tuple
    previous_n_gram = tuple(previous_n_gram)
    
    #Get the previous n_gram count
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0
    
    #Set the denominator
    denominator = previous_n_gram_count + k * vocab_size
    
    #Create the n_plus_1 word 
    n_plus1_gram = previous_n_gram + (word,)
    
    #Check its count in other dictionary (+1 gram)
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0
    
    numerator = n_plus1_gram_count + k
    
    #Calculate the probability
    probability = numerator / denominator
    
    return probability

#Test the function
# test your code
temp_sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))

temp_unigram_counts = count_n_grams(temp_sentences, 1)
temp_bigram_counts = count_n_grams(temp_sentences, 2)
temp_tmp_prob = estimate_probability("cat", "a", temp_unigram_counts, temp_bigram_counts, len(temp_unique_words), k=1)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {temp_tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


In [16]:
#Define a function to estimate the probability of all words
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k = 1.0):
    
    #Create the list to tuple
    previous_n_gram = tuple(previous_n_gram)
    
    #Add unknown and end tokens to vocabulary. <s> is not needed as it does not appear as next word
    vocabulary += ['<e>' , '<unk>']
    vocab_size = len(vocabulary)
    
    probabilities = {}
    
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocab_size, k)
        probabilities[word] = probability
    
    return probabilities

#Test the code as following word 'a'
temp_sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))
temp_unigram_counts = count_n_grams(temp_sentences, 1)
temp_bigram_counts = count_n_grams(temp_sentences, 2)
estimate_probabilities("a", temp_unigram_counts, temp_bigram_counts, temp_unique_words, k=1)

{'a': 0.09090909090909091,
 'like': 0.09090909090909091,
 'dog': 0.09090909090909091,
 'this': 0.09090909090909091,
 'is': 0.09090909090909091,
 'cat': 0.2727272727272727,
 'i': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [17]:
#Test on bigram and trigram as starting word
temp_trigram_counts = count_n_grams(temp_sentences, 3)
estimate_probabilities(["<s>", "<s>"], temp_bigram_counts, temp_trigram_counts, temp_unique_words, k=1)

{'a': 0.07692307692307693,
 'like': 0.07692307692307693,
 'dog': 0.07692307692307693,
 'this': 0.15384615384615385,
 'is': 0.07692307692307693,
 'cat': 0.07692307692307693,
 'i': 0.15384615384615385,
 '<e>': 0.07692307692307693,
 '<unk>': 0.07692307692307693}

## Create count and probability matrix

In [18]:
#Define a function that creates a count matrix
def create_count_matrix(n_plus1_gram_counts, vocabulary):
    
    #Add end and unknown tokens
    vocabulary += ['<e>', '<unk>']
    
    #Get the n-grams
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    
    #Mapping n-gram to row
    row_index = {n_gram: i for i, n_gram in enumerate(n_grams)}
    #Mapping next word to column
    col_index = {word: j for j, word in enumerate(vocabulary)}
    
    nrow = len(n_grams)
    ncol = len(vocabulary)
    
    count_matrix = np.zeros((nrow, ncol))
    
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[:-1]
        word = n_plus1_gram[-1]
        
        if word not in vocabulary:
            continue
        
        i = row_index[n_gram]
        j = col_index[word]
        
        count_matrix[i][j] = count
    
    #Convert matrix to DataFrame
    count_matrix = pd.DataFrame(count_matrix, index = n_grams, columns = vocabulary)
    return count_matrix

#Test the function
temp_sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))
temp_bigram_counts = count_n_grams(temp_sentences, 2)

print('bigram counts')
display(create_count_matrix(temp_bigram_counts, temp_unique_words))

bigram counts


Unnamed: 0,a,like,dog,this,is,cat,i,<e>,<unk>
"(this,)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>,)",0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
"(a,)",0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
"(i,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(like,)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(is,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [19]:
#Check for trigram counts
print("Trigram counts")
temp_sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))
temp_trigram_counts = count_n_grams(temp_sentences, 3)
display(create_count_matrix(temp_trigram_counts, temp_unique_words))

Trigram counts


Unnamed: 0,a,like,dog,this,is,cat,i,<e>,<unk>
"(<s>, this)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(is, like)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog, is)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i, like)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, <s>)",0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
"(<s>, i)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(like, a)",0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
"(a, cat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(this, dog)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [20]:
#Define a function to convert the count matrix to the probability matrix
def create_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    
    #Get the count matrix
    count_matrix = create_count_matrix(n_plus1_gram_counts, vocabulary)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

#Test the function
temp_sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))
temp_bigram_counts = count_n_grams(temp_sentences, 2)
print("bigram probabilities")
display(create_probability_matrix(temp_bigram_counts, temp_unique_words, k=1))

bigram probabilities


Unnamed: 0,a,like,dog,this,is,cat,i,<e>,<unk>
"(this,)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>,)",0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.181818,0.090909,0.090909
"(a,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909
"(i,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(like,)",0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(dog,)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(is,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909


In [21]:
#Check for trigram probabilities
temp_sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))
temp_trigram_counts = count_n_grams(temp_sentences, 3)
print("Trigram probabilities")
display(create_probability_matrix(temp_trigram_counts, temp_unique_words, k=1))

Trigram probabilities


Unnamed: 0,a,like,dog,this,is,cat,i,<e>,<unk>
"(<s>, this)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1
"(is, like)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(dog, is)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(i, like)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, <s>)",0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.181818,0.090909,0.090909
"(<s>, i)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(like, a)",0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909
"(a, cat)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(this, dog)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909


### Evaluation using Perplexity

In [22]:
#Define a function to calculate perplexity
def calculatePerplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k = 1.0):
    
    #Get the length of n_gram_counts
    n = len(list(n_gram_counts.keys())[0])
    
    #Append start and end tags
    sentence = ['<s>'] * n + sentence + ['</e>']
    
    #Convert the sentence to tuple
    sentence = tuple(sentence)
    
    #Get the length of sentence
    N = len(sentence)
    
    product_pi = 1.0
    
    #Iterate from n to N
    for i in range(n, N):
        n_gram = sentence[i-n: i]
        
        word = sentence[i]
        
        #Get the probability
        prob = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k = 1)
        
        product_pi *= 1 / prob
    
    perplexity = product_pi ** (1 / float(N))
    
    return perplexity

#Test the function
temp_sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))

temp_unigram_counts = count_n_grams(temp_sentences, 1)
temp_bigram_counts = count_n_grams(temp_sentences, 2)


temp_perplexity_train1 = calculatePerplexity(temp_sentences[0],
                                         temp_unigram_counts, temp_bigram_counts,
                                         len(temp_unique_words), k=1.0)
print(f"Perplexity for first train sample: {temp_perplexity_train1:.4f}")

temp_test_sentence = ['i', 'like', 'a', 'dog']
temp_perplexity_test = calculatePerplexity(temp_test_sentence,
                                       temp_unigram_counts, temp_bigram_counts,
                                       len(temp_unique_words), k=1.0)
print(f"Perplexity for test sample: {temp_perplexity_test:.4f}")

Perplexity for first train sample: 3.3674
Perplexity for test sample: 3.9654


## Build an autocomplete system

In [23]:
#Define a function that suggests the next word based on given text
def suggest_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k = 1.0, starts_with = None):
    
    #Get the length of previous words
    n = len((list(n_gram_counts.keys()))[0])
    
    #Get the last n words
    previous_n_gram = previous_tokens[-n:]
    
    #Get the probabilities for each word given previous words
    probabilities = estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k = k)
    
    #Initialize the suggestion word
    suggestion = None
    
    #Initialize the probability to zero
    max_prob = 0
    
    #Iterate over all word in probabilities and get max prob
    for word, prob in probabilities.items():
        
        #Check if start with is given
        if starts_with:
            
            #If the word does not begin with start with, continue
            if not word.startswith(starts_with):
                continue
        
        #Update suggestion if prob > max_prob
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    
    return suggestion, max_prob

#Test the function
# test your code
temp_sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
temp_unique_words = list(set(temp_sentences[0] + temp_sentences[1]))

temp_unigram_counts = count_n_grams(temp_sentences, 1)
temp_bigram_counts = count_n_grams(temp_sentences, 2)

temp_previous_tokens = ["i", "like"]
temp_tmp_suggest1 = suggest_word(temp_previous_tokens, temp_unigram_counts, temp_bigram_counts, temp_unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{temp_tmp_suggest1[0]}` with a probability of {temp_tmp_suggest1[1]:.4f}")

print()
# test your code when setting the starts_with
temp_tmp_starts_with = 'c'
temp_tmp_suggest2 = suggest_word(temp_previous_tokens, temp_unigram_counts, temp_bigram_counts, temp_unique_words, k=1.0, starts_with=temp_tmp_starts_with)
print(f"The previous words are 'i like', the suggestion must start with `{temp_tmp_starts_with}`\n\tand the suggested word is `{temp_tmp_suggest2[0]}` with a probability of {temp_tmp_suggest2[1]:.4f}")

The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2727

The previous words are 'i like', the suggestion must start with `c`
	and the suggested word is `cat` with a probability of 0.0769


In [24]:
#Define a function to get multiple suggestions
def get_multiple_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k = 1.0, starts_with = None):
    
    model_counts = len(n_gram_counts_list)
    
    suggestions = []
    
    for i in range(model_counts - 1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i + 1]
        
        suggestion = suggest_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k = k, starts_with = starts_with)
        
        suggestions.append(suggestion)
    
    return suggestions

#Test the function
# test your code
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
trigram_counts = count_n_grams(sentences, 3)
quadgram_counts = count_n_grams(sentences, 4)
qintgram_counts = count_n_grams(sentences, 5)

n_gram_counts_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts, qintgram_counts]
previous_tokens = ["i", "like"]
tmp_suggest3 = get_multiple_suggestions(previous_tokens, n_gram_counts_list, unique_words, k=1.0)

print(f"The previous words are 'i like', the suggestions are:")
display(tmp_suggest3)

The previous words are 'i like', the suggestions are:


[('a', 0.2727272727272727),
 ('a', 0.16666666666666666),
 ('a', 0.07692307692307693),
 ('a', 0.06666666666666667)]

In [25]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


## Get suggestions for our data

In [26]:
previous_tokens = ["i", "am", "to"]
tmp_suggest4 = get_multiple_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['i', 'am', 'to'], the suggestions are:


[('be', 0.02690861294680346),
 ('have', 0.00013449899125756557),
 ('have', 0.00013452613170108295),
 ('what', 6.726306585054147e-05)]

In [27]:
previous_tokens = ["i", "want", "to", "go"]
tmp_suggest5 = get_multiple_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest5)

The previous words are ['i', 'want', 'to', 'go'], the suggestions are:


[('to', 0.014243065057843016),
 ('to', 0.0050784856879039705),
 ('to', 0.0009392191064001073),
 ('to', 0.00040295500335795837)]

In [28]:
previous_tokens = ["hey", "how", "are"]
tmp_suggest6 = get_multiple_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest6)

The previous words are ['hey', 'how', 'are'], the suggestions are:


[('you', 0.02189156832563708),
 ('you', 0.0035460992907801418),
 ('what', 6.719978496068813e-05),
 ('what', 6.719075455217362e-05)]

In [29]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest7 = get_multiple_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest7)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[("'re", 0.023579545454545454),
 ('?', 0.0026195153896529143),
 ('?', 0.00147245833612208),
 ('what', 6.715465717547512e-05)]

In [30]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest8 = get_multiple_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, starts_with="d")

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[('do', 0.009290814670561505),
 ('doing', 0.0017017934284592224),
 ('doing', 0.00046825874640444177),
 ('dl', 6.711859856366199e-05)]