This notebook builds an autocomplete model using n-grams. It has been adapted from an assignment in Coursera's NLP with Probabilistic Models course (Week 3).

In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
nltk.data.path.append('.')
import random
from collections import defaultdict


In [2]:
# Load the Twitter corpus: a single text file with one tweet per line.
with open("en_US.twitter.txt", "r", encoding="utf-8") as f:
    data = f.read()

print("Data type:", type(data))
print("Number of letters:", len(data))

# Show the head and the tail of the raw text as a quick sanity check.
for heading, excerpt in (
    ("First 300 letters of the data", data[0:300]),
    ("Last 300 letters of the data", data[-300:]),
):
    print(heading)
    print("-------")
    display(excerpt)
    print("-------")

Data type: <class 'str'>
Number of letters: 3335477
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

-------
Last 300 letters of the data
-------


"ust had one a few weeks back....hopefully we will be back soon! wish you the best yo\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\n#GutsiestMovesYouCanMake Giving a cat a bath.\nCoffee after 5 was a TERRIBLE idea.\n"

-------


In [3]:
# data preprocessing
from nltk.tokenize import word_tokenize

#fns to split and tokenize  data into a list of lists of tokens
def split_to_sentences(data):
    """Split raw text into trimmed, non-empty lines.

    Args:
        data: A string in which sentences are separated by newline characters.

    Returns:
        A list with one whitespace-stripped string per non-blank line, in
        the order they appear in ``data``.
    """
    stripped_lines = (line.strip() for line in data.split('\n'))
    # An empty string is falsy, so this drops lines that were blank
    # (or whitespace-only) before stripping.
    return [line for line in stripped_lines if line]

def tokenize_sentences(sentences):
    """Lower-case each sentence and split it into tokens.

    Args:
        sentences: An iterable of sentence strings.

    Returns:
        A list of token lists, one per input sentence, produced by
        ``word_tokenize`` on the lower-cased text.
    """
    return [word_tokenize(sentence.lower()) for sentence in sentences]

def get_tokenized_data(data):
    """Turn raw newline-separated text into a list of token lists.

    Args:
        data: A string with one sentence per line.

    Returns:
        The result of ``tokenize_sentences`` applied to the non-empty lines
        returned by ``split_to_sentences``.
    """
    return tokenize_sentences(split_to_sentences(data))

# fn to count the frequency of each token
def count_words(tokenized_sentences):
    """Count how many times each token occurs across all sentences.

    Args:
        tokenized_sentences: An iterable of token sequences.

    Returns:
        A ``defaultdict(int)`` mapping each token to its number of
        occurrences; tokens are inserted in first-seen order.
    """
    frequencies = defaultdict(int)
    every_token = (token for sentence in tokenized_sentences for token in sentence)
    for token in every_token:
        frequencies[token] += 1
    return frequencies

#fn to filter words which  occur at least n times
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """Return the tokens that occur at least ``count_threshold`` times.

    Args:
        tokenized_sentences: An iterable of token sequences.
        count_threshold: Minimum number of occurrences for a token to be kept.

    Returns:
        A list of the qualifying tokens, in the order ``count_words``
        first encountered them.
    """
    frequencies = count_words(tokenized_sentences)
    return [word for word, count in frequencies.items() if count >= count_threshold]

# fn to replace infrequent words  with unknown token
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    """Replace every token that is not in the vocabulary with ``unknown_token``.

    Args:
        tokenized_sentences: An iterable of token sequences.
        vocabulary: An iterable of the tokens to keep (list, set, ...).
        unknown_token: The placeholder substituted for out-of-vocabulary tokens.

    Returns:
        A new list of token lists with the same shape as the input; the
        input sentences are not modified.
    """
    # Build the set once: membership tests against a list are linear in the
    # vocabulary size and would be repeated for every token in the corpus.
    known_words = set(vocabulary)
    return [
        [token if token in known_words else unknown_token for token in tokens]
        for tokens in tokenized_sentences
    ]

#fn to create the processed train data, test data and vocabulary using the above fns
def preprocess_data(train_data, test_data, count_threshold):
    """Build the vocabulary from the training set and apply it to both splits.

    Args:
        train_data: Tokenized training sentences (list of token lists).
        test_data: Tokenized test sentences (list of token lists).
        count_threshold: Minimum number of occurrences in ``train_data`` for
            a token to be kept in the vocabulary.

    Returns:
        A tuple ``(preprocessed_train_data, preprocessed_test_data, vocabulary)``
        where out-of-vocabulary tokens in both splits have been replaced by
        the default unknown token of ``replace_oov_words_by_unk``.
    """
    # The vocabulary is derived from the training data only, and the same
    # vocabulary is then used to mask the test data.
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)
    preprocessed_train_data = replace_oov_words_by_unk(train_data, vocabulary)
    preprocessed_test_data = replace_oov_words_by_unk(test_data, vocabulary)

    return preprocessed_train_data, preprocessed_test_data, vocabulary



In [4]:
# Tokenize the whole corpus, then shuffle it in place with a fixed seed so
# the split below is reproducible.
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

# First 80% of the shuffled sentences for training, the rest for testing.
train_size = int(len(tokenized_data) * 0.8)
train_data, test_data = tokenized_data[:train_size], tokenized_data[train_size:]

In [5]:
# Bringing it all together: build the vocabulary from the training split and
# replace rare words in both splits.
minimum_freq = 2
(train_data_processed,
 test_data_processed,
 vocabulary) = preprocess_data(train_data, test_data, minimum_freq)

Making the N-grams model

In [6]:
#fn to make a dict of unique n_grams and their frequency  in the training data 
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    """Count every n-gram in the given tokenized sentences.

    Each sentence is padded with ``n`` copies of ``start_token`` in front and
    one ``end_token`` at the end before its n-grams are extracted.

    Args:
        data: An iterable of token lists.
        n: Length of the n-grams to count.
        start_token: Padding token placed before each sentence.
        end_token: Token appended after each sentence.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram (a tuple of ``n`` tokens)
        to the number of times it occurs.
    """
    counts = defaultdict(int)
    prefix = [start_token] * n
    suffix = [end_token]

    for sentence in data:
        padded = tuple(prefix + sentence + suffix)
        # Slide a window of width n over the padded sentence.
        for start in range(len(padded) - n + 1):
            counts[padded[start:start + n]] += 1

    return counts

# given a word and previous n-gram,fn to estimate its probability
def estimate_probability(word, previous_n_gram, 
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """Estimate P(word | previous_n_gram) with add-k smoothing.

    The estimate is
    ``(count(previous_n_gram + word) + k) / (count(previous_n_gram) + k * vocabulary_size)``.

    Args:
        word: The candidate next token.
        previous_n_gram: Sequence of context tokens. It is converted with
            ``tuple()``, so a bare string is split into its characters.
        n_gram_counts: Mapping from n-gram tuples to counts.
        n_plus1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabulary_size: Number of words used in the smoothing denominator.
        k: Smoothing constant added to every count.

    Returns:
        The smoothed probability as a float.

    Raises:
        ZeroDivisionError: If ``k`` is 0 and ``previous_n_gram`` has a count
            of zero (the denominator is then zero).
    """
    previous_n_gram = tuple(previous_n_gram)
    n_plus1_gram = previous_n_gram + (word,)

    # Use .get() rather than indexing: the count tables are defaultdicts, and
    # indexing an unseen key would insert it, so every query would silently
    # grow the tables. .get() also lets plain dicts be passed in.
    numerator = n_plus1_gram_counts.get(n_plus1_gram, 0) + k
    denominator = n_gram_counts.get(previous_n_gram, 0) + k * vocabulary_size
    return numerator / denominator

# fn to estimate the probabilities of all words possible after a given n-gram
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    """Estimate the smoothed probability of every candidate next word.

    The candidates are the words of ``vocabulary`` plus the special tokens
    ``"<e>"`` and ``"<unk>"``; the size of that extended list is used as the
    vocabulary size for smoothing.

    Args:
        previous_n_gram: Sequence of context tokens.
        n_gram_counts: Mapping from n-gram tuples to counts.
        n_plus1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabulary: List of known words (it is not modified).
        k: Smoothing constant passed to ``estimate_probability``.

    Returns:
        A ``defaultdict(int)`` mapping each candidate word to its probability.
    """
    candidates = vocabulary + ["<e>", "<unk>"]
    candidate_count = len(candidates)
    return defaultdict(int, {
        candidate: estimate_probability(candidate, previous_n_gram, n_gram_counts,
                                        n_plus1_gram_counts, candidate_count, k)
        for candidate in candidates
    })

In [7]:
#fn to make a count matrix with n-grams as rows and vocabulary words as columns
def make_count_matrix(n_plus1_gram_counts, vocabulary,k=1):
    """Arrange (n+1)-gram counts as a table of context n-grams by next words.

    Args:
        n_plus1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabulary: List of known words; ``"<e>"`` and ``"<unk>"`` are
            appended to form the columns (the argument is not modified).
        k: Unused here; kept so existing callers that pass it keep working.

    Returns:
        A ``pandas.DataFrame`` of floats with one row per distinct context
        (each key with its last token removed, in first-seen order) and one
        column per word. A cell holds the count of the context followed by
        that word, or 0 if the combination was never observed.
    """
    vocabulary = vocabulary + ["<e>", "<unk>"]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the row
    # order is reproducible from run to run (a set would not guarantee that).
    n_grams = list(dict.fromkeys(key[0:-1] for key in n_plus1_gram_counts))
    row_of = {n_gram: row for row, n_gram in enumerate(n_grams)}

    # A word may in principle appear more than once in the column list; keep
    # every position so each such column receives the count.
    columns_of = {}
    for column, word in enumerate(vocabulary):
        columns_of.setdefault(word, []).append(column)

    matrix = np.zeros((len(n_grams), len(vocabulary)))

    # Fill only the observed cells. Iterating over the counts (instead of
    # indexing every row/column pair) avoids inserting zero entries into a
    # defaultdict and avoids a rows x columns Python loop.
    for key, count in n_plus1_gram_counts.items():
        for column in columns_of.get(key[-1], ()):
            matrix[row_of[key[0:-1]], column] = count

    matrix = pd.DataFrame(matrix, index=n_grams, columns=vocabulary)
    return matrix

#fn to make a probability matrix from the count matrix
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    """Convert (n+1)-gram counts into a table of add-k smoothed probabilities.

    Args:
        n_plus1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabulary: List of known words.
        k: Constant added to every cell of the count matrix before
            normalising.

    Returns:
        A ``pandas.DataFrame`` shaped like the count matrix in which every
        row has been divided by its own sum.
    """
    smoothed_counts = make_count_matrix(n_plus1_gram_counts, vocabulary, k) + k
    row_totals = smoothed_counts.sum(axis=1)
    return smoothed_counts.div(row_totals, axis=0)

In [16]:
# Toy check of the counting and probability functions on two tiny sentences.
sentences = [
    ['i', 'like', 'a', 'cat'],
    ['this', 'dog', 'is', 'like', 'a', 'cat'],
]

# Count once and reuse the tables for both the printout and the estimate.
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

print("Uni-gram:")
print(unigram_counts)
print("Bi-gram:")
print(bigram_counts)

unique_words = list(set(sentences[0] + sentences[1]))

tmp_prob = estimate_probability(
    "cat", "a", unigram_counts, bigram_counts, len(unique_words), k=1
)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {tmp_prob:.4f}")

Uni-gram:
defaultdict(<class 'int'>, {('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1})
Bi-gram:
defaultdict(<class 'int'>, {('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1})
The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


Perplexity

In [8]:
# perplexity - it is used to compare the performance of different n-gram models. The lower the better
from math import log
from math import exp
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """Compute the perplexity of one tokenized sentence under an n-gram model.

    The sentence is padded with ``n`` ``'<s>'`` tokens in front and one
    ``'<e>'`` token at the end, where ``n`` is the length of the keys of
    ``n_gram_counts``. The log-probabilities of every token after the start
    padding are summed and the result is ``exp(-sum / m)``, with ``m`` the
    length of the padded sentence.

    Args:
        sentence: Sequence of tokens.
        n_gram_counts: Non-empty mapping from n-gram tuples to counts.
        n_plus1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabulary_size: Vocabulary size used for smoothing.
        k: Smoothing constant forwarded to ``estimate_probability``.

    Returns:
        The perplexity as a float (lower is better).

    Raises:
        ValueError: If ``n_gram_counts`` is empty, so ``n`` cannot be inferred.
    """
    if not n_gram_counts:
        raise ValueError("n_gram_counts must contain at least one n-gram")
    # Read n from a single key instead of copying every key into a list.
    n = len(next(iter(n_gram_counts)))

    padded = ['<s>'] * n + list(sentence) + ['<e>']
    m = len(padded)

    log_probability_sum = 0
    for i in range(n, m):
        # Forward the caller's k; the smoothing constant must match the one
        # the caller asked for rather than a fixed 1.0.
        log_probability_sum += log(estimate_probability(padded[i], padded[i-n:i],
                                   n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=k))

    # NOTE(review): m includes the n start-padding tokens, as in the original
    # code. Many definitions normalise by the number of predicted tokens
    # (m - n) instead; confirm before changing, since it alters every value.
    return exp((-1/m)*log_probability_sum)
        

In [17]:
# Check calculate_perplexity on a sentence from the toy training set and on
# an unseen variant of it.
perplexity_train1 = calculate_perplexity(
    sentences[0], unigram_counts, bigram_counts, len(unique_words), k=1.0
)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(
    test_sentence, unigram_counts, bigram_counts, len(unique_words), k=1.0
)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 2.8040
Perplexity for test sample: 3.9654


Final Autocomplete System 

In [14]:
# fn to suggest the most likely next word with its probability
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """Suggest the most probable next word for the given context.

    Args:
        previous_tokens: Sequence of tokens typed so far.
        n_gram_counts: Non-empty mapping from n-gram tuples to counts; the
            length of its keys determines how many context tokens are used.
        n_plus1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabulary: List of known words.
        k: Smoothing constant.
        start_with: Optional prefix; when given, only candidates beginning
            with it are considered.

    Returns:
        A tuple ``(word, probability)`` for the best candidate. On ties the
        candidate that comes first in the vocabulary order wins. If no
        candidate matches ``start_with``, ``(None, 0.0)`` is returned.

    Raises:
        ValueError: If ``n_gram_counts`` is empty, so ``n`` cannot be inferred.
    """
    if not n_gram_counts:
        raise ValueError("n_gram_counts must contain at least one n-gram")
    # Read n from a single key instead of copying every key into a list.
    n = len(next(iter(n_gram_counts)))

    previous_n_gram = list(previous_tokens)[-n:]
    # count_n_grams pads every sentence with n '<s>' tokens, so a context
    # shorter than n must be padded the same way; otherwise the lookup keys
    # have the wrong length and never match any stored n-gram.
    missing = n - len(previous_n_gram)
    if missing > 0:
        previous_n_gram = ['<s>'] * missing + previous_n_gram

    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    if start_with is not None:
        probabilities = {candidate: probability
                         for candidate, probability in probabilities.items()
                         if candidate.startswith(start_with)}

    # A prefix that matches nothing is a normal outcome for autocomplete,
    # not an error.
    if not probabilities:
        return (None, 0.0)

    # max() returns the first maximal item, matching a stable descending sort.
    return max(probabilities.items(), key=lambda item: item[1])

# fn to suggest multiple next words using different n_grams
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    """Collect one suggestion from each pair of consecutive n-gram models.

    Args:
        previous_tokens: Sequence of tokens typed so far.
        n_gram_counts_list: Sequence of count tables in which each entry is
            used as the n-gram table and the following entry as the
            (n+1)-gram table.
        vocabulary: List of known words.
        k: Smoothing constant.
        start_with: Optional prefix the suggested word must begin with.

    Returns:
        A list with one ``suggest_a_word`` result per consecutive pair, i.e.
        ``len(n_gram_counts_list) - 1`` entries.
    """
    consecutive_models = zip(n_gram_counts_list, n_gram_counts_list[1:])
    return [
        suggest_a_word(previous_tokens, lower_order_counts,
                       higher_order_counts, vocabulary,
                       k=k, start_with=start_with)
        for lower_order_counts, higher_order_counts in consecutive_models
    ]

In [18]:
# Build count tables for n = 1..5 from the processed training data.
# get_suggestions pairs consecutive tables (n, n+1), so these five tables
# yield four suggestions per query.
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [24]:
# Interactive demo: read some text (and an optional prefix for the next word)
# and show one suggestion per n-gram model pair.
sample = input('Enter text for autocompletion: ')
tokenized_sample = get_tokenized_data(sample)
# Blank input produces no sentences; use an empty context instead of failing
# with an IndexError.
previous_tokens = tokenized_sample[0] if tokenized_sample else []
start = input('Optional: word starts with: ')

# The count tables were built from training text in which out-of-vocabulary
# words were replaced by "<unk>", so the typed context is mapped the same way
# before it is looked up.
model_tokens = replace_oov_words_by_unk([previous_tokens], vocabulary)[0]

# An empty prefix means "no prefix filter".
suggestion = get_suggestions(model_tokens, n_gram_counts_list, vocabulary,
                             k=1.0, start_with=start or None)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(suggestion)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[("'re", 0.0239720461563465),
 ('?', 0.002888086642599278),
 ('?', 0.001613228473482557),
 ('<e>', 0.00013489815189531904)]