### Instructions

1. Load and preprocess data
    - Load and tokenize data.
    - Split the sentences into train and test sets.
    - Replace words with a low frequency by an unknown marker `<unk>`.

2. Develop N-gram based language models
    - Compute the count of n-grams from a given data set.
    - Estimate the conditional probability of a next word with k-smoothing.
    
3. Evaluate the N-gram models by computing the perplexity score.
4. Use your own model to suggest an upcoming word given your sentence.

In [3]:
import math
import random
import numpy as np
import pandas as pd
import nltk

nltk.download('punkt')

nltk.data.path.append('.')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pallavisingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load and Preprocess Data

In [4]:
with open('./data/en_US.twitter.txt','r') as f:
  data = f.read()


print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

Data type: <class 'str'>
Number of letters: 3335477
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

-------


Preprocess this data with the following steps:

1. Split data into sentences using "\n" as the delimiter.
2. Split each sentence into tokens. 3. Note that in this assignment we use "token" and "words" interchangeably.
4. Assign sentences into train or test sets.
5. Find tokens that appear at least N times in the training data.
6. Replace tokens that appear less than N times by <unk>

In [5]:
def split_sentences(data):

  sentences = data.split('\n')

  sentences = [i.strip() for i in sentences]
  sentences = [s for s in sentences if len(s) > 0]

  return sentences

In [6]:
def tokenize_sentences(sentences):
  tokens_lists = []
  for s in sentences:

    tokenized = nltk.word_tokenize(s.lower())
    tokens_lists.append(tokenized)

  return tokens_lists




In [7]:
tokenize_sentences(['I am a girl','you are a boy'])

[['i', 'am', 'a', 'girl'], ['you', 'are', 'a', 'boy']]

In [8]:
def get_tokenized_data(data):

    # Get the sentences by splitting up the data
    sentences = split_sentences(data)

    # Get the list of lists of tokens by tokenizing the sentences
    tokenized_sentences = tokenize_sentences(sentences)

    return tokenized_sentences

In [9]:
# test your function
x = "Sky is blue.\nLeaves are green\nRoses are red."
get_tokenized_data(x)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green'],
 ['roses', 'are', 'red', '.']]

In [10]:
tokenized_data = get_tokenized_data(data)
random.seed(53)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data)*0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [11]:
print("First training sample:")
print(train_data[0])

print("First test sample")
print(test_data[0])

First training sample:
['singing', 'instead', 'of', 'doing', 'my', 'project', '.']
First test sample
['right', 'back', 'at', 'you', 'from', 'scottsdale', '!', '\\m/']


In [12]:
 def count_words(tokenize_sentences):
  word_freq = {}

  for s in tokenize_sentences:

    for t in s:

      if t in word_freq.keys():
        word_freq[t] += 1

      else:
        word_freq[t] = 1

  return word_freq


In [13]:
count_words(get_tokenized_data(x))

{'sky': 1,
 'is': 1,
 'blue': 1,
 '.': 2,
 'leaves': 1,
 'are': 2,
 'green': 1,
 'roses': 1,
 'red': 1}

### Handling 'Out of Vocabulary' words

In [14]:
def get_words_higher_freq(tokenize_sentences,count_threshold):
  count_dict= count_words(tokenize_sentences)
  closed_vocab = []
  for i in count_dict.keys():

    if count_dict[i] >= count_threshold:
      closed_vocab.append(i)

  return closed_vocab



In [15]:
# test your code
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
tmp_closed_vocab = get_words_higher_freq(tokenized_sentences, count_threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'are']


In [16]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    updated_tokenized_sentences = []
    vocabulary_set = set(vocabulary)  # Convert vocabulary to a set for faster look-up

    for s in tokenized_sentences:
        replaced_sentence = []
        for t in s:
            if t in vocabulary_set:
                replaced_sentence.append(t)
            else:
                replaced_sentence.append(unknown_token)
        updated_tokenized_sentences.append(replaced_sentence)

    return updated_tokenized_sentences





In [17]:
# test your code
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['are', 'are', 'red', '.']]
vocabulary = ['leaves', 'blue']
replace_oov_words_by_unk(tokenized_sentences, vocabulary)

[['<unk>', '<unk>', 'blue', '<unk>'],
 ['leaves', '<unk>', '<unk>', '<unk>'],
 ['<unk>', '<unk>', '<unk>', '<unk>']]

In [18]:
def preprocess_data(train_data, test_data, count_threshold, unknown_token ='unk'):

    # Get the closed vocabulary using the train data
    vocabulary = get_words_higher_freq(train_data,count_threshold)

    # For the train data, replace less common words with "<unk>"
    train_data_replaced = replace_oov_words_by_unk(train_data,vocabulary,unknown_token=unknown_token)

    # For the test data, replace less common words with "<unk>"
    test_data_replaced = replace_oov_words_by_unk(test_data,vocabulary,unknown_token=unknown_token)

    ### END CODE HERE ###
    return train_data_replaced, test_data_replaced, vocabulary


In [19]:
tmp_train = [['sky', 'is', 'blue', '.'],
     ['leaves', 'are', 'green']]
tmp_test = [['roses', 'are', 'red', '.']]

tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_data(tmp_train,
                                                           tmp_test,
                                                           count_threshold = 1
                                                          )

print("tmp_train_repl")
print(tmp_train_repl)
print()
print("tmp_test_repl")
print(tmp_test_repl)
print()
print("tmp_vocab")
print(tmp_vocab)

tmp_train_repl
[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]

tmp_test_repl
[['unk', 'are', 'unk', '.']]

tmp_vocab
['sky', 'is', 'blue', '.', 'leaves', 'are', 'green']


In [20]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data,
                                                                        test_data,
                                                                        minimum_freq)

### Develop n-gram based language models


In [21]:
def count_n_grams(data, n, start_token = '<s>' , end_token = '<e>'):
  n_grams = {}

  for s in data:
    s =  [start_token] * n + s + [end_token]

    s_tup = tuple(s)

    for i in range(len(s_tup)) if n == 1 else range(len(s_tup) - n + 1):

      n_gram = s_tup[i:i+n]

      if n_gram in n_grams.keys():
        n_grams[n_gram] += 1

      else:
        n_grams[n_gram] = 1

  return n_grams




In [22]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print("Bi-gram:")
print(count_n_grams(sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


In [23]:
def estimate_probability(word, previous_n_gram,
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):

    previous_n_gram = tuple(previous_n_gram)

    count_previous_n_gram = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0

    denominator = count_previous_n_gram + vocabulary_size *k

    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0

    numerator =  n_plus1_gram_count + k

    probability = numerator/denominator

    return probability

In [24]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
tmp_prob = estimate_probability("cat", "a", unigram_counts, bigram_counts, len(unique_words), k=1)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


In [25]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>",  k=1.0):


    previous_n_gram = tuple(previous_n_gram)
    count_previous_n_gram = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0
    vocabulary = vocabulary +[end_token, unknown_token]
    denominator = count_previous_n_gram + len(vocabulary) *k
    out_dict = {}
    for word in vocabulary:
      numerator = 0
      n_plus1_gram = previous_n_gram + (word,)
      n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0

      numerator =  n_plus1_gram_count + k

      probability = numerator/denominator
      out_dict[word] = probability
    return out_dict

In [26]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

estimate_probabilities("a", unigram_counts, bigram_counts, unique_words, k=1)


{'like': 0.09090909090909091,
 'this': 0.09090909090909091,
 'dog': 0.09090909090909091,
 'i': 0.09090909090909091,
 'a': 0.09090909090909091,
 'is': 0.09090909090909091,
 'cat': 0.2727272727272727,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [27]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):

  vocabulary = vocabulary + ['<e>','<unk>']

  n_grams = []

  for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]  # Correct slicing to get the n-gram
        n_grams.append(n_gram)

  n_grams = list(set(n_grams))

  row_index = {n_gram : i for i, n_gram in enumerate(n_grams)}

  col_index = {word : j for j , word in enumerate(vocabulary)}
  nrow = len(n_grams)
  ncol = len(vocabulary)

  count_matrix = np.zeros((nrow,ncol))

  for n_plus1_gram, count in n_plus1_gram_counts.items():

    n_gram = n_plus1_gram[0:-1]
    word = n_plus1_gram[-1]

    if word not in vocabulary:
            continue
    i = row_index[n_gram]
    j = col_index[word]
    count_matrix[i, j] = count


    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix






In [28]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)

print('bigram counts')
display(make_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,like,this,dog,i,a,is,cat,<e>,<unk>
"(like,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(this,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(is,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(a,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

In [30]:

sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,like,this,dog,i,a,is,cat,<e>,<unk>
"(like,)",0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
"(this,)",0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
"(dog,)",0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
"(is,)",0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
"(i,)",0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
"(<s>,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(cat,)",0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
"(a,)",0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111


# Perplexity

In [31]:
import numpy as np

def calculate_perplexity(test_data, n_gram_counts, vocabulary, n):
    """
    Calculate the perplexity of an n-gram model on test data.

    Args:
    test_data (list of list of str): Tokenized test sentences.
    n_gram_counts (dict): Counts of n-grams.
    vocabulary (list of str): List of unique words in the vocabulary.
    n (int): The n in n-gram.

    Returns:
    float: Perplexity score.
    """
    N = 0  # Total number of n-grams in the test set
    log_likelihood = 0.0  # Sum of log probabilities

    vocabulary_size = len(vocabulary)
    vocabulary_set = set(vocabulary)

    for sentence in test_data:
        # Pad the sentence with start tokens
        sentence = ['<s>'] * (n - 1) + sentence

        # Loop through the sentence and calculate the log-likelihood
        for i in range(len(sentence) - n + 1):
            n_gram = tuple(sentence[i:i + n])
            n_minus_1_gram = tuple(sentence[i:i + n - 1])
            next_word = sentence[i + n - 1]

            # Calculate n-gram and (n-1)-gram counts
            n_gram_count = n_gram_counts.get(n_gram, 0)
            n_minus_1_gram_count = sum(
                count for ngram, count in n_gram_counts.items() if ngram[:-1] == n_minus_1_gram)

            # Calculate the probability with add-one smoothing
            probability = (n_gram_count + 1) / (n_minus_1_gram_count + vocabulary_size)

            # Update log-likelihood
            log_likelihood += np.log(probability)
            N += 1

    # Calculate perplexity
    perplexity = np.exp(-log_likelihood / N)
    return perplexity




In [32]:
# Example usage
n_gram_counts = {
    ('<s>', 'I', 'love'): 2,
    ('I', 'love', 'Python'): 2,
    ('love', 'Python', 'is'): 1,
    ('Python', 'is', 'great'): 3,
    ('is', 'great', '</s>'): 2
}
vocabulary = ['<s>', 'I', 'love', 'Python', 'is', 'great', '</s>']
test_data = [["I", "love", "Python"], ["Python", "is", "great"]]
n = 3

perplexity = calculate_perplexity(test_data, n_gram_counts, vocabulary, n)
print("Perplexity:", perplexity)

Perplexity: 4.445419733444931


# Build an auto-complete system

In [33]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary,end_token='<e>', unknown_token="<unk>", k=1.0, start_with=None):

  n = len(list(n_gram_counts.keys())[0])

  previous_n_gram = previous_tokens[-n:]

  probabilities = estimate_probabilities(previous_n_gram,n_gram_counts,n_plus1_gram_counts, vocabulary, k=k)

  suggestion = None
  max_prob = 0

  for word, prob in probabilities.items():

    if start_with != None:
      if not word.startswith(start_with):

        continue

    if prob > max_prob:

      suggestion = word

      max_prob = prob

  return suggestion, max_prob


In [34]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

previous_tokens = ["i", "like"]
tmp_suggest1 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{tmp_suggest1[0]}` with a probability of {tmp_suggest1[1]:.4f}")


The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2727


In [35]:
tmp_starts_with = 'c'
tmp_suggest2 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0, start_with=tmp_starts_with)
print(f"The previous words are 'i like', the suggestion must start with `{tmp_starts_with}`\n\tand the suggested word is `{tmp_suggest2[0]}` with a probability of {tmp_suggest2[1]:.4f}")

The previous words are 'i like', the suggestion must start with `c`
	and the suggested word is `cat` with a probability of 0.0909


The function defined below loop over varioud n-gram models to get multiple suggestions

In [36]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
  model_counts = len(n_gram_counts_list)
  suggestions = []

  for i in range(model_counts-1):
    n_gram_counts =  n_gram_counts_list[i]

    n_plus1_gram_counts = n_gram_counts_list[i+1]

    suggestion = suggest_a_word(previous_tokens,n_gram_counts,n_plus1_gram_counts,vocabulary,k = 1, start_with=start_with)

    suggestions.append(suggestion)

  return suggestions

In [37]:
n_gram_counts_list = [
    {('I',): 10, ('love',): 5, ('to',): 7},  # Unigram counts
    {('I', 'love'): 4, ('love', 'to'): 3},   # Bigram counts
    {('I', 'love', 'to'): 2}                 # Trigram counts
]
vocabulary = ['I', 'love', 'to', 'code', 'Python']
previous_tokens = ['I', 'love']

suggestions = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with='c')
print(suggestions)


[('code', 0.08333333333333333), ('code', 0.09090909090909091)]
