<a href="https://colab.research.google.com/github/prahladpunia/AI/blob/main/ASSIGNMENT_UNIGRAM_BIGRAM_TRIGRAM_JUN_22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ASSIGNMENT -   
- Steps to build the next word recommender system   
- Loading and exploring the dataset    
- Creating N-grams of the dialogue   
- Building the N-gram Language Model   
- Predicting the next word using N-gram Language Model   

# Loading and exploring the dataset

In [1]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import pickle
import random
from tqdm import tqdm

In [7]:
# open text file and read in data
dialogs = pd.read_csv("sample_reuters_dataset.csv") 
  

In [8]:
dialogs.head()

Unnamed: 0,sentence_number,sentence_text
0,0,ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1,1,They told Reuter correspondents in Asian capit...
2,2,But some exporters said that while the conflic...
3,3,The U . S . Has said it will impose 300 mln dl...
4,4,Unofficial Japanese estimates put the impact o...


In [9]:
len(dialogs)

10000

In [10]:
# text cleaning
dialogs_clean = []

for i in dialogs["sentence_text"]:
  # remove everything except alphabets, ' and white spaces
  i = re.sub("[^a-zA-Z' ]", "", i)
  # convert text to lowercase
  i = i.lower()
  # add cleaned text to the list
  dialogs_clean.append(i)

In [11]:
random.sample(dialogs_clean, 10)

['insee changed its base year to  from  last month ',
 "he said the allocations were in line with indonesia ' s production ceiling under its current opec quota  so they might be increased if opec ratifies a production rise next week ",
 'pueblo international inc  lt  pii  sets payout qtly div five cts vs five cts prior pay june two record april ',
 'a company spokeswoman said the four dlr per share gain will be included in second quarter net  which compares with  cts per share last year  including the spirits and wine business  they netted over  mln dlrs for spirits and wine ',
 'bankers say the rebound in oil prices is the major reason for cautious optimism ',
 "the company said the san diego  based defense systems and software development company had sales of about  mln dlrs in  and will operate as part of ball ' s technical products group ",
 'without these currency fluctuations  net profit would have been    mln guilders higher and revenue    billion higher  natned said ',
 'squibb

In [12]:
# creating the vocabulary
# get list of all the words
all_words = " ".join(dialogs_clean).split()

words_dict = {}

# add word-count pair to the dictionary
for word in all_words:   
    # check if the word is already in dictionary 
    if word in words_dict:
        # increment count of word by 1 
        words_dict[word] = words_dict[word] + 1
    else:
        # add the word to dictionary with count 1 
        words_dict[word] = 1

In [13]:
# word dictionary
words_dict

{'asian': 13,
 'exporters': 52,
 'fear': 8,
 'damage': 29,
 'from': 1369,
 'u': 1117,
 's': 2864,
 'japan': 441,
 'rift': 1,
 'mounting': 5,
 'trade': 549,
 'friction': 8,
 'between': 191,
 'the': 12496,
 'and': 4599,
 'has': 974,
 'raised': 70,
 'fears': 13,
 'among': 44,
 'many': 54,
 'of': 6671,
 'asia': 14,
 "'": 2094,
 'exporting': 12,
 'nations': 71,
 'that': 1376,
 'row': 3,
 'could': 291,
 'inflict': 1,
 'far': 55,
 'reaching': 7,
 'economic': 244,
 'businessmen': 15,
 'officials': 190,
 'said': 4649,
 'they': 518,
 'told': 237,
 'reuter': 27,
 'correspondents': 3,
 'in': 5070,
 'capitals': 3,
 'a': 4412,
 'move': 101,
 'against': 270,
 'might': 59,
 'boost': 45,
 'protectionist': 22,
 'sentiment': 10,
 'lead': 96,
 'to': 6337,
 'curbs': 12,
 'on': 1643,
 'american': 126,
 'imports': 242,
 'their': 230,
 'products': 200,
 'but': 650,
 'some': 278,
 'while': 164,
 'conflict': 3,
 'would': 926,
 'hurt': 11,
 'them': 58,
 'long': 119,
 'run': 21,
 'short': 87,
 'term': 120,
 'toky

In [14]:
# prepare a dataframe
words_df = pd.DataFrame({'word':list(words_dict.keys()), 'count':list(words_dict.values())})

# sort words by their count in increasing order
words_df = words_df.sort_values(by = ['count'])

# reset dataframe index
words_df.reset_index(inplace = True, drop=True)

In [15]:
# words with least frequency
words_df.head()

Unnamed: 0,word,count
0,ulcer,1
1,gaons,1
2,securitiesd,1
3,unfiltered,1
4,preceeding,1


In [16]:
# words with highest frequency
words_df.tail()

Unnamed: 0,word,count
12575,said,4649
12576,in,5070
12577,to,6337
12578,of,6671
12579,the,12496


In [17]:
# vocabulary size
len(words_df)

12580

 # Creating N-grams of the dialogue

In [18]:
# creating an empty dataframe
dataset = pd.DataFrame()

# adding cleaned sentences in the dataframe
dataset['Sentences'] = dialogs_clean

# first 20 cleaned sentences
dataset.head(20)

Unnamed: 0,Sentences
0,asian exporters fear damage from u s japan r...
1,they told reuter correspondents in asian capit...
2,but some exporters said that while the conflic...
3,the u s has said it will impose mln dlrs of...
4,unofficial japanese estimates put the impact o...
5,we wouldn ' t be able to do business said a ...
6,if the tariffs remain in place for any length...
7,in taiwan businessmen and officials are also ...
8,we are aware of the seriousness of the u s
9,threat against japan because it serves as a wa...


In [19]:
# using .split() to get tokens from the sentence
dataset['Sentences'][0].split()

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'u',
 's',
 'japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'u',
 's',
 'and',
 'japan',
 'has',
 'raised',
 'fears',
 'among',
 'many',
 'of',
 'asia',
 "'",
 's',
 'exporting',
 'nations',
 'that',
 'the',
 'row',
 'could',
 'inflict',
 'far',
 'reaching',
 'economic',
 'damage',
 'businessmen',
 'and',
 'officials',
 'said']

In [20]:
# function to create unigrams
# taking a sentence as input
def create_unigram(sentence):
    # creating tokens from the sentence
    tokens = sentence.split()
    # empty list to store the unigrams
    unigram_list = []
    # number of unigrams is equal to the number of tokens in the sentence
    for i in range(len(tokens)):
        # appending each unigram in the list
        unigram_list.append(tokens[i:i+1])
    # returning the unigram list for a sentence    
    return unigram_list

In [21]:
# function to create bigrams
def create_bigram(sentence):
    tokens = sentence.split()
    bigram_list = []
    # number of bigrams is one less than the number of tokens in the sentence
    for i in range(len(tokens)-1):
        bigram_list.append(tokens[i:i+2])
    return bigram_list

In [22]:
# function to create trigrams
def create_trigram(sentence):
    tokens = sentence.split()
    trigram_list = []
    # number of trigrams is two less than the number of tokens in the sentence
    for i in range(len(tokens)-2):
        trigram_list.append(tokens[i:i+3])
    return trigram_list

In [23]:
# creating unigrams for all the sentences in the dataset 
final_unigram = []
# for each sentence
for i in range(dataset.shape[0]):
    # using the defined unigram function to create unigrams
    final_unigram.append(create_unigram(dataset['Sentences'][i]))

# adding the unigram in a seperate column in the dataset
dataset['unigram'] = final_unigram

In [24]:
# creating bigrams for all the sentences in the dataset
final_bigram = []
for i in range(dataset.shape[0]):
    final_bigram.append(create_bigram(dataset['Sentences'][i]))

dataset['bigram'] = final_bigram

In [25]:
# creating trigrams for all the sentences in the dataset
final_trigram = []
for i in range(dataset.shape[0]):
    final_trigram.append(create_trigram(dataset['Sentences'][i]))

dataset['trigram'] = final_trigram

In [26]:
# first 20 rows of the dataset
dataset.head(20)

Unnamed: 0,Sentences,unigram,bigram,trigram
0,asian exporters fear damage from u s japan r...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,the u s has said it will impose mln dlrs of...,"[[the], [u], [s], [has], [said], [it], [will],...","[[the, u], [u, s], [s, has], [has, said], [sai...","[[the, u, s], [u, s, has], [s, has, said], [ha..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."
5,we wouldn ' t be able to do business said a ...,"[[we], [wouldn], ['], [t], [be], [able], [to],...","[[we, wouldn], [wouldn, '], [', t], [t, be], [...","[[we, wouldn, '], [wouldn, ', t], [', t, be], ..."
6,if the tariffs remain in place for any length...,"[[if], [the], [tariffs], [remain], [in], [plac...","[[if, the], [the, tariffs], [tariffs, remain],...","[[if, the, tariffs], [the, tariffs, remain], [..."
7,in taiwan businessmen and officials are also ...,"[[in], [taiwan], [businessmen], [and], [offici...","[[in, taiwan], [taiwan, businessmen], [busines...","[[in, taiwan, businessmen], [taiwan, businessm..."
8,we are aware of the seriousness of the u s,"[[we], [are], [aware], [of], [the], [seriousne...","[[we, are], [are, aware], [aware, of], [of, th...","[[we, are, aware], [are, aware, of], [aware, o..."
9,threat against japan because it serves as a wa...,"[[threat], [against], [japan], [because], [it]...","[[threat, against], [against, japan], [japan, ...","[[threat, against, japan], [against, japan, be..."


In [27]:
# sample sentence
dataset['Sentences'][0]

"asian exporters fear damage from u  s  japan rift mounting trade friction between the u  s  and japan has raised fears among many of asia ' s exporting nations that the row could inflict far  reaching economic damage  businessmen and officials said "

In [28]:
# unigram of the sentence
dataset['unigram'][0]

[['asian'],
 ['exporters'],
 ['fear'],
 ['damage'],
 ['from'],
 ['u'],
 ['s'],
 ['japan'],
 ['rift'],
 ['mounting'],
 ['trade'],
 ['friction'],
 ['between'],
 ['the'],
 ['u'],
 ['s'],
 ['and'],
 ['japan'],
 ['has'],
 ['raised'],
 ['fears'],
 ['among'],
 ['many'],
 ['of'],
 ['asia'],
 ["'"],
 ['s'],
 ['exporting'],
 ['nations'],
 ['that'],
 ['the'],
 ['row'],
 ['could'],
 ['inflict'],
 ['far'],
 ['reaching'],
 ['economic'],
 ['damage'],
 ['businessmen'],
 ['and'],
 ['officials'],
 ['said']]

In [29]:
# bigram of the sentence
dataset['bigram'][0]

[['asian', 'exporters'],
 ['exporters', 'fear'],
 ['fear', 'damage'],
 ['damage', 'from'],
 ['from', 'u'],
 ['u', 's'],
 ['s', 'japan'],
 ['japan', 'rift'],
 ['rift', 'mounting'],
 ['mounting', 'trade'],
 ['trade', 'friction'],
 ['friction', 'between'],
 ['between', 'the'],
 ['the', 'u'],
 ['u', 's'],
 ['s', 'and'],
 ['and', 'japan'],
 ['japan', 'has'],
 ['has', 'raised'],
 ['raised', 'fears'],
 ['fears', 'among'],
 ['among', 'many'],
 ['many', 'of'],
 ['of', 'asia'],
 ['asia', "'"],
 ["'", 's'],
 ['s', 'exporting'],
 ['exporting', 'nations'],
 ['nations', 'that'],
 ['that', 'the'],
 ['the', 'row'],
 ['row', 'could'],
 ['could', 'inflict'],
 ['inflict', 'far'],
 ['far', 'reaching'],
 ['reaching', 'economic'],
 ['economic', 'damage'],
 ['damage', 'businessmen'],
 ['businessmen', 'and'],
 ['and', 'officials'],
 ['officials', 'said']]

In [30]:
# trigram of the sentence
dataset['trigram'][0]

[['asian', 'exporters', 'fear'],
 ['exporters', 'fear', 'damage'],
 ['fear', 'damage', 'from'],
 ['damage', 'from', 'u'],
 ['from', 'u', 's'],
 ['u', 's', 'japan'],
 ['s', 'japan', 'rift'],
 ['japan', 'rift', 'mounting'],
 ['rift', 'mounting', 'trade'],
 ['mounting', 'trade', 'friction'],
 ['trade', 'friction', 'between'],
 ['friction', 'between', 'the'],
 ['between', 'the', 'u'],
 ['the', 'u', 's'],
 ['u', 's', 'and'],
 ['s', 'and', 'japan'],
 ['and', 'japan', 'has'],
 ['japan', 'has', 'raised'],
 ['has', 'raised', 'fears'],
 ['raised', 'fears', 'among'],
 ['fears', 'among', 'many'],
 ['among', 'many', 'of'],
 ['many', 'of', 'asia'],
 ['of', 'asia', "'"],
 ['asia', "'", 's'],
 ["'", 's', 'exporting'],
 ['s', 'exporting', 'nations'],
 ['exporting', 'nations', 'that'],
 ['nations', 'that', 'the'],
 ['that', 'the', 'row'],
 ['the', 'row', 'could'],
 ['row', 'could', 'inflict'],
 ['could', 'inflict', 'far'],
 ['inflict', 'far', 'reaching'],
 ['far', 'reaching', 'economic'],
 ['reaching', 

# Building the N-gram Language Model

In [31]:
# for defining the N-gram model
from collections import Counter, defaultdict

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for i in range(dataset.shape[0]):
    # for each trigram pair
    for w1, w2, w3 in create_trigram(dataset['Sentences'][i]):
        # count the occurance of word 3, given word 1 and word 2
        model[(w1, w2)][w3] += 1

In [33]:
# defined model
model

defaultdict(<function __main__.<lambda>>,
            {('asian',
              'exporters'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'fear': 1}),
             ('exporters',
              'fear'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'damage': 1}),
             ('fear',
              'damage'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'from': 1}),
             ('damage',
              'from'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'local': 1,
                          'u': 1}),
             ('from',
              'u'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'k': 1,
                          's': 5}),
             ('u',
              's'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {"'": 1,
                          'a': 6,
                          'acquisition': 1,
                          'action': 1,
                          'aegon': 1,
                    

# Predicting the next word using N-gram Language Model

In [34]:
# predict the next word
dict(model["software", "development"])

{'company': 2}

In [35]:
# another example
dict(model["in", "all"])

{'areas': 1,
 'cases': 1,
 'centres': 1,
 'crazy': 1,
 'divisions': 1,
 'domestic': 1,
 'fiscal': 1,
 'for': 1,
 'main': 1,
 'major': 1,
 'merrill': 1,
 'of': 1,
 'sectors': 2}

In [36]:
# another example
dict(model["how", "are"])

{'european': 1}

In [37]:
# another example
dict(model["good", "to"])

{'go': 1, 'have': 1, 'very': 1}

#  Probablistic output

In [38]:
# creating the unigram list
unigram_dict = {}
for i in tqdm(range(dataset.shape[0])):
    # add word-count pair to the dictionary
    for word in dataset['unigram'][i]:   
        # check if the word is already in dictionary 
        if word[0] in unigram_dict:
            # increment count of word by 1 
            unigram_dict[word[0]] = unigram_dict[word[0]] + 1
        else:
            # add the word to dictionary with count 1 
            unigram_dict[word[0]] = 1

100%|██████████| 10000/10000 [00:00<00:00, 59630.53it/s]


In [39]:
# unigram list
unigram_dict

{'asian': 13,
 'exporters': 52,
 'fear': 8,
 'damage': 29,
 'from': 1369,
 'u': 1117,
 's': 2864,
 'japan': 441,
 'rift': 1,
 'mounting': 5,
 'trade': 549,
 'friction': 8,
 'between': 191,
 'the': 12496,
 'and': 4599,
 'has': 974,
 'raised': 70,
 'fears': 13,
 'among': 44,
 'many': 54,
 'of': 6671,
 'asia': 14,
 "'": 2094,
 'exporting': 12,
 'nations': 71,
 'that': 1376,
 'row': 3,
 'could': 291,
 'inflict': 1,
 'far': 55,
 'reaching': 7,
 'economic': 244,
 'businessmen': 15,
 'officials': 190,
 'said': 4649,
 'they': 518,
 'told': 237,
 'reuter': 27,
 'correspondents': 3,
 'in': 5070,
 'capitals': 3,
 'a': 4412,
 'move': 101,
 'against': 270,
 'might': 59,
 'boost': 45,
 'protectionist': 22,
 'sentiment': 10,
 'lead': 96,
 'to': 6337,
 'curbs': 12,
 'on': 1643,
 'american': 126,
 'imports': 242,
 'their': 230,
 'products': 200,
 'but': 650,
 'some': 278,
 'while': 164,
 'conflict': 3,
 'would': 926,
 'hurt': 11,
 'them': 58,
 'long': 119,
 'run': 21,
 'short': 87,
 'term': 120,
 'toky

In [40]:
# find the overall frequency of words in the corpus
counts = Counter(unigram_dict)
counts

Counter({'asian': 13,
         'exporters': 52,
         'fear': 8,
         'damage': 29,
         'from': 1369,
         'u': 1117,
         's': 2864,
         'japan': 441,
         'rift': 1,
         'mounting': 5,
         'trade': 549,
         'friction': 8,
         'between': 191,
         'the': 12496,
         'and': 4599,
         'has': 974,
         'raised': 70,
         'fears': 13,
         'among': 44,
         'many': 54,
         'of': 6671,
         'asia': 14,
         "'": 2094,
         'exporting': 12,
         'nations': 71,
         'that': 1376,
         'row': 3,
         'could': 291,
         'inflict': 1,
         'far': 55,
         'reaching': 7,
         'economic': 244,
         'businessmen': 15,
         'officials': 190,
         'said': 4649,
         'they': 518,
         'told': 237,
         'reuter': 27,
         'correspondents': 3,
         'in': 5070,
         'capitals': 3,
         'a': 4412,
         'move': 101,
         'against': 2

In [41]:
# vocabulary size
total_count = len(unigram_dict)
total_count

12580

In [42]:
# relative frequencies of each word
for word in counts:
    counts[word] /= float(total_count)

counts

Counter({'asian': 0.0010333863275039745,
         'exporters': 0.004133545310015898,
         'fear': 0.0006359300476947536,
         'damage': 0.002305246422893482,
         'from': 0.10882352941176471,
         'u': 0.08879173290937997,
         's': 0.22766295707472178,
         'japan': 0.035055643879173294,
         'rift': 7.94912559618442e-05,
         'mounting': 0.000397456279809221,
         'trade': 0.04364069952305247,
         'friction': 0.0006359300476947536,
         'between': 0.015182829888712241,
         'the': 0.993322734499205,
         'and': 0.36558028616852145,
         'has': 0.07742448330683625,
         'raised': 0.005564387917329093,
         'fears': 0.0010333863275039745,
         'among': 0.0034976152623211448,
         'many': 0.004292527821939587,
         'of': 0.5302861685214626,
         'asia': 0.0011128775834658188,
         "'": 0.16645468998410176,
         'exporting': 0.0009538950715421304,
         'nations': 0.005643879173290938,
         't

In [43]:
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [44]:
# predict the next word
dict(model["to", "book"])

{}

In [45]:
# another example
dict(model["how", "are"])

{'european': 1.0}

In [46]:
# another example
dict(model["good", "to"])

{'go': 0.3333333333333333,
 'have': 0.3333333333333333,
 'very': 0.3333333333333333}