Steps to build the next word recommender system
Loading and exploring the dataset
Creating N-grams of the dialogue
Building the N-gram Language Model
Predicting the next word using N-gram Language Model

In [68]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import pickle
import random
from tqdm import tqdm

In [69]:
# open text file and read in data
df = pd.read_csv("sample_reuters_dataset.csv")

In [70]:
df.head()

Unnamed: 0,sentence_number,sentence_text
0,0,ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1,1,They told Reuter correspondents in Asian capit...
2,2,But some exporters said that while the conflic...
3,3,The U . S . Has said it will impose 300 mln dl...
4,4,Unofficial Japanese estimates put the impact o...


In [71]:
import re
import string

# Convert text to lowercase
df['cleaned_text'] = df['sentence_text'].apply(lambda x: x.lower())

# Removing punctuation
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Removing special characters
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Display the cleaned and tokenized dataset
print(df.head())

   sentence_number                                      sentence_text  \
0                0  ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...   
1                1  They told Reuter correspondents in Asian capit...   
2                2  But some exporters said that while the conflic...   
3                3  The U . S . Has said it will impose 300 mln dl...   
4                4  Unofficial Japanese estimates put the impact o...   

                                        cleaned_text  
0  asian exporters fear damage from u  s  japan r...  
1  they told reuter correspondents in asian capit...  
2  but some exporters said that while the conflic...  
3  the u  s  has said it will impose  mln dlrs of...  
4  unofficial japanese estimates put the impact o...  


In [72]:
import nltk
from nltk.tokenize import word_tokenize

In [73]:
 df['cleaned_text']

0       asian exporters fear damage from u  s  japan r...
1       they told reuter correspondents in asian capit...
2       but some exporters said that while the conflic...
3       the u  s  has said it will impose  mln dlrs of...
4       unofficial japanese estimates put the impact o...
                              ...                        
9995    in addition  british printing and communicatio...
9996    salomon said in a filing with the securities a...
9997    if the court decides they should be converted ...
9998    harcourt is asking the court to rule the compa...
9999    an increase in reed shares in london today was...
Name: cleaned_text, Length: 10000, dtype: object

In [74]:
# Tokenization
df['tokenized_text'] = df['cleaned_text'].apply(word_tokenize)

In [75]:
df['tokenized_text']

0       [asian, exporters, fear, damage, from, u, s, j...
1       [they, told, reuter, correspondents, in, asian...
2       [but, some, exporters, said, that, while, the,...
3       [the, u, s, has, said, it, will, impose, mln, ...
4       [unofficial, japanese, estimates, put, the, im...
                              ...                        
9995    [in, addition, british, printing, and, communi...
9996    [salomon, said, in, a, filing, with, the, secu...
9997    [if, the, court, decides, they, should, be, co...
9998    [harcourt, is, asking, the, court, to, rule, t...
9999    [an, increase, in, reed, shares, in, london, t...
Name: tokenized_text, Length: 10000, dtype: object

In [76]:
# Create vocabulary
vocabulary = set()
for tokenized_sentence in df['tokenized_text']:
    vocabulary.update(tokenized_sentence)


In [77]:
# function to create unigrams
# taking a sentence as input
def create_unigram(sentence):
    # creating tokens from the sentence
    tokens = sentence.split()
    # empty list to store the unigrams
    unigram_list = []
    # number of unigrams is equal to the number of tokens in the sentence
    for i in range(len(tokens)):
        # appending each unigram in the list
        unigram_list.append(tokens[i:i+1])
    # returning the unigram list for a sentence    
    return unigram_list

In [78]:
# function to create bigrams
def create_bigram(sentence):
    tokens = sentence.split()
    bigram_list = []
    # number of bigrams is one less than the number of tokens in the sentence
    for i in range(len(tokens)-1):
        bigram_list.append(tokens[i:i+2])
    return bigram_list

In [79]:
# function to create trigrams
def create_trigram(sentence):
    tokens = sentence.split()
    trigram_list = []
    # number of trigrams is two less than the number of tokens in the sentence
    for i in range(len(tokens)-2):
        trigram_list.append(tokens[i:i+3])
    return trigram_list

In [80]:
# creating unigrams for all the sentences in the dataset 
final_unigram = []
# for each sentence
for i in range(df.shape[0]):
    # using the defined unigram function to create unigrams
    final_unigram.append(create_unigram( df['cleaned_text'][i]))

# adding the unigram in a seperate column in the dataset
df['unigram'] = final_unigram

In [81]:
# creating bigrams for all the sentences in the dataset
final_bigram = []
for i in range(df.shape[0]):
    final_bigram.append(create_bigram(df['cleaned_text'][i]))

df['bigram'] = final_bigram

In [82]:
# creating trigrams for all the sentences in the dataset
final_trigram = []
for i in range(df.shape[0]):
    final_trigram.append(create_trigram(df['cleaned_text'][i]))

df['trigram'] = final_trigram

In [83]:
df.head(20)

Unnamed: 0,sentence_number,sentence_text,cleaned_text,tokenized_text,unigram,bigram,trigram
0,0,ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...,asian exporters fear damage from u s japan r...,"[asian, exporters, fear, damage, from, u, s, j...","[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,1,They told Reuter correspondents in Asian capit...,they told reuter correspondents in asian capit...,"[they, told, reuter, correspondents, in, asian...","[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,2,But some exporters said that while the conflic...,but some exporters said that while the conflic...,"[but, some, exporters, said, that, while, the,...","[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,3,The U . S . Has said it will impose 300 mln dl...,the u s has said it will impose mln dlrs of...,"[the, u, s, has, said, it, will, impose, mln, ...","[[the], [u], [s], [has], [said], [it], [will],...","[[the, u], [u, s], [s, has], [has, said], [sai...","[[the, u, s], [u, s, has], [s, has, said], [ha..."
4,4,Unofficial Japanese estimates put the impact o...,unofficial japanese estimates put the impact o...,"[unofficial, japanese, estimates, put, the, im...","[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."
5,5,""" We wouldn ' t be able to do business ,"" said...",we wouldn t be able to do business said a s...,"[we, wouldn, t, be, able, to, do, business, sa...","[[we], [wouldn], [t], [be], [able], [to], [do]...","[[we, wouldn], [wouldn, t], [t, be], [be, able...","[[we, wouldn, t], [wouldn, t, be], [t, be, abl..."
6,6,""" If the tariffs remain in place for any lengt...",if the tariffs remain in place for any length...,"[if, the, tariffs, remain, in, place, for, any...","[[if], [the], [tariffs], [remain], [in], [plac...","[[if, the], [the, tariffs], [tariffs, remain],...","[[if, the, tariffs], [the, tariffs, remain], [..."
7,7,"In Taiwan , businessmen and officials are also...",in taiwan businessmen and officials are also ...,"[in, taiwan, businessmen, and, officials, are,...","[[in], [taiwan], [businessmen], [and], [offici...","[[in, taiwan], [taiwan, businessmen], [busines...","[[in, taiwan, businessmen], [taiwan, businessm..."
8,8,""" We are aware of the seriousness of the U . S .",we are aware of the seriousness of the u s,"[we, are, aware, of, the, seriousness, of, the...","[[we], [are], [aware], [of], [the], [seriousne...","[[we, are], [are, aware], [aware, of], [of, th...","[[we, are, aware], [are, aware, of], [aware, o..."
9,9,Threat against Japan because it serves as a wa...,threat against japan because it serves as a wa...,"[threat, against, japan, because, it, serves, ...","[[threat], [against], [japan], [because], [it]...","[[threat, against], [against, japan], [japan, ...","[[threat, against, japan], [against, japan, be..."


In [84]:
# unigram of the sentence
df['unigram'][0]

[['asian'],
 ['exporters'],
 ['fear'],
 ['damage'],
 ['from'],
 ['u'],
 ['s'],
 ['japan'],
 ['rift'],
 ['mounting'],
 ['trade'],
 ['friction'],
 ['between'],
 ['the'],
 ['u'],
 ['s'],
 ['and'],
 ['japan'],
 ['has'],
 ['raised'],
 ['fears'],
 ['among'],
 ['many'],
 ['of'],
 ['asia'],
 ['s'],
 ['exporting'],
 ['nations'],
 ['that'],
 ['the'],
 ['row'],
 ['could'],
 ['inflict'],
 ['far'],
 ['reaching'],
 ['economic'],
 ['damage'],
 ['businessmen'],
 ['and'],
 ['officials'],
 ['said']]

In [85]:
# for defining the N-gram model
from collections import Counter, defaultdict

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for i in range(df.shape[0]):
    # for each trigram pair
    for w1, w2, w3 in create_trigram(df['cleaned_text'][i]):
        # count the occurance of word 3, given word 1 and word 2
        model[(w1, w2)][w3] += 1

In [86]:
# defined model
model

defaultdict(<function __main__.<lambda>()>,
            {('asian',
              'exporters'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'fear': 1}),
             ('exporters',
              'fear'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'damage': 1}),
             ('fear',
              'damage'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'from': 1}),
             ('damage',
              'from'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'u': 1, 'local': 1}),
             ('from',
              'u'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'s': 5, 'k': 1}),
             ('u',
              's'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'japan': 6,
                          'and': 34,
                      

In [87]:
#from nltk.util import ngrams

# Unigrams
#df['unigrams'] = df['tokenized_text'].apply(lambda x: list(ngrams(x, 1)))

# Bigrams
#df['bigrams'] = df['tokenized_text'].apply(lambda x: list(ngrams(x, 2)))

# Trigrams
#df['trigrams'] = df['tokenized_text'].apply(lambda x: list(ngrams(x, 3)))


In [88]:
#from collections import defaultdict

# Count occurrences of trigrams
#trigram_counts = defaultdict(lambda: defaultdict(lambda: 0))
#for sentence in df['trigrams']:
    #for trigram in sentence:
       # trigram_counts[trigram[:2]][trigram[2]] += 1




In [89]:
# predict the next word
dict(model["asian", "exporters"])

{'fear': 1}

In [90]:
#prob model

In [91]:
# creating the unigram list
unigram_dict = {}
for i in tqdm(range(df.shape[0])):
    # add word-count pair to the dictionary
    for word in df['unigram'][i]:   
        # check if the word is already in dictionary 
        if word[0] in unigram_dict:
            # increment count of word by 1 
            unigram_dict[word[0]] = unigram_dict[word[0]] + 1
        else:
            # add the word to dictionary with count 1 
            unigram_dict[word[0]] = 1

100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 31876.60it/s]


In [92]:
# unigram list
unigram_dict

{'asian': 13,
 'exporters': 52,
 'fear': 8,
 'damage': 29,
 'from': 1369,
 'u': 1117,
 's': 2864,
 'japan': 441,
 'rift': 1,
 'mounting': 5,
 'trade': 549,
 'friction': 8,
 'between': 191,
 'the': 12496,
 'and': 4599,
 'has': 974,
 'raised': 70,
 'fears': 13,
 'among': 44,
 'many': 54,
 'of': 6671,
 'asia': 14,
 'exporting': 12,
 'nations': 71,
 'that': 1376,
 'row': 3,
 'could': 291,
 'inflict': 1,
 'far': 55,
 'reaching': 7,
 'economic': 244,
 'businessmen': 15,
 'officials': 190,
 'said': 4649,
 'they': 518,
 'told': 237,
 'reuter': 27,
 'correspondents': 3,
 'in': 5070,
 'capitals': 3,
 'a': 4412,
 'move': 101,
 'against': 270,
 'might': 59,
 'boost': 45,
 'protectionist': 22,
 'sentiment': 10,
 'lead': 96,
 'to': 6337,
 'curbs': 12,
 'on': 1643,
 'american': 126,
 'imports': 242,
 'their': 230,
 'products': 200,
 'but': 650,
 'some': 278,
 'while': 164,
 'conflict': 3,
 'would': 926,
 'hurt': 11,
 'them': 58,
 'long': 119,
 'run': 21,
 'short': 87,
 'term': 120,
 'tokyo': 75,
 'lo

In [93]:
# Calculate probabilities
trigram_probabilities = defaultdict(lambda: defaultdict(lambda: 0))
for prefix, suffixes in trigram_counts.items():
    total_count = sum(suffixes.values())
    for suffix, count in suffixes.items():
        trigram_probabilities[prefix][suffix] = count / total_count

In [94]:
# find the overall frequency of words in the corpus
counts = Counter(unigram_dict)
counts

Counter({'the': 12496,
         'of': 6671,
         'to': 6337,
         'in': 5070,
         'said': 4649,
         'and': 4599,
         'a': 4412,
         'mln': 3064,
         's': 2864,
         'for': 2592,
         'vs': 2269,
         'dlrs': 2140,
         'it': 1970,
         'pct': 1811,
         'on': 1643,
         'lt': 1498,
         'is': 1420,
         'that': 1376,
         'by': 1375,
         'its': 1372,
         'from': 1369,
         'at': 1311,
         'cts': 1232,
         'year': 1212,
         'be': 1165,
         'with': 1127,
         'was': 1127,
         'u': 1117,
         'billion': 1112,
         'net': 1098,
         'will': 984,
         'has': 974,
         'he': 971,
         'would': 926,
         'as': 884,
         'an': 844,
         'company': 843,
         'not': 838,
         'inc': 723,
         'last': 654,
         'but': 650,
         'which': 643,
         'this': 642,
         'are': 625,
         'corp': 623,
         'shr': 622,
 

In [95]:
# vocabulary size
total_count = len(unigram_dict)
total_count

12579

In [96]:
# relative frequencies of each word
for word in counts:
    counts[word] /= float(total_count)

counts

Counter({'the': 0.9934017012481119,
         'of': 0.530328324986088,
         'to': 0.5037761348278877,
         'in': 0.40305270689243977,
         'said': 0.36958422768105575,
         'and': 0.3656093489148581,
         'a': 0.35074330232927897,
         'mln': 0.24358057079259082,
         's': 0.2276810557278003,
         'for': 0.2060577152396852,
         'vs': 0.18037999841004848,
         'dlrs': 0.17012481119325862,
         'it': 0.15661022338818667,
         'pct': 0.1439701089116782,
         'on': 0.13061451625725415,
         'lt': 0.11908736783528102,
         'is': 0.11288655696001272,
         'that': 0.1093886636457588,
         'by': 0.10930916607043485,
         'its': 0.10907067334446299,
         'from': 0.10883218061849113,
         'at': 0.10422132124970189,
         'cts': 0.09794101279910963,
         'year': 0.09635106129263057,
         'be': 0.0926146752524048,
         'with': 0.0895937673900946,
         'was': 0.0895937673900946,
         'u': 0.088798

In [97]:
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [98]:
# predict the next word
dict(model["asian", "exporters"])

{'fear': 1.0}

In [99]:
#df['trigrams']