# Import Libraries and Data

In [2]:
import pandas as pd
import re
from tqdm import tqdm

In [3]:
# Read the sample Reuters sentences from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='sample_reuters_dataset.csv')
# Show the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,sentence_number,sentence_text
0,0,ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1,1,They told Reuter correspondents in Asian capit...
2,2,But some exporters said that while the conflic...
3,3,The U . S . Has said it will impose 300 mln dl...
4,4,Unofficial Japanese estimates put the impact o...


# Cleaning

In [4]:
# Normalise every sentence: delete each character that is not an ASCII letter,
# an apostrophe or a space (digits and punctuation are dropped, not replaced
# by a space), then lowercase what is left.
sentences_clean = [
    re.sub("[^a-zA-Z' ]", "", raw_sentence).lower()
    for raw_sentence in df['sentence_text']
]

In [7]:
# Single-column frame holding the cleaned sentences, one row per sentence.
dataset = pd.DataFrame({'Sentences': sentences_clean})
dataset.head(n=5)

Unnamed: 0,Sentences
0,asian exporters fear damage from u s japan r...
1,they told reuter correspondents in asian capit...
2,but some exporters said that while the conflic...
3,the u s has said it will impose mln dlrs of...
4,unofficial japanese estimates put the impact o...


# Vocabulary Dictionary

In [6]:
# Flatten the cleaned sentences into one list of whitespace-separated tokens.
all_words = [token for sentence in sentences_clean for token in sentence.split()]

# Map every distinct word to the number of times it occurs in the corpus.
words_dict = {}
for word in all_words:
    words_dict[word] = words_dict.get(word, 0) + 1

# Unigrams, Bigrams, Trigrams

In [8]:
def create_unigram(sentence):
    """Return the unigrams of ``sentence`` as a list of one-element lists.

    The sentence is split on whitespace; each token is wrapped in its own
    list, e.g. ``"a b"`` -> ``[["a"], ["b"]]``. An empty or whitespace-only
    sentence yields ``[]``.
    """
    return [[token] for token in sentence.split()]

In [9]:
def create_bigram(sentence):
    """Return every pair of adjacent tokens in ``sentence`` as two-element lists.

    The sentence is split on whitespace, e.g. ``"a b c"`` ->
    ``[["a", "b"], ["b", "c"]]``. Sentences with fewer than two tokens
    yield ``[]``.
    """
    words = sentence.split()
    return [[left, right] for left, right in zip(words, words[1:])]

In [10]:
def create_trigram(sentence):
    """Return every run of three adjacent tokens in ``sentence`` as lists.

    The sentence is split on whitespace, e.g. ``"a b c d"`` ->
    ``[["a", "b", "c"], ["b", "c", "d"]]``. Sentences with fewer than three
    tokens yield ``[]``.
    """
    words = sentence.split()
    return [list(window) for window in zip(words, words[1:], words[2:])]

In [11]:
# Unigram list of every cleaned sentence, in row order.
final_unigram = [create_unigram(sentence) for sentence in dataset['Sentences']]

# Keep the unigrams in their own column of the dataset.
dataset['unigram'] = final_unigram

In [12]:
# Bigram list of every cleaned sentence, in row order.
final_bigram = [create_bigram(sentence) for sentence in dataset['Sentences']]

# Keep the bigrams in their own column of the dataset.
dataset['bigram'] = final_bigram

In [13]:
# Preview the first rows of the dataset with its n-gram columns.
dataset.head()

Unnamed: 0,Sentences,unigram,bigram
0,asian exporters fear damage from u s japan r...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s..."
3,the u s has said it will impose mln dlrs of...,"[[the], [u], [s], [has], [said], [it], [will],...","[[the, u], [u, s], [s, has], [has, said], [sai..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]..."


# Trigram Model

In [14]:
# for defining the N-gram model
from collections import Counter, defaultdict

# Nested mapping: (w1, w2) -> {w3: how often w3 follows that word pair}.
# Unseen pairs and unseen third words both start from a count of 0.
model = defaultdict(lambda: defaultdict(int))

# Tally every trigram of every cleaned sentence.
for sentence in dataset['Sentences']:
    for first, second, third in create_trigram(sentence):
        # occurrences of the third word given the two words before it
        model[(first, second)][third] += 1

In [15]:
# Inspect the words recorded after the pair ("good", "to") and their counts.
# Note: indexing the defaultdict inserts an empty entry if the pair was never seen.
dict(model["good", "to"])

{'go': 1, 'have': 1, 'very': 1}

# Probabilistic Output

In [None]:
# Corpus-wide frequency of each word, accumulated from the per-sentence
# unigram lists stored in the dataset.
unigram_dict = {}
for sentence_unigrams in tqdm(dataset['unigram']):
    for unigram in sentence_unigrams:
        # the word is the first element of the unigram list
        token = unigram[0]
        # first sighting starts at 0, then every sighting adds 1
        unigram_dict[token] = unigram_dict.get(token, 0) + 1

100%|██████████| 10000/10000 [00:00<00:00, 41244.97it/s]


In [21]:
# Copy the raw unigram counts into a Counter; this is a separate object, so
# later changes to `counts` do not alter unigram_dict.
counts = Counter(unigram_dict)

In [22]:
# vocabulary size: the number of distinct words (keys of unigram_dict),
# not the total number of tokens in the corpus
total_count = len(unigram_dict)
total_count

12580

In [26]:
# Turn the raw unigram counts into relative frequencies (unigram probabilities).
# The denominator is the total number of tokens, i.e. the sum of all counts.
# It is computed here from `counts` itself rather than read from the global
# `total_count`, which holds the vocabulary size and is reassigned by the
# trigram normalisation cell, so the result no longer depends on cell
# execution order. Dividing by the vocabulary size would not yield values
# that sum to 1.
# Re-running this cell is harmless: the values then already sum to
# (approximately) 1, so the division leaves them essentially unchanged.
token_total = float(sum(counts.values()))
for word in counts:
    counts[word] /= token_total

In [24]:
# Convert each context's continuation counts into conditional probabilities
# P(w3 | w1, w2) by dividing by the number of trigrams seen for that context.
for w1_w2, continuations in model.items():
    # Dedicated local name on purpose: reusing the global `total_count` would
    # overwrite the vocabulary size computed in an earlier cell.
    context_total = float(sum(continuations.values()))
    # A context with no continuations has nothing to divide, so the inner loop
    # is skipped and no division by zero can occur.
    for w3 in continuations:
        continuations[w3] /= context_total

In [25]:
# Inspect the values stored for each word seen after the pair ("to", "a").
dict(model["to", "a"])

{'pact': 0.0056179775280898875,
 'surplus': 0.02247191011235955,
 'marked': 0.011235955056179775,
 'high': 0.016853932584269662,
 'squeeze': 0.0056179775280898875,
 'common': 0.0056179775280898875,
 'record': 0.05056179775280899,
 'calendar': 0.0056179775280898875,
 'number': 0.02247191011235955,
 'very': 0.028089887640449437,
 'total': 0.033707865168539325,
 'group': 0.033707865168539325,
 'query': 0.016853932584269662,
 'meeting': 0.0056179775280898875,
 'gain': 0.0056179775280898875,
 'buyer': 0.0056179775280898875,
 'final': 0.0056179775280898875,
 'rise': 0.0056179775280898875,
 'message': 0.0056179775280898875,
 'reporter': 0.0056179775280898875,
 'to': 0.0056179775280898875,
 'treasury': 0.0056179775280898875,
 'decline': 0.011235955056179775,
 'shortage': 0.0056179775280898875,
 'pl': 0.0056179775280898875,
 'more': 0.0056179775280898875,
 'quick': 0.0056179775280898875,
 'change': 0.0056179775280898875,
 'maximum': 0.016853932584269662,
 'seasonally': 0.02247191011235955,
 'ye