## Importing libraries

In [1]:
# import libraries needed, read the dataset

import nltk, re, pprint, string
from nltk import word_tokenize, sent_tokenize


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Reading dataset

In [3]:
string.punctuation = string.punctuation + '“' + '”' +'-' + '’' + '‘' + '—'
string.punctuation = string.punctuation.replace('.', '')
file = open('Downloads/corpus.txt', encoding = 'utf8').read()

## Preprocessing Data

In [4]:
#preprocess data

file_nl_removed = ""
for line in file:
    line_nl_removed = line.replace("\n", " ")           #removes newlines
    file_nl_removed += line_nl_removed

file_p = "".join([char for char in file_nl_removed if char not in string.punctuation])   #removes all special characters

In [4]:
sents = nltk.sent_tokenize(file_p)
print("The number of sentences is", len(sents)) #prints the number of sentences

words = nltk.word_tokenize(file_p)
print("The number of tokens is", len(words)) #prints the number of tokens

average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is", average_tokens) #prints the average number of tokens per sentence

unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) #prints the number of unique tokens

The number of sentences is 981
The number of tokens is 27361
The average number of tokens per sentence is 28
The number of unique tokens are 3039


In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Model creation

In [6]:
from nltk.util import ngrams
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

unigram=[]
bigram=[]
trigram=[]
fourgram=[]
tokenized_text = []

for sentence in sents:
    sentence = sentence.lower()
    sequence = word_tokenize(sentence) 
    for word in sequence:
        if (word =='.'):
            sequence.remove(word) 
        else:
            unigram.append(word)
    tokenized_text.append(sequence) 
    unigram.extend(list(ngrams(sequence,1)))
    bigram.extend(list(ngrams(sequence, 2)))              #unigram, bigram, trigram, and fourgram models are created
    trigram.extend(list(ngrams(sequence, 3)))
    fourgram.extend(list(ngrams(sequence, 4)))

def removal(x):                                    #removes ngrams containing only stopwords
    y = []
    for pair in x:
        count = 0
        for word in pair:
            if word in stop_words:
                count = count or 0
            else:
                count = count or 1
        if (count==1):
            y.append(pair)
    return(y)

unigram = removal(unigram)
bigram = removal(bigram)
trigram = removal(trigram)             
fourgram = removal(fourgram)

freq_uni = nltk.FreqDist(unigram)
freq_bi = nltk.FreqDist(bigram)
freq_tri = nltk.FreqDist(trigram)
freq_four = nltk.FreqDist(fourgram)

print("Most common n-grams without stopword removal and without add-1 smoothing: \n")
print ("Most common unigrams: ", freq_uni.most_common(5))      
print ("Most common bigrams: ", freq_bi.most_common(5))      #prints most common n-grams without add-1 smoothing and without stopword removal.
print ("\nMost common trigrams: ", freq_tri.most_common(5))
print ("\nMost common fourgrams: ", freq_four.most_common(5))

Most common n-grams without stopword removal and without add-1 smoothing: 

Most common unigrams:  [('the', 1630), ('and', 844), ('she', 537), ('of', 508), (('said',), 462)]
Most common bigrams:  [(('said', 'the'), 209), (('said', 'alice'), 115), (('the', 'queen'), 65), (('the', 'king'), 60), (('a', 'little'), 59)]

Most common trigrams:  [(('the', 'mock', 'turtle'), 51), (('the', 'march', 'hare'), 30), (('said', 'the', 'king'), 29), (('the', 'white', 'rabbit'), 21), (('said', 'the', 'hatter'), 21)]

Most common fourgrams:  [(('said', 'the', 'mock', 'turtle'), 19), (('she', 'said', 'to', 'herself'), 16), (('a', 'minute', 'or', 'two'), 11), (('said', 'the', 'march', 'hare'), 8), (('will', 'you', 'wont', 'you'), 8)]


## Stop words removal

In [7]:
#stopwords = code for downloading stop words through nltk

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

#prints top 10 unigrams, bigrams after removing stopwords

print("Most common n-grams with stopword removal and without add-1 smoothing: \n")
unigram_sw_removed = [p for p in unigram if p not in stop_words]
fdist = nltk.FreqDist(unigram_sw_removed)
print("Most common unigrams: ", fdist.most_common(10))

bigram_sw_removed = []
bigram_sw_removed.extend(list(ngrams(unigram_sw_removed, 2)))
fdist = nltk.FreqDist(bigram_sw_removed)
print("\nMost common bigrams: ", fdist.most_common(10))

Most common n-grams with stopword removal and without add-1 smoothing: 

Most common unigrams:  [(('said',), 462), ('alice', 385), (('alice',), 385), ('little', 128), (('little',), 128), ('one', 101), (('one',), 101), ('like', 85), (('like',), 85), ('know', 85)]

Most common bigrams:  [((('said',), ('alice',)), 120), (('mock', 'turtle'), 54), ((('mock',), ('turtle',)), 54), (('march', 'hare'), 31), ((('march',), ('hare',)), 31), ((('said',), ('king',)), 29), (('thought', 'alice'), 27), ((('thought',), ('alice',)), 26), (('white', 'rabbit'), 22), ((('white',), ('rabbit',)), 22)]


## Laplace Smoothing

In [8]:
#Add-1 smoothing is performed here:
            
ngrams_all = {1:[], 2:[], 3:[], 4:[]}
for i in range(4):
    for each in tokenized_text:
        for j in ngrams(each, i+1):
            ngrams_all[i+1].append(j);

ngrams_voc = {1:set([]), 2:set([]), 3:set([]), 4:set([])}

for i in range(4):
    for gram in ngrams_all[i+1]:
        if gram not in ngrams_voc[i+1]:
            ngrams_voc[i+1].add(gram)

total_ngrams = {1:-1, 2:-1, 3:-1, 4:-1}
total_voc = {1:-1, 2:-1, 3:-1, 4:-1}
for i in range(4):
    total_ngrams[i+1] = len(ngrams_all[i+1])
    total_voc[i+1] = len(ngrams_voc[i+1])                       
    
ngrams_prob = {1:[], 2:[], 3:[], 4:[]}
for i in range(4):
    for ngram in ngrams_voc[i+1]:
        tlist = [ngram]
        tlist.append(ngrams_all[i+1].count(ngram))
        ngrams_prob[i+1].append(tlist)
    
for i in range(4):
    for ngram in ngrams_prob[i+1]:
        ngram[-1] = (ngram[-1]+1)/(total_ngrams[i+1] + total_voc[i+1])             #add-1

In [17]:
#smoothed models without stopwords removed are used
str1 = 'it was a commercial '
str2 = 'she said of the '
str3 = 'but the way i look at it people will like your next album '
str4 = 'the song instantly garnered attention for its scathing explicit lyrics and a subsequent music '
str5 = 'after the success of you oughta know the album other hit singles helped'
str6 = 'all i really want and hand in my pocket followed but the fourth single'
str7 = 'you learn and head over feet the fifth and sixth singles respectively'
str8 = 'although the track was never commercially '
str9 = 'morissette popularity grew significantly in canada where the album'
str10= 'the album was also a bestseller'

token_1 = word_tokenize(str1)
token_2 = word_tokenize(str2)
token_3 = word_tokenize(str3)
token_4 = word_tokenize(str4)
token_5 = word_tokenize(str5)
token_6 = word_tokenize(str6)
token_7 = word_tokenize(str7)
token_8 = word_tokenize(str8)
token_9 = word_tokenize(str9)
token_10 = word_tokenize(str10)

ngram_1 = {1:[], 2:[], 3:[]}                  
ngram_2 = {1:[], 2:[], 3:[]}
ngram_3 = {1:[], 2:[], 3:[]}                  
ngram_4 = {1:[], 2:[], 3:[]}
ngram_5 = {1:[], 2:[], 3:[]}                 
ngram_6 = {1:[], 2:[], 3:[]}
ngram_7 = {1:[], 2:[], 3:[]}                  
ngram_8 = {1:[], 2:[], 3:[]}
ngram_9 = {1:[], 2:[], 3:[]}                  
ngram_10 = {1:[], 2:[], 3:[]}


for i in range(3):
    ngram_1[i+1] = list(ngrams(token_1, i+1))[-1]
    ngram_2[i+1] = list(ngrams(token_2, i+1))[-1]
    ngram_3[i+1] = list(ngrams(token_3, i+1))[-1]
    ngram_4[i+1] = list(ngrams(token_4, i+1))[-1]
    ngram_5[i+1] = list(ngrams(token_5, i+1))[-1]
    ngram_6[i+1] = list(ngrams(token_6, i+1))[-1]
    ngram_7[i+1] = list(ngrams(token_7, i+1))[-1]
    ngram_8[i+1] = list(ngrams(token_8, i+1))[-1]
    ngram_9[i+1] = list(ngrams(token_9, i+1))[-1]
    ngram_10[i+1] = list(ngrams(token_10, i+1))[-1]
    
    

print("String 1: ", ngram_1)
print("String 2: ",ngram_2)
print("String 3: ", ngram_3)
print("String 4: ", ngram_4)
print("String 5: ", ngram_5)
print("String 6: ", ngram_6)
print("String 7: ", ngram_7)
print("String 8: ", ngram_8)
print("String 9: ", ngram_9)
print("String 10: ", ngram_10)




String 1:  {1: ('the',), 2: ('said', 'the'), 3: ('alice', 'said', 'the')}
String 2:  {1: ('was',), 2: ('she', 'was'), 3: ('that', 'she', 'was')}
String 3:  {1: ('album',), 2: ('next', 'album'), 3: ('your', 'next', 'album')}
String 4:  {1: ('music',), 2: ('subsequent', 'music'), 3: ('a', 'subsequent', 'music')}
String 5:  {1: ('helped',), 2: ('singles', 'helped'), 3: ('hit', 'singles', 'helped')}
String 6:  {1: ('single',), 2: ('fourth', 'single'), 3: ('the', 'fourth', 'single')}
String 7:  {1: ('respectively',), 2: ('singles', 'respectively'), 3: ('sixth', 'singles', 'respectively')}
String 8:  {1: ('commercially',), 2: ('never', 'commercially'), 3: ('was', 'never', 'commercially')}
String 9:  {1: ('album',), 2: ('the', 'album'), 3: ('where', 'the', 'album')}
String 10:  {1: ('bestseller',), 2: ('a', 'bestseller'), 3: ('also', 'a', 'bestseller')}


## Bigram and trigram model

In [18]:
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_1 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_1[i+1]:       #to find predictions based on highest probability of n-grams                   
            count +=1
            pred_1[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_1[i+1].append("NOT FOUND")           #if no word prediction is found, replace with NOT FOUND
            count +=1

In [11]:
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_2 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_2[i+1]:
            count +=1
            pred_2[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_2[i+1].append("\0")
            count +=1

In [12]:
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_3 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_3[i+1]:
            count +=1
            pred_3[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_3[i+1].append("\0")
            count +=1

In [13]:
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_4 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_4[i+1]:
            count +=1
            pred_4[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_4[i+1].append("\0")
            count +=1

In [14]:
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_5 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_5[i+1]:
            count +=1
            pred_5[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_5[i+1].append("\0")
            count +=1
#6th sentence            
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_6 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_6[i+1]:
            count +=1
            pred_6[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_6[i+1].append("\0")
            count +=1

#7th sentence
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)

pred_7 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_7[i+1]:
            count +=1
            pred_7[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_7[i+1].append("\0")
            count +=1
            
#8th sentence
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_8 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_8[i+1]:
            count +=1
            pred_8[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_8[i+1].append("\0")
            count +=1
            
#9th
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_9 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_9[i+1]:
            count +=1
            pred_9[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_9[i+1].append("\0")
            count +=1
            
#10th 
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_10 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_10[i+1]:
            count +=1
            pred_10[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_10[i+1].append("\0")
            count +=1

In [123]:
print("Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams\n")
print("String 1 - it was a commercial-\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}\n" .format(pred_1[1], pred_1[2], pred_1[3]))
print("String 2 - she said of the-\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}\n" .format(pred_2[1], pred_2[2], pred_2[3]))


Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams

String 1 - it was a commercial-

Bigram model predictions: ['arts', 'art', 'or', 'failure', 'NOT FOUND']
Trigram model predictions: ['or', 'failure', 'NOT FOUND', 'NOT FOUND', 'NOT FOUND']
Fourgram model predictions: ['failure', 'NOT FOUND', 'NOT FOUND', 'NOT FOUND', 'NOT FOUND']

String 2 - she said of the-

Bigram model predictions: ['num', 'same', 'first', 'album', 'year']
Trigram model predictions: ['year', 'num', 'human', 'community', 'word']
Fourgram model predictions: ['album', '\x00', '\x00', '\x00', '\x00']



In [124]:
print("String 3 - but the way i look at it people will like your next album '-\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_3[1], pred_3[2]))

print("String 4 - the song instantly garnered attention for its scathing explicit lyrics and a subsequent music '-\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_4[1], pred_4[2]))

print("String 5'-\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_5[1], pred_5[2]))

print("String 6 --\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_6[1], pred_6[2]))

print("String 7 - \n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_7[1], pred_7[2]))

print("String 8 -\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_8[1], pred_8[2]))

print("String 9 - \n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_9[1], pred_9[2]))

print("String 10 - \n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\n" .format(pred_10[1], pred_10[2]))

String 3 - but the way i look at it people will like your next album '-

Bigram model predictions: ['was', 'by', 'alanis', 'of', 'with']
Trigram model predictions: ['if', '\x00', '\x00', '\x00', '\x00']

String 4 - the song instantly garnered attention for its scathing explicit lyrics and a subsequent music '-

Bigram model predictions: ['video', 'dvd', 'and', 'for', 'producer']
Trigram model predictions: ['video', '\x00', '\x00', '\x00', '\x00']

String 5'-

Bigram model predictions: ['send', 'an', '\x00', '\x00', '\x00']
Trigram model predictions: ['send', '\x00', '\x00', '\x00', '\x00']

String 6 --

Bigram model predictions: ['of', 'digit', 'was', 'digits', 'number']
Trigram model predictions: ['ironic', '\x00', '\x00', '\x00', '\x00']

String 7 - 

Bigram model predictions: ['the', 'kept', '\x00', '\x00', '\x00']
Trigram model predictions: ['kept', '\x00', '\x00', '\x00', '\x00']

String 8 -

Bigram model predictions: ['than', 'released', '\x00', '\x00', '\x00']
Trigram model pred