## Importing libraries

In [21]:
import nltk, re, pprint, string
from nltk import word_tokenize, sent_tokenize
string.punctuation = string.punctuation + '“' + '”' +'-' + '’' + '‘' + '—'
string.punctuation = string.punctuation.replace('.', '')
file = open('Downloads/data.txt', encoding = 'utf8').read()

In [22]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Preprocessing data

In [23]:
file_nl_removed = ""
for line in file:
    line_nl_removed = line.replace("\n", " ")           #removes newlines
    file_nl_removed += line_nl_removed

file_p = "".join([char for char in file_nl_removed if char not in string.punctuation])   #removes all special characters

In [24]:
sents = nltk.sent_tokenize(file_p)
print("The number of sentences is", len(sents)) #prints the number of sentences

words = nltk.word_tokenize(file_p)
print("The number of tokens is", len(words)) #prints the number of tokens

average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is", average_tokens) #prints the average number of tokens per sentence

unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) #prints the number of unique tokens

The number of sentences is 996
The number of tokens is 23931
The average number of tokens per sentence is 24
The number of unique tokens are 4741


In [25]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
from nltk.util import ngrams
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

unigram=[]
bigram=[]
trigram=[]
fourgram=[]
tokenized_text = []

for sentence in sents:
    sentence = sentence.lower()
    sequence = word_tokenize(sentence) 
    for word in sequence:
        if (word =='.'):
            sequence.remove(word) 
        else:
            unigram.append(word)
    tokenized_text.append(sequence) 
    unigram.extend(list(ngrams(sequence,1)))
    bigram.extend(list(ngrams(sequence, 2)))              #unigram, bigram, trigram, and fourgram models are created
    trigram.extend(list(ngrams(sequence, 3)))
    fourgram.extend(list(ngrams(sequence, 4)))

def removal(x):                                    #removes ngrams containing only stopwords
    y = []
    for pair in x:
        count = 0
        for word in pair:
            if word in stop_words:
                count = count or 0
            else:
                count = count or 1
        if (count==1):
            y.append(pair)
    return(y)

unigram = removal(unigram)
bigram = removal(bigram)
trigram = removal(trigram)             
fourgram = removal(fourgram)

freq_uni = nltk.FreqDist(unigram)
freq_bi = nltk.FreqDist(bigram)
freq_tri = nltk.FreqDist(trigram)
freq_four = nltk.FreqDist(fourgram)

print("Most common n-grams without stopword removal and without add-1 smoothing: \n")
print ("Most common unigrams: ", freq_uni.most_common(5))      
print ("Most common bigrams: ", freq_bi.most_common(5))      #prints most common n-grams without add-1 smoothing and without stopword removal.
print ("\nMost common trigrams: ", freq_tri.most_common(5))
print ("\nMost common fourgrams: ", freq_four.most_common(5))

Most common n-grams without stopword removal and without add-1 smoothing: 

Most common unigrams:  [('the', 1609), ('of', 896), ('and', 645), ('in', 584), ('num', 392)]
Most common bigrams:  [(('in', 'num'), 59), (('the', 'num'), 39), (('of', 'art'), 37), (('num', 'and'), 25), (('num', 'num'), 23)]

Most common trigrams:  [(('of', 'the', 'year'), 15), (('one', 'of', 'the'), 14), (('jagged', 'little', 'pill'), 13), (('the', 'num', 'century'), 11), (('of', 'the', 'num'), 11)]

Most common fourgrams:  [(('on', 'num', 'june', 'num'), 8), (('on', 'the', 'same', 'day'), 5), (('in', 'the', 'united', 'states'), 5), (('in', 'the', 'early', 'num'), 5), (('administrators', 'are', 'expected', 'to'), 5)]


In [27]:
#stopwords = code for downloading stop words through nltk

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

#prints top 10 unigrams, bigrams after removing stopwords

print("Most common n-grams with stopword removal and without add-1 smoothing: \n")
unigram_sw_removed = [p for p in unigram if p not in stop_words]
fdist = nltk.FreqDist(unigram_sw_removed)
print("Most common unigrams: ", fdist.most_common(10))

bigram_sw_removed = []
bigram_sw_removed.extend(list(ngrams(unigram_sw_removed, 2)))
fdist = nltk.FreqDist(bigram_sw_removed)
print("\nMost common bigrams: ", fdist.most_common(10))

Most common n-grams with stopword removal and without add-1 smoothing: 

Most common unigrams:  [('num', 392), (('num',), 392), ('art', 160), (('art',), 160), ('turing', 119), (('turing',), 119), ('morissette', 85), (('morissette',), 85), ('also', 67), (('also',), 67)]

Most common bigrams:  [(('num', 'num'), 44), ((('num',), ('num',)), 41), (('num', 'morissette'), 17), ((('num',), ('morissette',)), 17), (('num', 'turing'), 15), ((('num',), ('turing',)), 15), (('num', 'century'), 14), ((('num',), ('century',)), 14), (('june', 'num'), 14), ((('june',), ('num',)), 14)]


## Laplace smoothing

In [28]:
#Add-1 smoothing is performed here:
            
ngrams_all = {1:[], 2:[], 3:[], 4:[]}
for i in range(4):
    for each in tokenized_text:
        for j in ngrams(each, i+1):
            ngrams_all[i+1].append(j);

ngrams_voc = {1:set([]), 2:set([]), 3:set([]), 4:set([])}

for i in range(4):
    for gram in ngrams_all[i+1]:
        if gram not in ngrams_voc[i+1]:
            ngrams_voc[i+1].add(gram)

total_ngrams = {1:-1, 2:-1, 3:-1, 4:-1}
total_voc = {1:-1, 2:-1, 3:-1, 4:-1}
for i in range(4):
    total_ngrams[i+1] = len(ngrams_all[i+1])
    total_voc[i+1] = len(ngrams_voc[i+1])                       
    
ngrams_prob = {1:[], 2:[], 3:[], 4:[]}
for i in range(4):
    for ngram in ngrams_voc[i+1]:
        tlist = [ngram]
        tlist.append(ngrams_all[i+1].count(ngram))
        ngrams_prob[i+1].append(tlist)
    
for i in range(4):
    for ngram in ngrams_prob[i+1]:
        ngram[-1] = (ngram[-1]+1)/(total_ngrams[i+1] + total_voc[i+1])             #add-1

## Unigram Model

In [29]:
freq_uni.most_common(1)


[('the', 1609)]

In [30]:
from nltk.util import ngrams
f = open("Downloads/test.txt", "r")
preds=[]
counter=0
for i in f:
    ngram[1]=list(ngrams(i,1))[-1]
    preds.append(freq_uni.most_common(1))
    #print(str("".join(i))+""+str(preds[counter][0][0]))
    print(str(preds[counter][0][0]))
    counter+=1
          

the
the
the
the
the
the
the
the
the
the
the
