# **Classical Approach**

In [0]:
cd sample_data

[Errno 2] No such file or directory: 'sample_data'
/content/sample_data


In [0]:
pip install nltk==3.4



**Data pre-processing**

In [1]:
import io
import os
import requests
import nltk
nltk.download('punkt')
url = "https://raw.githubusercontent.com/ronakkaoshik42/NLP/master/jane.txt"
text = requests.get(url).content.decode('utf8')
with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(text)
print(len(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


4452799


**Procuring meaningful and complete text**

In [0]:
import re
s=re.sub('Chapter [0-9]+','',text)
s=re.sub('Esq.','Esq',text)
s=s.lower()

**Generating sentence tokens, followed by word tokens**

In [0]:
from nltk.util import pad_sequence
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
sent_tokenize_list = sent_tokenize(s)
for i in range(len(sent_tokenize_list)):
  #sent_tokenize_list[i]=re.sub("[^\w\s]","",sent_tokenize_list[i])
  sent_tokenize_list[i]=re.sub("\\r\\n"," ",sent_tokenize_list[i])
g=[]
for i in range(len(sent_tokenize_list)):
  g.append(word_tokenize(sent_tokenize_list[i]))
tokens=0
for i in range(len(g)):
  tokens=tokens+len(g[i])
print('Total number of tokens in given dataset-->',tokens)

924146


**Splitting dataset for test and train**

In [0]:
from sklearn.model_selection import train_test_split
import numpy as np
x=np.array(g)
x_train, x_test= train_test_split(g,test_size=0.2,random_state=42)

**Compute MLE for unigram, bigram, trigrams, and quadgrams. How many n-grams are
possible and how many actually exist? Use the training corpus and NLTK library.**

In [0]:
from collections import Counter
l=[]
from nltk.lm.preprocessing import pad_both_ends
for i in range(len(x_train)):
  l.append(list(pad_both_ends(x_train[i], n=2)))#padding the sentences for start end marking
#detokenizing from sentence to words
flist=[]
for i in range(len(l)):
  for j in range(len(l[i])):
    flist.append(l[i][j])
#For unigrams
unigrams=Counter(flist)
c=0
for i in unigrams.keys():
    c=c+unigrams[i]
print('Unigrams Count, i.e, Vocabulary count -->',len(unigrams))
V=len(unigrams)
print('Theoretical Unigram count-->',V)


Unigrams Count, i.e, Vocabulary count --> 15581
Theoretical Unigram count--> 15581


In [0]:
#MLE for unigrams
unigrams_mle={} #Dictionary for storing MLE values of unigrams
for i in unigrams.keys():
    unigrams_mle[i]=unigrams[i]/tokens
print("The MLE for the word 'own' is-->",unigrams_mle["own"])

The MLE for the word 'own' is--> 0.0012790186831950795


Sentence generator for unigram

In [0]:
from nltk.lm import MLE
n = 1
x_train1, padded_sents1 = padded_everygram_pipeline(n, g)
model1 = MLE(n)
len(model1.vocab)
model1.fit(x_train1, padded_sents1)
print(model1.vocab)
len(model1.vocab)
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenize = TreebankWordDetokenizer().detokenize
def Generator(model, num_words, random_seed=42): # here num_words is the max word count
    x = []
    for i in model.generate(num_words, random_seed=random_seed):
        if i == '<s>':
            continue
        if i == '</s>':
            break
        x.append(i)
    return detokenize(x)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 16941 items>


In [0]:
generate_sent(model1, 3, random_seed=7)

'curricle curricle curricle'

As can be seen above, the generated sentence has repetitive terms. So unigram not much useful here.

In [0]:
#For bigrams
bigrams=Counter() #Stores the bigram counts for all the possible combinations from given dataset
for i in range(len(flist)-1):
    b=(flist[i],flist[i+1])
    bigrams[b]=bigrams.get(b,0)+1
print('Bigrams Count-->',len(bigrams))
print('Theoretical Bigram count-->',V**2)

Bigrams Count--> 176793
Theoretical Bigram count--> 242767561


In [0]:
#MLE for bigrams
bigrams_mle={}
for (i,j) in bigrams.keys():
    bigrams_mle[(i,j)]=bigrams[(i,j)]/unigrams[i]

In [0]:
n = 2
x_train2, padded_sents2 = padded_everygram_pipeline(n, g)
model2 = MLE(n)
len(model2.vocab)
model2.fit(x_train2, padded_sents2)
print(model2.vocab)
len(model2.vocab)
Generator(model2, 12, random_seed=23)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 16943 items>


'very well take up with us to-day on to think what will'

As can be seen above, the sentence generated using bigram models is much more better than previous one.

In [0]:
#For trigrams
trigrams=Counter()
for i in range(len(flist)-2):
    b=(flist[i],flist[i+1],flist[i+2])
    trigrams[b]=trigrams.get(b,0)+1
print('Trigrams Count-->',len(trigrams))
print('Theoretical Trigram count-->',V**3)

Trigrams Count--> 451217
Theoretical Trigram count--> 3782561367941


In [0]:
#MLE for trigrams
trigrams_mle={}
for (i,j,k) in trigrams.keys():
    trigrams_mle[(i,j,k)]=trigrams[(i,j,k)]/bigrams[(i,j)]

In [0]:
n = 3
x_train3, padded_sents3 = padded_everygram_pipeline(n, g)
model3 = MLE(n)
len(model3.vocab)
model3.fit(x_train3, padded_sents3)
print(model3.vocab)
len(model3.vocab)
Generator(model3, 12, random_seed=45)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 16943 items>


'a gentleman, and happiness.'

In [0]:
#For quadgrams
quadgrams=Counter()
for i in range(len(flist)-3):
    b=(flist[i],flist[i+1],flist[i+2],flist[i+3])
    quadgrams[b]=quadgrams.get(b,0)+1
print('Quadgrams Count-->',len(quadgrams))
print('Theoretical Quadgram count-->',V**3)

Quadgrams Count--> 631197
Theoretical Quadgram count--> 3782561367941


In [0]:
#MLE for quadgrams
quadgrams_mle={}
for (i,j,k,l) in quadgrams.keys():
    quadgrams_mle[(i,j,k,l)]=quadgrams[(i,j,k,l)]/trigrams[(i,j,k)]

In [0]:
n = 4
x_train4, padded_sents4 = padded_everygram_pipeline(n,g)
model4 = MLE(n)
len(model4.vocab)
model4.fit(x_train4, padded_sents4)
print(model4.vocab)
len(model4.vocab)
Generator(model4, 12, random_seed=37)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 16943 items>


'mrs. norris had not the smallest objection to his marrying their daughter'

The perplexity on the test dataset will turn out to be infinity without smoothing. Hence applying add 1 and good turing smoothing to bigram model and calculating the respective perplexity as follows:

In [0]:
bigrams_mle1={}
bigrams1={} #effective count after smoothing
for (i,j) in bigrams.keys():
    bigrams_mle1[(i,j)]=(bigrams[(i,j)]+1)/(unigrams[i]+V) #Add-1
    bigrams1[(i,j)]=unigrams[i]*bigrams_mle1[(i,j)]

In [0]:
test_words=x_test
def perplexityadd1(test_words,big_mle1):
    p=1
    for i in range(len(test_words)-2):
        prob=1/(unigrams[i]+V)
        p*=(1/prob_w)**(1/len(test_words))
    return p
print("The perplexity for add 1 smoothing is-->",perplexityadd1(test_words,big_mle1))
def perplexityturing(test_words,d,bigrams):
    p=1
    for i in range(len(test_words)-2):
        try:
            b=(test_words[i],test_words[i+1])
            c=bigrams[b]
            if(c>d):
                cstar=c-d
            else:
                cstar=d-c
            prob_w=cstar/924146 #It is the total tokens
        p*=(1/prob_w)**(1/len(test_words))
    return p
print("The perplexity for good turing smoothing is -->",perplexityturing(test_words,0.780,bigrams))


The perplexity for add 1 smoothing is--> 155.38144359579456
The perplexity for good turing smoothing is --> 92.05312200539007


# Neural **Approac**h

In [4]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import RNN
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [0]:
characters = sorted(list(set(text)))

n_to_char = {n:char for n, char in enumerate(characters)}
char_to_n = {char:n for n, char in enumerate(characters)}

In [6]:
vocab_size = len(characters)
print('Number of unique characters: ', vocab_size)
print(characters)

Number of unique characters:  85
['\n', '\r', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [0]:
X = []   # extracted sequences
Y = []   # the target - the follow up character
length = len(text)
seq_length = 100   #number of characters to consider before predicting the following character

In [8]:
for i in range(0, length - seq_length, 1):
    sequence = text[i:i + seq_length]
    label = text[i + seq_length]
    X.append([char_to_n[char] for char in sequence])
    Y.append(char_to_n[label])
    
print('Number of extracted sequences:', len(X))

Number of extracted sequences: 4452699


In [0]:
X_modified = np.reshape(X, (len(X), seq_length, 1))
X_modified = X_modified / float(len(characters))
Y_modified = np_utils.to_categorical(Y)

In [10]:
X_modified.shape, Y_modified.shape

((4452699, 100, 1), (4452699, 85))

In [11]:
print("X[0].shape = {}, Y[0].shape = {}".format(X_modified[0].shape, Y_modified[0].shape))
print("X[0]: ", X_modified[0])
print("Y[0]: ", Y_modified[0])

X[0].shape = (100, 1), Y[0].shape = (85,)
X[0]:  [[0.37647059]
 [0.77647059]
 [0.69411765]
 [0.87058824]
 [0.91764706]
 [0.74117647]
 [0.89411765]
 [0.02352941]
 [0.2       ]
 [0.01176471]
 [0.        ]
 [0.01176471]
 [0.        ]
 [0.01176471]
 [0.        ]
 [0.56470588]
 [0.78823529]
 [0.89411765]
 [0.02352941]
 [0.61176471]
 [0.69411765]
 [0.82352941]
 [0.91764706]
 [0.74117647]
 [0.89411765]
 [0.02352941]
 [0.4       ]
 [0.82352941]
 [0.82352941]
 [0.78823529]
 [0.85882353]
 [0.91764706]
 [0.14117647]
 [0.02352941]
 [0.85882353]
 [0.75294118]
 [0.02352941]
 [0.47058824]
 [0.74117647]
 [0.82352941]
 [0.82352941]
 [0.97647059]
 [0.84705882]
 [0.71764706]
 [0.77647059]
 [0.02352941]
 [0.43529412]
 [0.69411765]
 [0.82352941]
 [0.82352941]
 [0.14117647]
 [0.02352941]
 [0.78823529]
 [0.84705882]
 [0.02352941]
 [0.56470588]
 [0.85882353]
 [0.83529412]
 [0.74117647]
 [0.89411765]
 [0.90588235]
 [0.74117647]
 [0.91764706]
 [0.90588235]
 [0.77647059]
 [0.78823529]
 [0.89411765]
 [0.74117647]

In [0]:
model = Sequential()
model.add(LSTM(400, input_shape=(X_modified.shape[1], X_modified.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400))
model.add(Dropout(0.2))
model.add(Dense(Y_modified.shape[1], activation='softmax'))

In [25]:
ls

baseline-improvement-06-0.9927.hdf5


In [0]:
# load the network weights
filename = "baseline-improvement-06-0.9927.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [0]:
filepath="model_weights/baseline-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [27]:
model.fit(X_modified, Y_modified, epochs=2, batch_size=128, callbacks = callbacks_list)

Epoch 1/2

Epoch 00001: loss improved from inf to 1.23540, saving model to model_weights/baseline-improvement-01-1.2354.hdf5
Epoch 2/2

Epoch 00002: loss improved from 1.06508 to 0.99274, saving model to model_weights/baseline-improvement-02-0.9927.hdf5


In [0]:
start = np.random.randint(0, len(X)-1) # or generate random start

string_mapped = list(X[start])

full_string = [n_to_char[value] for value in string_mapped]

print("Seed:")
print("\"", ''.join(full_string), "\"")

In [0]:
for i in range(100):
    x = np.reshape(string_mapped,(1,len(string_mapped), 1))
    x = x / float(len(characters))

    pred_index = np.argmax(model.predict(x, verbose=0))
    seq = [n_to_char[value] for value in string_mapped]
    full_string.append(n_to_char[pred_index])
    
    string_mapped.append(pred_index)  # add the predicted character to the end
    string_mapped = string_mapped[1:len(string_mapped)]

In [0]:
txt=""
for char in full_string:
    txt = txt+char

In [30]:
print(start)
print(txt)

elizabeth did not quite equal her father in personal contentment. always to be presented with the date had had a disappointment. he had not been known to them as a boy


# **Comparison**

As is observed above, the text generated by this LSTM based architecture is much better grammatically when compared to the sentences generated by the n-gram models. However, the quadgram is quite close. (Will compare the perplexity scores and upload in a later version of the assignment, post deadline)