# Our RNN-LSTM Model for training

## Sources
1. [Beam search Keras code](https://gist.github.com/udibr/67be473cf053d8c38730)
2. [Keras next character prediction](https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py)
3. [Skip Thoughts Paper](http://arxiv.org/pdf/1506.06726v1.pdf)

In [37]:
import csv
import itertools
import operator
import numpy as np
import nltk

import matplotlib.pyplot as plt
%matplotlib inline

In [38]:
# Variables to be used later
vocabulary_size = 2**13
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
file_name = 'reddit-comments-2015-08.csv'

In [39]:
# Read csv file and append sentences into text after adding delimiters
text = ""
with open(file_name) as f:
    reader = csv.reader(f,skipinitialspace=True)
    for x in reader:
        text += "%s %s %s " %(sentence_start_token,x[0],sentence_end_token)
    
print "The text is %d characters long" %len(text)

The text is 7991963 characters long


In [40]:
import string
printable = set(string.printable)
text = filter(lambda x: x in printable, text)

In [41]:
tokenized_text = nltk.word_tokenize(text)
print '%d tokens generated' %(len(tokenized_text))

1587838 tokens generated


In [42]:
# Remove long strings
temp = [s for s in tokenized_text if len(s) <= 30]
tokenized_text = temp
print len(tokenized_text)

1584481


In [43]:
## Counting the word frequencies in the word_tokenize
word_freq = nltk.FreqDist(tokenized_text)
print "Found %d unique word tokens"%(len(word_freq))

Found 73292 unique word tokens


In [44]:
## Hashing the most frequent words into the vocabulary
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([w,i] for i,w in enumerate(index_to_word))
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times"%(vocab[-1][0],vocab[-1][1])

Using vocabulary size 8192.
The least frequent word in our vocabulary is 'consist' and appeared 10 times


In [45]:
# Replace all words not in our vocabulary with the unknown token
for i, word in enumerate(tokenized_text):
    if word not in word_to_index:
        tokenized_text[i] = unknown_token

In [46]:
maxlen = 10
step = 3
sentences = np.empty([len(range(0, len(tokenized_text) - maxlen, step)), maxlen], dtype='a30')
next_word = np.empty([len(range(0, len(tokenized_text) - maxlen, step)), 1], dtype ='a30')

count = 0
for i in range(0, len(tokenized_text) - maxlen, step):
    sentences[count] = tokenized_text[i: i + maxlen]
    next_word[count] = tokenized_text[i + maxlen]
    count += 1
print('nb sequences:', len(sentences))

('nb sequences:', 528157)


In [47]:
n = next_word.tolist()
n = [l[0] for l in n]
from collections import Counter
count = Counter(n)
count.most_common(10)

[('UNKNOWN_TOKEN', 43554),
 ('.', 22820),
 (',', 17359),
 ('the', 16197),
 ('to', 11584),
 ('a', 10436),
 ('I', 10073),
 ('and', 9497),
 ('of', 7617),
 ('you', 6573)]

In [48]:
# Since we have memory issue, we will free variables that will not be used now
import gc
del text, tokenized_text, word_freq, vocab
gc.collect()

0

## X_train and y_train

In [49]:
# Creating a small dataset out of our given sentences
# We will train in batches
subdata_size = 32768
sents = sentences[subdata_size:2*subdata_size]
nexts = next_word[subdata_size:2*subdata_size]

In [50]:
print('Vectorization...')
X = np.zeros((len(sents), maxlen), dtype='int_')
y = np.zeros((len(sents), vocabulary_size), dtype=np.bool)
for i, sentence in enumerate(sents):
    for t, word in enumerate(sentence):
        X[i, t] = word_to_index[word]
    y[i, word_to_index[nexts[i][0]]] = 1

Vectorization...


In [51]:
X.shape, y.shape

((32768, 10), (32768, 8192))

In [52]:
## Saving the numpy array for future
# np.savez('x_train', X)
# np.savez('y_train', y)

### Our Model

In [53]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense,TimeDistributed, Dense, Dropout

EMBED_HIDDEN_SIZE = 512
LSTM_SIZE = 512

model = Sequential()
model.add(Embedding(vocabulary_size, EMBED_HIDDEN_SIZE, input_length=maxlen, name='Embedding'))
model.add(Dropout(0.3, name='Dropout1'))
model.add(LSTM(EMBED_HIDDEN_SIZE, return_sequences=True, input_shape=(maxlen, EMBED_HIDDEN_SIZE), name='LSTM1'))
model.add(Dropout(0.2, name='Dropout2'))
model.add(LSTM(512, return_sequences=False, name='LSTM2'))
model.add(Dropout(0.2, name='Dropout3'))
model.add(Dense(vocabulary_size, activation='softmax', name='Dense'))

In [54]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
Embedding (Embedding)              (None, 10, 512)     4194304     embedding_input_7[0][0]          
____________________________________________________________________________________________________
Dropout1 (Dropout)                 (None, 10, 512)     0           Embedding[0][0]                  
____________________________________________________________________________________________________
LSTM1 (LSTM)                       (None, 10, 512)     2099200     Dropout1[0][0]                   
____________________________________________________________________________________________________
Dropout2 (Dropout)                 (None, 10, 512)     0           LSTM1[0][0]                      
___________________________________________________________________________________________

In [55]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [99]:
model.fit(X, y, batch_size=128, nb_epoch=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f614fcfe550>

In [100]:
model.save_weights('/home/najeeb/Desktop/Dataset/mynz_jmi/zafar.hdf5')

In [56]:
model.load_weights('/home/najeeb/Desktop/Dataset/mynz_jmi/weights_final.hdf5')

In [63]:
sentence = 'SENTENCE_START While on our way to the show we saw'
x = nltk.word_tokenize(sentence)
a = [word_to_index[i] for i in x] 
x_predict = np.array([a])
y = model.predict_proba(x_predict)
top_10 = np.argsort(y[0])[::-1][:10]
for i in top_10:
    print index_to_word[i]

the
30
him
her
maybe
and
doing
your
up
his


# Word Prediction Using the Model

In [72]:
## Function to convert a sentence in a matrix
def sentence_lemmatized(sentence):
    X = nltk.word_tokenize(sentence)
    A = [word_to_index[w] if w in word_to_index else word_to_index[unknown_token] for w in X] 
    X_predict = np.array([A])
    return X_predict
## Function to predict the next word
def predict_word(X):
    yProb = model.predict_proba(X,verbose = 0)
    top = np.argsort(yProb[0])[::-1][:20]
    if index_to_word[top[0]] == unknown_token:
        return index_to_word[top[1]]
    else:
        return index_to_word[top[0]]

In [73]:
## Generating words so as to form a sentence
def form_text(sentence,para_length):
    print "Forming text(%d words)"%(para_length)
    print "--------------------------------------------------"
    i,flag,text = 0,0,[]
    while (i < para_length):
        s = sentence_lemmatized(sentence)
        word = predict_word(s)
        if flag == 1:
            text.append(word)
            i += 1
        if word == '.' or word == sentence_start_token:
            flag = 1
        sentence = sentence_start_token + ' ' + ' '.join(sentence.split(' ')[2:]) + " " + word
    return " ".join(text)

In [74]:
def generate_paragraphs(n_texts,sentence,para_length):
    for i in range(n_texts):
        string = form_text(sentence,para_length)
        print "Paragraph Number %d"%(i+1)
        print "--------------------------------------------------"
        print string
        print "--------------------------------------------------"
        sentence = sentence_start_token+" "+" ".join(string.split(' ')[0:9])

In [75]:
sentence = "SENTENCE_START While on our way to the show we saw"
generate_paragraphs(2,sentence,100)

Forming text(100 words)
--------------------------------------------------
Paragraph Number 1
--------------------------------------------------
So he 'd always make to buy running ! I thought my faith less being able to deal with any things that he was telling him the OP the last different language . But since the government is not an OP ) . You 're trying to say that the only comment very similar wall , and I do hope I 'd be able to do a lot of people who seem help sense who would be some other research . I have been removed because you have being helped ? SENTENCE_END SENTENCE_START Your post has been removed because its
--------------------------------------------------
Forming text(100 words)
--------------------------------------------------
Paragraph Number 2
--------------------------------------------------
But since the government is not an OP ) . You 're trying to say that the only comment very similar wall , and I do hope I 'd be able to do a lot of people who seem help sen