# Our RNN-LSTM Model for training

## Sources
1. [Beam search Keras code](https://gist.github.com/udibr/67be473cf053d8c38730)
2. [Keras next character prediction](https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py)
3. [Skip Thoughts Paper](http://arxiv.org/pdf/1506.06726v1.pdf)

In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Shared configuration used by the preprocessing and model cells below.
vocabulary_size = 1 << 13  # 8192 slots; the last one is reserved for the unknown token
unknown_token = "UNKNOWN_TOKEN"  # stands in for every out-of-vocabulary word
sentence_start_token = "SENTENCE_START"  # marker placed before each comment
sentence_end_token = "SENTENCE_END"  # marker placed after each comment
file_name = 'reddit-comments-2015-08.csv'  # CSV whose first column holds the comment text

In [4]:
# Read the CSV and build one long string in which the first column of every
# row is wrapped in the sentence start/end markers.
# The per-row pieces are collected in a list and joined once, rather than
# growing a multi-megabyte string with += on every row.
pieces = []
with open(file_name) as f:
    reader = csv.reader(f, skipinitialspace=True)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; row[0] would raise IndexError.
            continue
        pieces.append("%s %s %s " % (sentence_start_token, row[0], sentence_end_token))
text = "".join(pieces)
del pieces  # the fragments are no longer needed once joined

print("The text is %d characters long" % len(text))

The text is 7991963 characters long


In [5]:
# Strip every character that is not listed in string.printable.
import string
printable = set(string.printable)
text = "".join(ch for ch in text if ch in printable)

In [6]:
# Word-level tokenization of the cleaned corpus with NLTK.
tokenized_text = nltk.word_tokenize(text)
print('%d tokens generated' % len(tokenized_text))

1587838 tokens generated


In [7]:
# Remove long strings: keep only tokens of at most 30 characters, the same
# width as the fixed-size 'a30' string dtype used for the window arrays later.
# The filtered list is bound straight to tokenized_text. The original cell
# also left it bound to a second global name (temp), so the later
# `del tokenized_text` could not actually release the list.
tokenized_text = [token for token in tokenized_text if len(token) <= 30]
print(len(tokenized_text))

1584481


In [8]:
# Count how often each distinct token occurs in the tokenized corpus.
word_freq = nltk.FreqDist(tokenized_text)
print("Found %d unique word tokens" % len(word_freq))

Found 73292 unique word tokens


In [9]:
# Build the vocabulary: the (vocabulary_size - 1) most frequent tokens followed
# by the unknown token in the last slot, with lookups in both directions.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [pair[0] for pair in vocab] + [unknown_token]
word_to_index = {word: index for index, word in enumerate(index_to_word)}

rarest_word, rarest_count = vocab[-1]
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times" % (rarest_word, rarest_count))

Using vocabulary size 8192.
The least frequent word in our vocabulary is 'consist' and appeared 10 times


In [10]:
# Replace all words not in our vocabulary with the unknown token.
# Slice assignment rewrites the existing list object in place.
tokenized_text[:] = [
    word if word in word_to_index else unknown_token
    for word in tokenized_text
]

In [11]:
# Slide a window of `maxlen` tokens over the corpus in strides of `step`.
# Each window becomes one row of `sentences`; the token right after the
# window becomes the matching row of `next_word`.
maxlen = 10
step = 3
window_starts = range(0, len(tokenized_text) - maxlen, step)
nb_windows = len(window_starts)

# 'a30' is a fixed-width 30-byte string dtype.
sentences = np.empty([nb_windows, maxlen], dtype='a30')
next_word = np.empty([nb_windows, 1], dtype='a30')

for row, start in enumerate(window_starts):
    stop = start + maxlen
    sentences[row] = tokenized_text[start:stop]
    next_word[row] = tokenized_text[stop]
print('nb sequences:', len(sentences))

('nb sequences:', 528157)


In [12]:
# Show the ten most frequent target words across all windows.
from collections import Counter
count = Counter(row[0] for row in next_word.tolist())
count.most_common(10)

[('UNKNOWN_TOKEN', 43554),
 ('.', 22820),
 (',', 17359),
 ('the', 16197),
 ('to', 11584),
 ('a', 10436),
 ('I', 10073),
 ('and', 9497),
 ('of', 7617),
 ('you', 6573)]

In [13]:
# Memory is tight, so unbind the large intermediates that are no longer
# needed and trigger a garbage-collection pass right away.
import gc
del text
del tokenized_text
del word_freq
del vocab
gc.collect()

0

## X_train and y_train

In [14]:
# Work on one chunk of the windows at a time: rows
# [subdata_size, 2 * subdata_size) of the inputs and of their targets.
subdata_size = 32768
chunk = slice(subdata_size, 2 * subdata_size)
sents, nexts = sentences[chunk], next_word[chunk]

In [15]:
print('Vectorization...')
# X holds one vocabulary index per window position; y is a one-hot row per
# window marking the word that follows it.
X = np.zeros((len(sents), maxlen), dtype='int_')
# The builtin bool replaces np.bool: np.bool was only an alias for it, and
# that alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
y = np.zeros((len(sents), vocabulary_size), dtype=bool)
for i, sentence in enumerate(sents):
    for t, word in enumerate(sentence):
        X[i, t] = word_to_index[word]
    # nexts[i] is a length-1 row, so [0] extracts the target word itself.
    y[i, word_to_index[nexts[i][0]]] = 1

Vectorization...


In [16]:
# Sanity check: (rows, maxlen) for the inputs, (rows, vocabulary_size) for the targets.
(X.shape, y.shape)

((32768, 10), (32768, 8192))

In [15]:
## Save the vectorized arrays for future reuse (currently disabled)
# np.savez('x_train', X)
# np.savez('y_train', y)

### Our Model

In [17]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout

# Layer widths. Both LSTM layers take their size from LSTM_SIZE; the original
# cell defined LSTM_SIZE but never used it (LSTM1 borrowed EMBED_HIDDEN_SIZE
# and LSTM2 hard-coded 512). Every width is still 512, so the network is
# unchanged.
EMBED_HIDDEN_SIZE = 512
LSTM_SIZE = 512

# Next-word model: embedding -> two stacked LSTMs -> softmax over the
# vocabulary, with dropout after the embedding and after each LSTM.
model = Sequential()
model.add(Embedding(vocabulary_size, EMBED_HIDDEN_SIZE, input_length=maxlen, name='Embedding'))
model.add(Dropout(0.3, name='Dropout1'))
# return_sequences=True passes the full sequence on to the second LSTM.
model.add(LSTM(LSTM_SIZE, return_sequences=True, input_shape=(maxlen, EMBED_HIDDEN_SIZE), name='LSTM1'))
model.add(Dropout(0.2, name='Dropout2'))
# return_sequences=False keeps only the final step for the classifier.
model.add(LSTM(LSTM_SIZE, return_sequences=False, name='LSTM2'))
model.add(Dropout(0.2, name='Dropout3'))
model.add(Dense(vocabulary_size, activation='softmax', name='Dense'))


Using Theano backend.


In [18]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
Embedding (Embedding)            (None, 10, 512)       4194304     embedding_input_1[0][0]          
____________________________________________________________________________________________________
Dropout1 (Dropout)               (None, 10, 512)       0           Embedding[0][0]                  
____________________________________________________________________________________________________
LSTM1 (LSTM)                     (None, 10, 512)       2099200     Dropout1[0][0]                   
____________________________________________________________________________________________________
Dropout2 (Dropout)               (None, 10, 512)       0           LSTM1[0][0]                      
___________________________________________________________________________________________

In [19]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy reported as an extra metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [99]:
# Train on the vectorized chunk (X, y) for 50 epochs in mini-batches of 128 rows.
model.fit(X, y, batch_size=128, nb_epoch=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f614fcfe550>

In [100]:
# Write the model's current weights to zafar1.hdf5.
model.save_weights('zafar1.hdf5')

In [20]:
# Load the weights stored in zafar1.hdf5 into the model.
model.load_weights('zafar1.hdf5')

In [25]:
# Print the ten most likely next words for a hand-written prompt.
# NOTE(review): the prompt presumably has to tokenize to exactly `maxlen`
# tokens, the input_length the embedding layer was built with - confirm.
sentence = 'SENTENCE_START I managed to go there but I could not'
prompt_tokens = nltk.word_tokenize(sentence)

# Words outside the vocabulary fall back to the unknown token instead of
# raising KeyError, the same substitution applied to the training text.
unknown_index = word_to_index[unknown_token]
prompt_indices = [word_to_index.get(token, unknown_index) for token in prompt_tokens]
x_predict = np.array([prompt_indices])

# The prediction is kept out of `y`, which holds the training targets. The
# original cell overwrote `y`, so re-running model.fit(X, y) afterwards
# would have received the wrong array.
next_word_probs = model.predict_proba(x_predict)
top_10 = np.argsort(next_word_probs[0])[::-1][:10]
for index in top_10:
    print(index_to_word[index])

find
use
pick
observe
do
think
care
remember
be
kill


In [23]:
# model.save_weights('zafar.h5')

## Experimental blocks... Don't run (will finish tomorrow)

In [None]:
import random
def form_sentence(max_words=100):
    """Greedily generate words with the model, seeded from a random training window.

    A random row of ``X`` is used as the initial context. The model's most
    probable next word is taken at each step and the context window slides
    forward by one word. Words are recorded only after the first
    ``sentence_end_token`` has been produced, and generation stops once a
    second one appears, so the result is the text generated between the two
    end markers (the closing marker included).

    Parameters
    ----------
    max_words : int, optional
        Upper bound on the number of prediction steps. Greedy decoding is not
        guaranteed to ever emit ``sentence_end_token``, so this keeps the loop
        finite.

    Returns
    -------
    list
        The recorded words, in generation order. May be empty if the first
        end marker was never produced within ``max_words`` steps.
    """
    start_index = random.randint(0, X.shape[0] - 1)
    x = X[start_index]
    count = 0
    sentence = []
    steps = 0
    while count < 2 and steps < max_words:
        # The model is fed a batch, so give the single window a leading axis.
        y = model.predict_proba(x[np.newaxis, :])
        word_index = int(np.argmax(y))
        word = index_to_word[word_index]
        if count > 0:
            sentence.append(word)
        if word == sentence_end_token:
            count += 1
        # Slide the window: drop the oldest index and append the new one.
        # np.append returns a new array, so the row of X is never modified.
        x = np.append(x[1:], word_index)
        steps += 1
    return sentence
            