# Implementing LSTM Based Next Word Prediction Using Keras

In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
import random

import matplotlib.pyplot as plt
%matplotlib inline

Using Theano backend.


In [2]:
from collections import Counter

In [3]:
maxlen = 10
no_word = 'NO_WORD'
vocabulary_size = 10000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
with open('reddit-comments-2015-08.csv') as f:
    reader = csv.reader(f,skipinitialspace=True)
    ## Split comments into sentences
    sentences  = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    sentences = ["%s %s %s"%(sentence_start_token,x,sentence_end_token) for x in reader]
print "Parsed %d sentences"%(len(sentences))

Parsed 15001 sentences


In [37]:
## Tokenize the sentences into words
count = 0
tokenized_words = [nltk.word_tokenize(word) for word in sentences]
for l in tokenized_words:
    count += len(l)
print "Found %d number of words"%(count)

Found 1716192 number of words


In [38]:
## Counting the word frequencies in the word_tokenize
word_freq = nltk.FreqDist(itertools.chain(*tokenized_words))
print "Found %d unique word tokens"%(len(word_freq))

Found 65751 unique word tokens


In [39]:
## Hashing the most frequent words into the vocabulary
vocab = word_freq.most_common(vocabulary_size - 1)
vocab.insert(0,(u'NO_WORD',1000))
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([w,i] for i,w in enumerate(index_to_word))
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times"%(vocab[-1][0],vocab[-1][1])

Using vocabulary size 10000.


In [40]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_words):
    tokenized_words[i] = [w if w in word_to_index else unknown_token for w in sent]

## Simplest Model

In [None]:
## Creating each sentence in the collection to be 10 words long
## I have split the sentences unknowingly thus making sentence incomplete
## This methos needs to be checked
text,next_word,maxlen = [],[],
for sent in tokenized_words:
    if len(sent) < maxlen:
        continue
    if len(sent) >= maxlen:
        val = sent[0:maxlen-1]
        val.append(sentence_end_token)
        text.append(val)
        next_word.append(sent[maxlen-1])
print text[0:10]
print "--------------------"
print next_word[0:10]

In [None]:
## Vectorizing each of the sentence into the matrix X
## Matrix y contains the next word prediction for the whole sentence (LSTM)
X = np.zeros((len(text),maxlen,vocabulary_size),dtype = np.bool)
y = np.zeros((len(text),vocabulary_size),dtype = np.bool)
for i,sent in enumerate(text):
    for t,word in enumerate(sent):
        X[i,t,word_to_index[word]] = 1
    y[i,word_to_index[next_word[i]]] = 1

In [None]:
print X[0:1]
print "---------------------------------------------------"
print y[0:1]

In [None]:
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen,vocabulary_size)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocabulary_size))
model.add(Activation('softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
model.fit(X, y, batch_size=128, nb_epoch=10)

In [None]:
## Model needs to be trained on a GPU
## The output needs to be predicted by using a mixture of sentences

## Using Masking Rather Than To Slice The Input

In [None]:
next_word = []
def mask(string):
    if len(string) < maxlen:
        ## Pad the string with no_word type of string
        next_word.append(string[-2])
        l = [no_word for i in range(maxlen - len(string) + 1)]
        string.remove(sentence_end_token)
        string[-1] = sentence_end_token
        string = l + string
    else:
        string = string[0:maxlen]
        next_word.append(string.pop(maxlen-1))
        string.append(sentence_end_token)
    return string

In [None]:
for i,sent in enumerate(tokenized_words):
    if len(sent) <= 3:
        tokenized_words.pop(i)
for i,sent in enumerate(tokenized_words):
    tokenized_words[i] = mask(tokenized_words[i])

In [None]:
print tokenized_words[0:10]
print '--------------------------------------------'
print next_word[0:10]

In [None]:
## Frequency distribution of the next word
dist = nltk.FreqDist(next_word)
for key in sorted(dist,key=dist.get,reverse=True)[0:10]:
    print str(key.encode('utf-8')) + " : " + str(dist[unicode(key)])

## Using the sliding window approach

In [41]:
## The above model seem to have not captured the essence of long words
## So we use the sliding window approach
## Converting the segregated sentences into one sentence (text)
text = []
for string in tokenized_words:
    for word in string:
        text.append(word)

In [42]:
## Making a sentence matrix having sentences of length 'maxlen'
sent,next_word = [],[]
step = 3
for i in range(0,len(text) - maxlen,step):
    sent.append(text[i:i+maxlen])
    next_word.append(text[i+maxlen])
print('nb sequences:', len(sent))

('nb sequences:', 572061)


In [43]:
Counter(next_word).most_common(10)

[('UNKNOWN_TOKEN', 32235),
 (u'SENTENCE_START', 26491),
 (u'SENTENCE_END', 26394),
 (u'.', 22495),
 (u'the', 17454),
 (u',', 17378),
 (u'to', 11888),
 (u'i', 10660),
 (u'a', 10526),
 (u'and', 10046)]

In [44]:
## Sanity Check
for string in sent:
    if len(string) != 10:
        print "Warning"

## RNN Model

In [45]:
## Memory is constrained so dropping some items
import gc
del tokenized_words,text,word_freq,vocab,sentences
gc.collect()

0

In [47]:
## Due to constraints on the Memory we will use only a subset of our data
data_size = 30000
sent = sent[data_size:2*data_size]
next_word = next_word[data_size:2*data_size]

In [48]:
## Vectorizing each of the sentence into the matrix X
## Matrix y contains the next word prediction for the whole sentence (LSTM)
X = np.zeros((len(sent),maxlen,vocabulary_size+1),dtype = np.bool)
y = np.zeros((len(sent),vocabulary_size+1),dtype = np.bool)
for i,s in enumerate(sent):
    for t,word in enumerate(s):
        X[i,t,word_to_index[word]] = 1
    y[i,word_to_index[next_word[i]]] = 1

In [49]:
print X[0:1]
print "---------------------------------------------------"
print y[0:1]

[[[False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]
  ..., 
  [False False  True ..., False False False]
  [False  True False ..., False False False]
  [False False False ..., False False False]]]
---------------------------------------------------
[[False False False ..., False False False]]


In [50]:
print np.shape(X)
print np.shape(y)

(30000, 10, 10001)
(30000, 10001)


In [51]:
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen,vocabulary_size+1)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocabulary_size+1))
model.add(Activation('softmax'))

Build model...


In [52]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
lstm_1 (LSTM)                      (None, 10, 512)     21532672    lstm_input_1[0][0]               
____________________________________________________________________________________________________
dropout_1 (Dropout)                (None, 10, 512)     0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                      (None, 512)         2099200     dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)                (None, 512)         0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [53]:
model.compile(loss='categorical_crossentropy', optimizer='adagrad')

In [None]:
model.load_weights('weights.hdf5')

In [None]:
model.fit(X, y, batch_size=128, nb_epoch=30)

In [55]:
model.save_weights('weights.hdf5')

In [56]:
string = 'Are you going out with him to the'
def convert(string):
    word = nltk.word_tokenize(string)
    word.insert(0,sentence_start_token)
    word.append(sentence_end_token)
    word = [word_to_index[w] if w in word_to_index else word_to_index[unknown_token] for w in word]
    return word
print convert(string)

[1, 10000, 11, 125, 63, 25, 122, 6, 4, 2]


In [57]:
word,i = convert(string),0
X_test = np.zeros((len(word),maxlen,vocabulary_size+1),dtype = np.bool)
for t,w in enumerate(word):
    X_test[i,t,w] = 1
    i += 1
print X_test

[[[False  True False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]
  ..., 
  [False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]]

 [[False False False ..., False False False]
  [False False False ..., False False  True]
  [False False False ..., False False False]
  ..., 
  [False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]]

 [[False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]
  ..., 
  [False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]]

 ..., 
 [[False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]
  ..., 
  [False F

In [58]:
for i in range(10):
    yPred = model.predict_classes(X_test,verbose = 1)



In [59]:
print yPred
print yPred.shape

[10000 10000 10000 10000 10000 10000 10000 10000 10000 10000]
(10,)


In [60]:
for word in yPred:
    print index_to_word[word]

UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
UNKNOWN_TOKEN
