# Implementing LSTM Based Next Word Prediction Using Keras

In [1]:
import csv
import itertools
import operator
import os
import random
import sys
from datetime import datetime

import numpy as np
import nltk
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM

import matplotlib.pyplot as plt
%matplotlib inline

Using Theano backend.


In [15]:
## Sequence length, vocabulary size and the special tokens used throughout
maxlen = 10
no_word = 'NO_WORD'
vocabulary_size = 10000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
## Read the comments (first CSV column) and split them into lower-cased sentences
# 'rb': the Python 2 csv module expects its file object to be opened in binary mode
with open('reddit-comments-2015-08.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # `if row` skips blank lines: csv.reader yields [] for them and row[0] would raise IndexError
    comments = (row[0].decode('utf-8').lower() for row in reader if row)
    sentences = itertools.chain.from_iterable(nltk.sent_tokenize(comment) for comment in comments)
    ## Wrap every sentence in the start / end markers (consumes the lazy chain while the file is open)
    sentences = ["%s %s %s" % (sentence_start_token, sentence, sentence_end_token) for sentence in sentences]
print("Parsed %d sentences" % len(sentences))
## Only the first 50000 sentences are kept for the rest of the notebook
sentences = sentences[0:50000]
print("Parsed %d sentences" % len(sentences))

Parsed 79171 sentences
Parsed 50000 sentences


In [16]:
## Split every sentence into word tokens, then total the tokens over all sentences
tokenized_words = [nltk.word_tokenize(sentence) for sentence in sentences]
count = sum(len(tokens) for tokens in tokenized_words)
print("Found %d number of words" % count)

Found 1082845 number of words


In [17]:
## Frequency of every distinct token over all tokenized sentences
all_tokens = itertools.chain.from_iterable(tokenized_words)
word_freq = nltk.FreqDist(all_tokens)
print("Found %d unique word tokens" % len(word_freq))

Found 49713 unique word tokens


In [18]:
## Build the index <-> word lookup tables from the most frequent tokens
# Slot 0 is the padding token (its count of 1000 is a placeholder), followed by the
# vocabulary_size - 1 most common tokens; the unknown token takes the last index.
vocab = [(u'NO_WORD', 1000)] + word_freq.most_common(vocabulary_size - 1)
index_to_word = [token for token, _ in vocab] + [unknown_token]
word_to_index = {token: position for position, token in enumerate(index_to_word)}
print("Using vocabulary size %d." % vocabulary_size)
least_word, least_count = vocab[-1]
print("The least frequent word in our vocabulary is '%s' and appeared %d times" % (least_word, least_count))

Using vocabulary size 10000.
The least frequent word in our vocabulary is 'textbox' and appeared 5 times


In [19]:
# Swap every token that has no vocabulary entry for the unknown token, sentence by sentence
for position, tokens in enumerate(tokenized_words):
    cleaned = []
    for token in tokens:
        cleaned.append(token if token in word_to_index else unknown_token)
    tokenized_words[position] = cleaned

## Simplest Model

In [None]:
## Turn every sentence of at least `maxlen` tokens into one fixed-size example:
## its first maxlen-1 tokens followed by sentence_end_token, with the token at
## position maxlen-1 kept as the word to predict. Shorter sentences are skipped.
## NOTE: the cut falls wherever maxlen does, so the examples are incomplete sentences.
## This method still needs to be checked.
# Only the two lists are initialised; maxlen keeps the value set in the config cell
# (unpacking two values into three names raises ValueError).
text, next_word = [], []
for sent in tokenized_words:
    if len(sent) < maxlen:
        continue
    text.append(sent[0:maxlen - 1] + [sentence_end_token])
    next_word.append(sent[maxlen - 1])
print(text[0:10])
print("--------------------")
print(next_word[0:10])

In [None]:
## One-hot encode every example into X and its target word into y (LSTM input / label)
## X[i, t, k] is set when token t of example i has vocabulary index k; y[i, k] marks the next word.
# The last axis is sized from index_to_word: it holds vocabulary_size + 1 entries
# (NO_WORD + the vocabulary_size - 1 most common tokens + unknown_token), so an axis of
# only vocabulary_size raises IndexError on the index of unknown_token.
n_words = len(index_to_word)
X = np.zeros((len(text), maxlen, n_words), dtype=bool)
y = np.zeros((len(text), n_words), dtype=bool)
for i, sent in enumerate(text):
    for t, word in enumerate(sent):
        X[i, t, word_to_index[word]] = 1
    y[i, word_to_index[next_word[i]]] = 1

In [None]:
## Show the first encoded example and its target row
print(X[:1])
print("---------------------------------------------------")
print(y[:1])

In [None]:
print('Build model...')
## Two stacked 512-unit LSTM layers, each followed by 20% dropout, then a softmax layer
# Layer widths are read from the encoded arrays instead of vocabulary_size:
# index_to_word holds vocabulary_size + 1 entries, so a hard-coded vocabulary_size
# leaves the network one unit short of the vocabulary.
timesteps, n_features = X.shape[1:]
n_classes = y.shape[1]
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(timesteps, n_features)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(n_classes))
model.add(Activation('softmax'))

In [None]:
## Categorical cross-entropy over the one-hot targets, optimised with RMSprop
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
)

In [None]:
## Train for 10 epochs with mini-batches of 128 examples
model.fit(
    X,
    y,
    nb_epoch=10,
    batch_size=128,
)

In [None]:
## Model needs to be trained on a GPU
## The output needs to be predicted by using a mixture of sentences

## Using Masking Rather Than Slicing The Input

In [None]:
next_word = []
def mask(string):
    """Return the token list ``string`` resized to exactly ``maxlen`` tokens.

    As a side effect the word to predict is appended to the module-level
    ``next_word`` list:

    * fewer than ``maxlen`` tokens: the target is the token just before the
      end marker (``string[-2]``). The end marker is removed, the target is
      overwritten with ``sentence_end_token`` and the result is left-padded
      with ``no_word``.
    * otherwise: the target is the token at position ``maxlen - 1``; the
      result is the first ``maxlen - 1`` tokens plus ``sentence_end_token``.

    The list passed in is never modified; a new list is returned.
    """
    if len(string) < maxlen:
        next_word.append(string[-2])
        ## Pad the string with no_word type of string
        padding = [no_word] * (maxlen - len(string) + 1)
        # Edit a copy: remove() and item assignment would otherwise change the caller's list
        trimmed = list(string)
        trimmed.remove(sentence_end_token)
        trimmed[-1] = sentence_end_token
        return padding + trimmed
    window = string[0:maxlen]
    next_word.append(window.pop(maxlen - 1))
    window.append(sentence_end_token)
    return window

In [None]:
## Drop sentences of three tokens or fewer (start/end markers included), then mask the rest
# A filtered rebuild replaces pop() inside enumerate(): removing an item while iterating
# shifts the following items left, so the sentence right after each removal was never checked.
tokenized_words[:] = [sent for sent in tokenized_words if len(sent) > 3]
for i, sent in enumerate(tokenized_words):
    tokenized_words[i] = mask(sent)

In [None]:
## Show the first ten masked sentences and their targets
print(tokenized_words[:10])
print('--------------------------------------------')
print(next_word[:10])

In [None]:
## Ten most frequent target words and how often each occurs
dist = nltk.FreqDist(next_word)
top_targets = sorted(dist, key=dist.get, reverse=True)[:10]
for key in top_targets:
    label = key.encode('utf-8')
    occurrences = dist[unicode(key)]
    print(str(label) + " : " + str(occurrences))

## Using the sliding window approach

In [20]:
## The model above does not seem to capture long sentences,
## so this section switches to a sliding-window approach
def mask(string):
    """Left-pad the token list ``string`` with ``no_word`` up to ``maxlen`` tokens.

    A list that already has ``maxlen`` or more tokens is returned as is
    (the same object); a shorter one yields a new, padded list.
    """
    missing = maxlen - len(string)
    if missing > 0:
        return [no_word] * missing + string
    return string

In [21]:
## Replace every sentence by its mask()-ed version, in place
for position, tokens in enumerate(tokenized_words):
    tokenized_words[position] = mask(tokens)

In [22]:
## Sanity Check: after padding no sentence may be shorter than maxlen
# Compared against maxlen rather than a literal 10 so the check follows the configured length
for string in tokenized_words:
    if len(string) < maxlen:
        print("Warning")

In [23]:
## Concatenate all padded sentences into one flat token stream (text)
text = list(itertools.chain.from_iterable(tokenized_words))

In [24]:
## Count how often each token occurs in the flat text and report the most frequent one
dic = {}
for word in text:
    # dict.get replaces a bare `except:`, which would also have swallowed
    # unrelated errors such as KeyboardInterrupt
    dic[word] = dic.get(word, 0) + 1
# max() over the keys returns the first key (in dict order) holding the highest count
most_frequent = max(dic, key=dic.get)
print("%s %d" % (most_frequent, dic[most_frequent]))

UNKNOWN_TOKEN 59346


In [None]:
## Slide a window of `maxlen` tokens over the text, advancing `step` tokens at a time;
## the token right after each window is the word to predict
sentences, next_word = [], []
step = 3
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_word.append(text[i + maxlen])
# Cap both lists together so next_word only holds targets of windows that are kept
sentences = sentences[0:100000]
next_word = next_word[0:100000]
# One formatted string: under the Python 2 print statement used in this notebook,
# print('nb sequences:', n) prints the repr of a tuple
print("nb sequences: %d" % len(sentences))

In [None]:
## Ten most frequent sliding-window targets and how often each occurs
dist = nltk.FreqDist(next_word)
top_targets = sorted(dist, key=dist.get, reverse=True)[:10]
for key in top_targets:
    label = key.encode('utf-8')
    occurrences = dist[unicode(key)]
    print(str(label) + " : " + str(occurrences))

In [None]:
## Sanity Check: every window must hold exactly maxlen tokens
# Compared against maxlen rather than a literal 10 so the check follows the configured length
for string in sentences:
    if len(string) != maxlen:
        print("Warning")

In [None]:
## One-hot encode the windows into X and each window's following word into y (LSTM input / label)
# vocabulary_size + 1 columns: one per entry of index_to_word
n_samples = len(sentences)
width = vocabulary_size + 1
X = np.zeros((n_samples, maxlen, width), dtype=bool)
y = np.zeros((n_samples, width), dtype=bool)
for row, window in enumerate(sentences):
    for pos, token in enumerate(window):
        X[row, pos, word_to_index[token]] = True
    target = next_word[row]
    y[row, word_to_index[target]] = True

In [None]:
## Show the first encoded window and its target row
print(X[:1])
print("---------------------------------------------------")
print(y[:1])

In [None]:
## Shapes of the input and target arrays
print(X.shape)
print(y.shape)

In [None]:
print('Build model...')
## Two stacked 512-unit LSTM layers, each followed by 20% dropout, then a softmax
## over the vocabulary_size + 1 vocabulary entries
n_words = vocabulary_size + 1
layers = [
    LSTM(512, return_sequences=True, input_shape=(maxlen, n_words)),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(n_words),
    Activation('softmax'),
]
model = Sequential()
for layer in layers:
    model.add(layer)

In [None]:
## Display the layer-by-layer summary of the model built above
model.summary()

In [None]:
## Categorical cross-entropy over the one-hot targets, optimised with Adagrad
model.compile(
    optimizer='adagrad',
    loss='categorical_crossentropy',
)

In [None]:
## Resume from the weights of a previous run when they exist
# Guarded because weights.hdf5 is only written after training: on a first run
# the file is missing and an unconditional load would stop the notebook here.
if os.path.exists('weights.hdf5'):
    model.load_weights('weights.hdf5')

In [None]:
## Train for 30 epochs with mini-batches of 128 windows
model.fit(
    X,
    y,
    nb_epoch=30,
    batch_size=128,
)

In [None]:
## Write the trained weights to weights.hdf5 (the file name the load_weights cell reads)
model.save_weights('weights.hdf5')

In [None]:
string = 'Are you going out with him to the'
def convert(string):
    """Turn a raw sentence into a list of vocabulary indices.

    The sentence is lower-cased, tokenized with ``nltk.word_tokenize`` and
    wrapped in ``sentence_start_token`` / ``sentence_end_token``. Each token
    is then looked up in ``word_to_index``; tokens without an entry get the
    index of ``unknown_token``.

    Returns a list of ints, two longer than the number of tokens in ``string``.
    """
    # Lower-case first: the training comments were lower-cased before tokenizing,
    # so a capitalised word would otherwise miss the vocabulary and become unknown.
    tokens = nltk.word_tokenize(string.lower())
    tokens = [sentence_start_token] + tokens + [sentence_end_token]
    unknown_index = word_to_index[unknown_token]
    return [word_to_index.get(token, unknown_index) for token in tokens]
## Vocabulary indices of the example sentence
print(convert(string))

In [None]:
## One-hot encode the query as a single window shaped like the training windows
# Keep at most the last maxlen tokens so the time index can never run past the array
word = convert(string)[-maxlen:]
pad = maxlen - len(word)
# One sample holding the whole sentence. Advancing the sample index together with the
# time index would instead give one sample per token, each with a single step set.
X_test = np.zeros((1, maxlen, vocabulary_size+1), dtype=bool)
# Left-pad with no_word, the way the sliding-window mask() pads short sentences
X_test[0, :pad, word_to_index[no_word]] = 1
for t, w in enumerate(word):
    X_test[0, pad + t, w] = 1
print(X_test)

In [None]:
## Predict the most likely next-word class for every row of X_test
# Called once: a loop around this line re-runs the same call on the same X_test
# and keeps only the last result
yPred = model.predict_classes(X_test, verbose=1)

In [None]:
## Predicted class indices and how many there are
print(yPred)
print(yPred.shape)

In [None]:
## Map every predicted class index back to its word
for class_index in yPred:
    print(index_to_word[class_index])