# Implementing LSTM Based Next Word Prediction Using Keras

In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.utils.data_utils import get_file
import random

Using Theano backend.


In [3]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
with open('reddit-comments-2015-08.csv') as f:
    reader = csv.reader(f,skipinitialspace=True)
    ## Split tweets into sentences
    sentences  = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    sentences = ["%s %s %s"%(sentence_start_token,x,sentence_end_token) for x in sentences]
print "Parsed %d sentences"%(len(sentences))

Parsed 79171 sentences


In [4]:
## Tokenize the sentences into words
count = 0
tokenized_words = [nltk.word_tokenize(word) for word in sentences]
for l in tokenized_words:
    count += len(l)
print "Found %d number of words"%(count)

Found 1716192 number of words


In [5]:
## Counting the word frequencies in the word_tokenize
word_freq = nltk.FreqDist(itertools.chain(*tokenized_words))
print "Found %d unique word tokens"%(len(word_freq))

Found 65751 unique word tokens


In [6]:
## Hashing the most frequent words into the vocabulary
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([w,i] for i,w in enumerate(index_to_word))
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times"%(vocab[-1][0],vocab[-1][1])

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'devoted' and appeared 10 times


In [7]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_words):
    tokenized_words[i] = [w if w in word_to_index else unknown_token for w in sent]

In [8]:
## Creating each sentence in the collection to be 10 words long
text = []
for sent in tokenized_words:
    if len(sent) < 10:
        continue
    if len(sent) == 10:
        text.append(sent)
    else:
        val = sent[0:9]
        val.append(sentence_end_token)
        text.append(val)

In [9]:
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in text])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in text])

In [10]:
print X_train
print "---------------------------------------------------------------------------------"
print y_train

[[   0    6 3513 ...,   25  223    8]
 [   0   11   17 ..., 7999 7999 6036]
 [   0  984 1490 ...,  774 3472 2973]
 ..., 
 [   0 7999    4 ...,   13   63    9]
 [   0   38  144 ..., 7999 7999    8]
 [   0    3 4319 ...,  174   12  232]]
---------------------------------------------------------------------------------
[[   6 3513    7 ...,  223    8    1]
 [  11   17    7 ..., 7999 6036    1]
 [ 984 1490  227 ..., 3472 2973    1]
 ..., 
 [7999    4   41 ...,   63    9    1]
 [  38  144 3544 ..., 7999    8    1]
 [   3 4319   19 ...,   12  232    1]]


In [11]:
print np.shape(X_train)
print np.shape(y_train)

(67035, 9)
(67035, 9)


In [12]:
X = np.zeros((np.shape(X_train)[0],np.shape(X_train)[1],vocabulary_size),dtype = np.bool)
for i in range(np.shape(X_train)[0]):
    for j in range(np.shape(X_train)[1]):
        l = np.zeros((vocabulary_size,))
        l[X_train[i][j]] = 1
        X[i][j] = l

In [13]:
y = np.zeros((np.shape(y_train)[0],np.shape(y_train)[1],vocabulary_size),dtype = np.bool)
for i in range(np.shape(y_train)[0]):
    for j in range(np.shape(y_train)[1]):
        l = np.zeros((vocabulary_size,))
        l[y_train[i][j]] = 1
        y[i][j] = l

In [None]:
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(np.shape(X_train)[1],vocabulary_size)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

Build model...


In [46]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [48]:
model.fit(X_train, y_train, batch_size=128, nb_epoch=10)

Exception: A target array with shape (64182, 9) was passed for an output of shape (None, 10) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.