# Implementing LSTM Based Next Word Prediction Using Keras

In [None]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
import random

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
## Corpus-wide settings shared by the cells below.
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

## Read the comments, split each one into lower-cased sentences and wrap
## every sentence in the start/end marker tokens.
## NOTE(review): if the CSV has a header row it is currently parsed as a
## comment too -- confirm against the data file.
sentences = []
with open('reddit-comments-2015-08.csv') as f:
    reader = csv.reader(f, skipinitialspace=True)
    for row in reader:
        # csv.reader yields an empty list for a blank line; row[0] would
        # raise IndexError there.
        if not row:
            continue
        comment = row[0]
        # Python 2 hands back byte strings that still need decoding;
        # Python 3 already yields text.
        if isinstance(comment, bytes):
            comment = comment.decode('utf-8')
        for sentence in nltk.sent_tokenize(comment.lower()):
            sentences.append(
                "%s %s %s" % (sentence_start_token, sentence, sentence_end_token)
            )
print("Parsed %d sentences" % len(sentences))

In [None]:
## Tokenize each sentence into a list of word tokens
tokenized_words = [nltk.word_tokenize(sentence) for sentence in sentences]
## Total number of tokens across all sentences
count = sum(len(tokens) for tokens in tokenized_words)
print("Found %d number of words" % count)

In [None]:
## Count how often each token occurs across all tokenized sentences.
## chain.from_iterable walks the nested lists lazily instead of unpacking
## every sentence as a separate positional argument.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_words))
print("Found %d unique word tokens" % len(word_freq))

In [None]:
## Build the vocabulary from the most frequent tokens. One slot is reserved
## for unknown_token, so only vocabulary_size - 1 corpus words are kept.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [word for word, _freq in vocab]
index_to_word.append(unknown_token)
## Reverse lookup: token -> position in index_to_word
word_to_index = {word: index for index, word in enumerate(index_to_word)}
print("Using vocabulary size %d." % vocabulary_size)
## vocab is empty when no tokens were found; vocab[-1] would raise IndexError.
if vocab:
    print("The least frequent word in our vocabulary is '%s' and appeared %d times"
          % (vocab[-1][0], vocab[-1][1]))

In [None]:
# Map every out-of-vocabulary token to unknown_token so that each remaining
# token has an entry in word_to_index. The slice assignment updates the
# existing list in place.
tokenized_words[:] = [
    [token if token in word_to_index else unknown_token for token in tokens]
    for tokens in tokenized_words
]

In [None]:
## Build fixed-length training pairs: tokens 1..10 of a sentence form the
## input window and token 11 is the word to predict. Index 0 is skipped
## (expected to be the SENTENCE_START marker).
## Only this single window is taken per sentence, so everything after the
## target word is discarded and the window is usually an incomplete sentence.
## Sentences with fewer than 12 tokens cannot supply a window plus a target
## and are dropped.
## NOTE(review): the original author flagged this windowing method as
## needing to be checked.
eligible = [sent for sent in tokenized_words if len(sent) > 11]
text = [sent[1:11] for sent in eligible]
next_word = [sent[11] for sent in eligible]

In [None]:
## One-hot encode the training pairs for the LSTM.
## X[i, t, k] is True when position t of window i holds vocabulary word k.
## y[i, k] is True when word k is the next word following window i.
## The builtin bool is used as the dtype: np.bool was only an alias for it,
## deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(text), 10, vocabulary_size), dtype=bool)
y = np.zeros((len(text), vocabulary_size), dtype=bool)
for i, (sent, target) in enumerate(zip(text, next_word)):
    for t, word in enumerate(sent):
        X[i, t, word_to_index[word]] = True
    y[i, word_to_index[target]] = True

In [None]:
## Show the first encoded window and its one-hot target as a sanity check
print(X[0:1])
print("---------------------------------------------------")
print(y[0:1])

In [None]:
print('Build model...')
## Two stacked 512-unit LSTM layers, each followed by dropout at rate 0.2,
## then a dense layer with a softmax over the vocabulary. The first LSTM
## keeps return_sequences=True so the second LSTM receives the whole
## 10-step sequence; the second one emits a single output per window.
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(10, vocabulary_size)),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(vocabulary_size),
    Activation('softmax'),
])

In [None]:
## Configure training: RMSprop optimizer with categorical cross-entropy,
## matching the one-hot targets in y.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
)

In [None]:
## Train for 10 passes over the data in mini-batches of 128 windows.
## NOTE(review): nb_epoch is the Keras 1.x spelling; Keras 2 renamed it to
## epochs -- confirm against the installed Keras version.
model.fit(
    X,
    y,
    batch_size=128,
    nb_epoch=10,
)

In [None]:
## TODO: the model needs to be trained on a GPU.
## TODO: the output still needs to be predicted using a mixture of sentences.