In [1]:
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('proj.txt') as corpus_file:
  file = corpus_file.read()

In [3]:
# Tokenization and standardization
def tokenize_words(input):
  """Lowercase the text, drop English stopwords and concatenate what is left.

  Args:
    input: Raw text as a single string.

  Returns:
    One string made of the remaining word tokens (runs of ``\\w``
    characters) joined with no separator.
  """
  input = input.lower()
  tokenizer = RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(input)
  # Fetch the stopword list once and keep it in a set; the lookup below
  # then avoids calling stopwords.words() again for every token.
  stop_words = set(stopwords.words('english'))
  filtered = (token for token in tokens if token not in stop_words)
  # NOTE(review): joining with "" removes every word boundary, so the
  # downstream model sees one unbroken character stream. If spaces are
  # wanted in the generated text, " ".join would keep them - confirm intent.
  return "".join(filtered)
processed_inputs = tokenize_words(file)

In [4]:
# Vocabulary: every distinct character of the processed text, in sorted
# order, and a lookup from each character to its position in that order.
chars = sorted(set(processed_inputs))
char_to_num = {ch: idx for idx, ch in enumerate(chars)}

In [5]:
# Sanity check: report the size of the processed text and of the vocabulary.
input_len, vocab_len = len(processed_inputs), len(chars)
print('Total no.of characters:', input_len)
print('total vocab:', vocab_len)

Total no.of characters: 15469
total vocab: 44


In [6]:
# Number of characters in each input window, plus empty containers for
# the encoded windows (x_data) and their next-character labels (y_data).
seq_length = 100
x_data, y_data = [], []

In [7]:
# Slide a window of seq_length characters over the text. Each window,
# encoded as integer ids, is one input; the character right after it is
# the label.
# Start from empty lists so that re-running this cell rebuilds the data
# instead of appending a second copy of every pattern.
x_data = []
y_data = []
for i in range(0, input_len - seq_length, 1):
  in_seq = processed_inputs[i:i + seq_length]
  out_seq = processed_inputs[i + seq_length]
  x_data.append([char_to_num[char] for char in in_seq])
  y_data.append(char_to_num[out_seq])

n_patterns = len(x_data)
print('total patterns:', n_patterns)

total patterns: 15369


In [8]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and divide
# the integer ids by the vocabulary size to scale them.
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [9]:
# One-hot encode the labels. num_classes is pinned to the vocabulary size
# so the width does not depend on which characters happen to occur as
# targets (the default infers it from the largest label seen).
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [10]:
# Network: three stacked LSTM layers (256, 256, 128 units), each followed
# by 20% dropout, then a softmax layer with one output per label column.
model = Sequential([
  LSTM(256, input_shape=X.shape[1:], return_sequences=True),
  Dropout(0.2),
  LSTM(256, return_sequences=True),
  Dropout(0.2),
  LSTM(128),
  Dropout(0.2),
  Dense(y.shape[1], activation='softmax'),
])

In [11]:
# Configure training: Adam optimizer with categorical cross-entropy loss.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [12]:
# Checkpoint callback: monitor the training loss and write to `filepath`
# only on epochs where the loss reaches a new minimum.
filepath = 'model_weigths_saved.hdf5'
checkpoint = ModelCheckpoint(
  filepath,
  monitor='loss',
  mode='min',
  save_best_only=True,
  verbose=1,
)
desired_callbacks = [checkpoint]

In [13]:
# Train for 10 epochs in batches of 256, checkpointing through the callbacks.
model.fit(X, y, batch_size=256, epochs=10, callbacks=desired_callbacks)

Epoch 1/10

Epoch 00001: loss improved from inf to 3.16468, saving model to model_weigths_saved.hdf5
Epoch 2/10

Epoch 00002: loss improved from 3.16468 to 3.08765, saving model to model_weigths_saved.hdf5
Epoch 3/10

Epoch 00003: loss improved from 3.08765 to 3.08382, saving model to model_weigths_saved.hdf5
Epoch 4/10

Epoch 00004: loss improved from 3.08382 to 3.07498, saving model to model_weigths_saved.hdf5
Epoch 5/10

Epoch 00005: loss improved from 3.07498 to 3.07199, saving model to model_weigths_saved.hdf5
Epoch 6/10

Epoch 00006: loss improved from 3.07199 to 3.06690, saving model to model_weigths_saved.hdf5
Epoch 7/10

Epoch 00007: loss did not improve from 3.06690
Epoch 8/10

Epoch 00008: loss improved from 3.06690 to 3.06550, saving model to model_weigths_saved.hdf5
Epoch 9/10

Epoch 00009: loss improved from 3.06550 to 3.06433, saving model to model_weigths_saved.hdf5
Epoch 10/10

Epoch 00010: loss improved from 3.06433 to 3.06359, saving model to model_weigths_saved.hdf5

<tensorflow.python.keras.callbacks.History at 0x7f9794526c10>

In [14]:
# Load the weights written by the checkpoint callback back into the model
# and compile it again with the same loss and optimizer.
filename = 'model_weigths_saved.hdf5'
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [15]:
# Reverse lookup: integer id -> character, used to decode model output.
num_to_char = dict(enumerate(chars))

In [16]:
# Pick a random training window to seed the generator.
# numpy's randint excludes the upper bound, so passing len(x_data) makes
# every pattern, including the last one, eligible.
start = numpy.random.randint(0, len(x_data))
# Take a copy: generation modifies the seed window, and working on the
# original list would alter the training data stored in x_data.
pattern = list(x_data[start])
print('Random Seed:')
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" oodmankindfavoredamericanwritermarktwain1835191045mainz1840gutenbergdenkmalmedalgutenbergprintingpre "


In [17]:
# Generate 10 characters, one per step, always taking the most probable one.
for step in range(10):
  # Shape the current window as a single sample and scale it the same
  # way as the training inputs.
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x/float(vocab_len)
  prediction = model.predict(x, verbose=0)
  index = int(numpy.argmax(prediction))
  result = num_to_char[index]
  sys.stdout.write(result)
  # Slide the window: drop the oldest id and add the predicted one. A new
  # list is built instead of appending in place, so the list object that
  # `pattern` started from (possibly an entry of x_data) is never modified.
  pattern = pattern[1:] + [index]

eeeeeeeeee