In [1]:
import numpy as np
from keras.utils import np_utils
import sys

In [2]:
# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the text has been read instead of leaving it open
# for the lifetime of the kernel.
with open('wonderland.txt', encoding='utf8') as wonderlandfile:
    rawtext = wonderlandfile.read()

In [3]:
# Fold the corpus to lower case so upper- and lower-case letters share one
# vocabulary entry.
rawtext = str.lower(rawtext)

In [4]:
# Vocabulary: every distinct character in the corpus, in sorted order so the
# character <-> index mapping is stable between runs.
characters = sorted(set(rawtext))

In [5]:
# Inspect the raw vocabulary before any cleaning is applied.
print(characters)

['\n', ' ', '!', '#', '$', '%', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '‘', '’', '“', '”', '\ufeff']


In [6]:
# Single characters to strip from the corpus: '#', '*', '@', '_' and the
# byte-order mark U+FEFF.
bad_characters = list('#*@_\ufeff')

In [7]:
# Remove every unwanted character from the corpus, then rebuild the
# vocabulary from the cleaned text and show it.
for unwanted in bad_characters:
    rawtext = rawtext.replace(unwanted, "")

characters = sorted(set(rawtext))
print(characters)

['\n', ' ', '!', '$', '%', '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '‘', '’', '“', '”']


In [8]:
# Corpus statistics. `vocabulary` (number of distinct characters) is used
# later both to scale the inputs and to size the output layer.
charslen = len(characters)
textlen = len(rawtext)
vocabulary = charslen
print(f"Text Length : {textlen}")
print(f"No of characters : {charslen}")

Text Length : 163721
No of characters : 56


In [9]:
# Turn the corpus into supervised examples with a sliding window:
#   data_X[k] = integer codes of the SEQ_LENGTH characters starting at k
#   data_Y[k] = integer code of the character that follows that window
SEQ_LENGTH = 100
char_to_int = {char: index for index, char in enumerate(characters)}

# Encode the corpus once; every window is then just a slice of this list.
encoded_text = [char_to_int[char] for char in rawtext]
window_starts = range(len(rawtext) - SEQ_LENGTH)

data_X = [encoded_text[start:start + SEQ_LENGTH] for start in window_starts]
data_Y = [encoded_text[start + SEQ_LENGTH] for start in window_starts]

In [10]:
# Convert the Python lists into the arrays the network trains on.
#
# NOTE: this cell rebinds data_X / data_Y, so it is not idempotent. Running it
# a second time without first re-running the sequence-building cell would
# divide data_X by `vocabulary` again and one-hot encode data_Y twice.
length = len(data_X)

# Shape (samples, timesteps, 1): one scalar feature per timestep, as declared
# by the LSTM's input_shape=(SEQ_LENGTH, 1).
data_X = np.array(data_X)
data_X = np.reshape(data_X, (data_X.shape[0], data_X.shape[1], 1))
# Scale the integer codes into [0, 1) by dividing by the vocabulary size.
data_X = data_X / float(vocabulary)

# One-hot encode the targets. num_classes is passed explicitly so the width
# always equals the softmax layer's `vocabulary` units; without it the width
# is inferred from the largest label present and would come out too narrow if
# the highest-indexed character never occurs as a target.
data_Y = np.array(data_Y)
data_Y = np_utils.to_categorical(data_Y, num_classes=vocabulary)
print(data_X.shape)
print(data_Y.shape)

(163621, 100, 1)
(163621, 56)


In [11]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ModelCheckpoint

In [12]:
# Build and compile the network in a single cell so that the cell is
# idempotent: re-running it always yields a fresh model with exactly these
# layers. (Building the model with model.add() spread over several cells
# stacks duplicate layers whenever one of those cells is executed again.)
model = Sequential([
    # First recurrent layer emits its output at every timestep
    # (return_sequences=True) so the second LSTM receives a sequence.
    LSTM(256, input_shape=(SEQ_LENGTH, 1), return_sequences=True),
    Dropout(0.2),
    # Second recurrent layer returns only its final output.
    LSTM(256),
    Dropout(0.2),
    # One softmax unit per vocabulary entry: a distribution over the next
    # character.
    Dense(vocabulary, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [16]:
# Checkpoint to disk whenever the monitored training loss reaches a new
# minimum, so the saved file always corresponds to the best epoch so far.
filepath = "text_generation.h5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

# Train for 10 epochs in mini-batches of 128 windows; `history` keeps the
# object returned by fit().
history = model.fit(
    data_X,
    data_Y,
    epochs=10,
    batch_size=128,
    callbacks=callbacks_list,
)

Epoch 1/10
Epoch 00001: loss improved from inf to 2.92565, saving model to text_generation.h5
Epoch 2/10
Epoch 00002: loss improved from 2.92565 to 2.62499, saving model to text_generation.h5
Epoch 3/10
Epoch 00003: loss improved from 2.62499 to 2.43792, saving model to text_generation.h5
Epoch 4/10
Epoch 00004: loss improved from 2.43792 to 2.29248, saving model to text_generation.h5
Epoch 5/10
Epoch 00005: loss improved from 2.29248 to 2.18005, saving model to text_generation.h5
Epoch 6/10
Epoch 00006: loss improved from 2.18005 to 2.09499, saving model to text_generation.h5
Epoch 7/10
Epoch 00007: loss improved from 2.09499 to 2.02350, saving model to text_generation.h5
Epoch 8/10
Epoch 00008: loss improved from 2.02350 to 1.96589, saving model to text_generation.h5
Epoch 9/10
Epoch 00009: loss improved from 1.96589 to 1.90818, saving model to text_generation.h5
Epoch 10/10
Epoch 00010: loss improved from 1.90818 to 1.86635, saving model to text_generation.h5


In [94]:
# Restore the weights written by the training checkpoint, then compile the
# model again with the same loss and optimizer that were used for training.
filename = 'text_generation.h5'
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [140]:
# Seed prompt for generation. Two constraints apply:
#   * it must be exactly SEQ_LENGTH characters long, because the generation
#     loop reshapes the encoded seed to (1, SEQ_LENGTH, 1);
#   * every character must be a key of char_to_int (the cleaned, lower-cased
#     training vocabulary).
# The raw string lives under its own name so that `initial_text` only ever
# holds the list of integer codes.
seed_text = 'max verstappen was quick among the competitiors and won the dutch grand pri for first time in career'

# Fail fast with a readable message instead of a bare KeyError here or an
# opaque reshape error later in the generation loop.
unknown_chars = sorted(set(seed_text) - set(char_to_int))
if unknown_chars:
    raise ValueError(
        f"Seed text contains characters outside the vocabulary: {unknown_chars}"
    )
if len(seed_text) != SEQ_LENGTH:
    raise ValueError(
        f"Seed text must be exactly {SEQ_LENGTH} characters long, got {len(seed_text)}"
    )

initial_text = [char_to_int[c] for c in seed_text]


In [141]:
# Take a copy of the encoded seed rather than a second reference to the same
# list: the generation loop appends to test_text, and with an alias that
# append would also grow initial_text beyond SEQ_LENGTH, breaking the reshape
# the next time generation is started from the seed.
test_text = list(initial_text)

In [142]:
# Accumulator for the characters produced by the generation loop.
generated_text = list()

In [143]:
# Inverse of char_to_int: maps a class index back to its character.
int_to_char = dict(enumerate(characters))

In [144]:
# Greedy character-by-character generation: predict the next character from
# the current SEQ_LENGTH-long window, emit it, then slide the window forward
# by one position.
num_generated_chars = 100

# Work on a private copy of the seed and start from an empty output list so
# this cell can be re-run and always produces the same text. Mutating
# test_text in place would also grow any list it aliases, and appending to a
# generated_text left over from an earlier run would accumulate old output.
window = list(test_text)
generated_text = []

for _ in range(num_generated_chars):
    # Same preprocessing as the training data: shape (1, SEQ_LENGTH, 1),
    # integer codes divided by the vocabulary size.
    x = np.reshape(window, (1, SEQ_LENGTH, 1)) / float(vocabulary)
    # verbose=0 keeps predict() from printing a progress bar on every step.
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always pick the most probable next character.
    index = int(np.argmax(prediction))
    generated_text.append(int_to_char[index])
    # Drop the oldest code and append the new one; length stays SEQ_LENGTH.
    window = window[1:] + [index]

output = ''.join(generated_text)


In [145]:
# Display the text generated from the seed.
output

'anly and the words was the words she was she was she was she was she was she was she was she was she'