In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
# from keras.optimizers import RMSprop
import numpy as np
import random
import sys
import string
from string import punctuation

#LOAD TEXT
#Save notepad as UTF-8 (select from dropdown during saving)
filename = "./train_40k.csv"
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the read finishes, rather than whenever it happens to be collected.
with open(filename, 'r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()
# Fold to lower case so upper- and lower-case letters share one vocabulary entry.
raw_text = raw_text.lower()

#CLEAN TEXT
# Keep only the ASCII letters a-z. Every other character (digits, whitespace,
# punctuation, separators, non-ASCII letters) is dropped, so the corpus becomes
# one unbroken run of lowercase letters.
allowed_letters = string.ascii_lowercase
raw_text = ''.join([letter for letter in raw_text if letter in allowed_letters])

# Vocabulary: the distinct characters of the cleaned corpus, in sorted order.
chars = sorted(set(raw_text))

# Forward lookup (character -> integer id) used when one-hot encoding the text.
char_to_int = {c: i for i, c in enumerate(chars)}

# Reverse lookup (integer id -> character) used to turn predictions back into text.
int_to_char = dict(enumerate(chars))

# Corpus statistics
n_chars = len(raw_text)   # total number of characters available for training
n_vocab = len(chars)      # number of distinct characters
print("Total Characters in the text; corpus length: ", n_chars)
print("Total Vocab: ", n_vocab)


seq_length = 6  # number of characters in each input window
step = 1        # stride between consecutive window starts (1 = a window at every position)

# Every start offset that still leaves one character after the window to act as the target.
window_starts = range(0, n_chars - seq_length, step)

# X values: the seq_length-character windows
sentences = [raw_text[start: start + seq_length] for start in window_starts]
# Y values: the single character that immediately follows each window
next_chars = [raw_text[start + seq_length] for start in window_starts]

n_patterns = len(sentences)
print('Number of sequences:', n_patterns)

# One-hot encode the training data.
#   x[row, pos, k] is True when the character at position `pos` of window `row` has id k.
#   y[row, k]      is True when the character following window `row` has id k.
x = np.zeros((len(sentences), seq_length, n_vocab), dtype=bool)
y = np.zeros((len(sentences), n_vocab), dtype=bool)
for row, (window, target) in enumerate(zip(sentences, next_chars)):
    for pos, letter in enumerate(window):
        x[row, pos, char_to_int[letter]] = True
    y[row, char_to_int[target]] = True
    
# Network: one 128-unit LSTM reads each (seq_length, n_vocab) one-hot window, and
# a softmax Dense layer emits one probability per vocabulary character.
model = Sequential([
    LSTM(128, input_shape=(seq_length, n_vocab)),
    Dense(n_vocab, activation='softmax'),
])

# No optimizer is passed here, so compile() uses the library default
# (an explicit RMSprop(lr=0.01) setup was previously tried and disabled).
model.compile(loss='categorical_crossentropy')
model.summary()

# Checkpointing: write the model to disk whenever the monitored training loss
# reaches a new minimum. The file name embeds the epoch number and the loss.
from keras.callbacks import ModelCheckpoint

filepath = "saved_weights/saved_weights-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Callbacks handed to model.fit()
callbacks_list = [checkpoint]

Total Characters in the text; corpus length:  13577039
Total Vocab:  26
Number of sequences: 13577033


2022-01-23 18:05:04.761526: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               79360     
                                                                 
 dense (Dense)               (None, 26)                3354      
                                                                 
Total params: 82,714
Trainable params: 82,714
Non-trainable params: 0
_________________________________________________________________


In [2]:
# Train for a single epoch in mini-batches of 128 windows; the checkpoint callback
# writes improved models to disk while training runs.
history = model.fit(
    x,
    y,
    batch_size=128,
    epochs=1,
    callbacks=callbacks_list,
)
# Persist the final model so the prediction cells can load it back later.
model.save('save_epochs_lstm_model.h5')

Epoch 00001: loss improved from inf to 1.79167, saving model to saved_weights/saved_weights-01-1.7917.hdf5


In [9]:
def sample(preds, temperature=1.0):
    """Randomly draw one class index from a predicted distribution.

    The original body took ``log`` and then ``exp`` of the predictions with
    nothing in between, which is an identity round-trip. The log-probabilities
    are now divided by ``temperature`` before re-normalising, so the caller can
    sharpen or flatten the distribution. The default of 1.0 leaves the
    distribution unchanged.

    Parameters
    ----------
    preds : array-like of float
        Non-negative weight per class (e.g. a softmax output). The values are
        re-normalised here, so they need not sum to exactly 1.
    temperature : float, default 1.0
        Values below 1 concentrate probability on the likeliest classes;
        values above 1 spread it more evenly. Must be positive.

    Returns
    -------
    numpy integer
        Index of the sampled class.

    Raises
    ------
    ValueError
        If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which is the desired value here (exp(-inf) == 0), so
    # silence the divide-by-zero warning instead of letting it spam the output.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating so small temperatures cannot overflow.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial yields a one-hot row; argmax recovers the drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

#Prediction
# Load the trained parameters back into the in-memory `model` from the .h5 file
# that the training cell wrote with model.save().
# NOTE: this rebinds `filename`, which earlier in the notebook held the path of
# the training CSV.
filename = "save_epochs_lstm_model.h5"
model.load_weights(filename)

def generateText(sentence, seq_length, n_vocab, model, int_to_char, sample, length):
    """Extend a seed string one sampled character at a time.

    At each step the most recent ``seq_length`` characters are one-hot encoded
    and passed to ``model.predict``. ``sample`` then picks an index from the
    returned distribution, and the matching character is appended.

    Parameters
    ----------
    sentence : str
        Seed text. Only its last ``seq_length`` characters are fed to the
        model. A shorter seed is right-aligned in the input window (earlier
        timesteps stay all-zero), and the window grows until it is full.
    seq_length : int
        Number of timesteps in the model input.
    n_vocab : int
        Size of the one-hot dimension.
    model : object
        Must provide ``predict(x, verbose=0)`` returning one row of class
        probabilities per input row.
    int_to_char : dict
        Maps class index to character. The character-to-index lookup is
        derived from it, so the function does not depend on a global mapping.
    sample : callable
        Takes the probability vector and returns the chosen class index.
    length : int
        Number of characters to generate.

    Returns
    -------
    str
        The seed followed by ``length`` generated characters.

    Raises
    ------
    KeyError
        If the current window contains a character absent from ``int_to_char``.
    """
    # Invert the mapping that was passed in rather than reading a notebook global.
    char_to_index = {char: index for index, char in int_to_char.items()}

    generated = sentence
    # Seeds longer than the model input are cut to their most recent characters
    # (writing past the time axis would otherwise raise IndexError).
    window = sentence[-seq_length:]
    for _ in range(length):
        x_pred = np.zeros((1, seq_length, n_vocab))
        # Right-align so the newest character always sits on the last timestep.
        offset = seq_length - len(window)
        for t, char in enumerate(window):
            x_pred[0, offset + t, char_to_index[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds)
        next_char = int_to_char[next_index]

        generated += next_char
        # Slide the window: append the new character, keep at most seq_length.
        window = (window + next_char)[-seq_length:]
    return generated

In [11]:
def miscalcRate(sentence, answer, missing):
    """Return the fraction of wrongly predicted characters.

    ``sentence`` and ``answer`` are compared position by position, and the
    number of differing positions is divided by ``missing`` (the number of
    characters that had to be predicted).

    If the two strings differ in length, a message is printed and ``None`` is
    returned instead of a rate.
    """
    if len(sentence) != len(answer):
        print("2 strings need to have same length")
        return None
    mismatches = sum(1 for got, expected in zip(sentence, answer) if got != expected)
    return mismatches / missing

In [15]:
def reportPrediction(answer, seed):
    """Complete ``seed`` to the length of ``answer``, print a report, return the text.

    The number of characters to generate is taken from the two strings
    (``len(answer) - len(seed)``) instead of being typed by hand for each
    case, so it cannot drift out of sync with the example strings.

    Uses the notebook-level ``seq_length``, ``n_vocab``, ``model``,
    ``int_to_char`` and ``sample``. Characters are drawn at random by
    ``sample``, so the printed prediction can differ between runs.
    """
    missing = len(answer) - len(seed)
    predicted = generateText(seed, seq_length, n_vocab, model, int_to_char, sample, missing)
    print("Answer:", answer, "\tGuess:", seed, "\tPredict:", predicted, "\nMiscalculation Rate:", miscalcRate(predicted, answer, missing), '\n')
    return predicted


# Each case: the full expected string and the 6-character prefix given to the model.
answer1, string1 = 'helloworld', 'hellow'
predict1 = reportPrediction(answer1, string1)

answer2, string2 = 'thisisit', 'thisis'
predict2 = reportPrediction(answer2, string2)

answer3, string3 = 'thisisoberlin', 'thisis'
predict3 = reportPrediction(answer3, string3)

Answer: helloworld 	Guess: hellow 	Predict: hellowacxa 
Miscalculation Rate: 1.0 

Answer: thisisit 	Guess: thisis 	Predict: thisisre 
Miscalculation Rate: 1.0 

Answer: thisisoberlin 	Guess: thisis 	Predict: thisisstillal 
Miscalculation Rate: 0.8571428571428571 



We use the model implemented by Dr. Sreenivas Bhattiprolu.