In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense




In [2]:
text = open('Sherlock Holmes.txt').read().lower()
print('Given script has ' + str(len(text)) + ' characters')

Given script has 581877 characters


In [3]:
text = text[1302:]
for ch in ['0','1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '$', '%', '&', '~', '`', '(', ')', '*',
          '-', '/', ';', '@', '?', ':', '©', '¢', 'ã', '\xa0', '\n', '\r', '.']:
       if ch in text:
             text=text.replace(ch,' ')
print(set(text))                

{'j', "'", 'c', 'a', 'p', ' ', 's', 't', 'i', 'x', 'f', 'y', 'o', 'e', 'l', 'g', '¨', 'q', 'h', 'w', 'v', 'b', 'u', 'd', 'n', 'z', 'm', 'r', ',', 'k'}


In [4]:
def window_transform(text, window_size, step_size):    
    inputs = []
    outputs = []
    
    n_batches = int((len(text)-window_size) / step_size)    
    
    for i in range(n_batches-1):
        a = text[i * step_size:((i * step_size) + window_size)]
        inputs.append(a)
        b = text[(i * step_size) + window_size]
        outputs.append(b)

    return inputs,outputs

# Calling the window function
window_size = 50
step_size = 3
inputs, outputs = window_transform(text, window_size, step_size)

In [5]:
chars = sorted(list(set(text)))
print(chars)

[' ', "'", ',', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¨']


In [6]:
# Encoding 
chars_to_indices = dict((c, i) for i, c in enumerate(chars))

# Decoding
indices_to_chars = dict((i, c) for i, c in enumerate(chars))

In [7]:
def encode_io_pairs(text, window_size, step_size):    
    num_chars = len(chars)
    
    # cut up text into character input/output pairs
    inputs, outputs = window_transform(text, window_size, step_size)
    
    X = np.zeros((len(inputs), window_size, num_chars), dtype=bool)
    y = np.zeros((len(inputs), num_chars), dtype=bool)

    
    # loop over inputs/outputs and tranform and store in X/y
    for i, sentence in enumerate(inputs):
        for t, char in enumerate(sentence):
            X[i, t, chars_to_indices[char]] = 1
        y[i, chars_to_indices[outputs[i]]] = 1
        
    return X,y

X, y = encode_io_pairs(text, window_size, step_size)

In [8]:
# Designing the model
model = Sequential()
model.add(LSTM(120, input_shape=(window_size, len(chars))))
model.add(Dropout(0.22))
model.add(Dense(len(chars), activation='linear'))
model.add(Dense(y.shape[1], activation='softmax'))

# Compiling the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Subsetting data for an example
Xsmall = X[:20000,:,:]
ysmall = y[:20000,:]

# Model training
model.fit(Xsmall, ysmall, batch_size=500, epochs=10)



Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1c78f49e470>

In [9]:
def predict_next_chars(model, input_chars, num_to_predict):     
    # create output
    predicted_chars = ''
    for i in range(num_to_predict):
        # convert this round's predicted characters to numerical input    
        x_test = np.zeros((1, window_size, len(chars)))
        for t, char in enumerate(input_chars):
            x_test[0, t, chars_to_indices[char]] = 1.

        # make this round's prediction
        test_predict = model.predict(x_test,verbose = 0)[0]

        # translate numerical prediction back to characters
        r = np.argmax(test_predict)                           # predict class of each test input
        d = indices_to_chars[r] 

        # update predicted_chars and input
        predicted_chars+=d
        input_chars+=d
        input_chars = input_chars[1:]
    return predicted_chars

In [10]:
start = 89
num_to_predict = 10
input_chars = text[start: start + window_size]
print('Complete sequence:', text[start:start + window_size + num_to_predict])
print('Input sequence:', input_chars)
print('Output sequence:', predict_next_chars(model, input_chars, num_to_predict = num_to_predict))

Complete sequence: otion akin to love for irene adler  all emotions, and that o
Input sequence: otion akin to love for irene adler  all emotions, 
Output sequence: an  he the
