In [1]:
import numpy as np

In [2]:
np.random.seed(42)

# Load the Data

In [3]:
book_text = open('davincicode.txt', encoding='utf8').read()

In [4]:
len(book_text)

842667

# Build Tokenizer

In [5]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters


In [6]:
t = Tokenizer(char_level=True)

In [7]:
t.fit_on_texts(book_text)

Number of unique characters

In [8]:
vocab_size = len(t.word_index)

In [9]:
vocab_size

86

In [10]:
t.word_index

{' ': 1,
 'e': 2,
 't': 3,
 'a': 4,
 'o': 5,
 'n': 6,
 'i': 7,
 'h': 8,
 's': 9,
 'r': 10,
 'd': 11,
 'l': 12,
 '\n': 13,
 'u': 14,
 'c': 15,
 'g': 16,
 'm': 17,
 '.': 18,
 'p': 19,
 'w': 20,
 'f': 21,
 'y': 22,
 'b': 23,
 ',': 24,
 '"': 25,
 'v': 26,
 'k': 27,
 'T': 28,
 'S': 29,
 "'": 30,
 'L': 31,
 'I': 32,
 'A': 33,
 'H': 34,
 '?': 35,
 'x': 36,
 'C': 37,
 'R': 38,
 'P': 39,
 'F': 40,
 'M': 41,
 '-': 42,
 'G': 43,
 'W': 44,
 'E': 45,
 'N': 46,
 'B': 47,
 'D': 48,
 'z': 49,
 'O': 50,
 'q': 51,
 '—': 52,
 'Y': 53,
 'V': 54,
 'j': 55,
 '!': 56,
 'J': 57,
 '1': 58,
 'U': 59,
 'K': 60,
 '5': 61,
 '3': 62,
 '2': 63,
 ':': 64,
 '0': 65,
 '8': 66,
 '/': 67,
 '4': 68,
 '9': 69,
 '6': 70,
 '7': 71,
 ';': 72,
 'Q': 73,
 'Z': 74,
 'X': 75,
 '&': 76,
 '(': 77,
 ')': 78,
 '$': 79,
 '°': 80,
 '^': 81,
 '¥': 82,
 '%': 83,
 '„': 84,
 '«': 85,
 '©': 86}

Convert characters to Numbers

In [11]:
book_num = t.texts_to_sequences(book_text)

In [12]:
book_num

[[28],
 [8],
 [2],
 [1],
 [48],
 [4],
 [1],
 [54],
 [7],
 [6],
 [15],
 [7],
 [1],
 [37],
 [5],
 [11],
 [2],
 [1],
 [13],
 [13],
 [13],
 [13],
 [48],
 [4],
 [6],
 [1],
 [47],
 [10],
 [5],
 [20],
 [6],
 [1],
 [13],
 [13],
 [13],
 [13],
 [40],
 [50],
 [38],
 [1],
 [47],
 [31],
 [53],
 [28],
 [34],
 [45],
 [18],
 [18],
 [18],
 [1],
 [33],
 [43],
 [33],
 [32],
 [46],
 [18],
 [1],
 [41],
 [50],
 [38],
 [45],
 [1],
 [28],
 [34],
 [33],
 [46],
 [1],
 [45],
 [54],
 [45],
 [38],
 [18],
 [1],
 [13],
 [13],
 [13],
 [13],
 [33],
 [15],
 [27],
 [6],
 [5],
 [20],
 [12],
 [2],
 [11],
 [16],
 [17],
 [2],
 [6],
 [3],
 [9],
 [1],
 [13],
 [13],
 [40],
 [7],
 [10],
 [9],
 [3],
 [1],
 [4],
 [6],
 [11],
 [1],
 [21],
 [5],
 [10],
 [2],
 [17],
 [5],
 [9],
 [3],
 [24],
 [1],
 [3],
 [5],
 [1],
 [17],
 [22],
 [1],
 [21],
 [10],
 [7],
 [2],
 [6],
 [11],
 [1],
 [4],
 [6],
 [11],
 [1],
 [2],
 [11],
 [7],
 [3],
 [5],
 [10],
 [24],
 [1],
 [57],
 [4],
 [9],
 [5],
 [6],
 [1],
 [60],
 [4],
 [14],
 [21],
 [17],
 [4],
 [6]

In [13]:
number_chars = len(book_num)

In [14]:
number_chars

842667

# Build Input and Output

In [15]:
sequence_length = 100 

Input and output container
- Input data will have sequences with 100 characters
- Output data will have one character which comes after 100 characters in the input data

In [16]:
input_data = []

In [17]:
output_data = []

In [18]:
for i in range(0, number_chars - sequence_length):
    input_seq = book_num[i : i + sequence_length]
    output_seq = book_num[i + sequence_length]
    input_data.append(input_seq)
    output_data.append(output_seq)

In [19]:
output_data[14]

[1]

In [20]:
input_data

[[[28],
  [8],
  [2],
  [1],
  [48],
  [4],
  [1],
  [54],
  [7],
  [6],
  [15],
  [7],
  [1],
  [37],
  [5],
  [11],
  [2],
  [1],
  [13],
  [13],
  [13],
  [13],
  [48],
  [4],
  [6],
  [1],
  [47],
  [10],
  [5],
  [20],
  [6],
  [1],
  [13],
  [13],
  [13],
  [13],
  [40],
  [50],
  [38],
  [1],
  [47],
  [31],
  [53],
  [28],
  [34],
  [45],
  [18],
  [18],
  [18],
  [1],
  [33],
  [43],
  [33],
  [32],
  [46],
  [18],
  [1],
  [41],
  [50],
  [38],
  [45],
  [1],
  [28],
  [34],
  [33],
  [46],
  [1],
  [45],
  [54],
  [45],
  [38],
  [18],
  [1],
  [13],
  [13],
  [13],
  [13],
  [33],
  [15],
  [27],
  [6],
  [5],
  [20],
  [12],
  [2],
  [11],
  [16],
  [17],
  [2],
  [6],
  [3],
  [9],
  [1],
  [13],
  [13],
  [40],
  [7],
  [10],
  [9],
  [3]],
 [[8],
  [2],
  [1],
  [48],
  [4],
  [1],
  [54],
  [7],
  [6],
  [15],
  [7],
  [1],
  [37],
  [5],
  [11],
  [2],
  [1],
  [13],
  [13],
  [13],
  [13],
  [48],
  [4],
  [6],
  [1],
  [47],
  [10],
  [5],
  [20],
  [6],
  [1],
  [1

Reshape and Normalize the input

In [21]:
input_data = np.reshape(input_data, (len(input_data),sequence_length,1))

In [22]:
input_data.shape

(842567, 100, 1)

In [23]:
input_data = input_data / vocab_size

In [24]:
input_data

array([[[0.3255814 ],
        [0.09302326],
        [0.02325581],
        ...,
        [0.11627907],
        [0.10465116],
        [0.03488372]],

       [[0.09302326],
        [0.02325581],
        [0.01162791],
        ...,
        [0.10465116],
        [0.03488372],
        [0.01162791]],

       [[0.02325581],
        [0.01162791],
        [0.55813953],
        ...,
        [0.03488372],
        [0.01162791],
        [0.04651163]],

       ...,

       [[0.01162791],
        [0.23255814],
        [0.09302326],
        ...,
        [0.48837209],
        [0.80232558],
        [0.01162791]],

       [[0.23255814],
        [0.09302326],
        [0.08139535],
        ...,
        [0.80232558],
        [0.01162791],
        [0.15116279]],

       [[0.09302326],
        [0.08139535],
        [0.10465116],
        ...,
        [0.01162791],
        [0.15116279],
        [0.15116279]]])

One hot encode the output

In [25]:
output_data


[[1],
 [4],
 [6],
 [11],
 [1],
 [21],
 [5],
 [10],
 [2],
 [17],
 [5],
 [9],
 [3],
 [24],
 [1],
 [3],
 [5],
 [1],
 [17],
 [22],
 [1],
 [21],
 [10],
 [7],
 [2],
 [6],
 [11],
 [1],
 [4],
 [6],
 [11],
 [1],
 [2],
 [11],
 [7],
 [3],
 [5],
 [10],
 [24],
 [1],
 [57],
 [4],
 [9],
 [5],
 [6],
 [1],
 [60],
 [4],
 [14],
 [21],
 [17],
 [4],
 [6],
 [24],
 [1],
 [21],
 [5],
 [10],
 [1],
 [20],
 [5],
 [10],
 [27],
 [7],
 [6],
 [16],
 [1],
 [9],
 [5],
 [1],
 [8],
 [4],
 [10],
 [11],
 [1],
 [5],
 [6],
 [1],
 [3],
 [8],
 [7],
 [9],
 [1],
 [19],
 [10],
 [5],
 [55],
 [2],
 [15],
 [3],
 [1],
 [4],
 [6],
 [11],
 [1],
 [13],
 [21],
 [5],
 [10],
 [1],
 [3],
 [10],
 [14],
 [12],
 [22],
 [1],
 [14],
 [6],
 [11],
 [2],
 [10],
 [9],
 [3],
 [4],
 [6],
 [11],
 [7],
 [6],
 [16],
 [1],
 [20],
 [8],
 [4],
 [3],
 [1],
 [3],
 [8],
 [7],
 [9],
 [1],
 [23],
 [5],
 [5],
 [27],
 [1],
 [7],
 [9],
 [1],
 [4],
 [12],
 [12],
 [1],
 [4],
 [23],
 [5],
 [14],
 [3],
 [18],
 [1],
 [33],
 [6],
 [11],
 [1],
 [3],
 [5],
 [1],
 [3],
 [8

In [26]:
from tensorflow.python.keras.utils import to_categorical

In [27]:
output_data = to_categorical(output_data,num_classes=vocab_size+1)

In [28]:
output_data[0:3]

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

# Build the Model

In [29]:
from tensorflow.python.keras.models import Sequential

In [30]:
from tensorflow.python.keras.layers import LSTM, Dense, Dropout

In [31]:
model = Sequential()

In [32]:
model.add(LSTM(128, input_shape=(input_data.shape[1],input_data.shape[2])))

In [33]:
model.add(Dropout(0.2))

In [34]:
model.add(Dense(vocab_size+1, activation='softmax'))
#vocab_size=86

In [35]:
model.compile(optimizer='adam',loss='categorical_crossentropy')

# Execute the model

Goal of the model is to minimize the loss

In [67]:
model.fit(input_data, output_data, batch_size=256, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c454177940>

In [68]:
model.save('davinci_lstm')

# Build randon Starting point for predicting

In [91]:
start = np.random.randint(0, input_data.shape[0]-1)

In [92]:
start

255724

In [93]:
data = book_num[start: start+sequence_length]
data


[[11],
 [1],
 [4],
 [1],
 [23],
 [12],
 [14],
 [2],
 [42],
 [10],
 [5],
 [23],
 [2],
 [11],
 [1],
 [54],
 [7],
 [10],
 [16],
 [7],
 [6],
 [1],
 [41],
 [4],
 [10],
 [22],
 [1],
 [9],
 [7],
 [3],
 [3],
 [7],
 [6],
 [16],
 [1],
 [20],
 [7],
 [3],
 [8],
 [1],
 [8],
 [2],
 [10],
 [1],
 [4],
 [10],
 [17],
 [1],
 [4],
 [10],
 [5],
 [14],
 [6],
 [11],
 [1],
 [4],
 [6],
 [1],
 [7],
 [6],
 [21],
 [4],
 [6],
 [3],
 [1],
 [15],
 [8],
 [7],
 [12],
 [11],
 [24],
 [1],
 [13],
 [19],
 [10],
 [2],
 [9],
 [14],
 [17],
 [4],
 [23],
 [12],
 [22],
 [1],
 [47],
 [4],
 [23],
 [22],
 [1],
 [57],
 [2],
 [9],
 [14],
 [9],
 [18],
 [1],
 [50],
 [19],
 [19],
 [5]]

In [94]:
data = [item for sublist in data for item in sublist]
data

[11,
 1,
 4,
 1,
 23,
 12,
 14,
 2,
 42,
 10,
 5,
 23,
 2,
 11,
 1,
 54,
 7,
 10,
 16,
 7,
 6,
 1,
 41,
 4,
 10,
 22,
 1,
 9,
 7,
 3,
 3,
 7,
 6,
 16,
 1,
 20,
 7,
 3,
 8,
 1,
 8,
 2,
 10,
 1,
 4,
 10,
 17,
 1,
 4,
 10,
 5,
 14,
 6,
 11,
 1,
 4,
 6,
 1,
 7,
 6,
 21,
 4,
 6,
 3,
 1,
 15,
 8,
 7,
 12,
 11,
 24,
 1,
 13,
 19,
 10,
 2,
 9,
 14,
 17,
 4,
 23,
 12,
 22,
 1,
 47,
 4,
 23,
 22,
 1,
 57,
 2,
 9,
 14,
 9,
 18,
 1,
 50,
 19,
 19,
 5]

In [95]:
#data = book_num[start: start+sequence_length]
#data = [item for sublist in data for item in sublist]


Build Int to Char routine

In [96]:
int_to_char = dict((i,c) for c, i in t.word_index.items())

Start Predicting String

In [97]:
print ('STARTING DATA: ')
print(''.join(int_to_char[char_val] for char_val in data))
print ('\nPREDICTED: ')

for i in range(100):
    #Predict for initial data
    prediction = model.predict(np.reshape(data,(1, len(data), 1))/vocab_size)
    
    #Get char with max probability
    char_index_predicted = np.argmax(prediction)
    
    #convert index to char
    char_predicted = int_to_char[char_index_predicted]
    
    print (char_predicted, end='')
    
    #Change data - append new char index and remove the first index
    data.append(char_index_predicted)
    data = data[1:len(data)]    

STARTING DATA: 
d a blue-robed Virgin Mary sitting with her arm around an infant child, 
presumably Baby Jesus. Oppo

PREDICTED: 
 the core far the core far the core far the core far the core far the 
are he the bor the 
ore the c

# Loading a trained Model

In [98]:
from tensorflow.python.keras.models import load_model

In [99]:
model = load_model('davinci_lstm')

In [100]:
prediction = model.predict(np.reshape(data,(1, len(data), 1))/vocab_size)

In [101]:
prediction.shape

(1, 87)

In [102]:
np.argmax(prediction)

5

In [103]:
int_to_char[5]

'o'