## Preprocessing

In [1]:
#import dependencies
import tensorflow as tf
import string

import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.optimizers import RMSprop

import re
import xgboost as xg

In [2]:
#read data
# Read the corpus inside a context manager so the file handle is closed as soon
# as the text is in memory (a bare open() left it open for the whole session).
with open('Resources/Storytelling.txt', encoding='utf-8') as corpus_file:
  predictor_df = corpus_file.read().split('\n')
# Despite its name, predictor_df is a plain list holding one string per line of the file.
predictor_df

['To Sherlock Holmes she is always _the_ woman. I have seldom heard him',
 'mention her under any other name. In his eyes she eclipses and',
 'predominates the whole of her sex. It was not that he felt any emotion',
 'akin to love for Irene Adler. All emotions, and that one particularly,',
 'were abhorrent to his cold, precise but admirably balanced mind. He',
 'was, I take it, the most perfect reasoning and observing machine that',
 'the world has seen, but as a lover he would have placed himself in a',
 'false position. He never spoke of the softer passions, save with a gibe',
 'and a sneer. They were admirable things for the observer—excellent for',
 'drawing the veil from men’s motives and actions. But for the trained',
 'reasoner to admit such intrusions into his own delicate and finely',
 'adjusted temperament was to introduce a distracting factor which might',
 'throw a doubt upon all his mental results. Grit in a sensitive',
 'instrument, or a crack in one of his own high-power

In [3]:
# Characters removed from every line before tokenizing. A raw string is used so
# the backslash is an ordinary character instead of the invalid escape "\|",
# which newer Python versions warn about.
# NOTE(review): the apostrophe is not part of this set. The earlier literal ended
# in an empty '' string, so apostrophes were never stripped; the set is kept
# unchanged here so the cleaned text stays the same -- confirm whether
# apostrophes were meant to be removed.
PUNCTUATION_TO_STRIP = r'!@#$%^&*()_+=,.;[]{}\|?"'

# Deletion table built once; str.translate then drops every listed character
# from a line in a single pass instead of one replace() call per character.
strip_table = str.maketrans('', '', PUNCTUATION_TO_STRIP)

clean_lines = [line.translate(strip_table) for line in predictor_df]


In [4]:
#tokenize data
# Learn the word -> integer id mapping from the cleaned lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_lines)

# One more than the number of distinct words in the tokenizer's word index;
# used as the embedding input size and as the number of output classes.
word_ammt = 1 + len(tokenizer.word_index)

In [5]:
#Create inputs
# Each line contributes every token prefix of length two or more:
# tokens[:2], tokens[:3], ... up to the whole line.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# the padding cell reads it.
input = []

for text_line in clean_lines:
  line_tokens = tokenizer.texts_to_sequences([text_line])[0]
  input.extend(line_tokens[:end] for end in range(2, len(line_tokens) + 1))

In [6]:
#pad
# Length of the longest prefix sequence; every sequence is padded at the front
# (padding='pre') up to this length.
max_seq = max(len(seq) for seq in input)
input = np.array(pad_sequences(input, padding='pre', maxlen=max_seq))

# The last column is the word to predict; the columns before it are the context.
x = input[:, :-1]
y = input[:, -1]

In [7]:
#vectorize y
# One-hot encode the target word ids, one column per vocabulary index.
# The ids are read from the last column of the padded `input` array rather than
# from `y` itself, so running this cell a second time does not try to one-hot
# encode an array that is already one-hot encoded.
y = tf.keras.utils.to_categorical(input[:, -1], num_classes=word_ammt)

In [8]:
# Size of the word vectors and of the LSTM layer, named so they are easy to tune.
EMBEDDING_DIM = 100
LSTM_UNITS = 300

model = Sequential()
# Each sample is max_seq - 1 word ids long (the context columns stored in x).
model.add(Embedding(word_ammt, EMBEDDING_DIM, input_length=max_seq-1))
model.add(LSTM(LSTM_UNITS))
# One softmax output per vocabulary index.
model.add(Dense(word_ammt, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print() -- doing so only appended a stray "None" to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 100)           235100    
                                                                 
 lstm (LSTM)                 (None, 300)               481200    
                                                                 
 dense (Dense)               (None, 2351)              707651    
                                                                 
Total params: 1423951 (5.43 MB)
Trainable params: 1423951 (5.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [15]:
# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)
# Fit on every prefix sequence; no validation data is held out.
model.fit(x, y, verbose=1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x79fba39d9e70>

In [9]:
# Reuse the model written by an earlier run when its file is available.
# Without the existence check this cell fails wherever Model_1.h5 has not been
# written yet, because the cell that saves it sits further down the notebook.
# NOTE(review): when the file exists, this replaces whatever `model` currently
# holds (including a model that was just trained) -- confirm that is intended.
if tf.io.gfile.exists('Model_1.h5'):
  model = tf.keras.models.load_model('Model_1.h5')
else:
  print('Model_1.h5 not found; keeping the model already in memory.')



In [16]:
# x and y are the same arrays that are passed to model.fit in this notebook, so
# the figure printed here is accuracy on the training data, not on held-out text.
results = model.evaluate(x, y, verbose=0)
# results[1] is the value reported as accuracy.
print(f'Accuracy is : {100*results[1]}%')

Accuracy is : 96.06333374977112%


In [17]:
# The .h5 suffix already selects the HDF5 format, so the save_format argument
# (deprecated in Keras 3) is not passed.
model.save('Model_1.h5')

In [10]:
# Every word known to the tokenizer, in word_index order, as a NumPy array.
learned_array = np.array(list(tokenizer.word_index))

In [12]:
# Seed text and the number of words to append to it.
text_to_predict = 'He said'
next_words = 7

# Pad the seed to the sequence length the model expects, read from the model
# itself instead of being hard-coded as 16.
context_len = model.input_shape[1]

for _ in range(next_words):
  # Re-tokenize the growing text each round so the newest word is part of the context.
  token_predict = tokenizer.texts_to_sequences([text_to_predict])
  pad_text = pad_sequences(token_predict, maxlen=context_len, padding='pre')

  # verbose=0 keeps predict() from printing a progress line for every generated word.
  prediction_prob = model.predict(pad_text, verbose=0)
  predicted_index = int(np.argmax(prediction_prob))

  prediction_word = tokenizer.index_word.get(predicted_index)
  if prediction_word is None:
    # No word maps to this index (presumably index 0, the padding filler),
    # so stop generating instead of raising KeyError.
    break

  text_to_predict += " " + prediction_word

print(text_to_predict)

He said he enjoyed the use of the money
