In [1]:
import re 
import numpy as np 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os
import requests

In [2]:
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.utils import to_categorical 
import pickle 
import warnings 
warnings.filterwarnings('ignore')

In [3]:
import tensorflow as tf
print(tf.__version__)
print(tf.keras.__version__)


2.17.0
3.4.1


In [4]:
data = pd.read_csv('Shakespeare_data.csv')
print(data.head)

<bound method NDFrame.head of         Dataline            Play  PlayerLinenumber ActSceneLine  \
0              1        Henry IV               NaN          NaN   
1              2        Henry IV               NaN          NaN   
2              3        Henry IV               NaN          NaN   
3              4        Henry IV               1.0        1.1.1   
4              5        Henry IV               1.0        1.1.2   
...          ...             ...               ...          ...   
111391    111392  A Winters Tale              38.0      5.3.180   
111392    111393  A Winters Tale              38.0      5.3.181   
111393    111394  A Winters Tale              38.0      5.3.182   
111394    111395  A Winters Tale              38.0      5.3.183   
111395    111396  A Winters Tale              38.0          NaN   

               Player                                         PlayerLine  
0                 NaN                                              ACT I  
1              

In [5]:
text = []
for i in data['PlayerLine']:
    text.append(i)
print(text[:5])

['ACT I', 'SCENE I. London. The palace.', 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others', 'So shaken as we are, so wan with care,', 'Find we a time for frighted peace to pant,']


In [6]:
def clean_text(text):
    pattern = re.compile('[^a-zA-z0-9\s]')
    text = re.sub(pattern,'',text)

    pattern = re.compile('/d+')
    text = re.sub(pattern,'',text)

    text = text.lower()
    return text 

texts = []
for t in text:
    new_text = clean_text(t)
    texts.append(new_text)

texts[:5]


['act i',
 'scene i london the palace',
 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others',
 'so shaken as we are so wan with care',
 'find we a time for frighted peace to pant']

In [7]:
texts = texts[:10000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

text_sequences = tokenizer.texts_to_sequences(texts)
print('texts --> ', texts[0])
print('Embedding --> ', text_sequences[0])

max_sequence_len = max([len(x) for x in text_sequences])
text_sequences = pad_sequences(text_sequences, maxlen = max_sequence_len, padding = 'pre')

print('Maximum Sequence Length -->>',max_sequence_len) 
print('Text Sequence -->>\n',text_sequences[0]) 
print('Text Sequence Shape -->>',text_sequences.shape)

texts -->  act i
Embedding -->  [455, 4]
Maximum Sequence Length -->> 54
Text Sequence -->>
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 455   4]
Text Sequence Shape -->> (10000, 54)


In [8]:
X, Y = text_sequences[:,:-1], text_sequences[:,-1]
print('First input ', X[0])
print('First output ', Y[0])

word_index = tokenizer.word_index

total_words = len(word_index) + 1
print('Total number of words : ', total_words)

Y = to_categorical(Y, num_classes=total_words)

print('Input shape --> ', X.shape)
print('Output shape --> ', Y.shape)

First input  [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 455]
First output  4
Total number of words :  7870
Input shape -->  (10000, 53)
Output shape -->  (10000, 7870)


In [9]:
model = Sequential(name="LSTM_Model")

# adding embedding
model.add(Embedding(total_words,
                   max_sequence_len-1,
                   input_length=max_sequence_len-1))

# adding a LSTM layer
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.5))

# adding the final output with activation function of softmax
model.add(Dense(total_words, activation='softmax'))

# printing model summary 
print(model.summary())


None


In [10]:
# Compiling the model 
model.compile( 
	loss="categorical_crossentropy", 
	optimizer='adam', 
	metrics=['accuracy'] 
) 

# Training the LSTM model 
history = model.fit(X, Y, 
					epochs=50, 
					verbose=1)


Epoch 1/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 224ms/step - accuracy: 0.0099 - loss: 8.1875
Epoch 2/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 222ms/step - accuracy: 0.0150 - loss: 7.3694
Epoch 3/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 232ms/step - accuracy: 0.0260 - loss: 7.1221
Epoch 4/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 228ms/step - accuracy: 0.0261 - loss: 6.8438
Epoch 5/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 232ms/step - accuracy: 0.0299 - loss: 6.6272
Epoch 6/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 226ms/step - accuracy: 0.0344 - loss: 6.3351
Epoch 7/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m876s[0m 3s/step - accuracy: 0.0447 - loss: 5.9818
Epoch 8/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 220ms/step - accuracy: 0.0604 - loss: 5.5330
Epoch 9/50
[1m313

In [19]:
def autoCompletations(text, model): 
	# Tokenization and Text vectorization 
	text_sequences = tokenizer.texts_to_sequences(text) 
	# Pre-padding 
	testing = pad_sequences(text_sequences, maxlen = max_sequence_len-1, padding='pre') 
	# Prediction 
	y_pred_test = np.argmax(model.predict(testing,verbose=0)) 
	
	predicted_word = '' 
	for word, index in tokenizer.word_index.items(): 
		if index == y_pred_test: 
			predicted_word = word 
			break
	text += " " + predicted_word + '.'
	return text 
	
complete_sentence = autoCompletations('I have seen this', model) 
complete_sentence


'I have seen this .'

In [20]:
def generate_text(text, new_words): 
	for _ in range(new_words): 
		text = autoCompletations(text, model)[:-1] 
	return text 
	
generated_text = generate_text('I have seen', 5) 
generated_text


'I have seen     '

In [21]:
# saving the model 
model.save('sentence_completion.h5') 

# saving the tokenizer 
filename = 'tokenizer.pkl'
pickle.dump(tokenizer, open(filename, 'wb'))


