In [1]:
## Data Collection
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')


[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/saurabhbiswal/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
import pandas as pd

# Load the dataset
data = gutenberg.raw('shakespeare-hamlet.txt')

In [3]:
import os

# Define the full directory path for the file
base_path = '/Users/saurabhbiswal/Documents/Udemy/AI_Repo/E2E_LSTM_GRU/data/external'
os.makedirs(base_path, exist_ok=True)

# Define full file path
file_path = os.path.join(base_path, 'hamlet.txt')

# Write data to the file
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(data)

In [4]:
## Data Preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [5]:
## Load the dataset

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read().lower()

In [6]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1 # 4818 words

In [7]:
# Create input sequences and labels
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)


In [8]:
input_sequences

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891],
 [57, 407],
 [57, 407, 2],
 [57, 407, 2, 1181],
 [57, 407, 2, 1181, 177],
 [57, 407, 2, 1181, 177, 1892],
 [407, 1182],
 [407, 1182, 63],
 [408, 162],
 [408, 162, 377],
 [408, 162, 377, 21],
 [408, 162, 377, 21, 247],
 [408, 162, 377, 21, 247, 882],
 [18, 66],
 [451, 224],
 [451, 224, 248],
 [451, 224, 248, 1],
 [451, 224, 248, 1, 30],
 [408, 407],
 [451, 25],
 [408, 6],
 [408, 6, 43],
 [408, 6, 43, 62],
 [408, 6, 43, 62, 1893],
 [408, 6, 43, 62, 1893, 96],
 [408, 6, 43, 62, 1893, 96, 18],
 [408, 6, 43, 62, 1893, 96, 18, 566],
 [451, 71],
 [451, 71, 51],
 [451, 71, 51, 1894],
 [451, 71, 51, 1894, 567],
 [451, 71, 51, 1894, 567, 378],
 [451, 71, 51, 1894, 567, 378, 80],
 [451, 71, 51, 1894, 567, 378, 80, 3],
 [451, 71, 51, 1894, 567, 378, 80, 3, 273],
 [451, 71

In [9]:
# Apply padding to the sequences
max_sequence_length = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre'))

In [10]:
input_sequences, max_sequence_length

(array([[   0,    0,    0, ...,    0,    1,  687],
        [   0,    0,    0, ...,    1,  687,    4],
        [   0,    0,    0, ...,  687,    4,   45],
        ...,
        [   0,    0,    0, ...,    4,   45, 1047],
        [   0,    0,    0, ...,   45, 1047,    4],
        [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32),
 14)

In [11]:
# Create predictors and label
import tensorflow as tf

x, y = input_sequences[:, :-1], input_sequences[:, -1]

In [12]:
x, y

(array([[   0,    0,    0, ...,    0,    0,    1],
        [   0,    0,    0, ...,    0,    1,  687],
        [   0,    0,    0, ...,    1,  687,    4],
        ...,
        [   0,    0,    0, ...,  687,    4,   45],
        [   0,    0,    0, ...,    4,   45, 1047],
        [   0,    0,    0, ...,   45, 1047,    4]], dtype=int32),
 array([ 687,    4,   45, ..., 1047,    4,  193], dtype=int32))

# Output format for classification. Why OHE?

Your model outputs a probability distribution over all possible classes (words in the vocabulary). The output layer typically uses softmax activation to generate these probabilities.

To train the model, you need a target label in a compatible format — a vector indicating the “correct” class.

In [13]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
# This will convert the labels into a one-hot encoded format
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [15]:
## Train the Model - LSTM RNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

In [22]:
 # Define the model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_length - 1))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_sequence_length))
model.summary()


In [26]:
# # Apply early stopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(x_train, y_train, epochs=100, batch_size=64, validation_data=(x_test, y_test))

Epoch 1/100
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1096 - loss: 5.2202 - val_accuracy: 0.0769 - val_loss: 7.0216
Epoch 2/100
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1187 - loss: 5.0683 - val_accuracy: 0.0765 - val_loss: 7.0977
Epoch 3/100
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1231 - loss: 4.9628 - val_accuracy: 0.0732 - val_loss: 7.1767
Epoch 4/100
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.1289 - loss: 4.8550 - val_accuracy: 0.0754 - val_loss: 7.2639
Epoch 5/100
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1375 - loss: 4.7172 - val_accuracy: 0.0783 - val_loss: 7.3200
Epoch 6/100
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.1431 - loss: 4.6072 - val_accuracy: 0.0767 - val_loss: 7.3973
Epoch 7/100
[1m

In [27]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_length):
    # Preprocess the input text
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_length:
        token_list = token_list[-(max_sequence_length - 1):] # Keep only the last max_sequence_length - 1 tokens
    token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
    
    # Predict the next word
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)[0]
    
    # Get the word from the index
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word

In [28]:
input_text = "to be or not to be"
max_sequence_len = model.input_shape[1] + 1
predicted_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Input text: '{input_text}'")
print(f"Predicted next word: '{predicted_word}'")

Input text: 'to be or not to be'
Predicted next word: 'kinde'


In [29]:
# Save the model
base_path_model = '/Users/saurabhbiswal/Documents/Udemy/AI_Repo/E2E_LSTM_GRU/models'
model.save(os.path.join(base_path_model, 'hamlet_lstm_model.h5'))



In [32]:
# Save the tokenizer
import pickle
with open(os.path.join(base_path_model, 'hamlet_tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
input_text = "Fran. Nay answer me: Stand &"
max_sequence_len = model.input_shape[1] + 1
predicted_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Input text: '{input_text}'")
print(f"Predicted next word: '{predicted_word}'")

Input text: 'Fran. Nay answer me: Stand &'
Predicted next word: 'vnfold'
