# LSTM Project: Next-Word Prediction on Shakespeare's Hamlet

## Importing Libraries and Loading Dataset

In [1]:
import nltk
from nltk.corpus import gutenberg
import pandas as pd

# Make sure the Gutenberg corpus (a collection of public-domain books) is available locally
nltk.download('gutenberg')

# Pull the full text of Hamlet out of the corpus as a single string
data = gutenberg.raw('shakespeare-hamlet.txt')

# Keep a plain-text copy on disk so later cells can read it back from a file
with open('hamlet.txt', mode='w') as out_file:
    out_file.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\bhavy\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Read the saved play back in and normalise it to lower case
with open('hamlet.txt') as src:
    text = src.read().lower()

# Fit a word-level tokenizer: every distinct word gets an integer id (ids start at 1)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size: one extra slot because id 0 is never assigned to a word
# (it is the value used for padding later on)
total_words = 1 + len(tokenizer.word_index)
total_words

4818

## Data Preprocessing

### Create input sequences

In [3]:
# Build every n-gram prefix (length >= 2) of every line of the text.
# Each line is converted to its list of token ids; a line with tokens
# [a, b, c] contributes the sequences [a, b] and [a, b, c].
# Lines with fewer than two tokens contribute nothing.
input_sequences = [
    line_tokens[:prefix_len]
    for line_tokens in tokenizer.texts_to_sequences(text.split('\n'))
    for prefix_len in range(2, len(line_tokens) + 1)
]

In [None]:
# Preview only the first few n-gram sequences; displaying the whole list
# would flood the notebook with tens of thousands of rows of output.
input_sequences[:5]

### Add padding to sequences

In [4]:
# Length of the longest n-gram; every sequence is padded up to this length
max_sequence_length = max(len(seq) for seq in input_sequences)

# Left-pad ('pre') with zeros so the final token of every sequence lines up
# in the last column, then materialise the result as a NumPy array
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_length,
        padding='pre',
    )
)

### Making Predictors and Labels

In [5]:
# Split each padded sequence into predictors and label
import tensorflow as tf

# Predictors: every column except the last (the context words)
X = input_sequences[:, :-1]
# Label: the last column (the word that follows the context)
y = input_sequences[:, -1]

# One-hot encode the label over the whole vocabulary
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

### Splitting the data into training and test sets

In [6]:
# Hold out 20% of the (context, next-word) pairs for validation;
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Building

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input

# Create the model
model = Sequential()
# Declare the input shape explicitly: each sample is max_sequence_length-1 token ids
# (the last token of every padded sequence was split off as the label).
# An explicit Input layer replaces the deprecated `input_length` argument of Embedding,
# so the model is built immediately and model.summary() / model.input_shape are usable.
model.add(Input(shape=(max_sequence_length - 1,)))
# Embedding layer: maps each token id to a 100-dimensional dense vector
model.add(Embedding(total_words, 100))
# First LSTM with 150 units; return_sequences=True emits an output per time step
# so the next LSTM receives a full sequence
model.add(LSTM(150, return_sequences=True))
# Dropout: randomly zeroes 20% of the activations during training to reduce overfitting
model.add(Dropout(0.2))
# Second LSTM with 100 units; returns only its final output
model.add(LSTM(100))
# Output layer: one softmax probability per vocabulary entry (next-word classification)
model.add(Dense(total_words, activation='softmax'))

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels of this multi-class problem
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [12]:
# Fit the network for 50 epochs, reporting loss/accuracy on the held-out
# split after every epoch; the returned History object keeps the per-epoch metrics
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    verbose=1,
)

Epoch 1/50
644/644 ━━━━━━━━━━━━━━━━━━━━ 10s 12ms/step - accuracy: 0.0320 - loss: 7.1742 - val_accuracy: 0.0336 - val_loss: 6.7004
Epoch 2/50
644/644 ━━━━━━━━━━━━━━━━━━━━ 7s 11ms/step - accuracy: 0.0376 - loss: 6.4974 - val_accuracy: 0.0396 - val_loss: 6.7846
Epoch 3/50
644/644 ━━━━━━━━━━━━━━━━━━━━ 7s 12ms/step - accuracy: 0.0420 - loss: 6.3619 - val_accuracy: 0.0482 - val_loss: 6.8145
Epoch 4/50
644/644 ━━━━━━━━━━━━━━━━━━━━ 7s 11ms/step - accuracy: 0.0517 - loss: 6.1933 - val_accuracy: 0.0486 - val_loss: 6.8084
Epoch 5/50
644/644 ━━━━━━━━━━━━━━━━━━━━ 7s 12ms/step - accuracy: 0.0524 - loss: 6.0765 - val_accuracy: 0.0503 - val_loss: 6.8599
Epoch 6/50
644/644 ━━━━━━━━━━━━━━━━━━━━ 7s 11ms/step - accuracy: 0.0590 - loss: 5.9281 - val_accuracy: 0.0573 - val_loss: 6.8787
Epoch 7/50
[1m644/64

In [None]:
# Write the trained model to 'hamlet.h5' in the working directory.
# NOTE(review): a later cell saves the same model again as "next_word_lstm.h5" —
# confirm whether both files are actually needed.
model.save('hamlet.h5')



In [13]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Trained model exposing ``predict``; it must accept a batch of
            sequences of length ``max_sequence_len - 1``.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (and, normally, the reverse map ``index_word``).
        text: The context string whose continuation is wanted.
        max_sequence_len: Length of the padded training sequences including
            the label, i.e. the model's input length plus one.

    Returns:
        The predicted word as a string, or ``None`` when the predicted index
        does not correspond to any word in the tokenizer's vocabulary.
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the most recent max_sequence_len-1 tokens when the context is too long;
    # shorter contexts are left-padded below.
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len - 1):]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

    # verbose=0 silences the progress output of predict
    predicted = model.predict(token_list, verbose=0)
    # A single sample was passed in, so take row 0 and convert the NumPy result
    # to a plain int before using it as a dictionary key.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Use the tokenizer's index -> word map instead of scanning word_index;
    # fall back to inverting word_index for tokenizer-like objects without it.
    index_to_word = getattr(tokenizer, 'index_word', None) or {
        index: word for word, index in tokenizer.word_index.items()
    }
    return index_to_word.get(predicted_word_index)

In [15]:
# The model's input is one token shorter than the padded training sequences,
# so recover the full sequence length from the model itself
max_sequence_len = model.input_shape[1] + 1

input_text = "To be or not to be"
print(f"Input text:{input_text}")

next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next Word Prediction:{next_word}")

Input text:To be or not to be
Next Word Prediction:buried


In [None]:
import pickle

# Persist the trained model so it can be reloaded without retraining
model.save("next_word_lstm.h5")

# Persist the fitted tokenizer alongside it: the same word -> id mapping
# is needed to encode inputs and decode predictions later
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)



In [16]:
# Full padded sequence length = model input length + 1 (the label position)
max_sequence_len = model.input_shape[1] + 1

input_text = " Barn. Haue you had quiet"
print(f"Input text:{input_text}")

next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next Word Prediction:{next_word}")

Input text: Barn. Haue you had quiet
Next Word Prediction:guard
