<a href="https://colab.research.google.com/github/rsshan5388/assignmentchirag/blob/main/lstmdataset_rsudarshan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
with open('/content/drive/My Drive/lstm/LSTMDATA.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
import re
text = re.sub(r'[^a-zA-Z\s]', '', text)
text = text.lower()

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Create input sequences
input_sequences = []
for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(2, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences
max_seq_len = max(len(x) for x in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre'))

# Split features and labels
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [6]:
def create_lstm_model():
    model = Sequential([
        Embedding(total_words, 100, input_length=max_seq_len - 1),
        LSTM(150),
        Dense(total_words, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
lstm_model = create_lstm_model()
lstm_model.fit(X, y, epochs=20, verbose=1, callbacks=[EarlyStopping(patience=2)])



Epoch 1/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 39ms/step - accuracy: 0.0542 - loss: 6.4545
Epoch 2/20


  current = self.get_monitor_value(logs)


[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 38ms/step - accuracy: 0.1209 - loss: 5.4360
Epoch 3/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 39ms/step - accuracy: 0.1493 - loss: 4.9890
Epoch 4/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 39ms/step - accuracy: 0.1715 - loss: 4.6551
Epoch 5/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 39ms/step - accuracy: 0.1880 - loss: 4.3839
Epoch 6/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 40ms/step - accuracy: 0.2082 - loss: 4.0955
Epoch 7/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 39ms/step - accuracy: 0.2320 - loss: 3.8497
Epoch 8/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 41ms/step - accuracy: 0.2599 - loss: 3.6176
Epoch 9/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 39ms/step - accuracy: 0.2901 - loss: 3.4019
Epoch 10/20

<keras.src.callbacks.history.History at 0x7a35a2bb9010>

In [8]:
def create_gru_model():
    model = Sequential([
        Embedding(total_words, 100, input_length=max_seq_len - 1),
        GRU(150),
        Dense(total_words, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

gru_model = create_gru_model()
gru_model.fit(X, y, epochs=20, verbose=1, callbacks=[EarlyStopping(patience=2)])

Epoch 1/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 41ms/step - accuracy: 0.0645 - loss: 6.3482
Epoch 2/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 43ms/step - accuracy: 0.1365 - loss: 5.2046
Epoch 3/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 42ms/step - accuracy: 0.1649 - loss: 4.7608
Epoch 4/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 43ms/step - accuracy: 0.1901 - loss: 4.3625
Epoch 5/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 42ms/step - accuracy: 0.2190 - loss: 4.0122
Epoch 6/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 42ms/step - accuracy: 0.2503 - loss: 3.6927
Epoch 7/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 42ms/step - accuracy: 0.2923 - loss: 3.3795
Epoch 8/20
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 43ms/step - accuracy: 0.3346 - loss: 3.1115


<keras.src.callbacks.history.History at 0x7a35a2a3a390>

In [10]:
def predict_next_word(model, seed_text, n_words=1):
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        output_word = tokenizer.index_word[np.argmax(predicted)]
        seed_text += " " + output_word
    return seed_text

In [11]:
print(predict_next_word(lstm_model, "it is a truth"))
print(predict_next_word(gru_model, "i hope you"))

it is a truth universally
i hope you will


In [13]:
print(predict_next_word(lstm_model, "what does it "))
print(predict_next_word(gru_model, "should i "))

what does it  you
should i  believe
