<a href="https://colab.research.google.com/github/suhashgampa1/Deep_Learning/blob/main/LSTM_TXT_PRED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the training corpus and lower-case it so the character vocabulary stays
# small. The encoding is stated explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm for other corpora.
with open('/content/1661-0.txt', 'r', encoding='utf-8') as file:
    text = file.read().lower()

# Step 1: Preprocessing the text--NLP
# Character-level tokenizer: every distinct character gets an integer id.
tokenizer = Tokenizer(char_level=True)
# Pass the corpus as a one-element list (one document), mirroring the
# texts_to_sequences call below, instead of a bare string that is iterated
# one character at a time.
tokenizer.fit_on_texts([text])
char_index = tokenizer.word_index
index_char = tokenizer.index_word
# +1 because the tokenizer's ids start at 1; id 0 is never assigned to a
# character and is what pad_sequences uses as filler.
vocab_size = len(char_index) + 1

# Encode the whole corpus as one flat list of character ids.
sequences = tokenizer.texts_to_sequences([text])[0]

# Create input-output pairs: each sample is `seq_length` consecutive ids and
# its target is the id that immediately follows them.
seq_length = 40
if len(sequences) <= seq_length:
    raise ValueError(
        f"Corpus has {len(sequences)} characters; need more than "
        f"seq_length={seq_length} to build any training pair."
    )

encoded = np.asarray(sequences)
# sliding_window_view yields every length-`seq_length` window; the final window
# has no following character to predict, so it is dropped. `.copy()` turns the
# read-only strided view into an ordinary writable array.
X = np.lib.stride_tricks.sliding_window_view(encoded, seq_length)[:-1].copy()
y = to_categorical(encoded[seq_length:], num_classes=vocab_size)

# Step 2: Building the LSTM model
# An explicit Input replaces Embedding's `input_length` argument (deprecated in
# Keras 3) while still letting model.summary() report concrete shapes.
model = Sequential([
    tf.keras.Input(shape=(seq_length,)),
    Embedding(input_dim=vocab_size, output_dim=50),
    LSTM(150, return_sequences=False),
    # Softmax turns the outputs into a probability distribution over characters.
    Dense(vocab_size, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Step 3: Training the model
model.fit(X, y, epochs=40, batch_size=64)



Epoch 1/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 7ms/step - accuracy: 0.3496 - loss: 2.2597
Epoch 2/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 6ms/step - accuracy: 0.4897 - loss: 1.7048
Epoch 3/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 7ms/step - accuracy: 0.5258 - loss: 1.5732
Epoch 4/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 7ms/step - accuracy: 0.5456 - loss: 1.5041
Epoch 5/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 7ms/step - accuracy: 0.5562 - loss: 1.4637
Epoch 6/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 7ms/step - accuracy: 0.5653 - loss: 1.4319
Epoch 7/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 6ms/step - accuracy: 0.5716 - loss: 1.4039
Epoch 8/40
[1m9092/9092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 6ms/step - accuracy: 0.5778 - loss: 1.3809
Epoch 9/40
[1m9

<keras.src.callbacks.history.History at 0x7e6c582024e0>

In [None]:
def generate_text(seed_text, num_chars):
    """Extend ``seed_text`` by ``num_chars`` greedily predicted characters.

    Relies on the module-level ``tokenizer``, ``model``, ``seq_length`` and
    ``index_char`` created by the training cell.

    Args:
        seed_text: Prompt to continue. It is returned unchanged at the start
            of the result.
        num_chars: Number of characters to append; values <= 0 append nothing.

    Returns:
        ``seed_text`` followed by the generated characters.
    """
    # Encode the prompt once. Each predicted id is appended directly below,
    # so the growing string never has to be re-tokenized on every step.
    tokens = tokenizer.texts_to_sequences([seed_text])[0]
    generated = []
    for _ in range(num_chars):
        # Only the most recent `seq_length` ids are fed to the model; shorter
        # histories are padded with 0 by pad_sequences.
        window = pad_sequences([tokens[-seq_length:]], maxlen=seq_length, truncating='pre')
        probs = model.predict(window, verbose=0)[0]
        # Id 0 is the padding slot and has no entry in index_char, so it is
        # excluded from the argmax to avoid a KeyError.
        next_index = int(np.argmax(probs[1:])) + 1
        generated.append(index_char[next_index])
        tokens.append(next_index)
    return seed_text + "".join(generated)

In [None]:
# Prompt the trained model with a short seed and ask for 200 more characters.
seed_text = "Sherlock Holmes"
generated_text = generate_text(seed_text=seed_text, num_chars=200)
# The explicit single-space separator is print's default, so the header line
# is followed by a space and then the generated text.
print("Generated text:\n", generated_text, sep=" ")

Generated text:
 Sherlock Holmes with a street. “i am so much as i have no doubt that i have not some street. i should be a street. the man who was a street. the man who was a street. the man who was a street. the man who was a stre
