<a href="https://colab.research.google.com/github/rishisg/ChatGPT/blob/main/LSTM_ASSIGNMENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

print("Step 1: Setup and Data Preparation")
# Load the dataset
with open('LSTM DATA.txt', 'r', encoding='utf-8') as file:
    data = file.read()

print("\nDataset loaded successfully.")
print(f"First 200 characters of the dataset:\n{data[:200]}")

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
total_words = len(tokenizer.word_index) + 1  # Add 1 for padding token

print(f"\nTotal unique words in the dataset: {total_words}")
print("Tokenization completed.")

# Generate input sequences
input_sequences = []
for line in data.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences to ensure uniform length
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

print(f"\nMax sequence length: {max_sequence_len}")
print("Input sequences generated and padded.")

# Split into predictors (X) and labels (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Convert labels to categorical format
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

print(f"\nShape of X: {X.shape}")
print(f"Shape of y: {y.shape}")
print("Data preprocessing completed.")

print("\nStep 2: LSTM Model Architecture")
# Define the LSTM model architecture
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len-1),
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(total_words, activation='softmax')
])

print("\nModel architecture summary:")
model.summary()

print("\nStep 3: Building the LSTM Model")
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\nModel compiled successfully.")

print("\nStep 4: Model Training")
# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X, y,
    epochs=50,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping]
)

print("\nModel training completed.")



Step 1: Setup and Data Preparation

Dataset loaded successfully.
First 200 characters of the dataset:
﻿The Project Gutenberg eBook of Pride and Prejudice
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions


Total unique words in the dataset: 7561
Tokenization completed.

Max sequence length: 25
Input sequences generated and padded.

Shape of X: (121111, 24)
Shape of y: (121111, 7561)
Data preprocessing completed.

Step 2: LSTM Model Architecture

Model architecture summary:





Step 3: Building the LSTM Model

Model compiled successfully.

Step 4: Model Training
Epoch 1/50
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 235ms/step - accuracy: 0.0343 - loss: 6.7754 - val_accuracy: 0.0496 - val_loss: 6.3628
Epoch 2/50
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 224ms/step - accuracy: 0.0583 - loss: 6.0743 - val_accuracy: 0.0644 - val_loss: 6.1716
Epoch 3/50
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 235ms/step - accuracy: 0.0812 - loss: 5.7909 - val_accuracy: 0.0883 - val_loss: 6.0591
Epoch 4/50
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 237ms/step - accuracy: 0.0998 - loss: 5.6061 - val_accuracy: 0.0924 - val_loss: 6.0043
Epoch 5/50
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 235ms/step - accuracy: 0.1131 - loss: 5.4490 - val_accuracy: 0.1074 - val_loss: 5.9547
Epoch 6/50
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 234m

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
import numpy as np

# Step 1: Define the Model
model = Sequential([
    Embedding(input_dim=1000, output_dim=64, input_length=100),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification example
])

# Step 2: Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Prepare Training Data
X_train = np.random.randint(0, 1000, size=(1000, 100))  # Random integer sequences
y_train = np.random.randint(0, 2, size=(1000,))        # Binary labels

# Train the Model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Step 4: Prepare Test Data
X_test = np.random.randint(0, 1000, size=(200, 100))  # Random integer sequences
y_test = np.random.randint(0, 2, size=(200,))        # Binary labels

# Step 5: Evaluate the Model
print("\nStep 5: Model Evaluation")
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Loss: {loss:.2f}, Test Accuracy: {accuracy:.2f}")

Epoch 1/5




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.5377 - loss: 0.6920 - val_accuracy: 0.4800 - val_loss: 0.6945
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9879 - loss: 0.5657 - val_accuracy: 0.4800 - val_loss: 0.7054
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 1.0000 - loss: 0.3273 - val_accuracy: 0.4850 - val_loss: 0.7455
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 1.0000 - loss: 0.0573 - val_accuracy: 0.4250 - val_loss: 0.8173
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 1.0000 - loss: 0.0120 - val_accuracy: 0.4400 - val_loss: 0.8458

Step 5: Model Evaluation
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5096 - loss: 0.8053 

Test Loss: 0.78, Test Accuracy: 0.55


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Step 1: Load and Preprocess Data
texts = [
    "im getting on borderlands and i will murder you all",
    "So I spent a few hours making something for fun",
    "Rock-Hard La Varlope, RARE & POWERFUL",
    "that was the first borderlands session in a long time where i actually had a really satisfying combat experience"
]
labels = [1, 1, 0, 1]  # Example binary labels (1 = Positive, 0 = Neutral/Negative)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

X = pad_sequences(sequences, maxlen=100)
y = np.array(labels)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Define the Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Fine-tune the Model
print("\nStep 6: Fine-tuning and Optimization")
print("Fine-tuning the model...")

# Adjust the learning rate
model.optimizer.learning_rate = 0.001

# Fine-tune the model on the training data
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val)
)

# Step 4: Evaluate the Fine-tuned Model
print("\nEvaluating the fine-tuned model...")
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss:.2f}, Validation Accuracy: {val_accuracy:.2f}")


Step 6: Fine-tuning and Optimization
Fine-tuning the model...
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.6667 - loss: 0.6893 - val_accuracy: 0.0000e+00 - val_loss: 0.7009
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 635ms/step - accuracy: 0.3333 - loss: 0.6925 - val_accuracy: 0.0000e+00 - val_loss: 0.7015
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.6667 - loss: 0.6836 - val_accuracy: 0.0000e+00 - val_loss: 0.7017
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - accuracy: 1.0000 - loss: 0.6824 - val_accuracy: 0.0000e+00 - val_loss: 0.7007
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step - accuracy: 1.0000 - loss: 0.6764 - val_accuracy: 0.0000e+00 - val_loss: 0.6986
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 1.0000 - loss: 0.6577 - 

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

print("\nStep 7: Next Word Prediction Function")
# Function to predict the next word

# Step 1: Load and preprocess the data
texts = [
    "im getting on borderlands and i will murder you all",
    "So I spent a few hours making something for fun",
    "Rock-Hard La Varlope, RARE & POWERFUL",
    "that was the first borderlands session in a long time where i actually had a really satisfying combat experience"
]

# Tokenize the texts
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary to 5000 words
tokenizer.fit_on_texts(texts)

# Determine max_sequence_len (maximum length of sequences in the dataset)
max_sequence_len = max(len(seq.split()) for seq in texts)

# Example model (replace this with your trained model)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_len - 1),
    LSTM(64, return_sequences=False),
    Dense(5000, activation='softmax')  # Output layer for vocabulary size
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 2: Define the prediction function
def predict_next_word(seed_text, num_words):
    """
    Predict the next words given a seed text.

    Args:
        seed_text (str): The input text to generate predictions from.
        num_words (int): Number of words to predict.

    Returns:
        str: The predicted sentence.
    """
    for _ in range(num_words):
        # Convert the seed text into a sequence of tokens
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Pad the sequence to match the model's input length
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Predict the probabilities for the next word
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted_probs, axis=1)[0]

        # Map the predicted index back to a word
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break

        # Append the predicted word to the seed text
        seed_text += " " + output_word

    return seed_text

# Step 3: Test the prediction function
test_sentence = "I love"
predicted_sentence = predict_next_word(test_sentence, num_words=3)
print(f"\nPredicted sentence for '{test_sentence}': {predicted_sentence}")


Step 7: Next Word Prediction Function

Predicted sentence for 'I love': I love   
