<a href="https://colab.research.google.com/github/rishisg/ChatGPT/blob/main/RNN_ASSIGNMENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import pickle

print("Step 1: Loading the dataset...")
# Load the dataset into a Pandas DataFrame
df = pd.read_csv('twitter_training.csv', header=None, names=['ID', 'Topic', 'Sentiment', 'Text'])

# Display the first few rows of the dataset
print("\nFirst few rows of the dataset:")
print(df.head())

# Check for missing values in the dataset
print("\nChecking for missing values in the dataset:")
print(df.isnull().sum())

print("\nStep 2: Cleaning the dataset...")
# Filter out irrelevant sentiments ('Irrelevant')
df = df[df['Sentiment'].isin(['Positive', 'Negative', 'Neutral'])]

# Replace missing values in the 'Text' column with empty strings
df['Text'] = df['Text'].fillna("")
print("Dataset cleaned. Irrelevant sentiments removed and missing values handled.")

print("\nStep 3: Encoding sentiment labels...")
# Encode sentiment labels ('Positive', 'Negative', 'Neutral') into numerical values
label_encoder = LabelEncoder()
df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])

# Map encoded labels back to original for reference
sentiment_mapping = {i: label for i, label in enumerate(label_encoder.classes_)}
print("Sentiment Mapping (Encoded to Original):")
print(sentiment_mapping)

print("\nStep 4: Tokenizing and padding text data...")
# Define tokenizer parameters
max_vocab_size = 5000  # Limit vocabulary size
max_len = 100          # Maximum sequence length

# Tokenize the text
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(df['Text'])
sequences = tokenizer.texts_to_sequences(df['Text'])

# Pad sequences to ensure uniform input length
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
print(f"Text tokenized and padded. Vocabulary size: {max_vocab_size}, Max sequence length: {max_len}")

print("\nStep 5: Splitting data into training and testing sets...")
# Features (text) and labels (sentiment)
X = padded_sequences
y = df['Sentiment']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Ensure data types are correct
X_train = X_train.astype('int32')
X_test = X_test.astype('int32')
y_train = y_train.astype('int32')
y_test = y_test.astype('int32')

print("Data split into training and testing sets.")
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

print("\nStep 6: Building the RNN model...")
# Build the RNN model
model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')  # 3 classes: Positive, Negative, Neutral
])

# Compile the model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Print model summary
print("Model architecture summary:")
model.summary()

print("\nStep 7: Training the model...")
# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping]
)

print("\nStep 8: Evaluating the model...")
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.2f}, Test Accuracy: {accuracy:.2f}")

# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))

print("\nStep 9: Saving the model and tokenizer...")
# Save the trained model
model.save('sentiment_rnn_model.h5')
print("Model saved as 'sentiment_rnn_model.h5'.")

# Save the tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved as 'tokenizer.pkl'.")

print("\nStep 10: Loading the saved model and making predictions...")
# Load the saved model
from tensorflow.keras.models import load_model

loaded_model = load_model('sentiment_rnn_model.h5')
print("Model loaded successfully.")

# Load the tokenizer
with open('tokenizer.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)
print("Tokenizer loaded successfully.")

# Function to predict sentiment for new text
def predict_sentiment(text):
    sequence = loaded_tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = loaded_model.predict(padded)
    predicted_class = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(predicted_class)[0]

# Test the function with an example
example_text = "I love this game!"
predicted_sentiment = predict_sentiment(example_text)
print(f"\nExample Prediction:\nText: '{example_text}'\nPredicted Sentiment: {predicted_sentiment}")

Step 1: Loading the dataset...

First few rows of the dataset:
     ID        Topic Sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                Text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  

Checking for missing values in the dataset:
ID             0
Topic          0
Sentiment      0
Text         686
dtype: int64

Step 2: Cleaning the dataset...
Dataset cleaned. Irrelevant sentiments removed and missing values handled.

Step 3: Encoding sentiment labels...
Sentiment Mapping (Encoded to Original):
{0: 'Negative', 1: 'Neutral', 2: 'Positive'}

Step 4: Tokenizing and padding text data...




Step 7: Training the model...
Epoch 1/10
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 271ms/step - accuracy: 0.3552 - loss: 1.0970 - val_accuracy: 0.3375 - val_loss: 1.0958
Epoch 2/10
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 267ms/step - accuracy: 0.3638 - loss: 1.0956 - val_accuracy: 0.3654 - val_loss: 1.0952
Epoch 3/10
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 268ms/step - accuracy: 0.3677 - loss: 1.0951 - val_accuracy: 0.3654 - val_loss: 1.0952
Epoch 4/10
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 269ms/step - accuracy: 0.3601 - loss: 1.0960 - val_accuracy: 0.3654 - val_loss: 1.0953
Epoch 5/10
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 268ms/step - accuracy: 0.3661 - loss: 1.0953 - val_accuracy: 0.3654 - val_loss: 1.0951
Epoch 6/10
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 270ms/step - accuracy: 0.3669 - loss: 1.0950 - val_accuracy:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
              precision    recall  f1-score   support

    Negative       0.37      1.00      0.54      4509
     Neutral       0.00      0.00      0.00      3664
    Positive       0.00      0.00      0.00      4166

    accuracy                           0.37     12339
   macro avg       0.12      0.33      0.18     12339
weighted avg       0.13      0.37      0.20     12339


Step 9: Saving the model and tokenizer...
Model saved as 'sentiment_rnn_model.h5'.
Tokenizer saved as 'tokenizer.pkl'.

Step 10: Loading the saved model and making predictions...
Model loaded successfully.
Tokenizer loaded successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step

Example Prediction:
Text: 'I love this game!'
Predicted Sentiment: Negative
