In [1]:
# ============================================
# LEVEL 2: COMPLETE LSTM SENTIMENT ANALYSIS
# Using EXACT preprocessing and split from requirements
# ============================================

import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Set seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("=" * 70)
print("LEVEL 2: LSTM SENTIMENT ANALYSIS")
print("=" * 70)
print("Includes RNN from Level 1 and adds LSTM for comparison")

# ======================
# 1. SETUP
# ======================
BASE_DIR = os.getcwd()
DATASETS_DIR = os.path.join(BASE_DIR, 'Datasets')
MODELS_DIR = os.path.join(BASE_DIR, 'saved_models')
os.makedirs(MODELS_DIR, exist_ok=True)

print(f"Working directory: {BASE_DIR}")
print(f"Datasets directory: {DATASETS_DIR}")

# ======================
# 2. LOAD DATASET
# ======================
print("\n" + "="*50)
print("LOADING DATASET")
print("="*50)

# Look for dataset file
dataset_files = [f for f in os.listdir(DATASETS_DIR) 
                if 'training' in f.lower() and f.endswith('.csv')]

if not dataset_files:
    print("‚ùå ERROR: Dataset file not found in Datasets folder!")
    print("Please make sure 'training.1600000.processed.noemoticon.csv' is in Datasets folder")
    raise FileNotFoundError("Dataset file not found")

file_path = os.path.join(DATASETS_DIR, dataset_files[0])
print(f"‚úÖ Found dataset: {file_path}")

try:
    # Load dataset as per requirements
    dataset = pd.read_csv(file_path, engine="python", header=None)
    print("‚úÖ Dataset loaded successfully")
    
    # Reset headers for the dataset
    dataset.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']
    
    # Drop useless columns vertically
    df = dataset.drop(['id', 'date', 'query', 'user_id'], axis=1)
    
    # Check Label Categories
    print("\nüìä Label distribution:")
    print(df['sentiment'].value_counts())
    
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    raise

# ======================
# 3. TEXT PREPROCESSING (EXACT from requirements)
# ======================
print("\n" + "="*50)
print("TEXT PREPROCESSING")
print("="*50)

# EXACT preprocessing function from requirements
text_cleaning_re = '@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+'

def preprocessing(text, stem=False):
    """EXACT preprocessing function from requirements"""
    text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        tokens.append(token)  # Save word directly (no stopwords removal as in requirements)
    return ' '.join(tokens)

# Clean each row of text in the text column of the dataset
df.text = df.text.apply(lambda x: preprocessing(x))

print("‚úÖ Text preprocessing completed")

# ======================
# 4. TRAIN-TEST SPLIT (EXACT from requirements)
# ======================
print("\n" + "="*50)
print("TRAIN-TEST SPLIT")
print("="*50)

MAX_WORDS = 100000  # Maximum vocabulary size 100,000
MAX_SEQ_LENGTH = 30  # Maximum sequence length 30

# Split train and test sets (EXACT from requirements)
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=666, shuffle=True)

print(f"üìö Training set size: {len(train_dataset):,}")
print(f"üß™ Test set size: {len(test_dataset):,}")

# ======================
# 5. TOKENIZATION (EXACT from requirements)
# ======================
print("\n" + "="*50)
print("TOKENIZATION")
print("="*50)

# Tokenization (EXACT from requirements)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_dataset.text)

# Each word corresponds to an index
word_index = tokenizer.word_index
# Training set vocabulary size
vocab_size = len(word_index) + 1
print(f"üìñ Vocabulary size: {vocab_size:,}")

# Fix the length of each text
# Convert words to sequences (EXACT from requirements)
x_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_dataset.text),
    maxlen=MAX_SEQ_LENGTH)

x_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_dataset.text),
    maxlen=MAX_SEQ_LENGTH)

# Perform LabelEncoding on label categories, encode categories into continuous numbers
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_dataset.sentiment.tolist())
y_test = encoder.fit_transform(test_dataset.sentiment.tolist())
y_train = y_train.reshape(-1, 1)  # Reshape
y_test = y_test.reshape(-1, 1)

print(f"\nüìê Data shapes (EXACT from requirements):")
print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"x_test:  {x_test.shape}, y_test: {y_test.shape}")

# For validation split
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=42
)

print(f"x_val:   {x_val.shape}, y_val: {y_val.shape}")

# ======================
# 6. BUILD RNN MODEL (Level 1)
# ======================
print("\n" + "="*50)
print("REBUILDING RNN MODEL (Level 1)")
print("="*50)

def build_rnn_model():
    """Build RNN model for Level 1"""
    model = keras.Sequential([
        layers.Embedding(
            input_dim=MAX_WORDS,
            output_dim=128,
            input_length=MAX_SEQ_LENGTH,
            mask_zero=True
        ),
        layers.SimpleRNN(128, dropout=0.2, return_sequences=False),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Create and train RNN model
print("üöÄ Training RNN model...")
rnn_model = build_rnn_model()
rnn_history = rnn_model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=128,
    verbose=1
)

print("‚úÖ RNN training completed!")

# ======================
# 7. BUILD LSTM MODEL (Level 2)
# ======================
print("\n" + "="*50)
print("BUILDING LSTM MODEL (Level 2)")
print("="*50)

def build_lstm_model():
    """Build LSTM model for Level 2"""
    model = keras.Sequential([
        # Embedding layer
        layers.Embedding(
            input_dim=MAX_WORDS,
            output_dim=128,
            input_length=MAX_SEQ_LENGTH,
            mask_zero=True
        ),
        
        # LSTM layer
        layers.LSTM(
            128,
            dropout=0.2,
            recurrent_dropout=0.2,
            return_sequences=False
        ),
        
        # Dense layer
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        
        # Output layer
        layers.Dense(1, activation='sigmoid')
    ])
    
    # Compile model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Create LSTM model
lstm_model = build_lstm_model()
print("LSTM Model Summary:")
lstm_model.summary()

# ======================
# 8. TRAIN LSTM MODEL
# ======================
print("\n" + "="*50)
print("TRAINING LSTM MODEL")
print("="*50)

# Train LSTM model
print("üöÄ Training LSTM model...")
lstm_history = lstm_model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=128,
    verbose=1
)

print("‚úÖ LSTM training completed!")

# ======================
# 9. PLOT TRAINING CURVES FOR BOTH MODELS
# ======================
print("\n" + "="*50)
print("TRAINING CURVES COMPARISON")
print("="*50)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# RNN Loss
axes[0, 0].plot(rnn_history.history['loss'], label='Training Loss', linewidth=2)
axes[0, 0].plot(rnn_history.history['val_loss'], label='Validation Loss', linewidth=2)
axes[0, 0].set_title('RNN - Loss Curve', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# RNN Accuracy
axes[0, 1].plot(rnn_history.history['accuracy'], label='Training Accuracy', linewidth=2)
axes[0, 1].plot(rnn_history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
axes[0, 1].set_title('RNN - Accuracy Curve', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# LSTM Loss
axes[1, 0].plot(lstm_history.history['loss'], label='Training Loss', linewidth=2)
axes[1, 0].plot(lstm_history.history['val_loss'], label='Validation Loss', linewidth=2)
axes[1, 0].set_title('LSTM - Loss Curve', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('Loss')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# LSTM Accuracy
axes[1, 1].plot(lstm_history.history['accuracy'], label='Training Accuracy', linewidth=2)
axes[1, 1].plot(lstm_history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
axes[1, 1].set_title('LSTM - Accuracy Curve', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Accuracy')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# ======================
# 10. EVALUATE BOTH MODELS
# ======================
print("\n" + "="*50)
print("MODEL EVALUATION & COMPARISON")
print("="*50)

def evaluate_model(model, model_name):
    """Evaluate a model and return metrics"""
    # Make predictions
    y_pred_proba = model.predict(x_test)
    y_pred = (y_pred_proba > 0.5).astype(int)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\nüìä {model_name} Evaluation:")
    print(f"  Test Accuracy:  {accuracy:.4f}")
    print(f"  Test F1-Score:  {f1:.4f}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.title(f'{model_name} - Confusion Matrix', fontsize=14, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()
    
    return accuracy, f1

# Evaluate RNN
rnn_accuracy, rnn_f1 = evaluate_model(rnn_model, "RNN")

# Evaluate LSTM
lstm_accuracy, lstm_f1 = evaluate_model(lstm_model, "LSTM")

# ======================
# 11. MODEL COMPARISON
# ======================
print("\n" + "="*50)
print("RNN vs LSTM COMPARISON")
print("="*50)

print("\nüìà Performance Comparison:")
print("-" * 50)
print(f"{'Metric':<15} {'RNN':<10} {'LSTM':<10} {'Difference':<12}")
print("-" * 50)
print(f"{'Accuracy':<15} {rnn_accuracy:.4f}     {lstm_accuracy:.4f}     {lstm_accuracy - rnn_accuracy:+.4f}")
print(f"{'F1-Score':<15} {rnn_f1:.4f}     {lstm_f1:.4f}     {lstm_f1 - rnn_f1:+.4f}")
print("-" * 50)

print("\nüîç Architectural Comparison:")
print("1. RNN (SimpleRNN):")
print("   - Simple recurrent connections")
print("   - Prone to vanishing gradient problem")
print("   - Less parameters")
print("   - Faster to train")

print("\n2. LSTM (Long Short-Term Memory):")
print("   - Has memory cells and gates (input, forget, output)")
print("   - Solves vanishing gradient problem")
print("   - Better at capturing long-term dependencies")
print("   - More parameters, slower to train")

print("\nüìä Summary:")
if lstm_accuracy > rnn_accuracy:
    print("‚úÖ LSTM performs better than RNN on this sentiment analysis task.")
    print("   This is expected due to LSTM's ability to capture long-range dependencies.")
else:
    print("‚ö†Ô∏è  RNN performs similarly or better than LSTM.")
    print("   This could be because sentiment analysis often relies on short-range dependencies.")

# ======================
# 12. LIVE PREDICTION DEMO
# ======================
print("\n" + "="*50)
print("LIVE PREDICTION DEMONSTRATION")
print("="*50)

def predict_single_text(text):
    """Predict sentiment for a single text using both models"""
    # Preprocess using the EXACT preprocessing function
    cleaned_text = preprocessing(text)
    
    # Tokenize
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequence,
        maxlen=MAX_SEQ_LENGTH,
        padding='post',
        truncating='post'
    )
    
    # Predict with RNN
    rnn_pred = rnn_model.predict(padded, verbose=0)[0][0]
    rnn_sentiment = "positive" if rnn_pred > 0.5 else "negative"
    
    # Predict with LSTM
    lstm_pred = lstm_model.predict(padded, verbose=0)[0][0]
    lstm_sentiment = "positive" if lstm_pred > 0.5 else "negative"
    
    return {
        'RNN': {'sentiment': rnn_sentiment, 'score': float(rnn_pred)},
        'LSTM': {'sentiment': lstm_sentiment, 'score': float(lstm_pred)}
    }

# Test with the teacher's example
test_text = "I like reading."
print(f"\nüß™ Testing with teacher's example: '{test_text}'")
predictions = predict_single_text(test_text)

for model_name, pred in predictions.items():
    print(f"{model_name} prediction result: {pred['sentiment']}, score: {pred['score']:.8f}")

# ======================
# 13. SAVE MODELS
# ======================
print("\n" + "="*50)
print("SAVING MODELS")
print("="*50)

# Save RNN model
rnn_model_path = os.path.join(MODELS_DIR, 'rnn_model.h5')
rnn_model.save(rnn_model_path)
print(f"‚úÖ RNN model saved to: {rnn_model_path}")

# Save LSTM model
lstm_model_path = os.path.join(MODELS_DIR, 'lstm_model.h5')
lstm_model.save(lstm_model_path)
print(f"‚úÖ LSTM model saved to: {lstm_model_path}")

# Save tokenizer (overwrite if exists)
import pickle
tokenizer_path = os.path.join(MODELS_DIR, 'tokenizer.pickle')
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
print(f"‚úÖ Tokenizer saved to: {tokenizer_path}")

print("\n" + "="*70)
print("LEVEL 2 COMPLETED: RNN & LSTM MODELS TRAINED AND SAVED")
print("="*70)
print(f"üìä RNN Accuracy: {rnn_accuracy:.4f}, F1: {rnn_f1:.4f}")
print(f"üìä LSTM Accuracy: {lstm_accuracy:.4f}, F1: {lstm_f1:.4f}")
print(f"üìÅ Models saved in: {MODELS_DIR}")
print("\nFor teacher's demonstration with 'I like reading.':")
print("Expected output: RNN prediction result: positive, score: 0.61676633")
print("Expected output: LSTM prediction result: positive, score: 0.7692368")

2025-12-03 22:06:39.763578: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


LEVEL 2: LSTM SENTIMENT ANALYSIS
Includes RNN from Level 1 and adds LSTM for comparison
Working directory: /home/sakhawat/workspace/Python/ml/Emotion-Analysis
Datasets directory: /home/sakhawat/workspace/Python/ml/Emotion-Analysis/Datasets

LOADING DATASET
‚úÖ Found dataset: /home/sakhawat/workspace/Python/ml/Emotion-Analysis/Datasets/training.1600000.processed.noemoticon.csv
‚ùå Error loading dataset: 'utf-8' codec can't decode bytes in position 7970-7971: invalid continuation byte


UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 7970-7971: invalid continuation byte