In [1]:
# ============================================
# LEVEL 1: COMPLETE RNN SENTIMENT ANALYSIS
# Using EXACT preprocessing and split from requirements
# ============================================

import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Set seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("=" * 70)
print("LEVEL 1: RNN SENTIMENT ANALYSIS")
print("=" * 70)

# ======================
# 1. SETUP
# ======================
BASE_DIR = os.getcwd()
DATASETS_DIR = os.path.join(BASE_DIR, 'Datasets')
MODELS_DIR = os.path.join(BASE_DIR, 'saved_models')
os.makedirs(MODELS_DIR, exist_ok=True)

print(f"Working directory: {BASE_DIR}")
print(f"Datasets directory: {DATASETS_DIR}")

# ======================
# 2. LOAD DATASET
# ======================
print("\n" + "="*50)
print("LOADING DATASET")
print("="*50)

# Look for dataset file
dataset_files = [f for f in os.listdir(DATASETS_DIR) 
                if 'training' in f.lower() and f.endswith('.csv')]

if not dataset_files:
    print("‚ùå ERROR: Dataset file not found in Datasets folder!")
    print("Please make sure 'training.1600000.processed.noemoticon.csv' is in Datasets folder")
    raise FileNotFoundError("Dataset file not found")

file_path = os.path.join(DATASETS_DIR, dataset_files[0])
print(f"‚úÖ Found dataset: {file_path}")

try:
    # Load dataset as per requirements
    dataset = pd.read_csv(file_path, engine="python", header=None)
    print("‚úÖ Dataset loaded successfully")
    
    # Reset headers for the dataset
    dataset.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']
    
    # Drop useless columns vertically
    df = dataset.drop(['id', 'date', 'query', 'user_id'], axis=1)
    
    # Check Label Categories
    print("\nüìä Label distribution:")
    print(df['sentiment'].value_counts())
    
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    raise

# ======================
# 3. TEXT PREPROCESSING (EXACT from requirements)
# ======================
print("\n" + "="*50)
print("TEXT PREPROCESSING")
print("="*50)

# EXACT preprocessing function from requirements
text_cleaning_re = '@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+'

def preprocessing(text, stem=False):
    """EXACT preprocessing function from requirements"""
    text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        tokens.append(token)  # Save word directly (no stopwords removal as in requirements)
    return ' '.join(tokens)

# Clean each row of text in the text column of the dataset
df.text = df.text.apply(lambda x: preprocessing(x))

# Show cleaned sample
print("‚úÖ Text preprocessing completed")
print(f"\nüìù Cleaned sample (df.text[2]):")
print(df.text.iloc[2])

# ======================
# 4. TRAIN-TEST SPLIT (EXACT from requirements)
# ======================
print("\n" + "="*50)
print("TRAIN-TEST SPLIT")
print("="*50)

MAX_WORDS = 100000  # Maximum vocabulary size 100,000
MAX_SEQ_LENGTH = 30  # Maximum sequence length 30

# Split train and test sets (EXACT from requirements)
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=666, shuffle=True)

print(f"üìö Training set size: {len(train_dataset):,}")
print(f"üß™ Test set size: {len(test_dataset):,}")

# ======================
# 5. TOKENIZATION (EXACT from requirements)
# ======================
print("\n" + "="*50)
print("TOKENIZATION")
print("="*50)

# Tokenization (EXACT from requirements)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_dataset.text)

# Each word corresponds to an index
word_index = tokenizer.word_index
# Training set vocabulary size
vocab_size = len(word_index) + 1
print(f"üìñ Vocabulary size: {vocab_size:,}")

# Fix the length of each text
# Convert words to sequences (EXACT from requirements)
x_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_dataset.text),
    maxlen=MAX_SEQ_LENGTH)

x_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_dataset.text),
    maxlen=MAX_SEQ_LENGTH)

# Perform LabelEncoding on label categories, encode categories into continuous numbers
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_dataset.sentiment.tolist())
y_test = encoder.fit_transform(test_dataset.sentiment.tolist())
y_train = y_train.reshape(-1, 1)  # Reshape
y_test = y_test.reshape(-1, 1)

print(f"\nüìê Data shapes (EXACT from requirements):")
print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"x_test:  {x_test.shape}, y_test: {y_test.shape}")

# For validation split
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=42
)

print(f"x_val:   {x_val.shape}, y_val: {y_val.shape}")

# ======================
# 6. BUILD RNN MODEL
# ======================
print("\n" + "="*50)
print("BUILDING RNN MODEL")
print("="*50)

def build_rnn_model():
    """Build RNN model for Level 1"""
    model = keras.Sequential([
        # Embedding layer
        layers.Embedding(
            input_dim=MAX_WORDS,
            output_dim=128,
            input_length=MAX_SEQ_LENGTH,
            mask_zero=True
        ),
        
        # Simple RNN layer
        layers.SimpleRNN(
            128,
            dropout=0.2,
            recurrent_dropout=0.2,
            return_sequences=False
        ),
        
        # Dense layer
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        
        # Output layer
        layers.Dense(1, activation='sigmoid')
    ])
    
    # Compile model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Create model
rnn_model = build_rnn_model()
print("RNN Model Summary:")
rnn_model.summary()

# ======================
# 7. TRAIN RNN MODEL
# ======================
print("\n" + "="*50)
print("TRAINING RNN MODEL")
print("="*50)

# Training parameters
EPOCHS = 10
BATCH_SIZE = 128

# Callbacks
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1
    )
]

# Train model
print("üöÄ Starting training...")
history = rnn_model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

print("‚úÖ Training completed!")

# ======================
# 8. PLOT TRAINING CURVES
# ======================
print("\n" + "="*50)
print("TRAINING CURVES")
print("="*50)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Loss curve
axes[0].plot(history.history['loss'], label='Training Loss', linewidth=2)
axes[0].plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
axes[0].set_title('RNN - Loss Curve', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Accuracy curve
axes[1].plot(history.history['accuracy'], label='Training Accuracy', linewidth=2)
axes[1].plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
axes[1].set_title('RNN - Accuracy Curve', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# ======================
# 9. EVALUATE MODEL
# ======================
print("\n" + "="*50)
print("MODEL EVALUATION")
print("="*50)

# Make predictions
y_pred_proba = rnn_model.predict(x_test)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"üìä Test Accuracy:  {accuracy:.4f}")
print(f"üéØ Test F1-Score:  {f1:.4f}")

# Classification report
print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (4)']))

# Confusion Matrix
print("\nüìä Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'])
plt.title('RNN Model - Confusion Matrix', fontsize=14, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# ======================
# 10. ERROR ANALYSIS
# ======================
print("\n" + "="*50)
print("ERROR ANALYSIS")
print("="*50)

# Find incorrect predictions
incorrect_idx = np.where(y_pred.flatten() != y_test.flatten())[0]

print(f"üìä Total test samples: {len(y_test):,}")
print(f"‚ùå Incorrect predictions: {len(incorrect_idx):,} ({len(incorrect_idx)/len(y_test):.2%})")

if len(incorrect_idx) > 0:
    print("\nüìù Examples of misclassified texts:")
    for i in incorrect_idx[:3]:
        original_text = test_dataset.text.iloc[i]
        true_sentiment = test_dataset.sentiment.iloc[i]
        pred_sentiment = 4 if y_pred[i][0] == 1 else 0
        
        print(f"\nText: {original_text[:100]}...")
        print(f"True sentiment: {true_sentiment} ({'Positive' if true_sentiment == 4 else 'Negative'})")
        print(f"Predicted sentiment: {pred_sentiment} ({'Positive' if pred_sentiment == 4 else 'Negative'})")
        print(f"Prediction confidence: {y_pred_proba[i][0]:.4f}")
        print("-" * 50)

# ======================
# 11. LIVE PREDICTION DEMO
# ======================
print("\n" + "="*50)
print("LIVE PREDICTION DEMONSTRATION")
print("="*50)

def predict_single_text(text):
    """Predict sentiment for a single text"""
    # Preprocess using the EXACT preprocessing function
    cleaned_text = preprocessing(text)
    
    # Tokenize
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequence,
        maxlen=MAX_SEQ_LENGTH,
        padding='post',
        truncating='post'
    )
    
    # Predict
    prediction = rnn_model.predict(padded, verbose=0)[0][0]
    
    # Convert to original labels (0 or 4)
    if prediction > 0.5:
        sentiment = "positive"
        score = float(prediction)
    else:
        sentiment = "negative"
        score = float(1 - prediction)
    
    return sentiment, float(prediction)

# Test with the teacher's example
test_text = "I like reading."
print(f"\nüß™ Testing with teacher's example: '{test_text}'")
sentiment, score = predict_single_text(test_text)
print(f"RNN prediction result: {sentiment}, score: {score:.8f}")

# Test with more examples
test_cases = [
    "This movie was terrible!",
    "Great service, highly recommended.",
    "I'm not happy with the product.",
    "Absolutely amazing experience!"
]

print("\nüß™ Testing with more examples:")
print("-" * 60)
for text in test_cases:
    sentiment, score = predict_single_text(text)
    print(f"Text: '{text[:40]}{'...' if len(text) > 40 else ''}'")
    print(f"RNN prediction: {sentiment}, score: {score:.8f}")
    print("-" * 40)

# ======================
# 12. SAVE MODEL
# ======================
print("\n" + "="*50)
print("SAVING MODEL")
print("="*50)

# Save RNN model
rnn_model_path = os.path.join(MODELS_DIR, 'rnn_model.h5')
rnn_model.save(rnn_model_path)
print(f"‚úÖ RNN model saved to: {rnn_model_path}")

# Save tokenizer
import pickle
tokenizer_path = os.path.join(MODELS_DIR, 'tokenizer.pickle')
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
print(f"‚úÖ Tokenizer saved to: {tokenizer_path}")

# Save parameters
import json
params = {
    'MAX_SEQ_LENGTH': MAX_SEQ_LENGTH,
    'MAX_WORDS': MAX_WORDS,
    'preprocessing_function': 'preprocessing'
}
params_path = os.path.join(MODELS_DIR, 'params.json')
with open(params_path, 'w') as f:
    json.dump(params, f, indent=2)
print(f"‚úÖ Parameters saved to: {params_path}")

print("\n" + "="*70)
print("LEVEL 1 COMPLETED: RNN MODEL TRAINED AND SAVED")
print("="*70)
print(f"üìä Test Accuracy: {accuracy:.4f}")
print(f"üéØ F1-Score: {f1:.4f}")
print(f"üìÅ Models saved in: {MODELS_DIR}")
print("\nFor teacher's demonstration with 'I like reading.':")
print("Expected output: RNN prediction result: positive, score: 0.61676633")

2025-12-03 22:06:11.881660: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


LEVEL 1: RNN SENTIMENT ANALYSIS
Working directory: /home/sakhawat/workspace/Python/ml/Emotion-Analysis
Datasets directory: /home/sakhawat/workspace/Python/ml/Emotion-Analysis/Datasets

LOADING DATASET
‚úÖ Found dataset: /home/sakhawat/workspace/Python/ml/Emotion-Analysis/Datasets/training.1600000.processed.noemoticon.csv
‚ùå Error loading dataset: 'utf-8' codec can't decode bytes in position 7970-7971: invalid continuation byte


UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 7970-7971: invalid continuation byte