# AISentinel - Sentiment Analysis Model Evaluation

This notebook demonstrates the complete pipeline for training and evaluating a custom TensorFlow sentiment analysis model for AI tool reviews.

## Overview

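1. **Data Preparation**: Load, preprocess, and explore the training data
2. **Model Architecture**: Build custom LSTM/Transformer models (this notebook trains the LSTM variant)
3. **Model Training**: Train with proper validation and callbacks
4. **Training Analysis**: Visualize accuracy, loss, precision, and recall across epochs
5. **Model Evaluation**: Comprehensive metrics and visualizations on the test set
6. **Real-World Testing**: AI tool review analysis with the saved model
7. **Model Summary**: Final statistics and next steps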

In [None]:
# Setup and imports
import sys
from pathlib import Path

# Add project root to path.
# Compare the working directory's *name* instead of searching the whole path
# string: a substring test also matches paths such as
# "/home/me/notebooks_old/project" and would wrongly hop to the parent.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
# Guarded so that re-running this cell does not pile up duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import tensorflow as tf
from tensorflow import keras

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Seed the NumPy and TensorFlow global generators so that a
# "Restart Kernel -> Run All" gives comparable results between runs.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Report the runtime environment; the GPU line prints the (possibly empty)
# list of physical GPU devices TensorFlow can see.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

## 1. Data Preparation

Load and explore the training dataset.

In [None]:
# Load data
data_dir = PROJECT_ROOT / "data" / "processed"

# All three splits are read below, so all three must exist. Checking only
# train.csv would crash on a partially written directory; regenerating the
# whole set keeps the splits consistent with each other.
split_paths = {name: data_dir / f"{name}.csv" for name in ("train", "val", "test")}
missing_splits = [name for name, path in split_paths.items() if not path.exists()]

if missing_splits:
    print("Preparing training data...")
    # Imported lazily: only needed when the processed CSVs are absent.
    from src.data_collection.prepare_training_data import prepare_training_data
    train_df, val_df, test_df = prepare_training_data(output_dir=data_dir)
else:
    print("Loading existing data...")
    train_df = pd.read_csv(split_paths["train"])
    val_df = pd.read_csv(split_paths["val"])
    test_df = pd.read_csv(split_paths["test"])

print("\nDataset sizes:")
print(f"  Train: {len(train_df):,}")
print(f"  Val:   {len(val_df):,}")
print(f"  Test:  {len(test_df):,}")

In [None]:
# Explore data distribution: three side-by-side panels for the training split.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
sentiment_ax, source_ax, length_ax = axes

# Sentiment distribution
train_df['sentiment'].value_counts().plot(kind='bar', ax=sentiment_ax, color='skyblue')
sentiment_ax.set_title('Sentiment Distribution (Train)', fontsize=14, fontweight='bold')
sentiment_ax.set_xlabel('Sentiment')
sentiment_ax.set_ylabel('Count')
sentiment_ax.tick_params(axis='x', rotation=0)

# Source distribution
train_df['source'].value_counts().plot(kind='bar', ax=source_ax, color='lightcoral')
source_ax.set_title('Data Source Distribution', fontsize=14, fontweight='bold')
source_ax.set_xlabel('Source')
source_ax.set_ylabel('Count')
source_ax.tick_params(axis='x', rotation=45)

# Text length distribution
# Kept as a standalone Series rather than written back as a new train_df
# column, so this exploratory cell does not alter the frame that the
# tokenizer/training cells consume later.
text_lengths = train_df['text'].str.len().rename('text_length')
text_lengths.hist(bins=50, ax=length_ax, color='lightgreen', edgecolor='black')
length_ax.set_title('Text Length Distribution', fontsize=14, fontweight='bold')
length_ax.set_xlabel('Character Count')
length_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("\nText length statistics:")
print(text_lengths.describe())

In [None]:
# Sample reviews: print up to three truncated examples per sentiment class.
print("Sample reviews from each sentiment class:\n")
for sentiment in ['positive', 'neutral', 'negative']:
    print(f"\n{'='*60}")
    print(f"{sentiment.upper()} Examples")
    print('='*60)
    class_rows = train_df[train_df['sentiment'] == sentiment]
    # Cap the sample size at the class size (DataFrame.sample raises when asked
    # for more rows than exist) and fix the seed so re-runs show the same rows.
    samples = class_rows.sample(n=min(3, len(class_rows)), random_state=42)
    for i, text in enumerate(samples['text'], 1):
        # Only the first 100 characters are shown; the ellipsis is always appended.
        print(f"\n{i}. {text[:100]}...")

## 2. Model Architecture

Build custom TensorFlow models for sentiment analysis.

In [None]:
from src.ml.model import build_sentiment_model, build_transformer_based_model

# Build LSTM model
# Hyperparameters are collected in one mapping so they can be read (and
# tweaked) at a glance before being handed to the project's model builder.
lstm_hyperparams = {
    'vocab_size': 10000,
    'embedding_dim': 128,
    'max_length': 128,
    'num_classes': 3,
    'lstm_units': 64,
    'dropout_rate': 0.5,
    'use_attention': True,
}
lstm_model = build_sentiment_model(**lstm_hyperparams)

# Heading followed by the layer-by-layer summary of the model.
print("\nLSTM Model with Attention Mechanism")
print("=" * 60)
lstm_model.summary()

In [None]:
# Visualize model architecture
from tensorflow.keras.utils import plot_model

# Render the architecture diagram to a PNG in the current working directory.
arch_png = Path('model_architecture.png')

# Drop any diagram left over from a previous run so a failed render below can
# never result in a stale picture being displayed.
if arch_png.exists():
    arch_png.unlink()

try:
    plot_model(
        lstm_model,
        to_file=str(arch_png),
        show_shapes=True,
        show_layer_names=True,
        rankdir='TB',
        dpi=150
    )
except ImportError as exc:
    # NOTE(review): plot_model relies on the optional pydot package and the
    # Graphviz binaries; per the Keras docs it reports their absence via
    # ImportError or a printed message — confirm for the installed version.
    print(f"Skipping architecture diagram: {exc}")

from IPython.display import Image
# `filename=` makes the argument unambiguous (a positional string can also be
# interpreted as raw data/URL). Nothing is displayed if no file was produced.
Image(filename=str(arch_png)) if arch_png.exists() else None

## 3. Model Training

Train the model with proper data preprocessing and callbacks.

In [None]:
from src.ml.train_model import SentimentModelTrainer
from datetime import datetime

# Initialize trainer
# Each notebook run gets its own timestamped directory under <root>/models.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_dir = PROJECT_ROOT / "models" / f"notebook_run_{run_stamp}"

trainer_options = dict(
    data_dir=data_dir,
    output_dir=output_dir,
    max_vocab_size=10000,
    max_length=128,
    model_type="lstm",
)
trainer = SentimentModelTrainer(**trainer_options)

print(f"Output directory: {output_dir}")

In [None]:
# Prepare data
print("Preparing tokenizer and sequences...")
trainer.load_data()
# The tokenizer is prepared from the training split only.
tokenizer = trainer.prepare_tokenizer(train_df)

# Convert each split, in train -> val -> test order, into (features, labels).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    trainer.prepare_sequences(split_df) for split_df in (train_df, val_df, test_df)
)

print("\nData shapes:")
for shape_label, features in (
    ("X_train:", X_train),
    ("X_val:  ", X_val),
    ("X_test: ", X_test),
):
    print(f"  {shape_label} {features.shape}")

In [None]:
# Build and train model
model = trainer.build_model()

# Epoch count is deliberately small: reduced for the notebook demo.
fit_options = {'epochs': 10, 'batch_size': 32}

print("\nStarting training...")
history = trainer.train(X_train, y_train, X_val, y_val, **fit_options)

## 4. Training Analysis

Visualize training progress and metrics.

In [None]:
# Plot training history: one panel per tracked metric, train vs. validation.
# (history key, axis label) pairs, in left-to-right / top-to-bottom panel order.
metric_panels = [
    ('accuracy', 'Accuracy'),
    ('loss', 'Loss'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

for ax, (metric_key, metric_label) in zip(axes.flat, metric_panels):
    if metric_key not in history.history:
        # Metric was not recorded for this run: hide the panel instead of
        # leaving an empty, untitled set of axes in the figure.
        ax.set_visible(False)
        continue

    ax.plot(history.history[metric_key], label='Train', linewidth=2)
    val_key = f'val_{metric_key}'
    # The validation curve is drawn only when it was actually recorded.
    if val_key in history.history:
        ax.plot(history.history[val_key], label='Validation', linewidth=2)
    ax.set_title(f'Model {metric_label}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# Nothing visible in this notebook creates output_dir before this point, so
# make sure it exists before writing the figure into it.
output_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(output_dir / 'training_history_detailed.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Model Evaluation

Comprehensive evaluation on the test set.

In [None]:
# Evaluate on test set
# `metrics` is kept at notebook scope: it is read here via its
# 'test_accuracy' entry and again by the final summary cell.
# NOTE(review): which other entries trainer.evaluate returns is not visible
# from this notebook — check SentimentModelTrainer.evaluate.
metrics = trainer.evaluate(X_test, y_test)

# Reported as a fraction with four decimal places.
print(f"\nTest Accuracy: {metrics['test_accuracy']:.4f}")

In [None]:
# Confusion Matrix
# NOTE(review): assumes class ids 0/1/2 mean Negative/Neutral/Positive —
# confirm against the label encoding used by the trainer.
class_names = ['Negative', 'Neutral', 'Positive']
class_ids = list(range(len(class_names)))

# Predicted class = index of the highest score in each prediction row.
y_pred = np.argmax(model.predict(X_test), axis=1)
# Passing `labels` pins the matrix to 3x3 even when a class is absent from
# both y_test and y_pred; otherwise the matrix shrinks and no longer lines
# up with the tick labels or the per-class loop below.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
)
plt.title('Confusion Matrix - Test Set', fontsize=16, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
# Make sure the run directory exists before writing the figure into it.
output_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(output_dir / 'confusion_matrix_detailed.png', dpi=150, bbox_inches='tight')
plt.show()

# Calculate per-class accuracy (diagonal count / number of true samples in the row).
print("\nPer-class accuracy:")
for i, label in enumerate(class_names):
    support = cm[i, :].sum()
    if support == 0:
        # No true samples of this class: the ratio would be 0/0.
        print(f"  {label}: n/a (no test samples)")
    else:
        print(f"  {label}: {cm[i, i] / support:.2%}")

In [None]:
# Classification Report
# (classification_report is already imported in the setup cell.)
# NOTE(review): assumes class ids 0/1/2 mean Negative/Neutral/Positive —
# confirm against the label encoding used by the trainer.
report_class_names = ['Negative', 'Neutral', 'Positive']

report = classification_report(
    y_test,
    y_pred,
    # Explicit labels keep the report aligned with target_names even when a
    # class is missing from y_test/y_pred (which otherwise raises an error).
    labels=list(range(len(report_class_names))),
    target_names=report_class_names,
    output_dict=True,
    # Report 0 instead of emitting warnings for classes without predictions.
    zero_division=0,
)

# Convert to DataFrame for nice display (one row per class / average).
report_df = pd.DataFrame(report).transpose()
print("\nClassification Report:")
print(report_df.round(3))

In [None]:
# Prediction confidence distribution
# Confidence of a prediction = the largest score in its output row.
y_pred_probs = model.predict(X_test)
confidence_scores = np.max(y_pred_probs, axis=1)
mean_confidence = confidence_scores.mean()

fig, (overall_ax, split_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Overall confidence
overall_ax.hist(confidence_scores, bins=50, edgecolor='black', alpha=0.7)
overall_ax.set_xlabel('Confidence Score', fontsize=12)
overall_ax.set_ylabel('Frequency', fontsize=12)
overall_ax.set_title('Prediction Confidence Distribution', fontsize=14, fontweight='bold')
overall_ax.axvline(mean_confidence, color='red', linestyle='--', label=f'Mean: {mean_confidence:.3f}')
overall_ax.legend()
overall_ax.grid(True, alpha=0.3)

# Confidence by correctness
correct_mask = y_pred == y_test
confidence_correct = confidence_scores[correct_mask]
confidence_incorrect = confidence_scores[~correct_mask]
split_ax.hist(confidence_correct, bins=30, alpha=0.6, label='Correct', edgecolor='black')
split_ax.hist(confidence_incorrect, bins=30, alpha=0.6, label='Incorrect', edgecolor='black')
split_ax.set_xlabel('Confidence Score', fontsize=12)
split_ax.set_ylabel('Frequency', fontsize=12)
split_ax.set_title('Confidence: Correct vs Incorrect', fontsize=14, fontweight='bold')
split_ax.legend()
split_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nMean confidence (correct predictions): {confidence_correct.mean():.3f}")
print(f"Mean confidence (incorrect predictions): {confidence_incorrect.mean():.3f}")

## 6. Real-World Testing

Test the model on real AI tool reviews.

In [None]:
# Save the model first
# The analyzer cell that follows loads "sentiment_model.keras" and
# "tokenizer.pkl" from output_dir, so the artifacts must be written before it runs.
# NOTE(review): presumably save_model() writes exactly those two files into
# output_dir — confirm against SentimentModelTrainer.save_model.
trainer.save_model()
print(f"Model saved to: {output_dir}")

In [None]:
# Test with real AI tool reviews
from src.sentiment_analysis.analyzer import AdvancedSentimentAnalyzer

# Point the analyzer at the model and tokenizer files inside this run's
# output directory.
analyzer = AdvancedSentimentAnalyzer(
    use_custom_model=True,
    custom_model_path=output_dir / "sentiment_model.keras",
    custom_tokenizer_path=output_dir / "tokenizer.pkl",
)

# Sample reviews
test_reviews = [
    "ChatGPT is absolutely amazing! It helps me code so much faster.",
    "Claude is terrible, keeps giving me wrong answers.",
    "GitHub Copilot is okay, works for basic stuff.",
    "Midjourney creates stunning images! Best AI art tool ever.",
    "Concerned about privacy with DeepSeek. Where is my data stored?",
]

print("\nTesting on real AI tool reviews:")
print("="*70)

# One record per review; printed as it is produced and tabulated afterwards.
results = []
for review in test_reviews:
    result = analyzer.analyze(review)
    results.append({
        'text': review,
        'sentiment': result.label,
        'confidence': result.confidence,
        'score': result.score,
    })

    print(f"\nReview: {review}")
    # "\u2192" is the right-arrow character. The previous literal was the
    # mojibake of that arrow (UTF-8 bytes decoded as cp1252); the escape keeps
    # the source immune to a repeat of that encoding mishap.
    print(f"\u2192 Sentiment: {result.label.upper()} (confidence: {result.confidence:.3f})")

# Display as DataFrame
results_df = pd.DataFrame(results)
print("\n" + "="*70)
print("\nResults Summary:")
print(results_df)

In [None]:
# Visualize results: one horizontal bar per review, length = confidence,
# color = predicted sentiment.
fig, ax = plt.subplots(figsize=(12, 6))

colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
# Labels are matched case-insensitively, and any label outside the three
# expected ones gets a fallback color instead of raising KeyError.
bar_colors = [colors.get(str(s).lower(), 'steelblue') for s in results_df['sentiment']]

bar_positions = range(len(results_df))
ax.barh(bar_positions, results_df['confidence'], color=bar_colors, alpha=0.7)
ax.set_yticks(bar_positions)
ax.set_yticklabels([f"Review {i+1}" for i in bar_positions])
ax.set_xlabel('Confidence Score', fontsize=12)
ax.set_title('Sentiment Analysis Results - AI Tool Reviews', fontsize=14, fontweight='bold')
ax.set_xlim(0, 1)
ax.grid(True, alpha=0.3, axis='x')

# Add sentiment labels just past the end of each bar.
for i, (confidence, sentiment) in enumerate(
    zip(results_df['confidence'], results_df['sentiment'])
):
    ax.text(confidence + 0.02, i, str(sentiment).capitalize(),
            va='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 7. Model Summary

Final model statistics and next steps.

In [None]:
# Final plain-text recap of the run: data sizes, headline metrics, artifact
# location, and follow-up actions.
rule = "=" * 70
print(rule, "MODEL TRAINING SUMMARY", rule, sep="\n")

print("\nModel Architecture: LSTM with Attention")
for split_name, split_df in (
    ("Training", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_name} Samples: {len(split_df):,}")

print("\nFinal Metrics:")
print(f"  Test Accuracy: {metrics['test_accuracy']:.2%}")
# NOTE(review): this counts every entry of tokenizer.word_index, which may
# exceed the max_vocab_size passed to the trainer — confirm which is intended.
print(f"  Vocabulary Size: {len(tokenizer.word_index):,}")
print(f"  Max Sequence Length: {trainer.max_length}")

print("\nModel saved to:")
print(f"  {output_dir}")

print(
    "\nNext Steps:",
    "  1. Deploy model to production",
    "  2. Integrate with data collection pipeline",
    "  3. Update dashboard to use custom model",
    "  4. Monitor performance on real data",
    sep="\n",
)

print("\n" + rule)