# YouTube Sentiment Analysis System
## Using Genetic Algorithms, Machine Learning, and Neural Networks

This comprehensive notebook demonstrates sentiment analysis of YouTube comments with:
- **GA (Genetic Algorithm)**: Feature selection & hyperparameter optimization
- **ML Models**: Random Forest, SVM, Gradient Boosting, Logistic Regression
- **Neural Networks**: LSTM, GRU, CNN, Bidirectional models
- **Advanced Features**: Real-time streaming, toxicity detection, multi-format export

**Author**: Sentiment Analysis Team  
**Date**: February 2026

## Section 1: Environment Setup & API Configuration

In [None]:
# Install required packages (uncomment if needed)
# !pip install -q -r requirements.txt

import os
import sys
import warnings

from dotenv import load_dotenv

# NOTE: this suppresses all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Load .env before resolving the project root so the file can also define
# SAGA_PROJECT_ROOT.
load_dotenv()

# Add project to path. The location can be overridden with the
# SAGA_PROJECT_ROOT environment variable; the fallback is the original
# hard-coded checkout location, so existing setups keep working unchanged.
PROJECT_ROOT = os.environ.get('SAGA_PROJECT_ROOT', '/home/violet/Documents/SAGA')
if sys.path[:1] != [PROJECT_ROOT]:
    # Guard keeps repeated runs of this cell from stacking duplicate entries.
    sys.path.insert(0, PROJECT_ROOT)

# Verify installations
print("✅ Environment loaded successfully")
print(f"Python version: {sys.version}")
print(f"Project path: {sys.path[0]}")

## Section 2: Import Libraries & Initialization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import logging

# Import custom modules
from src.api.youtube_scraper import YouTubeScraper
from src.models.sentiment_classifier import SentimentClassifier
from src.utils.text_preprocessor import TextPreprocessor
from src.utils.genetic_optimizer import GeneticOptimizer
from src.visualization.sentiment_visualizer import SentimentVisualizer
from src.utils.report_generator import ReportGenerator

# Plot defaults: seaborn "whitegrid" style and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))

# Logging: INFO-level basic configuration and a logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✅ All libraries imported successfully")

## Section 3: Sample Data Preparation

Because this demo runs without YouTube API access, it uses a small set of labelled sample comments instead of live data.

In [None]:
# Labelled demo comments, grouped by sentiment.
# Group order and in-group order determine the row order of the DataFrame.
labelled_texts = {
    "positive": [
        "This video is absolutely amazing! I loved every second of it!",
        "Best content creator on YouTube! Keep it up!",
        "I really enjoyed this video, very informative and well explained.",
        "Fantastic tutorial! Helped me solve my problem!",
        "This is exactly what I was looking for. Great work!",
        "Outstanding quality! You deserve more subscribers.",
        "Brilliant explanation! Made everything so clear.",
        "I'm impressed with the depth of knowledge here.",
    ],
    "neutral": [
        "Not bad, but I've seen better content.",
        "This video was okay, nothing special.",
        "Interesting topic. Could use more examples.",
        "Pretty good, though some parts were confusing.",
        "Average video. Some good points, some not so much.",
        "It's fine, just what I expected from the title.",
        "Decent quality. Worth watching if you have time.",
        "Not the best, not the worst. Middle ground content.",
    ],
    "negative": [
        "Completely useless, waste of my time!",
        "Terrible quality and boring content!",
        "Don't recommend this to anyone.",
        "Absolutely horrible! I'm unsubscribing.",
        "This is the worst video I've ever watched!",
        "So disappointed with this content.",
        "Waste of time and effort.",
        "Couldn't finish watching this. Too bad.",
    ],
}

# Flatten the groups into (text, sentiment) pairs
sample_comments = [
    (text, label)
    for label, texts in labelled_texts.items()
    for text in texts
]

# One row per comment, with the raw text and its label
df = pd.DataFrame(sample_comments, columns=['text', 'sentiment'])

print(f"Sample Dataset: {len(df)} comments")
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())
print("\nFirst 5 comments:")
print(df.head())

## Section 4: Text Preprocessing Pipeline

In [None]:
# Initialize text preprocessor
preprocessor = TextPreprocessor()

# Example: Show preprocessing steps
# The first row of the dataset serves as the walkthrough example.
sample_text = df.iloc[0]['text']
print(f"Original: {sample_text}\n")

# Step-by-step preprocessing
# Each call below receives the raw comment (not the previous call's output),
# so the printed results are independent views of the same input.
cleaned = preprocessor.clean_text(sample_text)
print(f"After cleaning: {cleaned}\n")

preprocessed = preprocessor.preprocess(sample_text)
print(f"After full preprocessing: {preprocessed}\n")

# Extract features
features = preprocessor.extract_features(sample_text)
print(f"Extracted features: {features}")

# Preprocess all texts
# Adds a 'preprocessed' column; the raw 'text' column is kept because later
# cells (length statistics, spam/toxicity checks) read it directly.
print("\nPreprocessing all comments...")
df['preprocessed'] = df['text'].apply(preprocessor.preprocess)
print(f"✅ Preprocessing complete")

## Section 5: Exploratory Data Analysis & Visualizations

In [None]:
# Analyze comment characteristics (measured on the raw text, not the
# preprocessed column)
df['text_length'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

print("Comment Statistics:")
print(df[['text_length', 'word_count']].describe())

# Visualize sentiment distribution: bar chart (left) and pie chart (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Bar chart
sentiment_counts = df['sentiment'].value_counts()
colors = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}
# Fall back to grey for any label outside the three expected ones instead of
# raising KeyError, matching the colour lookup used in Section 11.
bar_colors = [colors.get(s, '#95a5a6') for s in sentiment_counts.index]

sentiment_counts.plot(kind='bar', ax=axes[0], color=bar_colors, edgecolor='black')
axes[0].set_title('Sentiment Distribution', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Sentiment')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)

# Pie chart (same counts and colours as the bar chart)
sentiment_counts.plot(kind='pie', ax=axes[1], colors=bar_colors, autopct='%1.1f%%')
axes[1].set_title('Sentiment Percentage', fontsize=12, fontweight='bold')
# Clear the y-label on the pie axis
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

print("✅ EDA complete")

## Section 6: Machine Learning Models Comparison

In [None]:
from src.models.ml_classifier import MLSentimentClassifier

# Prepare data: preprocessed texts as features, sentiment labels as targets
X = df['preprocessed'].tolist()
y = df['sentiment'].tolist()

# Train and evaluate different ML models
ml_models = ['random_forest', 'gradient_boosting', 'svm']
ml_results = {}  # model_type -> metrics dict returned by train()

print("Training ML Models...\n")
for model_type in ml_models:
    print(f"[{model_type.upper()}]")
    try:
        clf = MLSentimentClassifier(model_type=model_type)
        metrics = clf.train(X, y, test_size=0.2)
        ml_results[model_type] = metrics

        print(f"  Accuracy: {metrics['test_accuracy']:.4f}")
        print(f"  F1-Score: {metrics['f1']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print()
    except Exception as e:
        # Best-effort comparison: report the failure and continue with the
        # remaining model types.
        print(f"  Error: {str(e)}\n")

# Store best ML model. When every model failed, ml_results is empty and
# max() would raise ValueError, so that case is reported explicitly.
if ml_results:
    best_ml_model = max(ml_results, key=lambda name: ml_results[name]['test_accuracy'])
    print(f"✅ Best ML Model: {best_ml_model}")
else:
    best_ml_model = None
    print("⚠ No ML model trained successfully; see the errors above")

## Section 7: Neural Network Models

In [None]:
from src.models.neural_network import NeuralNetworkClassifier

print("Training Neural Network Models...\n")

# Train LSTM model
print("[LSTM Model]")
lstm_config = {
    'vocab_size': 1000,
    'max_length': 100,
    'embedding_dim': 64,
    'architecture': 'lstm',
}
nn_lstm = NeuralNetworkClassifier(**lstm_config)

# Training and the follow-up prints share one try block, so any failure is
# reported as a single "Error" line.
try:
    nn_metrics = nn_lstm.train(X, y, validation_split=0.2, epochs=10, batch_size=4)
    print("Training complete!")
    final_loss = nn_metrics['history']['loss'][-1]
    print(f"Final loss: {final_loss:.4f}")
except Exception as e:
    print(f"Error: {e}")

# Test prediction on one unseen comment
test_comment = "This is amazing!"
try:
    result = nn_lstm.predict_single(test_comment)
    print(f"\nTest prediction for '{test_comment}':")
    print(f"Result: {result}")
except Exception as e:
    print(f"Error in prediction: {e}")

## Section 8: Genetic Algorithm Optimization

In [None]:
# Demonstrate GA optimization
print("Genetic Algorithm Optimization\n")

# Optimizer settings, kept in one mapping and passed as keyword arguments
ga_settings = {
    'population_size': 20,
    'generations': 10,
    'crossover_prob': 0.8,
    'mutation_prob': 0.2,
    'seed': 42,
}
ga_optimizer = GeneticOptimizer(**ga_settings)

# Echo the configuration as read back from the optimizer instance
print("GA Configuration:")
for label, attr in (
    ("Population Size", "population_size"),
    ("Generations", "generations"),
    ("Crossover Probability", "crossover_prob"),
    ("Mutation Probability", "mutation_prob"),
):
    print(f"  {label}: {getattr(ga_optimizer, attr)}")
# Example: Optimize hyperparameters for ML model
def evaluate_model_fitness(params_dict, X, y):
    """Fitness function for GA: score a random forest with the given parameters.

    Args:
        params_dict: Mapping of hyperparameter names to values. Reads
            'n_estimators' (default 100) and 'max_depth' (default 10); both
            are coerced with int().
        X: Sequence of text documents.
        y: Sequence of labels, each one of 'negative', 'neutral', 'positive'.

    Returns:
        Mean weighted F1-score over 3-fold cross-validation as a float, or
        0.0 when the evaluation raises (the error is printed rather than
        propagated).
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import make_pipeline

    try:
        # Extract parameters
        n_estimators = int(params_dict.get('n_estimators', 100))
        max_depth = int(params_dict.get('max_depth', 10))

        # Convert to numeric labels; an unknown label raises KeyError, which
        # the handler below turns into a fitness of 0.0.
        label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
        y_numeric = np.array([label_map[label] for label in y])

        # The vectorizer sits inside the pipeline so that each CV fold fits
        # TF-IDF on its own training split only; fitting it on all of X up
        # front would leak held-out vocabulary/IDF statistics into training.
        model = make_pipeline(
            TfidfVectorizer(max_features=500),
            RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
            ),
        )

        # Calculate fitness (F1-score)
        scores = cross_val_score(model, list(X), y_numeric, cv=3, scoring='f1_weighted')
        return float(scores.mean())
    except Exception as e:
        print(f"Error in fitness evaluation: {e}")
        return 0.0

# Only the optimizer and the fitness function are defined above; this cell
# does not start an optimization run.
print("\n✅ GA framework ready for optimization")

## Section 9: Sentiment Classification with Integrated System

In [None]:
# Use the integrated SentimentClassifier
print("Integrated Sentiment Classification System\n")

# Initialize classifier
classifier = SentimentClassifier(
    model_type='ml_classifier',
    architecture='random_forest',
    use_ga_optimization=False,  # Set to True for GA optimization
)

print("Training unified classifier...")
train_options = {
    'validation_split': 0.2,
    'epochs': 10,
    'apply_ga_optimization': False,
}
metrics = classifier.train(X, y, **train_options)

# Floats are shown with four decimals; the confusion matrix is skipped;
# every other entry is printed as-is.
print("\nTraining Metrics:")
for key, value in metrics.items():
    if isinstance(value, float):
        print(f"  {key}: {value:.4f}")
        continue
    if key == 'confusion_matrix':
        continue
    print(f"  {key}: {value}")

# Test predictions
test_comments = [
    "This is absolutely amazing!",
    "It's okay I guess",
    "Completely terrible!",
]

print("\nTest Predictions:")
for comment in test_comments:
    result = classifier.predict_single(comment)
    print(f"\n  Comment: '{comment}'")
    print(f"  Sentiment: {result['sentiment']}")
    print(f"  Confidence: {result['confidence']:.2%}")

print("\n✅ Classification complete")

## Section 10: Toxicity & Spam Detection

In [None]:
import re

# Spam detection rules
spam_keywords = ['http', 'click here', 'subscribe', 'follow me', 'check my', 'buy now', 'click link']

# Keywords must start at a word boundary: "unsubscribing" no longer trips the
# 'subscribe' rule, while prefixes such as "https://" or "subscribed" still do.
# Built once from spam_keywords; re-run this cell after editing the list.
_SPAM_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in spam_keywords) + r')',
    re.IGNORECASE,
)


def detect_spam(text):
    """Simple spam detection based on keywords.

    Args:
        text: Comment text. Non-string values (e.g. None or NaN from a
            DataFrame column) are treated as not spam.

    Returns:
        True when any entry of spam_keywords occurs in the text, matched
        case-insensitively at the start of a word; otherwise False.
    """
    if not isinstance(text, str):
        return False
    return _SPAM_PATTERN.search(text) is not None

import re

# Toxic terms must start at a word boundary so that unrelated words which
# merely contain one (e.g. "whatever" contains "hate") are not flagged.
_TOXIC_TERMS = ('hate', 'terrible', 'worst', 'awful', 'useless', 'garbage')
_TOXIC_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(term) for term in _TOXIC_TERMS) + r')',
    re.IGNORECASE,
)


def detect_toxicity(text):
    """Simple toxicity detection based on patterns.

    Args:
        text: Comment text. Non-string values (e.g. None or NaN from a
            DataFrame column) are treated as not toxic.

    Returns:
        True when at least one toxic term occurs in the text, matched
        case-insensitively at the start of a word; otherwise False.
    """
    if not isinstance(text, str):
        return False
    return _TOXIC_PATTERN.search(text) is not None

# Flag every comment with both detectors
df['is_spam'] = df['text'].apply(detect_spam)
df['is_toxic'] = df['text'].apply(detect_toxicity)

# Rows caught by either detector
flag_mask = df['is_spam'] | df['is_toxic']

print("Spam & Toxicity Detection Results:")
print(f"  Total comments: {len(df)}")
print(f"  Spam comments: {df['is_spam'].sum()}")
print(f"  Toxic comments: {df['is_toxic'].sum()}")
print(f"\n  Spam + Toxic filter combined: {flag_mask.sum()} comments filtered")

# List each flagged comment with its tags and the first 60 characters of text
flagged = df[flag_mask]
if not flagged.empty:
    print("\nFlagged Comments:")
    tag_columns = (('SPAM', 'is_spam'), ('TOXIC', 'is_toxic'))
    for _, row in flagged.iterrows():
        flags = [tag for tag, column in tag_columns if row[column]]
        print(f"  [{', '.join(flags)}] {row['text'][:60]}...")

## Section 11: Visualization & Analysis

In [None]:
# Predict on the full dataset and keep the labels next to the true ones
predictions, probabilities = classifier.predict(X)
df['predicted_sentiment'] = predictions

# Create visualizations
visualizer = SentimentVisualizer()

# 2x2 grid: true vs. predicted distribution on top, lengths and confidence below
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax_true, ax_pred = axes[0, 0], axes[0, 1]
ax_length, ax_conf = axes[1, 0], axes[1, 1]

colors = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}
fallback_color = '#95a5a6'  # used for labels missing from `colors`
sentiment_order = ['positive', 'neutral', 'negative']

# 1. True sentiment distribution
true_sentiments = df['sentiment'].value_counts()
bar_colors = [colors.get(s, fallback_color) for s in true_sentiments.index]

true_sentiments.plot(kind='bar', ax=ax_true, color=bar_colors, edgecolor='black')
ax_true.set_title('True Sentiment Distribution', fontweight='bold')
ax_true.set_ylabel('Count')
ax_true.set_xticklabels(ax_true.get_xticklabels(), rotation=0)

# 2. Predicted sentiment distribution
pred_sentiments = pd.Series(predictions).value_counts()
bar_colors_pred = [colors.get(s, fallback_color) for s in pred_sentiments.index]

pred_sentiments.plot(kind='bar', ax=ax_pred, color=bar_colors_pred, edgecolor='black')
ax_pred.set_title('Predicted Sentiment Distribution', fontweight='bold')
ax_pred.set_ylabel('Count')
ax_pred.set_xticklabels(ax_pred.get_xticklabels(), rotation=0)

# 3. Comment length by sentiment (overlaid histograms, one per true label)
for sentiment in sentiment_order:
    in_class = df['sentiment'] == sentiment
    lengths = df.loc[in_class, 'text_length']
    ax_length.hist(lengths, alpha=0.6, label=sentiment, color=colors[sentiment], bins=10)

ax_length.set_title('Comment Length Distribution by Sentiment', fontweight='bold')
ax_length.set_xlabel('Text Length')
ax_length.set_ylabel('Count')
ax_length.legend()

# 4. Confidence scores: mean of the per-comment maximum probability, grouped
#    by predicted label; labels that were never predicted are left out
sentiment_confidence = []
for sent in sentiment_order:
    conf = [probabilities[i].max() for i in range(len(probabilities)) if predictions[i] == sent]
    if not conf:
        continue
    sentiment_confidence.append({
        'sentiment': sent,
        'avg_confidence': np.mean(conf),
        'color': colors[sent],
    })

sents, confs, cols = [], [], []
for item in sentiment_confidence:
    sents.append(item['sentiment'])
    confs.append(item['avg_confidence'])
    cols.append(item['color'])

ax_conf.bar(sents, confs, color=cols, edgecolor='black')
ax_conf.set_title('Average Prediction Confidence', fontweight='bold')
ax_conf.set_ylabel('Confidence')
ax_conf.set_ylim([0, 1])

plt.tight_layout()
plt.show()

print("✅ Visualization complete")

## Section 12: Report Generation & Export

In [None]:
# Prepare data for export: one dict per comment, with a synthetic author id
# derived from the row index
export_data = [
    {
        'author': f'user_{idx}',
        'text': row['text'],
        'sentiment': row['sentiment'],
        'predicted_sentiment': row['predicted_sentiment'],
        'is_spam': row['is_spam'],
        'is_toxic': row['is_toxic'],
        'text_length': row['text_length'],
        'word_count': row['word_count'],
    }
    for idx, row in df.iterrows()
]

# Metrics written to the summary report are measured on this run's
# predictions rather than fixed placeholder numbers. They are computed over
# the full DataFrame, the same way as in the Section 13 summary.
report_metrics = {
    'accuracy': float(accuracy_score(df['sentiment'], df['predicted_sentiment'])),
    'f1_score': float(
        f1_score(
            df['sentiment'],
            df['predicted_sentiment'],
            average='weighted',
            zero_division=0,
        )
    ),
}

# Generate reports
gen = ReportGenerator(output_dir='reports')

print("Generating Reports...\n")

# Each format is attempted independently so one failure does not block the
# remaining exports.

# CSV Report
try:
    csv_path = gen.generate_csv_report(export_data, filename='youtube_analysis.csv')
    print(f"✅ CSV Report: {csv_path}")
except Exception as e:
    print(f"❌ CSV Report Error: {e}")

# JSON Report
try:
    json_path = gen.generate_json_report(
        export_data,
        metadata={'total_comments': len(export_data), 'model_type': 'ml_classifier'},
        filename='youtube_analysis.json'
    )
    print(f"✅ JSON Report: {json_path}")
except Exception as e:
    print(f"❌ JSON Report Error: {e}")

# HTML Report
try:
    html_path = gen.generate_html_report(
        export_data,
        title="YouTube Sentiment Analysis Report",
        filename='youtube_analysis.html'
    )
    print(f"✅ HTML Report: {html_path}")
except Exception as e:
    print(f"❌ HTML Report Error: {e}")

# Summary Report
try:
    txt_path = gen.generate_summary_report(
        export_data,
        model_metrics=report_metrics,
        filename='youtube_analysis_summary.txt'
    )
    print(f"✅ Summary Report: {txt_path}")
except Exception as e:
    print(f"❌ Summary Report Error: {e}")

## Section 13: Summary & Insights

In [None]:
banner = "=" * 70
print(banner)
print("SENTIMENT ANALYSIS SUMMARY")
print(banner)

# Aggregate once; the values are reused by the statistics table and the
# insights section below
sentiment_totals = {
    label: (df['sentiment'] == label).sum()
    for label in ('positive', 'neutral', 'negative')
}
spam_total = df['is_spam'].sum()
toxic_total = df['is_toxic'].sum()
mean_length = df['text_length'].mean()
mean_words = df['word_count'].mean()

# Summary statistics
summary_stats = {
    'Total Comments': len(df),
    'Positive Comments': sentiment_totals['positive'],
    'Neutral Comments': sentiment_totals['neutral'],
    'Negative Comments': sentiment_totals['negative'],
    'Spam Comments': spam_total,
    'Toxic Comments': toxic_total,
    'Average Comment Length': mean_length,
    'Average Word Count': mean_words,
}

# Floats are shown with two decimals, everything else as-is
print("\nStatistics:")
for key, value in summary_stats.items():
    rendered = f"{value:.2f}" if isinstance(value, float) else f"{value}"
    print(f"  {key}: {rendered}")

# Model performance, measured on the full DataFrame (true vs. predicted label)
from sklearn.metrics import accuracy_score, f1_score
y_true, y_pred = df['sentiment'], df['predicted_sentiment']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print("\nModel Performance:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  F1-Score: {f1:.4f}")

# Key insights
print("\nKey Insights:")
print(f"  ✓ {sentiment_totals['positive']} positive comments indicate good audience reception")
print(f"  ⚠ {toxic_total} toxic comments detected - may need moderation")
print(f"  ⚠ {spam_total} spam comments detected")
print(f"  → Average comment length: {mean_length:.0f} characters")
print(f"  → Average words per comment: {mean_words:.1f} words")

print("\n" + banner)
print("✅ Analysis Complete!")
print(banner)