# Improved YouTube Comment Bot Detection Analysis

This notebook demonstrates the improved, modular architecture for bot detection analysis.

In [2]:
# Import the improved modules
import sys
from pathlib import Path

# Add the project root to Python path if needed
# sys.path.append(str(Path.cwd()))

from bot_detector import BotDetector
from config import setup_logging, analysis_config, model_config
from visualization import BotDetectionVisualizer
from ml_processing import CommentAnalyzer
import polars as pl

# Setup logging
# `setup_logging` comes from the project's `config` module; its return value is
# kept in `logger` so later cells can reuse it.
logger = setup_logging()
# Plain confirmation that every import above resolved.
print("Modules imported successfully!")

Modules imported successfully!


## 1. Configuration and Setup

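Settings for the analysis and the model are defined in the `config` module and imported as `analysis_config` and `model_config`. The cell below prints the values currently in effect.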

In [3]:
# Echo the active settings so the run is self-describing.
# Each section is emitted by a single print call, one item per line.
print(
    "Analysis Configuration:",
    f"  Default date range: {analysis_config.default_start_date} to {analysis_config.default_end_date}",
    f"  Suspicious threshold: {analysis_config.suspicious_threshold}",
    f"  User ID sampling: mod {analysis_config.user_id_mod}",
    sep="\n",
)

print(
    "\nModel Configuration:",
    f"  Model: {model_config.model_name}",
    f"  Batch size: {model_config.batch_size}",
    f"  PCA components: {model_config.pca_components}",
    f"  Number of clusters: {model_config.n_clusters}",
    sep="\n",
)

Analysis Configuration:
  Default date range: 2023-10-31 to 2024-10-31
  Suspicious threshold: 10
  User ID sampling: mod 100

Model Configuration:
  Model: DeepPavlov/rubert-base-cased-sentence
  Batch size: 32
  PCA components: 50
  Number of clusters: 10


## 2. Initialize Bot Detector

`BotDetector` is the main entry point for the analysis. The cell below creates it and checks that the database is reachable before any data is loaded.

In [4]:
# Create the detector instance that the remaining cells call into.
detector = BotDetector()

# Fail fast on a bad connection: the data-loading cells below query the
# database, so carrying on after a failed check would only surface later as a
# less obvious error.
if not detector.validate_database_connection():
    print("❌ Database connection failed")
    raise RuntimeError(
        "Database connection failed; fix the connection settings and re-run this cell"
    )

print("✅ Database connection validated successfully")

2025-09-04 23:44:21,258 - database - INFO - Database connection pool created successfully
2025-09-04 23:44:21,259 - ml_processing - INFO - Loading model: DeepPavlov/rubert-base-cased-sentence
2025-09-04 23:44:23,559 - ml_processing - INFO - Model loaded successfully
2025-09-04 23:44:23,559 - config - INFO - Bot detector initialized successfully
  return pl.DataFrame(results, schema=columns)
2025-09-04 23:44:23,571 - config - INFO - Table youtube_comments: 10 columns
2025-09-04 23:44:23,573 - config - INFO - Table youtube_users: 5 columns
2025-09-04 23:44:23,588 - config - INFO - Table youtube_videos: 9 columns
2025-09-04 23:44:23,591 - config - INFO - Table youtube_channels: 7 columns


✅ Database connection validated successfully


## 3. Load and Analyze Comment Data

Load comment statistics using the optimized query.

In [5]:
# Date window handed to the loader (same values as the configured defaults shown above)
start_date, end_date = '2023-10-31', '2024-10-31'

print(f"Loading comment data from {start_date} to {end_date}...")

# Explicitly request the optimized query variant
comment_data = detector.load_comment_data(start_date=start_date, end_date=end_date, use_optimized_query=True)

# Quick shape/schema check, then preview the first rows as the cell output
print(f"Loaded {len(comment_data)} records")
print(f"Columns: {comment_data.columns}")
comment_data.head()

2025-09-04 23:44:27,431 - config - INFO - Loading comment data from 2023-10-31 to 2024-10-31
2025-09-04 23:44:27,434 - database - INFO - Executing query from template: queries/minutely_comments_optimized.sql


Loading comment data from 2023-10-31 to 2024-10-31...


## 4. Identify Suspicious Commenters

Use statistical analysis to identify potential bot accounts.

In [None]:
# Identify suspicious commenters.
# The threshold (comments per period) is read from the shared configuration
# instead of being repeated as a literal here, so this cell always agrees with
# the value printed in the configuration section.
suspicious_commenters = detector.identify_suspicious_commenters(
    comment_data,
    threshold=analysis_config.suspicious_threshold,
)

print(f"Found {len(suspicious_commenters)} suspicious commenters")

# Show top suspicious commenters: highest per-channel peak first, trimmed to
# the columns that are useful for a quick manual look.
top_suspicious = (
    suspicious_commenters
    .sort('max_CPP_this_CH', descending=True)
    .head(10)
    .select(['username', 'number_comments_all_time', 'max_CPP_this_CH', 'channel_title'])
)

print("\nTop 10 most suspicious commenters:")
top_suspicious

## 5. Create Basic Visualizations

Plot the cumulative comment distribution and, if any suspicious commenters were found, a heatmap of them, using `BotDetectionVisualizer`.

In [None]:
# Visualizer shared by this and later plotting cells
visualizer = BotDetectionVisualizer(output_dir="notebook_plots")

# Figure 1: cumulative distribution over the full comment data
fig1 = visualizer.plot_cumulative_comments_distribution(comment_data)
fig1.show()

# Figure 2: heatmap, drawn only when at least one suspicious commenter exists
if len(suspicious_commenters):
    fig2 = visualizer.plot_suspicious_commenter_heatmap(suspicious_commenters)
    fig2.show()

## 6. Text Analysis and Clustering (Optional)

Analyze comment text patterns using machine learning.

In [None]:
# Restrict text loading to the last month of the window to keep it fast
text_start_date, text_end_date = '2024-10-01', '2024-10-31'

print("Loading comment text for suspicious users...")
comments_with_text = detector.load_suspicious_comments_text(suspicious_commenters, start_date=text_start_date, end_date=text_end_date)

print(f"Loaded {len(comments_with_text)} comments for text analysis")

# Peek at one comment, but only when something was actually loaded
if len(comments_with_text) > 0:
    print(f"Sample comment: {comments_with_text.select('text').head(1).item()}")

In [None]:
# Perform ML analysis if we have comments.
# `clustered_comments` is always bound afterwards (a frame, or None when there
# was nothing to analyze) so later cells can test it safely.
if len(comments_with_text) > 0:
    print("Starting ML analysis...")

    # This will take some time - the model needs to download and process embeddings
    clustered_comments = detector.analyze_comment_patterns(comments_with_text)

    # Count distinct labels on the column itself: Series.n_unique() returns a
    # plain int. DataFrame.n_unique() also returns an int (not a frame), so
    # chaining .item() onto it raises AttributeError.
    print(f"Clustering completed. Found {clustered_comments['cluster_label'].n_unique()} clusters")

    # Show cluster distribution: number of comments per label, ordered by label
    cluster_dist = (
        clustered_comments.group_by('cluster_label')
        .agg(pl.len().alias('count'))
        .sort('cluster_label')
    )
    print("\nCluster distribution:")
    print(cluster_dist)
else:
    print("No comments available for ML analysis")
    clustered_comments = None

## 7. Advanced Visualizations

Create cluster visualizations if we have ML results.

In [None]:
# Cluster plots need the ML results; bail out with a message when they are missing
if clustered_comments is None or 'cluster_label' not in clustered_comments.columns:
    print("No cluster data available for advanced visualizations")
else:
    # 2-D t-SNE view of the clustered comments
    fig3 = visualizer.plot_cluster_tsne(clustered_comments)
    fig3.show()

    # Attach per-user statistics from the comment data to every clustered comment.
    # NOTE(review): if comment_data holds more than one row per user_id, this
    # left join repeats comments once per matching row — confirm the grain.
    enhanced_data = clustered_comments.join(
        comment_data.select(['user_id', 'number_comments_all_time', 'max_CPP_this_CH', 'mean_likes_per_comment']),
        on='user_id',
        how='left',
    )

    # Per-cluster statistics plot, skipped for an empty join result
    if len(enhanced_data) > 0:
        fig4 = visualizer.plot_cluster_statistics(enhanced_data)
        fig4.show()

## 8. Examine Cluster Samples

Look at actual comments from different clusters.

In [None]:
# Examine sample comments from each cluster (up to the first five).
if clustered_comments is not None:
    # NOTE(review): a new CommentAnalyzer is constructed here; presumably this
    # loads the embedding model again — confirm whether that cost matters.
    analyzer = CommentAnalyzer()

    # Series.n_unique() returns a plain int. DataFrame.n_unique() returns an
    # int as well, so calling .item() on its result raises AttributeError.
    n_clusters = clustered_comments['cluster_label'].n_unique()

    # Assumes cluster labels are the integers 0..n_clusters-1 — TODO confirm
    for cluster_id in range(min(5, n_clusters)):  # Show first 5 clusters
        samples = analyzer.get_cluster_samples(
            clustered_comments,
            cluster_id,
            n_samples=3
        )

        print(f"\n=== Cluster {cluster_id} Samples ===")
        for i, comment in enumerate(samples, 1):
            # Truncate long comments to 100 characters and mark the cut with an ellipsis
            print(f"{i}. {comment[:100]}{'...' if len(comment) > 100 else ''}")
else:
    print("No clustered comments available")

## 9. Generate Complete Report

Use the automated reporting system.

In [None]:
# Build the full report; clustered_comments may be None when the ML step was skipped
print("Generating comprehensive analysis report...")

report = detector.generate_report(comment_data, clustered_comments, output_dir="notebook_analysis_output")

# What was written and where
print(
    "\n=== Analysis Report Generated ===",
    f"Output directory: {report['output_directory']}",
    f"Files generated: {list(report['files_generated'].keys())}",
    f"Analysis summary: {report['analysis_summary']}",
    sep="\n",
)

# Headline numbers computed by the visualizer from the same comment data
summary = visualizer.create_summary_report(comment_data)
print(
    "\n=== Summary Statistics ===",
    f"Total users analyzed: {summary['total_users']:,}",
    f"Total comments: {summary['total_comments']:,}",
    f"Suspicious users: {summary['suspicious_users']} ({summary['suspicious_percentage']:.1f}%)",
    f"Average comments per user: {summary['avg_comments_per_user']:.1f}",
    sep="\n",
)

## 10. Alternative: Run Full Analysis Pipeline

Alternatively, a single call to `BotDetector.run_full_analysis` runs every step above. The cell below is disabled by default.

In [None]:
# Alternative: Run the complete pipeline in one go.
# Set RUN_FULL_PIPELINE to True and run this cell if you want to run everything
# automatically. The pipeline is kept as real, guarded code rather than inside a
# string literal, so it is syntax-checked and the cell produces no output while
# disabled.
RUN_FULL_PIPELINE = False

if RUN_FULL_PIPELINE:
    # Create a new detector instance
    full_detector = BotDetector()

    # Run complete analysis
    full_results = full_detector.run_full_analysis(
        start_date='2024-01-01',
        end_date='2024-12-31',
        include_text_analysis=True,  # Set to False to skip ML analysis for speed
        output_dir='full_analysis_results'
    )

    print("Full analysis completed:")
    print(full_results)

## Summary

This improved analysis system provides:

1. **Modular Architecture**: Separated concerns with dedicated modules
2. **Better Performance**: Optimized SQL queries and efficient data processing
3. **Comprehensive Validation**: Data quality checks and error handling
4. **Security**: Environment-based configuration and SQL injection prevention
5. **Scalability**: Connection pooling and batch processing
6. **Maintainability**: Type hints, logging, and clear documentation

The system can be used both interactively (as demonstrated here) and programmatically through the main API.