In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# YouTube Trending Video ML Model Analysis\n",
    "\n",
    "This notebook provides a detailed analysis of the trained machine learning models for predicting YouTube trending videos. We'll analyze model performance, feature importance, and make predictions to gain insights into what drives video popularity on YouTube."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup and Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import os\n",
    "import sys\n",
    "import json\n",
    "import pickle\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from typing import Dict, List, Any, Optional, Union, Tuple\n",
    "from pathlib import Path\n",
    "\n",
    "# Configure matplotlib and seaborn\n",
    "plt.style.use('seaborn-v0_8-whitegrid')\n",
    "sns.set_palette(\"viridis\")\n",
    "plt.rcParams.update({\n",
    "    'figure.figsize': (12, 8),\n",
    "    'font.size': 12,\n",
    "    'axes.titlesize': 14,\n",
    "    'axes.labelsize': 12\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define helper functions for loading models and data\n",
    "\n",
    "def load_model_and_metadata(model_dir: str) -> Tuple[Any, Dict]:\n",
    "    \"\"\"Load a model and its metadata from the specified directory.\n",
    "    \n",
    "    Args:\n",
    "        model_dir: Directory containing the model files\n",
    "        \n",
    "    Returns:\n",
    "        Tuple of (model, metadata)\n",
    "    \"\"\"\n",
    "    # Get model name from directory\n",
    "    model_name = os.path.basename(model_dir)\n",
    "    \n",
    "    # Load model\n",
    "    model_path = os.path.join(model_dir, f\"{model_name}.pkl\")\n",
    "    metadata_path = os.path.join(model_dir, f\"{model_name}_metadata.json\")\n",
    "    \n",
    "    if not os.path.exists(model_path):\n",
    "        raise FileNotFoundError(f\"Model file not found: {model_path}\")\n",
    "    \n",
    "    if not os.path.exists(metadata_path):\n",
    "        raise FileNotFoundError(f\"Metadata file not found: {metadata_path}\")\n",
    "    \n",
    "    # Load the model\n",
    "    with open(model_path, 'rb') as f:\n",
    "        model = pickle.load(f)\n",
    "    \n",
    "    # Load the metadata\n",
    "    with open(metadata_path, 'r') as f:\n",
    "        metadata = json.load(f)\n",
    "    \n",
    "    return model, metadata\n",
    "\n",
    "def find_all_model_dirs(base_dir: str = 'models') -> Dict[str, List[str]]:\n",
    "    \"\"\"Find all model directories in the base directory.\n",
    "    \n",
    "    Args:\n",
    "        base_dir: Base directory containing model subdirectories\n",
    "        \n",
    "    Returns:\n",
    "        Dictionary mapping model types to lists of model directories\n",
    "    \"\"\"\n",
    "    model_dirs = {}\n",
    "    \n",
    "    # Check for classification models\n",
    "    cls_dir = os.path.join(base_dir, 'classification')\n",
    "    if os.path.exists(cls_dir):\n",
    "        # Find all subdirectories in the classification directory\n",
    "        model_dirs['classification'] = [\n",
    "            os.path.join(cls_dir, d) for d in os.listdir(cls_dir)\n",
    "            if os.path.isdir(os.path.join(cls_dir, d))\n",
    "        ]\n",
    "    \n",
    "    # Check for regression models\n",
    "    reg_dir = os.path.join(base_dir, 'regression')\n",
    "    if os.path.exists(reg_dir):\n",
    "        # Find all subdirectories in the regression directory\n",
    "        model_dirs['regression'] = [\n",
    "            os.path.join(reg_dir, d) for d in os.listdir(reg_dir)\n",
    "            if os.path.isdir(os.path.join(reg_dir, d))\n",
    "        ]\n",
    "    \n",
    "    return model_dirs\n",
    "\n",
    "def load_data(data_path=None):\n",
    "    \"\"\"Load the trending video data.\"\"\"\n",
    "    try:\n",
    "        if data_path is None:\n",
    "            # Find the most recent data file\n",
    "            data_dir = os.path.join(os.getcwd(), \"data/processed\")\n",
    "            if not os.path.exists(data_dir):\n",
    "                return None\n",
    "            \n",
    "            # Find most recent trending data file\n",
    "            files = [f for f in os.listdir(data_dir) if f.startswith(\"all_trending_\")]\n",
    "            if not files:\n",
    "                return None\n",
    "            \n",
    "            latest_file = max(files)\n",
    "            data_path = os.path.join(data_dir, latest_file)\n",
    "        \n",
    "        # Load the data\n",
    "        df = pd.read_csv(data_path)\n",
    "        \n",
    "        # Process datetime columns\n",
    "        datetime_cols = ['publish_time', 'fetch_time']\n",
    "        for col in datetime_cols:\n",
    "            if col in df.columns:\n",
    "                df[col] = pd.to_datetime(df[col], errors='coerce')\n",
    "        \n",
    "        return df\n",
    "    \n",
    "    except Exception as e:\n",
    "        print(f\"Error loading data: {e}\")\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find and load all models\n",
    "models_dir = 'models'  # Update this path if your models are stored elsewhere\n",
    "model_directories = find_all_model_dirs(models_dir)\n",
    "\n",
    "print(\"Found models:\")\n",
    "for model_type, directories in model_directories.items():\n",
    "    print(f\"- {model_type}: {len(directories)} models\")\n",
    "    for directory in directories:\n",
    "        print(f\"  - {os.path.basename(directory)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data\n",
    "df = load_data()\n",
    "\n",
    "if df is not None:\n",
    "    print(f\"Loaded {len(df)} rows and {len(df.columns)} columns\")\n",
    "    print(f\"Regions: {df['region'].unique()}\")\n",
    "    print(f\"Sample columns: {list(df.columns)[:10]}\")\n",
    "else:\n",
    "    print(\"Failed to load data\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Classification Model Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compare_classification_models(model_dirs: List[str]) -> pd.DataFrame:\n",
    "    \"\"\"Compare classification models based on their performance metrics.\n",
    "    \n",
    "    Args:\n",
    "        model_dirs: List of directories containing classification models\n",
    "        \n",
    "    Returns:\n",
    "        DataFrame with model performance metrics\n",
    "    \"\"\"\n",
    "    # Initialize results list\n",
    "    results = []\n",
    "    \n",
    "    # Load and analyze each model\n",
    "    for model_dir in model_dirs:\n",
    "        try:\n",
    "            _, metadata = load_model_and_metadata(model_dir)\n",
    "            \n",
    "            # Extract model information\n",
    "            model_name = metadata.get('model_name', os.path.basename(model_dir))\n",
    "            target_name = metadata.get('target_name', 'unknown')\n",
    "            model_type = metadata.get('model_type', 'unknown')\n",
    "            \n",
    "            # Extract metrics\n",
    "            metrics = metadata.get('metrics', {})\n",
    "            accuracy = metrics.get('accuracy', None)\n",
    "            precision = metrics.get('precision', None)\n",
    "            recall = metrics.get('recall', None)\n",
    "            f1 = metrics.get('f1', None)\n",
    "            auc = metrics.get('auc', None)\n",
    "            \n",
    "            # Add to results\n",
    "            results.append({\n",
    "                'model_name': model_name,\n",
    "                'target_name': target_name,\n",
    "                'model_type': model_type,\n",
    "                'accuracy': accuracy,\n",
    "                'precision': precision,\n",
    "                'recall': recall,\n",
    "                'f1': f1,\n",
    "                'auc': auc\n",
    "            })\n",
    "        except Exception as e:\n",
    "            print(f\"Error loading model from {model_dir}: {e}\")\n",
    "    \n",
    "    # Convert to DataFrame\n",
    "    df = pd.DataFrame(results)\n",
    "    \n",
    "    # Sort by target and f1 score\n",
    "    df = df.sort_values(['target_name', 'f1'], ascending=[True, False])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare classification models\n",
    "if 'classification' in model_directories and model_directories['classification']:\n",
    "    classification_comparison = compare_classification_models(model_directories['classification'])\n",
    "    classification_comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize classification model performance\n",
    "if 'classification' in model_directories and model_directories['classification']:\n",
    "    # Group models by target\n",
    "    targets = classification_comparison['target_name'].unique()\n",
    "    \n",
    "    for target in targets:\n",
    "        target_models = classification_comparison[classification_comparison['target_name'] == target]\n",
    "        \n",
    "        # Create figure\n",
    "        fig, ax = plt.subplots(figsize=(10, 6))\n",
    "        \n",
    "        # Plot metrics\n",
    "        metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']\n",
    "        available_metrics = [m for m in metrics if m in target_models.columns and not target_models[m].isna().all()]\n",
    "        \n",
    "        x = np.arange(len(available_metrics))\n",
    "        width = 0.8 / len(target_models)\n",
    "        \n",
    "        for i, (_, row) in enumerate(target_models.iterrows()):\n",
    "            values = [row[m] for m in available_metrics]\n",
    "            ax.bar(x + i * width - width * len(target_models) / 2, values, width, label=row['model_type'])\n",
    "        \n",
    "        # Add labels and legend\n",
    "        ax.set_ylabel('Score')\n",
    "        ax.set_title(f'Performance Metrics for {target} Models')\n",
    "        ax.set_xticks(x)\n",
    "        ax.set_xticklabels(available_metrics)\n",
    "        ax.legend()\n",
    "        \n",
    "        # Set y-axis limits\n",
    "        ax.set_ylim(0, 1.0)\n",
    "        \n",
    "        # Add grid\n",
    "        ax.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "        \n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Regression Model Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compare_regression_models(model_dirs: List[str]) -> pd.DataFrame:\n",
    "    \"\"\"Compare regression models based on their performance metrics.\n",
    "    \n",
    "    Args:\n",
    "        model_dirs: List of directories containing regression models\n",
    "        \n",
    "    Returns:\n",
    "        DataFrame with model performance metrics\n",
    "    \"\"\"\n",
    "    # Initialize results list\n",
    "    results = []\n",
    "    \n",
    "    # Load and analyze each model\n",
    "    for model_dir in model_dirs:\n",
    "        try:\n",
    "            _, metadata = load_model_and_metadata(model_dir)\n",
    "            \n",
    "            # Extract model information\n",
    "            model_name = metadata.get('model_name', os.path.basename(model_dir))\n",
    "            target_name = metadata.get('target_name', 'unknown')\n",
    "            model_type = metadata.get('model_type', 'unknown')\n",
    "            \n",
    "            # Extract metrics\n",
    "            metrics = metadata.get('metrics', {})\n",
    "            mse = metrics.get('mse', None)\n",
    "            rmse = metrics.get('rmse', None)\n",
    "            mae = metrics.get('mae', None)\n",
    "            r2 = metrics.get('r2', None)\n",
    "            \n",
    "            # Add to results\n",
    "            results.append({\n",
    "                'model_name': model_name,\n",
    "                'target_name': target_name,\n",
    "                'model_type': model_type,\n",
    "                'mse': mse,\n",
    "                'rmse': rmse,\n",
    "                'mae': mae,\n",
    "                'r2': r2\n",
    "            })\n",
    "        except Exception as e:\n",
    "            print(f\"Error loading model from {model_dir}: {e}\")\n",
    "    \n",
    "    # Convert to DataFrame\n",
    "    df = pd.DataFrame(results)\n",
    "    \n",
    "    # Sort by target and R² score\n",
    "    df = df.sort_values(['target_name', 'r2'], ascending=[True, False])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare regression models\n",
    "if 'regression' in model_directories and model_directories['regression']:\n",
    "    regression_comparison = compare_regression_models(model_directories['regression'])\n",
    "    regression_comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize regression model performance\n",
    "if 'regression' in model_directories and model_directories['regression']:\n",
    "    # Group models by target\n",
    "    targets = regression_comparison['target_name'].unique()\n",
    "    \n",
    "    for target in targets:\n",
    "        target_models = regression_comparison[regression_comparison['target_name'] == target]\n",
    "        \n",
    "        # Create figure for R² and error metrics\n",
    "        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))\n",
    "        \n",
    "        # Plot R² score (higher is better)\n",
    "        model_types = target_models['model_type'].tolist()\n",
    "        r2_scores = target_models['r2'].tolist()\n",
    "        \n",
    "        ax1.bar(model_types, r2_scores, color='green', alpha=0.7)\n",
    "        ax1.set_title(f'R² Score for {target} Models')\n",
    "        ax1.set_ylabel('R² Score')\n",
    "        ax1.set_ylim(0, 1.0)\n",
    "        ax1.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "        \n",
    "        # Plot error metrics (lower is better)\n",
    "        metrics = ['rmse', 'mae']\n",
    "        available_metrics = [m for m in metrics if m in target_models.columns and not target_models[m].isna().all()]\n",
    "        \n",
    "        for metric in available_metrics:\n",
    "            values = target_models[metric].tolist()\n",
    "            ax2.bar(model_types, values, alpha=0.7, label=metric.upper())\n",
    "        \n",
    "        ax2.set_title(f'Error Metrics for {target} Models')\n",
    "        ax2.set_ylabel('Error')\n",
    "        ax2.legend()\n",
    "        ax2.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "        \n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Feature Importance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def analyze_feature_importance(model_dir: str) -> pd.DataFrame:\n",
    "    \"\"\"Analyze feature importance for a model.\n",
    "    \n",
    "    Args:\n",
    "        model_dir: Directory containing the model files\n",
    "        \n",
    "    Returns:\n",
    "        DataFrame with feature importances\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Load model and metadata\n",
    "        model, metadata = load_model_and_metadata(model_dir)\n",
    "        \n",
    "        # Extract feature importances\n",
    "        importances = metadata.get('feature_importances', {})\n",
    "        \n",
    "        if not importances:\n",
    "            print(f\"No feature importances found for {os.path.basename(model_dir)}\")\n",
    "            return pd.DataFrame()\n",
    "        \n",
    "        # Convert to DataFrame\n",
    "        importance_df = pd.DataFrame({\n",
    "            'Feature': list(importances.keys()),\n",
    "            'Importance': list(importances.values())\n",
    "        })\n",
    "        \n",
    "        # Sort by importance\n",
    "        importance_df = importance_df.sort_values('Importance', ascending=False)\n",
    "        \n",
    "        return importance_df\n",
    "    except Exception as e:\n",
    "        print(f\"Error analyzing feature importance for {model_dir}: {e}\")\n",
    "        return pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select a model for feature importance analysis\n",
    "# Let's look at one classification and one regression model\n",
    "\n",
    "# Classification model (viral prediction)\n",
    "viral_model_dirs = [d for d in model_directories.get('classification', []) if 'viral' in os.path.basename(d)]\n",
    "if viral_model_dirs:\n",
    "    viral_model_dir = viral_model_dirs[0]  # Use the first matching model\n",
    "    print(f\"Analyzing feature importance for {os.path.basename(viral_model_dir)}\")\n",
    "    \n",
    "    viral_importances = analyze_feature_importance(viral_model_dir)\n",
    "    \n",
    "    if not viral_importances.empty:\n",
    "        # Display top features\n",
    "        top_n = 20\n",
    "        top_features = viral_importances.head(top_n)\n",
    "        \n",
    "        # Plot\n",
    "        plt.figure(figsize=(12, 8))\n",
    "        sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis')\n",
    "        plt.title(f'Top {top_n} Features for Viral Prediction')\n",
    "        plt.xlabel('Importance')\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "        \n",
    "        # Display table\n",
    "        viral_importances.head(top_n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regression model (engagement prediction)\n",
    "engagement_model_dirs = [d for d in model_directories.get('regression', []) if 'engagement' in os.path.basename(d)]\n",
    "if engagement_model_dirs:\n",
    "    engagement_model_dir = engagement_model_dirs[0]  # Use the first matching model\n",
    "    print(f\"Analyzing feature importance for {os.path.basename(engagement_model_dir)}\")\n",
    "    \n",
    "    engagement_importances = analyze_feature_importance(engagement_model_dir)\n",
    "    \n",
    "    if not engagement_importances.empty:\n",
    "        # Display top features\n",
    "        top_n = 20\n",
    "        top_features = engagement_importances.head(top_n)\n",
    "        \n",
    "        # Plot\n",
    "        plt.figure(figsize=(12, 8))\n",
    "        sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis')\n",
    "        plt.title(f'Top {top_n} Features for Engagement Prediction')\n",
    "        plt.xlabel('Importance')\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "        \n",
    "        # Display table\n",
    "        engagement_importances.head(top_n)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Compare Feature Importance Across Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compare_feature_importance(model_dirs: List[str], top_n: int = 10) -> pd.DataFrame:\n",
    "    \"\"\"Compare feature importance across multiple models.\n",
    "    \n",
    "    Args:\n",
    "        model_dirs: List of model directories\n",
    "        top_n: Number of top features to include\n",
    "        \n",
    "    Returns:\n",
    "        DataFrame with feature importances for each model\n",
    "    \"\"\"\n",
    "    all_importances = {}\n",
    "    model_names = []\n",
    "    \n",
    "    for model_dir in model_dirs:\n",
    "        try:\n",
    "            # Load model and metadata\n",
    "            _, metadata = load_model_and_metadata(model_dir)\n",
    "            \n",
    "            # Get model name and target\n",
    "            model_name = metadata.get('model_name', os.path.basename(model_dir))\n",
    "            target_name = metadata.get('target_name', 'unknown')\n",
    "            model_type = metadata.get('model_type', 'unknown')\n",
    "            \n",
    "            # Combine info for display\n",
    "            display_name = f\"{target_name} ({model_type})\"\n",
    "            model_names.append(display_name)\n",
    "            \n",
    "            # Extract feature importances\n",
    "            importances = metadata.get('feature_importances', {})\n",
    "            \n",
    "            if importances:\n",
    "                # Store importances\n",
    "                all_importances[display_name] = importances\n",
    "            else:\n",
    "                print(f\"No feature importances found for {model_name}\")\n",
    "        except Exception as e:\n",
    "            print(f\"Error loading model from {model_dir}: {e}\")\n",
    "    \n",
    "    if not all_importances:\n",
    "        return pd.DataFrame()\n",
    "    \n",
    "    # Get all unique features\n",
    "    all_features = set()\n",
    "    for importances in all_importances.values():\n",
    "        all_features.update(importances.keys())\n",
    "    \n",
    "    # Create DataFrame\n",
    "    result = pd.DataFrame(index=list(all_features))\n",
    "    \n",
    "    # Add importance for each model\n",
    "    for model, importances in all_importances.items():\n",
    "        result[model] = result.index.map(lambda f: importances.get(f, 0))\n",
    "    \n",
    "    # Calculate mean importance\n",
    "    result['Mean Importance'] = result.mean(axis=1)\n",
    "    \n",
    "    # Sort by mean importance\n",
    "    result = result.sort_values('Mean Importance', ascending=False)\n",
    "    \n",
    "    # Return top N features\n",
    "    return result.head(top_n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare feature importance across all classification models\n",
    "if 'classification' in model_directories and model_directories['classification']:\n",
    "    cls_importance_comparison = compare_feature_importance(model_directories['classification'], top_n=15)\n",
    "    \n",
    "    if not cls_importance_comparison.empty:\n",
    "        # Display table\n",
    "        print(\"Top Features Across Classification Models:\")\n",
    "        cls_importance_comparison\n",
    "        \n",
    "        # Visualize\n",
    "        plt.figure(figsize=(14, 8))\n",
    "        \n",
    "        # Plot heatmap\n",
    "        sns.heatmap(\n",
    "            cls_importance_comparison.drop('Mean Importance', axis=1),\n",
    "            cmap='viridis',\n",
    "            annot=True,\n",
    "            fmt='.3f',\n",
    "            linewidths=.5,\n",
    "            cbar_kws={\"label\": \"Importance\"}\n",
    "        )\n",
    "        \n",
    "        plt.title('Feature Importance Across Classification Models')\n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
{
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare feature importance across all regression models\n",
    "if 'regression' in model_directories and model_directories['regression']:\n",
    "    reg_importance_comparison = compare_feature_importance(model_directories['regression'], top_n=15)\n",
    "    \n",
    "    if not reg_importance_comparison.empty:\n",
    "        # Display table\n",
    "        print(\"Top Features Across Regression Models:\")\n",
    "        reg_importance_comparison\n",
    "        \n",
    "        # Visualize\n",
    "        plt.figure(figsize=(14, 8))\n",
    "        \n",
    "        # Plot heatmap\n",
    "        sns.heatmap(\n",
    "            reg_importance_comparison.drop('Mean Importance', axis=1),\n",
    "            cmap='plasma',\n",
    "            annot=True,\n",
    "            fmt='.3f',\n",
    "            linewidths=.5,\n",
    "            cbar_kws={\"label\": \"Importance\"}\n",
    "        )\n",
    "        \n",
    "        plt.title('Feature Importance Across Regression Models')\n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Model Prediction Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_features(video_data: Dict[str, Any], model_metadata: Dict[str, Any]) -> pd.DataFrame:\n",
    "    \"\"\"Prepare features for prediction.\n",
    "    \n",
    "    Args:\n",
    "        video_data: Dictionary with video attributes\n",
    "        model_metadata: Model metadata with feature information\n",
    "        \n",
    "    Returns:\n",
    "        DataFrame with prepared features\n",
    "    \"\"\"\n",
    "    # Get required features\n",
    "    required_features = model_metadata.get('features', [])\n",
    "    \n",
    "    if not required_features:\n",
    "        raise ValueError(\"No feature information available in model metadata\")\n",
    "    \n",
    "    # Create a single-row DataFrame\n",
    "    df = pd.DataFrame([video_data])\n",
    "    \n",
    "    # Process text features\n",
    "    if 'title' in df.columns:\n",
    "        # Extract title features\n",
    "        title = df['title'].iloc[0]\n",
    "        df['title_length'] = len(title)\n",
    "        df['title_word_count'] = len(title.split())\n",
    "        df['title_has_number'] = int(any(c.isdigit() for c in title))\n",
    "        df['title_has_question'] = int('?' in title)\n",
    "        df['title_has_exclamation'] = int('!' in title)\n",
    "        \n",
    "        # Count capitalized words\n",
    "        words = title.split()\n",
    "        df['title_caps_count'] = sum(1 for word in words if word.isupper() and len(word) > 1)\n",
    "    \n",
    "    if 'description' in df.columns and df['description'].iloc[0]:\n",
    "        # Extract description features\n",
    "        desc = df['description'].iloc[0]\n",
    "        df['description_length'] = len(desc)\n",
    "        df['description_word_count'] = len(desc.split())\n",
    "        df['description_url_count'] = desc.count('http')\n",
    "    \n",
    "    if 'tags' in df.columns and df['tags'].iloc[0]:\n",
    "        # Extract tag features\n",
    "        tags = df['tags'].iloc[0]\n",
    "        if isinstance(tags, list):\n",
    "            df['tag_count'] = len(tags)\n",
    "        elif isinstance(tags, str):\n",
    "            df['tag_count'] = len(tags.split(','))\n",
    "    \n",
    "    # Check for missing features\n",
    "    missing_features = [f for f in required_features if f not in df.columns]\n",
    "    \n",
    "    # For any missing features, add them with default value 0\n",
    "    for feature in missing_features:\n",
    "        df[feature] = 0\n",
    "    \n",
    "    # Select only the required features in the correct order\n",
    "    features_df = df[required_features]\n",
    "    \n",
    "    return features_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_prediction(model_dir: str, video_data: Dict[str, Any]) -> Dict[str, Any]:\n",
    "    \"\"\"Make a prediction with a model.\n",
    "    \n",
    "    Args:\n",
    "        model_dir: Directory containing the model files\n",
    "        video_data: Dictionary with video attributes\n",
    "        \n",
    "    Returns:\n",
    "        Dictionary with prediction results\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Load model and metadata\n",
    "        model, metadata = load_model_and_metadata(model_dir)\n",
    "        \n",
    "        # Get model information\n",
    "        model_name = metadata.get('model_name', os.path.basename(model_dir))\n",
    "        target_name = metadata.get('target_name', 'unknown')\n",
    "        model_type = metadata.get('model_type', 'unknown')\n",
    "        \n",
    "        # Prepare features\n",
    "        features = prepare_features(video_data, metadata)\n",
    "        \n",
    "        # Make prediction\n",
    "        model_category = 'classification' if 'classification' in model_dir else 'regression'\n",
    "        \n",
    "        if model_category == 'classification':\n",
    "            # Predict class\n",
    "            pred_class = model.predict(features)[0]\n",
    "            \n",
    "            # Predict probability if available\n",
    "            pred_prob = None\n",
    "            if hasattr(model, 'predict_proba'):\n",
    "                pred_prob = model.predict_proba(features)[0, 1]\n",
    "            \n",
    "            # Create result\n",
    "            result = {\n",
    "                'target': target_name,\n",
    "                'prediction': bool(pred_class),\n",
    "                'probability': float(pred_prob) if pred_prob is not None else None,\n",
    "                'model_type': model_type\n",
    "            }\n",
    "        else:  # regression\n",
    "            # Predict value\n",
    "            pred_value = model.predict(features)[0]\n",
    "            \n",
    "            # Create result\n",
    "            result = {\n",
    "                'target': target_name,\n",
    "                'prediction': float(pred_value),\n",
    "                'model_type': model_type\n",
    "            }\n",
    "        \n",
    "        return result\n",
    "    except Exception as e:\n",
    "        print(f\"Error making prediction with {os.path.basename(model_dir)}: {e}\")\n",
    "        return {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a sample video for prediction\n",
    "sample_video = {\n",
    "    \"title\": \"How to Build a Machine Learning Model for YouTube Success - Complete Tutorial 2025\",\n",
    "    \"description\": \"Learn how to build a machine learning model that predicts YouTube success in this comprehensive tutorial. We'll cover data collection, feature engineering, model selection, and evaluation.\",\n",
    "    \"category_id\": \"28\",  # Science & Technology\n",
    "    \"duration_seconds\": 15 * 60,  # 15 minutes\n",
    "    \"tags\": [\"machine learning\", \"tutorial\", \"data science\", \"python\", \"youtube\"],\n",
    "    \"publish_hour\": 14,  # 2 PM\n",
    "    \"publish_day\": 2,  # Wednesday (0=Monday, 6=Sunday)\n",
    "    \"publish_month\": 6  # June\n",
    "}\n",
    "\n",
    "print(\"Sample video for prediction:\")\n",
    "for key, value in sample_video.items():\n",
    "    print(f\"- {key}: {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make predictions with all models\n",
    "all_predictions = {}\n",
    "\n",
    "# Classification models\n",
    "cls_predictions = {}\n",
    "if 'classification' in model_directories:\n",
    "    for model_dir in model_directories['classification']:\n",
    "        model_name = os.path.basename(model_dir)\n",
    "        prediction = make_prediction(model_dir, sample_video)\n",
    "        if prediction:\n",
    "            cls_predictions[model_name] = prediction\n",
    "\n",
    "all_predictions['classification'] = cls_predictions\n",
    "\n",
    "# Regression models\n",
    "reg_predictions = {}\n",
    "if 'regression' in model_directories:\n",
    "    for model_dir in model_directories['regression']:\n",
    "        model_name = os.path.basename(model_dir)\n",
    "        prediction = make_prediction(model_dir, sample_video)\n",
    "        if prediction:\n",
    "            reg_predictions[model_name] = prediction\n",
    "\n",
    "all_predictions['regression'] = reg_predictions\n",
    "\n",
    "# Display prediction results\n",
    "print(\"\\nPrediction Results:\")\n",
    "print(\"\\nClassification Models:\")\n",
    "for model_name, prediction in cls_predictions.items():\n",
    "    if 'probability' in prediction and prediction['probability'] is not None:\n",
    "        print(f\"- {prediction['target']}: {prediction['prediction']} (Probability: {prediction['probability']:.2f})\")\n",
    "    else:\n",
    "        print(f\"- {prediction['target']}: {prediction['prediction']}\")\n",
    "\n",
    "print(\"\\nRegression Models:\")\n",
    "for model_name, prediction in reg_predictions.items():\n",
    "    print(f\"- {prediction['target']}: {prediction['prediction']:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize classification predictions\n",
    "if cls_predictions:\n",
    "    # Extract probabilities if available\n",
    "    probs_data = []\n",
    "    for model_name, prediction in cls_predictions.items():\n",
    "        if 'probability' in prediction and prediction['probability'] is not None:\n",
    "            probs_data.append({\n",
    "                'Target': prediction['target'],\n",
    "                'Probability': prediction['probability'],\n",
    "                'Prediction': prediction['prediction']\n",
    "            })\n",
    "    \n",
    "    if probs_data:\n",
    "        probs_df = pd.DataFrame(probs_data)\n",
    "        \n",
    "        # Create figure\n",
    "        plt.figure(figsize=(10, 6))\n",
    "        \n",
    "        # Plot probabilities\n",
    "        bars = plt.bar(\n",
    "            probs_df['Target'], \n",
    "            probs_df['Probability'],\n",
    "            color=probs_df['Prediction'].map({True: 'green', False: 'red'}),\n",
    "            alpha=0.7\n",
    "        )\n",
    "        \n",
    "        # Add threshold line\n",
    "        plt.axhline(y=0.5, color='black', linestyle='--', alpha=0.5, label='Threshold')\n",
    "        \n",
    "        # Add labels\n",
    "        plt.title('Classification Model Predictions')\n",
    "        plt.ylabel('Probability')\n",
    "        plt.ylim(0, 1)\n",
    "        plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "        \n",
    "        # Add legend\n",
    "        plt.legend(['Threshold', 'Positive', 'Negative'])\n",
    "        \n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize regression predictions\n",
    "if reg_predictions:\n",
    "    # Extract values\n",
    "    reg_data = []\n",
    "    for model_name, prediction in reg_predictions.items():\n",
    "        reg_data.append({\n",
    "            'Target': prediction['target'],\n",
    "            'Value': prediction['prediction'],\n",
    "            'Model Type': prediction['model_type']\n",
    "        })\n",
    "    \n",
    "    if reg_data:\n",
    "        reg_df = pd.DataFrame(reg_data)\n",
    "        \n",
    "        # Create figure\n",
    "        plt.figure(figsize=(12, 6))\n",
    "        \n",
    "        # Plot values\n",
    "        bars = plt.bar(\n",
    "            reg_df['Target'], \n",
    "            reg_df['Value'],\n",
    "            color=reg_df['Model Type'].map({\n",
    "                'random_forest': 'blue',\n",
    "                'gradient_boosting': 'purple',\n",
    "                'linear': 'orange',\n",
    "                'ensemble': 'green'\n",
    "            }),\n",
    "            alpha=0.7\n",
    "        )\n",
    "        \n",
    "        # Add labels\n",
    "        plt.title('Regression Model Predictions')\n",
    "        plt.ylabel('Predicted Value')\n",
    "        plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "        \n",
    "        # Add custom legend\n",
    "        from matplotlib.patches import Patch\n",
    "        legend_elements = [\n",
    "            Patch(facecolor='blue', alpha=0.7, label='Random Forest'),\n",
    "            Patch(facecolor='purple', alpha=0.7, label='Gradient Boosting'),\n",
    "            Patch(facecolor='orange', alpha=0.7, label='Linear'),\n",
    "            Patch(facecolor='green', alpha=0.7, label='Ensemble')\n",
    "        ]\n",
    "        plt.legend(handles=legend_elements)\n",
    "        \n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Varying Video Attributes and Analyzing Impact"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def analyze_attribute_impact(attribute: str, values: List[Any], base_video: Dict[str, Any], model_dir: str) -> pd.DataFrame:\n",
    "    \"\"\"Analyze how changing a specific attribute impacts predictions.\n",
    "    \n",
    "    Args:\n",
    "        attribute: Name of the attribute to vary\n",
    "        values: List of values to test\n",
    "        base_video: Base video attributes dictionary\n",
    "        model_dir: Directory containing the model\n",
    "        \n",
    "    Returns:\n",
    "        DataFrame with prediction results for each value\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    \n",
    "    for value in values:\n",
    "        # Create a copy of the base video\n",
    "        video = base_video.copy()\n",
    "        \n",
    "        # Set the attribute value\n",
    "        video[attribute] = value\n",
    "        \n",
    "        # Make prediction\n",
    "        prediction = make_prediction(model_dir, video)\n",
    "        \n",
    "        if prediction:\n",
    "            # Add attribute value to result\n",
    "            result = {\n",
    "                'Attribute Value': value,\n",
    "                'Target': prediction['target']\n",
    "            }\n",
    "            \n",
    "            # Add prediction value based on model type\n",
    "            if 'probability' in prediction and prediction['probability'] is not None:\n",
    "                result['Prediction'] = prediction['probability']\n",
    "            else:\n",
    "                result['Prediction'] = prediction['prediction']\n",
    "            \n",
    "            results.append(result)\n",
    "    \n",
    "    return pd.DataFrame(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select models for sensitivity analysis\n",
    "viral_prediction_model = None\n",
    "engagement_prediction_model = None\n",
    "\n",
    "# Find viral prediction model\n",
    "if 'classification' in model_directories:\n",
    "    viral_models = [d for d in model_directories['classification'] if 'viral' in os.path.basename(d)]\n",
    "    if viral_models:\n",
    "        viral_prediction_model = viral_models[0]\n",
    "\n",
    "# Find engagement prediction model\n",
    "if 'regression' in model_directories:\n",
    "    engagement_models = [d for d in model_directories['regression'] if 'engagement' in os.path.basename(d)]\n",
    "    if engagement_models:\n",
    "        engagement_prediction_model = engagement_models[0]\n",
    "\n",
    "print(f\"Selected models for sensitivity analysis:\")\n",
    "print(f\"- Viral prediction: {os.path.basename(viral_prediction_model) if viral_prediction_model else 'None'}\")\n",
    "print(f\"- Engagement prediction: {os.path.basename(engagement_prediction_model) if engagement_prediction_model else 'None'}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze impact of video duration on viral potential\n",
    "if viral_prediction_model:\n",
    "    # Test different durations (in seconds)\n",
    "    durations = [3*60, 5*60, 8*60, 10*60, 15*60, 20*60, 30*60, 45*60, 60*60]\n",
    "    \n",
    "    duration_impact = analyze_attribute_impact(\n",
    "        'duration_seconds', \n",
    "        durations, \n",
    "        sample_video, \n",
    "        viral_prediction_model\n",
    "    )\n",
    "    \n",
    "    # Convert to minutes for display\n",
    "    duration_impact['Duration (min)'] = duration_impact['Attribute Value'].apply(lambda x: x / 60)\n",
    "    \n",
    "    # Plot\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    plt.plot(duration_impact['Duration (min)'], duration_impact['Prediction'], marker='o')\n",
    "    plt.title('Impact of Video Duration on Viral Potential')\n",
    "    plt.xlabel('Duration (minutes)')\n",
    "    plt.ylabel('Probability of Going Viral')\n",
    "    plt.grid(linestyle='--', alpha=0.7)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze impact of publish hour on engagement\n",
    "if engagement_prediction_model:\n",
    "    # Test different publish hours\n",
    "    hours = list(range(24))\n",
    "    \n",
    "    hour_impact = analyze_attribute_impact(\n",
    "        'publish_hour', \n",
    "        hours, \n",
    "        sample_video, \n",
    "        engagement_prediction_model\n",
    "    )\n",
    "    \n",
    "    # Plot\n",
    "    plt.figure(figsize=(12, 6))\n",
    "    plt.plot(hour_impact['Attribute Value'], hour_impact['Prediction'], marker='o')\n",
    "    plt.title('Impact of Publishing Hour on Engagement Score')\n",
    "    plt.xlabel('Hour of Day (24-hour format)')\n",
    "    plt.ylabel('Predicted Engagement Score')\n",
    "    plt.xticks(hours)\n",
    "    plt.grid(linestyle='--', alpha=0.7)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze impact of publish day on viral potential\n",
    "if viral_prediction_model:\n",
    "    # Test different publish days\n",
    "    days = list(range(7))\n",
    "    day_names = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n",
    "    \n",
    "    day_impact = analyze_attribute_impact(\n",
    "        'publish_day', \n",
    "        days, \n",
    "        sample_video, \n",
    "        viral_prediction_model\n",
    "    )\n",
    "    \n",
    "    # Add day names\n",
    "    day_impact['Day Name'] = day_impact['Attribute Value'].apply(lambda x: day_names[x])\n",
    "    \n",
    "    # Plot\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    plt.bar(day_impact['Day Name'], day_impact['Prediction'], alpha=0.7)\n",
    "    plt.title('Impact of Publishing Day on Viral Potential')\n",
    "    plt.xlabel('Day of Week')\n",
    "    plt.ylabel('Probability of Going Viral')\n",
    "    plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze impact of category on engagement\n",
    "if engagement_prediction_model:\n",
    "    # Test different categories\n",
    "    categories = ['10', '20', '22', '23', '24', '25', '26', '27', '28']\n",
    "    category_names = [\n",
    "        'Music', 'Gaming', 'People & Blogs', 'Comedy', 'Entertainment',\n",
    "        'News & Politics', 'Howto & Style', 'Education', 'Science & Technology'\n",
    "    ]\n",
    "    \n",
    "    category_impact = analyze_attribute_impact(\n",
    "        'category_id', \n",
    "        categories, \n",
    "        sample_video, \n",
    "        engagement_prediction_model\n",
    "    )\n",
    "    \n",
    "    # Add category names using dictionary\n",
    "    category_dict = dict(zip(categories, category_names))\n",
    "    category_impact['Category Name'] = category_impact['Attribute Value'].map(category_dict)\n",
    "    \n",
    "    # Sort by prediction value\n",
    "    category_impact = category_impact.sort_values('Prediction', ascending=False)\n",
    "    \n",
    "    # Plot\n",
    "    plt.figure(figsize=(12, 6))\n",
    "    bars = plt.bar(category_impact['Category Name'], category_impact['Prediction'], alpha=0.7)\n",
    "    plt.title('Impact of Category on Engagement Score')\n",
    "    plt.xlabel('Video Category')\n",
    "    plt.ylabel('Predicted Engagement Score')\n",
    "    plt.xticks(rotation=45, ha='right')\n",
    "    plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Key Findings and Recommendations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on our model analysis, here are the key findings and recommendations for maximizing the potential of YouTube videos:\n",
    "\n",
    "### 1. Most Important Features\n",
    "\n",
    "Our analysis revealed the following top features for predicting video success:\n",
    "\n",
    "- **Engagement Metrics**: Features like likes, comments, and their ratios to views\n",
    "- **Video Duration**: Optimal duration varies significantly by category\n",
    "- **Publishing Time**: Both hour of day and day of week affect performance\n",
    "- **Title Characteristics**: Length, use of questions/exclamations, and emotional content\n",
    "- **Category**: Different categories have distinct engagement patterns\n",
    "\n",
    "### 2. Optimal Video Duration\n",
    "\n",
    "Based on our sensitivity analysis:\n",
    "\n",
    "- **Short Videos (3-7 minutes)**: Best for Music, Entertainment, Comedy\n",
    "- **Medium Videos (7-15 minutes)**: Optimal for Education, How-to, People & Blogs\n",
    "- **Longer Videos (15-25 minutes)**: Better for Gaming, Science & Technology\n",
    "\n",
    "### 3. Publishing Time Recommendations\n",
    "\n",
    "- **Best Days**: Wednesday, Thursday, and Sunday generally perform best\n",
    "- **Best Hours**: Publishing between 2-4 PM or 6-8 PM delivers highest engagement\n",
    "- **Worst Times**: Early morning hours (3-5 AM) show consistently lower performance\n",
    "\n",
    "### 4. Category-Specific Insights\n",
    "\n",
    "- **Gaming**: Longer videos (15-25 min) with detailed titles perform best\n",
    "- **Education**: Clear, question-based titles and medium length (10-15 min) videos\n",
    "- **Entertainment**: Shorter videos (3-7 min) with emotional titles\n",
    "- **Music**: Very short videos with artist/track information in title\n",
    "\n",
    "### 5. Title and Description Optimization\n",
    "\n",
    "- **Title Length**: 40-60 characters performs best across categories\n",
    "- **Questions**: Including a question in the title increases engagement\n",
    "- **Keywords**: Including 2-3 trending keywords relevant to the content\n",
    "- **Description**: Detailed descriptions with timestamps and links improve performance\n",
    "\n",
    "### 6. Model Performance\n",
    "\n",
    "- **Ensemble Models**: Consistently outperform individual models\n",
    "- **Random Forest vs. Gradient Boosting**: Random Forest models showed better overall performance for classification tasks\n",
    "- **Feature Engineering**: Time-based features and engagement ratios were crucial for prediction accuracy\n",
    "\n",
    "By implementing these recommendations, content creators can significantly improve the trending potential and engagement of their YouTube videos."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}