<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality/blob/main/notebooks/regression_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📈 YouTube Video Virality – Regression Modeling
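This notebook trains regression models to predict three engagement counts (each modeled on a `log1p` scale, so the reported metrics are in log units):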
- `viewCount`
- `likeCount`
- `commentCount`

using pre-extracted video features (image, text, and ResNet embeddings).


In [1]:
!pip install xgboost



In [2]:
import pandas as pd

# Location of the video details CSV (v8) on the project's GitHub main branch.
url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details_v8.csv"

# Read the CSV over HTTPS into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Show the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,channel_id,description,tags,title,avg_red,avg_green,avg_blue,brightness,contrast,video_id,...,description_link_count,tag_count,tag_sentiment,num_unique_tags,title_embedding,embedding_distance_to_known_viral,title_readability,title_embedding_distance_to_viral,dominant_color_hue,thumbnail_edge_density
0,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",im on a horse mf 🎶,73.374931,65.419201,77.057917,71.950683,63.507962,ehEhzzttOvI,...,37,9,0.0,9,[-1.29806036e-02 -9.70325321e-02 7.53702000e-...,0.248691,117.16,0.819275,0.0,0.085625
1,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",KSI GETS TROLLED,88.901267,74.133906,71.210868,78.082014,53.159171,Zz--BUJ4VuU,...,37,9,0.0,9,[-6.63425922e-02 2.56073568e-02 2.16348972e-...,0.534146,119.19,0.741365,0.0,0.055017
2,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",Oh Baby A Triple 🎶,83.173628,83.075104,83.087812,83.112182,65.593829,n9HpZHxhkVw,...,37,9,0.0,9,[-6.80741072e-02 -3.87456194e-02 -7.43182795e-...,0.318269,75.875,0.816851,0.0,0.06151
3,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",KSIMON on Among Us,61.804375,48.056736,54.434601,54.765237,57.291874,Txk6EyuHi0A,...,37,9,0.0,9,[-8.89404044e-02 -5.75461239e-03 1.83530767e-...,0.454985,75.875,0.747618,11.0,0.056163
4,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",Deji is the GOAT Jester,31.690226,19.795156,22.93283,24.806071,28.240555,7WVjn7yB4-Q,...,37,9,0.0,9,[ 1.14647886e-02 8.77908915e-02 3.60484645e-...,0.413802,83.32,0.743162,0.0,0.030382


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Columns kept out of the feature matrix: identifiers, free-text fields,
# the raw title_embedding column, the three targets and the `viral` column.
drop_cols = [
    "video_id", "title", "channel_id",
    "viewCount", "likeCount", "commentCount", "viral", "description", "tags", "title_embedding"
]

X = df.drop(columns=drop_cols)
y_view = df["viewCount"]
y_like = df["likeCount"]
y_comment = df["commentCount"]

# Log-transform the skewed count targets (log1p keeps zero counts finite).
# All metrics downstream are therefore measured in log units.
y_view_log = np.log1p(y_view)
y_like_log = np.log1p(y_like)
y_comment_log = np.log1p(y_comment)

# Split features and all three targets in a single call so every target is
# guaranteed to share exactly the same train/test rows.
(
    X_train_raw, X_test_raw,
    yv_train, yv_test,
    yl_train, yl_test,
    yc_train, yc_test,
) = train_test_split(
    X, y_view_log, y_like_log, y_comment_log, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only: fitting on the full matrix would
# let test-set means/variances leak into the features the models train on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training statistics, kept under its original
# name in case later cells reference it.
X_scaled = scaler.transform(X)

In [9]:
# Defining our regression models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Candidate regressors keyed by the display name used in the printed reports.
# The two ensemble models are given a fixed random_state of 42.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge(alpha=1.0)),
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
])

In [16]:
# Defining our evaluation function
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

def evaluate_model(model, X_train, y_train, X_test, y_test, target_name, plot=False):
    """Fit ``model`` on the training split and report its test-set metrics.

    The model is fitted in place (any previous fit is replaced). R^2, MAE and
    RMSE are computed on the values exactly as passed in; no inverse
    transform is applied, so log-scale targets give log-scale errors.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets.
    target_name : str
        Label used in the printed header and the plot title.
    plot : bool, default False
        When True, also draw an actual-vs-predicted scatter plot with the
        identity line for reference.

    Returns
    -------
    dict
        Unrounded ``{"r2": ..., "mae": ..., "rmse": ...}`` so callers can
        collect results into a table; the printed values are rounded.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    metrics = {
        "r2": r2_score(y_test, preds),
        "mae": mean_absolute_error(y_test, preds),
        "rmse": np.sqrt(mean_squared_error(y_test, preds)),
    }

    print(f"📈 {type(model).__name__} on {target_name}")
    print("R^2:", round(metrics["r2"], 4))
    print("MAE:", round(metrics["mae"], 2))
    print("RMSE:", round(metrics["rmse"], 2))

    if plot:
        # Endpoints of the y = x reference line, taken from the true values.
        lo, hi = np.min(y_test), np.max(y_test)
        plt.figure(figsize=(5, 5))
        plt.scatter(y_test, preds, alpha=0.3)
        plt.xlabel("Actual")
        plt.ylabel("Predicted")
        plt.title(f"{type(model).__name__} - {target_name}")
        plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')
        plt.show()

    return metrics

In [17]:
# Feature Importance Function
import seaborn as sns

def plot_feature_importance(model, feature_names, title, top_n=15, exclude_prefix="resnet_"):
    """Draw a horizontal bar chart of a fitted model's strongest features.

    Importances are read from ``model.coef_`` when present (flattened if it
    has more than one dimension), otherwise from
    ``model.feature_importances_``. Features are ranked by absolute value and
    the signed value is what gets plotted.

    Parameters
    ----------
    model : fitted estimator.
    feature_names : sequence of str
        One name per importance value, in the same order.
    title : str
        Appended to the figure title.
    top_n : int, default 15
        Maximum number of features shown.
    exclude_prefix : str or None, default "resnet_"
        Features whose name starts with this prefix are hidden. ``None`` or
        an empty string disables the filter.

    Returns
    -------
    None
        A message is printed and nothing is drawn when the model exposes
        neither attribute.
    """
    # Prefer coef_; fall back to feature_importances_.
    if hasattr(model, "coef_"):
        importance = model.coef_.flatten() if model.coef_.ndim > 1 else model.coef_
    elif hasattr(model, "feature_importances_"):
        importance = model.feature_importances_
    else:
        print(f"No feature importance attribute for {type(model).__name__}")
        return

    feat_df = pd.DataFrame({
        "feature": feature_names,
        "importance": importance
    })

    # Only filter for a real prefix: every string starts with "", so an empty
    # prefix would discard all features, and None is not a valid pattern.
    if exclude_prefix:
        keep = ~feat_df["feature"].str.startswith(exclude_prefix)
        feat_df = feat_df.loc[keep]

    # assign() builds a new frame, so nothing is written onto a filtered slice.
    feat_df = (
        feat_df.assign(abs_importance=feat_df["importance"].abs())
        .sort_values(by="abs_importance", ascending=False)
        .head(top_n)
    )

    # Plot
    plt.figure(figsize=(8, 6))
    sns.barplot(x="importance", y="feature", data=feat_df, palette="viridis")
    plt.title(f"Top {top_n} Non-ResNet Features - {title}")
    plt.tight_layout()
    plt.show()

In [18]:
# Training & Evaluating for all our target variables

# Column labels of the unscaled feature frame, used to name each importance value.
feature_names = list(X.columns)

def print_feature_importance(model, feature_names, title, top_n=15, exclude_prefix="resnet_"):
    """Print a fitted model's strongest features, one per line.

    Importances are read from ``model.coef_`` when present (flattened if it
    has more than one dimension), otherwise from
    ``model.feature_importances_``. Features are ranked by absolute value and
    printed with their signed value to four decimals.

    Parameters
    ----------
    model : fitted estimator.
    feature_names : sequence of str
        One name per importance value, in the same order.
    title : str
        Appended to the printed header.
    top_n : int, default 15
        Maximum number of features listed.
    exclude_prefix : str or None, default "resnet_"
        Features whose name starts with this prefix are hidden. ``None`` or
        an empty string disables the filter.

    Returns
    -------
    None
        Only a message is printed when the model exposes neither attribute.
    """
    # Prefer coef_; fall back to feature_importances_.
    if hasattr(model, "coef_"):
        importance = model.coef_.flatten() if model.coef_.ndim > 1 else model.coef_
    elif hasattr(model, "feature_importances_"):
        importance = model.feature_importances_
    else:
        print(f"No feature importance attribute for {type(model).__name__}")
        return

    feat_df = pd.DataFrame({
        "feature": feature_names,
        "importance": importance
    })

    # Only filter for a real prefix: every string starts with "", so an empty
    # prefix would discard all features, and None is not a valid pattern.
    if exclude_prefix:
        keep = ~feat_df["feature"].str.startswith(exclude_prefix)
        feat_df = feat_df.loc[keep]

    # assign() builds a new frame, so nothing is written onto a filtered slice.
    feat_df = (
        feat_df.assign(abs_importance=feat_df["importance"].abs())
        .sort_values(by="abs_importance", ascending=False)
        .head(top_n)
    )

    print(f"\nTop {top_n} Features - {title}:")
    # Walk the two columns directly instead of materialising a Series per row.
    for feature, value in zip(feat_df["feature"], feat_df["importance"]):
        print(f"- {feature}: {value:.4f}")


# Each target's display name paired with its (train, test) log-scale labels,
# so the fit / evaluate / report sequence is written once rather than
# copy-pasted per target.
target_splits = {
    "viewCount": (yv_train, yv_test),
    "likeCount": (yl_train, yl_test),
    "commentCount": (yc_train, yc_test),
}

for name, model in models.items():
    for target_name, (target_train, target_test) in target_splits.items():
        print("\n")
        # evaluate_model refits `model` in place, so its importances are
        # reported right away, before the next target replaces the fit.
        evaluate_model(model, X_train, target_train, X_test, target_test, target_name)
        print_feature_importance(model, feature_names, f"{name} - {target_name}")



📈 LinearRegression on viewCount
R^2: 0.5011
MAE: 0.58
RMSE: 0.74

Top 15 Features - Linear Regression - viewCount:
- embedding_distance_to_known_viral: 0.4219
- description_link_count: 0.3673
- description_has_keywords: -0.3574
- word_count: 0.3129
- percent_letters_uppercase: 0.2240
- contrast: 0.2094
- title_embedding_distance_to_viral: 0.1941
- thumbnail_edge_density: 0.1934
- title_sentiment: -0.0748
- num_unique_tags: 0.0707
- tag_count: 0.0707
- uppercase_word_count: -0.0701
- has_exclamation: -0.0683
- title_readability: -0.0629
- dominant_color_hue: 0.0538


📈 LinearRegression on likeCount
R^2: 0.4576
MAE: 0.53
RMSE: 0.68

Top 15 Features - Linear Regression - likeCount:
- description_link_count: 0.6417
- description_length: -0.4970
- embedding_distance_to_known_viral: 0.3637
- word_count: 0.2495
- percent_letters_uppercase: 0.2357
- title_embedding_distance_to_viral: 0.2008
- contrast: 0.1923
- description_has_keywords: -0.1721
- uppercase_word_count: -0.1274
- thumbnail_edg