<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality/blob/main/notebooks/regression_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📈 YouTube Video Virality – Regression Modeling
This notebook trains regression models to predict the log-transformed (`log1p`) values of:
- `viewCount`
- `likeCount`
- `commentCount`

using pre-extracted video features (image, text, and ResNet embeddings).


In [34]:
!pip install xgboost



In [35]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import cross_validate

# Per-video feature table (CSV) served from the project's GitHub repository.
url = (
    "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/"
    "refs/heads/main/data/video_details_v8.csv"
)
df = pd.read_csv(url)

# Preview the first rows as the cell output
df.head()

Unnamed: 0,channel_id,description,tags,title,avg_red,avg_green,avg_blue,brightness,contrast,video_id,...,num_unique_tags,title_embedding,embedding_distance_to_known_viral,title_readability,title_embedding_distance_to_viral,dominant_color_hue,thumbnail_edge_density,viewCount_log,likeCount_log,commentCount_log
0,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",im on a horse mf 🎶,73.374931,65.419201,77.057917,71.950683,63.507962,ehEhzzttOvI,...,9,[-1.29806036e-02 -9.70325321e-02 7.53702000e-...,0.248691,117.16,0.819275,-1.0,0.0,11.051588,7.468513,3.258097
1,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",KSI GETS TROLLED,88.901267,74.133906,71.210868,78.082014,53.159171,Zz--BUJ4VuU,...,9,[-6.63425922e-02 2.56073568e-02 2.16348972e-...,0.534146,119.19,0.741365,-1.0,0.0,11.320069,8.046229,3.73767
2,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",Oh Baby A Triple 🎶,83.173628,83.075104,83.087812,83.112182,65.593829,n9HpZHxhkVw,...,9,[-6.80741072e-02 -3.87456194e-02 -7.43182795e-...,0.318269,75.875,0.816851,-1.0,0.0,12.29852,8.980676,3.89182
3,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",KSIMON on Among Us,61.804375,48.056736,54.434601,54.765237,57.291874,Txk6EyuHi0A,...,9,[-8.89404044e-02 -5.75461239e-03 1.83530767e-...,0.454985,75.875,0.747618,-1.0,0.0,12.469014,9.200997,3.610918
4,UCh5mLn90vUaB1PbRRx_AiaA,🍗: Order food NOW at: https://www.eatsides.com...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",Deji is the GOAT Jester,31.690226,19.795156,22.93283,24.806071,28.240555,7WVjn7yB4-Q,...,9,[ 1.14647886e-02 8.77908915e-02 3.60484645e-...,0.413802,83.32,0.743162,-1.0,0.0,12.116952,8.861775,3.806662


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Columns that must not reach the model: identifiers, raw text fields, the
# `title_embedding` column (shown as bracketed text in the preview), the
# `viral` label, and every form of the targets (raw counts and their logs).
drop_cols = [
    "video_id", "title", "channel_id",
    "viewCount", "likeCount", "commentCount", "viral", "description", "tags",
    "title_embedding", "viewCount_log", "likeCount_log", "commentCount_log"
]

# `Unnamed: N` columns are row indices written out when the CSV was saved.
# They carry the row order, not information about the video, so they are
# removed from the features as well.
index_cols = [col for col in df.columns if col.startswith("Unnamed:")]

X = df.drop(columns=drop_cols + index_cols)
y_view = df["viewCount"]
y_like = df["likeCount"]
y_comment = df["commentCount"]

# Log-transform to normalize skewed counts (log1p keeps zero counts finite)
y_view_log = np.log1p(y_view)
y_like_log = np.log1p(y_like)
y_comment_log = np.log1p(y_comment)

# Train/test split. A single call partitions the features and all three
# targets with the same shuffled row indices.
(
    X_train_raw, X_test_raw,
    yv_train, yv_test,
    yl_train, yl_test,
    yc_train, yc_test,
) = train_test_split(
    X, y_view_log, y_like_log, y_comment_log, test_size=0.2, random_state=42
)

# Scale X. The scaler is fitted on the training rows only, so the mean and
# variance of the held-out rows do not leak into the training features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix on the same (train-fitted) scale.
X_scaled = scaler.transform(X)

In [42]:
# Defining our regression models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Regressors to compare, keyed by the display name used in the printed
# reports. Both receive random_state=42.
models = dict(
    [
        ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
        ("XGBoost", XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ]
)

In [43]:
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold

def evaluate_model_cv(model, X, y, target_name, cv_folds=10):
    """Cross-validate ``model`` and print its average R², MAE and RMSE.

    All three metrics are collected in a single cross-validation pass, so
    each fold is fitted once rather than once per metric.

    Parameters
    ----------
    model : estimator
        Regressor passed to ``cross_validate``.
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    target_name : str
        Label for the target, used only in the printed header.
    cv_folds : int, default 10
        Number of shuffled K-fold splits (fixed ``random_state=42``).

    Returns
    -------
    dict
        Unrounded fold averages under the keys ``"r2"``, ``"mae"`` and
        ``"rmse"``; the two error metrics are reported as positive values.
    """
    print(f"\n📈 {type(model).__name__} on {target_name} ({cv_folds}-fold CV)")

    # Set up KFold
    cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # R², MAE, RMSE scoring in one pass
    scores = cross_validate(
        model, X, y, cv=cv,
        scoring={
            "r2": "r2",
            "mae": "neg_mean_absolute_error",
            "rmse": "neg_root_mean_squared_error",
        },
    )

    # The error scorers return negated values, so flip the sign back.
    metrics = {
        "r2": scores["test_r2"].mean(),
        "mae": -scores["test_mae"].mean(),
        "rmse": -scores["test_rmse"].mean(),
    }

    print("Avg R²:", round(metrics["r2"], 4))
    print("Avg MAE:", round(metrics["mae"], 2))
    print("Avg RMSE:", round(metrics["rmse"], 2))

    return metrics

In [44]:
# Feature Importance Function
import seaborn as sns

def plot_feature_importance(model, feature_names, title, top_n=15, exclude_prefix="resnet_"):
    """Plot the strongest feature importances of a fitted model as a bar chart.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``coef_`` or ``feature_importances_``; ``coef_`` is used
        when both are present. If neither exists a message is printed and
        nothing is drawn.
    feature_names : sequence of str
        Names matching the importance values position by position.
    title : str
        Text appended to the chart title.
    top_n : int, default 15
        Maximum number of features shown, ranked by absolute importance.
    exclude_prefix : str, default "resnet_"
        Features whose name starts with this prefix are left out.

    Returns
    -------
    None
    """
    # Handle model-specific importance attributes
    if hasattr(model, "coef_"):
        importance = model.coef_.flatten() if model.coef_.ndim > 1 else model.coef_
    elif hasattr(model, "feature_importances_"):
        importance = model.feature_importances_
    else:
        print(f"No feature importance attribute for {type(model).__name__}")
        return

    # Create DataFrame for plotting
    feat_df = pd.DataFrame({
        "feature": feature_names,
        "importance": importance
    })

    # Filter out the excluded embeddings, then keep the top_n by magnitude.
    # The bars show signed values, so negative coefficients stay visible.
    keep = ~feat_df["feature"].str.startswith(exclude_prefix)
    feat_df = (
        feat_df.loc[keep]
        .assign(abs_importance=lambda frame: frame["importance"].abs())
        .sort_values(by="abs_importance", ascending=False)
        .head(top_n)
    )

    # Plot on a dedicated figure so repeated calls in one cell do not overlay.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x="importance", y="feature", data=feat_df, palette="viridis", ax=ax)
    ax.set_title(f"Top {top_n} Non-ResNet Features - {title}")
    fig.tight_layout()
    plt.show()

In [46]:
# Training & evaluation inputs: the log-scale target columns stored in the dataset
yv, yl, yc = (
    df[column]
    for column in ("viewCount_log", "likeCount_log", "commentCount_log")
)

# Column order of X, used to label the importance scores
feature_names = list(X.columns)

def print_feature_importance(model, feature_names, title, top_n=15, exclude_prefix="resnet_"):
    """Print the ``top_n`` features of ``model`` ranked by absolute importance.

    Importances come from ``model.coef_`` when that attribute exists
    (flattened if it has more than one dimension), otherwise from
    ``model.feature_importances_``. If the model has neither, a message is
    printed and the function returns. Features whose name starts with
    ``exclude_prefix`` are skipped. The printed values keep their sign.
    """
    # Pick whichever importance attribute the model provides
    if hasattr(model, "coef_"):
        weights = model.coef_
        if weights.ndim > 1:
            weights = weights.flatten()
    elif hasattr(model, "feature_importances_"):
        weights = model.feature_importances_
    else:
        print(f"No feature importance attribute for {type(model).__name__}")
        return

    table = pd.DataFrame({"feature": feature_names, "importance": weights})

    # Drop excluded features, rank by magnitude, keep the strongest top_n
    is_kept = ~table["feature"].str.startswith(exclude_prefix)
    ranked = (
        table.loc[is_kept]
        .assign(abs_importance=lambda frame: frame["importance"].abs())
        .sort_values(by="abs_importance", ascending=False)
        .head(top_n)
    )

    print(f"\nTop {top_n} Features - {title}:")
    for feature, score in zip(ranked["feature"], ranked["importance"]):
        print(f"- {feature}: {score:.4f}")


# Log-scale targets paired with the label used in the printed reports.
targets = {"viewCount": yv, "likeCount": yl, "commentCount": yc}

# Full-data fits keyed by (model name, target label). Each fit uses its own
# clone: calling .fit() repeatedly on the shared instance from `models`
# returns that same object every time, which would leave every
# `final_model_*` name pointing at one estimator trained on the last target.
# The entries of `models` themselves are therefore left unfitted.
final_models = {}

for name, model in models.items():
    for target_label, target_values in targets.items():
        evaluate_model_cv(model, X, target_values, f"{target_label}_log")

        # Fit a model on the full data for feature importance
        fitted = clone(model).fit(X, target_values)
        final_models[(name, target_label)] = fitted
        print_feature_importance(fitted, feature_names, f"{name} - {target_label} (Full Data)")

    # Per-target names kept from the original cell; after the loop they refer
    # to the last model in `models`, each fitted on its own target.
    final_model_v = final_models[(name, "viewCount")]
    final_model_l = final_models[(name, "likeCount")]
    final_model_c = final_models[(name, "commentCount")]


📈 RandomForestRegressor on viewCount_log (10-fold CV)
Avg R²: 0.5467
Avg MAE: 0.53
Avg RMSE: 0.76

Top 15 Features - Random Forest - viewCount (Full Data):
- contrast: 0.3185
- embedding_distance_to_known_viral: 0.1559
- title_embedding_distance_to_viral: 0.0627
- description_length: 0.0481
- title_length: 0.0442
- percent_letters_uppercase: 0.0404
- avg_red: 0.0360
- avg_blue: 0.0318
- title_sentiment: 0.0314
- uppercase_word_count: 0.0312
- brightness: 0.0305
- title_readability: 0.0285
- tag_count: 0.0264
- avg_green: 0.0239
- num_unique_tags: 0.0209

📈 RandomForestRegressor on likeCount_log (10-fold CV)
Avg R²: 0.5159
Avg MAE: 0.48
Avg RMSE: 0.68

Top 15 Features - Random Forest - likeCount (Full Data):
- contrast: 0.2187
- embedding_distance_to_known_viral: 0.1487
- title_embedding_distance_to_viral: 0.0703
- description_length: 0.0647
- title_length: 0.0563
- num_unique_tags: 0.0541
- tag_count: 0.0476
- percent_letters_uppercase: 0.0466
- avg_red: 0.0382
- avg_blue: 0.0354
- ti