<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality-Testing/blob/main/rf_pipeline_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Setup & load
# Reads the engineered-feature CSV, validates the target column, and derives the
# list of numeric feature columns used by the later cells.
import pandas as pd
import numpy as np

filepath = "engineered_features_test.csv"  # <- change if needed
df = pd.read_csv(filepath)

print("Shape:", df.shape)
print("Columns:", len(df.columns))

# Target
TARGET_COL = "viral_label"
if TARGET_COL not in df.columns:
    raise KeyError(f"Missing target column '{TARGET_COL}' in CSV.")

# Basic hygiene: turn +/-inf into NaN so they are treated as missing values
df = df.replace([np.inf, -np.inf], np.nan)

# Fail early with a clear message: the target is later cast with astype(int),
# which cannot represent missing labels.
if df[TARGET_COL].isna().any():
    raise ValueError(f"Target column '{TARGET_COL}' contains missing values.")

# Keep only numeric features for the RF. select_dtypes already drops text/path
# columns; the explicit exclusion set is applied as well so that a raw column
# that happens to be parsed as numeric can never leak into the feature list.
non_feature_cols = {"title", "tags", "thumbnail_path"}  # anything clearly non-numeric
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# ensure the target (and the excluded raw columns) are not in features
excluded_cols = non_feature_cols | {TARGET_COL}
feature_cols = [c for c in numeric_cols if c not in excluded_cols]
if not feature_cols:
    raise ValueError("No numeric feature columns found in CSV.")

print("Numeric feature count:", len(feature_cols))
print("Positive rate (viral=1):", df[TARGET_COL].mean().round(4))

Shape: (93, 81)
Columns: 81
Numeric feature count: 80
Positive rate (viral=1): 0.172


In [5]:
# Train/Val/Test split
from sklearn.model_selection import train_test_split

# Feature matrix and integer-coded target.
X = df[feature_cols]
y = df[TARGET_COL].astype(int)

# Stage 1: hold out 40% of the rows (stratified on the target), leaving 60% for training.
first_split = train_test_split(X, y, test_size=0.40, stratify=y, random_state=42)
X_train, X_temp, y_train, y_temp = first_split

# Stage 2: cut the hold-out in half (again stratified) -> 20% validation, 20% test.
second_split = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)
X_val, X_test, y_val, y_test = second_split

print("Train/Val/Test sizes:", *(len(part) for part in (X_train, X_val, X_test)))

Train/Val/Test sizes: 55 19 19


In [6]:
# Pipeline + hyperparameter search (AUC)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Two-step pipeline: median imputation followed by a class-weighted random forest.
median_imputer = SimpleImputer(strategy="median")
forest = RandomForestClassifier(
    n_estimators=400,  # starting value only; the search below tunes it
    class_weight="balanced_subsample",  # helpful for imbalance
    n_jobs=-1,
    random_state=42,
)
pipe = Pipeline(steps=[("imputer", median_imputer), ("rf", forest)])

# Candidate values sampled by the randomized search (keys target the "rf" step).
param_dist = {
    "rf__n_estimators": [300, 400, 600, 800],
    "rf__max_depth": [None, 8, 12, 16, 24, 32],
    "rf__min_samples_split": [2, 5, 10, 20],
    "rf__min_samples_leaf": [1, 2, 4, 8],
    "rf__max_features": ["sqrt", "log2", 0.3, 0.5, 0.8],
    "rf__bootstrap": [True],
    "rf__max_samples": [None, 0.7, 0.85, 0.95],  # only meaningful with bootstrap=True
}

# Shuffled, stratified 5-fold CV on the training split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 40 random draws scored by ROC-AUC; the best configuration is refit on all of X_train.
search = RandomizedSearchCV(
    pipe,
    param_dist,
    n_iter=40,
    scoring="roc_auc",
    cv=cv,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

best_model = search.best_estimator_
print("Best AUC (CV):", round(search.best_score_, 4))
print("Best params:", search.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best AUC (CV): 0.7378
Best params: {'rf__n_estimators': 300, 'rf__min_samples_split': 20, 'rf__min_samples_leaf': 8, 'rf__max_samples': 0.85, 'rf__max_features': 0.8, 'rf__max_depth': 12, 'rf__bootstrap': True}


In [7]:
# Validation & Test metrics, threshold selection
# Scores the tuned model on the validation and test splits, picks the decision
# threshold that maximizes F1 on validation, and reports test metrics at it.
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_fscore_support, confusion_matrix

# Predict probabilities (column 1 = probability of the positive class)
p_val = best_model.predict_proba(X_val)[:, 1]
p_test = best_model.predict_proba(X_test)[:, 1]

# Metrics at probability level
print("VAL ROC-AUC:", round(roc_auc_score(y_val, p_val), 4))
print("VAL PR-AUC:",  round(average_precision_score(y_val, p_val), 4))
print("TEST ROC-AUC:", round(roc_auc_score(y_test, p_test), 4))
print("TEST PR-AUC:",  round(average_precision_score(y_test, p_test), 4))

# Pick threshold on VAL that maximizes F1.
# High thresholds can yield no predicted positives; zero_division=0 scores those
# candidates as 0 (same value as the default) without emitting
# UndefinedMetricWarning for every such threshold.
ths = np.linspace(0.05, 0.95, 19)
f1s = [f1_score(y_val, (p_val >= t).astype(int), zero_division=0) for t in ths]
# np.argmax returns the first maximum, so ties resolve to the lowest threshold.
best_t = ths[int(np.argmax(f1s))]
print("Chosen threshold (max F1 on VAL):", round(best_t, 3))

# Apply to TEST
pred_test = (p_test >= best_t).astype(int)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test, pred_test, average="binary", zero_division=0
)
# Pin the label order so the matrix is always 2x2 and matches the printed legend.
cm = confusion_matrix(y_test, pred_test, labels=[0, 1])

print(f"TEST Precision: {prec:.3f}  Recall: {rec:.3f}  F1: {f1:.3f}")
print("Confusion Matrix [[TN FP]\n                   [FN TP]]:\n", cm)

VAL ROC-AUC: 0.9167
VAL PR-AUC: 0.7333
TEST ROC-AUC: 0.6667
TEST PR-AUC: 0.2778
Chosen threshold (max F1 on VAL): 0.3
TEST Precision: 0.200  Recall: 0.667  F1: 0.308
Confusion Matrix [[TN FP]
                   [FN TP]]:
 [[8 8]
 [1 2]]


In [8]:
# Top features (Permutation importance on VAL)
from sklearn.inspection import permutation_importance

# Importance is measured on the validation split so the test split stays untouched.
result = permutation_importance(
    best_model,
    X_val,
    y_val,
    scoring="roc_auc",
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# One row per feature: mean and std of the AUC drop over the repeats.
importance_table = pd.DataFrame(
    {
        "feature": X_val.columns,
        "importance_mean": result.importances_mean,
        "importance_std": result.importances_std,
    }
)
imp = importance_table.sort_values(by="importance_mean", ascending=False)

print("Top 25 features by permutation importance (VAL, AUC):")
display(imp.head(25))

Top 25 features by permutation importance (VAL, AUC):


Unnamed: 0,feature,importance_mean,importance_std
37,title_emb_pca_15,0.203333,0.136585
23,title_emb_pca_01,0.053333,0.071024
25,title_emb_pca_03,0.018333,0.013844
28,title_emb_pca_06,0.013333,0.04
3,thumbnail_hue,0.0,0.0
2,thumbnail_contrast,0.0,0.0
6,thumbnail_texture_entropy,0.0,0.0
4,thumbnail_saturation,0.0,0.0
14,title_emotion_disgust,0.0,0.0
13,title_emotion_anger,0.0,0.0
