<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality-Testing/blob/main/feature_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1: Thumbnail Features

In [None]:
# Name of the input CSV; it is read from /content (the Colab working directory).
filename = "test.csv"

# Load the CSV
import pandas as pd

# Read the raw video metadata and preview the first rows.
csv_path = '/content/' + filename
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.categoryId,snippet.liveBroadcastContent,snippet.defaultLanguage,snippet.localized.title,snippet.localized.description,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount
0,youtube#video,D5hdxPi5WRssQsXncpidVGWN9ro,qRF04lTrbPs,UCQil4Xo3ymmGwMdCBvHVmLA,2025-08-05T13:01:35Z,UCQil4Xo3ymmGwMdCBvHVmLA,Ken losing his mind RAGE COMPILATION,Ken goes insane raging at Clash Royale. Long c...,https://i.ytimg.com/vi/qRF04lTrbPs/default.jpg,120,...,20,none,en,Ken losing his mind RAGE COMPILATION,Ken goes insane raging at Clash Royale. Long c...,en,42167,1776,0,71
1,youtube#video,auY24O-_owrfJHstP6Oir3EZsp0,NeWv_WA_4R0,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-28T13:22:34Z,UCQil4Xo3ymmGwMdCBvHVmLA,Ken fumbling easy wins RAGE COMPILATION,Compilation of Ken's best rage moments and the...,https://i.ytimg.com/vi/NeWv_WA_4R0/default.jpg,120,...,20,none,en,Ken fumbling easy wins RAGE COMPILATION,Compilation of Ken's best rage moments and the...,en,76601,3056,0,77
2,youtube#video,RFAWz7pgfALVB0hpRRpnbJ9zqyM,xF0KI6oo7ak,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-21T13:45:08Z,UCQil4Xo3ymmGwMdCBvHVmLA,The Impossible Level 16 King Tower Glitch,Very rare chef tower monk bug happens in Ken's...,https://i.ytimg.com/vi/xF0KI6oo7ak/default.jpg,120,...,20,none,en,The Impossible Level 16 King Tower Glitch,Very rare chef tower monk bug happens in Ken's...,en,24549,1042,0,25
3,youtube#video,c4Av6MkEuL2mI-k69AU7aqIfVII,addCkFsak1U,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-16T14:00:05Z,UCQil4Xo3ymmGwMdCBvHVmLA,The Mother Witch Incident (ft. Ken's neighbour),Extremely rare and classic Ken Clash Royale mo...,https://i.ytimg.com/vi/addCkFsak1U/default.jpg,120,...,20,none,en,The Mother Witch Incident (ft. Ken's neighbour),Extremely rare and classic Ken Clash Royale mo...,en,103786,4871,0,99
4,youtube#video,ZYSk9dCdICIQLkhmin_oiVkuB_A,GFd2FmQAyE0,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-10T06:36:14Z,UCQil4Xo3ymmGwMdCBvHVmLA,The WORST Clash Royale crash out of all time (...,I've watched hundreds of Ken videos and this i...,https://i.ytimg.com/vi/GFd2FmQAyE0/default.jpg,120,...,20,none,en,The WORST Clash Royale crash out of all time (...,I've watched hundreds of Ken videos and this i...,en,61414,3818,0,124


In [None]:
# Collect the distinct, non-null video IDs present in the metadata
unique_ids = df["id"].dropna().unique()
video_ids = unique_ids.tolist()
print(f"Found {len(video_ids)} video IDs.")

# Download thumbnails
import os
import requests
from PIL import Image
from io import BytesIO

# Local folder that receives one JPEG per video (no error if it already exists)
os.makedirs("thumbnails", exist_ok=True)

def download_thumbnail(video_id, timeout=10):
    """
    Download the "mqdefault" YouTube thumbnail for one video and store it as
    an RGB JPEG at thumbnails/{video_id}.jpg.

    Args:
        video_id: YouTube video ID used to build the thumbnail URL and file name.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The local file path on success, or None if the request, decoding, or
        save step failed (the error is printed; nothing is raised).
    """
    url = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
    path = f"thumbnails/{video_id}.jpg"
    try:
        # Without a timeout a single stalled connection would hang the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat HTTP errors (e.g. 404) as failures so that the body of an error
        # response is never decoded and saved as if it were the real thumbnail.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.save(path)
        return path
    except Exception as e:
        # Best effort by design: report the problem and let the caller continue.
        print(f"Error downloading {video_id}: {e}")
        return None

paths = list(map(download_thumbnail, video_ids))



##### The code below creates a temporary thumbnail_path column that maps
##### each video to its local thumbnail, for use by cross-level features

# Build a mapping from video_id -> local thumbnail path (only if file exists)
candidate_paths = {str(vid): f"thumbnails/{vid}.jpg" for vid in video_ids}
path_map = {
    key: local_path
    for key, local_path in candidate_paths.items()
    if os.path.isfile(local_path)
}

# Add paths to the DataFrame (rows without a file on disk get NaN)
df["thumbnail_path"] = df["id"].astype(str).map(path_map)

# Optional: quick sanity flag + count
mapped_paths = df["thumbnail_path"]
df["thumbnail_exists"] = mapped_paths.apply(
    lambda p: os.path.isfile(p) if isinstance(p, str) else False
)
mapped_count = int(df["thumbnail_exists"].sum())
print("Thumbnails mapped for rows:", mapped_count, "/", len(df))


Found 93 video IDs.
Thumbnails mapped for rows: 93 / 93


In [None]:
##### Creates:
# thumbnail_colorfulness

import numpy as np

def compute_colorfulness(img_path):
    """
    Colorfulness score of an image, after the Hasler–Süsstrunk formula:
      C = sqrt(σ_rg^2 + σ_yb^2) + 0.3 * sqrt(μ_rg^2 + μ_yb^2)
    computed here on
      rg = |R − G|
      yb = |0.5*(R + G) − B|

    NOTE(review): the published metric is usually stated with signed opponent
    channels (no absolute value); this function takes absolute values before
    computing the statistics — confirm that this variant is intended.

    Args:
        img_path: Path of the image file to score.

    Returns:
        The colorfulness score as a NumPy float scalar (higher = more colorful).
    """
    rgb = np.array(Image.open(img_path).convert('RGB')).astype('float32')
    red, green, blue = (rgb[..., channel] for channel in range(3))

    # Opponent-colour channels (absolute differences)
    rg = np.abs(red - green)
    yb = np.abs(0.5 * (red + green) - blue)

    # Spread term (standard deviations) and offset term (means)
    spread = np.sqrt(np.std(rg) ** 2 + np.std(yb) ** 2)
    offset = np.sqrt(np.mean(rg) ** 2 + np.mean(yb) ** 2)

    return spread + 0.3 * offset

# Score each distinct video once; thumbnails are expected at thumbnails/{id}.jpg
colorfulness_scores = []
for vid in df["id"].dropna().unique():
    path = os.path.join("thumbnails", f"{vid}.jpg")
    score = compute_colorfulness(path) if os.path.exists(path) else np.nan
    colorfulness_scores.append((vid, score))

# id -> score lookup, then broadcast back onto every row of df
cf_dict = {video_key: value for video_key, value in colorfulness_scores}
df["thumbnail_colorfulness"] = df["id"].map(cf_dict)

# Inspect
df[["id", "thumbnail_colorfulness"]].head()

Unnamed: 0,id,thumbnail_colorfulness
0,qRF04lTrbPs,64.76886
1,NeWv_WA_4R0,68.511398
2,xF0KI6oo7ak,82.837021
3,addCkFsak1U,87.258003
4,GFd2FmQAyE0,58.20858


In [None]:
##### Creates:
# thumbnail_brightness
# thumbnail_contrast

def compute_brightness_rms(img_path):
    """
    Brightness and contrast of an image's grayscale ("L" mode) version.

    Args:
        img_path: Path of the image file.

    Returns:
      - mean_luminance: Mean pixel intensity (0 → 255)
      - rms_contrast: Standard deviation of luminance divided by mean luminance
        (0.0 when the mean luminance is exactly zero)
    """
    gray = np.array(Image.open(img_path).convert('L')).astype(np.float32)
    mean_luminance = gray.mean()
    if mean_luminance == 0:
        # Entirely black image: the ratio is undefined, report zero contrast
        return mean_luminance, 0.0
    return mean_luminance, gray.std() / mean_luminance

# One (id, brightness, contrast) record per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    brightness, rms = (
        compute_brightness_rms(thumb_path)
        if os.path.isfile(thumb_path)
        else (np.nan, np.nan)
    )
    results.append((vid, brightness, rms))

# Per-feature lookups keyed by video id
brightness_map = {record[0]: record[1] for record in results}
rms_map = {record[0]: record[2] for record in results}

# Map back to DataFrame
df["thumbnail_brightness"] = df["id"].map(brightness_map)
df["thumbnail_contrast"] = df["id"].map(rms_map)

# Preview updated columns
df[["id", "thumbnail_brightness", "thumbnail_contrast"]].head()

Unnamed: 0,id,thumbnail_brightness,thumbnail_contrast
0,qRF04lTrbPs,116.419357,0.564473
1,NeWv_WA_4R0,121.534286,0.574799
2,xF0KI6oo7ak,112.830292,0.539484
3,addCkFsak1U,134.029816,0.50634
4,GFd2FmQAyE0,102.990135,0.761938


In [None]:
##### Creates:
# thumbnail_hue
# thumbnail_saturation

def compute_hue_saturation(img_path):
    """
    Average hue and saturation of an image.

    Hue is an angle, so it is averaged as a circular quantity (mean of unit
    vectors) instead of arithmetically: a plain mean of two reddish hues at
    350° and 10° would give 180° (cyan) rather than 0° (red).

    Args:
        img_path: Path of the image file.

    Returns:
      - mean_hue: Circular mean hue in degrees (0 → 360, 360 excluded)
      - mean_saturation: Mean saturation (0 → 1)
    """
    # PIL HSV channels are 0–255; convert to float
    img = np.array(Image.open(img_path).convert('HSV')).astype(np.float32)
    H, S = img[..., 0], img[..., 1]

    # Scale the 0–255 hue channel to radians (255 ↔ full turn, as in the degree scaling)
    hue_rad = H * (2.0 * np.pi / 255.0)
    mean_sin = float(np.sin(hue_rad).mean())
    mean_cos = float(np.cos(hue_rad).mean())

    # arctan2 yields (-180°, 180°]; wrap the result into [0, 360)
    mean_hue = float(np.degrees(np.arctan2(mean_sin, mean_cos)) % 360.0)
    if mean_hue >= 360.0:
        # Guard against floating-point wrap of a tiny negative angle
        mean_hue = 0.0

    # Scale S to a fraction
    mean_saturation = float(S.mean() / 255.0)
    return mean_hue, mean_saturation

# One (id, hue, saturation) record per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    hue, sat = (
        compute_hue_saturation(thumb_path)
        if os.path.isfile(thumb_path)
        else (np.nan, np.nan)
    )
    results.append((vid, hue, sat))

# Per-feature lookups keyed by video id
hue_map = {record[0]: record[1] for record in results}
sat_map = {record[0]: record[2] for record in results}

# Map back to DataFrame
df["thumbnail_hue"] = df["id"].map(hue_map)
df["thumbnail_saturation"] = df["id"].map(sat_map)

# Preview new columns
df[["id", "thumbnail_hue", "thumbnail_saturation"]].head()

Unnamed: 0,id,thumbnail_hue,thumbnail_saturation
0,qRF04lTrbPs,108.838844,0.380881
1,NeWv_WA_4R0,117.835182,0.427772
2,xF0KI6oo7ak,55.820709,0.503604
3,addCkFsak1U,125.57164,0.361966
4,GFd2FmQAyE0,104.623383,0.424597


In [None]:
##### Creates:
# thumbnail_edge_density
# thumbnail_texture_entropy

import cv2

def compute_edge_texture(img_path):
    """
    Edge density and grayscale entropy of an image.

    Args:
        img_path: Path of the image file.

    Returns:
      - edge_density: fraction of Canny edge pixels (0 → 1)
      - texture_entropy: Shannon entropy of grayscale pixel distribution (in bits)
      Both are NaN if the image cannot be read.
    """
    # Load as grayscale
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports an unreadable or corrupt file by returning None rather
    # than raising; without this guard cv2.Canny would fail on the None input.
    if img is None or img.size == 0:
        return np.nan, np.nan

    # Edge density
    edges = cv2.Canny(img, threshold1=100, threshold2=200)
    edge_density = np.count_nonzero(edges) / img.size

    # Texture (grayscale) entropy: exactly one bin per 8-bit intensity value
    hist = np.bincount(img.ravel(), minlength=256)
    probs = hist / hist.sum()
    probs_nonzero = probs[probs > 0]  # skip empty bins so log2 never sees 0
    texture_entropy = -np.sum(probs_nonzero * np.log2(probs_nonzero))

    return edge_density, texture_entropy

# One (id, edge density, entropy) record per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    ed, te = (
        compute_edge_texture(thumb_path)
        if os.path.isfile(thumb_path)
        else (np.nan, np.nan)
    )
    results.append((vid, ed, te))

# Per-feature lookups keyed by video id
edge_density_map = {record[0]: record[1] for record in results}
texture_entropy_map = {record[0]: record[2] for record in results}

# Map results back to DataFrame
df["thumbnail_edge_density"] = df["id"].map(edge_density_map)
df["thumbnail_texture_entropy"] = df["id"].map(texture_entropy_map)

# Preview new columns
df[["id", "thumbnail_edge_density", "thumbnail_texture_entropy"]].head()

Unnamed: 0,id,thumbnail_edge_density,thumbnail_texture_entropy
0,qRF04lTrbPs,0.164722,7.886865
1,NeWv_WA_4R0,0.200174,7.910999
2,xF0KI6oo7ak,0.214844,7.82749
3,addCkFsak1U,0.165208,7.822907
4,GFd2FmQAyE0,0.168767,7.730054


In [None]:
##### Creates:
# thumbnail_quality

def compute_thumbnail_quality(img_path):
    """
    Sharpness estimate of an image.

    Args:
        img_path: Path of the image file.

    Returns:
      - thumbnail_quality: variance of the Laplacian of the grayscale image
        (higher = sharper, lower = blurrier); NaN if the image cannot be read.
    """
    gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for unreadable files; without this guard
    # cv2.Laplacian would fail on the None input.
    if gray is None or gray.size == 0:
        return np.nan
    # CV_64F keeps negative second-derivative responses instead of clipping them
    lap = cv2.Laplacian(gray, cv2.CV_64F)
    return lap.var()

# One (id, quality) pair per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    quality = compute_thumbnail_quality(thumb_path) if os.path.isfile(thumb_path) else np.nan
    results.append((vid, quality))

# Map back to DataFrame
quality_map = dict(results)
df["thumbnail_quality"] = df["id"].map(quality_map)

# Preview the new feature
df[["id", "thumbnail_quality"]].head()

Unnamed: 0,id,thumbnail_quality
0,qRF04lTrbPs,4676.820102
1,NeWv_WA_4R0,8264.712197
2,xF0KI6oo7ak,5745.212433
3,addCkFsak1U,6141.966672
4,GFd2FmQAyE0,6204.187401


In [None]:
##### Creates:
# thumbnail_face_area_ratio

# Load OpenCV's built-in Haar cascade for frontal-face detection.
# Built once at module level; compute_face_area_ratio reuses it for every thumbnail.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)

def compute_face_area_ratio(img_path):
    """
    Share of the image covered by detected faces.

    Args:
        img_path: Path of the image file.

    Returns:
      - face_area_ratio: sum of detected face bounding-box areas divided by
        total image area. Boxes are summed individually, so overlapping
        detections count twice. NaN if the image cannot be read; 0.0 if the
        image has zero area.
    """
    image = cv2.imread(img_path)
    if image is None:
        return np.nan

    height, width = image.shape[:2]
    detections = face_cascade.detectMultiScale(
        cv2.cvtColor(image, cv2.COLOR_BGR2GRAY),
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30)
    )

    # Accumulate the area of every detected (x, y, w, h) box
    total_face_area = 0
    for _, _, box_w, box_h in detections:
        total_face_area = total_face_area + (box_w * box_h)

    total_area = width * height
    if total_area > 0:
        return total_face_area / total_area
    return 0.0

# One (id, ratio) pair per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(thumb_path):
        ratio = compute_face_area_ratio(thumb_path)
    else:
        ratio = np.nan
    results.append((vid, ratio))

# Map back to DataFrame
face_area_map = dict(results)
df["thumbnail_face_area_ratio"] = df["id"].map(face_area_map)

# Preview the new feature
df[["id", "thumbnail_face_area_ratio"]].head()

Unnamed: 0,id,thumbnail_face_area_ratio
0,qRF04lTrbPs,0.229601
1,NeWv_WA_4R0,0.198767
2,xF0KI6oo7ak,0.105625
3,addCkFsak1U,0.068906
4,GFd2FmQAyE0,0.143767


In [None]:
##### Creates:
# thumbnail_face_emotion

!pip install -q mediapipe==0.10.14

import os
import cv2
import numpy as np
import mediapipe as mp

# Assumes `df` is already loaded and thumbnails are in "thumbnails/{video_id}.jpg"

# Short alias for MediaPipe's Face Mesh solution; FaceMesh objects are created from it below
mp_face_mesh = mp.solutions.face_mesh

def _euclid(p1, p2):
    return float(np.hypot(p1[0]-p2[0], p1[1]-p2[1]))

def _safe_ratio(a, b, eps=1e-6):
    return float(a / (b + eps))

def compute_thumbnail_face_emotion(img_path: str) -> float:
    """
    Landmark-based valence proxy in [-1, 1]:
      + Increases with mouth width (smile) and eye openness
      + Slightly increases with mouth openness (surprise)

    One score is computed per detected face (up to 10) and the scores are
    averaged, weighted by each face's approximate area (face width × height).

    Args:
        img_path: Path of the image file.

    Returns:
        The area-weighted score; 0.0 if no faces are detected (or every
        detected face is degenerate); NaN if the image cannot be read.
    """
    img_bgr = cv2.imread(img_path)
    if img_bgr is None:
        return np.nan
    H, W = img_bgr.shape[:2]
    # The image is converted from OpenCV's BGR order to RGB before being passed to FaceMesh
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    # A new FaceMesh is created (and closed) on every call
    with mp_face_mesh.FaceMesh(
        static_image_mode=True,
        max_num_faces=10,
        refine_landmarks=True,
        min_detection_confidence=0.5
    ) as fm:
        res = fm.process(img_rgb)

    if not res.multi_face_landmarks:
        return 0.0

    vals, areas = [], []
    for lm in res.multi_face_landmarks:
        # Landmark coordinates are scaled by image width/height and truncated to integer pixels
        pts = [(int(p.x * W), int(p.y * H)) for p in lm.landmark]

        # Landmark indices (MediaPipe Face Mesh):
        # Mouth corners: 61 (left), 291 (right)
        # Upper/Lower lip center: 13, 14
        # Face "width": 234 (left cheek), 454 (right cheek)
        # Face "height": 10 (forehead), 152 (chin)
        # Eyes: left top/bottom 159/145 with corners 33/133; right 386/374 with corners 362/263
        face_w  = _euclid(pts[234], pts[454])
        face_h  = _euclid(pts[10],  pts[152])
        # Skip degenerate faces (under one pixel wide or tall)
        if face_w < 1 or face_h < 1:
            continue

        mouth_w     = _euclid(pts[61],  pts[291])
        mouth_open  = _euclid(pts[13],  pts[14])
        # Eye openness = lid-to-lid distance relative to the eye's corner-to-corner width
        left_eye_op = _safe_ratio(_euclid(pts[159], pts[145]), _euclid(pts[33],  pts[133]))
        right_eye_op= _safe_ratio(_euclid(pts[386], pts[374]), _euclid(pts[362], pts[263]))
        eye_open    = 0.5 * (left_eye_op + right_eye_op)

        # Normalize by face size
        wide  = _safe_ratio(mouth_w, face_w)      # smile width
        opened= _safe_ratio(mouth_open, face_h)   # mouth open
        # Simple weighted score → squashed to [-1, 1]
        score = 1.2 * wide + 0.6 * opened + 0.3 * eye_open
        val   = float(np.tanh((score - 0.55) * 3.0))  # center and scale

        # Approx area to weight bigger faces more
        areas.append(face_w * face_h)
        vals.append(val)

    # Every detected face was skipped as degenerate
    if not vals:
        return 0.0
    return float(np.average(vals, weights=np.asarray(areas, dtype=np.float32)))

# One (id, emotion score) pair per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    p = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(p):
        v = compute_thumbnail_face_emotion(p)
    else:
        v = np.nan
    results.append((vid, v))

# Map back onto every row of df
df["thumbnail_face_emotion"] = df["id"].map(dict(results))

# Peek
df[["id", "thumbnail_face_emotion"]].head()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydf 0.13.0 requires protobuf<7.0.0,>=5.29.1, but you have protobuf 4.25.8 which is incompatible.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 4.25.8 which is incompatible.[0m[31m
[0m



Unnamed: 0,id,thumbnail_face_emotion
0,qRF04lTrbPs,0.684833
1,NeWv_WA_4R0,0.641112
2,xF0KI6oo7ak,0.303327
3,addCkFsak1U,0.260067
4,GFd2FmQAyE0,0.469498


In [None]:
##### Creates:
# thumbnail_ocr_text_coverage

# Heuristic OCR Text Coverage (no large models)
# Idea: enhance stroke-like regions with morphology, binarize, filter CCs, fill boxes -> coverage ∈ [0,1]

def compute_text_coverage(img_path: str,
                          min_frac=0.0005,   # min box area as fraction of image area
                          max_frac=0.25,     # max box area as fraction of image area
                          min_ar=1.1,        # min aspect ratio (w/h) for text regions
                          extent_lo=0.25,    # min contour extent (area / bbox area)
                          extent_hi=0.95):   # max contour extent (filters solid bars/blocks)
    """
    Heuristic estimate of how much of an image is covered by text-like regions.

    Pipeline: morphological stroke enhancement → Otsu binarization →
    closing/dilation to join characters → contour filtering by bounding-box
    area, aspect ratio and extent → union of the surviving bounding boxes.

    Returns:
        Fraction of image pixels inside the union of accepted boxes, clamped
        to [0, 1]; NaN if the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        return np.nan
    H, W = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # --- Enhance text strokes ---
    # Use both blackhat (dark text on light background) and tophat (light text on dark
    # background) with an elongated kernel, so either text polarity is picked up
    kx = max(3, W // 40)   # horizontal emphasis
    ky = max(2, H // 120)  # thin vertical thickness
    rect = cv2.getStructuringElement(cv2.MORPH_RECT, (kx, ky))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, rect)
    tophat   = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT,   rect)
    # Per-pixel maximum keeps the stronger of the two responses
    enhanced = cv2.max(blackhat, tophat)

    # Optional slight blur to stabilize thresholding
    enhanced = cv2.GaussianBlur(enhanced, (3, 3), 0)

    # --- Binarize (Otsu) ---
    # The threshold argument 0 is ignored; Otsu picks the threshold from the histogram
    _, bw = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # --- Connect characters into words/blocks ---
    connect = cv2.getStructuringElement(cv2.MORPH_RECT, (max(3, kx // 2), max(2, ky)))
    bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, connect, iterations=1)
    bw = cv2.dilate(bw, connect, iterations=1)

    # --- Find candidate regions ---
    # RETR_EXTERNAL: only outermost contours are returned
    contours, _ = cv2.findContours(bw, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    mask = np.zeros((H, W), dtype=np.uint8)
    total_px = H * W
    min_area = min_frac * total_px
    max_area = max_frac * total_px

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        area = w * h
        # Reject boxes that are too small or too large
        if area < min_area or area > max_area:
            continue
        # Reject boxes that are not wider than tall by at least min_ar
        ar = w / float(h) if h > 0 else 0.0
        if ar < min_ar:
            continue
        # Extent = contour area / bounding-box area; must lie within [extent_lo, extent_hi]
        cnt_area = cv2.contourArea(cnt)
        extent = (cnt_area / area) if area > 0 else 0.0
        if not (extent_lo <= extent <= extent_hi):
            continue

        # Fill the box (mask union of text-like regions)
        # NOTE(review): (x + w, y + h) as the far corner may paint one extra row/column
        # if cv2.rectangle treats both corners as inclusive — confirm
        cv2.rectangle(mask, (x, y), (x + w, y + h), 255, -1)

    # Filled pixels are 255, so sum / 255 is the number of covered pixels
    coverage = (mask.sum() / 255.0) / float(total_px) if total_px > 0 else 0.0
    return float(max(0.0, min(1.0, coverage)))

# One (id, coverage) pair per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    p = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(p):
        cov = compute_text_coverage(p)
    else:
        cov = np.nan
    results.append((vid, cov))

# Map back onto every row of df
df["thumbnail_ocr_text_coverage"] = df["id"].map(dict(results))

# Peek
df[["id", "thumbnail_ocr_text_coverage"]].head()


Unnamed: 0,id,thumbnail_ocr_text_coverage
0,qRF04lTrbPs,0.396562
1,NeWv_WA_4R0,0.479063
2,xF0KI6oo7ak,0.141198
3,addCkFsak1U,0.077292
4,GFd2FmQAyE0,0.298385


In [None]:
##### Creates:
# thumbnail_saliency_thirds_proximity

def _spectral_residual_centroid(gray: np.ndarray, small=96):
    """
    Compute saliency via spectral residual and return centroid (cx, cy) in ORIGINAL image coords.

    Args:
        gray: 2-D grayscale image array.
        small: Width in pixels of the downscaled working image; the height is
            scaled by the same factor, with a minimum of 8.

    Returns:
        (cx, cy): saliency-weighted centroid mapped back to the coordinates of
        `gray`; the image center if the saliency map sums to (almost) zero.
    """
    H, W = gray.shape[:2]
    # Downscale for speed
    small_w = small
    small_h = max(8, int(round(H * (small / float(W))))) if W > 0 else small
    g = cv2.resize(gray, (small_w, small_h), interpolation=cv2.INTER_AREA).astype(np.float32)

    # Spectral residual saliency (Hou & Zhang 2007)
    F = np.fft.fft2(g)
    log_amp = np.log(np.abs(F) + 1e-8)   # small constant avoids log(0)
    phase   = np.angle(F)
    avg_log = cv2.blur(log_amp, (3, 3))  # 3x3 local average of the log-amplitude spectrum
    spec_res = log_amp - avg_log         # residual = spectrum minus its local average
    # Back to the spatial domain using the residual amplitude and the original phase
    S = np.abs(np.fft.ifft2(np.exp(spec_res + 1j * phase))) ** 2
    S = cv2.GaussianBlur(S, (3, 3), 0)

    # Normalize to [0,1]
    S -= S.min()
    S /= (S.max() + 1e-8)

    # Centroid on the small map
    yy, xx = np.mgrid[0:small_h, 0:small_w]
    w = S.astype(np.float32)
    wsum = float(w.sum())
    if wsum < 1e-8:
        # Fallback to image center if saliency is degenerate
        return W * 0.5, H * 0.5

    cx_small = float((w * xx).sum() / wsum)
    cy_small = float((w * yy).sum() / wsum)

    # Map back to original coords
    cx = cx_small * (W / float(small_w))
    cy = cy_small * (H / float(small_h))
    return cx, cy

def _thirds_points(W, H):
    return [
        (W/3.0, H/3.0),
        (2*W/3.0, H/3.0),
        (W/3.0, 2*H/3.0),
        (2*W/3.0, 2*H/3.0),
    ]

def _nearest_thirds_distance(cx, cy, W, H):
    """Distance from (cx, cy) to the closest rule-of-thirds intersection of a W×H frame."""
    distances = [
        np.hypot(cx - hotspot_x, cy - hotspot_y)
        for hotspot_x, hotspot_y in _thirds_points(W, H)
    ]
    return float(min(distances))

def compute_thirds_proximity(img_path: str) -> float:
    """
    Score how close the image's saliency centroid lies to a rule-of-thirds
    intersection.

    Returns thirds proximity in [0,1]:
      1.0 = saliency centroid exactly on a rule-of-thirds hotspot
      0.0 ≈ worst case (near a corner)
    NaN is returned if the image cannot be read.
    """
    image = cv2.imread(img_path)
    if image is None:
        return np.nan

    height, width = image.shape[:2]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    centroid_x, centroid_y = _spectral_residual_centroid(gray, small=96)

    # A corner is the farthest a point can be from its nearest hotspot: diagonal / 3
    worst_case = np.hypot(width, height) / 3.0
    nearest = _nearest_thirds_distance(centroid_x, centroid_y, width, height)

    score = 1.0 - (nearest / (worst_case + 1e-8))
    return float(np.clip(score, 0.0, 1.0))

# One (id, proximity) pair per distinct video; NaN when the file is missing
results = []
for vid in df["id"].dropna().unique():
    p = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(p):
        prox = compute_thirds_proximity(p)
    else:
        prox = np.nan
    results.append((vid, prox))

# Map back onto every row of df
df["thumbnail_saliency_thirds_proximity"] = df["id"].map(dict(results))

# Quick peek
df[["id", "thumbnail_saliency_thirds_proximity"]].head()

Unnamed: 0,id,thumbnail_saliency_thirds_proximity
0,qRF04lTrbPs,0.805344
1,NeWv_WA_4R0,0.712971
2,xF0KI6oo7ak,0.611532
3,addCkFsak1U,0.69327
4,GFd2FmQAyE0,0.630173


In [None]:
# Column renaming: give the YouTube API fields the short names used by later cells

column_aliases = {
    'snippet.channelId': 'channel_id',
    'snippet.title': 'title',
    'snippet.description': 'description',
    'statistics.likeCount': 'likeCount',
    'statistics.viewCount': 'viewCount',
    'statistics.commentCount': 'commentCount',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
##### Labeling virality (based on high performance)

# A video is labeled viral when views, likes AND comments all exceed this
# per-channel quantile.
percentile = 0.75

# 0) Drop duplicate-named columns (keep first occurrence)
is_repeat = df.columns.duplicated()
dupes = df.columns[is_repeat].unique()
if len(dupes) > 0:
    print("Dropping duplicate columns:", list(dupes))
df = df.loc[:, ~is_repeat]

# 1) Make sure metrics are numeric (YouTube API often gives strings);
#    unparseable values become NaN
num_cols = ["viewCount", "likeCount", "commentCount"]
df[num_cols] = df[num_cols].apply(lambda col: pd.to_numeric(col, errors="coerce"))

# 2) Compute per-channel quantile thresholds
threshold_names = {
    "viewCount": "views_threshold",
    "likeCount": "likes_threshold",
    "commentCount": "comments_threshold",
}
per_channel = df.groupby("channel_id")[num_cols].quantile(percentile)
thresholds = per_channel.rename(columns=threshold_names).reset_index()

# 3) Merge thresholds into main dataframe (one threshold row per channel)
df = df.merge(thresholds, on="channel_id", how="left", validate="m:1")

# 4) Label virality: strictly above the channel threshold on all three metrics
beats_views = df["viewCount"] > df["views_threshold"]
beats_likes = df["likeCount"] > df["likes_threshold"]
beats_comments = df["commentCount"] > df["comments_threshold"]
df["viral_label"] = (beats_views & beats_likes & beats_comments).astype(int)

# 5) Clean up the helper threshold columns
df = df.drop(columns=list(threshold_names.values()))

# 6) Debug: what's the share of viral?
if len(df):
    viral_percentage = (df["viral_label"].sum() / len(df)) * 100
else:
    viral_percentage = 0.0
print(f"Percentage of viral videos: {viral_percentage:.2f}%")


Dropping duplicate columns: ['channel_id']
Percentage of viral videos: 17.20%


In [None]:
##### Dropping columns that aren't used from youtube data api v3 metadata
##### General refactoring at end of cell

# Dropping bad columns/features
# The thumbnail metadata columns follow a regular size × field naming scheme
thumbnail_columns = [
    f"snippet.thumbnails.{size}.{field}"
    for size in ("default", "medium", "high", "standard", "maxres")
    for field in ("url", "width", "height")
]

columns_to_drop = [
    'kind', 'etag', 'channel_id_x', 'snippet.publishedAt', 'snippet.title',
    *thumbnail_columns,
    'statistics.viewCount', 'statistics.likeCount', 'statistics.commentCount',
    'snippet.channelTitle', 'snippet.categoryId', 'snippet.liveBroadcastContent',
    'snippet.defaultAudioLanguage', 'snippet.defaultLanguage',
    'title', 'channel_id_x', 'channel_id_y',
    'snippet.localized.description', 'statistics.favoriteCount',
    'id', 'viewCount', 'likeCount', 'commentCount', 'channel_id', 'description',
]

# Only drop the columns that are actually present in the DataFrame
present = set(df.columns)
existing_columns_to_drop = [col for col in columns_to_drop if col in present]

df = df.drop(columns=existing_columns_to_drop)

# Renaming tags for future section
section2_names = {'snippet.tags': 'tags', 'snippet.localized.title': 'title'}
df.rename(columns=section2_names, inplace=True)

In [None]:
##### Sanity check + feature peek for next section
# Displays the column index that remains after the drops/renames above
df.columns

Index(['tags', 'title', 'thumbnail_path', 'thumbnail_exists',
       'thumbnail_colorfulness', 'thumbnail_brightness', 'thumbnail_contrast',
       'thumbnail_hue', 'thumbnail_saturation', 'thumbnail_edge_density',
       'thumbnail_texture_entropy', 'thumbnail_quality',
       'thumbnail_face_area_ratio', 'thumbnail_face_emotion',
       'thumbnail_ocr_text_coverage', 'thumbnail_saliency_thirds_proximity',
       'viral_label'],
      dtype='object')

# Section 2: Title-level Features

In [None]:
##### Creates:
# title_sentiment

# Install and import VADER
!pip install -q vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import pandas as pd

# Initialize analyzer once; vader_compound reuses this single instance for every title
_analyzer = SentimentIntensityAnalyzer()

def vader_compound(text: str) -> float:
    """
    Return the VADER "compound" polarity score for `text`.

    Non-string or blank (empty / whitespace-only) input yields NaN.
    """
    usable = isinstance(text, str) and bool(text.strip())
    if usable:
        return _analyzer.polarity_scores(text)["compound"]
    return np.nan  # or 0.0 if you prefer neutral for missing

# Compute and add the feature.
# Only real (non-null) titles are cast to str: a blanket astype(str) would turn a
# missing title (NaN) into the literal text "nan", which vader_compound would then
# score as a word instead of returning NaN through its missing-value guard.
raw_titles = df["title"]
clean_titles = raw_titles.where(raw_titles.isna(), raw_titles.astype(str))
df["title_sentiment"] = clean_titles.map(vader_compound)

# Feature preview
df[["title", "title_sentiment"]].head()

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m122.9/126.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

Unnamed: 0,title,title_sentiment
0,Ken losing his mind RAGE COMPILATION,-0.7865
1,Ken fumbling easy wins RAGE COMPILATION,0.3109
2,The Impossible Level 16 King Tower Glitch,0.0
3,The Mother Witch Incident (ft. Ken's neighbour),-0.3612
4,The WORST Clash Royale crash out of all time (...,-0.8192


In [None]:
##### Creates:
# title_emotion_anger
# title_emotion_disgust
# title_emotion_fear
# title_emotion_joy
# title_emotion_neutral
# title_emotion_sadness
# title_emotion_surprise

# Install transformers if missing
try:
    from transformers import pipeline
except Exception:
    !pip install -q transformers
    from transformers import pipeline

import numpy as np
import pandas as pd

# Initialize emotion pipeline
# ['anger','disgust','fear','joy','neutral','sadness','surprise']
# top_k=None asks for the score of every label; it replaces the deprecated
# return_all_scores=True. For list inputs (as used below) the result is still
# one list of {'label', 'score'} dicts per input text.
emo_nlp = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,                       # get full distribution
    device=-1                         # CPU
)

# Fixed label order used for the output columns
EMO_LABELS = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
NEW_COLS = [f"title_emotion_{e}" for e in EMO_LABELS]

# Prepare inputs (keep index to map results back cleanly).
# Fill missing titles BEFORE casting to str: casting first turns NaN into the
# literal text "nan", after which fillna("") has nothing left to fill and the
# model would classify the word "nan".
titles = df["title"].fillna("").astype(str)
idx = titles.index.to_list()

def _chunks(seq, n):
    for i in range(0, len(seq), n):
        yield seq[i:i+n]

# Run in batches to avoid memory spikes
all_scores = []
batch_size = 64
for batch in _chunks(titles.tolist(), batch_size):
    out = emo_nlp(batch, truncation=True)
    # `out` holds, per input title, a list of {'label': ..., 'score': ...} dicts
    for per in out:
        d = {}
        for item in per:
            d[item["label"].lower()] = float(item["score"])
        # Fixed label order; a label the model did not return counts as 0.0
        all_scores.append([d.get(label, 0.0) for label in EMO_LABELS])

# Convert to DataFrame aligned with the original row index
scores_arr = np.asarray(all_scores, dtype=np.float32)
emo_df = pd.DataFrame(scores_arr, columns=NEW_COLS, index=idx)

# (Optional) ensure each row sums ~1 (numerical drift guard);
# all-zero rows are left as zeros instead of producing NaN
row_sums = emo_df.sum(axis=1)
row_sums = row_sums.replace(0, np.nan)
emo_df = emo_df.divide(row_sums, axis="index").fillna(0.0)

# Merge columns into your df
df[NEW_COLS] = emo_df[NEW_COLS]

# Feature preview
df[["title", *NEW_COLS]].head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


Unnamed: 0,title,title_emotion_anger,title_emotion_disgust,title_emotion_fear,title_emotion_joy,title_emotion_neutral,title_emotion_sadness,title_emotion_surprise
0,Ken losing his mind RAGE COMPILATION,0.214829,0.326542,0.009405,0.002268,0.383729,0.03421,0.029018
1,Ken fumbling easy wins RAGE COMPILATION,0.472787,0.070065,0.005779,0.009193,0.389326,0.038567,0.014283
2,The Impossible Level 16 King Tower Glitch,0.021871,0.012825,0.65207,0.015979,0.219951,0.022352,0.054951
3,The Mother Witch Incident (ft. Ken's neighbour),0.042019,0.082553,0.671903,0.005342,0.092464,0.034168,0.071552
4,The WORST Clash Royale crash out of all time (...,0.240045,0.045762,0.024337,0.00838,0.225552,0.093946,0.361978


In [None]:
##### Creates:
# title_subjectivity

# Install TextBlob
!pip install -q textblob

from textblob import TextBlob
import numpy as np

def blob_subjectivity(text: str) -> float:
    """
    Score how opinionated `text` is using TextBlob's `sentiment.subjectivity`.

    Args:
        text: the string to score.

    Returns:
        The subjectivity as a float (the notebook preview shows values in 0.0–1.0),
        or np.nan when `text` is not a string, is blank, or TextBlob raises.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if not has_content:
        # Switch to 0.0 here if empty titles should count as neutral instead.
        return np.nan
    try:
        subjectivity = float(TextBlob(text).sentiment.subjectivity)
    except Exception:
        # If you ever hit a pattern-related issue, try:
        # !pip install -q "textblob==0.17.1" "pattern==3.6"
        return np.nan
    return subjectivity

# Compute and attach the feature.
# fillna("") must run BEFORE astype(str): astype(str) alone turns a missing title into the
# literal string "nan", which slips past blob_subjectivity's blank guard and gets scored
# as if it were a real title. With the fill first, missing titles map to NaN as intended.
df["title_subjectivity"] = df["title"].fillna("").astype(str).map(blob_subjectivity)

# Feature preview
df[["title", "title_subjectivity"]].head()

Unnamed: 0,title,title_subjectivity
0,Ken losing his mind RAGE COMPILATION,0.0
1,Ken fumbling easy wins RAGE COMPILATION,0.516667
2,The Impossible Level 16 King Tower Glitch,1.0
3,The Mother Witch Incident (ft. Ken's neighbour),0.0
4,The WORST Clash Royale crash out of all time (...,1.0


In [None]:
##### Creates:
# title_readability

# Install textstat if needed
!pip install -q textstat

import numpy as np
import pandas as pd
import textstat

def flesch_reading_ease_clipped(text: str) -> float:
    """
    Flesch Reading Ease of `text`, clamped to the closed range [0, 100].

    Args:
        text: the string to score.

    Returns:
        The clamped score as a float (higher = easier to read), or np.nan when
        `text` is not a string, is blank, or textstat raises.
    """
    if isinstance(text, str) and text.strip():
        try:
            raw_score = textstat.flesch_reading_ease(text)
            # The raw score can fall below 0 or above 100; clamp for a bounded feature.
            clamped = np.clip(raw_score, 0, 100)
            return float(clamped)
        except Exception:
            return np.nan
    return np.nan

# Compute features
# fillna("") must precede astype(str); otherwise a missing title becomes the string "nan"
# and receives a readability score instead of NaN.
titles = df["title"].fillna("").astype(str)
df["title_readability"] = titles.map(flesch_reading_ease_clipped)   # 0..100 (higher = easier)

# Feature preview
df[["title", "title_readability"]].head()

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m235.5/239.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.1/239.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/939.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.7/939.7 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m122.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Unnamed: 0,title,title_readability
0,Ken losing his mind RAGE COMPILATION,59.745
1,Ken fumbling easy wins RAGE COMPILATION,31.545
2,The Impossible Level 16 King Tower Glitch,54.701429
3,The Mother Witch Incident (ft. Ken's neighbour),66.787143
4,The WORST Clash Royale crash out of all time (...,100.0


In [None]:
##### Creates:
# title_log_perplexity

try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
except Exception:
    !pip install -q transformers
    from transformers import AutoTokenizer, AutoModelForCausalLM

import numpy as np
import torch
import torch.nn.functional as F

MODEL_NAME = "gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT-2 has no pad token of its own; reuse EOS so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM, tell it which id is padding, move it to the chosen device,
# and switch to inference mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)
model.eval()

@torch.no_grad()
def gpt2_log_perplexity_batch(texts, max_length=128):
    """
    Score each text by its average next-token negative log-likelihood.

    Reads the module-level `tokenizer`, `model` and `device`.

    Args:
        texts: sequence of candidate strings; non-strings and blank/whitespace
            strings are not sent to the model.
        max_length: token cap passed to the tokenizer (longer inputs are truncated).

    Returns:
        A list aligned with `texts`: mean NLL (nats/token) for scored entries and
        np.nan for skipped entries. A scored entry with no predictable token
        (e.g. one that tokenizes to a single token) is also NaN.
    """
    scorable = [isinstance(t, str) and t.strip() != "" for t in texts]
    out = [np.nan] * len(texts)
    if not any(scorable):
        return out

    # Remember where each scored text lives so results can be written back in place.
    keep_positions = [pos for pos, ok in enumerate(scorable) if ok]
    batch_texts = [t for t, ok in zip(texts, scorable) if ok]

    encoded = tokenizer(
        batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    ids = encoded["input_ids"].to(device)             # [B,T]
    attn = encoded["attention_mask"].to(device)       # [B,T]

    # Position t predicts token t+1, so drop the last logit and the first label.
    logits = model(input_ids=ids, attention_mask=attn).logits   # [B,T,V]
    pred_logits = logits[:, :-1, :].contiguous()                # [B,T-1,V]
    targets = ids[:, 1:].contiguous()                           # [B,T-1]
    target_is_real = attn[:, 1:].contiguous()                   # [B,T-1]

    # Padding targets get the ignore label so they contribute no loss.
    targets = targets.masked_fill(target_is_real == 0, -100)

    vocab_size = pred_logits.size(-1)
    nll = F.cross_entropy(
        pred_logits.view(-1, vocab_size),
        targets.view(-1),
        ignore_index=-100,
        reduction="none",
    ).view(targets.size())                                      # [B,T-1]

    # Average the per-token loss over the non-ignored targets of each sequence.
    counted = (targets != -100).float()
    n_tokens = counted.sum(dim=1)                               # [B]
    total_nll = (nll * counted).sum(dim=1)                      # [B]
    nan_fill = torch.full_like(n_tokens, float("nan"))
    mean_nll = torch.where(n_tokens > 0, total_nll / n_tokens, nan_fill)

    per_seq = mean_nll.detach().cpu().numpy().tolist()
    for pos, value in zip(keep_positions, per_seq):
        out[pos] = value

    return out

# Compute for your titles
# fillna("") must precede astype(str): astype(str) alone turns a missing title into the
# string "nan", which the model would score as real text. Filled with "", it reaches
# gpt2_log_perplexity_batch's blank check and comes back as NaN.
titles = df["title"].fillna("").astype(str).tolist()
batch_size = 64
logp = []
for i in range(0, len(titles), batch_size):
    logp.extend(gpt2_log_perplexity_batch(titles[i:i+batch_size], max_length=128))

df["title_log_perplexity"] = logp

# Feature preview
df[["title", "title_log_perplexity"]].head()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Unnamed: 0,title,title_log_perplexity
0,Ken losing his mind RAGE COMPILATION,6.563992
1,Ken fumbling easy wins RAGE COMPILATION,7.545415
2,The Impossible Level 16 King Tower Glitch,7.729622
3,The Mother Witch Incident (ft. Ken's neighbour),7.133723
4,The WORST Clash Royale crash out of all time (...,5.757444


In [None]:
##### Creates Semantic Embeddings (32-column vectorized embeddings PCA'd from a 768-d vector)

# Install/load deps
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    !pip install -q sentence-transformers
    from sentence_transformers import SentenceTransformer

try:
    from sklearn.decomposition import PCA
except Exception:
    !pip install -q scikit-learn
    from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
import torch

# Encode titles with a 768-d Sentence-BERT
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # 768-D
device = "cuda" if torch.cuda.is_available() else "cpu"
sbert = SentenceTransformer(MODEL_NAME, device=device)

# fillna("") must come BEFORE astype(str): astype(str) converts NaN to the string "nan",
# after which fillna has nothing left to fill and missing titles get embedded as "nan".
titles = df["title"].fillna("").astype(str).tolist()

embeddings = sbert.encode(
    titles,
    batch_size=64,
    show_progress_bar=False,
    convert_to_numpy=True,
    # L2-normalize; good for cosine and PCA stability
    normalize_embeddings=True,
)

# PCA → 32D (prototype: fit on all data)
# For production: fit PCA on TRAIN ONLY, then pca.transform on val/test.
# NOTE: scikit-learn requires n_components <= min(n_rows, n_features), so this cell
# needs at least N rows in df.
N = 32
pca = PCA(n_components=N, svd_solver="auto", random_state=42)
title_pcs = pca.fit_transform(embeddings)  # shape: [num_rows, 32]

# Attach to df
pc_cols = [f"title_emb_pca_{i:02d}" for i in range(1, N+1)]
# If re-running, remove the previous run's columns in one call rather than one drop per column
stale_cols = [c for c in pc_cols if c in df.columns]
if stale_cols:
    df.drop(columns=stale_cols, inplace=True)

df[pc_cols] = pd.DataFrame(title_pcs, columns=pc_cols, index=df.index)

# Report explained variance
explained = float(pca.explained_variance_ratio_.sum() * 100.0)
print(f"PCA: kept {N} components explaining {explained:.1f}% of variance.")
print("Top 5 component variances:", np.round(pca.explained_variance_ratio_[:5], 4))

# Feature preview
df[["title"] + pc_cols[:5]].head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

PCA: kept 32 components explaining 76.5% of variance.
Top 5 component variances: [0.143  0.0715 0.041  0.0354 0.0334]


Unnamed: 0,title,title_emb_pca_01,title_emb_pca_02,title_emb_pca_03,title_emb_pca_04,title_emb_pca_05
0,Ken losing his mind RAGE COMPILATION,0.131099,-0.135551,-0.077033,-0.244037,-0.023389
1,Ken fumbling easy wins RAGE COMPILATION,0.03725,-0.125397,-0.119702,-0.204091,-0.049404
2,The Impossible Level 16 King Tower Glitch,-0.315173,0.080969,0.247635,0.072237,-0.361322
3,The Mother Witch Incident (ft. Ken's neighbour),0.128926,0.249371,0.210802,0.018192,-0.076431
4,The WORST Clash Royale crash out of all time (...,-0.473281,-0.234562,0.138935,-0.283994,0.053162


In [None]:
##### Creates:
# title_thumbnail_semantic_alignment

# Import / install open_clip if needed
try:
    import open_clip
except Exception:
    !pip install -q open_clip_torch
    import open_clip

import torch
import numpy as np
import pandas as pd
import os
from PIL import Image

# Run on GPU when one is visible to torch, otherwise on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLIP model + preprocess (ViT-B/32, OpenAI weights)
# The middle return value is discarded; only the model and `preprocess` are kept.
# NOTE: this rebinds `model` and `tokenizer`, which the GPT-2 perplexity cell earlier in
# this notebook also assigns and which gpt2_log_perplexity_batch reads as globals.
# Re-run that cell's setup before calling the perplexity function again.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="openai", device=device
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")
# Inference mode (the embeddings below are computed under torch.no_grad()).
model.eval()

# Resolve thumbnail path per row
def _resolve_thumb_path(row):
    """
    Return an on-disk thumbnail path for `row`, or None when none is found.

    Lookup order:
      1. row["thumbnail_path"], if it is a string naming an existing file;
      2. thumbnails/{id}.jpg built from row["id"], if that id is a string and
         the file exists.

    `row` can be anything that supports `in` and `[]` by key; this cell passes
    DataFrame rows via df.apply(..., axis=1).
    """
    if "thumbnail_path" in row:
        explicit = row["thumbnail_path"]
        if isinstance(explicit, str) and os.path.isfile(explicit):
            return explicit

    if "id" in row:
        video_id = row["id"]
        if isinstance(video_id, str):
            fallback = os.path.join("thumbnails", f"{video_id}.jpg")
            return fallback if os.path.isfile(fallback) else None

    return None

# Build text embeddings (batched, like the images below, so large frames don't exhaust memory)
# fillna("") must come BEFORE astype(str): astype(str) turns NaN into the string "nan",
# leaving fillna nothing to fill.
titles = df["title"].fillna("").astype(str).tolist()
batch_size = 64
text_chunks = []
with torch.no_grad():
    # max(..., 1) still runs one (empty) pass when df has no rows, so torch.cat below
    # always has a chunk to concatenate.
    for s in range(0, max(len(titles), 1), batch_size):
        # The tokenizer already returns a tensor; wrapping it in torch.tensor() made a
        # redundant copy and triggered a UserWarning.
        text_tokens = tokenizer(titles[s:s+batch_size]).to(device)
        chunk = model.encode_text(text_tokens)
        # L2 normalize
        text_chunks.append(chunk / chunk.norm(dim=1, keepdim=True).clamp_min(1e-9))
text_emb = torch.cat(text_chunks, dim=0)

# Build image embeddings (batched to avoid OOM)
paths = df.apply(_resolve_thumb_path, axis=1).tolist()
valid_idxs = [i for i, p in enumerate(paths) if isinstance(p, str)]
# Rows without a usable image stay all-zero and are turned into NaN further down.
image_emb = torch.zeros((len(titles), text_emb.shape[1]), dtype=text_emb.dtype, device=device)

with torch.no_grad():
    for s in range(0, len(valid_idxs), batch_size):
        # (row position, preprocessed tensor) for every image in this batch that opened
        loaded = []
        for i in valid_idxs[s:s+batch_size]:
            try:
                # Context manager closes the file handle as soon as the pixels are read.
                with Image.open(paths[i]) as img:
                    loaded.append((i, preprocess(img.convert("RGB"))))
            except Exception:
                # If the image can't be opened, skip; leave zeros (will yield NaN later)
                continue
        if not loaded:
            continue
        idxs, tensors = zip(*loaded)
        imgs = torch.stack(list(tensors)).to(device)
        emb = model.encode_image(imgs)
        emb = emb / emb.norm(dim=1, keepdim=True).clamp_min(1e-9)
        # place into full tensor (explicit list index; cast guards against a dtype mismatch)
        image_emb[list(idxs)] = emb.to(image_emb.dtype)

# Cosine similarity (since both are normalized, dot product = cosine)
cos = (text_emb * image_emb).sum(dim=1)  # [-1, 1] where 0 means unrelated
# Map to 0..1 (optional but convenient):  (cos + 1) / 2
sim_01 = (cos + 1.0) / 2.0

# For rows with missing images (all-zero emb), set NaN instead of 0.5
missing_mask = (image_emb.norm(dim=1) < 1e-8).detach().cpu().numpy()
sim_vals = sim_01.detach().cpu().numpy()
sim_vals[missing_mask] = np.nan

df["title_thumbnail_semantic_alignment"] = sim_vals.astype(float)

# Feature preview
df[["title", "title_thumbnail_semantic_alignment"]].head()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

  text_tokens = torch.tensor(text_tokens).to(device)


Unnamed: 0,title,title_thumbnail_semantic_alignment
0,Ken losing his mind RAGE COMPILATION,0.624235
1,Ken fumbling easy wins RAGE COMPILATION,0.633395
2,The Impossible Level 16 King Tower Glitch,0.597753
3,The Mother Witch Incident (ft. Ken's neighbour),0.598637
4,The WORST Clash Royale crash out of all time (...,0.637247


In [None]:
##### Sanity check + feature peek for next section
# Bare last expression so the notebook renders the column Index: shows which feature
# columns exist on `df` at this point.
df.columns

Index(['tags', 'title', 'thumbnail_path', 'thumbnail_exists',
       'thumbnail_colorfulness', 'thumbnail_brightness', 'thumbnail_contrast',
       'thumbnail_hue', 'thumbnail_saturation', 'thumbnail_edge_density',
       'thumbnail_texture_entropy', 'thumbnail_quality',
       'thumbnail_face_area_ratio', 'thumbnail_face_emotion',
       'thumbnail_ocr_text_coverage', 'thumbnail_saliency_thirds_proximity',
       'viral_label', 'title_sentiment', 'title_emotion_anger',
       'title_emotion_disgust', 'title_emotion_fear', 'title_emotion_joy',
       'title_emotion_neutral', 'title_emotion_sadness',
       'title_emotion_surprise', 'title_subjectivity', 'title_readability',
       'title_log_perplexity', 'title_emb_pca_01', 'title_emb_pca_02',
       'title_emb_pca_03', 'title_emb_pca_04', 'title_emb_pca_05',
       'title_emb_pca_06', 'title_emb_pca_07', 'title_emb_pca_08',
       'title_emb_pca_09', 'title_emb_pca_10', 'title_emb_pca_11',
       'title_emb_pca_12', 'title_emb_pca_13'

# Section 3: Tag-level Features

In [None]:
##### Sanity check + feature peek
# Bare last expression so the notebook renders the column Index: shows which feature
# columns exist on `df` before the tag-level cells run.
df.columns

Index(['tags', 'title', 'thumbnail_path', 'thumbnail_exists',
       'thumbnail_colorfulness', 'thumbnail_brightness', 'thumbnail_contrast',
       'thumbnail_hue', 'thumbnail_saturation', 'thumbnail_edge_density',
       'thumbnail_texture_entropy', 'thumbnail_quality',
       'thumbnail_face_area_ratio', 'thumbnail_face_emotion',
       'thumbnail_ocr_text_coverage', 'thumbnail_saliency_thirds_proximity',
       'viral_label', 'title_sentiment', 'title_emotion_anger',
       'title_emotion_disgust', 'title_emotion_fear', 'title_emotion_joy',
       'title_emotion_neutral', 'title_emotion_sadness',
       'title_emotion_surprise', 'title_subjectivity', 'title_readability',
       'title_log_perplexity', 'title_emb_pca_01', 'title_emb_pca_02',
       'title_emb_pca_03', 'title_emb_pca_04', 'title_emb_pca_05',
       'title_emb_pca_06', 'title_emb_pca_07', 'title_emb_pca_08',
       'title_emb_pca_09', 'title_emb_pca_10', 'title_emb_pca_11',
       'title_emb_pca_12', 'title_emb_pca_13'

In [None]:
##### Creates:
# tag_semantic_entropy

# Deps
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    !pip install -q sentence-transformers
    from sentence_transformers import SentenceTransformer

try:
    from sklearn.cluster import MiniBatchKMeans
except Exception:
    !pip install -q scikit-learn
    from sklearn.cluster import MiniBatchKMeans

import numpy as np
import pandas as pd
import ast, re, torch

# Parse and normalize tags per row
def parse_tags_cell(x):
    """
    Normalize a tags cell into a de-duplicated list of lowercase tag strings.

    Args:
        x: a list of tags, or a string holding either a Python-style list literal
           (e.g. "['a', 'b']") or tags separated by commas, pipes or semicolons.
           Any other value (None, NaN, numbers) yields [].

    Returns:
        list[str]: tags lowercased, with surrounding whitespace and leading '#'
        removed, empties dropped, first-seen order preserved. Non-string items
        inside a list are ignored.
    """
    strip_quotes = False
    if isinstance(x, list):
        tags = x
    elif isinstance(x, str):
        s = x.strip()
        tags = None
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
            except Exception:
                # Not a valid literal (e.g. "[a, b]"): drop the enclosing brackets so they
                # don't end up glued to the first/last tag, and strip stray quotes below.
                s = s[1:-1]
                strip_quotes = True
            else:
                tags = parsed if isinstance(parsed, list) else [s]
        if tags is None:
            tags = re.split(r"[,\|;]", s)
    else:
        tags = []

    # normalize: strip whitespace/leading hashtags, lowercase, drop empties; de-dup preserving order
    cleaned, seen = [], set()
    for t in tags:
        if not isinstance(t, str):
            continue
        tt = t.strip()
        if strip_quotes:
            tt = tt.strip("'\"").strip()
        # lstrip, not strip: only a LEADING '#' is hashtag markup; a trailing one is part of
        # the tag ("c#"). Re-strip so "# tag" does not keep its inner leading space.
        tt = tt.lstrip("#").strip().lower()
        if tt and tt not in seen:
            cleaned.append(tt)
            seen.add(tt)
    return cleaned

tag_lists = df["tags"].apply(parse_tags_cell)

# Build the unique tag vocabulary
all_tags = sorted({t for tags in tag_lists for t in tags})
print(f"Unique tags discovered: {len(all_tags)}")

# Early exit if no tags at all
if len(all_tags) == 0:
    df["tag_semantic_entropy"] = np.nan
else:
    # Embed tags with a lightweight SBERT (384-D)
    EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # good for short tags
    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer(EMB_MODEL, device=device)

    def encode_texts(texts, batch_size=512):
        """Encode `texts` in chunks of `batch_size` and stack the L2-normalized rows into one array."""
        out = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            vecs = encoder.encode(
                batch,
                batch_size=min(batch_size, len(batch)),
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=True,  # L2 norm for cosine-friendly space
            )
            out.append(vecs)
        return np.vstack(out)

    tag_emb = encode_texts(all_tags)  # shape [V, D]

    # Global clustering over tag embeddings
    # K-means cannot fit more clusters than there are points, so cap K at the vocabulary
    # size; a fixed 32 raised ValueError on datasets with fewer than 32 unique tags.
    K = min(32, len(all_tags))  # tune 16–64; 32 is a good default
    kmeans = MiniBatchKMeans(
        n_clusters=K,
        batch_size=2048,
        random_state=42,
        n_init="auto",
        verbose=0,
    ).fit(tag_emb)

    # Map each tag to a cluster id (0..K-1)
    tag_to_cluster = {t: int(c) for t, c in zip(all_tags, kmeans.predict(tag_emb))}

    # Per-row normalized entropy in [0,1]
    def row_entropy(tags):
        """
        Entropy of a row's tags over the clusters they fall in, divided by the maximum
        entropy for that many occupied clusters. NaN when the row has no (known) tags;
        0.0 when every tag lands in a single cluster.
        """
        if not tags:
            return np.nan
        clusters = [tag_to_cluster[t] for t in tags if t in tag_to_cluster]
        if not clusters:
            return np.nan
        counts = np.bincount(clusters, minlength=K).astype(np.float32)
        active = counts[counts > 0]
        if active.size <= 1:
            return 0.0  # all tags in one cluster → zero diversity
        p = active / active.sum()
        H = -np.sum(p * np.log2(p))   # bits
        H_max = np.log2(active.size)  # max bits for this many active clusters
        return float(H / H_max) if H_max > 0 else 0.0

    df["tag_semantic_entropy"] = tag_lists.apply(row_entropy)

# Feature preview
df[["tags", "tag_semantic_entropy"]].head()

Unique tags discovered: 37


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,tags,tag_semantic_entropy
0,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.96957
1,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.96957
2,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.96957
3,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.96957
4,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.971326


In [None]:
##### Creates:
# tag_title_coherence

# Deps
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    !pip install -q sentence-transformers
    from sentence_transformers import SentenceTransformer

import numpy as np
import pandas as pd
import ast, re, torch

# Parse & normalize tags per row
def parse_tags_cell(x):
    """
    Normalize a tags cell into a de-duplicated list of lowercase tag strings.

    Args:
        x: a list of tags, or a string holding either a Python-style list literal
           (e.g. "['a', 'b']") or tags separated by commas, pipes or semicolons.
           Any other value (None, NaN, numbers) yields [].

    Returns:
        list[str]: tags lowercased, with surrounding whitespace and leading '#'
        removed, empties dropped, first-seen order preserved. Non-string items
        inside a list are ignored.
    """
    strip_quotes = False
    if isinstance(x, list):
        tags = x
    elif isinstance(x, str):
        s = x.strip()
        tags = None
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
            except Exception:
                # Not a valid literal (e.g. "[a, b]"): drop the enclosing brackets so they
                # don't end up glued to the first/last tag, and strip stray quotes below.
                s = s[1:-1]
                strip_quotes = True
            else:
                tags = parsed if isinstance(parsed, list) else [s]
        if tags is None:
            tags = re.split(r"[,\|;]", s)
    else:
        tags = []

    cleaned, seen = [], set()
    for t in tags:
        if not isinstance(t, str):
            continue
        tt = t.strip()
        if strip_quotes:
            tt = tt.strip("'\"").strip()
        # lstrip, not strip: only a LEADING '#' is hashtag markup; a trailing one is part of
        # the tag ("c#"). Re-strip so "# tag" does not keep its inner leading space.
        tt = tt.lstrip("#").strip().lower()
        if tt and tt not in seen:
            cleaned.append(tt)
            seen.add(tt)
    return cleaned

tag_lists = df["tags"].apply(parse_tags_cell)

# Collect unique tags; early-exit if none
unique_tags = sorted({t for tags in tag_lists for t in tags})
if len(unique_tags) == 0:
    df["tag_title_coherence"] = np.nan
else:
    # Load encoder (same model for titles & tags)
    MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # fast, good for short text
    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer(MODEL, device=device)

    def encode_texts(texts, batch_size=512):
        """Encode `texts` into L2-normalized vectors; an empty input returns a (0, 384) float32 array."""
        if len(texts) == 0:
            return np.empty((0, 384), dtype=np.float32)
        return encoder.encode(
            texts,
            batch_size=min(batch_size, len(texts)),
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,  # L2-normalized outputs
        )

    # Embed unique tags once, build tag->vector map
    tag_vecs = encode_texts(unique_tags, batch_size=1024)
    tag2vec = {t: v for t, v in zip(unique_tags, tag_vecs)}

    # Embed all titles
    # fillna("") must come BEFORE astype(str): astype(str) turns NaN into the string "nan",
    # so title_blank never flagged missing titles and they were scored as real text.
    titles = df["title"].fillna("").astype(str).tolist()
    title_blank = [not t.strip() for t in titles]
    title_vecs = encode_texts(titles, batch_size=256)  # already normalized

    # Per-row mean tag embedding (normalized), then cosine with title
    def mean_tag_vec(tags_for_row):
        """Unit-length mean of the row's tag vectors, or None when there are no vectors or the mean is zero."""
        vecs = [tag2vec[t] for t in tags_for_row if t in tag2vec]
        if not vecs:
            return None
        m = np.mean(vecs, axis=0)
        n = np.linalg.norm(m)
        if n == 0:
            return None
        return m / n

    tag_means = [mean_tag_vec(tags) for tags in tag_lists]

    sims_01 = []
    for i, (tvec, mvec) in enumerate(zip(title_vecs, tag_means)):
        if mvec is None or title_blank[i]:
            sims_01.append(np.nan)
        else:
            cos = float(np.clip(np.dot(tvec, mvec), -1.0, 1.0))  # cosine since both L2-normalized
            sims_01.append((cos + 1.0) / 2.0)  # map [-1,1] -> [0,1]

    df["tag_title_coherence"] = sims_01

# Feature preview
df[["tags", "title", "tag_title_coherence"]].head()

Unnamed: 0,tags,title,tag_title_coherence
0,"['ken cr', 'ken clips', 'ken clash royale', 'k...",Ken losing his mind RAGE COMPILATION,0.819727
1,"['ken cr', 'ken clips', 'ken clash royale', 'k...",Ken fumbling easy wins RAGE COMPILATION,0.772894
2,"['ken cr', 'ken clips', 'ken clash royale', 'k...",The Impossible Level 16 King Tower Glitch,0.547227
3,"['ken cr', 'ken clips', 'ken clash royale', 'k...",The Mother Witch Incident (ft. Ken's neighbour),0.724189
4,"['ken cr', 'ken clips', 'ken clash royale', 'k...",The WORST Clash Royale crash out of all time (...,0.651607


In [None]:
##### Creates:
# tag_num_unique
# tag_multiword_ratio

import pandas as pd
import numpy as np
import ast, re

def parse_tags_cell(x):
    """
    Normalize a tags cell into a de-duplicated list of lowercase strings.
    Accepts lists or strings (JSON-ish list or comma/pipe/semicolon separated).
    Strips leading '#' and whitespace.

    Any other input (None, NaN, numbers) yields []. Non-string items inside a
    list are ignored. First-seen order is preserved.
    """
    strip_quotes = False
    if isinstance(x, list):
        tags = x
    elif isinstance(x, str):
        s = x.strip()
        tags = None
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
            except Exception:
                # Not a valid literal (e.g. "[a, b]"): drop the enclosing brackets so they
                # don't end up glued to the first/last tag, and strip stray quotes below.
                s = s[1:-1]
                strip_quotes = True
            else:
                tags = parsed if isinstance(parsed, list) else [s]
        if tags is None:
            tags = re.split(r"[,\|;]", s)
    else:
        tags = []

    cleaned, seen = [], set()
    for t in tags:
        if not isinstance(t, str):
            continue
        tt = t.strip()
        if strip_quotes:
            tt = tt.strip("'\"").strip()
        # lstrip, not strip: only a LEADING '#' is hashtag markup; a trailing one is part of
        # the tag ("c#"). Re-strip so "# tag" does not keep its inner leading space.
        tt = tt.lstrip("#").strip().lower()
        if tt and tt not in seen:
            cleaned.append(tt)
            seen.add(tt)
    return cleaned

def is_multiword(tag: str) -> bool:
    """
    Report whether `tag` is made of more than one word.

    Words are the non-empty pieces left after splitting `tag` on runs of
    whitespace, hyphens or underscores.
    """
    pieces = re.split(r"[\s\-_]+", tag)
    word_count = sum(1 for piece in pieces if piece)
    return word_count > 1

# Parse every tags cell a single time; both features below reuse the result
_tag_lists = df["tags"].apply(parse_tags_cell)

def multiword_ratio(tags):
    """Fraction of `tags` that is_multiword() flags; NaN when `tags` is empty."""
    if not tags:
        return np.nan
    multiword_flags = [is_multiword(t) for t in tags]
    return sum(multiword_flags) / len(tags)

# Feature: number of unique tags (nullable integer column)
df["tag_num_unique"] = _tag_lists.map(len).astype("Int64")

# Feature: ratio of multiword tags among unique tags
df["tag_multiword_ratio"] = _tag_lists.map(multiword_ratio).astype(float)

# Feature preview
df[["tags", "tag_num_unique", "tag_multiword_ratio"]].head()

Unnamed: 0,tags,tag_num_unique,tag_multiword_ratio
0,"['ken cr', 'ken clips', 'ken clash royale', 'k...",10,0.9
1,"['ken cr', 'ken clips', 'ken clash royale', 'k...",10,0.9
2,"['ken cr', 'ken clips', 'ken clash royale', 'k...",10,0.9
3,"['ken cr', 'ken clips', 'ken clash royale', 'k...",10,0.9
4,"['ken cr', 'ken clips', 'ken clash royale', 'k...",11,0.818182


In [None]:
##### Creates:
# tag_title_overlap

# Deps: nltk only for PorterStemmer (no corpora), sklearn for stopwords
try:
    from nltk.stem import PorterStemmer
except Exception:
    !pip install -q nltk
    from nltk.stem import PorterStemmer

try:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
except Exception:
    !pip install -q scikit-learn
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

import re, ast, numpy as np, pandas as pd

# Stopwords: sklearn's English list plus a few words that carry no signal on video pages
STOP = set(ENGLISH_STOP_WORDS).union(
    ("video", "videos", "official", "channel", "new", "best", "top", "full", "live", "shorts")
)

# Tokens are runs of lowercase ASCII letters/digits (callers lowercase the text first)
_WORD_RE = re.compile(r"[a-z0-9]+")
stemmer = PorterStemmer()

def parse_tags_cell(x):
    """
    Normalize a tags cell into a de-duplicated list of lowercase tag strings.

    Args:
        x: a list of tags, or a string holding either a Python-style list literal
           (e.g. "['a', 'b']") or tags separated by commas, pipes or semicolons.
           Any other value (None, NaN, numbers) yields [].

    Returns:
        list[str]: tags lowercased, with surrounding whitespace and leading '#'
        removed, empties dropped, first-seen order preserved. Non-string items
        inside a list are ignored.
    """
    strip_quotes = False
    if isinstance(x, list):
        tags = x
    elif isinstance(x, str):
        s = x.strip()
        tags = None
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
            except Exception:
                # Not a valid literal (e.g. "[a, b]"): drop the enclosing brackets so they
                # don't end up glued to the first/last tag, and strip stray quotes below.
                s = s[1:-1]
                strip_quotes = True
            else:
                tags = parsed if isinstance(parsed, list) else [s]
        if tags is None:
            tags = re.split(r"[,\|;]", s)
    else:
        tags = []
    cleaned, seen = [], set()
    for t in tags:
        if not isinstance(t, str):
            continue
        tt = t.strip()
        if strip_quotes:
            tt = tt.strip("'\"").strip()
        # lstrip, not strip: only a LEADING '#' is hashtag markup; a trailing one is part of
        # the tag ("c#"). Re-strip so "# tag" does not keep its inner leading space.
        tt = tt.lstrip("#").strip().lower()
        if tt and tt not in seen:
            cleaned.append(tt)
            seen.add(tt)
    return cleaned

def tokenize_normalize(text: str):
    """
    Turn free text into a set of stemmed, stopword-free tokens.

    The text is lowercased, tokenized with the module-level `_WORD_RE`, filtered
    against `STOP`, and each surviving token is stemmed with `stemmer`.
    Non-string input yields an empty set.
    """
    if not isinstance(text, str):
        return set()
    stems = set()
    for word in _WORD_RE.findall(text.lower()):
        if word not in STOP:
            stems.add(stemmer.stem(word))
    return stems

def tags_to_token_set(tags_list):
    """
    Collapse a list of tag strings into one set of stemmed, stopword-free tokens.

    Each tag is split on runs of whitespace/hyphens/underscores, every part is
    lowercased and tokenized with `_WORD_RE`, tokens in `STOP` are dropped, and
    the rest are stemmed with `stemmer`. An empty list yields an empty set.
    """
    stems = set()
    for tag in tags_list:
        for part in re.split(r"[\s\-_]+", tag):
            for word in _WORD_RE.findall(part.lower()):
                if word not in STOP:
                    stems.add(stemmer.stem(word))
    return stems

def jaccard(a: set, b: set):
    """
    Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    Returns np.nan when either set is empty (swap in 0.0 if an empty side
    should count as "no overlap" rather than "unknown").
    """
    if not (a and b):
        return np.nan  # use 0.0 instead if you prefer
    union_size = len(a | b)
    if not union_size:
        return np.nan
    return len(a & b) / union_size

# Compute feature
_tag_lists = df["tags"].apply(parse_tags_cell)
# fillna("") must precede astype(str): astype(str) alone turns a missing title into the
# string "nan", which tokenizes to {"nan"} and yields overlap 0.0 instead of NaN.
_title_sets = df["title"].fillna("").astype(str).apply(tokenize_normalize)
_tag_sets = _tag_lists.apply(tags_to_token_set)

df["tag_title_overlap"] = [jaccard(a, b) for a, b in zip(_title_sets, _tag_sets)]

# Feature preview
df[["title", "tags", "tag_title_overlap"]].head()

Unnamed: 0,title,tags,tag_title_overlap
0,Ken losing his mind RAGE COMPILATION,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.142857
1,Ken fumbling easy wins RAGE COMPILATION,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.133333
2,The Impossible Level 16 King Tower Glitch,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.0
3,The Mother Witch Incident (ft. Ken's neighbour),"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.058824
4,The WORST Clash Royale crash out of all time (...,"['ken cr', 'ken clips', 'ken clash royale', 'k...",0.125


# Section 4: Additional Cross-level Features

In [None]:
##### Sanity check + feature peek
# Bare last expression so the notebook renders the column Index: shows which feature
# columns exist on `df` before the cross-level cells combine them.
df.columns

Index(['tags', 'title', 'thumbnail_path', 'thumbnail_exists',
       'thumbnail_colorfulness', 'thumbnail_brightness', 'thumbnail_contrast',
       'thumbnail_hue', 'thumbnail_saturation', 'thumbnail_edge_density',
       'thumbnail_texture_entropy', 'thumbnail_quality',
       'thumbnail_face_area_ratio', 'thumbnail_face_emotion',
       'thumbnail_ocr_text_coverage', 'thumbnail_saliency_thirds_proximity',
       'viral_label', 'title_sentiment', 'title_emotion_anger',
       'title_emotion_disgust', 'title_emotion_fear', 'title_emotion_joy',
       'title_emotion_neutral', 'title_emotion_sadness',
       'title_emotion_surprise', 'title_subjectivity', 'title_readability',
       'title_log_perplexity', 'title_emb_pca_01', 'title_emb_pca_02',
       'title_emb_pca_03', 'title_emb_pca_04', 'title_emb_pca_05',
       'title_emb_pca_06', 'title_emb_pca_07', 'title_emb_pca_08',
       'title_emb_pca_09', 'title_emb_pca_10', 'title_emb_pca_11',
       'title_emb_pca_12', 'title_emb_pca_13'

In [None]:
##### Creates:
# Cross-level feature between title and face sentiment

# title_face_sentiment_gap               = title_sentiment - thumbnail_face_emotion           [-2,2]
# title_face_sentiment_mismatch          = abs(gap)                                           [0,2]
# title_face_sentiment_mismatch_weighted = mismatch * thumbnail_face_area_ratio               [0,2] scaled by face area

import numpy as np
import pandas as pd

# Each input must be present; it is then coerced in place so non-numeric entries become NaN
for col in ("title_sentiment", "thumbnail_face_emotion", "thumbnail_face_area_ratio"):
    if col not in df.columns:
        raise KeyError(f"Missing required column: {col}")
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Signed difference: keeps the direction of the title-vs-face disagreement
sentiment_gap = df["title_sentiment"] - df["thumbnail_face_emotion"]
df["title_face_sentiment_gap"] = sentiment_gap

# Absolute difference: keeps only the size of the disagreement
sentiment_mismatch = sentiment_gap.abs()
df["title_face_sentiment_mismatch"] = sentiment_mismatch

# Scale the mismatch by face area. A missing face area counts as 0,
# while a missing mismatch still propagates as NaN.
face_area_filled = df["thumbnail_face_area_ratio"].fillna(0.0)
df["title_face_sentiment_mismatch_weighted"] = sentiment_mismatch * face_area_filled

# Feature preview
preview_columns = [
    "title",
    "thumbnail_face_emotion",
    "thumbnail_face_area_ratio",
    "title_sentiment",
    "title_face_sentiment_gap",
    "title_face_sentiment_mismatch",
    "title_face_sentiment_mismatch_weighted",
]
df[preview_columns].head()

Unnamed: 0,title,thumbnail_face_emotion,thumbnail_face_area_ratio,title_sentiment,title_face_sentiment_gap,title_face_sentiment_mismatch,title_face_sentiment_mismatch_weighted
0,Ken losing his mind RAGE COMPILATION,0.684833,0.229601,-0.7865,-1.471333,1.471333,0.337819
1,Ken fumbling easy wins RAGE COMPILATION,0.641112,0.198767,0.3109,-0.330212,0.330212,0.065635
2,The Impossible Level 16 King Tower Glitch,0.303327,0.105625,0.0,-0.303327,0.303327,0.032039
3,The Mother Witch Incident (ft. Ken's neighbour),0.260067,0.068906,-0.3612,-0.621267,0.621267,0.042809
4,The WORST Clash Royale crash out of all time (...,0.469498,0.143767,-0.8192,-1.288698,1.288698,0.185273


In [None]:
##### Creates:
# Coherence Triangle: Title–Tags–Image

# tri_coherence = tag_title_coherence × title_thumbnail_semantic_alignment    [0,1]
# tri_coherence_lex = tri_coherence × tag_title_overlap                       [0,1]

import pandas as pd
import numpy as np

# Both pairwise coherence columns must already exist on df
need = ["tag_title_coherence", "title_thumbnail_semantic_alignment"]
missing = [name for name in need if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Coerce each input to numeric in place (non-numeric entries become NaN)
for name in need:
    df[name] = pd.to_numeric(df[name], errors="coerce")

# Triangle coherence = (Title–Tags) × (Title–Image)
df["tri_coherence"] = df["tag_title_coherence"] * df["title_thumbnail_semantic_alignment"]

# Lexical variant: use the first overlap column that is present, preferring the Jaccard one
overlap_col = next(
    (name for name in ("tag_title_overlap_jaccard", "tag_title_overlap") if name in df.columns),
    None,
)

if overlap_col:
    df[overlap_col] = pd.to_numeric(df[overlap_col], errors="coerce")
    df["tri_coherence_lex"] = df["tri_coherence"] * df[overlap_col]
    print(f"Created tri_coherence_lex using overlap column: {overlap_col}")

# Quick peek (the lexical column is shown only when it exists)
cols = ["title", *need, "tri_coherence"]
if "tri_coherence_lex" in df.columns:
    cols += ["tri_coherence_lex"]
df[cols].head()

Created tri_coherence_lex using overlap column: tag_title_overlap


Unnamed: 0,title,tag_title_coherence,title_thumbnail_semantic_alignment,tri_coherence,tri_coherence_lex
0,Ken losing his mind RAGE COMPILATION,0.819727,0.624235,0.511702,0.0731
1,Ken fumbling easy wins RAGE COMPILATION,0.772894,0.633395,0.489547,0.065273
2,The Impossible Level 16 King Tower Glitch,0.547227,0.597753,0.327107,0.0
3,The Mother Witch Incident (ft. Ken's neighbour),0.724189,0.598637,0.433526,0.025502
4,The WORST Clash Royale crash out of all time (...,0.651607,0.637247,0.415235,0.051904


In [None]:
##### Creates:
# Text-heavy x Readability cross-level feature interaction

# text_x_hard       -> thumbnail_ocr_text_coverage * (100 - title_readability)         [0,100]
# text_x_hard_norm  -> normalized version                                              [0,1]

import pandas as pd

# Required inputs: OCR text coverage of the thumbnail and the title's readability score
need = ["thumbnail_ocr_text_coverage", "title_readability"]
missing = [name for name in need if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")
df[need] = df[need].apply(pd.to_numeric, errors="coerce")

# Bound both inputs to the ranges the formula assumes
text_coverage = df["thumbnail_ocr_text_coverage"].clip(0.0, 1.0)
reading_ease = df["title_readability"].clip(0.0, 100.0)  # Flesch Reading Ease (0..100, higher = easier)

# "Hardness" is the complement of reading ease; more text × harder title → larger value
title_hardness = 100.0 - reading_ease
df["text_x_hard"] = text_coverage * title_hardness      # 0..100
df["text_x_hard_norm"] = df["text_x_hard"] / 100.0       # 0..1

# Feature preview
preview_columns = ["title", *need, "text_x_hard", "text_x_hard_norm"]
df[preview_columns].head()

Unnamed: 0,title,thumbnail_ocr_text_coverage,title_readability,text_x_hard,text_x_hard_norm
0,Ken losing his mind RAGE COMPILATION,0.396562,59.745,15.963623,0.159636
1,Ken fumbling easy wins RAGE COMPILATION,0.479063,31.545,32.794223,0.327942
2,The Impossible Level 16 King Tower Glitch,0.141198,54.701429,6.396064,0.063961
3,The Mother Witch Incident (ft. Ken's neighbour),0.077292,66.787143,2.567077,0.025671
4,The WORST Clash Royale crash out of all time (...,0.298385,100.0,0.0,0.0


In [None]:
##### Creates:
# Complexity × Text interactions

# complexity_x_text_edge     -> edge density × OCR text coverage          [0,1]
# complexity_x_text_entropy  -> normalized texture entropy × text cover   [0,1]

import pandas as pd
import numpy as np

# Inputs: two visual-complexity measures plus the OCR text coverage
need = ["thumbnail_edge_density", "thumbnail_texture_entropy", "thumbnail_ocr_text_coverage"]
missing = [name for name in need if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Numeric views of the inputs (df itself is not modified here), bounded to expected ranges
numeric_inputs = {name: pd.to_numeric(df[name], errors="coerce") for name in need}
edge = numeric_inputs["thumbnail_edge_density"].clip(0.0, 1.0)
ent = numeric_inputs["thumbnail_texture_entropy"].clip(lower=0.0)   # bits, usually 0..8
cov = numeric_inputs["thumbnail_ocr_text_coverage"].clip(0.0, 1.0)

# Entropy is rescaled to [0,1] assuming 8-bit grayscale (max entropy ≈ 8 bits)
MAX_BITS = 8.0
ent_norm = (ent / MAX_BITS).clip(0.0, 1.0)

# Interactions: visual complexity × amount of text → legibility risk
df["complexity_x_text_edge"] = edge * cov
df["complexity_x_text_entropy"] = ent_norm * cov

# Feature preview
preview_columns = ["title", *need, "complexity_x_text_edge", "complexity_x_text_entropy"]
df[preview_columns].head()

Unnamed: 0,title,thumbnail_edge_density,thumbnail_texture_entropy,thumbnail_ocr_text_coverage,complexity_x_text_edge,complexity_x_text_entropy
0,Ken losing his mind RAGE COMPILATION,0.164722,7.886865,0.396562,0.065323,0.390954
1,Ken fumbling easy wins RAGE COMPILATION,0.200174,7.910999,0.479063,0.095896,0.473733
2,The Impossible Level 16 King Tower Glitch,0.214844,7.82749,0.141198,0.030335,0.138153
3,The Mother Witch Incident (ft. Ken's neighbour),0.165208,7.822907,0.077292,0.012769,0.075581
4,The WORST Clash Royale crash out of all time (...,0.168767,7.730054,0.298385,0.050358,0.288317


In [None]:
##### Creates:
# Composition × Face interaction

#   face_comp     -> thumbnail_face_area_ratio * thumbnail_saliency_thirds_proximity  (0..1)
#   face_present  -> 1 if any face area > 0 else 0 (helper flag)

import pandas as pd

# Inputs: face area share of the thumbnail and saliency proximity to rule-of-thirds points
need = ["thumbnail_face_area_ratio", "thumbnail_saliency_thirds_proximity"]
missing = [name for name in need if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Numeric, bounded views of the inputs; a missing face area is treated as "no face" (0)
face_area = (
    pd.to_numeric(df["thumbnail_face_area_ratio"], errors="coerce")
    .clip(0.0, 1.0)
    .fillna(0.0)
)
thirds = pd.to_numeric(df["thumbnail_saliency_thirds_proximity"], errors="coerce").clip(0.0, 1.0)

# Larger faces combined with higher thirds proximity give a higher score (within [0,1])
df["face_comp"] = face_area * thirds

# Helper flag: 1 when any face area was detected, else 0
df["face_present"] = face_area.gt(0).astype(int)

# Feature preview
preview_columns = ["title", *need, "face_comp", "face_present"]
df[preview_columns].head()

Unnamed: 0,title,thumbnail_face_area_ratio,thumbnail_saliency_thirds_proximity,face_comp,face_present
0,Ken losing his mind RAGE COMPILATION,0.229601,0.805344,0.184908,1
1,Ken fumbling easy wins RAGE COMPILATION,0.198767,0.712971,0.141715,1
2,The Impossible Level 16 King Tower Glitch,0.105625,0.611532,0.064593,1
3,The Mother Witch Incident (ft. Ken's neighbour),0.068906,0.69327,0.047771,1
4,The WORST Clash Royale crash out of all time (...,0.143767,0.630173,0.090598,1


In [None]:
##### Creates:
# Warmth/Coolness × Sentiment cross-level feature interactions

# warmth_score        -> closeness to warm hues, scaled by saturation                [0,1]
# cool_score          -> closeness to cool hues (~blue/cyan), scaled by saturation   [0,1]
# warm_x_positive     -> warmth × (joy + positive VADER)                             [0,1] normalized
# cool_x_negative     -> coolness × (sadness + fear + neg VADER)                     [0,1] normalized

import numpy as np
import pandas as pd

# Inputs for the colour × sentiment features: thumbnail colour stats and title sentiment/emotions
need = [
    "thumbnail_hue",
    "thumbnail_saturation",
    "title_emotion_joy",
    "title_emotion_sadness",
    "title_emotion_fear",
    "title_sentiment",
]
missing = [name for name in need if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Coerce all inputs to numeric in place; non-numeric entries become NaN
df[need] = df[need].apply(pd.to_numeric, errors="coerce")

# Normalize hue to degrees and saturation to [0,1] (robust to different scales)
def norm_hue_to_degrees(series: pd.Series) -> pd.Series:
    """Convert a hue column to degrees in [0, 360), guessing its scale from the maximum.

    A maximum of at most 1.5 is read as a 0..1 scale (multiplied by 360), a
    maximum of at most 190 as OpenCV's 0..179 scale (multiplied by 2), and
    anything larger as degrees already. The result is wrapped modulo 360.
    A series without any non-NaN value is returned unchanged.
    """
    peak = series.max(skipna=True)
    if pd.isna(peak):
        return series

    # Pick the multiplier that maps the detected scale onto degrees
    if peak <= 1.5:
        to_degrees = 360.0
    elif peak <= 190:
        to_degrees = 2.0
    else:
        to_degrees = 1.0
    return (series * to_degrees) % 360.0

def norm_sat_to_unit(series: pd.Series) -> pd.Series:
    """Rescale a saturation column to the unit interval [0, 1].

    The input scale is guessed from the column's maximum value:

    * max <= 1.5  -> already 0..1, values are only clipped
    * max <= 100  -> percent scale, divided by 100
    * max <= 255  -> 8-bit scale, divided by 255
    * otherwise   -> min-max scaled over the observed range

    Thresholds are tested in ascending order so that every branch is
    reachable; testing 255 before 100 would send percent-scaled data down the
    8-bit path. The guess is a heuristic: an 8-bit column whose values never
    exceed 100 is indistinguishable from a percent column and is treated as
    percent.

    Returns the series unchanged when it contains no non-NaN value.
    """
    m = series.max(skipna=True)
    if pd.isna(m):
        return series
    if m <= 1.5:             # 0..1
        out = series.clip(0.0, 1.0)
    elif m <= 100 + 1e-6:    # 0..100
        out = (series / 100.0).clip(0.0, 1.0)
    elif m <= 255 + 1e-6:    # 0..255
        out = (series / 255.0).clip(0.0, 1.0)
    else:                    # fallback min-max
        mn = series.min(skipna=True)
        # Guard against a zero range so the division below stays finite
        rng = max(float(m - mn), 1e-6)
        out = ((series - mn) / rng).clip(0.0, 1.0)
    return out

# Hue in degrees and saturation in [0,1]; a missing saturation counts as fully desaturated (0)
hue_degrees = norm_hue_to_degrees(df["thumbnail_hue"].astype(float))
sat_unit = norm_sat_to_unit(df["thumbnail_saturation"].astype(float)).fillna(0.0)

# Warmth: angular distance to 0° (with wrap-around), falling linearly to zero at ±60°,
# then modulated by saturation
dist_to_warm = np.minimum(hue_degrees, 360.0 - hue_degrees)
warm_closeness = np.maximum(0.0, 1.0 - (dist_to_warm / 60.0))
warmth = (warm_closeness * sat_unit).clip(0.0, 1.0)

# Coolness: same construction, centred on 180°
offset_from_cool = (hue_degrees - 180.0).abs()
dist_to_cool = np.minimum(offset_from_cool, 360.0 - offset_from_cool)
cool_closeness = np.maximum(0.0, 1.0 - (dist_to_cool / 60.0))
coolness = (cool_closeness * sat_unit).clip(0.0, 1.0)

df["warmth_score"] = warmth
df["cool_score"] = coolness

# Emotion scores are bounded to [0,1]; missing values count as 0
joy = df["title_emotion_joy"].fillna(0.0).clip(0.0, 1.0)
sad = df["title_emotion_sadness"].fillna(0.0).clip(0.0, 1.0)
fear = df["title_emotion_fear"].fillna(0.0).clip(0.0, 1.0)

# Split the signed title sentiment into non-negative positive and negative parts
sent = df["title_sentiment"].fillna(0.0)
pos_part = sent.clip(lower=0.0)
neg_part = (-sent).clip(lower=0.0)

# The emotion sums are divided by their number of terms, then clipped to [0,1]
positive_signal = joy + pos_part
negative_signal = sad + fear + neg_part
df["warm_x_positive"] = (warmth * positive_signal / 2.0).clip(0.0, 1.0)
df["cool_x_negative"] = (coolness * negative_signal / 3.0).clip(0.0, 1.0)

# Quick peek
preview_columns = [
    "title",
    "thumbnail_hue",
    "thumbnail_saturation",
    "warmth_score",
    "cool_score",
    "warm_x_positive",
    "cool_x_negative",
]
df[preview_columns].head()

Unnamed: 0,title,thumbnail_hue,thumbnail_saturation,warmth_score,cool_score,warm_x_positive,cool_x_negative
0,Ken losing his mind RAGE COMPILATION,108.838844,0.380881,0.0,0.141702,0.0,0.03921
1,Ken fumbling easy wins RAGE COMPILATION,117.835182,0.427772,0.0,0.030868,0.0,0.000456
2,The Impossible Level 16 King Tower Glitch,55.820709,0.503604,0.0,0.0,0.0,0.0
3,The Mother Witch Incident (ft. Ken's neighbour),125.57164,0.361966,0.0,0.0,0.0,0.0
4,The WORST Clash Royale crash out of all time (...,104.623383,0.424597,0.0,0.217629,0.0,0.068008


In [None]:
##### Creates:
# Novel phrasing × Alignment cross-level feature interactions

#   novelty_score       -> normalized title novelty (from title_log_perplexity)     [0,1]
#   novel_x_align       -> novelty × alignment (beneficial grounded novelty)
#   novel_x_misaligned  -> novelty × (1 - alignment) (risky novelty)

import numpy as np
import pandas as pd

# Inputs: title log-perplexity and the title–thumbnail alignment score
need = ["title_log_perplexity", "title_thumbnail_semantic_alignment"]
missing = [name for name in need if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Numeric views of the inputs (df's own columns are left untouched)
log_perplexity = pd.to_numeric(df["title_log_perplexity"], errors="coerce")                 # nats/token
alignment = pd.to_numeric(df["title_thumbnail_semantic_alignment"], errors="coerce")        # expected 0..1

# Robust scaling bounds: 5th and 95th percentiles, or (0, 1) when there is no data at all
if log_perplexity.notna().any():
    lower, upper = np.nanpercentile(log_perplexity, [5, 95])
else:
    lower, upper = 0.0, 1.0
spread = max(upper - lower, 1e-6)   # avoid dividing by zero when all values coincide

# Novelty in [0,1]; values beyond the percentile bounds are clipped
novelty = ((log_perplexity - lower) / spread).clip(0.0, 1.0)

# Alignment bounded to [0,1]
alignment = alignment.clip(0.0, 1.0)

# Interactions
df["novelty_score"] = novelty
df["novel_x_align"] = novelty * alignment                  # novelty backed by a well-aligned image
df["novel_x_misaligned"] = novelty * (1.0 - alignment)     # novelty without that backing

# Feature preview
preview_columns = ["title", *need, "novelty_score", "novel_x_align", "novel_x_misaligned"]
df[preview_columns].head()

Unnamed: 0,title,title_log_perplexity,title_thumbnail_semantic_alignment,novelty_score,novel_x_align,novel_x_misaligned
0,Ken losing his mind RAGE COMPILATION,6.563992,0.624235,0.525903,0.328287,0.197616
1,Ken fumbling easy wins RAGE COMPILATION,7.545415,0.633395,0.748684,0.474213,0.274471
2,The Impossible Level 16 King Tower Glitch,7.729622,0.597753,0.790499,0.472523,0.317976
3,The Mother Witch Incident (ft. Ken's neighbour),7.133723,0.598637,0.655231,0.392245,0.262985
4,The WORST Clash Royale crash out of all time (...,5.757444,0.637247,0.342819,0.218461,0.124359


# Section 5: Trending Score

In [None]:
##### Creates:
# trending_score ∈ [0,1]

# External Trending Score via YouTube Data API (cheap & first-party)
# Fetches "mostPopular" videos for selected regions
# Builds a token→score dict (view-weighted)

import os, re, ast, math, requests
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

# CONFIG
# The API key is read from Colab's user-data store instead of being written into the notebook.
from google.colab import userdata
YOUTUBE_API_KEY = userdata.get('YOUTUBE_API_KEY')

REGIONS = ["US","GB","IN","BR","DE","JP","KR","FR","CA","AU"]   # tweak as you like
PAGES_PER_REGION = 2    # each page returns up to 50 videos → ~100/region
# Passed as the `timeout` argument of each requests.get call below.
TIMEOUT = 20

# stopwords (no downloads)
# Falls back to an empty base set if scikit-learn's stopword list cannot be imported.
try:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    STOP = set(ENGLISH_STOP_WORDS)
except Exception:
    STOP = set()
# Extra generic words added to the stop set so they never become trending tokens.
STOP |= {"video","videos","official","channel","new","best","top","full","live","shorts"}

# Tokens are runs of lowercase ASCII letters/digits; text in other scripts produces no tokens.
WORD_RE = re.compile(r"[a-z0-9]+")

def tokenize_text(s: str):
    """Return the WORD_RE matches of the lower-cased input, minus stopwords.

    The argument is passed through ``str()`` first, so non-string values are
    tokenized by their string form. Order and duplicates are preserved.
    """
    kept = []
    for word in WORD_RE.findall(str(s).lower()):
        if word and word not in STOP:
            kept.append(word)
    return kept

def parse_tags_cell(x):
    """Turn one raw ``tags`` cell into a de-duplicated list of normalized tags.

    Accepted inputs:
      * a list, used as-is;
      * a string that looks like a list literal (``[...]``), parsed with
        ``ast.literal_eval``; if parsing fails the string is split on
        ``,``, ``|`` or ``;`` instead;
      * any other string, split on ``,``, ``|`` or ``;``;
      * anything else (e.g. None or NaN), treated as no tags.

    Each string entry is stripped, lower-cased and has leading/trailing ``#``
    removed; non-string entries and empty results are skipped. First-seen
    order is kept.
    """
    if isinstance(x, list):
        raw_tags = x
    elif isinstance(x, str):
        text = x.strip()
        raw_tags = None
        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
                raw_tags = parsed if isinstance(parsed, list) else [text]
            except Exception:
                raw_tags = None   # fall through to delimiter splitting
        if raw_tags is None:
            raw_tags = re.split(r"[,\|;]", text)
    else:
        raw_tags = []

    # A dict keeps insertion order, giving order-preserving de-duplication
    unique = {}
    for entry in raw_tags:
        if isinstance(entry, str):
            cleaned = entry.strip().lower().strip("#")
            if cleaned:
                unique.setdefault(cleaned, None)
    return list(unique)

def split_tag_to_tokens(tag: str):
    """Break one tag into word tokens, dropping stopwords.

    The tag is first split on runs of whitespace, hyphens and underscores;
    each piece is lower-cased and scanned with WORD_RE. Order and duplicates
    are preserved.
    """
    pieces = re.split(r"[\s\-_]+", tag)
    return [
        word
        for piece in pieces
        for word in WORD_RE.findall(piece.lower())
        if word and word not in STOP
    ]

# FETCH TRENDING ITEMS
def fetch_trending_items(region: str, pages: int = 2):
    """Download the "mostPopular" chart for one region from the YouTube Data API.

    Requests up to ``pages`` result pages (at least one) of 50 items each,
    following ``nextPageToken`` and stopping early when the API returns none.

    Returns the concatenated ``items`` entries of all fetched pages.
    Raises whatever ``requests`` raises, including ``HTTPError`` from
    ``raise_for_status`` on a non-success response.
    """
    endpoint = "https://www.googleapis.com/youtube/v3/videos"
    collected = []
    next_token = None
    for _page in range(max(1, pages)):
        query = {
            "part": "snippet,statistics",
            "chart": "mostPopular",
            "regionCode": region,
            "maxResults": 50,
            "pageToken": next_token or "",   # empty string on the first page
            "key": YOUTUBE_API_KEY
        }
        response = requests.get(endpoint, params=query, timeout=TIMEOUT)
        response.raise_for_status()
        payload = response.json()
        collected.extend(payload.get("items", []))
        next_token = payload.get("nextPageToken")
        if not next_token:
            # No further page is available for this region
            return collected
    return collected

# Gather trending items region by region; a region that fails is reported and skipped
all_items = []
for rc in REGIONS:
    try:
        region_items = fetch_trending_items(rc, PAGES_PER_REGION)
    except Exception as e:
        print(f"[warn] region {rc}: {e}")
    else:
        all_items.extend(region_items)

print(f"Fetched {len(all_items)} trending video items across {len(REGIONS)} regions.")

# BUILD TOKEN → SCORE DICT
# token_scores accumulates a view-weighted score per token.
# token_videos records which distinct videos contain each token, so that rarity
# can be judged by how many videos use a token rather than by its score.
token_scores = Counter()
token_videos = defaultdict(set)
for idx, it in enumerate(all_items):
    snip = it.get("snippet", {})
    stats = it.get("statistics", {})
    title = snip.get("title", "")
    tags  = snip.get("tags") or []

    # weight by views (log1p to tame extremes); an absent or malformed count weighs 0
    try:
        views = int(stats.get("viewCount", "0"))
    except (TypeError, ValueError):
        views = 0
    w = math.log1p(max(views, 0))

    # tokens from title + tags (a set, so each token counts once per item)
    toks = set(tokenize_text(title))
    for tg in parse_tags_cell(tags):
        toks.update(split_tag_to_tokens(tg))

    # The same video can appear in several regions' charts; keying by its id makes it
    # count once for rarity. Items without an id fall back to a per-item key.
    video_key = str(it.get("id") or f"item-{idx}")

    for tok in toks:
        token_scores[tok] += w
        token_videos[tok].add(video_key)

# prune ultra-rare tokens: keep a token only if it occurs in at least MIN_TOKENS
# distinct videos. Comparing the view-weighted score to this threshold would prune
# almost nothing, because a single video with ~20 views already contributes log1p(views) >= 3.
MIN_TOKENS = 3
token_scores = Counter({
    k: v
    for k, v in token_scores.items()
    if v > 0 and len(token_videos[k]) >= MIN_TOKENS
})

print(f"Trending vocabulary size: {len(token_scores)}")

# Convert to dict of floats
tok2score = {k: float(v) for k, v in token_scores.items()}

# MAP TO YOUR DF AS trending_score
# One token set per row of df, combining the title's tokens with those of its tags


def _tokens_from_tag_list(tag_list):
    """Flatten a list of tags into a single list of tokens."""
    return [tok for tag in tag_list for tok in split_tag_to_tokens(tag)]


title_tok_lists = df["title"].astype(str).apply(tokenize_text)
if "tags" not in df.columns:
    # No tags column: every row contributes an empty tag-token list
    tag_tok_lists = pd.Series([[] for _ in range(len(df))], index=df.index)
else:
    tag_tok_lists = df["tags"].apply(parse_tags_cell).apply(_tokens_from_tag_list)

row_token_sets = [
    set(from_title) | set(from_tags)
    for from_title, from_tags in zip(title_tok_lists, tag_tok_lists)
]

def topk_mean(scores, k=3):
    """Average the ``k`` largest values in ``scores``.

    Returns NaN for an empty input. When fewer than ``k`` values are given,
    the mean of all of them is returned.
    """
    if not scores:
        return np.nan
    values = np.array(scores, dtype=float)
    # np.sort returns an ascending copy, so the largest k values sit at the end
    selected = np.sort(values)[-k:] if values.size >= k else values
    return float(selected.mean())

# Raw score per row: mean of the (up to) three highest trending-token scores it matches
row_raw = [
    topk_mean([tok2score[t] for t in toks if t in tok2score], k=3)
    for toks in row_token_sets
]

df["trending_score_raw"] = row_raw

# Scale to [0,1] between the 5th and 95th percentiles; values outside are clipped.
# If no row matched any trending token, the feature is NaN everywhere.
vals = pd.to_numeric(df["trending_score_raw"], errors="coerce")
if not vals.notna().any():
    df["trending_score"] = np.nan
else:
    p5, p95 = np.nanpercentile(vals, [5, 95])
    rng = max(p95 - p5, 1e-6)   # avoid dividing by zero when the percentiles coincide
    df["trending_score"] = ((vals - p5) / rng).clip(0.0, 1.0)

# The raw column is only an intermediate; remove it from df in place
del df["trending_score_raw"]

# Feature preview
df[["title", "trending_score"]].head()

Fetched 1000 trending video items across 10 regions.
Trending vocabulary size: 6406


Unnamed: 0,title,trending_score
0,Ken losing his mind RAGE COMPILATION,0.533651
1,Ken fumbling easy wins RAGE COMPILATION,0.533651
2,The Impossible Level 16 King Tower Glitch,0.543299
3,The Mother Witch Incident (ft. Ken's neighbour),0.777067
4,The WORST Clash Royale crash out of all time (...,0.533651


In [None]:
##### Sanity check + feature peek
# Show the DataFrame's column index so the features available at this point can be checked by eye.
df.columns

Index(['tags', 'title', 'thumbnail_path', 'thumbnail_exists',
       'thumbnail_colorfulness', 'thumbnail_brightness', 'thumbnail_contrast',
       'thumbnail_hue', 'thumbnail_saturation', 'thumbnail_edge_density',
       'thumbnail_texture_entropy', 'thumbnail_quality',
       'thumbnail_face_area_ratio', 'thumbnail_face_emotion',
       'thumbnail_ocr_text_coverage', 'thumbnail_saliency_thirds_proximity',
       'viral_label', 'title_sentiment', 'title_emotion_anger',
       'title_emotion_disgust', 'title_emotion_fear', 'title_emotion_joy',
       'title_emotion_neutral', 'title_emotion_sadness',
       'title_emotion_surprise', 'title_subjectivity', 'title_readability',
       'title_log_perplexity', 'title_emb_pca_01', 'title_emb_pca_02',
       'title_emb_pca_03', 'title_emb_pca_04', 'title_emb_pca_05',
       'title_emb_pca_06', 'title_emb_pca_07', 'title_emb_pca_08',
       'title_emb_pca_09', 'title_emb_pca_10', 'title_emb_pca_11',
       'title_emb_pca_12', 'title_emb_pca_13'

# Section 6: Cleanup & Export

In [None]:
##### Dropping helper columns

# Raw text and file-bookkeeping columns that should not be exported as features
columns_to_drop = ['tags', 'title', 'thumbnail_path', 'thumbnail_exists']

# Only ask pandas to drop the columns that are actually present
existing_columns_to_drop = [name for name in columns_to_drop if name in df.columns]

df = df.drop(columns=existing_columns_to_drop)

In [None]:
# Show the remaining columns of df.
df.columns

Index(['thumbnail_colorfulness', 'thumbnail_brightness', 'thumbnail_contrast',
       'thumbnail_hue', 'thumbnail_saturation', 'thumbnail_edge_density',
       'thumbnail_texture_entropy', 'thumbnail_quality',
       'thumbnail_face_area_ratio', 'thumbnail_face_emotion',
       'thumbnail_ocr_text_coverage', 'thumbnail_saliency_thirds_proximity',
       'viral_label', 'title_sentiment', 'title_emotion_anger',
       'title_emotion_disgust', 'title_emotion_fear', 'title_emotion_joy',
       'title_emotion_neutral', 'title_emotion_sadness',
       'title_emotion_surprise', 'title_subjectivity', 'title_readability',
       'title_log_perplexity', 'title_emb_pca_01', 'title_emb_pca_02',
       'title_emb_pca_03', 'title_emb_pca_04', 'title_emb_pca_05',
       'title_emb_pca_06', 'title_emb_pca_07', 'title_emb_pca_08',
       'title_emb_pca_09', 'title_emb_pca_10', 'title_emb_pca_11',
       'title_emb_pca_12', 'title_emb_pca_13', 'title_emb_pca_14',
       'title_emb_pca_15', 'title_emb_pc

In [None]:
# Output file name for the exported table. This rebinds `filename`, which the
# first cell of the notebook used for the input CSV.
filename = 'engineered_features_v1.csv'

# Save DataFrame to CSV
# index=False leaves the row index out of the file.
df.to_csv(filename, index=False)

# Download it
# Colab-only helper; this import is unavailable outside Google Colab.
from google.colab import files
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>