<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality-Testing/blob/main/feature_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1: Thumbnail Features

In [93]:
filename = "test.csv"

# Load the CSV
import pandas as pd

df = pd.read_csv(f'/content/{filename}')
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.categoryId,snippet.liveBroadcastContent,snippet.defaultLanguage,snippet.localized.title,snippet.localized.description,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount
0,youtube#video,D5hdxPi5WRssQsXncpidVGWN9ro,qRF04lTrbPs,UCQil4Xo3ymmGwMdCBvHVmLA,2025-08-05T13:01:35Z,UCQil4Xo3ymmGwMdCBvHVmLA,Ken losing his mind RAGE COMPILATION,Ken goes insane raging at Clash Royale. Long c...,https://i.ytimg.com/vi/qRF04lTrbPs/default.jpg,120,...,20,none,en,Ken losing his mind RAGE COMPILATION,Ken goes insane raging at Clash Royale. Long c...,en,42167,1776,0,71
1,youtube#video,auY24O-_owrfJHstP6Oir3EZsp0,NeWv_WA_4R0,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-28T13:22:34Z,UCQil4Xo3ymmGwMdCBvHVmLA,Ken fumbling easy wins RAGE COMPILATION,Compilation of Ken's best rage moments and the...,https://i.ytimg.com/vi/NeWv_WA_4R0/default.jpg,120,...,20,none,en,Ken fumbling easy wins RAGE COMPILATION,Compilation of Ken's best rage moments and the...,en,76601,3056,0,77
2,youtube#video,RFAWz7pgfALVB0hpRRpnbJ9zqyM,xF0KI6oo7ak,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-21T13:45:08Z,UCQil4Xo3ymmGwMdCBvHVmLA,The Impossible Level 16 King Tower Glitch,Very rare chef tower monk bug happens in Ken's...,https://i.ytimg.com/vi/xF0KI6oo7ak/default.jpg,120,...,20,none,en,The Impossible Level 16 King Tower Glitch,Very rare chef tower monk bug happens in Ken's...,en,24549,1042,0,25
3,youtube#video,c4Av6MkEuL2mI-k69AU7aqIfVII,addCkFsak1U,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-16T14:00:05Z,UCQil4Xo3ymmGwMdCBvHVmLA,The Mother Witch Incident (ft. Ken's neighbour),Extremely rare and classic Ken Clash Royale mo...,https://i.ytimg.com/vi/addCkFsak1U/default.jpg,120,...,20,none,en,The Mother Witch Incident (ft. Ken's neighbour),Extremely rare and classic Ken Clash Royale mo...,en,103786,4871,0,99
4,youtube#video,ZYSk9dCdICIQLkhmin_oiVkuB_A,GFd2FmQAyE0,UCQil4Xo3ymmGwMdCBvHVmLA,2025-07-10T06:36:14Z,UCQil4Xo3ymmGwMdCBvHVmLA,The WORST Clash Royale crash out of all time (...,I've watched hundreds of Ken videos and this i...,https://i.ytimg.com/vi/GFd2FmQAyE0/default.jpg,120,...,20,none,en,The WORST Clash Royale crash out of all time (...,I've watched hundreds of Ken videos and this i...,en,61414,3818,0,124


In [94]:
# Extract video IDs
video_ids = df["id"].dropna().unique().tolist()
print(f"Found {len(video_ids)} video IDs.")

# Download thumbnails
import os
import requests
from PIL import Image
from io import BytesIO

os.makedirs("thumbnails", exist_ok=True)

def download_thumbnail(video_id):
    url = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
    path = f"thumbnails/{video_id}.jpg"
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.save(path)
        return path
    except Exception as e:
        print(f"Error downloading {video_id}: {e}")
        return None

paths = [download_thumbnail(vid) for vid in video_ids]

Found 93 video IDs.


In [95]:
##### Creates:
# thumbnail_colorfulness

import numpy as np

def compute_colorfulness(img_path):
    """
    Implements the Hasler–Süsstrunk metric:
      C = sqrt(σ_rg^2 + σ_yb^2) + 0.3 * sqrt(μ_rg^2 + μ_yb^2)
    where
      rg = |R − G|
      yb = |0.5*(R + G) − B|
    """
    img = np.array(Image.open(img_path).convert('RGB')).astype('float32')
    R, G, B = img[...,0], img[...,1], img[...,2]
    rg = np.abs(R - G)
    yb = np.abs(0.5*(R + G) - B)

    std_rg = np.std(rg)
    std_yb = np.std(yb)
    mean_rg = np.mean(rg)
    mean_yb = np.mean(yb)

    # Hasler–Süsstrunk colorfulness
    return np.sqrt(std_rg**2 + std_yb**2) + 0.3 * np.sqrt(mean_rg**2 + mean_yb**2)

# Assume `df` has an "id" column of video_ids and your thumbnails are saved as thumbnails/{id}.jpg
colorfulness_scores = []
for vid in df["id"].dropna().unique():
    path = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.exists(path):
        colorfulness_scores.append((vid, compute_colorfulness(path)))
    else:
        colorfulness_scores.append((vid, np.nan))

# Turn into a dict for fast lookup, then map back onto df
cf_dict = dict(colorfulness_scores)
df["thumbnail_colorfulness"] = df["id"].map(cf_dict)

# Inspect
df[["id", "thumbnail_colorfulness"]].head()

Unnamed: 0,id,thumbnail_colorfulness
0,qRF04lTrbPs,64.768858
1,NeWv_WA_4R0,68.511394
2,xF0KI6oo7ak,82.83702
3,addCkFsak1U,87.258003
4,GFd2FmQAyE0,58.208581


In [96]:
##### Creates:
# thumbnail_brightness
# thumbnail_contrast

def compute_brightness_rms(img_path):
    """
    Returns:
      - mean_luminance: Mean pixel intensity (0 → 255)
      - rms_contrast: Standard deviation of luminance divided by mean luminance
    """
    img = np.array(Image.open(img_path).convert('L')).astype(np.float32)
    mean_luminance = img.mean()
    rms_contrast = img.std() / mean_luminance if mean_luminance != 0 else 0.0
    return mean_luminance, rms_contrast

# Compute brightness & RMS contrast for each video
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(thumb_path):
        brightness, rms = compute_brightness_rms(thumb_path)
    else:
        brightness, rms = np.nan, np.nan
    results.append((vid, brightness, rms))

# Map back to DataFrame
brightness_map = {vid: b for vid, b, _ in results}
rms_map        = {vid: rms for vid, _, rms in results}

df["thumbnail_brightness"]   = df["id"].map(brightness_map)
df["thumbnail_contrast"] = df["id"].map(rms_map)

# Preview updated columns
df[["id", "thumbnail_brightness", "thumbnail_contrast"]].head()

Unnamed: 0,id,thumbnail_brightness,thumbnail_contrast
0,qRF04lTrbPs,116.419357,0.564473
1,NeWv_WA_4R0,121.534286,0.574799
2,xF0KI6oo7ak,112.830292,0.539484
3,addCkFsak1U,134.029816,0.50634
4,GFd2FmQAyE0,102.990135,0.761938


In [97]:
##### Creates:
# thumbnail_hue
# thumbnail_saturation

def compute_hue_saturation(img_path):
    """
    Returns:
      - mean_hue: Mean hue in degrees (0 → 360)
      - mean_saturation: Mean saturation (0 → 1)
    """
    # PIL HSV channels are 0–255; convert to float
    img = np.array(Image.open(img_path).convert('HSV')).astype(np.float32)
    H, S, _ = img[..., 0], img[..., 1], img[..., 2]
    # Scale H to degrees, S to fraction
    mean_hue = (H.mean() * 360.0) / 255.0
    mean_saturation = S.mean() / 255.0
    return mean_hue, mean_saturation

# Compute hue & saturation features for each video
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(thumb_path):
        hue, sat = compute_hue_saturation(thumb_path)
    else:
        hue, sat = np.nan, np.nan
    results.append((vid, hue, sat))

# Map back to DataFrame
hue_map = {vid: hue for vid, hue, _ in results}
sat_map = {vid: sat for vid, _, sat in results}

df["thumbnail_hue"] = df["id"].map(hue_map)
df["thumbnail_saturation"] = df["id"].map(sat_map)

# Preview new columns
df[["id", "thumbnail_hue", "thumbnail_saturation"]].head()

Unnamed: 0,id,thumbnail_hue,thumbnail_saturation
0,qRF04lTrbPs,108.838849,0.380881
1,NeWv_WA_4R0,117.835176,0.427772
2,xF0KI6oo7ak,55.820709,0.503604
3,addCkFsak1U,125.571641,0.361966
4,GFd2FmQAyE0,104.623384,0.424597


In [98]:
##### Creates:
# thumbnail_edge_density
# thumbnail_texture_entropy

import cv2

def compute_edge_texture(img_path):
    """
    Returns:
      - edge_density: fraction of Canny edge pixels (0 → 1)
      - texture_entropy: Shannon entropy of grayscale pixel distribution (in bits)
    """
    # Load as grayscale
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # Edge density
    edges = cv2.Canny(img, threshold1=100, threshold2=200)
    edge_density = np.count_nonzero(edges) / img.size

    # Texture (grayscale) entropy
    hist, _ = np.histogram(img.flatten(), bins=256, range=(0, 255))
    probs = hist / hist.sum()
    probs_nonzero = probs[probs > 0]
    texture_entropy = -np.sum(probs_nonzero * np.log2(probs_nonzero))

    return edge_density, texture_entropy

# Compute edge density & texture entropy for each video
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(thumb_path):
        ed, te = compute_edge_texture(thumb_path)
    else:
        ed, te = np.nan, np.nan
    results.append((vid, ed, te))

# Map results back to DataFrame
edge_density_map   = {vid: ed for vid, ed, _ in results}
texture_entropy_map = {vid: te for vid, _, te in results}

df["thumbnail_edge_density"]    = df["id"].map(edge_density_map)
df["thumbnail_texture_entropy"] = df["id"].map(texture_entropy_map)

# Preview new columns
df[["id", "thumbnail_edge_density", "thumbnail_texture_entropy"]].head()

Unnamed: 0,id,thumbnail_edge_density,thumbnail_texture_entropy
0,qRF04lTrbPs,0.164722,7.886865
1,NeWv_WA_4R0,0.200174,7.910999
2,xF0KI6oo7ak,0.214844,7.82749
3,addCkFsak1U,0.165208,7.822907
4,GFd2FmQAyE0,0.168767,7.730054


In [99]:
##### Creates:
# thumbnail_quality

def compute_thumbnail_quality(img_path):
    """
    Returns:
      - thumbnail_quality: variance of the Laplacian of the grayscale image
        (higher = sharper, lower = blurrier)
    """
    gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    lap = cv2.Laplacian(gray, cv2.CV_64F)
    return lap.var()

# Compute thumbnail_quality for each video
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    if os.path.isfile(thumb_path):
        quality = compute_thumbnail_quality(thumb_path)
    else:
        quality = np.nan
    results.append((vid, quality))

# Map back to DataFrame
quality_map = {vid: q for vid, q in results}
df["thumbnail_quality"] = df["id"].map(quality_map)

# Preview the new feature
df[["id", "thumbnail_quality"]].head()

Unnamed: 0,id,thumbnail_quality
0,qRF04lTrbPs,4676.820102
1,NeWv_WA_4R0,8264.712197
2,xF0KI6oo7ak,5745.212433
3,addCkFsak1U,6141.966672
4,GFd2FmQAyE0,6204.187401


In [100]:
##### Creates:
# thumbnail_face_area_ratio

# Load OpenCV's built-in Haar cascade for face detection
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)

def compute_face_area_ratio(img_path):
    """
    Returns:
      - face_area_ratio: sum of detected face bounding-box areas divided by total image area
    """
    img = cv2.imread(img_path)
    if img is None:
        return np.nan
    height, width = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30)
    )
    # Sum up face areas
    total_face_area = sum((fw * fh) for (_, _, fw, fh) in faces)
    total_area = width * height
    return total_face_area / total_area if total_area > 0 else 0.0

# Compute face_area_ratio for each video
results = []
for vid in df["id"].dropna().unique():
    thumb_path = os.path.join("thumbnails", f"{vid}.jpg")
    ratio = compute_face_area_ratio(thumb_path) if os.path.isfile(thumb_path) else np.nan
    results.append((vid, ratio))

# Map back to DataFrame
face_area_map = {vid: ratio for vid, ratio in results}
df["thumbnail_face_area_ratio"] = df["id"].map(face_area_map)

# Preview the new feature
df[["id", "thumbnail_face_area_ratio"]].head()

Unnamed: 0,id,thumbnail_face_area_ratio
0,qRF04lTrbPs,0.229601
1,NeWv_WA_4R0,0.198767
2,xF0KI6oo7ak,0.105625
3,addCkFsak1U,0.068906
4,GFd2FmQAyE0,0.143767


In [101]:
##### Creates:
# thumbnail_face_emotion

!pip install -q mediapipe==0.10.14

import os
import cv2
import numpy as np
import mediapipe as mp

# Assumes `df` is already loaded and thumbnails are in "thumbnails/{video_id}.jpg"

mp_face_mesh = mp.solutions.face_mesh

def _euclid(p1, p2):
    return float(np.hypot(p1[0]-p2[0], p1[1]-p2[1]))

def _safe_ratio(a, b, eps=1e-6):
    return float(a / (b + eps))

def compute_thumbnail_face_emotion(img_path: str) -> float:
    """
    Landmark-based valence proxy in [-1, 1]:
      + Increases with mouth width (smile) and eye openness
      + Slightly increases with mouth openness (surprise)
      Returns 0.0 if no faces detected.
    """
    img_bgr = cv2.imread(img_path)
    if img_bgr is None:
        return np.nan
    H, W = img_bgr.shape[:2]
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    with mp_face_mesh.FaceMesh(
        static_image_mode=True,
        max_num_faces=10,
        refine_landmarks=True,
        min_detection_confidence=0.5
    ) as fm:
        res = fm.process(img_rgb)

    if not res.multi_face_landmarks:
        return 0.0

    vals, areas = [], []
    for lm in res.multi_face_landmarks:
        pts = [(int(p.x * W), int(p.y * H)) for p in lm.landmark]

        # Landmark indices (MediaPipe Face Mesh):
        # Mouth corners: 61 (left), 291 (right)
        # Upper/Lower lip center: 13, 14
        # Face "width": 234 (left cheek), 454 (right cheek)
        # Face "height": 10 (forehead), 152 (chin)
        # Eyes: left top/bottom 159/145 with corners 33/133; right 386/374 with corners 362/263
        face_w  = _euclid(pts[234], pts[454])
        face_h  = _euclid(pts[10],  pts[152])
        if face_w < 1 or face_h < 1:
            continue

        mouth_w     = _euclid(pts[61],  pts[291])
        mouth_open  = _euclid(pts[13],  pts[14])
        left_eye_op = _safe_ratio(_euclid(pts[159], pts[145]), _euclid(pts[33],  pts[133]))
        right_eye_op= _safe_ratio(_euclid(pts[386], pts[374]), _euclid(pts[362], pts[263]))
        eye_open    = 0.5 * (left_eye_op + right_eye_op)

        # Normalize by face size
        wide  = _safe_ratio(mouth_w, face_w)      # smile width
        opened= _safe_ratio(mouth_open, face_h)   # mouth open
        # Simple weighted score → squashed to [-1, 1]
        score = 1.2 * wide + 0.6 * opened + 0.3 * eye_open
        val   = float(np.tanh((score - 0.55) * 3.0))  # center and scale

        # Approx area to weight bigger faces more
        areas.append(face_w * face_h)
        vals.append(val)

    if not vals:
        return 0.0
    return float(np.average(vals, weights=np.asarray(areas, dtype=np.float32)))

# Compute for each video and map back to df
results = []
for vid in df["id"].dropna().unique():
    p = os.path.join("thumbnails", f"{vid}.jpg")
    v = compute_thumbnail_face_emotion(p) if os.path.isfile(p) else np.nan
    results.append((vid, v))

df["thumbnail_face_emotion"] = df["id"].map({vid: v for vid, v in results})

# Peek
df[["id", "thumbnail_face_emotion"]].head()




Unnamed: 0,id,thumbnail_face_emotion
0,qRF04lTrbPs,0.684833
1,NeWv_WA_4R0,0.641112
2,xF0KI6oo7ak,0.303327
3,addCkFsak1U,0.260067
4,GFd2FmQAyE0,0.469498


In [102]:
##### Creates:
# thumbnail_ocr_text_coverage

# Heuristic OCR Text Coverage (no large models)
# Idea: enhance stroke-like regions with morphology, binarize, filter CCs, fill boxes -> coverage ∈ [0,1]

def compute_text_coverage(img_path: str,
                          min_frac=0.0005,   # min box area as fraction of image area
                          max_frac=0.25,     # max box area as fraction of image area
                          min_ar=1.1,        # min aspect ratio (w/h) for text regions
                          extent_lo=0.25,    # min contour extent (area / bbox area)
                          extent_hi=0.95):   # max contour extent (filters solid bars/blocks)
    img = cv2.imread(img_path)
    if img is None:
        return np.nan
    H, W = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # --- Enhance text strokes ---
    # Use both blackhat (light text on dark) and tophat (dark text on light) with an elongated kernel
    kx = max(3, W // 40)   # horizontal emphasis
    ky = max(2, H // 120)  # thin vertical thickness
    rect = cv2.getStructuringElement(cv2.MORPH_RECT, (kx, ky))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, rect)
    tophat   = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT,   rect)
    enhanced = cv2.max(blackhat, tophat)

    # Optional slight blur to stabilize thresholding
    enhanced = cv2.GaussianBlur(enhanced, (3, 3), 0)

    # --- Binarize (Otsu) ---
    _, bw = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # --- Connect characters into words/blocks ---
    connect = cv2.getStructuringElement(cv2.MORPH_RECT, (max(3, kx // 2), max(2, ky)))
    bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, connect, iterations=1)
    bw = cv2.dilate(bw, connect, iterations=1)

    # --- Find candidate regions ---
    contours, _ = cv2.findContours(bw, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    mask = np.zeros((H, W), dtype=np.uint8)
    total_px = H * W
    min_area = min_frac * total_px
    max_area = max_frac * total_px

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        area = w * h
        if area < min_area or area > max_area:
            continue
        ar = w / float(h) if h > 0 else 0.0
        if ar < min_ar:
            continue
        cnt_area = cv2.contourArea(cnt)
        extent = (cnt_area / area) if area > 0 else 0.0
        if not (extent_lo <= extent <= extent_hi):
            continue

        # Fill the box (mask union of text-like regions)
        cv2.rectangle(mask, (x, y), (x + w, y + h), 255, -1)

    coverage = (mask.sum() / 255.0) / float(total_px) if total_px > 0 else 0.0
    return float(max(0.0, min(1.0, coverage)))

# Compute for each video and map back to df
results = []
for vid in df["id"].dropna().unique():
    p = os.path.join("thumbnails", f"{vid}.jpg")
    cov = compute_text_coverage(p) if os.path.isfile(p) else np.nan
    results.append((vid, cov))

df["thumbnail_ocr_text_coverage"] = df["id"].map({vid: cov for vid, cov in results})

# Peek
df[["id", "thumbnail_ocr_text_coverage"]].head()


Unnamed: 0,id,thumbnail_ocr_text_coverage
0,qRF04lTrbPs,0.396562
1,NeWv_WA_4R0,0.479063
2,xF0KI6oo7ak,0.141198
3,addCkFsak1U,0.077292
4,GFd2FmQAyE0,0.298385


In [103]:
##### Creates:
# thumbnail_saliency_thirds_proximity

def _spectral_residual_centroid(gray: np.ndarray, small=96):
    """
    Compute saliency via spectral residual and return centroid (cx, cy) in ORIGINAL image coords.
    """
    H, W = gray.shape[:2]
    # Downscale for speed
    small_w = small
    small_h = max(8, int(round(H * (small / float(W))))) if W > 0 else small
    g = cv2.resize(gray, (small_w, small_h), interpolation=cv2.INTER_AREA).astype(np.float32)

    # Spectral residual saliency (Hou & Zhang 2007)
    F = np.fft.fft2(g)
    log_amp = np.log(np.abs(F) + 1e-8)
    phase   = np.angle(F)
    avg_log = cv2.blur(log_amp, (3, 3))
    spec_res = log_amp - avg_log
    S = np.abs(np.fft.ifft2(np.exp(spec_res + 1j * phase))) ** 2
    S = cv2.GaussianBlur(S, (3, 3), 0)

    # Normalize to [0,1]
    S -= S.min()
    S /= (S.max() + 1e-8)

    # Centroid on the small map
    yy, xx = np.mgrid[0:small_h, 0:small_w]
    w = S.astype(np.float32)
    wsum = float(w.sum())
    if wsum < 1e-8:
        # Fallback to image center if saliency is degenerate
        return W * 0.5, H * 0.5

    cx_small = float((w * xx).sum() / wsum)
    cy_small = float((w * yy).sum() / wsum)

    # Map back to original coords
    cx = cx_small * (W / float(small_w))
    cy = cy_small * (H / float(small_h))
    return cx, cy

def _thirds_points(W, H):
    return [
        (W/3.0, H/3.0),
        (2*W/3.0, H/3.0),
        (W/3.0, 2*H/3.0),
        (2*W/3.0, 2*H/3.0),
    ]

def _nearest_thirds_distance(cx, cy, W, H):
    pts = _thirds_points(W, H)
    return float(min(np.hypot(cx - x, cy - y) for (x, y) in pts))

def compute_thirds_proximity(img_path: str) -> float:
    """
    Returns thirds proximity in [0,1]:
      1.0 = saliency centroid exactly on a rule-of-thirds hotspot
      0.0 ≈ worst case (near a corner)
    """
    bgr = cv2.imread(img_path)
    if bgr is None:
        return np.nan
    H, W = bgr.shape[:2]
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    cx, cy = _spectral_residual_centroid(gray, small=96)
    diag = np.hypot(W, H)
    max_min_dist = diag / 3.0  # worst-case distance to nearest thirds point (corners)
    offset_px = _nearest_thirds_distance(cx, cy, W, H)
    proximity = 1.0 - (offset_px / (max_min_dist + 1e-8))
    return float(np.clip(proximity, 0.0, 1.0))

# Compute and map back to df
results = []
for vid in df["id"].dropna().unique():
    p = os.path.join("thumbnails", f"{vid}.jpg")
    prox = compute_thirds_proximity(p) if os.path.isfile(p) else np.nan
    results.append((vid, prox))

df["thumbnail_saliency_thirds_proximity"] = df["id"].map({vid: prox for vid, prox in results})

# Quick peek
df[["id", "thumbnail_saliency_thirds_proximity"]].head()

Unnamed: 0,id,thumbnail_saliency_thirds_proximity
0,qRF04lTrbPs,0.805344
1,NeWv_WA_4R0,0.712971
2,xF0KI6oo7ak,0.611532
3,addCkFsak1U,0.69327
4,GFd2FmQAyE0,0.630173


In [104]:
# Column renaming

df.rename(columns={'snippet.channelId': 'channel_id',
                   'snippet.title': 'title',
                   'snippet.description': 'description',
                   'statistics.likeCount': 'likeCount',
                   'statistics.viewCount': 'viewCount',
                   'statistics.commentCount': 'commentCount'}, inplace=True)

In [105]:
##### Labeling virality (based on high performance)

percentile = 0.75

# 0) Drop duplicate-named columns (keep first occurrence)
dupes = df.columns[df.columns.duplicated()].unique()
if len(dupes):
    print("Dropping duplicate columns:", list(dupes))
df = df.loc[:, ~df.columns.duplicated()]

# 1) Make sure metrics are numeric (YouTube API often gives strings)
num_cols = ["viewCount", "likeCount", "commentCount"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# 2) Compute per-channel quantile thresholds
thresholds = (
    df.groupby("channel_id")[num_cols]
      .quantile(percentile)
      .rename(columns={
          "viewCount": "views_threshold",
          "likeCount": "likes_threshold",
          "commentCount": "comments_threshold",
      })
      .reset_index()
)

# 3) Merge thresholds into main dataframe
df = df.merge(thresholds, on="channel_id", how="left", validate="m:1")

# 4) Label virality
df["viral_label"] = (
    (df["viewCount"] > df["views_threshold"]) &
    (df["likeCount"] > df["likes_threshold"]) &
    (df["commentCount"] > df["comments_threshold"])
).astype(int)

# 5) Clean up
df = df.drop(columns=["views_threshold", "likes_threshold", "comments_threshold"])

# 6) Debug: what's the share of viral?
viral_percentage = (df["viral_label"].sum() / len(df)) * 100 if len(df) else 0.0
print(f"Percentage of viral videos: {viral_percentage:.2f}%")


Dropping duplicate columns: ['channel_id']
Percentage of viral videos: 17.20%


In [106]:
##### Dropping columns that aren't used from youtube data api v3 metadata
##### General refactoring at end of cell

# Dropping bad columns/features
columns_to_drop = ['kind',
                   'etag',
                   'channel_id_x',
                   'snippet.publishedAt',
                   'snippet.title',
                   'snippet.thumbnails.default.url',
                   'snippet.thumbnails.default.width',
                   'snippet.thumbnails.default.height',
                   'snippet.thumbnails.medium.url',
                   'snippet.thumbnails.medium.width',
                   'snippet.thumbnails.medium.height',
                   'snippet.thumbnails.high.url',
                   'snippet.thumbnails.high.width',
                   'snippet.thumbnails.high.height',
                   'snippet.thumbnails.standard.url',
                   'snippet.thumbnails.standard.width',
                   'snippet.thumbnails.standard.height',
                   'snippet.thumbnails.maxres.url',
                   'snippet.thumbnails.maxres.width',
                   'snippet.thumbnails.maxres.height',
                   'statistics.viewCount',
                   'statistics.likeCount',
                   'statistics.commentCount',
                   'snippet.channelTitle',
                   'snippet.categoryId',
                   'snippet.liveBroadcastContent',
                   'snippet.defaultAudioLanguage',
                   'snippet.defaultLanguage',
                   'title',
                   'channel_id_x',
                   'channel_id_y',
                   'snippet.localized.description',
                   'statistics.favoriteCount',
                   'id',
                   'viewCount',
                   'likeCount',
                   'commentCount',
                   'channel_id',
                   'description']

# Check which columns exist in the DataFrame before dropping
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]

df = df.drop(columns=existing_columns_to_drop, axis=1)

# Renaming tags for future section
df.rename(columns={'snippet.tags': 'tags', 'snippet.localized.title': 'title'}, inplace=True)

In [107]:
##### Sanity check + feature peek for next section
df.columns

Index(['tags', 'title', 'thumbnail_colorfulness', 'thumbnail_brightness',
       'thumbnail_contrast', 'thumbnail_hue', 'thumbnail_saturation',
       'thumbnail_edge_density', 'thumbnail_texture_entropy',
       'thumbnail_quality', 'thumbnail_face_area_ratio',
       'thumbnail_face_emotion', 'thumbnail_ocr_text_coverage',
       'thumbnail_saliency_thirds_proximity', 'viral_label'],
      dtype='object')