<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality/blob/main/notebooks/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Download thumbnails and extract simple visual features


In [None]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.categoryId,snippet.liveBroadcastContent,snippet.defaultLanguage,snippet.localized.title,snippet.localized.description,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount
0,youtube#video,EZIz8pZ8r7Y3Zwf3HpjS-_Qi5EI,UtUKi4HUBNY,UCNEI-oWOivQJ9IMn8GeCrMg,2025-06-08T20:24:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,Nimue - Juan Arenosa (Piano Cover),🎹The vibrant melody of “Nimue” by Juan Arenosa...,https://i.ytimg.com/vi/UtUKi4HUBNY/default.jpg,120,...,10,none,en-US,Nimue - Juan Arenosa (Piano Cover),🎹The vibrant melody of “Nimue” by Juan Arenosa...,en-US,10412,784.0,0,73
1,youtube#video,evXKA9D3t5Lb3hzqhZzfCCb7EH0,0PWC69jDOP8,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-31T17:19:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Cry - Cigarettes after sex (Piano Cover),🎹The vibrant melody of “cry” by cigarettes aft...,https://i.ytimg.com/vi/0PWC69jDOP8/default.jpg,120,...,10,none,en-US,Cry - Cigarettes after sex (Piano Cover),🎹The vibrant melody of “cry” by cigarettes aft...,en-US,28792,1414.0,0,110
2,youtube#video,6XryOqEiZhKZ7X-SB5rpVaUCu2Y,0wIpL0cYujQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-26T20:29:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,hometown glory - adele,🎹The vibrant melody of “hometown glory” by Ade...,https://i.ytimg.com/vi/0wIpL0cYujQ/default.jpg,120,...,10,none,en-US,hometown glory - adele,🎹The vibrant melody of “hometown glory” by Ade...,en-US,12000,495.0,0,72
3,youtube#video,zPnzLnqadx_g30YMdUaz0tJ_BAI,994ZZm2aX-E,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-19T16:59:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,"Welcome and goodbye - Dream, Ivory",🎹The vibrant melody of “welcome and goodbye” b...,https://i.ytimg.com/vi/994ZZm2aX-E/default.jpg,120,...,10,none,en-US,"Welcome and goodbye - Dream, Ivory",🎹The vibrant melody of “welcome and goodbye” b...,en-US,17402,784.0,0,92
4,youtube#video,n5AIwWOI4BTvQhipK7rJRNOvNy4,GkDEInsCcpQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-10T19:35:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Hope - Mirko Dukanovic,🎹The vibrant melody of “Hope” by Mirko Dukanov...,https://i.ytimg.com/vi/GkDEInsCcpQ/default.jpg,120,...,10,none,en-US,Hope - Mirko Dukanovic,🎹The vibrant melody of “Hope” by Mirko Dukanov...,en-US,18358,714.0,0,103


In [None]:
# Extract video IDs
video_ids = df["id"].dropna().unique().tolist()
print(f"Found {len(video_ids)} video IDs.")

Found 168 video IDs.


In [None]:
# Download thumbnails
import os
import requests
from PIL import Image
from io import BytesIO

os.makedirs("thumbnails", exist_ok=True)

def download_thumbnail(video_id):
    url = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
    path = f"thumbnails/{video_id}.jpg"
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.save(path)
        return path
    except Exception as e:
        print(f"Error downloading {video_id}: {e}")
        return None

paths = [download_thumbnail(vid) for vid in video_ids]


In [None]:
# Extract visual features
import numpy as np

def extract_features(img_path, video_id):
    try:
        img = Image.open(img_path).convert('RGB')
        img_array = np.array(img)

        avg_color = np.mean(img_array, axis=(0, 1))
        brightness = np.mean(img_array)
        contrast = np.std(img_array)

        return {
            "video_id": video_id,
            "avg_red": avg_color[0],
            "avg_green": avg_color[1],
            "avg_blue": avg_color[2],
            "brightness": brightness,
            "contrast": contrast
        }
    except Exception as e:
        print(f"Failed to extract for {video_id}: {e}")
        return None

feature_data = [extract_features(f"thumbnails/{vid}.jpg", vid) for vid in video_ids]
features_df = pd.DataFrame([f for f in feature_data if f is not None])
features_df.head()


Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast
0,UtUKi4HUBNY,34.080816,36.617812,43.345712,38.01478,58.72613
1,0PWC69jDOP8,37.773715,31.678142,43.084097,37.511985,63.705403
2,0wIpL0cYujQ,29.380017,30.419931,41.579375,33.793108,54.808984
3,994ZZm2aX-E,45.244774,23.591406,23.828802,30.888328,49.698068
4,GkDEInsCcpQ,29.958542,29.702031,36.463542,32.041372,55.126803


In [None]:
# Merge features_df with original df to created final "merged_df"

# Merge image features with original DataFrame
merged_df = pd.merge(df, features_df, left_on='id', right_on='video_id', how='left')

# Drop the redundant 'video_id' column
merged_df = merged_df.drop('video_id', axis=1)

# Export the merged DataFrame to CSV
merged_df.to_csv("video_data_with_features.csv", index=False)
print("Merged data with image features exported to video_data_with_features.csv")

Merged data with image features exported to video_data_with_features.csv


In [None]:
merged_df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,avg_red,avg_green,avg_blue,brightness,contrast
0,youtube#video,EZIz8pZ8r7Y3Zwf3HpjS-_Qi5EI,UtUKi4HUBNY,UCNEI-oWOivQJ9IMn8GeCrMg,2025-06-08T20:24:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,Nimue - Juan Arenosa (Piano Cover),🎹The vibrant melody of “Nimue” by Juan Arenosa...,https://i.ytimg.com/vi/UtUKi4HUBNY/default.jpg,120,...,en-US,10412,784.0,0,73,34.080816,36.617812,43.345712,38.01478,58.72613
1,youtube#video,evXKA9D3t5Lb3hzqhZzfCCb7EH0,0PWC69jDOP8,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-31T17:19:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Cry - Cigarettes after sex (Piano Cover),🎹The vibrant melody of “cry” by cigarettes aft...,https://i.ytimg.com/vi/0PWC69jDOP8/default.jpg,120,...,en-US,28792,1414.0,0,110,37.773715,31.678142,43.084097,37.511985,63.705403
2,youtube#video,6XryOqEiZhKZ7X-SB5rpVaUCu2Y,0wIpL0cYujQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-26T20:29:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,hometown glory - adele,🎹The vibrant melody of “hometown glory” by Ade...,https://i.ytimg.com/vi/0wIpL0cYujQ/default.jpg,120,...,en-US,12000,495.0,0,72,29.380017,30.419931,41.579375,33.793108,54.808984
3,youtube#video,zPnzLnqadx_g30YMdUaz0tJ_BAI,994ZZm2aX-E,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-19T16:59:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,"Welcome and goodbye - Dream, Ivory",🎹The vibrant melody of “welcome and goodbye” b...,https://i.ytimg.com/vi/994ZZm2aX-E/default.jpg,120,...,en-US,17402,784.0,0,92,45.244774,23.591406,23.828802,30.888328,49.698068
4,youtube#video,n5AIwWOI4BTvQhipK7rJRNOvNy4,GkDEInsCcpQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-10T19:35:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Hope - Mirko Dukanovic,🎹The vibrant melody of “Hope” by Mirko Dukanov...,https://i.ytimg.com/vi/GkDEInsCcpQ/default.jpg,120,...,en-US,18358,714.0,0,103,29.958542,29.702031,36.463542,32.041372,55.126803


In [None]:
# Export features to CSV
merged_df.to_csv("video_details_v2.csv", index=False)
print("Image features exported to image_features.csv")

Image features exported to image_features.csv


# Step 2: OCR + Text Feature Extraction

In [None]:
# Install Tesseract OCR engine and text analysis libraries
!sudo apt-get install tesseract-ocr -y
!pip install pytesseract textblob

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
import pytesseract
from PIL import Image
from textblob import TextBlob
import os

def extract_text_features(img_path, video_id):
    try:
        img = Image.open(img_path)
        raw_text = pytesseract.image_to_string(img)
        word_count = len(raw_text.strip().split())
        sentiment = TextBlob(raw_text).sentiment.polarity  # -1 to 1

        return {
            "video_id": video_id,
            "thumbnail_text": raw_text.strip(),
            "text_word_count": word_count,
            "text_sentiment": sentiment
        }
    except Exception as e:
        print(f"Error on {video_id}: {e}")
        return None

# Run OCR on all thumbnails from Step 1
ocr_data = [extract_text_features(f"thumbnails/{vid}.jpg", vid) for vid in video_ids]
ocr_df = pd.DataFrame([o for o in ocr_data if o is not None])
ocr_df.head()

Unnamed: 0,video_id,thumbnail_text,text_word_count,text_sentiment
0,UtUKi4HUBNY,,0,0.0
1,0PWC69jDOP8,,0,0.0
2,0wIpL0cYujQ,,0,0.0
3,994ZZm2aX-E,,0,0.0
4,GkDEInsCcpQ,,0,0.0


In [None]:
# Merge Step 1 visual features with Step 2 OCR features
combined_df = pd.merge(features_df, ocr_df, on="video_id", how="left")
combined_df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment
0,UtUKi4HUBNY,34.080816,36.617812,43.345712,38.01478,58.72613,,0,0.0
1,0PWC69jDOP8,37.773715,31.678142,43.084097,37.511985,63.705403,,0,0.0
2,0wIpL0cYujQ,29.380017,30.419931,41.579375,33.793108,54.808984,,0,0.0
3,994ZZm2aX-E,45.244774,23.591406,23.828802,30.888328,49.698068,,0,0.0
4,GkDEInsCcpQ,29.958542,29.702031,36.463542,32.041372,55.126803,,0,0.0


In [None]:
combined_df.to_csv("video_details_v3.csv", index=False)

# Step 3: ResNet-based Image Embeddings (Final thumbnail features)

In [None]:
!pip install tensorflow pillow



In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
import numpy as np
import os

# Load ResNet50 without the final classification layer
resnet_model = ResNet50(weights="imagenet", include_top=False, pooling="avg")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
def extract_resnet_embedding(img_path):
    try:
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)

        features = resnet_model.predict(x)
        return features.flatten()
    except Exception as e:
        print(f"Error processing {img_path}: {e}")
        return None

# Generate embeddings for all thumbnails
embedding_data = []
for vid in video_ids:
    img_path = f"thumbnails/{vid}.jpg"
    features = extract_resnet_embedding(img_path)
    if features is not None:
        embedding_data.append({
            "video_id": vid,
            "embedding": features
        })

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [None]:
# Expand 2048-dim embeddings into individual columns
embedding_df = pd.DataFrame([
    {"video_id": item["video_id"], **{f"resnet_{i}": val for i, val in enumerate(item["embedding"])}}
    for item in embedding_data
])

embedding_df.head()

Unnamed: 0,video_id,resnet_0,resnet_1,resnet_2,resnet_3,resnet_4,resnet_5,resnet_6,resnet_7,resnet_8,...,resnet_2038,resnet_2039,resnet_2040,resnet_2041,resnet_2042,resnet_2043,resnet_2044,resnet_2045,resnet_2046,resnet_2047
0,UtUKi4HUBNY,0.929408,0.136946,0.0,2.21788,0.027102,0.2453,1.182714,0.694704,6.137449,...,0.114528,0.162208,1.016833,0.003332,0.869126,0.015228,0.245706,0.0,0.007246,1.304372
1,0PWC69jDOP8,1.405159,0.246543,0.020386,1.818661,0.545413,0.710962,0.620515,1.142112,5.466237,...,0.013464,0.543352,0.50019,0.137755,1.421233,0.011291,1.604342,0.136785,0.539702,0.390965
2,0wIpL0cYujQ,0.795795,0.007518,0.003491,1.529563,0.177467,0.092707,1.88847,0.508117,4.373508,...,0.033329,0.3687,0.491139,0.0,0.273961,0.0,0.212104,0.002156,0.075826,0.394722
3,994ZZm2aX-E,0.263163,0.181081,0.00991,1.114493,0.021098,0.297155,0.487551,0.401928,4.178885,...,1.205674,0.567707,0.1401,0.0,0.02506,0.0,0.821357,0.387807,0.040469,0.572708
4,GkDEInsCcpQ,1.052668,0.085881,0.0,2.600837,0.119419,0.0,0.722449,1.217966,5.448364,...,0.003399,0.076299,0.04549,0.00998,2.122664,0.026763,0.579949,0.023417,0.013444,0.511335


In [None]:
# Merge ResNet embeddings with your combined DataFrame
full_df = pd.merge(combined_df, embedding_df, on="video_id", how="left")
full_df.to_csv("video_details_v4.csv", index=False)

# Step 4: Text-based feature extraction

In [None]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details_v4.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment,resnet_0,...,resnet_2038,resnet_2039,resnet_2040,resnet_2041,resnet_2042,resnet_2043,resnet_2044,resnet_2045,resnet_2046,resnet_2047
0,rR5udw_g8MY,140.592743,131.640642,117.352292,129.861892,71.016322,,0,0.0,0.6324,...,1.652033,0.664808,0.042745,0.008452,1.696102,0.878194,0.444913,0.082426,1.004559,0.362394
1,5z9C_5yl0es,165.267865,136.143646,78.115677,126.509062,85.693186,,0,0.0,0.758998,...,3.210624,0.56691,0.910501,0.0,1.924997,0.134057,0.083707,0.363867,0.266076,0.201331
2,mve1fSGfwRQ,141.777899,151.830069,137.094392,143.567454,75.68795,,0,0.0,0.767088,...,0.704443,0.04881,0.488042,0.1903,0.161197,0.718608,0.556553,0.399344,0.657456,0.888009
3,QLeTqvcx6uY,104.694306,123.52901,71.574913,99.932743,82.475221,PD!\n7 ONS\ntae Your/Daddy\n\nsaa rat Aina air...,12,0.0,1.481094,...,2.362229,0.006484,0.637998,0.398325,0.016811,0.250087,0.553193,1.987138,1.022643,0.242517
4,hENv3EUe6f0,138.603299,173.867396,143.15434,151.875012,76.828971,y ip a»,3,0.0,0.376556,...,2.80849,0.040986,0.040346,0.0,0.369061,0.414916,0.593644,0.537817,1.485391,0.748773


In [None]:
from textblob import TextBlob
import re

# Define sets of keywords
clickbait_words = {
    "amazing", "shocking", "unbelievable", "top", "ultimate", "must",
    "insane", "you won’t believe", "secret", "revealed", "hack"
}
power_words = {
    "best", "new", "crazy", "instant", "guaranteed", "proven", "genius"
}
timed_words = {
    "now", "today", "this week", "this month", "this year", "2024", "2025"
}

def extract_title_features(title):
    blob = TextBlob(title)
    words = title.split()
    punctuation = set("!?.,:;-()[]{}")

    upper_words = [w for w in words if w.isupper()]
    letters = re.findall(r'[A-Za-z]', title)
    uppercase_letters = [c for c in letters if c.isupper()]

    clickbait_score = sum(word.lower() in clickbait_words for word in words)
    power_word_hit = any(word.lower() in power_words for word in words)
    timed_word_hit = any(word.lower() in timed_words for word in words)

    return {
        "title_sentiment": blob.sentiment.polarity,
        "title_subjectivity": blob.sentiment.subjectivity,
        "has_question": int("?" in title),
        "has_exclamation": int("!" in title),
        "starts_with_keyword": int(words[0].lower() in {"how", "why", "what", "when", "where", "who"} if words else 0),
        "title_length": len(title),
        "word_count": len(words),
        "punctuation_count": sum(1 for c in title if c in punctuation),
        "uppercase_word_count": len(upper_words),
        "percent_letters_uppercase": round(len(uppercase_letters) / len(letters), 3) if letters else 0,
        "has_numbers": int(bool(re.search(r"\d", title))),
        "clickbait_score": clickbait_score,
        "has_power_words": int(power_word_hit),
        "has_timed_words": int(timed_word_hit)
    }

In [None]:
# Load the CSV that has titles in it
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details.csv"
title_df = pd.read_csv(url)

# Extract only the title and video id from the original csv so we can join it to df by video id to add the title column
title_df = title_df[['snippet.title', 'id', 'channel_id', 'statistics.viewCount', 'statistics.likeCount', 'statistics.commentCount']]
title_df = title_df.rename(columns={'snippet.title': 'title', 'id': 'video_id', 'statistics.viewCount': 'viewCount', 'statistics.likeCount': 'likeCount', 'statistics.commentCount': 'commentCount'})
title_df.head()

Unnamed: 0,title,video_id,channel_id,viewCount,likeCount,commentCount
0,I Tried to Blind Speedrun all of Mario Kart World,rR5udw_g8MY,UCRC6cNamj9tYAO6h_RXd5xA,275430,14353.0,516
1,Nintendo Switch 2 Welcome Tour is Sad and Awful,5z9C_5yl0es,UCRC6cNamj9tYAO6h_RXd5xA,659523,34330.0,4158
2,I Became The Drift King of Mario Kart World,mve1fSGfwRQ,UCRC6cNamj9tYAO6h_RXd5xA,444844,25988.0,1230
3,"I Watched E3 2005, Twenty Years Later. It's no...",QLeTqvcx6uY,UCRC6cNamj9tYAO6h_RXd5xA,305167,16748.0,1366
4,I must Carry The Glass with CallMeKevin,hENv3EUe6f0,UCRC6cNamj9tYAO6h_RXd5xA,424552,24444.0,779


In [None]:
# Merge to add title column
df = pd.merge(df, title_df, left_on='video_id', right_on='video_id', how='left')
df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment,resnet_0,...,resnet_2043,resnet_2044,resnet_2045,resnet_2046,resnet_2047,title,channel_id,viewCount,likeCount,commentCount
0,rR5udw_g8MY,140.592743,131.640642,117.352292,129.861892,71.016322,,0,0.0,0.6324,...,0.878194,0.444913,0.082426,1.004559,0.362394,I Tried to Blind Speedrun all of Mario Kart World,UCRC6cNamj9tYAO6h_RXd5xA,275430,14353.0,516
1,5z9C_5yl0es,165.267865,136.143646,78.115677,126.509062,85.693186,,0,0.0,0.758998,...,0.134057,0.083707,0.363867,0.266076,0.201331,Nintendo Switch 2 Welcome Tour is Sad and Awful,UCRC6cNamj9tYAO6h_RXd5xA,659523,34330.0,4158
2,mve1fSGfwRQ,141.777899,151.830069,137.094392,143.567454,75.68795,,0,0.0,0.767088,...,0.718608,0.556553,0.399344,0.657456,0.888009,I Became The Drift King of Mario Kart World,UCRC6cNamj9tYAO6h_RXd5xA,444844,25988.0,1230
3,QLeTqvcx6uY,104.694306,123.52901,71.574913,99.932743,82.475221,PD!\n7 ONS\ntae Your/Daddy\n\nsaa rat Aina air...,12,0.0,1.481094,...,0.250087,0.553193,1.987138,1.022643,0.242517,"I Watched E3 2005, Twenty Years Later. It's no...",UCRC6cNamj9tYAO6h_RXd5xA,305167,16748.0,1366
4,hENv3EUe6f0,138.603299,173.867396,143.15434,151.875012,76.828971,y ip a»,3,0.0,0.376556,...,0.414916,0.593644,0.537817,1.485391,0.748773,I must Carry The Glass with CallMeKevin,UCRC6cNamj9tYAO6h_RXd5xA,424552,24444.0,779


In [None]:
# Apply the extraction function to all titles
title_feature_df = df["title"].apply(extract_title_features).apply(pd.Series)

# Merge with the original DataFrame
df = pd.concat([df, title_feature_df], axis=1)
df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment,resnet_0,...,starts_with_keyword,title_length,word_count,punctuation_count,uppercase_word_count,percent_letters_uppercase,has_numbers,clickbait_score,has_power_words,has_timed_words
0,rR5udw_g8MY,140.592743,131.640642,117.352292,129.861892,71.016322,,0,0.0,0.6324,...,0.0,49.0,10.0,0.0,1.0,0.175,0.0,0.0,0.0,0.0
1,5z9C_5yl0es,165.267865,136.143646,78.115677,126.509062,85.693186,,0,0.0,0.758998,...,0.0,47.0,9.0,0.0,0.0,0.158,1.0,0.0,0.0,0.0
2,mve1fSGfwRQ,141.777899,151.830069,137.094392,143.567454,75.68795,,0,0.0,0.767088,...,0.0,43.0,9.0,0.0,1.0,0.229,0.0,0.0,0.0,0.0
3,QLeTqvcx6uY,104.694306,123.52901,71.574913,99.932743,82.475221,PD!\n7 ONS\ntae Your/Daddy\n\nsaa rat Aina air...,12,0.0,1.481094,...,0.0,57.0,11.0,2.0,2.0,0.179,1.0,0.0,0.0,0.0
4,hENv3EUe6f0,138.603299,173.867396,143.15434,151.875012,76.828971,y ip a»,3,0.0,0.376556,...,0.0,39.0,7.0,0.0,1.0,0.212,0.0,1.0,0.0,0.0


In [None]:
# Identify ResNet columns
resnet_cols = [col for col in df.columns if "resnet_" in col]

# Identify non-ResNet columns
other_cols = [col for col in df.columns if col not in resnet_cols]

# Create a new list of columns with ResNet columns at the end
new_column_order = other_cols + resnet_cols

# Reindex the DataFrame with the new column order
df = df.reindex(columns=new_column_order)

df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment,title,...,resnet_2038,resnet_2039,resnet_2040,resnet_2041,resnet_2042,resnet_2043,resnet_2044,resnet_2045,resnet_2046,resnet_2047
0,rR5udw_g8MY,140.592743,131.640642,117.352292,129.861892,71.016322,,0,0.0,I Tried to Blind Speedrun all of Mario Kart World,...,1.652033,0.664808,0.042745,0.008452,1.696102,0.878194,0.444913,0.082426,1.004559,0.362394
1,5z9C_5yl0es,165.267865,136.143646,78.115677,126.509062,85.693186,,0,0.0,Nintendo Switch 2 Welcome Tour is Sad and Awful,...,3.210624,0.56691,0.910501,0.0,1.924997,0.134057,0.083707,0.363867,0.266076,0.201331
2,mve1fSGfwRQ,141.777899,151.830069,137.094392,143.567454,75.68795,,0,0.0,I Became The Drift King of Mario Kart World,...,0.704443,0.04881,0.488042,0.1903,0.161197,0.718608,0.556553,0.399344,0.657456,0.888009
3,QLeTqvcx6uY,104.694306,123.52901,71.574913,99.932743,82.475221,PD!\n7 ONS\ntae Your/Daddy\n\nsaa rat Aina air...,12,0.0,"I Watched E3 2005, Twenty Years Later. It's no...",...,2.362229,0.006484,0.637998,0.398325,0.016811,0.250087,0.553193,1.987138,1.022643,0.242517
4,hENv3EUe6f0,138.603299,173.867396,143.15434,151.875012,76.828971,y ip a»,3,0.0,I must Carry The Glass with CallMeKevin,...,2.80849,0.040986,0.040346,0.0,0.369061,0.414916,0.593644,0.537817,1.485391,0.748773


In [46]:
# Export final dataset with image + text features
df.to_csv("video_details_v5.csv", index=False)

# Step 5: Labeling 'virality' using target variables

In [39]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details_v5.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment,title,...,resnet_2038,resnet_2039,resnet_2040,resnet_2041,resnet_2042,resnet_2043,resnet_2044,resnet_2045,resnet_2046,resnet_2047
0,rR5udw_g8MY,140.592743,131.640642,117.352292,129.861892,71.016322,,0,0.0,I Tried to Blind Speedrun all of Mario Kart World,...,1.652033,0.664808,0.042745,0.008452,1.696102,0.878194,0.444913,0.082426,1.004559,0.362394
1,5z9C_5yl0es,165.267865,136.143646,78.115677,126.509062,85.693186,,0,0.0,Nintendo Switch 2 Welcome Tour is Sad and Awful,...,3.210624,0.56691,0.910501,0.0,1.924997,0.134057,0.083707,0.363867,0.266076,0.201331
2,mve1fSGfwRQ,141.777899,151.830069,137.094392,143.567454,75.68795,,0,0.0,I Became The Drift King of Mario Kart World,...,0.704443,0.04881,0.488042,0.1903,0.161197,0.718608,0.556553,0.399344,0.657456,0.888009
3,QLeTqvcx6uY,104.694306,123.52901,71.574913,99.932743,82.475221,PD!\n7 ONS\ntae Your/Daddy\n\nsaa rat Aina air...,12,0.0,"I Watched E3 2005, Twenty Years Later. It's no...",...,2.362229,0.006484,0.637998,0.398325,0.016811,0.250087,0.553193,1.987138,1.022643,0.242517
4,hENv3EUe6f0,138.603299,173.867396,143.15434,151.875012,76.828971,y ip a»,3,0.0,I must Carry The Glass with CallMeKevin,...,2.80849,0.040986,0.040346,0.0,0.369061,0.414916,0.593644,0.537817,1.485391,0.748773


In [40]:
# Ensure proper numeric types
df["viewCount"] = pd.to_numeric(df["viewCount"], errors="coerce")
df["likeCount"] = pd.to_numeric(df["likeCount"], errors="coerce")
df["commentCount"] = pd.to_numeric(df["commentCount"], errors="coerce")

# Optional: drop rows with missing values
df = df.dropna(subset=["viewCount", "likeCount", "commentCount"])

In [41]:
percentile = 0.85

# Select the numeric columns *before* grouping and applying quantile
thresholds = df[["channel_id", "viewCount", "likeCount", "commentCount"]].groupby("channel_id").quantile(percentile)[
    ["viewCount", "likeCount", "commentCount"]
].rename(columns={
    "viewCount": "views_threshold",
    "likeCount": "likes_threshold",
    "commentCount": "comments_threshold"
}).reset_index()

In [42]:
# Merge thresholds into main dataframe (only run once)
df = df.merge(thresholds, on="channel_id", how="left")
df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment,title,...,resnet_2041,resnet_2042,resnet_2043,resnet_2044,resnet_2045,resnet_2046,resnet_2047,views_threshold,likes_threshold,comments_threshold
0,rR5udw_g8MY,140.592743,131.640642,117.352292,129.861892,71.016322,,0,0.0,I Tried to Blind Speedrun all of Mario Kart World,...,0.008452,1.696102,0.878194,0.444913,0.082426,1.004559,0.362394,1929057.65,60045.7,2797.75
1,5z9C_5yl0es,165.267865,136.143646,78.115677,126.509062,85.693186,,0,0.0,Nintendo Switch 2 Welcome Tour is Sad and Awful,...,0.0,1.924997,0.134057,0.083707,0.363867,0.266076,0.201331,1929057.65,60045.7,2797.75
2,mve1fSGfwRQ,141.777899,151.830069,137.094392,143.567454,75.68795,,0,0.0,I Became The Drift King of Mario Kart World,...,0.1903,0.161197,0.718608,0.556553,0.399344,0.657456,0.888009,1929057.65,60045.7,2797.75
3,QLeTqvcx6uY,104.694306,123.52901,71.574913,99.932743,82.475221,PD!\n7 ONS\ntae Your/Daddy\n\nsaa rat Aina air...,12,0.0,"I Watched E3 2005, Twenty Years Later. It's no...",...,0.398325,0.016811,0.250087,0.553193,1.987138,1.022643,0.242517,1929057.65,60045.7,2797.75
4,hENv3EUe6f0,138.603299,173.867396,143.15434,151.875012,76.828971,y ip a»,3,0.0,I must Carry The Glass with CallMeKevin,...,0.0,0.369061,0.414916,0.593644,0.537817,1.485391,0.748773,1929057.65,60045.7,2797.75


In [43]:
# Label virality using thresholds
df["viral"] = (
    (df["viewCount"] > df["views_threshold"]) &
    (df["likeCount"] > df["likes_threshold"]) &
    (df["commentCount"] > df["comments_threshold"])
).astype(int)

In [44]:
# Drop the threshold columns from the main DataFrame
df = df.drop(["views_threshold", "likes_threshold", "comments_threshold"], axis=1)

df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment,title,...,resnet_2039,resnet_2040,resnet_2041,resnet_2042,resnet_2043,resnet_2044,resnet_2045,resnet_2046,resnet_2047,viral
0,rR5udw_g8MY,140.592743,131.640642,117.352292,129.861892,71.016322,,0,0.0,I Tried to Blind Speedrun all of Mario Kart World,...,0.664808,0.042745,0.008452,1.696102,0.878194,0.444913,0.082426,1.004559,0.362394,0
1,5z9C_5yl0es,165.267865,136.143646,78.115677,126.509062,85.693186,,0,0.0,Nintendo Switch 2 Welcome Tour is Sad and Awful,...,0.56691,0.910501,0.0,1.924997,0.134057,0.083707,0.363867,0.266076,0.201331,0
2,mve1fSGfwRQ,141.777899,151.830069,137.094392,143.567454,75.68795,,0,0.0,I Became The Drift King of Mario Kart World,...,0.04881,0.488042,0.1903,0.161197,0.718608,0.556553,0.399344,0.657456,0.888009,0
3,QLeTqvcx6uY,104.694306,123.52901,71.574913,99.932743,82.475221,PD!\n7 ONS\ntae Your/Daddy\n\nsaa rat Aina air...,12,0.0,"I Watched E3 2005, Twenty Years Later. It's no...",...,0.006484,0.637998,0.398325,0.016811,0.250087,0.553193,1.987138,1.022643,0.242517,0
4,hENv3EUe6f0,138.603299,173.867396,143.15434,151.875012,76.828971,y ip a»,3,0.0,I must Carry The Glass with CallMeKevin,...,0.040986,0.040346,0.0,0.369061,0.414916,0.593644,0.537817,1.485391,0.748773,0


In [45]:
# Calculate the percentage of viral videos
viral_percentage = (df["viral"].sum() / len(df)) * 100

print(f"Percentage of viral videos: {viral_percentage:.2f}%")

Percentage of viral videos: 8.15%


In [47]:
# Export final dataset with image + text features
df.to_csv("video_details_v5.csv", index=False)