<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality-Testing/blob/main/notebooks/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extraction: Run as group


In [192]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality-Testing/refs/heads/main/data/javathecraft.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.tags,snippet.categoryId,snippet.liveBroadcastContent,snippet.localized.title,snippet.localized.description,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount
0,youtube#video,b34ChkfyVzp7gCbnFTjnVzKde0s,p-0jT5i-ndc,UCwNWbByE5rbxEW9XXdeoCVg,2025-05-02T18:50:26Z,UCwNWbByE5rbxEW9XXdeoCVg,This is JUST an AFK Platform [Hardcore Minecraft],Hardcore Minecraft is something I never dared ...,https://i.ytimg.com/vi/p-0jT5i-ndc/default.jpg,120,...,"['minecraft', 'minecraft hardcore', 'minecraft...",20,none,This is JUST an AFK Platform [Hardcore Minecraft],Hardcore Minecraft is something I never dared ...,en-US,35761,1816,0,158
1,youtube#video,ZTKo9A7zbms4ZnbYOIXUxShp8WA,1h0IHzqnLfs,UCwNWbByE5rbxEW9XXdeoCVg,2024-11-19T14:00:02Z,UCwNWbByE5rbxEW9XXdeoCVg,I Spent a Year Building a Mountaintop Japanese...,Hardcore Minecraft is something I never dared ...,https://i.ytimg.com/vi/1h0IHzqnLfs/default.jpg,120,...,"['minecraft', 'minecraft hardcore', 'minecraft...",20,none,I Spent a Year Building a Mountaintop Japanese...,Hardcore Minecraft is something I never dared ...,en-US,44698,2862,0,217
2,youtube#video,9AgN5lSS0P9iRGad4DXANU2DW-E,Sf5f7OwoMnQ,UCwNWbByE5rbxEW9XXdeoCVg,2023-05-14T14:00:25Z,UCwNWbByE5rbxEW9XXdeoCVg,I Built All This JUST For a Nether Portal in H...,Hardcore Minecraft is something I never dared ...,https://i.ytimg.com/vi/Sf5f7OwoMnQ/default.jpg,120,...,"['minecraft', 'minecraft hardcore', 'minecraft...",20,none,I Built All This JUST For a Nether Portal in H...,Hardcore Minecraft is something I never dared ...,en-US,123912,8740,0,659
3,youtube#video,I6Si0mOXPTyhb11cy-6PUQDNtnM,uqOucsxf3ok,UCwNWbByE5rbxEW9XXdeoCVg,2023-04-22T14:00:29Z,UCwNWbByE5rbxEW9XXdeoCVg,Becoming Immortal in Hardcore Minecraft with U...,Hardcore Minecraft is something I never dared ...,https://i.ytimg.com/vi/uqOucsxf3ok/default.jpg,120,...,"['minecraft', 'minecraft hardcore', 'minecraft...",20,none,Becoming Immortal in Hardcore Minecraft with U...,Hardcore Minecraft is something I never dared ...,en-US,130833,7681,0,403
4,youtube#video,rfmdlqdVNzFWbRXhO9dE6xAih1E,9_58KM4Mhls,UCwNWbByE5rbxEW9XXdeoCVg,2023-04-08T14:00:05Z,UCwNWbByE5rbxEW9XXdeoCVg,I Built a Japanese Ryokan-Styled Villager Bree...,Hardcore Minecraft is something I never dared ...,https://i.ytimg.com/vi/9_58KM4Mhls/default.jpg,120,...,"['minecraft', 'minecraft hardcore', 'minecraft...",20,none,I Built a Japanese Ryokan-Styled Villager Bree...,Hardcore Minecraft is something I never dared ...,en-US,269549,13507,0,585


In [193]:
# Extract video IDs
video_ids = df["id"].dropna().unique().tolist()
print(f"Found {len(video_ids)} video IDs.")

# Download thumbnails
import os
import requests
from PIL import Image
from io import BytesIO

os.makedirs("thumbnails", exist_ok=True)

def download_thumbnail(video_id):
    url = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
    path = f"thumbnails/{video_id}.jpg"
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.save(path)
        return path
    except Exception as e:
        print(f"Error downloading {video_id}: {e}")
        return None

paths = [download_thumbnail(vid) for vid in video_ids]

Found 6 video IDs.


In [194]:
# Extract visual features
import numpy as np

def extract_features(img_path, video_id):
    try:
        img = Image.open(img_path).convert('RGB')
        img_array = np.array(img)

        avg_color = np.mean(img_array, axis=(0, 1))
        brightness = np.mean(img_array)
        contrast = np.std(img_array)

        return {
            "video_id": video_id,
            "avg_red": avg_color[0],
            "avg_green": avg_color[1],
            "avg_blue": avg_color[2],
            "brightness": brightness,
            "contrast": contrast
        }
    except Exception as e:
        print(f"Failed to extract for {video_id}: {e}")
        return None

feature_data = [extract_features(f"thumbnails/{vid}.jpg", vid) for vid in video_ids]
features_df = pd.DataFrame([f for f in feature_data if f is not None])
features_df.head()

# Merge image features with original DataFrame
merged_df = pd.merge(df, features_df, left_on='id', right_on='video_id', how='left')

# Drop the redundant 'video_id' column
merged_df = merged_df.drop('video_id', axis=1)

df = merged_df

In [195]:
from textblob import TextBlob
import re

# Define sets of keywords
clickbait_words = {
    "amazing", "incredible", "shocking", "jaw-dropping", "mind-blowing",
    "unbelievable", "you won’t believe", "you’ll never guess", "what happens next",
    "epic", "ultimate", "must", "insane", "secret", "exposed", "revealed",
    "hack", "these reasons", "10 reasons", "this trick", "don’t miss",
    "game changer", "craziest", "revealed", "the truth about", "deal of the day"
}

power_words = {
    "best", "top", "new", "essential", "easy", "quick", "instant", "effortless",
    "guaranteed", "proven", "genius", "exclusive", "remarkable", "powerful",
    "revolutionary", "breakthrough", "must-have", "unlock", "master", "ultimate",
    "secret", "simple", "transform", "hacks", "tips", "tricks"
}

timed_words = {
    "now", "today", "just now", "breaking", "this morning", "this afternoon",
    "this evening", "tonight", "this week", "this weekend", "this month",
    "this season", "this year", "last minute", "last week", "2023", "2024",
    "2025", "coming soon", "newly released", "upcoming", "recent", "daily",
    "weekly", "monthly", "yearly"
}

def extract_title_features(title):
    blob = TextBlob(title)
    words = title.split()
    punctuation = set("!?.,:;-()[]{}")

    upper_words = [w for w in words if w.isupper()]
    letters = re.findall(r'[A-Za-z]', title)
    uppercase_letters = [c for c in letters if c.isupper()]
    digits = re.findall(r'\d', title)


    clickbait_score = sum(word.lower() in clickbait_words for word in words)
    power_word_count = sum(word.lower() in power_words for word in words)
    timed_word_count = sum(word.lower() in timed_words for word in words)


    return {
        "title_sentiment": blob.sentiment.polarity,
        "title_subjectivity": blob.sentiment.subjectivity,
        "num_question_marks": title.count("?"),
        "num_exclamation_marks": title.count("!"),
        "starts_with_keyword": int(words[0].lower() in {"how", "why", "what", "when", "where", "who"} if words else 0),
        "title_length": len(title),
        "word_count": len(words),
        "punctuation_count": sum(1 for c in title if c in punctuation),
        "uppercase_word_count": len(upper_words),
        "percent_letters_uppercase": round(len(uppercase_letters) / len(letters), 3) if letters else 0,
        "num_digits": len(digits),
        "clickbait_score": clickbait_score,
        "num_power_words": power_word_count,
        "num_timed_words": timed_word_count
    }

In [196]:
# Load the CSV that has titles in it
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/javathecraft.csv"
title_df = pd.read_csv(url)

# Extract only the title and video id from the original csv so we can join it to df by video id to add the title column
title_df = title_df[['snippet.title', 'id', 'channel_id', 'statistics.viewCount', 'statistics.likeCount', 'statistics.commentCount']]
title_df = title_df.rename(columns={'snippet.title': 'title', 'id': 'video_id', 'statistics.viewCount': 'viewCount', 'statistics.likeCount': 'likeCount', 'statistics.commentCount': 'commentCount'})
title_df.head()

# Merge to add title column
df = pd.merge(df, title_df, left_on='id', right_on='video_id', how='left')
df.head()

# Apply the extraction function to all titles
title_feature_df = df["title"].apply(extract_title_features).apply(pd.Series)

# Merge with the original DataFrame
df = pd.concat([df, title_feature_df], axis=1)
df.head()

df = df.rename(columns={'snippet.channelId': 'channel_id'})

In [197]:
# "High performance" threshold
percentile = 0.75

# Select the numeric columns *before* grouping and applying quantile
thresholds = df[["channel_id", "viewCount", "likeCount", "commentCount"]].groupby("channel_id").quantile(percentile)[
    ["viewCount", "likeCount", "commentCount"]
].rename(columns={
    "viewCount": "views_threshold",
    "likeCount": "likes_threshold",
    "commentCount": "comments_threshold"
}).reset_index()

# Merge thresholds into main dataframe (only run once)
df = df.merge(thresholds, on="channel_id", how="left")
df.head()

# Label virality using thresholds
df["viral"] = (
    (df["viewCount"] > df["views_threshold"]) &
    (df["likeCount"] > df["likes_threshold"]) &
    (df["commentCount"] > df["comments_threshold"])
).astype(int)

# Drop the threshold columns from the main DataFrame
df = df.drop(["views_threshold", "likes_threshold", "comments_threshold"], axis=1)

df.head()

# (OPTIONAL FOR DEBUGGING) Calculate the percentage of viral videos
viral_percentage = (df["viral"].sum() / len(df)) * 100

print(f"Percentage of viral videos: {viral_percentage:.2f}%")

Percentage of viral videos: 16.67%


In [198]:
# Dropping bad columns/features
columns_to_drop = ['kind',
                   'etag',
                   'channel_id_x',
                   'snippet.publishedAt',
                   'snippet.title',
                   'snippet.thumbnails.default.url',
                   'snippet.thumbnails.default.width',
                   'snippet.thumbnails.default.height',
                   'snippet.thumbnails.medium.url',
                   'snippet.thumbnails.medium.width',
                   'snippet.thumbnails.medium.height',
                   'snippet.thumbnails.high.url',
                   'snippet.thumbnails.high.width',
                   'snippet.thumbnails.high.height',
                   'snippet.thumbnails.standard.url',
                   'snippet.thumbnails.standard.width',
                   'snippet.thumbnails.standard.height',
                   'snippet.thumbnails.maxres.url',
                   'snippet.thumbnails.maxres.width',
                   'snippet.thumbnails.maxres.height',
                   'statistics.viewCount',
                   'statistics.likeCount',
                   'statistics.commentCount',
                   'snippet.channelTitle',
                   'snippet.categoryId',
                   'snippet.liveBroadcastContent',
                   'snippet.defaultAudioLanguage',
                  #  'snippet.defaultLanguage',
                   'title',
                   'channel_id_x',
                   'channel_id_y',
                   'snippet.localized.description',
                   'statistics.favoriteCount',
                   'id']
df = df.drop(columns=columns_to_drop, axis=1)

In [199]:
# Renaming columns for prettier features
df = df.rename(columns={'snippet.description': 'description', 'snippet.localized.title': 'title', 'statistics.viewCount': 'viewCount', 'statistics.likeCount': 'likeCount', 'statistics.commentCount': 'commentCount', 'snippet.tags': 'tags'})

In [200]:
### Description-based features

import re
from textblob import TextBlob

# Fill NA for descriptions
df['description'] = df['description'].fillna('')

# Description length
df['description_length'] = df['description'].apply(len)

# Description sentiment
df['description_sentiment'] = df['description'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Description has keywords
keywords = ['subscribe', 'giveaway', 'limited time', 'offer', 'new video']
df['description_has_keywords'] = df['description'].apply(lambda x: any(kw in x.lower() for kw in keywords))

In [201]:
### Tag-based features

# Fill NA for tags
df['tags'] = df['tags'].fillna('').astype(str)

# Tag count
df['tag_count'] = df['tags'].apply(lambda x: len(x.split(',')) if x else 0)

# Tag sentiment
df['tag_sentiment'] = df['tags'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Unique tags
df['num_unique_tags'] = df['tags'].apply(lambda x: len(set(tag.strip().lower() for tag in x.split(','))) if x else 0)

In [202]:
!pip install textstat



In [203]:
### Title-based Semantic features

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import textstat

model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute title embeddings
df['title_embedding'] = df['title'].fillna('').apply(lambda x: model.encode(x, convert_to_tensor=False))

# Embedding distance to known viral videos
viral_embeddings = df[df['viral'] == 1]['title_embedding'].tolist()

def max_cosine_similarity(vec, others):
    return np.max(cosine_similarity([vec], others)[0]) if others else 0

# Clickbait phrase match
clickbait_phrases = ['you won’t believe', 'what happens next', 'this is why', 'top secret', 'never seen before']
df['clickbait_phrase_match'] = df['title'].fillna('').str.lower().apply(lambda t: any(p in t for p in clickbait_phrases)).astype(int)

# Readability score
df['title_readability'] = df['title'].fillna('').apply(lambda x: textstat.flesch_reading_ease(x))

# Listicle and tutorial flags
df['is_listicle'] = df['title'].str.strip().str.lower().str.match(r'^\d+').astype(int)
df['is_tutorial'] = df['title'].str.lower().str.startswith('how to').astype(int)

In [204]:
### Thumbnail-based features
import cv2
import os
from collections import Counter
from matplotlib.colors import rgb_to_hsv

# Path to thumbnail images, assumes filename = <video_id>.jpg
thumbnail_dir = "/content/thumbnails/"

# Face detection setup
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def get_thumbnail_features(video_id):
    path = os.path.join(thumbnail_dir, f"{video_id}.jpg")
    if not os.path.exists(path):
        return pd.Series([0, -1, 0.0], index=['num_faces', 'dominant_color_hue', 'thumbnail_edge_density'])

    img = cv2.imread(path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Face count
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    num_faces = len(faces)

    # Dominant color hue
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hue_channel = hsv[:, :, 0].flatten()
    most_common_hue = Counter(hue_channel).most_common(1)[0][0]

    # Edge density
    edges = cv2.Canny(gray, 100, 200)
    edge_density = np.sum(edges > 0) / edges.size

    return pd.Series([num_faces, most_common_hue, edge_density], index=['num_faces', 'dominant_color_hue', 'thumbnail_edge_density'])

df[['num_faces', 'dominant_color_hue', 'thumbnail_edge_density']] = df['video_id'].apply(get_thumbnail_features)

In [205]:
### Word Count Refinement

# Power/timed word lists
power_words = ['secret', 'shocking', 'revealed', 'proven', 'insane']
timed_words = ['today', 'now', 'soon', 'in 24 hours', 'this week']

def count_matches(text, word_list):
    return sum(word in text.lower() for word in word_list)

df['power_word_count'] = df['title'].fillna('').apply(lambda x: count_matches(x, power_words))
df['timed_word_count'] = df['title'].fillna('').apply(lambda x: count_matches(x, timed_words))
df['num_tags'] = df['tags'].apply(lambda x: len(x.split(',')) if x else 0)
df['avg_tag_length'] = df['tags'].apply(lambda x: np.mean([len(tag) for tag in x.split(',')]) if x else 0)

In [206]:
# Sanity check to see if less interesting features are at all useful

timed_word_count_sum = df['timed_word_count'].sum()
power_word_count_sum = df['power_word_count'].sum()
is_listicle_sum = df['is_listicle'].sum()
is_tutorial_sum = df['is_tutorial'].sum()
num_faces_sum = df['num_faces'].sum()
true_clickbait_matches = df['clickbait_phrase_match'].sum()
print(f"Total number of timed_word_count values: {timed_word_count_sum}")
print(f"Total number of power_word_count values: {power_word_count_sum}")
print(f"Total number of is_listicle values: {is_listicle_sum}")
print(f"Total number of is_tutorial values: {is_tutorial_sum}")
print(f"Total number of num_faces values: {num_faces_sum}")
print(f"Total number of true clickbait_phrase_match values: {true_clickbait_matches}")

Total number of timed_word_count values: 0
Total number of power_word_count values: 0
Total number of is_listicle values: 0
Total number of is_tutorial values: 0
Total number of num_faces values: 2.0
Total number of true clickbait_phrase_match values: 0


In [207]:
pip install pytrends



In [208]:
# Trending score calculation
from typing import List
from pytrends.request import TrendReq
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def get_google_trend_score(keywords: List[str]) -> float:
    """
    Fetches daily interest for given keywords from Google Trends and returns
    the normalized average score (0.0 to 1.0).
    """
    try:
        pytrends = TrendReq()
        pytrends.build_payload(keywords, timeframe='now 1-d')
        data = pytrends.interest_over_time()
        if data.empty:
            return 0.0
        return float(data[keywords].iloc[-1].mean() / 100)
    except Exception:
        return 0.0


def get_twitter_trend_score(keywords: List[str]) -> float:
    """
    Scrapes Twitter's trending topics page and returns a normalized
    count of how many trending hashtags match the provided keywords.
    """
    try:
        url = "https://twitter.com/explore/tabs/trending"
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, "html.parser")
        hashtags = [tag.get_text().lstrip("#") for tag in soup.find_all("span")]
        matches = sum(
            1 for kw in keywords
            if any(kw.lower() in h.lower() for h in hashtags)
        )
        max_count = len(hashtags) or 1
        return matches / max_count
    except Exception:
        return 0.0


def get_youtube_trending_similarity(keywords: List[str]) -> float:
    """
    Scrapes YouTube's trending page and computes the maximum cosine similarity
    between your video's keywords and any trending video title.
    """
    try:
        resp = requests.get("https://www.youtube.com/feed/trending")
        soup = BeautifulSoup(resp.text, "html.parser")
        titles = [t.get('title') for t in soup.select('a#video-title') if t.get('title')]
        model = SentenceTransformer('all-MiniLM-L6-v2')
        emb = model.encode(" ".join(keywords))
        sims = [
            cosine_similarity([emb], [model.encode(t)])[0][0]
            for t in titles
        ]
        return max(sims) if sims else 0.0
    except Exception:
        return 0.0


def calculate_trending_score(keywords: List[str], w1: float = 0.4, w2: float = 0.3, w3: float = 0.3) -> float:
    """
    Combines Google Trends, Twitter, and YouTube similarity scores
    into a single trending_score.
    """
    g = get_google_trend_score(keywords)
    t = get_twitter_trend_score(keywords)
    y = get_youtube_trending_similarity(keywords)
    return round(w1 * g + w2 * t + w3 * y, 4)

# Apply function to make 'trending_score' feature
df['trending_score'] = df['title'].apply(calculate_trending_score)
df.head()

Unnamed: 0,channel_id,description,tags,title,avg_red,avg_green,avg_blue,brightness,contrast,video_id,...,is_listicle,is_tutorial,num_faces,dominant_color_hue,thumbnail_edge_density,power_word_count,timed_word_count,num_tags,avg_tag_length,trending_score
0,UCwNWbByE5rbxEW9XXdeoCVg,Hardcore Minecraft is something I never dared ...,"['minecraft', 'minecraft hardcore', 'minecraft...",This is JUST an AFK Platform [Hardcore Minecraft],164.887865,158.869618,176.134826,166.63077,52.270927,p-0jT5i-ndc,...,0,0,0.0,2.0,0.139201,0,0,22,22.636364,0.0
1,UCwNWbByE5rbxEW9XXdeoCVg,Hardcore Minecraft is something I never dared ...,"['minecraft', 'minecraft hardcore', 'minecraft...",I Spent a Year Building a Mountaintop Japanese...,98.027205,100.225382,97.791094,98.681227,78.515018,1h0IHzqnLfs,...,0,0,0.0,105.0,0.176007,0,0,21,22.904762,0.0
2,UCwNWbByE5rbxEW9XXdeoCVg,Hardcore Minecraft is something I never dared ...,"['minecraft', 'minecraft hardcore', 'minecraft...",I Built All This JUST For a Nether Portal in H...,126.868229,130.835451,127.853108,128.518929,76.582689,Sf5f7OwoMnQ,...,0,0,1.0,111.0,0.138472,0,0,22,22.863636,0.0
3,UCwNWbByE5rbxEW9XXdeoCVg,Hardcore Minecraft is something I never dared ...,"['minecraft', 'minecraft hardcore', 'minecraft...",Becoming Immortal in Hardcore Minecraft with U...,109.28599,122.795677,108.24592,113.442529,76.167648,uqOucsxf3ok,...,0,0,0.0,0.0,0.167274,0,0,22,22.681818,0.0
4,UCwNWbByE5rbxEW9XXdeoCVg,Hardcore Minecraft is something I never dared ...,"['minecraft', 'minecraft hardcore', 'minecraft...",I Built a Japanese Ryokan-Styled Villager Bree...,116.008108,113.065243,99.326667,109.466672,77.084797,9_58KM4Mhls,...,0,0,0.0,13.0,0.187743,0,0,22,22.909091,0.0


In [209]:
# Remove description features
columns_to_drop = ['description', 'description_length', 'description_sentiment', 'description_has_keywords']
df = df.drop(columns=columns_to_drop, axis=1)

In [210]:
# Leave only features, remove conceptual data (title, tags, etc.)
columns_to_drop = ['title', 'tags', 'video_id', 'channel_id']
df = df.drop(columns=columns_to_drop, axis=1)

In [211]:
# Remove 'title_embedding' until we can find a way to use it with our model
columns_to_drop = ['title_embedding']
df = df.drop(columns=columns_to_drop, axis=1)

In [212]:
print(df.columns)

Index(['avg_red', 'avg_green', 'avg_blue', 'brightness', 'contrast',
       'viewCount', 'likeCount', 'commentCount', 'title_sentiment',
       'title_subjectivity', 'num_question_marks', 'num_exclamation_marks',
       'starts_with_keyword', 'title_length', 'word_count',
       'punctuation_count', 'uppercase_word_count',
       'percent_letters_uppercase', 'num_digits', 'clickbait_score',
       'num_power_words', 'num_timed_words', 'viral', 'tag_count',
       'tag_sentiment', 'num_unique_tags', 'clickbait_phrase_match',
       'title_readability', 'is_listicle', 'is_tutorial', 'num_faces',
       'dominant_color_hue', 'thumbnail_edge_density', 'power_word_count',
       'timed_word_count', 'num_tags', 'avg_tag_length', 'trending_score'],
      dtype='object')


In [213]:
# Export
df.to_csv("feature_data.csv", index=False)