<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality/blob/main/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Download thumbnails and extract simple visual features


In [1]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.tags,snippet.categoryId,snippet.liveBroadcastContent,snippet.localized.title,snippet.localized.description,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",22,none,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,en-GB,63043,1751,0,25
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",22,none,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,en-GB,82459,3121,0,41
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",22,none,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,en-GB,219370,7947,0,48
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",22,none,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,en-GB,260149,9906,0,36
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",22,none,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,en-GB,182946,7056,0,44


In [2]:
# Extract video IDs
video_ids = df["id"].dropna().unique().tolist()
print(f"Found {len(video_ids)} video IDs.")

Found 1287 video IDs.


In [3]:
# Download thumbnails
import os
import requests
from PIL import Image
from io import BytesIO

os.makedirs("thumbnails", exist_ok=True)

def download_thumbnail(video_id):
    url = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
    path = f"thumbnails/{video_id}.jpg"
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.save(path)
        return path
    except Exception as e:
        print(f"Error downloading {video_id}: {e}")
        return None

paths = [download_thumbnail(vid) for vid in video_ids]


In [4]:
# Extract visual features
import numpy as np

def extract_features(img_path, video_id):
    try:
        img = Image.open(img_path).convert('RGB')
        img_array = np.array(img)

        avg_color = np.mean(img_array, axis=(0, 1))
        brightness = np.mean(img_array)
        contrast = np.std(img_array)

        return {
            "video_id": video_id,
            "avg_red": avg_color[0],
            "avg_green": avg_color[1],
            "avg_blue": avg_color[2],
            "brightness": brightness,
            "contrast": contrast
        }
    except Exception as e:
        print(f"Failed to extract for {video_id}: {e}")
        return None

feature_data = [extract_features(f"thumbnails/{vid}.jpg", vid) for vid in video_ids]
features_df = pd.DataFrame([f for f in feature_data if f is not None])
features_df.head()


Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast
0,ehEhzzttOvI,73.374931,65.419201,77.057917,71.950683,63.507962
1,Zz--BUJ4VuU,88.901267,74.133906,71.210868,78.082014,53.159171
2,n9HpZHxhkVw,83.173628,83.075104,83.087812,83.112182,65.593829
3,Txk6EyuHi0A,61.804375,48.056736,54.434601,54.765237,57.291874
4,7WVjn7yB4-Q,31.690226,19.795156,22.93283,24.806071,28.240555


In [5]:
# Merge features_df with original df to created final "merged_df"

# Merge image features with original DataFrame
merged_df = pd.merge(df, features_df, left_on='id', right_on='video_id', how='left')

# Drop the redundant 'video_id' column
merged_df = merged_df.drop('video_id', axis=1)

In [6]:
merged_df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,avg_red,avg_green,avg_blue,brightness,contrast
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,en-GB,63043,1751,0,25,73.374931,65.419201,77.057917,71.950683,63.507962
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,en-GB,82459,3121,0,41,88.901267,74.133906,71.210868,78.082014,53.159171
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,en-GB,219370,7947,0,48,83.173628,83.075104,83.087812,83.112182,65.593829
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,en-GB,260149,9906,0,36,61.804375,48.056736,54.434601,54.765237,57.291874
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,en-GB,182946,7056,0,44,31.690226,19.795156,22.93283,24.806071,28.240555


In [7]:
# Export features to CSV
merged_df.to_csv("video_details_v2.csv", index=False)
print("Image features exported to image_features.csv")

Image features exported to image_features.csv


# Step 2: Text-based feature extraction

In [8]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details_v2.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,avg_red,avg_green,avg_blue,brightness,contrast
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,en-GB,63043,1751,0,25,73.374931,65.419201,77.057917,71.950683,63.507962
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,en-GB,82459,3121,0,41,88.901267,74.133906,71.210868,78.082014,53.159171
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,en-GB,219370,7947,0,48,83.173628,83.075104,83.087812,83.112182,65.593829
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,en-GB,260149,9906,0,36,61.804375,48.056736,54.434601,54.765237,57.291874
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,en-GB,182946,7056,0,44,31.690226,19.795156,22.93283,24.806071,28.240555


In [9]:
from textblob import TextBlob
import re

# Define sets of keywords
clickbait_words = {
    "amazing", "shocking", "unbelievable", "top", "ultimate", "must",
    "insane", "you won‚Äôt believe", "secret", "revealed", "hack"
}
power_words = {
    "best", "new", "crazy", "instant", "guaranteed", "proven", "genius"
}
timed_words = {
    "now", "today", "this week", "this month", "this year", "2024", "2025"
}

def extract_title_features(title):
    blob = TextBlob(title)
    words = title.split()
    punctuation = set("!?.,:;-()[]{}")

    upper_words = [w for w in words if w.isupper()]
    letters = re.findall(r'[A-Za-z]', title)
    uppercase_letters = [c for c in letters if c.isupper()]

    clickbait_score = sum(word.lower() in clickbait_words for word in words)
    power_word_hit = any(word.lower() in power_words for word in words)
    timed_word_hit = any(word.lower() in timed_words for word in words)

    return {
        "title_sentiment": blob.sentiment.polarity,
        "title_subjectivity": blob.sentiment.subjectivity,
        "has_question": int("?" in title),
        "has_exclamation": int("!" in title),
        "starts_with_keyword": int(words[0].lower() in {"how", "why", "what", "when", "where", "who"} if words else 0),
        "title_length": len(title),
        "word_count": len(words),
        "punctuation_count": sum(1 for c in title if c in punctuation),
        "uppercase_word_count": len(upper_words),
        "percent_letters_uppercase": round(len(uppercase_letters) / len(letters), 3) if letters else 0,
        "has_numbers": int(bool(re.search(r"\d", title))),
        "clickbait_score": clickbait_score,
        "has_power_words": int(power_word_hit),
        "has_timed_words": int(timed_word_hit)
    }

In [10]:
# Load the CSV that has titles in it
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details.csv"
title_df = pd.read_csv(url)

# Extract only the title and video id from the original csv so we can join it to df by video id to add the title column
title_df = title_df[['snippet.title', 'id', 'channel_id', 'statistics.viewCount', 'statistics.likeCount', 'statistics.commentCount']]
title_df = title_df.rename(columns={'snippet.title': 'title', 'id': 'video_id', 'statistics.viewCount': 'viewCount', 'statistics.likeCount': 'likeCount', 'statistics.commentCount': 'commentCount'})
title_df.head()

Unnamed: 0,title,video_id,channel_id,viewCount,likeCount,commentCount
0,im on a horse mf üé∂,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,63043,1751,25
1,KSI GETS TROLLED,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,82459,3121,41
2,Oh Baby A Triple üé∂,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,219370,7947,48
3,KSIMON on Among Us,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,260149,9906,36
4,Deji is the GOAT Jester,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,182946,7056,44


In [11]:
# Merge to add title column
df = pd.merge(df, title_df, left_on='id', right_on='video_id', how='left')
df.head()

Unnamed: 0,kind,etag,id,channel_id_x,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,avg_green,avg_blue,brightness,contrast,title,video_id,channel_id_y,viewCount,likeCount,commentCount
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,65.419201,77.057917,71.950683,63.507962,im on a horse mf üé∂,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,63043,1751,25
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,74.133906,71.210868,78.082014,53.159171,KSI GETS TROLLED,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,82459,3121,41
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,83.075104,83.087812,83.112182,65.593829,Oh Baby A Triple üé∂,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,219370,7947,48
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,48.056736,54.434601,54.765237,57.291874,KSIMON on Among Us,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,260149,9906,36
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,19.795156,22.93283,24.806071,28.240555,Deji is the GOAT Jester,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,182946,7056,44


In [12]:
# Apply the extraction function to all titles
title_feature_df = df["title"].apply(extract_title_features).apply(pd.Series)

# Merge with the original DataFrame
df = pd.concat([df, title_feature_df], axis=1)
df.head()

Unnamed: 0,kind,etag,id,channel_id_x,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,starts_with_keyword,title_length,word_count,punctuation_count,uppercase_word_count,percent_letters_uppercase,has_numbers,clickbait_score,has_power_words,has_timed_words
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,0.0,18.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,0.0,16.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,0.0,18.0,5.0,0.0,1.0,0.308,0.0,0.0,0.0,0.0
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,0.0,18.0,4.0,0.0,1.0,0.533,0.0,0.0,0.0,0.0
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,0.0,23.0,5.0,0.0,1.0,0.316,0.0,0.0,0.0,0.0


In [13]:
# Export final dataset with image + text features
df.to_csv("video_details_v5.csv", index=False)

# Step 3: Labeling 'virality' using target variables

In [60]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details_v5.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,kind,etag,id,channel_id_x,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,starts_with_keyword,title_length,word_count,punctuation_count,uppercase_word_count,percent_letters_uppercase,has_numbers,clickbait_score,has_power_words,has_timed_words
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,0.0,18.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,0.0,16.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,0.0,18.0,5.0,0.0,1.0,0.308,0.0,0.0,0.0,0.0
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,0.0,18.0,4.0,0.0,1.0,0.533,0.0,0.0,0.0,0.0
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,0.0,23.0,5.0,0.0,1.0,0.316,0.0,0.0,0.0,0.0


In [15]:
# Ensure proper numeric types
df["viewCount"] = pd.to_numeric(df["viewCount"], errors="coerce")
df["likeCount"] = pd.to_numeric(df["likeCount"], errors="coerce")
df["commentCount"] = pd.to_numeric(df["commentCount"], errors="coerce")

# Optional: drop rows with missing values
df = df.dropna(subset=["viewCount", "likeCount", "commentCount"])

In [20]:
new_column_names = {'snippet.channelId': 'channel_id'}
df = df.rename(columns=new_column_names)

In [21]:
percentile = 0.85

# Select the numeric columns *before* grouping and applying quantile
thresholds = df[["channel_id", "viewCount", "likeCount", "commentCount"]].groupby("channel_id").quantile(percentile)[
    ["viewCount", "likeCount", "commentCount"]
].rename(columns={
    "viewCount": "views_threshold",
    "likeCount": "likes_threshold",
    "commentCount": "comments_threshold"
}).reset_index()

In [23]:
# Merge thresholds into main dataframe (only run once)
df = df.merge(thresholds, on="channel_id", how="left")
df.head()

Unnamed: 0,kind,etag,id,channel_id_x,snippet.publishedAt,channel_id,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,punctuation_count,uppercase_word_count,percent_letters_uppercase,has_numbers,clickbait_score,has_power_words,has_timed_words,views_threshold,likes_threshold,comments_threshold
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7364257.1,232069.4,4784.4
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,0.0,3.0,1.0,0.0,0.0,0.0,0.0,7364257.1,232069.4,4784.4
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,0.0,1.0,0.308,0.0,0.0,0.0,0.0,7364257.1,232069.4,4784.4
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,0.0,1.0,0.533,0.0,0.0,0.0,0.0,7364257.1,232069.4,4784.4
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,0.0,1.0,0.316,0.0,0.0,0.0,0.0,7364257.1,232069.4,4784.4


In [24]:
# Label virality using thresholds
df["viral"] = (
    (df["viewCount"] > df["views_threshold"]) &
    (df["likeCount"] > df["likes_threshold"]) &
    (df["commentCount"] > df["comments_threshold"])
).astype(int)

In [25]:
# Drop the threshold columns from the main DataFrame
df = df.drop(["views_threshold", "likes_threshold", "comments_threshold"], axis=1)

df.head()

Unnamed: 0,kind,etag,id,channel_id_x,snippet.publishedAt,channel_id,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,title_length,word_count,punctuation_count,uppercase_word_count,percent_letters_uppercase,has_numbers,clickbait_score,has_power_words,has_timed_words,viral
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,18.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,16.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,18.0,5.0,0.0,1.0,0.308,0.0,0.0,0.0,0.0,0
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,18.0,4.0,0.0,1.0,0.533,0.0,0.0,0.0,0.0,0
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,23.0,5.0,0.0,1.0,0.316,0.0,0.0,0.0,0.0,0


In [26]:
# Calculate the percentage of viral videos
viral_percentage = (df["viral"].sum() / len(df)) * 100

print(f"Percentage of viral videos: {viral_percentage:.2f}%")

Percentage of viral videos: 7.46%


In [81]:
# Export dataset with image + text features
df.to_csv("video_details_v6.csv", index=False)

# Step 4: Fixing up existing features

In [107]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details_v6.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,kind,etag,id,channel_id_x,snippet.publishedAt,channel_id,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,title_length,word_count,punctuation_count,uppercase_word_count,percent_letters_uppercase,has_numbers,clickbait_score,has_power_words,has_timed_words,viral
0,youtube#video,wBKZ0VcYOPzLHspKdYzLGZzy30M,ehEhzzttOvI,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T19:26:21Z,UCh5mLn90vUaB1PbRRx_AiaA,im on a horse mf üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/ehEhzzttOvI/default.jpg,120,...,18.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,youtube#video,Z2JgBtMARKVEKkggEeYWqlOjnCw,Zz--BUJ4VuU,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T12:41:51Z,UCh5mLn90vUaB1PbRRx_AiaA,KSI GETS TROLLED,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Zz--BUJ4VuU/default.jpg,120,...,16.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0
2,youtube#video,_1RWd-LFaHSj8kpdF-gRBnwslCs,n9HpZHxhkVw,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-19T06:32:26Z,UCh5mLn90vUaB1PbRRx_AiaA,Oh Baby A Triple üé∂,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/n9HpZHxhkVw/default.jpg,120,...,18.0,5.0,0.0,1.0,0.308,0.0,0.0,0.0,0.0,0
3,youtube#video,_CyMdLH1eNiiU-zhhYAojeFm-6w,Txk6EyuHi0A,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T17:24:24Z,UCh5mLn90vUaB1PbRRx_AiaA,KSIMON on Among Us,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/Txk6EyuHi0A/default.jpg,120,...,18.0,4.0,0.0,1.0,0.533,0.0,0.0,0.0,0.0,0
4,youtube#video,sDQROYKCRudJRykGNTcb_ic-ofU,7WVjn7yB4-Q,UCh5mLn90vUaB1PbRRx_AiaA,2025-06-18T08:57:10Z,UCh5mLn90vUaB1PbRRx_AiaA,Deji is the GOAT Jester,üçó: Order food NOW at: https://www.eatsides.com...,https://i.ytimg.com/vi/7WVjn7yB4-Q/default.jpg,120,...,23.0,5.0,0.0,1.0,0.316,0.0,0.0,0.0,0.0,0


In [108]:
# Dropping bad columns/features
columns_to_drop = ['kind',
                   'etag',
                   'channel_id_x',
                   'snippet.publishedAt',
                   'snippet.title',
                   'snippet.thumbnails.default.url',
                   'snippet.thumbnails.default.width',
                   'snippet.thumbnails.default.height',
                   'snippet.thumbnails.medium.url',
                   'snippet.thumbnails.medium.width',
                   'snippet.thumbnails.medium.height',
                   'snippet.thumbnails.high.url',
                   'snippet.thumbnails.high.width',
                   'snippet.thumbnails.high.height',
                   'snippet.thumbnails.standard.url',
                   'snippet.thumbnails.standard.width',
                   'snippet.thumbnails.standard.height',
                   'snippet.thumbnails.maxres.url',
                   'snippet.thumbnails.maxres.width',
                   'snippet.thumbnails.maxres.height',
                   'statistics.viewCount',
                   'statistics.likeCount',
                   'statistics.commentCount',
                   'snippet.channelTitle',
                   'snippet.categoryId',
                   'snippet.liveBroadcastContent',
                   'snippet.defaultAudioLanguage',
                   'title',
                   'channel_id_x',
                   'channel_id_y',
                   'snippet.localized.description',
                   'statistics.favoriteCount',
                   'id']
df = df.drop(columns=columns_to_drop, axis=1)

In [109]:
df = df.rename(columns={'snippet.description': 'description', 'snippet.localized.title': 'title', 'statistics.viewCount': 'viewCount', 'statistics.likeCount': 'likeCount', 'statistics.commentCount': 'commentCount', 'snippet.tags': 'tags'})

In [110]:
print(df.columns)

Index(['channel_id', 'description', 'tags', 'title', 'avg_red', 'avg_green',
       'avg_blue', 'brightness', 'contrast', 'video_id', 'viewCount',
       'likeCount', 'commentCount', 'title_sentiment', 'title_subjectivity',
       'has_question', 'has_exclamation', 'starts_with_keyword',
       'title_length', 'word_count', 'punctuation_count',
       'uppercase_word_count', 'percent_letters_uppercase', 'has_numbers',
       'clickbait_score', 'has_power_words', 'has_timed_words', 'viral'],
      dtype='object')


In [111]:
# Export dataset with fixed up existing features
df.to_csv("video_details_v7.csv", index=False)

# Step 5: Adding new features

In [120]:
# Load the CSV
import pandas as pd

url = "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality/refs/heads/main/data/video_details_v7.csv"
df = pd.read_csv(url)
df.head()
print(df.columns)

Index(['channel_id', 'description', 'tags', 'title', 'avg_red', 'avg_green',
       'avg_blue', 'brightness', 'contrast', 'video_id', 'viewCount',
       'likeCount', 'commentCount', 'title_sentiment', 'title_subjectivity',
       'has_question', 'has_exclamation', 'starts_with_keyword',
       'title_length', 'word_count', 'punctuation_count',
       'uppercase_word_count', 'percent_letters_uppercase', 'has_numbers',
       'clickbait_score', 'has_power_words', 'has_timed_words', 'viral'],
      dtype='object')


In [121]:
### Description-based features

import re
from textblob import TextBlob

# Fill NA for descriptions
df['description'] = df['description'].fillna('')

# Description length
df['description_length'] = df['description'].apply(len)

# Description sentiment
df['description_sentiment'] = df['description'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Description has keywords
keywords = ['subscribe', 'giveaway', 'limited time', 'offer', 'new video']
df['description_has_keywords'] = df['description'].apply(lambda x: any(kw in x.lower() for kw in keywords))

# Description link count
df['description_link_count'] = df['description'].apply(lambda x: len(re.findall(r'http[s]?://', x)))

In [122]:
### Tag-based features

# Fill NA for tags
df['tags'] = df['tags'].fillna('').astype(str)

# Tag count
df['tag_count'] = df['tags'].apply(lambda x: len(x.split(',')) if x else 0)

# Tag sentiment
df['tag_sentiment'] = df['tags'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Unique tags
df['num_unique_tags'] = df['tags'].apply(lambda x: len(set(tag.strip().lower() for tag in x.split(','))) if x else 0)


In [124]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.7-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.7-py3-none-any.whl (175 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/175.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m175.3/175.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/939.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚

In [125]:
### Title-based Semantic features

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import textstat

model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute title embeddings
df['title_embedding'] = df['title'].fillna('').apply(lambda x: model.encode(x, convert_to_tensor=False))

# Embedding distance to known viral videos
viral_embeddings = df[df['viral'] == 1]['title_embedding'].tolist()

def max_cosine_similarity(vec, others):
    return np.max(cosine_similarity([vec], others)[0]) if others else 0

df['embedding_distance_to_known_viral'] = df['title_embedding'].apply(
    lambda x: max_cosine_similarity(x, viral_embeddings)
)

# Clickbait phrase match
clickbait_phrases = ['you won‚Äôt believe', 'what happens next', 'this is why', 'top secret', 'never seen before']
df['clickbait_phrase_match'] = df['title'].fillna('').str.lower().apply(lambda t: any(p in t for p in clickbait_phrases))

# Readability score
df['title_readability'] = df['title'].fillna('').apply(lambda x: textstat.flesch_reading_ease(x))

# Embedding distance to average title (unsupervised)
all_embeddings = df['title_embedding'].tolist()
avg_embedding = np.mean(all_embeddings, axis=0)
df['title_embedding_distance_to_viral'] = df['title_embedding'].apply(
    lambda x: 1 - cosine_similarity([x], [avg_embedding])[0][0]
)

# Listicle and tutorial flags
df['is_listicle'] = df['title'].str.strip().str.lower().str.match(r'^\d+').astype(int)
df['is_tutorial'] = df['title'].str.lower().str.startswith('how to').astype(int)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [126]:
### Thumbnail-based features
import cv2
import os
from collections import Counter
from matplotlib.colors import rgb_to_hsv

# Path to thumbnail images, assumes filename = <video_id>.jpg
thumbnail_dir = "/content/thumbnails/"

# Face detection setup
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def get_thumbnail_features(video_id):
    path = os.path.join(thumbnail_dir, f"{video_id}.jpg")
    if not os.path.exists(path):
        return pd.Series([0, -1, 0.0], index=['num_faces', 'dominant_color_hue', 'thumbnail_edge_density'])

    img = cv2.imread(path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Face count
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    num_faces = len(faces)

    # Dominant color hue
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hue_channel = hsv[:, :, 0].flatten()
    most_common_hue = Counter(hue_channel).most_common(1)[0][0]

    # Edge density
    edges = cv2.Canny(gray, 100, 200)
    edge_density = np.sum(edges > 0) / edges.size

    return pd.Series([num_faces, most_common_hue, edge_density], index=['num_faces', 'dominant_color_hue', 'thumbnail_edge_density'])

df[['num_faces', 'dominant_color_hue', 'thumbnail_edge_density']] = df['video_id'].apply(get_thumbnail_features)

In [127]:
### Word Count Refinement

# Power/timed word lists
power_words = ['secret', 'shocking', 'revealed', 'proven', 'insane']
timed_words = ['today', 'now', 'soon', 'in 24 hours', 'this week']

def count_matches(text, word_list):
    return sum(word in text.lower() for word in word_list)

df['power_word_count'] = df['title'].fillna('').apply(lambda x: count_matches(x, power_words))
df['timed_word_count'] = df['title'].fillna('').apply(lambda x: count_matches(x, timed_words))

# Optionally drop the binary flags
df.drop(columns=['has_power_words', 'has_timed_words'], inplace=True, errors='ignore')


In [138]:
# Checking through added features

print(df.columns)
print(df.head)
timed_word_count_sum = df['timed_word_count'].sum()
power_word_count_sum = df['power_word_count'].sum()
is_listicle_sum = df['is_listicle'].sum()
is_tutorial_sum = df['is_tutorial'].sum()
num_faces_sum = df['num_faces'].sum()
true_clickbait_matches = df['clickbait_phrase_match'].sum()
print(f"Total number of timed_word_count values: {timed_word_count_sum}")
print(f"Total number of power_word_count values: {power_word_count_sum}")
print(f"Total number of is_listicle values: {is_listicle_sum}")
print(f"Total number of is_tutorial values: {is_tutorial_sum}")
print(f"Total number of num_faces values: {num_faces_sum}")
print(f"Total number of true clickbait_phrase_match values: {true_clickbait_matches}")

Index(['channel_id', 'description', 'tags', 'title', 'avg_red', 'avg_green',
       'avg_blue', 'brightness', 'contrast', 'video_id', 'viewCount',
       'likeCount', 'commentCount', 'title_sentiment', 'title_subjectivity',
       'has_question', 'has_exclamation', 'starts_with_keyword',
       'title_length', 'word_count', 'punctuation_count',
       'uppercase_word_count', 'percent_letters_uppercase', 'has_numbers',
       'clickbait_score', 'viral', 'description_length',
       'description_sentiment', 'description_has_keywords',
       'description_link_count', 'tag_count', 'tag_sentiment',
       'num_unique_tags', 'title_embedding',
       'embedding_distance_to_known_viral', 'clickbait_phrase_match',
       'title_readability', 'title_embedding_distance_to_viral', 'is_listicle',
       'is_tutorial', 'num_faces', 'dominant_color_hue',
       'thumbnail_edge_density', 'power_word_count', 'timed_word_count'],
      dtype='object')
<bound method NDFrame.head of                     

In [139]:
# Removing any seemingly useless features
columns_to_drop = ['timed_word_count', 'power_word_count', 'is_listicle', 'is_tutorial', 'num_faces', 'clickbait_phrase_match']
df = df.drop(columns=columns_to_drop, axis=1)

In [140]:
# Export
df.to_csv("video_details_v8.csv", index=False)