<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality/blob/main/image_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Download thumbnails and extract simple visual features


In [1]:
# Step 1: Load the CSV
import pandas as pd

# Location of the raw video-metadata table in the project's GitHub repository.
url = (
    "https://raw.githubusercontent.com/rithikkulkarni/Video-Virality"
    "/refs/heads/main/video_details.csv"
)
df = pd.read_csv(url)
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.categoryId,snippet.liveBroadcastContent,snippet.defaultLanguage,snippet.localized.title,snippet.localized.description,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount
0,youtube#video,EZIz8pZ8r7Y3Zwf3HpjS-_Qi5EI,UtUKi4HUBNY,UCNEI-oWOivQJ9IMn8GeCrMg,2025-06-08T20:24:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,Nimue - Juan Arenosa (Piano Cover),🎹The vibrant melody of “Nimue” by Juan Arenosa...,https://i.ytimg.com/vi/UtUKi4HUBNY/default.jpg,120,...,10,none,en-US,Nimue - Juan Arenosa (Piano Cover),🎹The vibrant melody of “Nimue” by Juan Arenosa...,en-US,10412,784.0,0,73
1,youtube#video,evXKA9D3t5Lb3hzqhZzfCCb7EH0,0PWC69jDOP8,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-31T17:19:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Cry - Cigarettes after sex (Piano Cover),🎹The vibrant melody of “cry” by cigarettes aft...,https://i.ytimg.com/vi/0PWC69jDOP8/default.jpg,120,...,10,none,en-US,Cry - Cigarettes after sex (Piano Cover),🎹The vibrant melody of “cry” by cigarettes aft...,en-US,28792,1414.0,0,110
2,youtube#video,6XryOqEiZhKZ7X-SB5rpVaUCu2Y,0wIpL0cYujQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-26T20:29:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,hometown glory - adele,🎹The vibrant melody of “hometown glory” by Ade...,https://i.ytimg.com/vi/0wIpL0cYujQ/default.jpg,120,...,10,none,en-US,hometown glory - adele,🎹The vibrant melody of “hometown glory” by Ade...,en-US,12000,495.0,0,72
3,youtube#video,zPnzLnqadx_g30YMdUaz0tJ_BAI,994ZZm2aX-E,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-19T16:59:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,"Welcome and goodbye - Dream, Ivory",🎹The vibrant melody of “welcome and goodbye” b...,https://i.ytimg.com/vi/994ZZm2aX-E/default.jpg,120,...,10,none,en-US,"Welcome and goodbye - Dream, Ivory",🎹The vibrant melody of “welcome and goodbye” b...,en-US,17402,784.0,0,92
4,youtube#video,n5AIwWOI4BTvQhipK7rJRNOvNy4,GkDEInsCcpQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-10T19:35:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Hope - Mirko Dukanovic,🎹The vibrant melody of “Hope” by Mirko Dukanov...,https://i.ytimg.com/vi/GkDEInsCcpQ/default.jpg,120,...,10,none,en-US,Hope - Mirko Dukanovic,🎹The vibrant melody of “Hope” by Mirko Dukanov...,en-US,18358,714.0,0,103


In [2]:
# Step 2: Extract video IDs
# Collect each distinct, non-null value of the "id" column as a plain Python list.
id_values = df["id"].dropna()
video_ids = id_values.unique().tolist()
print(f"Found {len(video_ids)} video IDs.")

Found 168 video IDs.


In [3]:
# Step 3: Download thumbnails
import os
import requests
from PIL import Image
from io import BytesIO

# Create the local output folder for thumbnails; exist_ok=True makes re-running this cell safe.
os.makedirs("thumbnails", exist_ok=True)

def download_thumbnail(video_id, timeout=10):
    """Download one video's thumbnail and save it as an RGB JPEG.

    Args:
        video_id: Video ID used to build both the thumbnail URL and the
            local file name.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The path of the saved file, or None if the download, decoding, or
        save failed (the error is printed rather than raised).
    """
    url = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
    path = f"thumbnails/{video_id}.jpg"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx so an error response body is never saved as a thumbnail.
        response.raise_for_status()
        # The context manager releases the decoded image once it has been saved.
        with Image.open(BytesIO(response.content)) as img:
            img.convert('RGB').save(path)
        return path
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this video.
        print(f"Error downloading {video_id}: {e}")
        return None

# Fetch every thumbnail in order; entries are the saved path, or None where a download failed.
paths = list(map(download_thumbnail, video_ids))


In [4]:
# Step 4: Extract visual features
import numpy as np

def extract_features(img_path, video_id):
    """Compute simple color statistics for one thumbnail image.

    Args:
        img_path: Path of the image file to read.
        video_id: Identifier stored in the result and used in error messages.

    Returns:
        A dict with keys "video_id", "avg_red", "avg_green", "avg_blue",
        "brightness" and "contrast", or None if the image could not be
        read or processed (the error is printed rather than raised).
    """
    try:
        # Use a context manager so the underlying file is closed as soon as
        # the pixel data has been copied into the array.
        with Image.open(img_path) as img:
            img_array = np.array(img.convert('RGB'))

        # Mean over the height and width axes leaves one value per channel.
        avg_color = np.mean(img_array, axis=(0, 1))
        # Brightness and contrast are the mean and standard deviation over
        # every pixel value of all three channels together.
        brightness = np.mean(img_array)
        contrast = np.std(img_array)

        return {
            "video_id": video_id,
            "avg_red": avg_color[0],
            "avg_green": avg_color[1],
            "avg_blue": avg_color[2],
            "brightness": brightness,
            "contrast": contrast
        }
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this video.
        print(f"Failed to extract for {video_id}: {e}")
        return None

# Run the extractor on every expected thumbnail path; failed extractions come back as None.
feature_data = [
    extract_features(f"thumbnails/{video_id}.jpg", video_id)
    for video_id in video_ids
]
# Keep only the successful results before building the feature table.
feature_rows = [row for row in feature_data if row is not None]
features_df = pd.DataFrame(feature_rows)
features_df.head()


Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast
0,UtUKi4HUBNY,34.080816,36.617812,43.345712,38.01478,58.72613
1,0PWC69jDOP8,37.773715,31.678142,43.084097,37.511985,63.705403
2,0wIpL0cYujQ,29.380017,30.419931,41.579375,33.793108,54.808984
3,994ZZm2aX-E,45.244774,23.591406,23.828802,30.888328,49.698068
4,GkDEInsCcpQ,29.958542,29.702031,36.463542,32.041372,55.126803


In [5]:
# Step 5: Merge features_df with the original df to create the final "merged_df"

# Left-join the image features onto the original table (so every original row is kept),
# then discard the join key 'video_id', which duplicates 'id'.
merged_df = (
    df
    .merge(features_df, left_on='id', right_on='video_id', how='left')
    .drop(columns='video_id')
)

# Write the merged table to disk without the row index.
merged_df.to_csv("video_data_with_features.csv", index=False)
print("Merged data with image features exported to video_data_with_features.csv")

Merged data with image features exported to video_data_with_features.csv


In [6]:
# Display the first rows of the merged table.
merged_df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.defaultAudioLanguage,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,avg_red,avg_green,avg_blue,brightness,contrast
0,youtube#video,EZIz8pZ8r7Y3Zwf3HpjS-_Qi5EI,UtUKi4HUBNY,UCNEI-oWOivQJ9IMn8GeCrMg,2025-06-08T20:24:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,Nimue - Juan Arenosa (Piano Cover),🎹The vibrant melody of “Nimue” by Juan Arenosa...,https://i.ytimg.com/vi/UtUKi4HUBNY/default.jpg,120,...,en-US,10412,784.0,0,73,34.080816,36.617812,43.345712,38.01478,58.72613
1,youtube#video,evXKA9D3t5Lb3hzqhZzfCCb7EH0,0PWC69jDOP8,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-31T17:19:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Cry - Cigarettes after sex (Piano Cover),🎹The vibrant melody of “cry” by cigarettes aft...,https://i.ytimg.com/vi/0PWC69jDOP8/default.jpg,120,...,en-US,28792,1414.0,0,110,37.773715,31.678142,43.084097,37.511985,63.705403
2,youtube#video,6XryOqEiZhKZ7X-SB5rpVaUCu2Y,0wIpL0cYujQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-26T20:29:01Z,UCNEI-oWOivQJ9IMn8GeCrMg,hometown glory - adele,🎹The vibrant melody of “hometown glory” by Ade...,https://i.ytimg.com/vi/0wIpL0cYujQ/default.jpg,120,...,en-US,12000,495.0,0,72,29.380017,30.419931,41.579375,33.793108,54.808984
3,youtube#video,zPnzLnqadx_g30YMdUaz0tJ_BAI,994ZZm2aX-E,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-19T16:59:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,"Welcome and goodbye - Dream, Ivory",🎹The vibrant melody of “welcome and goodbye” b...,https://i.ytimg.com/vi/994ZZm2aX-E/default.jpg,120,...,en-US,17402,784.0,0,92,45.244774,23.591406,23.828802,30.888328,49.698068
4,youtube#video,n5AIwWOI4BTvQhipK7rJRNOvNy4,GkDEInsCcpQ,UCNEI-oWOivQJ9IMn8GeCrMg,2025-05-10T19:35:00Z,UCNEI-oWOivQJ9IMn8GeCrMg,Hope - Mirko Dukanovic,🎹The vibrant melody of “Hope” by Mirko Dukanov...,https://i.ytimg.com/vi/GkDEInsCcpQ/default.jpg,120,...,en-US,18358,714.0,0,103,29.958542,29.702031,36.463542,32.041372,55.126803


In [7]:
# Step 6: Export the merged table (video details + image features) to CSV
output_csv = "video_details_v2.csv"
merged_df.to_csv(output_csv, index=False)
# Build the message from the filename so the log always names the file actually written.
print(f"Image features exported to {output_csv}")

Image features exported to image_features.csv


# Step 2: OCR + Text Feature Extraction

In [8]:
# Install Tesseract OCR engine and text analysis libraries
!sudo apt-get install tesseract-ocr -y
!pip install pytesseract textblob

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [9]:
import pytesseract
from PIL import Image
from textblob import TextBlob
import os

def extract_text_features(img_path, video_id):
    """Run OCR on one thumbnail and derive simple text features.

    Args:
        img_path: Path of the image file to read.
        video_id: Identifier stored in the result and used in error messages.

    Returns:
        A dict with keys "video_id", "thumbnail_text" (OCR output with
        surrounding whitespace removed), "text_word_count" and
        "text_sentiment", or None if the image could not be read or
        processed (the error is printed rather than raised).
    """
    try:
        # Use a context manager so the image file is closed once OCR is done.
        with Image.open(img_path) as img:
            raw_text = pytesseract.image_to_string(img)

        # Strip once and reuse for both the stored text and the word count.
        stripped_text = raw_text.strip()
        word_count = len(stripped_text.split())
        sentiment = TextBlob(raw_text).sentiment.polarity  # -1 to 1

        return {
            "video_id": video_id,
            "thumbnail_text": stripped_text,
            "text_word_count": word_count,
            "text_sentiment": sentiment
        }
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this video.
        print(f"Error on {video_id}: {e}")
        return None

# Run OCR on all thumbnails from Step 1; failed extractions come back as None.
ocr_data = [
    extract_text_features(f"thumbnails/{video_id}.jpg", video_id)
    for video_id in video_ids
]
# Keep only the successful results before building the OCR table.
ocr_rows = [row for row in ocr_data if row is not None]
ocr_df = pd.DataFrame(ocr_rows)
ocr_df.head()

Unnamed: 0,video_id,thumbnail_text,text_word_count,text_sentiment
0,UtUKi4HUBNY,,0,0.0
1,0PWC69jDOP8,,0,0.0
2,0wIpL0cYujQ,,0,0.0
3,994ZZm2aX-E,,0,0.0
4,GkDEInsCcpQ,,0,0.0


In [10]:
# Left-join the Step 2 OCR features onto the Step 1 visual features by video_id,
# keeping every row of the visual-feature table.
combined_df = features_df.merge(ocr_df, on="video_id", how="left")
combined_df.head()

Unnamed: 0,video_id,avg_red,avg_green,avg_blue,brightness,contrast,thumbnail_text,text_word_count,text_sentiment
0,UtUKi4HUBNY,34.080816,36.617812,43.345712,38.01478,58.72613,,0,0.0
1,0PWC69jDOP8,37.773715,31.678142,43.084097,37.511985,63.705403,,0,0.0
2,0wIpL0cYujQ,29.380017,30.419931,41.579375,33.793108,54.808984,,0,0.0
3,994ZZm2aX-E,45.244774,23.591406,23.828802,30.888328,49.698068,,0,0.0
4,GkDEInsCcpQ,29.958542,29.702031,36.463542,32.041372,55.126803,,0,0.0


In [13]:
# Write the combined visual + OCR feature table to disk without the row index.
# NOTE(review): combined_df is built from features_df and ocr_df only, so this file
# holds feature columns keyed by video_id, not the original video metadata — confirm
# that is intended given the "video_details" file name.
combined_df.to_csv("video_details_v3.csv", index=False)