<a href="https://colab.research.google.com/github/rithikkulkarni/Video-Virality-Testing/blob/main/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extraction: Run as group


In [None]:
# Name of the per-channel CSV export to process; it is interpolated into the
# raw GitHub URL below.
filename = "clipsjoy.csv"

# Load the CSV
import pandas as pd

# `url` is reused by a later cell that reads the same file again for titles.
url = f"https://raw.githubusercontent.com/rithikkulkarni/Video-Virality-Testing/refs/heads/main/data/{filename}"
# Read the file straight from the URL; `df` is the frame every later cell extends.
df = pd.read_csv(url)
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,kind,etag,id,channel_id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,...,snippet.channelTitle,snippet.tags,snippet.categoryId,snippet.liveBroadcastContent,snippet.localized.title,snippet.localized.description,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount
0,youtube#video,74faJ5K8tRTdCsqs_PCfKv-MzPs,_fc9JPIDq24,UCUyAA4Ekq5nHmr5mjZ-L4eA,2023-09-01T07:16:35Z,UCUyAA4Ekq5nHmr5mjZ-L4eA,Nadeshot UPSET over GUARD Decision #tarik #sho...,,https://i.ytimg.com/vi/_fc9JPIDq24/default.jpg,120,...,ClipsJoy,"['The GUARD', 'Nadeshot', 'Valorant', 'Riot', ...",20,none,Nadeshot UPSET over GUARD Decision #tarik #sho...,,162,5,0,0
1,youtube#video,CZdvuqxHGJEp9WcnYxi_EpFOK1Y,eSrxybQRoCQ,UCUyAA4Ekq5nHmr5mjZ-L4eA,2023-09-01T00:14:23Z,UCUyAA4Ekq5nHmr5mjZ-L4eA,When SHROUD Was On Sentinels #shorts #valorant...,,https://i.ytimg.com/vi/eSrxybQRoCQ/default.jpg,120,...,ClipsJoy,,20,none,When SHROUD Was On Sentinels #shorts #valorant...,,4803,101,0,0
2,youtube#video,vjk5-2VRY_zFmit7GgB5rAerj5M,rvHHDG4XnWk,UCUyAA4Ekq5nHmr5mjZ-L4eA,2023-08-28T22:31:12Z,UCUyAA4Ekq5nHmr5mjZ-L4eA,FNS EXPLAINS Sentinels Roster & EG vs LOUD Exp...,FNS on Sentinels Roster & EG vs LOUD Expectati...,https://i.ytimg.com/vi/rvHHDG4XnWk/default.jpg,120,...,ClipsJoy,"['jarso', 'stewie2k', 'tarik', 'shahzam', 'sin...",20,none,FNS EXPLAINS Sentinels Roster & EG vs LOUD Exp...,FNS on Sentinels Roster & EG vs LOUD Expectati...,4333,32,0,10
3,youtube#video,RoiQp8R8_WvWNVR6B0TBET-cMlU,8HcnGACiegA,UCUyAA4Ekq5nHmr5mjZ-L4eA,2023-08-28T22:14:15Z,UCUyAA4Ekq5nHmr5mjZ-L4eA,NRG s0m & FNS EXPLAIN Leaving NRG Roster,NRG s0m & FNS on WHY They're Leaving The NRG R...,https://i.ytimg.com/vi/8HcnGACiegA/default.jpg,120,...,ClipsJoy,"['jarso', 'stewie2k', 'tarik', 'shahzam', 'sin...",20,none,NRG s0m & FNS EXPLAIN Leaving NRG Roster,NRG s0m & FNS on WHY They're Leaving The NRG R...,2273,15,0,1
4,youtube#video,2rs9CuPKHUafmLaPFajuVHP1sJo,4rNxrXDb3do,UCUyAA4Ekq5nHmr5mjZ-L4eA,2023-08-27T00:01:51Z,UCUyAA4Ekq5nHmr5mjZ-L4eA,Evil Geniuses Are YOUR VALORANT CHAMPIONS | Ta...,"SEN Tarik, TenZ, Zekken, Rawkus React to EG WI...",https://i.ytimg.com/vi/4rNxrXDb3do/default.jpg,120,...,ClipsJoy,"['jarso', 'stewie2k', 'tarik', 'shahzam', 'sin...",20,none,Evil Geniuses Are YOUR VALORANT CHAMPIONS | Ta...,"SEN Tarik, TenZ, Zekken, Rawkus React to EG WI...",711,8,0,0


In [None]:
# Extract video IDs: distinct, non-null values of the "id" column as a plain list.
video_ids = df["id"].dropna().unique().tolist()
print(f"Found {len(video_ids)} video IDs.")

# Download thumbnails
import os
import requests
from PIL import Image
from io import BytesIO

# Thumbnails are stored in ./thumbnails (relative to the working directory);
# exist_ok=True makes re-running the cell safe.
os.makedirs("thumbnails", exist_ok=True)

def download_thumbnail(video_id, timeout=10):
    """Download one video's `mqdefault.jpg` thumbnail and save it as an RGB JPEG.

    Args:
        video_id: YouTube video id; used in the request URL and the output filename.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The saved file path ("thumbnails/<video_id>.jpg"), or None when the
        request, decoding or saving failed (the error is printed, not raised).
    """
    url = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
    path = f"thumbnails/{video_id}.jpg"
    try:
        # A timeout keeps one stalled connection from hanging the whole batch.
        response = requests.get(url, timeout=timeout)
        # Reject HTTP error responses explicitly; otherwise whatever body an
        # error response carries would be decoded and stored as the thumbnail.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.save(path)
        return path
    except Exception as e:
        # Best-effort by design: report the failure and let the batch continue.
        print(f"Error downloading {video_id}: {e}")
        return None

# One entry per video id, in order: the saved path, or None if the download failed.
paths = list(map(download_thumbnail, video_ids))

Found 165 video IDs.


In [None]:
# Extract visual features
import numpy as np

def extract_features(img_path, video_id):
    """Compute simple colour statistics for one thumbnail image.

    The image is opened, converted to RGB and turned into a NumPy array.
    Returns a dict with the video id, the per-channel mean (avg_red, avg_green,
    avg_blue), the mean over all values (brightness) and the standard deviation
    over all values (contrast). Any failure is printed and None is returned.
    """
    try:
        pixels = np.array(Image.open(img_path).convert('RGB'))
        # Averaging over rows and columns leaves one mean per colour channel.
        red, green, blue = np.mean(pixels, axis=(0, 1))
        features = {
            "video_id": video_id,
            "avg_red": red,
            "avg_green": green,
            "avg_blue": blue,
            "brightness": np.mean(pixels),
            "contrast": np.std(pixels),
        }
    except Exception as e:
        print(f"Failed to extract for {video_id}: {e}")
        return None
    return features

# Run the colour-statistics extractor over every expected thumbnail path;
# entries are None where extraction failed.
feature_data = [
    extract_features(f"thumbnails/{video_id}.jpg", video_id)
    for video_id in video_ids
]
features_df = pd.DataFrame([record for record in feature_data if record is not None])

# Left-join the image features onto the original rows by video id, then discard
# the join key that duplicates 'id'.
merged_df = df.merge(
    features_df,
    left_on='id',
    right_on='video_id',
    how='left',
).drop(columns='video_id')

df = merged_df

In [None]:
from textblob import TextBlob
import re

# Define sets of keywords
# Words and phrases treated as clickbait signals in a title.
clickbait_words = {
    # single words
    "amazing", "incredible", "shocking", "unbelievable", "epic", "ultimate",
    "must", "insane", "secret", "exposed", "revealed", "hack", "craziest",
    # hyphenated
    "jaw-dropping", "mind-blowing",
    # phrases
    "you won’t believe", "you’ll never guess", "what happens next",
    "these reasons", "10 reasons", "this trick", "don’t miss",
    "game changer", "the truth about", "deal of the day",
}

# Words that promise value, ease or authority.
power_words = {
    "best", "top", "new", "essential", "easy", "quick", "instant",
    "effortless", "guaranteed", "proven", "genius", "exclusive",
    "remarkable", "powerful", "revolutionary", "breakthrough", "must-have",
    "unlock", "master", "ultimate", "secret", "simple", "transform",
    "hacks", "tips", "tricks",
}

# Words and phrases that anchor a title to a point in time.
timed_words = {
    # relative to the present
    "now", "today", "just now", "breaking", "tonight", "coming soon",
    "newly released", "upcoming", "recent",
    # "this ..." periods
    "this morning", "this afternoon", "this evening", "this week",
    "this weekend", "this month", "this season", "this year",
    # "last ..." periods
    "last minute", "last week",
    # explicit years
    "2023", "2024", "2025",
    # cadence
    "daily", "weekly", "monthly", "yearly",
}

def _count_keyword_hits(text, vocabulary):
    """Count whole-word / whole-phrase occurrences of each vocabulary entry in text.

    Matching is case-insensitive, folds the typographic apostrophe (U+2019) into
    the ASCII one, and requires a non-word character (or the string edge) on both
    sides, so multi-word phrases and words followed by punctuation are found
    while partial-word hits are not.
    """
    normalized = text.lower().replace("\u2019", "'")
    hits = 0
    for entry in vocabulary:
        needle = re.escape(entry.lower().replace("\u2019", "'"))
        hits += len(re.findall(r"(?<!\w)" + needle + r"(?!\w)", normalized))
    return hits


def extract_title_features(title):
    """Derive text features from a video title.

    Args:
        title: The title string. Non-string values (e.g. NaN from a missing
            cell) are treated as an empty title.

    Returns:
        dict with TextBlob polarity/subjectivity, counts of "?" and "!", a 0/1
        flag for a leading question word, character and word counts, punctuation
        count, count of all-uppercase words, share of ASCII letters that are
        uppercase (rounded to 3 places), digit count, and the number of
        clickbait / power / timed keyword occurrences.
    """
    if not isinstance(title, str):
        title = ""

    blob = TextBlob(title)
    words = title.split()
    punctuation = set("!?.,:;-()[]{}")

    upper_words = [w for w in words if w.isupper()]
    letters = re.findall(r'[A-Za-z]', title)
    uppercase_letters = [c for c in letters if c.isupper()]
    digits = re.findall(r'\d', title)

    # Keywords are searched in the whole title rather than compared with
    # whitespace-split tokens: token comparison can never match the multi-word
    # entries in the keyword sets and misses words with attached punctuation.
    clickbait_score = _count_keyword_hits(title, clickbait_words)
    power_word_count = _count_keyword_hits(title, power_words)
    timed_word_count = _count_keyword_hits(title, timed_words)

    question_openers = {"how", "why", "what", "when", "where", "who"}

    return {
        "title_sentiment": blob.sentiment.polarity,
        "title_subjectivity": blob.sentiment.subjectivity,
        "num_question_marks": title.count("?"),
        "num_exclamation_marks": title.count("!"),
        "starts_with_keyword": int(bool(words) and words[0].lower() in question_openers),
        "title_length": len(title),
        "word_count": len(words),
        "punctuation_count": sum(1 for c in title if c in punctuation),
        "uppercase_word_count": len(upper_words),
        "percent_letters_uppercase": round(len(uppercase_letters) / len(letters), 3) if letters else 0,
        "num_digits": len(digits),
        "clickbait_score": clickbait_score,
        "num_power_words": power_word_count,
        "num_timed_words": timed_word_count
    }

In [None]:
# Load the CSV that has titles in it
import pandas as pd

title_df = pd.read_csv(url)

# Keep only the id, channel, title and engagement counts, under shorter names,
# so they can be joined onto df by video id.
title_df = title_df[['snippet.title', 'id', 'channel_id', 'statistics.viewCount', 'statistics.likeCount', 'statistics.commentCount']]
title_df = title_df.rename(columns={'snippet.title': 'title', 'id': 'video_id', 'statistics.viewCount': 'viewCount', 'statistics.likeCount': 'likeCount', 'statistics.commentCount': 'commentCount'})

# Keep one row per video on the right-hand side: a repeated id here would
# multiply the matching rows of df in the left merge below.
title_df = title_df.drop_duplicates(subset='video_id')

# Merge to add the title and count columns. Both frames carry 'channel_id', so
# pandas suffixes them as channel_id_x / channel_id_y.
df = pd.merge(df, title_df, left_on='id', right_on='video_id', how='left')

# Apply the extraction function to all titles; a missing title is scored as "".
title_feature_df = df["title"].fillna('').apply(extract_title_features).apply(pd.Series)

# Append the title features column-wise (both frames share the same index).
df = pd.concat([df, title_feature_df], axis=1)

# Expose the API's channel id under the plain 'channel_id' name.
df = df.rename(columns={'snippet.channelId': 'channel_id'})

In [None]:
# "High performance" threshold: the per-channel quantile a video must exceed.
percentile = 0.75

# Engagement metric -> name of its temporary per-channel threshold column.
threshold_names = {
    "viewCount": "views_threshold",
    "likeCount": "likes_threshold",
    "commentCount": "comments_threshold",
}
metric_columns = list(threshold_names)

# Per-channel quantile of each metric, one row per channel_id.
thresholds = (
    df[["channel_id"] + metric_columns]
    .groupby("channel_id")
    .quantile(percentile)[metric_columns]
    .rename(columns=threshold_names)
    .reset_index()
)

# Attach each video's channel thresholds (the columns are dropped again below).
df = df.merge(thresholds, on="channel_id", how="left")

# A video is labelled viral only when it strictly exceeds its channel's
# threshold on views AND likes AND comments.
above_views = df["viewCount"] > df["views_threshold"]
above_likes = df["likeCount"] > df["likes_threshold"]
above_comments = df["commentCount"] > df["comments_threshold"]
df["viral"] = (above_views & above_likes & above_comments).astype(int)

# The thresholds were only needed for labelling.
df = df.drop(columns=list(threshold_names.values()))

# (OPTIONAL FOR DEBUGGING) Share of rows labelled viral, as a percentage.
viral_percentage = (df["viral"].sum() / len(df)) * 100

print(f"Percentage of viral videos: {viral_percentage:.2f}%")

Percentage of viral videos: 12.73%


In [None]:
# Dropping bad columns/features
columns_to_drop = [
    # API bookkeeping and identifiers
    'kind', 'etag', 'id',
    'channel_id_x', 'channel_id_y',
    # snippet metadata that is not used as a feature
    'snippet.publishedAt', 'snippet.title', 'title',
    'snippet.channelTitle', 'snippet.categoryId',
    'snippet.liveBroadcastContent',
    'snippet.defaultAudioLanguage', 'snippet.defaultLanguage',
    'snippet.localized.description',
    # thumbnail URLs and dimensions for every size variant
    'snippet.thumbnails.default.url', 'snippet.thumbnails.default.width', 'snippet.thumbnails.default.height',
    'snippet.thumbnails.medium.url', 'snippet.thumbnails.medium.width', 'snippet.thumbnails.medium.height',
    'snippet.thumbnails.high.url', 'snippet.thumbnails.high.width', 'snippet.thumbnails.high.height',
    'snippet.thumbnails.standard.url', 'snippet.thumbnails.standard.width', 'snippet.thumbnails.standard.height',
    'snippet.thumbnails.maxres.url', 'snippet.thumbnails.maxres.width', 'snippet.thumbnails.maxres.height',
    # raw statistics columns
    'statistics.viewCount', 'statistics.likeCount',
    'statistics.commentCount', 'statistics.favoriteCount',
]

# Not every listed column is necessarily present, so drop only those that
# actually exist in the DataFrame.
present_columns = set(df.columns)
existing_columns_to_drop = list(filter(present_columns.__contains__, columns_to_drop))

df = df.drop(columns=existing_columns_to_drop)

In [None]:
# Renaming columns for prettier features. Keys that are not present in df are
# simply ignored by rename.
pretty_column_names = {
    'snippet.description': 'description',
    'snippet.localized.title': 'title',
    'statistics.viewCount': 'viewCount',
    'statistics.likeCount': 'likeCount',
    'statistics.commentCount': 'commentCount',
    'snippet.tags': 'tags',
}
df = df.rename(columns=pretty_column_names)

In [None]:
### Description-based features

import re
from textblob import TextBlob

# Phrases whose presence in a description sets description_has_keywords.
keywords = ['subscribe', 'giveaway', 'limited time', 'offer', 'new video']


def _description_polarity(text):
    """TextBlob sentiment polarity of one description."""
    return TextBlob(text).sentiment.polarity


def _mentions_keyword(text):
    """True when any entry of `keywords` occurs in the lower-cased text."""
    lowered = text.lower()
    return any(kw in lowered for kw in keywords)


# Missing descriptions become empty strings so the string operations below work.
df['description'] = df['description'].fillna('')

# Length in characters, sentiment polarity, and the keyword flag.
df['description_length'] = df['description'].map(len)
df['description_sentiment'] = df['description'].map(_description_polarity)
df['description_has_keywords'] = df['description'].map(_mentions_keyword)

In [None]:
### Tag-based features
import ast


def _parse_tags(raw):
    """Turn one raw `tags` cell into a list of clean tag strings.

    The column holds the text form of a Python list (e.g. "['a', 'b']"), so it
    is parsed with ast.literal_eval. If the text is not a list/tuple/set literal
    it falls back to a plain comma split. Empty input yields [].
    """
    text = raw.strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        candidates = [str(tag) for tag in parsed]
    else:
        candidates = text.split(',')
    return [tag.strip() for tag in candidates if tag.strip()]


# Fill NA for tags
df['tags'] = df['tags'].fillna('').astype(str)

# Tag sentiment (computed on the raw tag text)
df['tag_sentiment'] = df['tags'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Unique tags, compared case-insensitively. Parsing first keeps brackets and
# quotes out of the tokens and keeps tags that contain commas in one piece.
df['num_unique_tags'] = df['tags'].apply(lambda x: len({tag.lower() for tag in _parse_tags(x)}))

In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.7-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.7-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.7


In [None]:
### Title-based Semantic features

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import textstat

# Sentence-embedding model used for the title vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')


def _embed_title(text):
    """Encode one title with `model` (convert_to_tensor=False)."""
    return model.encode(text, convert_to_tensor=False)


# Compute title embeddings; missing titles are embedded as the empty string.
df['title_embedding'] = df['title'].fillna('').apply(_embed_title)

# Embeddings of the titles labelled viral, for distance-to-viral comparisons.
is_viral = df['viral'] == 1
viral_embeddings = df.loc[is_viral, 'title_embedding'].tolist()

def max_cosine_similarity(vec, others):
    """Return the highest cosine similarity between `vec` and any row of `others`.

    Args:
        vec: One embedding vector.
        others: A sequence (list or array) of embedding vectors to compare with.

    Returns:
        The maximum similarity, or 0 when `others` is None or empty.
    """
    # An explicit length check is used because the truth value of a NumPy array
    # is ambiguous, so `if others` only works for plain lists.
    if others is None or len(others) == 0:
        return 0
    return np.max(cosine_similarity([vec], others)[0])

# Clickbait phrase match
clickbait_phrases = ['you won’t believe', 'what happens next', 'this is why', 'top secret', 'never seen before']

# Every feature below treats a missing title as the empty string; without this
# the .str accessors return NaN for such rows and astype(int) fails.
_titles = df['title'].fillna('')

# Fold the typographic apostrophe (U+2019) into the ASCII one on both sides so
# "won’t" and "won't" compare equal.
_titles_folded = _titles.str.lower().str.replace('\u2019', "'", regex=False)
_phrases_folded = [phrase.lower().replace('\u2019', "'") for phrase in clickbait_phrases]
df['clickbait_phrase_match'] = _titles_folded.apply(lambda t: any(p in t for p in _phrases_folded)).astype(int)

# Readability score (Flesch reading ease of the title text)
df['title_readability'] = _titles.apply(lambda x: textstat.flesch_reading_ease(x))

# Listicle and tutorial flags: the title starts with digits / with "how to",
# ignoring surrounding whitespace.
df['is_listicle'] = _titles.str.strip().str.match(r'^\d+').astype(int)
df['is_tutorial'] = _titles.str.strip().str.lower().str.startswith('how to').astype(int)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
### Thumbnail-based features
import cv2
import os
from collections import Counter
from matplotlib.colors import rgb_to_hsv

# Directory holding the thumbnail images, one file per video named
# <video_id>.jpg. It is the same relative "thumbnails" directory the download
# step writes to, so the lookup does not depend on an absolute,
# environment-specific path.
thumbnail_dir = "thumbnails"

# Face detection setup: the frontal-face Haar cascade that ships with OpenCV.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def get_thumbnail_features(video_id):
    """Compute face count, dominant hue and edge density for one thumbnail.

    Args:
        video_id: Id of the video; the image is read from
            <thumbnail_dir>/<video_id>.jpg.

    Returns:
        pd.Series indexed by 'num_faces', 'dominant_color_hue' and
        'thumbnail_edge_density'. When the file is missing or cannot be decoded
        the placeholder values 0, -1 and 0.0 are returned instead.
    """
    feature_names = ['num_faces', 'dominant_color_hue', 'thumbnail_edge_density']

    path = os.path.join(thumbnail_dir, f"{video_id}.jpg")
    if not os.path.exists(path):
        return pd.Series([0, -1, 0.0], index=feature_names)

    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None rather than
        # raising; treat it like a missing thumbnail instead of crashing below.
        return pd.Series([0, -1, 0.0], index=feature_names)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Face count (scaleFactor=1.1, minNeighbors=4)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    num_faces = len(faces)

    # Dominant color hue: the most frequent value of the HSV hue channel
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hue_channel = hsv[:, :, 0].flatten()
    most_common_hue = Counter(hue_channel).most_common(1)[0][0]

    # Edge density: share of pixels marked as edges by Canny (thresholds 100/200)
    edges = cv2.Canny(gray, 100, 200)
    edge_density = np.sum(edges > 0) / edges.size

    return pd.Series([num_faces, most_common_hue, edge_density], index=feature_names)

# Compute the thumbnail features per video id and store them as three columns.
thumbnail_feature_columns = ['num_faces', 'dominant_color_hue', 'thumbnail_edge_density']
thumbnail_feature_frame = df['video_id'].apply(get_thumbnail_features)
df[thumbnail_feature_columns] = thumbnail_feature_frame

In [None]:
### Word Count Refinement

def count_matches(text, word_list):
    """Count how many entries of `word_list` occur in `text` as whole words/phrases.

    Matching is case-insensitive. An entry only counts when it is delimited by
    non-word characters (or the string edges), so "now" does not match inside
    "know" and "top" does not match inside "stops", while multi-word entries
    such as "this week" are still found. Each entry counts at most once.
    """
    lowered = text.lower()
    return sum(
        1
        for word in word_list
        if re.search(r"(?<!\w)" + re.escape(word.lower()) + r"(?!\w)", lowered)
    )

import ast


def _parse_tags(raw):
    """Turn one raw `tags` cell into a list of clean tag strings.

    The column holds the text form of a Python list (e.g. "['a', 'b']"), so it
    is parsed with ast.literal_eval. If the text is not a list/tuple/set literal
    it falls back to a plain comma split. Empty input yields [].
    """
    text = raw.strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        candidates = [str(tag) for tag in parsed]
    else:
        candidates = text.split(',')
    return [tag.strip() for tag in candidates if tag.strip()]


# Number of power / timed keywords found in each title (missing title -> "").
df['power_word_count'] = df['title'].fillna('').apply(lambda x: count_matches(x, power_words))
df['timed_word_count'] = df['title'].fillna('').apply(lambda x: count_matches(x, timed_words))

# Tag statistics are computed on the parsed tags, so the brackets, quotes and
# separator spaces of the list text no longer count toward the tag lengths.
_tag_lists = df['tags'].fillna('').astype(str).apply(_parse_tags)
df['num_tags'] = _tag_lists.apply(len)
df['avg_tag_length'] = _tag_lists.apply(lambda tags: np.mean([len(tag) for tag in tags]) if tags else 0)

In [None]:
# Sanity check to see if less interesting features are at all useful:
# a column whose total is 0 never fires on this dataset.

# Column totals
timed_word_count_sum = df['timed_word_count'].sum()
power_word_count_sum = df['power_word_count'].sum()
is_listicle_sum = df['is_listicle'].sum()
is_tutorial_sum = df['is_tutorial'].sum()
num_faces_sum = df['num_faces'].sum()
true_clickbait_matches = df['clickbait_phrase_match'].sum()

# One report line per feature, printed in a single call.
report_lines = [
    f"Total number of timed_word_count values: {timed_word_count_sum}",
    f"Total number of power_word_count values: {power_word_count_sum}",
    f"Total number of is_listicle values: {is_listicle_sum}",
    f"Total number of is_tutorial values: {is_tutorial_sum}",
    f"Total number of num_faces values: {num_faces_sum}",
    f"Total number of true clickbait_phrase_match values: {true_clickbait_matches}",
]
print("\n".join(report_lines))

Total number of timed_word_count values: 0
Total number of power_word_count values: 14
Total number of is_listicle values: 2
Total number of is_tutorial values: 0
Total number of num_faces values: 27.0
Total number of true clickbait_phrase_match values: 0


In [None]:
pip install pytrends

Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2


In [None]:
from typing import List
from pytrends.request import TrendReq
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the SentenceTransformer model once, when this cell runs, so the scoring
# functions below can share it through the module-level MODEL name.
# cache_folder points the model files at ./model_cache.
# NOTE(review): an earlier cell builds the same 'all-MiniLM-L6-v2' model as
# `model`; confirm whether two instances are really needed.
MODEL = SentenceTransformer(
    "all-MiniLM-L6-v2",
    cache_folder="./model_cache"
)

def get_google_trend_score(keywords: List[str]) -> float:
    """
    Queries Google Trends (timeframe "now 1-d") for the given keywords and
    returns the mean interest across them at the most recent data point,
    scaled from 0-100 down to 0.0-1.0.

    A single string is accepted and treated as one keyword. Returns 0.0 when
    there are no keywords, when Trends returns no data, or when the lookup
    fails (a RuntimeWarning describes the failure).
    """
    import warnings

    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]
    # A real list is needed for the column selection below.
    keywords = list(keywords)
    if not keywords:
        return 0.0

    try:
        pytrends = TrendReq()
        pytrends.build_payload(keywords, timeframe="now 1-d")
        data = pytrends.interest_over_time()
        if data.empty:
            return 0.0
        # values range from 0–100
        return float(data[keywords].iloc[-1].mean() / 100)
    except Exception as exc:
        # Best-effort feature: keep the pipeline running, but say why it scored 0.
        warnings.warn(f"Google Trends lookup failed, scoring 0.0: {exc!r}", RuntimeWarning)
        return 0.0

def get_twitter_trend_score(keywords: List[str]) -> float:
    """
    Fetches Twitter's trending page, collects the text of every <span>
    (leading "#" removed), and returns the number of keywords contained in at
    least one of those texts divided by the number of spans, capped at 1.0.

    A single string is accepted and treated as one keyword. Returns 0.0 when
    there are no keywords or when the request/parsing fails (a RuntimeWarning
    describes the failure).

    NOTE(review): only the static HTML of the page is parsed; confirm that the
    trend text is actually present there without JavaScript or a login.
    """
    import warnings

    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]
    keywords = list(keywords)
    if not keywords:
        return 0.0

    try:
        url = "https://twitter.com/explore/tabs/trending"
        # The timeout keeps one stalled request from blocking a whole DataFrame apply.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        # Lower-case once here instead of inside the nested comparison.
        hashtags = [tag.get_text().lstrip("#").lower() for tag in soup.find_all("span")]
        matches = sum(
            1 for kw in keywords
            if any(kw.lower() in h for h in hashtags)
        )
        max_count = max(len(hashtags), 1)
        # With fewer spans than matching keywords the ratio could exceed 1;
        # cap it so the score stays in the 0.0-1.0 range.
        return min(matches / max_count, 1.0)
    except Exception as exc:
        warnings.warn(f"Twitter trend lookup failed, scoring 0.0: {exc!r}", RuntimeWarning)
        return 0.0

def get_youtube_trending_similarity(keywords: List[str]) -> float:
    """
    Fetches YouTube's trending page, reads the `title` attribute of every
    `a#video-title` element, and returns the maximum cosine similarity between
    the space-joined keywords and any of those titles, using the preloaded
    SentenceTransformer MODEL.

    A single string is accepted and treated as one keyword. Returns 0.0 when
    there are no keywords, when no titles are found, or when the
    request/encoding fails (a RuntimeWarning describes the failure).

    NOTE(review): only the static HTML of the page is parsed; confirm that the
    video titles are actually present there without JavaScript.
    """
    import warnings

    # A bare string would otherwise be joined character by character below.
    if isinstance(keywords, str):
        keywords = [keywords]
    keywords = list(keywords)
    if not keywords:
        return 0.0

    try:
        # The timeout keeps one stalled request from blocking a whole DataFrame apply.
        resp = requests.get("https://www.youtube.com/feed/trending", timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        titles = [
            t["title"] for t in soup.select("a#video-title")
            if t.get("title")
        ]
        if not titles:
            return 0.0

        # Encode the query once, and all trending titles in a single batch.
        query_emb = MODEL.encode(" ".join(keywords), convert_to_numpy=True)
        title_embs = MODEL.encode(titles, convert_to_numpy=True)

        # One (1 x n_titles) similarity row; its maximum is the score.
        sims = cosine_similarity(query_emb.reshape(1, -1), title_embs)[0]
        return float(sims.max())
    except Exception as exc:
        warnings.warn(f"YouTube trending lookup failed, scoring 0.0: {exc!r}", RuntimeWarning)
        return 0.0

def calculate_trending_score(
    keywords: List[str],
    w1: float = 0.4,
    w2: float = 0.3,
    w3: float = 0.3
) -> float:
    """
    Combines Google Trends, Twitter, and YouTube similarity scores
    into a single trending_score.

    Args:
        keywords: Keywords to score. A single string (for example a whole
            title) is accepted and treated as one keyword; a blank string is
            treated as no keywords.
        w1: Weight of the Google Trends score.
        w2: Weight of the Twitter score.
        w3: Weight of the YouTube similarity score.

    Returns:
        w1 * google + w2 * twitter + w3 * youtube, rounded to 4 decimals.

    Each call performs the network lookups of all three source functions.
    """
    # The source functions iterate over `keywords`; a bare string would be
    # processed character by character, so wrap it into a one-element list.
    if isinstance(keywords, str):
        keywords = [keywords] if keywords.strip() else []

    g = get_google_trend_score(keywords)
    t = get_twitter_trend_score(keywords)
    y = get_youtube_trending_similarity(keywords)
    return round(w1 * g + w2 * t + w3 * y, 4)

# calculate_trending_score expects a list of keywords, so each title is passed
# as a single-keyword list (a missing or blank title becomes an empty list).
df['trending_score'] = df['title'].fillna('').apply(
    lambda title: calculate_trending_score([title] if title.strip() else [])
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Remove description features (the raw text and the three columns derived from it)
columns_to_drop = [
    'description',
    'description_length',
    'description_sentiment',
    'description_has_keywords',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Leave only features, remove conceptual data (title, tags, etc.)
columns_to_drop = [
    # raw text and identifiers
    'title', 'tags', 'video_id', 'channel_id',
    # engagement counts the 'viral' label was derived from
    'likeCount', 'viewCount', 'commentCount',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Remove 'title_embedding' until we can find a way to use it with our model
columns_to_drop = [
    'title_embedding',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# List the columns that remain after the drops above; this is the set written out below.
print(df.columns)

Index(['avg_red', 'avg_green', 'avg_blue', 'brightness', 'contrast',
       'title_sentiment', 'title_subjectivity', 'num_question_marks',
       'num_exclamation_marks', 'starts_with_keyword', 'title_length',
       'word_count', 'punctuation_count', 'uppercase_word_count',
       'percent_letters_uppercase', 'num_digits', 'clickbait_score',
       'num_power_words', 'num_timed_words', 'viral', 'tag_sentiment',
       'num_unique_tags', 'clickbait_phrase_match', 'title_readability',
       'is_listicle', 'is_tutorial', 'num_faces', 'dominant_color_hue',
       'thumbnail_edge_density', 'power_word_count', 'timed_word_count',
       'num_tags', 'avg_tag_length', 'trending_score'],
      dtype='object')


In [None]:
# Export the final frame to feature_data.csv in the working directory;
# index=False leaves the row index out of the file.
df.to_csv("feature_data.csv", index=False)