In [14]:
# Import reusable path setup
from setup_paths import setup_project_paths

# Ask the shared helper for the project's directory mapping, then keep the
# raw and processed locations as notebook-level constants.
# NOTE(review): the cells visible below use literal 'data/...' paths rather
# than these constants — confirm whether they are meant to be used instead.
paths = setup_project_paths()
RAW_DIR, PROCESSED_DIR = paths["RAW_DIR"], paths["PROCESSED_DIR"]

In [5]:
# 02_data_cleaning.ipynb
import os
import pandas as pd

In [6]:
# Create the output folder up front; exist_ok=True makes re-running this
# cell harmless when the folder is already there.
os.makedirs(name='data/processed', exist_ok=True)

In [8]:
# Read the raw CSV into `df`. If the file is absent, stop immediately with a
# message pointing at the notebook that is expected to produce it.
raw_csv_path = 'data/raw/trending_videos.csv'
if os.path.exists(raw_csv_path):
    df = pd.read_csv(raw_csv_path)
else:
    raise FileNotFoundError(f"{raw_csv_path} not found. Please run 01_data_collection.ipynb first.")

In [9]:
# Parse the published_date column into pandas datetimes, replacing the
# original column in place.
df['published_date'] = df['published_date'].pipe(pd.to_datetime)

In [10]:
# Treat a missing like/comment count as zero so the arithmetic in later
# cells does not propagate NaN from these two columns.
for count_col in ('likes', 'comments'):
    df[count_col] = df[count_col].fillna(0)

In [11]:
# Feature engineering
#
# title_length: number of characters in the title. `.str.len()` is the
# vectorized equivalent of `.apply(len)`. A missing title (NaN) is treated
# as an empty string, so it yields length 0 instead of raising
# "TypeError: object of type 'float' has no len()".
df['title_length'] = df['title'].fillna('').astype(str).str.len()

# engagement_ratio: (likes + comments) per view. A row with 0 views has no
# meaningful ratio; dividing by it would produce inf (or NaN for 0/0), which
# then distorts means, sorts and plots downstream. Mask zero views to NaN so
# such rows get a NaN ratio. Rows with non-zero views are unaffected.
df['engagement_ratio'] = (df['likes'] + df['comments']) / df['views'].where(df['views'] != 0)

In [12]:
# Write the cleaned frame to CSV without the index column, report where it
# went, and show the first five rows as the cell's output.
processed_csv_path = 'data/processed/trending_videos_clean.csv'
df.to_csv(path_or_buf=processed_csv_path, index=False)
print(f"Cleaned data saved to {processed_csv_path}")

df.head(n=5)

Cleaned data saved to data/processed/trending_videos_clean.csv


Unnamed: 0,video_id,title,channel,views,likes,comments,published_date,category_id,title_length,engagement_ratio
0,3cT5ML2l7KQ,VonOff1700 - Seen First (Official Video),VonOff1700,134175,10464,411,2025-10-21 19:00:02+00:00,10,40,0.081051
1,i36Zw32GfRQ,Reminders of Him | Official Trailer,Universal Pictures,5230773,10188,483,2025-10-21 15:00:23+00:00,24,35,0.00204
2,9_ofCQ0eOTc,Taco Tuesday Admin Abuse + New BRAINROTS!,CaylusBlox,2008316,32729,992,2025-10-21 23:07:13+00:00,20,41,0.016791
3,pCv0oP9JLKw,Morgan Wallen - 20 Cigarettes (Official Music ...,MorganWallenVEVO,601989,22640,1864,2025-10-20 18:01:00+00:00,10,52,0.040705
4,2DuX1l-clGY,Honest Trailers | The Thing,Screen Junkies,174906,12822,1075,2025-10-21 17:00:03+00:00,1,27,0.079454
