In [3]:
import pandas as pd
import json
import os

# Define base folders (assuming script is inside 'scripts/')
base_dir = os.path.abspath('..')
data_dir = os.path.join(base_dir, 'data')
cleaned_dir = os.path.join(base_dir, 'cleaned')

# Create cleaned directory if it doesn't exist
os.makedirs(cleaned_dir, exist_ok=True)

# List of country codes based on your files
country_codes = ['US', 'CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU']

for code in country_codes:
    csv_file = os.path.join(data_dir, f"{code}videos.csv")
    json_file = os.path.join(data_dir, f"{code}_category_id.json")
    
    try:
        # Load video data
        df = pd.read_csv(csv_file)

        # Load and parse JSON
        with open(json_file, 'r') as f:
            categories = json.load(f)

        # Create category mapping
        category_mapping = {
            int(item['id']): item['snippet']['title']
            for item in categories['items']
        }

        # Map category names
        df['category_name'] = df['category_id'].map(category_mapping)

        # Clean data
        df.drop_duplicates(inplace=True)
        df.dropna(subset=['video_id', 'title', 'category_id'], inplace=True)

        # Save cleaned file
        output_file = os.path.join(cleaned_dir, f"{code}videos_cleaned.csv")
        df.to_csv(output_file, index=False)

        print(f"[✔] Cleaned and saved: {output_file}")

    except Exception as e:
        print(f"[✘] Failed for {code}: {e}")


[✔] Cleaned and saved: c:\Users\srvsh\OneDrive\Desktop\sms\cloud_project\youtube-data-analytics-pipeline\cleaned\USvideos_cleaned.csv
[✔] Cleaned and saved: c:\Users\srvsh\OneDrive\Desktop\sms\cloud_project\youtube-data-analytics-pipeline\cleaned\CAvideos_cleaned.csv
[✔] Cleaned and saved: c:\Users\srvsh\OneDrive\Desktop\sms\cloud_project\youtube-data-analytics-pipeline\cleaned\DEvideos_cleaned.csv
[✔] Cleaned and saved: c:\Users\srvsh\OneDrive\Desktop\sms\cloud_project\youtube-data-analytics-pipeline\cleaned\FRvideos_cleaned.csv
[✔] Cleaned and saved: c:\Users\srvsh\OneDrive\Desktop\sms\cloud_project\youtube-data-analytics-pipeline\cleaned\GBvideos_cleaned.csv
[✔] Cleaned and saved: c:\Users\srvsh\OneDrive\Desktop\sms\cloud_project\youtube-data-analytics-pipeline\cleaned\INvideos_cleaned.csv
[✘] Failed for JP: 'utf-8' codec can't decode bytes in position 215323-215324: invalid continuation byte
[✘] Failed for KR: 'utf-8' codec can't decode bytes in position 261291-261292: invalid cont