# Clean Dataset

In [1]:
# common imports
import random
import json
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Extract country codes
import glob
import os


def extract_country_code(path):
    """Return the country code that prefixes a trending-data file name.

    Only the final path component is inspected, so the result does not
    depend on the platform's path separator or on how many directories
    precede the file (``glob`` returns ``data\\BR_...`` on Windows, which a
    plain ``split("/")`` cannot handle).

    Example: ``data/BR_youtube_trending_data.csv`` -> ``"BR"``.
    """
    return os.path.basename(path).split("_")[0]


# One code per "<CODE>_youtube_trending_data.csv" file found in data/,
# sorted so that later processing and log output are deterministic.
countries = sorted(
    extract_country_code(path)
    for path in glob.glob("data/*_youtube_trending_data.csv")
)
print("Countries: ", countries)

Countries:  ['BR', 'CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']


In [34]:
# Scratch check of boolean logic: an empty list and False are both falsy,
# so the combined condition holds and the message is printed.
if not ([] or False):
    print("Not empty or Not forced")


Not empty or Not forced


In [62]:
def get_data(country):
    """Load the raw trending CSV for ``country`` with normalised column names.

    Reads ``data/<country>_youtube_trending_data.csv`` and renames its
    columns according to the mapping below. Columns that are not listed in
    the mapping are passed through unchanged.

    Returns a new DataFrame; nothing is cached or written.
    """
    source_path = f"data/{country}_youtube_trending_data.csv"

    header_map = {
        # headers that actually change
        "video_id": "id",
        "publishedAt": "published_at",
        "channelId": "channel_id",
        "channelTitle": "channel_title",
        "categoryId": "category_id",
        # headers that already have their final name (kept for reference)
        "title": "title",
        "trending_date": "trending_date",
        "tags": "tags",
        "view_count": "view_count",
        "likes": "likes",
        "dislikes": "dislikes",
        "comment_count": "comment_count",
        "thumbnail_link": "thumbnail_link",
        "comments_disabled": "comments_disabled",
        "ratings_disabled": "ratings_disabled",
        "description": "description",
    }

    return pd.read_csv(source_path).rename(columns=header_map)


def get_categories(country):
    """Return the category lookup table as a DataFrame with ``id`` and ``name``.

    The US category file is used for every country: all datasets except the
    US one are missing categoryId 29, so the US file serves as the common
    base.

    Parameters
    ----------
    country : str
        Currently unused; every call reads ``data/US_category_id.json``.
        The parameter is kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the file's ``"items"`` list, with the integer
        category ``id`` and the ``name`` taken from ``snippet.title``.
    """
    # JSON is UTF-8 by specification; pass the encoding explicitly so the
    # result does not depend on the platform's locale default.
    with open("data/US_category_id.json", encoding="utf-8") as file:
        raw_items = json.load(file)["items"]

    # Keyed by id, so a repeated id keeps only its last title.
    names_by_id = {int(item["id"]): item["snippet"]["title"] for item in raw_items}

    return pd.DataFrame(
        data={
            "id": list(names_by_id.keys()),
            "name": list(names_by_id.values()),
        }
    )


def clean_data(data):
    """Deduplicate videos and drop the columns the analysis does not use.

    For rows sharing the same ``id`` only the last occurrence is kept, then
    ``thumbnail_link`` and ``description`` are removed. The input frame is
    left untouched and the surviving rows keep their original index labels.
    """
    columns_to_discard = ["thumbnail_link", "description"]

    return (
        data
        .drop_duplicates(subset=["id"], keep="last")
        .drop(columns=columns_to_discard)
    )


# Clean every country's dataset and write it to data/<country>_cleaned_data.csv.
for country in countries:
    # get data
    df = get_data(country)
    categories_df = get_categories(country)

    print(f"Country {country} has {len(df)} rows and {len(categories_df)} categories")

    # remove categories with no data
    current_categories = df["category_id"].unique()
    current_categories.sort()
    categories_df = categories_df[categories_df["id"].isin(current_categories)]

    print(f"{len(categories_df)} categories after cleaning")

    df = clean_data(data=df)

    # Translate category ids to names with one dictionary lookup per row.
    # Filtering categories_df inside a per-row lambda does the same job but
    # builds a boolean mask and a filtered frame for every single row.
    category_names = dict(zip(categories_df["id"], categories_df["name"]))
    category = df["category_id"].map(category_names)

    # An id without a name means the category file and the data disagree;
    # fail with the offending ids rather than writing blank categories.
    unmapped_ids = df.loc[category.isna(), "category_id"].unique()
    if len(unmapped_ids) > 0:
        raise ValueError(
            f"{country}: no category name found for ids {unmapped_ids.tolist()}"
        )

    # Same column position as before, now holding names under "category".
    df = df.assign(category_id=category).rename(columns={"category_id": "category"})

    print(f"{len(df)} rows after cleaning\n\n")

    df.to_csv(f"data/{country}_cleaned_data.csv", index=False)

Country BR has 119792 rows and 32 categories
15 categories after cleaning
18853 rows after cleaning


Country CA has 119744 rows and 32 categories
15 categories after cleaning
23483 rows after cleaning


Country DE has 119743 rows and 32 categories
15 categories after cleaning
27947 rows after cleaning


Country FR has 119791 rows and 32 categories
15 categories after cleaning
24792 rows after cleaning


Country GB has 119795 rows and 32 categories
15 categories after cleaning
22856 rows after cleaning


Country IN has 113910 rows and 32 categories
15 categories after cleaning
33458 rows after cleaning


Country JP has 119787 rows and 32 categories
15 categories after cleaning
16374 rows after cleaning


Country KR has 116754 rows and 32 categories
15 categories after cleaning
14816 rows after cleaning


Country MX has 119599 rows and 32 categories
15 categories after cleaning
16993 rows after cleaning


Country RU has 118130 rows and 32 categories
15 categories after cleaning
71836 ro