# Setup

In [30]:
import os
import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd

In [31]:
# Channels to analyse, grouped by category. Each name is used verbatim as the
# search query when resolving the channel id and is stored as `channel_name`,
# so entries must not carry stray leading/trailing whitespace.
channels_by_category = {
    'Auto & Vehicles': [
        'Stokes twins squad',
        'DASH CAM CN',
        'Supercar Blondie',
        'The Classic Motorcycle Channel',
        'Salai Sathish777',
        'Exotic Rubber',
        'Sarah Lezito',
        'San Razka',
        'Suvo Stunts',
        'Nikolai Savic',
    ],
    'Entertainment': [
        'Anaya Kandhal',
        'Toys and Colors',
        'J House jr.',
        'Sierra & Rhia FAM',
        'KL BRO Biju Rithvik',
        'StarPlus',
        'MrBeast',
        'HAR PAL GEO',
        'Upin & Ipin',
        'Dylan Anderson',
    ],
    'Gaming': [
        'Linh Nhi Shorts',
        'Animal World',
        'LankyBox',
        'Kissy Show',
        'FACT FIRE KING',
        'Daquavis',
        'Sinotal Gaming',
        'LetsPlay',  # was ' LetsPlay' (leading space leaked into query and CSV)
        'Nahz',
        'Canva India',
    ],
    'Sports': [
        'Celine Dept',
        'WWE',
        'Red Bull',
        'House of Highlights',
        'YOLO AVENTURAS',
        'SportsNation',
        'UR · Cristiano',
        'Omar Raja - ESPN',
        'ESPN',
        'Score 90 Shorts',
    ],
    'Music': [
        'Seyhan Müzik',
        'T-Series',
        'SHADE Of Love',
        'Sony Music South',
        'El Payaso Plim Plim',
        'Saregama Music',
        'Zee Music Company',
        'Prvnci',
        'Tips Official',
        'The Weeknd',
    ],
}

In [32]:
# Read-only YouTube scope. Note: it is not passed to build() below, which
# authenticates with the developer key only.
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

# The API key is kept out of the notebook: it is the first line of a local
# file, with surrounding whitespace/newline removed.
with open('secret/secret.txt') as key_file:
    api_key = key_file.readline().strip()

# Client object used by every request cell further down.
youtube = googleapiclient.discovery.build(
    "youtube",
    "v3",
    developerKey=api_key,
)

# Load and Cache

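A full, uncached run of this section uses 8900 tokens, so it can only be done once a day. Results are cached on disk, and later runs read the cache instead of calling the API.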

In [33]:
# Folder holding the cached API results; created on first run.
cache_path = ".cache"
cache_exists = os.path.exists(cache_path)
if not cache_exists:
    os.makedirs(cache_path)

# One CSV per dataset inside the cache folder.
channel_data_path = f"{cache_path}/channel_data.csv"
video_data_path = f"{cache_path}/video_data.csv"
comment_data_path = f"{cache_path}/comment_data.csv"

# Flags telling the loading cells whether they can read from disk instead of
# calling the API.
channel_data_exist, video_data_exist, comment_data_exist = map(
    os.path.exists,
    (channel_data_path, video_data_path, comment_data_path),
)

Loads the channel data, reading from the cache when it exists.
A fresh fetch uses 100 tokens.

In [None]:
# Build `channel_data`: one row per channel with its id, category and basic
# statistics. The cached CSV is used when present so no API quota is spent.
if channel_data_exist:
    channel_data = pd.read_csv(channel_data_path)
else:
    channel_columns = ["channel_id", "category", "channel_name", "subscriber_count", "video_count", "view_count"]
    channel_rows = []

    # Flatten category -> channels into a single sequence so that `break`
    # below stops the whole fetch, not just the current category.
    channel_queries = [
        (category, channel)
        for category, channels in channels_by_category.items()
        for channel in channels
    ]

    for category, channel in channel_queries:
        try:
            # Step 1: resolve the channel name to a channel id via search.
            id_response = youtube.search().list(
                part="id",
                maxResults=1,
                type="channel",
                q=channel,
                order="viewCount"
            ).execute()

            id_items = id_response.get("items", [])
            if not id_items:
                print(f"Error on channel: {channel}: no search result")
                continue
            channel_id = id_items[0]["id"]["channelId"]

            # Step 2: fetch the statistics for that channel id.
            response = youtube.channels().list(
                part="statistics",
                id=channel_id
            ).execute()

            stat_items = response.get("items", [])
            if not stat_items:
                print(f"Error on channel: {channel}: no statistics returned")
                continue
            statistics = stat_items[0]["statistics"]

            # .get() because individual counters can be absent from the
            # response (e.g. a hidden subscriber count); those become NaN.
            channel_rows.append({
                "channel_id": channel_id,
                "category": category,
                "channel_name": channel,
                "subscriber_count": statistics.get("subscriberCount"),
                "video_count": statistics.get("videoCount"),
                "view_count": statistics.get("viewCount")
            })
        except googleapiclient.errors.HttpError as e:
            print(f"Error on channel: {channel}: {e}")
            # 403 (quota exhausted / forbidden): further requests would fail
            # too, so stop instead of burning more calls.
            if e.resp.status == 403:
                break
        except Exception as e:
            # Best effort: report the problem and move on to the next channel.
            print(f"Error on channel: {channel}: {e}")

    # Build the frame once from the collected records (pd.concat cannot take
    # plain dicts, and growing a frame row by row is quadratic).
    channel_data = pd.DataFrame(channel_rows, columns=channel_columns)

    # The API returns counters as strings; convert them so a fresh run has the
    # same dtypes as a run that reads the CSV cache.
    for count_column in ["subscriber_count", "video_count", "view_count"]:
        channel_data[count_column] = pd.to_numeric(channel_data[count_column], errors="coerce")

    channel_data.to_csv(channel_data_path, index=False)

Loads the video data, reading from the cache when it exists.
A fresh fetch uses 550 tokens.

In [None]:
# Build `video_data`: for every channel in `channel_data`, the 10 videos that
# search returns when ordered by view count, with their statistics. The cached
# CSV is used when present so no API quota is spent.
if video_data_exist:
    video_data = pd.read_csv(video_data_path)
else:
    video_columns = ["video_id", "channel_id", "video_title", "view_count", "like_count", "dislike_count", "comment_count"]
    video_rows = []

    for channel_id in channel_data["channel_id"]:
        try:
            search_response = youtube.search().list(
                part="id",
                maxResults=10,
                type="video",
                channelId=channel_id,
                order="viewCount"
            ).execute()

            for item in search_response.get("items", []):
                video_id = item["id"]["videoId"]

                # The search above only asks for ids, so the title has to come
                # from the video resource itself: request snippet + statistics.
                video_response = youtube.videos().list(
                    part="snippet,statistics",
                    id=video_id
                ).execute()

                video_items = video_response.get("items", [])
                if not video_items:
                    # The video is no longer retrievable; skip it.
                    continue
                video = video_items[0]
                statistics = video["statistics"]

                # .get() because counters can be absent: dislikeCount is no
                # longer exposed publicly by the API, and like/comment counts
                # are omitted when hidden or disabled. Missing values -> NaN.
                # The dislike_count column is kept for schema compatibility.
                video_rows.append({
                    "video_id": video_id,
                    "channel_id": channel_id,
                    "video_title": video["snippet"]["title"],
                    "view_count": statistics.get("viewCount"),
                    "like_count": statistics.get("likeCount"),
                    "dislike_count": statistics.get("dislikeCount"),
                    "comment_count": statistics.get("commentCount")
                })
        except googleapiclient.errors.HttpError as e:
            print(f"Error on channel: {channel_id}: {e}")
            # 403 (quota exhausted / forbidden): stop instead of issuing more
            # requests that would fail as well.
            if e.resp.status == 403:
                break
        except Exception as e:
            # Best effort: report the problem and move on to the next channel.
            print(f"Error on channel: {channel_id}: {e}")

    # Build the frame once from the collected records (pd.concat cannot take
    # plain dicts, and growing a frame row by row is quadratic).
    video_data = pd.DataFrame(video_rows, columns=video_columns)

    # The API returns counters as strings; convert them so a fresh run has the
    # same dtypes as a run that reads the CSV cache.
    for count_column in ["view_count", "like_count", "dislike_count", "comment_count"]:
        video_data[count_column] = pd.to_numeric(video_data[count_column], errors="coerce")

    video_data.to_csv(video_data_path, index=False)

Loads the comment data, reading from the cache when it exists.
A fresh fetch uses 8250 tokens.

In [None]:
# Build `comment_data`: up to 15 top-level comments per video in `video_data`.
# The cached CSV is used when present so no API quota is spent.
if comment_data_exist:
    comment_data = pd.read_csv(comment_data_path)
else:
    comment_columns = ["video_id", "comment", "like_count", "reply_count"]
    comment_rows = []

    for video_id in video_data["video_id"]:
        try:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=15
            ).execute()

            for item in response.get("items", []):
                thread = item["snippet"]
                # Text and like count both live on the top-level comment's own
                # snippet; the reply count belongs to the thread.
                top_comment = thread["topLevelComment"]["snippet"]

                comment_rows.append({
                    "video_id": video_id,
                    "comment": top_comment["textOriginal"],
                    "like_count": top_comment["likeCount"],
                    "reply_count": thread["totalReplyCount"]
                })
        except googleapiclient.errors.HttpError as e:
            print(f"Error on video: {video_id}: {e}")
            # A video with comments disabled also answers 403; that only
            # affects this one video, so keep going. Any other 403 (e.g. quota
            # exhausted) would hit every following request, so stop.
            if e.resp.status == 403 and b"commentsDisabled" not in e.content:
                break
        except Exception as e:
            # Best effort: report the problem and move on to the next video.
            print(f"Error on video: {video_id}: {e}")

    # Build the frame once from the collected records (pd.concat cannot take
    # plain dicts, and growing a frame row by row is quadratic).
    comment_data = pd.DataFrame(comment_rows, columns=comment_columns)
    comment_data.to_csv(comment_data_path, index=False)

# Use Data

In [None]:
# Preview the first rows of each dataset, in load order.
display(
    channel_data.head(),
    video_data.head(),
    comment_data.head(),
)