# Setup

In [11]:
import os
import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd

In [12]:
CHANNELS_BY_CATEGORY = {
    'Auto & Vehicles': {
                'Stokes twins squad':'UCbp9MyKCTEww4CxEzc_Tp0Q', 
                'DASH CAM CN':'UCJZpPG8wegZJ3rsnc6-Bpnw', 
                'Supercar Blondie':'UC0_7evGVYla1lREeXjN_JLg',
                'The Classic Motorcycle Channel':'UCtjWY31B54jqxRFQM9xyhDw',
                'Salai Sathish777':'UCaEUPNkdnbQROROKm09SGoQ',
                'Exotic Rubber':'UCF2eaKLl9I8yXg31FW3GH2g',
                'Sarah Lezito':'UCT7dKiVfnUSRk7ckOfIwWvw',
                'San Razka':'UCcDiv0JjkAnirZvcwcZYXTw',
                'Suvo Stunts':'UCzRnN9ntMZf83mquKpT6a0w',
                'Nikolai Savic':'UCqcRe8F3rETPbzOFJHzk'
    },
    'Entertainment': {
                'Anaya Kandhal':'UCJTqi2KOenudYnEOUSAj5Hg',
                'Toys and Colors':'UCIte5_YtyuMKsSytQDgWdMA',
                'J House jr.':'UCGHGx_VOEWUE_WINENxxKbA',
                'Sierra & Rhia FAM':'UCe6n0z9UbsxYCS8P83f84tw',
                'KL BRO Biju Rithvik':'UCABNiUtltfp4piL69GD2BmA',
                'StarPlus':'UCAGZZ-Ua-yB-fqObs1GKzag',
                'MrBeast':'UClD832S2f_F_W4epwAd5sOw',
                'HAR PAL GEO':'UCNoMrALPTsDJaNiIdMk_kCw',
                'Upin & Ipin':'UCB8VtllrypUr7YP9WiYuXCA',
                'Dylan Anderson':'UCRY8HfhFV5w9toYlR8bfupQ'
    },
    'Gaming': {
                'Linh Nhi Shorts':'UCOGJ0sPUo9_39Qiru4Lx51w',
                'Animal World':'UCEexsbjV02l4bPWfyNlMJ3A',
                'LankyBox':'UCU7d91rxwcqHh-fSgCNU73Q',
                'Kissy Show':'UCRA3T1IofrU3HSys0FJGbZQ',
                'FACT FIRE KING':'UCDjrDvzuRUmwIBmY8GBOPVw',
                'Daquavis':'UCvwYaSSj-ceh5-nfSv71WKw',
                'Sinotal Gaming':'UCe6qB7dYiurwyKSxrgCdmLA',
                'LetsPlay':'UCia7dsxxnoUsJMicn-0gbTw',
                'Nahz':'UCq-BiqUyulZMOZEr0xtGOjg',
                'Canva India':'UC2V8gNWRrOrBFOm08tNSugQ'
    },
    'Sports': {
                'Celine Dept':'UCTq7Bs_Whk-WYwhU9CoezCQ',
                'WWE':'UC2NNoTGy7-nNSA1DOefkCpA',
                'Red Bull':'UCuLq0ME8tqK6pzCm9rpkwjQ',
                'House of Highlights':'UCYXbyePIPf6lJkOuVvdugQw',
                'YOLO AVENTURAS':'UCqTxve5-0sVxxBB8dX3DkFA',
                'SportsNation':'UC7dzGIZnOoGVjODipT1183Q',
                'UR · Cristiano':'UCHzgN0TNou2bCHkLsCV-tLw',
                'Omar Raja - ESPN':'UCLFhLRABMCFl5H6hS-NgrPg',
                'ESPN':'UCGVx9_dtZD0MuFdNxmNL1eg',
                'Score 90 Shorts':'UCbVtlOOHT_3VMKhuM1d9ziA'
    },
    'Music': { 
                'Seyhan Müzik':'UCNqr9QkPw2ThS6reZ_VglVA',
                'T-Series':'UCpN-WGQgGsOrXm7nHXDdTDA',
                'SHADE Of Love':'UCR5hcvWZc9XlSEK7sPlMpTw',
                'Sony Music South':'UCXNslTpQcYUNS1g6TEwQ_bQ',
                'El Payaso Plim Plim':'UCHU3DmaXhrrZ9oBQF7TnpIQ',
                'Saregama Music':'UCerohJ7AcKkilNALvAx6-_w',
                'Zee Music Company':'UCE-C1Zk9eL5K-aGJcLGurrw',
                # 'Prvnci':'',
                'Tips Official':'UCohjG0ZUU_VTvgZ_EpM0OrA',
                'The Weeknd':'UCPY2tjsvvV_Y1l_n9LxaLnQ'
    }
}

In [13]:
with open('secret/secret.txt') as f:
    api_key = f.readline().strip()

youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

# Load and Cache

loads channel data

In [14]:
def get_channel_data(channel_data_path, channels_by_category):    
    if os.path.exists(channel_data_path):
        channel_data = pd.read_csv(channel_data_path)
    else:
        channel_data = pd.DataFrame(columns=["channel_id", "category", "channel_name", "subscriber_count", "video_count", "view_count", "playlist_id"])
        for category in channels_by_category:
            for channel in channels_by_category[category]:
                channel_id = channels_by_category[category][channel]
                try:

                    request = youtube.channels().list(
                        part="statistics,contentDetail",
                        id=channel_id
                    )
                    response = request.execute()
                    try:
                        subscriber_count = response["items"][0]["statistics"]["subscriberCount"]
                    except KeyError:
                        subscriber_count = float('nan')
                    try:
                        video_count = response["items"][0]["statistics"]["videoCount"]
                    except KeyError:
                        video_count = float('nan')
                    try:
                        view_count = response["items"][0]["statistics"]["viewCount"]
                    except KeyError:
                        view_count = float('nan')

                    try:
                        playlist_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
                    except KeyError:
                        playlist_id = float('nan')
                        
                    row = pd.DataFrame({
                        "channel_id": [channel_id],
                        "category": [category],
                        "channel_name": [channel],
                        "subscriber_count": [subscriber_count],
                        "video_count": [video_count],
                        "view_count": [view_count],
                        "playlist_id": [playlist_id]
                    })

                    channel_data = pd.concat([channel_data, row], ignore_index=True)
                except Exception as e:
                    print(f"Error on channel: {channel}: {e}")
                    if hasattr(e, 'status_code') and e.status_code == 403:
                        break
                    continue
        
        if len(channel_data) > 0:
            channel_data.to_csv(channel_data_path, index=False)
    
    return channel_data

loads video data

In [15]:
def get_video_data(video_data_path, channel_data):
    if os.path.exists(video_data_path):
        video_data = pd.read_csv(video_data_path)
    else:
        video_data = pd.DataFrame(columns=["video_id", "channel_id", "video_title", "view_count", "like_count", "dislike_count", "comment_count"])
        video_ids = {}
        all_video_ids = ""
        try:
            for playlist_id in channel_data["playlist_id"]:
                
                    request = youtube.playlistItems.list(
                        part="id",
                        playlist_id=playlist_id
                    )

                    response = request.execute()

                    for item in response["items"][:10]:
                        try:
                            video_ids[item["id"]["videoId"]] = playlist_id
                            if all_video_ids == "":
                                all_video_ids += item["id"]["videoId"]
                            else:
                                all_video_ids += ',' + item["id"]["videoId"]
                        except KeyError:
                            continue

            video_request = youtube.videos().list(
                part="statistics",
                id=all_video_ids
            )
            video_response = video_request.execute()

        except Exception as e:
            print(f"Error {e}")
            video_response = {"items": []}

        for video in video_response["items"]:

            try:
                video_id = video["id"]
            except KeyError:
                video_id = float('nan')
            try:
                view_count = video["statistics"]["viewCount"]
            except KeyError:
                view_count = float('nan')
            try:
                like_count = video["statistics"]["likeCount"]
            except KeyError:
                like_count = float('nan')
            try:
                dislike_count = video["statistics"]["dislikeCount"]
            except KeyError:
                dislike_count = float('nan')
            try:
                comment_count = video["statistics"]["commentCount"]
            except KeyError:
                comment_count = float('nan')

            row = pd.DataFrame({
                "video_id": [video_id],
                "playlist_id": [video_ids[video_id]],
                "video_title": [item["snippet"]["title"]],
                "view_count": [view_count],
                "like_count": [like_count],
                "dislike_count": [dislike_count],
                "comment_count": [comment_count]
            })

            video_data = pd.concat([video_data, row], ignore_index=True)

        if len(video_data) > 0:    
            video_data.to_csv(video_data_path, index=False)
    
    return video_data

loads comment data

In [16]:
def get_comment_data(comment_data_path, video_data):
    if os.path.exists(comment_data_path):
        comment_data = pd.read_csv(comment_data_path)
    else:
        comment_data = pd.DataFrame(columns=["video_id", "comment", "like_count", "reply_count"])
        for video_id in video_data["video_id"]:
            try:
                request = youtube.commentThreads().list(
                    part="snippet",
                    videoId=video_id,
                    maxResults=15
                )
                response = request.execute()

                for item in response["items"]:
                    try:
                        comment = item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
                    except KeyError:
                        comment = float('nan')
                    try:
                        like_count = item["snippet"]["topLevelComment"]["likeCount"]
                    except KeyError:
                        like_count = float('nan')
                    try:
                        reply_count = item["snippet"]["totalReplyCount"]
                    except KeyError:
                        reply_count = float('nan')

                    row = pd.DataFrame({
                        "video_id": [video_id],
                        "comment": [comment],
                        "like_count": [like_count],
                        "reply_count": [reply_count]
                    })

                    comment_data = pd.concat([comment_data, row], ignore_index=True)
            except Exception as e:
                print(f"Error on video: {video_id}: {e}")
                if hasattr(e, 'status_code') and e.status_code == 403:
                        break
                continue
        
        if len(comment_data) > 0:
            comment_data.to_csv(comment_data_path, index=False)
    
    return comment_data

In [17]:
CACHE_PATH = ".cache"

if not os.path.exists(CACHE_PATH):
    os.makedirs(CACHE_PATH)

data = {}

In [18]:
SEARCH = []

if len(SEARCH) > 0: 

    for search in SEARCH:
        
        search_cache_path = f"{CACHE_PATH}/{search}"

        if not os.path.exists(search_cache_path):
            os.makedirs(search_cache_path)

        channel_data_path = f"{search_cache_path}/channel_data.csv"
        video_data_path = f"{search_cache_path}/video_data.csv"
        comment_data_path = f"{search_cache_path}/comment_data.csv"
        

        channels_by_category = {}

        request = youtube.search().list(
            part="ids,snippet",
            maxResults=10,
            type="channel",
            q=search,
            order="viewCount"
        )

        response = request.execute()

        for item in response["items"]:
            channels_by_category[item["snippet"]["title"]] = item["id"]["channelId"]

        channel_data = get_channel_data(channel_data_path, channels_by_category)
        video_data = get_video_data(video_data_path, channel_data)
        comment_data = get_comment_data(comment_data_path, video_data)

        data[search] = {
            "channel_data": channel_data,
            "video_data": video_data,
            "comment_data": comment_data
        }


channel_data_path = f"{CACHE_PATH}/channel_data.csv"
video_data_path = f"{CACHE_PATH}/video_data.csv"
comment_data_path = f"{CACHE_PATH}/comment_data.csv"

channel_data = get_channel_data(channel_data_path, CHANNELS_BY_CATEGORY)
video_data = get_video_data(video_data_path, channel_data)
comment_data = get_comment_data(comment_data_path, video_data)

data["default"] = {
    "channel_data": channel_data,
    "video_data": video_data,
    "comment_data": comment_data
}    

Error on channel: Stokes twins squad: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=statistics%2CcontentDetail&id=UCbp9MyKCTEww4CxEzc_Tp0Q&key=AIzaSyDqo7U_-fTqzXUq4jCmSqdiYCyVI5qoD7U&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
Error on channel: Anaya Kandhal: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=statistics%2CcontentDetail&id=UCJTqi2KOenudYnEOUSAj5Hg&key=AIzaSyDqo7U_-fTqzXUq4jCmSqdiYCyVI5qoD7U&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have excee

In [19]:
LOAD_IN_EXISTING = []

for csv in LOAD_IN_EXISTING:
    if os.path.exists(csv):
        search = csv.split("/")[-1]
        type = csv.split("/")[-2]

        if data.get(search) is None:
            data[search] = {}
            data[search][type] = pd.read_csv(csv)

# Use Data

In [20]:
for key in data:
    print(key)
    for type in data[key]:
        display(data[key][type].head())

default


Unnamed: 0,channel_id,category,channel_name,subscriber_count,video_count,view_count,playlist_id


Unnamed: 0,video_id,channel_id,video_title,view_count,like_count,dislike_count,comment_count


Unnamed: 0,video_id,comment,like_count,reply_count
