In [1]:
# import necessary dependencies
import os
import pandas as pd
# imports from library used for Google API calls the method called 'build'
from googleapiclient.discovery import build
from config import gkey
import json
from pprint import pprint
import numpy as np

In [2]:
# build method allows user to store an API call process in a variable if we specifty the desired API, version, and API key
youtube = build('youtube', 'v3', developerKey = gkey)

In [3]:
# call the Regions resource from the youtube API, part must be specified for every call to the API
# snippet in this case is the only part we can call
request = youtube.i18nRegions().list(
        part="snippet"
    )

# the execute method needs to be called on the resulting request to get the json
response = request.execute()

In [4]:
# creat empty lists to hold the region codes and names
region_codes = []
region_names = []

# iterate through the response from the Regions resource and pull the name and region code ("gl") from the snippet part
# and then add the names and codes to the empty lists above
# the items key in the dictionary provided by the reponse holds the 'snippets' for each region
for item in response["items"]:
    name = item['snippet']['name']
    region_names.append(name)
    code = item['snippet']['gl']
    region_codes.append(code)

In [5]:
# create a data frame to hold the above data
countries_dict = {"Country Name":region_names,
                 "Region ID": region_codes}
countries_df =pd.DataFrame(countries_dict)

In [6]:
# Next we need to pull the top 50 (50 being the maximum number allowed by the API) videos by each region code pulled

# create empty lists to hold the data desired from the video resource response
category_ids = []
channel_ids = []
forkids = []
views = []
likes = []
comments = []
regions = []


In [7]:
# Pulling the top 50 videos by the regions pulled above
# for this, the snippet, statistics and status section are what we would like to pull based on examining the documentation
# for statistics, we have three variables to pull from that part

# to simplify the code, created a list of the desired keys from the statistics part to loop through
stats_vars = ["viewCount", "likeCount","commentCount"]

# also creating a list of the empty lists above that correspond in position with the desired parts in the stats_vars list
stats_lists = [views,likes,comments]

In [8]:
# set a variable equal to 1 in prep for printing statement which indicates a region has been pulled and processed 
y = 1
# for loop through regions
for region in region_codes:
    #request data on videos from api using the youtube variable
    request = youtube.videos().list(
            # pulling the snippet, statistics, and status parts as they contain the date we want
            part="snippet,statistics,status",
            # pulls the most popular videos for the indicated regions 
            chart="mostPopular",
            # indicates the maximum number of videos to return (50 is the maximum number permitted by the API)
            maxResults=50,
            # indicates region to pull videos for
            regionCode=region)
    # execute the request and store the response in the variable 'data'
    data = request.execute()

    # like the regions response, the data we're concerned with (the parts), are stored under the 'items' key
    # we need to loop through each of the top 50 videos for the indicated region
    # and pull the desired datapoints to store them in the empty lists that we created above
    for i in data["items"]:
        # add region ID to the regions list
        regions.append(region)
        
        # add the category ID for the video the category_ids list
        try:
            category_ids.append(i["snippet"]["categoryId"])
        except KeyError:
            category_ids.append("Error")
        
        # add the channel ID for the video the channel_ids list
        try:
            channel_ids.append(i["snippet"]["channelId"])
        except KeyError:
            channel_Ids.append("Error")
        # add the Boolean response if the video was 'made for kids' to the forkids list   
        try:    
            forkids.append(i["status"]["madeForKids"])
        except KeyError:
            forkids.append("Error")

        # loop through desired (counts ofviews, likes, comments) stats from 'statistics' entry 
        # and add to corresponding lists
        x = 0
        for stat in stats_vars:
            try:
                stats_lists[x].append(int(i["statistics"][stat]))
            except KeyError:
                stats_lists[x].append(np.nan)
            x+=1
    # print statement indicating the number and name of the region processed
    print(f"Region {y}, {region}, of {len(region_codes)} regions completed")
    
    # add 1 to the number of regions processed
    y+=1

Region 1, AE, of 109 regions completed
Region 2, BH, of 109 regions completed
Region 3, DZ, of 109 regions completed
Region 4, EG, of 109 regions completed
Region 5, IQ, of 109 regions completed
Region 6, JO, of 109 regions completed
Region 7, KW, of 109 regions completed
Region 8, LB, of 109 regions completed
Region 9, LY, of 109 regions completed
Region 10, MA, of 109 regions completed
Region 11, OM, of 109 regions completed
Region 12, QA, of 109 regions completed
Region 13, SA, of 109 regions completed
Region 14, TN, of 109 regions completed
Region 15, YE, of 109 regions completed
Region 16, AZ, of 109 regions completed
Region 17, BY, of 109 regions completed
Region 18, BG, of 109 regions completed
Region 19, BD, of 109 regions completed
Region 20, BA, of 109 regions completed
Region 21, CZ, of 109 regions completed
Region 22, DK, of 109 regions completed
Region 23, AT, of 109 regions completed
Region 24, CH, of 109 regions completed
Region 25, DE, of 109 regions completed
Region 26

In [9]:
# check length of arrays to make sure they will successfully be made into a dataframe
print(len(region),len(likes),len(comments),len(views), len(category_ids), len(forkids))

2 5450 5450 5450 5450 5450


In [10]:
# create dictionary from the now filled lists 
vid_dict = {"Region":regions,
            "Category ID":category_ids,
           "Made For Kids":forkids,
            "Channel IDs":channel_ids,
           "View Count":views,
           "Like Count":likes,
           "Comment Count":comments}
# convert that dictionary into a dataframe
vid_df = pd.DataFrame(vid_dict)

# dispaly the dataframe
vid_df.head()

Unnamed: 0,Region,Category ID,Made For Kids,Channel IDs,View Count,Like Count,Comment Count
0,AE,10,False,UCANLZYMidaCbLQFWXBC95Jg,8723803,907375.0,34269.0
1,AE,24,False,UCvC4D8onUfXzvjTOM-dBfEA,16155641,797863.0,32167.0
2,AE,23,False,UC8-Th83bH_thdKZDJCrn88g,1573681,106091.0,2986.0
3,AE,17,False,UCt2JXOLNxqry7B_4rRZME3Q,7870274,321039.0,20645.0
4,AE,17,False,UCJUCcJUeh0Cz2xyKwkw5Q1w,634870,73362.0,2049.0


In [16]:
# add in the country names to the video dataframe by merging with the earlier created country dataframe
vid_df = vid_df.merge(countries_df,left_on='Region', right_on = 'Region ID')

In [21]:
# dropping the redundant 'Region ID' column that was created as a result of the merge
vid_df = vid_df.drop(columns = ["Region ID"])

Unnamed: 0,Region,Category ID,Made For Kids,Channel IDs,View Count,Like Count,Comment Count,Country Name
0,AE,10,False,UCANLZYMidaCbLQFWXBC95Jg,8723803,907375.0,34269.0,United Arab Emirates
1,AE,24,False,UCvC4D8onUfXzvjTOM-dBfEA,16155641,797863.0,32167.0,United Arab Emirates
2,AE,23,False,UC8-Th83bH_thdKZDJCrn88g,1573681,106091.0,2986.0,United Arab Emirates
3,AE,17,False,UCt2JXOLNxqry7B_4rRZME3Q,7870274,321039.0,20645.0,United Arab Emirates
4,AE,17,False,UCJUCcJUeh0Cz2xyKwkw5Q1w,634870,73362.0,2049.0,United Arab Emirates
...,...,...,...,...,...,...,...,...
5445,PG,1,False,UCGwu0nbY2wSkW8N-cghnLpA,4042926,307716.0,14343.0,Papua New Guinea
5446,PG,24,False,UCIvbV-a0t2tU6LsKSZz7r-w,513301,15374.0,1153.0,Papua New Guinea
5447,PG,20,False,UC2S7CGceq5tsuhcVyACdA3g,260199,32973.0,3174.0,Papua New Guinea
5448,PG,22,False,UC8ha6SsRNvDGkwcPTCXkW3g,324359,18713.0,1763.0,Papua New Guinea


In [23]:
# quick view of the data grouped by country, showing the 10 largest view counts
country_grouping = vid_df.groupby("Country Name")[["View Count","Like Count", "Comment Count"]].sum().nlargest(10, "View Count")
country_grouping

Unnamed: 0_level_0,View Count,Like Count,Comment Count
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cambodia,1396637439,49731107.0,410731.0
Malaysia,871962081,39451388.0,560994.0
Sri Lanka,810345237,32056445.0,184376.0
Laos,697526754,26441315.0,178777.0
New Zealand,675363683,36955333.0,827172.0
El Salvador,641533202,28275248.0,748404.0
Kazakhstan,640518334,24179803.0,363356.0
Portugal,618809241,31118941.0,677684.0
Ireland,614195293,35390848.0,819455.0
Philippines,596430461,25739444.0,637529.0


In [24]:
# convert data to csv for later analysis
vid_df.to_csv('Youtube_Data.csv')