## Setup

In [243]:
from googleapiclient.discovery import build
import json
import re
from collections import Counter
from datetime import datetime
from googleapiclient.discovery import build
import pandas as pd
import numpy as np
import glob

In [244]:
with open ("../performative.txt", "r") as f:
    lines = f.readlines()
    API_KEY = lines[8].strip()
    
yt = build("youtube", "v3", developerKey=API_KEY)

## Initial testing

In [245]:
# request = yt.search().list(
#     part = "snippet",
#     q = "Mr. Beast", 
#     type = "channel", 
#     maxResults = 5  
# )
# response = request.execute()

In [246]:
# print(json.dumps(response, indent=2))

In [247]:
# def get_channel_data(channei_id):
#     response = yt.channels().list()

## Getting data from api

### Helpers

In [248]:
GENRE_MAP = {
    "1":  "Film & Animation",
    "2":  "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "19": "Travel & Events",
    "20": "Gaming",
    "22": "People & Blogs",
    "23": "Comedy",
    "24": "Entertainment",
    "25": "News & Politics",
    "26": "Howto & Style",
    "27": "Education",
    "28": "Science & Technology",
    "29": "Nonprofits & Activism",
}

In [249]:
### REQUIRES: ascending sorted list of datetime objects
def calculate_upload_gap(dates):
    if len(dates) < 2:
        return None
    gaps = [(dates[i+1] - dates[i]).total_seconds() for i in range(len(dates) - 1)]
    avg_secs_between_uploads = round(sum(gaps) / len(gaps), 2)
    return avg_secs_between_uploads

In [250]:
improper_duration_counter = 0 #TODO: hacky global variable, replace with logging

In [251]:
def to_seconds(duration):
    global improper_duration_counter #TODO: hacky global variable, replace with logging (need this to edit variable (of higher scope?))
    if not duration:
        improper_duration_counter += 1
        print("missing duration")
        return 0 #TODO: reduces accuracy
    match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", duration)
    if not match:
        improper_duration_counter += 1
        print("duration format doesn't match")
        return 0 #TODO: reduces accuracy
    return int(match.group(1) or 0) * 3600 + int(match.group(2) or 0) * 60 + int(match.group(3) or 0)

### data sourcing functions

In [252]:
def get_profile(channel_id):
    response = yt.channels().list(
        part="snippet,contentDetails,topicDetails,brandingSettings", #TODO: never used brandingSettings
        id=channel_id
    ).execute()

    item = response["items"][0]
    snippet = item["snippet"]
    uploads_id = item["contentDetails"]["relatedPlaylists"]["uploads"]
    raw_categories = item.get("topicDetails", {}).get("topicCategories", [])
    categories = [url.split("/")[-1].replace("_", " ") for url in raw_categories]

    return snippet, uploads_id, categories

In [253]:
def get_recent_videos_and_dates(uploads_id):
    response = yt.playlistItems().list(
        part="snippet,contentDetails",
        playlistId=uploads_id,
        maxResults=10
    ).execute()

    videos=[
        {
            "video_id": item["contentDetails"]["videoId"],
            "published_at": item["snippet"]["publishedAt"],
        }
        for item in response["items"]
    ]

    upload_dates = sorted([datetime.fromisoformat(v["published_at"].replace("Z", "+00:00")) for v in videos])
    video_ids = [v["video_id"] for v in videos]
    return video_ids, upload_dates


In [254]:
def get_videos_data(video_ids):
    response = yt.videos().list(
        part="snippet,contentDetails",
        id=",".join(video_ids)
    ).execute()

    tags, genres, durations, titles = [], [], [], []

    for v in response["items"]:
        tags.extend(v["snippet"].get("tags", []))
        genres.append(GENRE_MAP.get(v["snippet"].get("categoryId"), "Unknown"))
        durations.append(to_seconds(v["contentDetails"].get("duration")))
        titles.append(v["snippet"].get("title"))
    
    return tags, genres, durations, titles

In [255]:
def main(channel_id):
    snippet, uploads_id, categories = get_profile(channel_id)
    video_ids, upload_dates = get_recent_videos_and_dates(uploads_id)
    avg_upload_gap = calculate_upload_gap(upload_dates)
    tags, genres, durations, titles = get_videos_data(video_ids)

    return {
        "channel_id": channel_id,
        "channel_name": snippet.get("title"),
        "description": snippet.get("description"),
        "country": snippet.get("country"),
        "defaultLanguage": snippet.get("defaultLanguage"),
        "created_date": snippet.get("publishedAt"),
        "category": categories,
        "aggregated_tags": list(set(tags)),
        "most_common_video_genre": Counter(genres).most_common(1)[0][0] if genres else None,
        "all_video_genres": list(set(genres)),
        "avg_duration_seconds": round(sum(durations) / len(durations), 2) if durations else None,
        "avg_seconds_between_uploads": avg_upload_gap,
        "recent_video_titles": titles,
    }


## Script for dataset

columns for dataset:  
- from given dataset  
    - channel_id
    - channel_name
    - description
    - country
    - defaultLanguage
    - created_date
    - category
- from yt api
    - aggregated_tags
    - most_common_video_genre
    - all_video_genres
    - avg_duration_seconds
    - avg_seconds_between_uploads
    - recent_video_titles

In [256]:
df = pd.read_csv("../data/raw/youtube_channel_info_v2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830 entries, 0 to 15829
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   channel_name         15830 non-null  object
 1   channel_id           15830 non-null  object
 2   view_count           15830 non-null  int64 
 3   category             15722 non-null  object
 4   country              14030 non-null  object
 5   defaultLanguage      1725 non-null   object
 6   subscriber_count     15830 non-null  int64 
 7   created_date         15830 non-null  object
 8   description          14929 non-null  object
 9   custom_url           15808 non-null  object
 10  thumbnail            15830 non-null  object
 11  video_count          15830 non-null  int64 
 12  videos_last_30_days  15830 non-null  int64 
 13  views_last_30_days   15830 non-null  int64 
 14  uploads_playlist_id  15830 non-null  object
 15  last_10_video_ids    15774 non-null  object
dtypes: i

In [257]:
df.head() #TODO: consider removing music channels?

Unnamed: 0,channel_name,channel_id,view_count,category,country,defaultLanguage,subscriber_count,created_date,description,custom_url,thumbnail,video_count,videos_last_30_days,views_last_30_days,uploads_playlist_id,last_10_video_ids
0,BLACKPINK,UCOmHUn--16B90oW2L6FRR3A,39962585446,"Music of Asia, Pop music, Music, Electronic music",KR,,99000000,2016-06-29T03:15:23Z,BLACKPINK Official YouTube Channel 블랙핑크 공식 유튜브...,@blackpink,https://yt3.ggpht.com/U3VrCkKjzTpQ3VYv4SCPjNfD...,636,1,3256869,UUOmHUn--16B90oW2L6FRR3A,"W-9-keHNwV8,pAeqY-TEZc0,WB_AOdAEuBM,zTnAvaoHR4..."
1,HYBE LABELS,UC3IZKseVpdzPSBaWxBxundA,41604896923,"Hip hop music, Pop music, Music, Music of Asia",KR,,78700000,2008-06-04T08:23:22Z,Welcome to the official YouTube channel of HYB...,@hybelabels,https://yt3.ggpht.com/ytc/AIdro_l8g0yRFG8xoe_q...,2817,79,46074833,UU3IZKseVpdzPSBaWxBxundA,"NPqiynCpqgI,nbOuR-KYZy8,xf8N3JYEhV4,iIWKCYP2zY..."
2,BILLIE EILISH,UCVNE660NcgYzi18LwwUZb7Q,14316364,,,,82300,2019-01-18T05:14:32Z,,@billieeilish8477,https://yt3.ggpht.com/ytc/AIdro_n3NfvUAVtcUJvW...,1,0,0,UUVNE660NcgYzi18LwwUZb7Q,C9odQay_d6s
3,Shemaroo,UCF1JIbMUs6uqoZEY1Haw0GQ,27011004224,"Entertainment, Film",IN,,60400000,2007-09-01T11:44:51Z,"Welcome to ShemarooEnt, one of the finest dest...",@shemaroo,https://yt3.ggpht.com/ytc/AIdro_mhdF4RNZM9OHP-...,13374,63,10560897,UUF1JIbMUs6uqoZEY1Haw0GQ,"KPYx-APpvo4,HscCQYIX4fg,TP64Sacd5Nw,G8a5vnD2A0..."
4,JuegaGerman,UCYiGq8XF7YQD00x7wAd62Zg,17573028501,"Role-playing video game, Action game, Video ga...",CL,,54200000,2013-05-19T00:09:13Z,Lento pero seguro.,@juegagerman,https://yt3.ggpht.com/vOsrLzWD4z1dbr470nEXydi3...,2368,12,22980501,UUYiGq8XF7YQD00x7wAd62Zg,"JjWksJW1Z0M,_NhOyRPXA5s,EYXTxxjU0Ks,5eAyv6b84p..."


In [258]:
cols = ["channel_id", "channel_name", "description", "country", "defaultLanguage", "created_date", "category", "aggregated_tags", "most_common_video_genre", "all_video_genres", "avg_duration_seconds", "avg_seconds_between_uploads", "recent_video_titles"]

In [259]:
df = df.drop(columns=[c for c in df.columns if c not in cols])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830 entries, 0 to 15829
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   channel_name     15830 non-null  object
 1   channel_id       15830 non-null  object
 2   category         15722 non-null  object
 3   country          14030 non-null  object
 4   defaultLanguage  1725 non-null   object
 5   created_date     15830 non-null  object
 6   description      14929 non-null  object
dtypes: object(7)
memory usage: 865.8+ KB


index 2 billie eilish is impersonation account...

In [260]:
df = df.drop(index=2).reset_index(drop=True)

In [None]:
channels = []
periodic_update = 0
exception_counter = 0

for channel_id in df["channel_id"][13396:]:
    print(channel_id)
    try:
        channel_data = main(channel_id)
        channels.append(channel_data)
    except Exception as e:
        exception_counter += 1;
        if "quotaExceeded" in str(e):
            print("Quota limit reached")
            break
        channels.append({"channel_id": channel_id})  # all other columns will be NaN automatically
        print(f"Failed for {channel_id}: {type(e).__name__}: {e}") #TODO: integrate exception handling into entire data extraction flow
    periodic_update += 1
    if periodic_update >= 100:
        print("another 100 down!")
        periodic_update = 0

In [240]:
with open("../data/processed/channelsa13313_.json", "w") as f:
    json.dump(channels, f, indent=2)

In [205]:
new_df = pd.DataFrame(channels)
print(f"Exception counter:{exception_counter}")
print(f"Improper duration counter:{improper_duration_counter}")

Exception counter:29
Improper duration counter:308


0-99: exception counter = 0, improper duration counter = 24  
100-3424: improper duration counter = 278  
3425-6727: exception counter = 29, improper duration counter = 337  
6727-10033: exception counter = 16, improper duration counter = 291 -> 53 indices forward   
10034-13312 : exception counter = 29, improper duration counter = 308  -> 83 indices forward  
13313-:

exception counter not counting the quota limit reached 

note: Failed for UC6AHeJa7s9-Brs6VyXy6Dwg: HttpError: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=snippet%2CcontentDetails&playlistId=UU6AHeJa7s9-Brs6VyXy6Dwg&maxResults=10&key=lol=json returned "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.". Details: "[{'message': "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.", 'domain': 'youtube.playlistItem', 'reason': 'playlistNotFound', 'location': 'playlistId', 'locationType': 'parameter'}]">

## concating dfs

In [215]:
all_data = []
for file in glob.glob("../data/processed/*.json"):
    print(file)
    with open(file, "r") as f:
        all_data.extend(json.load(f))

new_df = pd.DataFrame(all_data)

../data/processed\channels0_99.json
../data/processed\channels100_.json
../data/processed\channels3425_.json
../data/processed\channels6727_.json
../data/processed\channelsa10034_.json


In [216]:
new_df = new_df.dropna(thresh=8)

In [217]:
new_df = new_df.reset_index(drop=True)

In [218]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13313 entries, 0 to 13312
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   channel_id                   13313 non-null  object 
 1   channel_name                 13313 non-null  object 
 2   description                  13313 non-null  object 
 3   country                      11803 non-null  object 
 4   defaultLanguage              1486 non-null   object 
 5   created_date                 13313 non-null  object 
 6   category                     13313 non-null  object 
 7   aggregated_tags              13313 non-null  object 
 8   most_common_video_genre      13313 non-null  object 
 9   all_video_genres             13313 non-null  object 
 10  avg_duration_seconds         13313 non-null  float64
 11  avg_seconds_between_uploads  13282 non-null  float64
 12  recent_video_titles          13313 non-null  object 
dtypes: float64(2), o

In [219]:
new_df.tail()

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_seconds_between_uploads,recent_video_titles
13308,UCbfwKVaFIfVDyVDndGrTKTg,Orange Juice - Live,live stream of games.,CA,,2020-09-14T13:18:42.20162Z,"[Action game, Video game culture, Role-playing...",[],Gaming,[Gaming],8299.3,5368762.67,"[yapyap with reckers/nat/jakecrabtree/odog, fa..."
13309,UCDLmS9vkPcTz3cAc-c9QIzg,Tanner Fox,"I love making videos, thanks for stopping by!",US,,2011-09-04T01:10:25Z,"[Vehicle, Lifestyle (sociology)]","[purple Bugatti, 1000hp, Vlog, Tanner fox, sup...",Entertainment,[Entertainment],945.8,12984423.0,"[IT’S FINALLY DONE! GTR 5.0 REVEAL , 1000 MILE..."
13310,UCS4vEeyVLcDcwD3p6dPz6GA,Gillian Bower Slime,Hey everyone! I'm Gillian Bower and I am soooo...,AU,,2018-02-11T02:30:34Z,"[Hobby, Lifestyle (sociology)]","[slime how to, how to make slime, miniature sl...",Howto & Style,[Howto & Style],657.2,5021832.78,[EXTREME Slime Makeover! Can SUPER OLD SLIME B...
13311,UCJNpM63pyQtKW07zBJ9eRBA,TwistedGrim,ENGLISH\nCartoon channel made for entertaining...,CL,,2012-05-28T00:09:55Z,"[Entertainment, Film, Video game culture]","[girl, chile, funny, chicken, michis, cat, ani...",Film & Animation,[Film & Animation],49.7,19247464.33,[Emi Rave #twistedgrim #kato #emi #jonhamm #ra...
13312,UC_s2G_LGpuX-wJrhvucMl4Q,BravoCompanyUSA,Short films and informational pieces about the...,,,2010-09-24T13:49:27Z,"[Society, Military, Lifestyle (sociology), Tel...","[BravoCompanyUSA, mindset, magpul dynamics, wa...",News & Politics,"[Entertainment, News & Politics, Education]",414.2,25076920.44,[BCM Regiment Blades - built for US Army Speci...


In [220]:
new_df[new_df["channel_id"].duplicated(keep=False)]

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_seconds_between_uploads,recent_video_titles
3422,UC0ASolYU_Yh3yShLFQC0stg,Elliott Hulse | STRENGTH,I MAKE MEN STRONG.,US,,2007-03-08T16:55:56Z,"[Physical fitness, Lifestyle (sociology), Health]","[how to burn fat over 40, #lose weight, elliot...",Education,[Education],945.2,525800.0,[Elliott Hulse still competing at age 46 (18 y...
3423,UCjPYYvIdTqn9U52p9IxJ72Q,Isaiah Photo,too easy! challenging myself every day!,US,,2016-01-24T21:37:00Z,"[Lifestyle (sociology), Sport]",[],Entertainment,[Entertainment],250.2,613475.22,"[illegal shoes vs robot legs, i ran in illegal..."
3424,UCl4-WBRqWA2MlxmZorKOV7w,B is for Build,Welcome to B is for Build. A place where Chris...,US,,2015-06-17T05:37:59Z,"[Vehicle, Lifestyle (sociology), Hobby]","[gm, acura, hyundai, kia, mitsubishi, volvo, m...",Autos & Vehicles,[Autos & Vehicles],1789.8,3235065.22,[Health Update & Is Resin 3d Printing Worth It...
3434,UC0ASolYU_Yh3yShLFQC0stg,Elliott Hulse | STRENGTH,I MAKE MEN STRONG.,US,,2007-03-08T16:55:56Z,"[Physical fitness, Lifestyle (sociology), Health]","[cyclical weight gain, permanent fat loss, fit...",Education,[Education],945.2,525800.0,[Elliott Hulse still competing at age 46 (18 y...
3436,UCjPYYvIdTqn9U52p9IxJ72Q,Isaiah Photo,too easy! challenging myself every day!,US,,2016-01-24T21:37:00Z,"[Lifestyle (sociology), Sport]",[],Entertainment,[Entertainment],250.2,613475.22,"[illegal shoes vs robot legs, i ran in illegal..."
3442,UCl4-WBRqWA2MlxmZorKOV7w,B is for Build,Welcome to B is for Build. A place where Chris...,US,,2015-06-17T05:37:59Z,"[Vehicle, Lifestyle (sociology), Hobby]","[ram, toyota, jaguar, acura, mazda, ford, infi...",Autos & Vehicles,[Autos & Vehicles],1789.8,3235065.22,[Health Update & Is Resin 3d Printing Worth It...


In [221]:
df_copy = df.copy()
df_copy[df_copy["channel_name"] == "BravoCompanyUSA"]

Unnamed: 0,channel_name,channel_id,category,country,defaultLanguage,created_date,description
13395,BravoCompanyUSA,UC_s2G_LGpuX-wJrhvucMl4Q,"Television program, Society, Military, Lifesty...",,,2010-09-24T13:49:27Z,Short films and informational pieces about the...


check for duplicates/missed rows at end

## misc

In [59]:
channel_data = main("UCOmHUn--16B90oW2L6FRR3A")
print(json.dumps(channel_data, indent=2))

{
  "channel_id": "UCOmHUn--16B90oW2L6FRR3A",
  "channel_name": "BLACKPINK",
  "description": "BLACKPINK Official YouTube Channel\n\ube14\ub799\ud551\ud06c \uacf5\uc2dd \uc720\ud29c\ube0c \ucc44\ub110\uc785\ub2c8\ub2e4.\n\nJISOO, JENNIE, ROS\u00c9, LISA\n\uc9c0\uc218, \uc81c\ub2c8, \ub85c\uc81c, \ub9ac\uc0ac",
  "country": "KR",
  "defaultLanguage": null,
  "created_date": "2016-06-29T03:15:23Z",
  "category": [
    "Electronic music",
    "Music",
    "Music of Asia",
    "Pop music"
  ],
  "aggregated_tags": [
    "JISOO",
    "YG",
    "JENNIE You & Me",
    "LISA",
    "JENNIE \uc720\uc564\ubbf8",
    "\uc640\uc774\uc9c0",
    "\ube14\ub9c1\ud06c",
    "\uc720\uc564\ubbf8",
    "JENNIE COACHELLA",
    "JENNIE You and ME",
    "BLINK",
    "\uc81c\ub2c8 \ucf54\uccbc\ub77c",
    "JENNIE",
    "\ube14\ud551",
    "\ub85c\uc81c",
    "ROS\u00c9",
    "\uc9c0\uc218",
    "\ube14\ub799\ud551\ud06c",
    "\uc81c\ub2c8 You and Me",
    "\uc81c\ub2c8 \uc720\uc564\ubbf8",
    "YG Entertainme

In [46]:
df.head()

Unnamed: 0,channel_name,channel_id,category,country,defaultLanguage,created_date,description
0,BLACKPINK,UCOmHUn--16B90oW2L6FRR3A,"Music of Asia, Pop music, Music, Electronic music",KR,,2016-06-29T03:15:23Z,BLACKPINK Official YouTube Channel 블랙핑크 공식 유튜브...
1,HYBE LABELS,UC3IZKseVpdzPSBaWxBxundA,"Hip hop music, Pop music, Music, Music of Asia",KR,,2008-06-04T08:23:22Z,Welcome to the official YouTube channel of HYB...
2,BILLIE EILISH,UCVNE660NcgYzi18LwwUZb7Q,,,,2019-01-18T05:14:32Z,
3,Shemaroo,UCF1JIbMUs6uqoZEY1Haw0GQ,"Entertainment, Film",IN,,2007-09-01T11:44:51Z,"Welcome to ShemarooEnt, one of the finest dest..."
4,JuegaGerman,UCYiGq8XF7YQD00x7wAd62Zg,"Role-playing video game, Action game, Video ga...",CL,,2013-05-19T00:09:13Z,Lento pero seguro.


In [45]:
df_test = pd.DataFrame([channel_data])
df_test

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_days_between_uploads,recent_video_titles
0,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,BLACKPINK Official YouTube Channel\n블랙핑크 공식 유튜...,KR,,2016-06-29T03:15:23Z,"[Electronic music, Pop music, Music of Asia, M...","[JISOO, YG, JENNIE You & Me, LISA, JENNIE 유앤미,...",Music,[Music],211.6,11.33,[BLACKPINK - WORLD TOUR [DEADLINE] IN HONG KON...


In [52]:
df["category"][0]

'Music of Asia, Pop music, Music, Electronic music'

In [50]:
df_test["category"][0][0]

'Electronic music'