## Setup

In [202]:
from googleapiclient.discovery import build
import json
import re
from collections import Counter
from datetime import datetime
from googleapiclient.discovery import build
import pandas as pd
import numpy as np

In [203]:
with open ("../performative.txt", "r") as f:
    lines = f.readlines()
    API_KEY = lines[2].strip()
    
yt = build("youtube", "v3", developerKey=API_KEY)

## Initial testing

In [204]:
# request = yt.search().list(
#     part = "snippet",
#     q = "Mr. Beast", 
#     type = "channel", 
#     maxResults = 5  
# )
# response = request.execute()

In [205]:
# print(json.dumps(response, indent=2))

In [206]:
# def get_channel_data(channei_id):
#     response = yt.channels().list()

## Getting data from api

### Helpers

In [207]:
GENRE_MAP = {
    "1":  "Film & Animation",
    "2":  "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "19": "Travel & Events",
    "20": "Gaming",
    "22": "People & Blogs",
    "23": "Comedy",
    "24": "Entertainment",
    "25": "News & Politics",
    "26": "Howto & Style",
    "27": "Education",
    "28": "Science & Technology",
    "29": "Nonprofits & Activism",
}

In [208]:
### REQUIRES: ascending sorted list of datetime objects
def calculate_upload_gap(dates):
    if len(dates) < 2:
        return None
    gaps = [(dates[i+1] - dates[i]).total_seconds() for i in range(len(dates) - 1)]
    avg_secs_between_uploads = round(sum(gaps) / len(gaps), 2)
    return avg_secs_between_uploads

In [209]:
improper_duration_counter = 0 #TODO: hacky global variable, replace with logging

In [210]:
def to_seconds(duration):
    global improper_duration_counter #TODO: hacky global variable, replace with logging (need this to edit variable (of higher scope?))
    if not duration:
        improper_duration_counter += 1
        print("missing duration")
        return 0 #TODO: reduces accuracy
    match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", duration)
    if not match:
        improper_duration_counter += 1
        print("duration format doesn't match")
        return 0 #TODO: reduces accuracy
    return int(match.group(1) or 0) * 3600 + int(match.group(2) or 0) * 60 + int(match.group(3) or 0)

### data sourcing functions

In [211]:
def get_profile(channel_id):
    response = yt.channels().list(
        part="snippet,contentDetails,topicDetails,brandingSettings", #TODO: never used brandingSettings
        id=channel_id
    ).execute()

    item = response["items"][0]
    snippet = item["snippet"]
    uploads_id = item["contentDetails"]["relatedPlaylists"]["uploads"]
    raw_categories = item.get("topicDetails", {}).get("topicCategories", [])
    categories = [url.split("/")[-1].replace("_", " ") for url in raw_categories]

    return snippet, uploads_id, categories

In [212]:
def get_recent_videos_and_dates(uploads_id):
    response = yt.playlistItems().list(
        part="snippet,contentDetails",
        playlistId=uploads_id,
        maxResults=10
    ).execute()

    videos=[
        {
            "video_id": item["contentDetails"]["videoId"],
            "published_at": item["snippet"]["publishedAt"],
        }
        for item in response["items"]
    ]

    upload_dates = sorted([datetime.fromisoformat(v["published_at"].replace("Z", "+00:00")) for v in videos])
    video_ids = [v["video_id"] for v in videos]
    return video_ids, upload_dates


In [213]:
def get_videos_data(video_ids):
    response = yt.videos().list(
        part="snippet,contentDetails",
        id=",".join(video_ids)
    ).execute()

    tags, genres, durations, titles = [], [], [], []

    for v in response["items"]:
        tags.extend(v["snippet"].get("tags", []))
        genres.append(GENRE_MAP.get(v["snippet"].get("categoryId"), "Unknown"))
        durations.append(to_seconds(v["contentDetails"].get("duration")))
        titles.append(v["snippet"].get("title"))
    
    return tags, genres, durations, titles

In [214]:
def main(channel_id):
    snippet, uploads_id, categories = get_profile(channel_id)
    video_ids, upload_dates = get_recent_videos_and_dates(uploads_id)
    avg_upload_gap = calculate_upload_gap(upload_dates)
    tags, genres, durations, titles = get_videos_data(video_ids)

    return {
        "channel_id": channel_id,
        "channel_name": snippet.get("title"),
        "description": snippet.get("description"),
        "country": snippet.get("country"),
        "defaultLanguage": snippet.get("defaultLanguage"),
        "created_date": snippet.get("publishedAt"),
        "category": categories,
        "aggregated_tags": list(set(tags)),
        "most_common_video_genre": Counter(genres).most_common(1)[0][0] if genres else None,
        "all_video_genres": list(set(genres)),
        "avg_duration_seconds": round(sum(durations) / len(durations), 2) if durations else None,
        "avg_seconds_between_uploads": avg_upload_gap,
        "recent_video_titles": titles,
    }


## Script for dataset

columns for dataset:  
- from given dataset  
    - channel_id
    - channel_name
    - description
    - country
    - defaultLanguage
    - created_date
    - category
- from yt api
    - aggregated_tags
    - most_common_video_genre
    - all_video_genres
    - avg_duration_seconds
    - avg_seconds_between_uploads
    - recent_video_titles

In [215]:
df = pd.read_csv("../data/raw/youtube_channel_info_v2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830 entries, 0 to 15829
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   channel_name         15830 non-null  object
 1   channel_id           15830 non-null  object
 2   view_count           15830 non-null  int64 
 3   category             15722 non-null  object
 4   country              14030 non-null  object
 5   defaultLanguage      1725 non-null   object
 6   subscriber_count     15830 non-null  int64 
 7   created_date         15830 non-null  object
 8   description          14929 non-null  object
 9   custom_url           15808 non-null  object
 10  thumbnail            15830 non-null  object
 11  video_count          15830 non-null  int64 
 12  videos_last_30_days  15830 non-null  int64 
 13  views_last_30_days   15830 non-null  int64 
 14  uploads_playlist_id  15830 non-null  object
 15  last_10_video_ids    15774 non-null  object
dtypes: i

In [216]:
df.head() #TODO: consider removing music channels?

Unnamed: 0,channel_name,channel_id,view_count,category,country,defaultLanguage,subscriber_count,created_date,description,custom_url,thumbnail,video_count,videos_last_30_days,views_last_30_days,uploads_playlist_id,last_10_video_ids
0,BLACKPINK,UCOmHUn--16B90oW2L6FRR3A,39962585446,"Music of Asia, Pop music, Music, Electronic music",KR,,99000000,2016-06-29T03:15:23Z,BLACKPINK Official YouTube Channel Î∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†ÌäúÎ∏å...,@blackpink,https://yt3.ggpht.com/U3VrCkKjzTpQ3VYv4SCPjNfD...,636,1,3256869,UUOmHUn--16B90oW2L6FRR3A,"W-9-keHNwV8,pAeqY-TEZc0,WB_AOdAEuBM,zTnAvaoHR4..."
1,HYBE LABELS,UC3IZKseVpdzPSBaWxBxundA,41604896923,"Hip hop music, Pop music, Music, Music of Asia",KR,,78700000,2008-06-04T08:23:22Z,Welcome to the official YouTube channel of HYB...,@hybelabels,https://yt3.ggpht.com/ytc/AIdro_l8g0yRFG8xoe_q...,2817,79,46074833,UU3IZKseVpdzPSBaWxBxundA,"NPqiynCpqgI,nbOuR-KYZy8,xf8N3JYEhV4,iIWKCYP2zY..."
2,BILLIE EILISH,UCVNE660NcgYzi18LwwUZb7Q,14316364,,,,82300,2019-01-18T05:14:32Z,,@billieeilish8477,https://yt3.ggpht.com/ytc/AIdro_n3NfvUAVtcUJvW...,1,0,0,UUVNE660NcgYzi18LwwUZb7Q,C9odQay_d6s
3,Shemaroo,UCF1JIbMUs6uqoZEY1Haw0GQ,27011004224,"Entertainment, Film",IN,,60400000,2007-09-01T11:44:51Z,"Welcome to ShemarooEnt, one of the finest dest...",@shemaroo,https://yt3.ggpht.com/ytc/AIdro_mhdF4RNZM9OHP-...,13374,63,10560897,UUF1JIbMUs6uqoZEY1Haw0GQ,"KPYx-APpvo4,HscCQYIX4fg,TP64Sacd5Nw,G8a5vnD2A0..."
4,JuegaGerman,UCYiGq8XF7YQD00x7wAd62Zg,17573028501,"Role-playing video game, Action game, Video ga...",CL,,54200000,2013-05-19T00:09:13Z,Lento pero seguro.,@juegagerman,https://yt3.ggpht.com/vOsrLzWD4z1dbr470nEXydi3...,2368,12,22980501,UUYiGq8XF7YQD00x7wAd62Zg,"JjWksJW1Z0M,_NhOyRPXA5s,EYXTxxjU0Ks,5eAyv6b84p..."


In [217]:
cols = ["channel_id", "channel_name", "description", "country", "defaultLanguage", "created_date", "category", "aggregated_tags", "most_common_video_genre", "all_video_genres", "avg_duration_seconds", "avg_seconds_between_uploads", "recent_video_titles"]

In [218]:
df = df.drop(columns=[c for c in df.columns if c not in cols])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830 entries, 0 to 15829
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   channel_name     15830 non-null  object
 1   channel_id       15830 non-null  object
 2   category         15722 non-null  object
 3   country          14030 non-null  object
 4   defaultLanguage  1725 non-null   object
 5   created_date     15830 non-null  object
 6   description      14929 non-null  object
dtypes: object(7)
memory usage: 865.8+ KB


index 2 billie eilish is impersonation account...

In [219]:
df = df.drop(index=2).reset_index(drop=True)

In [220]:
channels = []
periodic_update = 0
exception_counter = 0

for channel_id in df["channel_id"][:100]:
    print(channel_id)
    try:
        channel_data = main(channel_id)
        channels.append(channel_data)
    except Exception as e:
        exception_counter += 1;
        channels.append({"channel_id": channel_id})  # all other columns will be NaN automatically
        print(f"Failed for {channel_id}: {type(e).__name__}: {e}") #TODO: integrate exception handling into entire data extraction flow
    periodic_update += 1
    if periodic_update >= 100:
        print("another 100 down!")
        periodic_update = 0

UCOmHUn--16B90oW2L6FRR3A
UC3IZKseVpdzPSBaWxBxundA
UCF1JIbMUs6uqoZEY1Haw0GQ
UCYiGq8XF7YQD00x7wAd62Zg
UC4NALVCmcmL5ntpV0thoH6w
UCJrDMFOdv1I2k8n9oK_V21w
duration format doesn't match
UCX6OQ3DkcsbYNE6H8uQQuVA
UCK1i2UviaXLUNrZlAFpw_jA
UCIwFjwMjI0y7PDBVEO9-bkQ
UCpEhnqL0y41EpW2TvWAHD7Q
UC22nIfOTM7KLIQuFGMKzQbg
UC55IWqFLDH1Xp7iu1_xknRA
duration format doesn't match
UC56gTxNs4f9xZ7Pa2i5xNzg
UC0C-w0YjGpqDXGB8IHb662A
UCppHT7SZKKvar4Oc9J4oljQ
UCBnZ16ahKA2DZ_T5W0FPUXg
UC9CoOnJkIBMdeijd9qYoT_g
UC295-Dw_tDNtZXFeAPAW6Aw
UCt4t-jeY85JegMlZ-E5UWtA
duration format doesn't match
UCRx3mKNUdl8QE06nEug7p6Q
duration format doesn't match
duration format doesn't match
duration format doesn't match
duration format doesn't match
duration format doesn't match
duration format doesn't match
duration format doesn't match
duration format doesn't match
duration format doesn't match
UCfM3zsQsOnfWNUppiycmBuw
UC3gNmTGu-TTbFPpfSs5kNkg
missing duration
UCvlE5gTbOvjiolFlEm-c_Ow
UC6-F5tO8uklgE9Zy8IvbdFw
UC1ciY6kR3yj3kaKZ6R7ewA

In [226]:
with open("../data/processed/channels.json", "w") as f:
    json.dump(channels, f, indent=2)

In [221]:
new_df = pd.DataFrame(channels)
print(f"Exception counter:{exception_counter}")
print(f"Improper duration counter:{improper_duration_counter}")

Exception counter:0
Improper duration counter:24


In [222]:
new_df.head(10)

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_seconds_between_uploads,recent_video_titles
0,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,BLACKPINK Official YouTube Channel\nÎ∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†Ìäú...,KR,,2016-06-29T03:15:23Z,"[Electronic music, Pop music, Music of Asia, M...","[JISOO, YG, JENNIE You & Me, LISA, JENNIE Ïú†Ïï§ÎØ∏,...",Music,[Music],211.6,1017992.89,[BLACKPINK - WORLD TOUR [DEADLINE] IN HONG KON...
1,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,Welcome to the official YouTube channel of HYB...,KR,,2008-06-04T08:23:22Z,"[Pop music, Music, Music of Asia]","[ÌïòÏù¥Î∏å, ÌïòÏù¥Î∏åÎ†àÏù¥Î∏îÏ¶à, HYBE LABELS, HYBE]",Music,[Music],89.6,131588.11,[SANTOS BRAVOS ‚ÄúKAWASAKI (&TEAM Remix)‚Äù Lyric ...
2,UCF1JIbMUs6uqoZEY1Haw0GQ,Shemaroo,"Welcome to ShemarooEnt, one of the finest dest...",IN,,2007-09-01T11:44:51Z,"[Film, Entertainment]","[salman khan movies, ramcharana moves, Mega Po...",Entertainment,[Entertainment],5336.1,45200.0,[Mega Power Star Ram Charan üëë | Zanjeer (4K Ac...
3,UCYiGq8XF7YQD00x7wAd62Zg,JuegaGerman,Lento pero seguro.,CL,,2013-05-19T00:09:13Z,"[Action game, Video game culture, Action-adven...","[revenia, juega german, juego de miedo, click ...",Gaming,[Gaming],2046.9,280466.0,"[Fotos Tomadas En El Momento PERFECTO üì∏, Traba..."
4,UC4NALVCmcmL5ntpV0thoH6w,LooLoo Kids - Nursery Rhymes and Children's Songs,LooLoo Kidsüíñ is an educational YouTube channel...,US,en,2014-08-05T20:15:33Z,"[Entertainment, Music, Film]","[kids videos, children songs, farm song nurser...",Music,[Music],148.5,181623.89,[Old Macdonald Had a Farm Song + Johny Johny Y...
5,UCJrDMFOdv1I2k8n9oK_V21w,Tips Official,The proud history of Tips Music Limited (Forme...,IN,,2007-05-22T10:13:28Z,"[Film, Music, Music of Asia, Pop music]","[Itna Main Chahoon Tujhe Koi Kisi Ko Na Chahe,...",Music,[Music],2059.1,16523.78,[90's Evergreen Songs | 90's Blockbuster Songs...
6,UCX6OQ3DkcsbYNE6H8uQQuVA,MrBeast,SUBSCRIBE FOR A COOKIE!\nNew MrBeast or MrBeas...,US,en,2012-02-20T00:43:50Z,"[Entertainment, Lifestyle (sociology)]",[],Entertainment,[Entertainment],303.8,384399.78,"[Every Step You Take, Win $1,000, Surprising M..."
7,UCK1i2UviaXLUNrZlAFpw_jA,El Reino Infantil,El Reino Infantil es la comunidad digital para...,AR,,2011-06-02T16:20:07Z,"[Film, Music, Entertainment]","[mes de la amistad, bebe looloo kids, youtube,...",Music,"[Entertainment, Music]",466.4,86800.0,[Cinco Patitos ü™ø | FAMILIA BLU üíô Canci√≥n Infan...
8,UCIwFjwMjI0y7PDBVEO9-bkQ,Justin Bieber,,CA,,2007-01-15T21:17:27Z,"[Soul music, Music, Pop music, Electronic musi...","[sorry, purpose, believe, anyone, beauty and a...",Music,[Music],179.1,307323.22,"[Justin Bieber - BAD HONEY, Justin Bieber - SP..."
9,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,Sony Entertainment Television is one of the le...,IN,,2006-09-20T22:24:59Z,"[Music of Asia, Entertainment, Television prog...","[cid, new business ideas, business deals, Kuna...",Entertainment,[Entertainment],2678.6,4802.44,[Wheel Of Fortune with Akshay Kumar | Mon-Fri ...


In [None]:
full_df = pd.concat(full_df)

In [70]:
df.head(10)

Unnamed: 0,channel_name,channel_id,category,country,defaultLanguage,created_date,description
0,BLACKPINK,UCOmHUn--16B90oW2L6FRR3A,"Music of Asia, Pop music, Music, Electronic music",KR,,2016-06-29T03:15:23Z,BLACKPINK Official YouTube Channel Î∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†ÌäúÎ∏å...
1,HYBE LABELS,UC3IZKseVpdzPSBaWxBxundA,"Hip hop music, Pop music, Music, Music of Asia",KR,,2008-06-04T08:23:22Z,Welcome to the official YouTube channel of HYB...
2,BILLIE EILISH,UCVNE660NcgYzi18LwwUZb7Q,,,,2019-01-18T05:14:32Z,
3,Shemaroo,UCF1JIbMUs6uqoZEY1Haw0GQ,"Entertainment, Film",IN,,2007-09-01T11:44:51Z,"Welcome to ShemarooEnt, one of the finest dest..."
4,JuegaGerman,UCYiGq8XF7YQD00x7wAd62Zg,"Role-playing video game, Action game, Video ga...",CL,,2013-05-19T00:09:13Z,Lento pero seguro.
5,LooLoo Kids - Nursery Rhymes and Children's Songs,UC4NALVCmcmL5ntpV0thoH6w,"Entertainment, Music, Film",US,en,2014-08-05T20:15:33Z,LooLoo Kidsüíñ is an educational YouTube channel...
6,Tips Official,UCJrDMFOdv1I2k8n9oK_V21w,"Film, Music, Music of Asia, Pop music",IN,,2007-05-22T10:13:28Z,The proud history of Tips Music Limited (Forme...
7,MrBeast,UCX6OQ3DkcsbYNE6H8uQQuVA,"Lifestyle (sociology), Entertainment, Film",US,en,2012-02-20T00:43:50Z,SUBSCRIBE FOR A COOKIE! New MrBeast or MrBeast...
8,El Reino Infantil,UCK1i2UviaXLUNrZlAFpw_jA,"Music, Entertainment, Film",AR,,2011-06-02T16:20:07Z,El Reino Infantil es la comunidad digital para...
9,Justin Bieber,UCIwFjwMjI0y7PDBVEO9-bkQ,"Hip hop music, Music, Pop music, Electronic mu...",CA,,2007-01-15T21:17:27Z,


## misc

In [59]:
channel_data = main("UCOmHUn--16B90oW2L6FRR3A")
print(json.dumps(channel_data, indent=2))

{
  "channel_id": "UCOmHUn--16B90oW2L6FRR3A",
  "channel_name": "BLACKPINK",
  "description": "BLACKPINK Official YouTube Channel\n\ube14\ub799\ud551\ud06c \uacf5\uc2dd \uc720\ud29c\ube0c \ucc44\ub110\uc785\ub2c8\ub2e4.\n\nJISOO, JENNIE, ROS\u00c9, LISA\n\uc9c0\uc218, \uc81c\ub2c8, \ub85c\uc81c, \ub9ac\uc0ac",
  "country": "KR",
  "defaultLanguage": null,
  "created_date": "2016-06-29T03:15:23Z",
  "category": [
    "Electronic music",
    "Music",
    "Music of Asia",
    "Pop music"
  ],
  "aggregated_tags": [
    "JISOO",
    "YG",
    "JENNIE You & Me",
    "LISA",
    "JENNIE \uc720\uc564\ubbf8",
    "\uc640\uc774\uc9c0",
    "\ube14\ub9c1\ud06c",
    "\uc720\uc564\ubbf8",
    "JENNIE COACHELLA",
    "JENNIE You and ME",
    "BLINK",
    "\uc81c\ub2c8 \ucf54\uccbc\ub77c",
    "JENNIE",
    "\ube14\ud551",
    "\ub85c\uc81c",
    "ROS\u00c9",
    "\uc9c0\uc218",
    "\ube14\ub799\ud551\ud06c",
    "\uc81c\ub2c8 You and Me",
    "\uc81c\ub2c8 \uc720\uc564\ubbf8",
    "YG Entertainme

In [46]:
df.head()

Unnamed: 0,channel_name,channel_id,category,country,defaultLanguage,created_date,description
0,BLACKPINK,UCOmHUn--16B90oW2L6FRR3A,"Music of Asia, Pop music, Music, Electronic music",KR,,2016-06-29T03:15:23Z,BLACKPINK Official YouTube Channel Î∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†ÌäúÎ∏å...
1,HYBE LABELS,UC3IZKseVpdzPSBaWxBxundA,"Hip hop music, Pop music, Music, Music of Asia",KR,,2008-06-04T08:23:22Z,Welcome to the official YouTube channel of HYB...
2,BILLIE EILISH,UCVNE660NcgYzi18LwwUZb7Q,,,,2019-01-18T05:14:32Z,
3,Shemaroo,UCF1JIbMUs6uqoZEY1Haw0GQ,"Entertainment, Film",IN,,2007-09-01T11:44:51Z,"Welcome to ShemarooEnt, one of the finest dest..."
4,JuegaGerman,UCYiGq8XF7YQD00x7wAd62Zg,"Role-playing video game, Action game, Video ga...",CL,,2013-05-19T00:09:13Z,Lento pero seguro.


In [45]:
df_test = pd.DataFrame([channel_data])
df_test

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_days_between_uploads,recent_video_titles
0,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,BLACKPINK Official YouTube Channel\nÎ∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†Ìäú...,KR,,2016-06-29T03:15:23Z,"[Electronic music, Pop music, Music of Asia, M...","[JISOO, YG, JENNIE You & Me, LISA, JENNIE Ïú†Ïï§ÎØ∏,...",Music,[Music],211.6,11.33,[BLACKPINK - WORLD TOUR [DEADLINE] IN HONG KON...


In [52]:
df["category"][0]

'Music of Asia, Pop music, Music, Electronic music'

In [50]:
df_test["category"][0][0]

'Electronic music'