## Setup

In [2]:
from googleapiclient.discovery import build
import json
import re
from collections import Counter
from datetime import datetime
from googleapiclient.discovery import build
import pandas as pd
import numpy as np

In [3]:
# Read the API key from a local text file that lives outside the notebook;
# the key is taken from the third line of that file.
with open("../performative.txt", "r") as f:
    lines = f.readlines()
API_KEY = lines[2].strip()

# YouTube Data API v3 client used by the cells below.
yt = build("youtube", "v3", developerKey=API_KEY)

## Initial testing

In [4]:
# Search for channels matching a free-text query, asking for five results.
request = yt.search().list(part="snippet", q="Mr. Beast", type="channel", maxResults=5)
response = request.execute()

In [5]:
# Dump the raw search response as indented JSON to inspect its structure.
print(json.dumps(response, indent=2))

{
  "kind": "youtube#searchListResponse",
  "etag": "31HkOb37QXogpESQJXXSkfWvJb4",
  "nextPageToken": "CAUQAA",
  "regionCode": "CA",
  "pageInfo": {
    "totalResults": 43769,
    "resultsPerPage": 5
  },
  "items": [
    {
      "kind": "youtube#searchResult",
      "etag": "9VY0mroPCq4siyhZXwnBETMaJ7w",
      "id": {
        "kind": "youtube#channel",
        "channelId": "UCX6OQ3DkcsbYNE6H8uQQuVA"
      },
      "snippet": {
        "publishedAt": "2012-02-20T00:43:50Z",
        "channelId": "UCX6OQ3DkcsbYNE6H8uQQuVA",
        "title": "MrBeast",
        "description": "SUBSCRIBE FOR A COOKIE! New MrBeast or MrBeast Gaming video every single Saturday at noon eastern time!",
        "thumbnails": {
          "default": {
            "url": "https://yt3.ggpht.com/nxYrc_1_2f77DoBadyxMTmv7ZpRZapHR5jbuYe7PlPd5cIRJxtNNEYyOC0ZsxaDyJJzXrnJiuDE=s88-c-k-c0xffffffff-no-rj-mo"
          },
          "medium": {
            "url": "https://yt3.ggpht.com/nxYrc_1_2f77DoBadyxMTmv7ZpRZapHR5jbuYe7Pl

In [6]:
def get_channel_data(channei_id):
    """Unfinished stub: builds a ``channels().list()`` request and returns None.

    NOTE(review): the parameter name looks like a typo for ``channel_id`` and is
    never used; the request is built without arguments and is never executed.
    ``get_profile`` later in this notebook appears to be the working version —
    confirm and consider deleting this cell.
    """
    response = yt.channels().list()

## Getting data from the API

### Helpers

In [16]:
# Lookup from the string ``categoryId`` found in a video's snippet to a readable
# genre name; IDs missing from this table are reported as "Unknown" by get_videos_data.
# NOTE(review): presumably mirrors YouTube's video-category IDs — confirm the list is complete.
GENRE_MAP = {
    "1":  "Film & Animation",
    "2":  "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "19": "Travel & Events",
    "20": "Gaming",
    "22": "People & Blogs",
    "23": "Comedy",
    "24": "Entertainment",
    "25": "News & Politics",
    "26": "Howto & Style",
    "27": "Education",
    "28": "Science & Technology",
    "29": "Nonprofits & Activism",
}

In [8]:
### ACCEPTS: datetime objects in any order (they are sorted before gaps are measured)
def calculate_upload_gap(dates):
    """Return the mean time between consecutive uploads, in days.

    Gaps are measured with sub-day precision: ``timedelta.days`` would truncate
    every gap to whole days, so a channel uploading twice a day would be
    reported as having a gap of 0.

    Args:
        dates: datetime objects, one per upload. They must be mutually
            comparable (all naive or all timezone-aware).

    Returns:
        The mean gap in days rounded to two decimals, or None when fewer than
        two dates are given.
    """
    if len(dates) < 2:
        return None

    ordered = sorted(dates)
    seconds_per_day = 86400
    gaps = [
        (later - earlier).total_seconds() / seconds_per_day
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return round(sum(gaps) / len(gaps), 2)

In [9]:
def to_seconds(duration):
    """Convert an ISO 8601 duration string into a number of seconds.

    Accepts the time-only form ``PT#H#M#S`` as well as the day-bearing form
    ``P#DT#H#M#S``. Every component is optional, and the whole string must
    match, so trailing garbage is rejected instead of being silently ignored.

    Args:
        duration: Duration string, or a falsy value when none is available.

    Returns:
        Total seconds as an int, or None when ``duration`` is empty or does not
        look like a supported duration.
    """
    if not duration:
        return None

    match = re.fullmatch(
        r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
        duration,
    )
    if not match:
        return None

    # Components that are absent come back as None and count as zero.
    days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return days * 86400 + hours * 3600 + minutes * 60 + seconds

### Data-sourcing functions

In [None]:
def get_profile(channel_id):
    """Look up one channel and return the pieces the rest of the pipeline needs.

    Args:
        channel_id: Id of the channel to fetch.

    Returns:
        A tuple ``(snippet, uploads_id, categories)``: the channel's snippet
        dict, the id of its uploads playlist, and a list of topic names taken
        from the last path segment of each topic-category URL (underscores
        replaced by spaces). ``categories`` is empty when the channel has no
        topic details.
    """
    request = yt.channels().list(
        id=channel_id,
        part="snippet,contentDetails,topicDetails,brandingSettings",  # TODO: never used brandingSettings
    )
    channel = request.execute()["items"][0]

    snippet = channel["snippet"]
    uploads_id = channel["contentDetails"]["relatedPlaylists"]["uploads"]

    categories = []
    for topic_url in channel.get("topicDetails", {}).get("topicCategories", []):
        topic_name = topic_url.split("/")[-1]
        categories.append(topic_name.replace("_", " "))

    return snippet, uploads_id, categories

In [11]:
def get_recent_videos_and_dates(uploads_id, max_results=10):
    """Fetch one page of a playlist and return its video ids and publish dates.

    Args:
        uploads_id: Id of the playlist to read (the channel's uploads playlist).
        max_results: Number of playlist items to request. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(video_ids, upload_dates)``. ``video_ids`` keeps the order of
        the API response; ``upload_dates`` holds the items' ``publishedAt``
        timestamps as timezone-aware datetimes sorted in ascending order. Both
        lists are empty when the response carries no items.
    """
    response = yt.playlistItems().list(
        part="snippet,contentDetails",
        playlistId=uploads_id,
        maxResults=max_results,
    ).execute()

    video_ids, upload_dates = [], []
    for item in response.get("items", []):
        video_ids.append(item["contentDetails"]["videoId"])
        published_at = item["snippet"]["publishedAt"]
        # Spell the trailing "Z" as an explicit UTC offset so fromisoformat()
        # also accepts it on Python versions that do not parse "Z".
        upload_dates.append(datetime.fromisoformat(published_at.replace("Z", "+00:00")))

    upload_dates.sort()
    return video_ids, upload_dates


In [None]:
def get_videos_data(video_ids):
    """Fetch per-video metadata and aggregate it across the given videos.

    Args:
        video_ids: List of video id strings. When it is empty no request is
            sent and four empty lists are returned.

    Returns:
        A tuple ``(tags, genres, durations, titles)``:
        - ``tags``: every tag of every returned video, flattened into one list.
        - ``genres``: one genre name per returned video, looked up in
          ``GENRE_MAP`` ("Unknown" when the category id is not in the map).
        - ``durations``: one ``to_seconds`` result per returned video; an
          entry is None when ``to_seconds`` could not produce a value.
        - ``titles``: one title per returned video (None when absent).
    """
    tags, genres, durations, titles = [], [], [], []

    # Nothing to look up: skip the API call instead of sending an empty id filter.
    if not video_ids:
        return tags, genres, durations, titles

    response = yt.videos().list(
        part="snippet,contentDetails",
        id=",".join(video_ids)
    ).execute()

    for v in response.get("items", []):
        snippet = v["snippet"]
        tags.extend(snippet.get("tags", []))
        genres.append(GENRE_MAP.get(snippet.get("categoryId"), "Unknown"))
        durations.append(to_seconds(v["contentDetails"].get("duration")))
        titles.append(snippet.get("title"))

    return tags, genres, durations, titles

In [None]:
def main(channel_id="UCD1P9oDMsLuZyfJ2PEciTwg"):
    """Build the full feature record for one channel.

    Args:
        channel_id: Id of the channel to describe. Defaults to the example
            channel https://www.youtube.com/@MikeBartner.

    Returns:
        A dict with the channel's profile fields (taken from its snippet), its
        topic categories, and aggregates over its recent videos: de-duplicated
        tags and genres (sorted so repeated runs give the same output), the
        most common genre, the mean duration in seconds, the mean number of
        days between uploads, and the video titles. Aggregates that cannot be
        computed are None.
    """
    snippet, uploads_id, categories = get_profile(channel_id)
    video_ids, upload_dates = get_recent_videos_and_dates(uploads_id)
    avg_upload_gap = calculate_upload_gap(upload_dates)
    tags, genres, durations, titles = get_videos_data(video_ids)

    # A duration is None when it could not be parsed; summing a None would
    # raise TypeError, so average only over the known values.
    known_durations = [d for d in durations if d is not None]
    if known_durations:
        avg_duration = round(sum(known_durations) / len(known_durations), 2)
    else:
        avg_duration = None

    return {
        "channel_id": channel_id,
        "channel_name": snippet.get("title"),
        "description": snippet.get("description"),
        "country": snippet.get("country"),
        "defaultLanguage": snippet.get("defaultLanguage"),
        "created_date": snippet.get("publishedAt"),
        "category": categories,
        "aggregated_tags": sorted(set(tags)),
        "most_common_video_genre": Counter(genres).most_common(1)[0][0] if genres else None,
        "all_video_genres": sorted(set(genres)),
        "avg_duration_seconds": avg_duration,
        "avg_days_between_uploads": avg_upload_gap,
        "recent_video_titles": titles,
    }


## Script for dataset

Columns for the dataset:  
- from given dataset  
    - channel_id
    - channel_name
    - description
    - country
    - defaultLanguage
    - created_date
    - category
- from the YouTube API
    - aggregated_tags
    - most_common_video_genre
    - all_video_genres
    - avg_duration_seconds
    - avg_days_between_uploads
    - recent_video_titles

In [18]:
# Load the raw channel-level dataset and print a summary of its columns.
df = pd.read_csv("../data/raw/youtube_channel_info_v2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830 entries, 0 to 15829
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   channel_name         15830 non-null  object
 1   channel_id           15830 non-null  object
 2   view_count           15830 non-null  int64 
 3   category             15722 non-null  object
 4   country              14030 non-null  object
 5   defaultLanguage      1725 non-null   object
 6   subscriber_count     15830 non-null  int64 
 7   created_date         15830 non-null  object
 8   description          14929 non-null  object
 9   custom_url           15808 non-null  object
 10  thumbnail            15830 non-null  object
 11  video_count          15830 non-null  int64 
 12  videos_last_30_days  15830 non-null  int64 
 13  views_last_30_days   15830 non-null  int64 
 14  uploads_playlist_id  15830 non-null  object
 15  last_10_video_ids    15774 non-null  object
dtypes: i

In [20]:
# Columns that already exist in the provided dataset.
dataset_cols = [
    "channel_id",
    "channel_name",
    "description",
    "country",
    "defaultLanguage",
    "created_date",
    "category",
]
# Columns that are to be filled in from the YouTube API.
api_cols = [
    "aggregated_tags",
    "most_common_video_genre",
    "all_video_genres",
    "avg_duration_seconds",
    "avg_days_between_uploads",
    "recent_video_titles",
]
cols = dataset_cols + api_cols

In [22]:
# Discard every column of the raw frame that is not listed in `cols`.
columns_to_drop = [name for name in df.columns if name not in cols]
df = df.drop(columns=columns_to_drop)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830 entries, 0 to 15829
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   channel_name     15830 non-null  object
 1   channel_id       15830 non-null  object
 2   category         15722 non-null  object
 3   country          14030 non-null  object
 4   defaultLanguage  1725 non-null   object
 5   created_date     15830 non-null  object
 6   description      14929 non-null  object
dtypes: object(7)
memory usage: 865.8+ KB


## misc

In [33]:
# Run the pipeline through main() and pretty-print the resulting record as JSON.
channel_data = main()
print(json.dumps(channel_data, indent=2))

{
  "channel_id": "UCD1P9oDMsLuZyfJ2PEciTwg",
  "channel_name": "Mike Bartner",
  "description": "Just your average hockey fan that loves to make videos",
  "country": null,
  "defaultLanguage": null,
  "created_date": "2012-03-03T02:44:12Z",
  "topics": [
    "Ice hockey",
    "Sport"
  ],
  "aggregated_tags": [],
  "most_common_video_category": "Entertainment",
  "all_video_categories": [
    "Entertainment"
  ],
  "avg_duration_seconds": 584.9,
  "avg_days_between_uploads": 0.78,
  "recent_video_titles": [
    "My Reaction To The WILD Olympic Quarterfinals",
    "My Olympic Hockey Quarterfinal Predictions",
    "10 Takeaways From The Olympics Thus Far",
    "Frustrating win! #Hockey #OlympicHockey #USAHockey #AmericanHockey #Olympics",
    "Canada boutta hang 15 goals on France #Hockey #OlympicHockey #Olympics #HockeyCanada #USAHockey",
    "What a delight to watch #NHL #Hockey #TeamCanada #Canada #McDavid#MacKinnonCelebrini",
    "Put some respect on Slaf\u2019s name!!! #NHL #Hocke