---
title: "Data Collection"
format:
    html: 
        code-fold: false
---

<!-- {{< include instructions.qmd >}}  -->


{{< include overview.qmd >}} 

{{< include methods.qmd >}} 

# Code 

## YouTube Data Collection from YouTube Data API


In [1]:
# Core data handling
import pandas as pd
import numpy as np
from dateutil import parser

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
# Apply seaborn's "darkgrid" style globally for every plot in this notebook.
sns.set(style="darkgrid", color_codes=True)
# Notebook helpers for displaying JSON payloads
from IPython.display import JSON
from IPython.display import display_json
import json
# Google API
from googleapiclient.discovery import build

In [2]:
# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the 'stopwords' and 'punkt' NLTK packages into the local nltk_data
# directory; when they are already present NLTK only reports them as up to date.
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shenyuxi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shenyuxi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import os

# Requires the Google API client: pip install --upgrade google-api-python-client
# The key is read from the environment so it never has to be written into the
# notebook. Set it before starting Jupyter, e.g.  export YOUTUBE_API_KEY="..."
youtube_api_key = os.environ.get("YOUTUBE_API_KEY")
if not youtube_api_key:
    # Fail here with a clear message instead of a NameError / opaque API error later.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; export your YouTube Data API key before running this cell."
    )

# Service object used by every request helper defined below.
youtube = build('youtube', 'v3', developerKey=youtube_api_key)

In [4]:
import random

def retrieve_randomized_videos(token=None, region=None, category=None):
    """
    Fetch one page (up to 50 items) of the "mostPopular" video chart.

    Args:
        token (str, optional): `nextPageToken` taken from a previous response.
            When omitted (or empty) the first page is requested.
        region (str, optional): Region code for the chart. When omitted, one
            of the built-in region codes is picked at random.
        category (int or str, optional): Video category ID for the chart. When
            omitted, one of the built-in category IDs is picked at random.

    Returns:
        dict: The API response exactly as returned by `request.execute()`.

    Raises:
        Whatever `request.execute()` raises for a failed request.

    Note:
        With the defaults, a fresh region/category pair is drawn on every
        call, including calls that pass `token`. Pass `region` and `category`
        explicitly to keep paging through one and the same chart.
    """
    regions = ['US', 'GB', 'AU', 'NZ', 'IE']  # Example region codes
    # Hard-coded category IDs to sample from (not queried from the API).
    categories = [1, 10, 17, 20, 28, 23, 24, 25, 2]

    # Draw region first, then category, so seeded runs keep the original order
    # of random draws.
    if region is None:
        region = random.choice(regions)
    if category is None:
        category = random.choice(categories)

    request_params = {
        'part': "contentDetails,snippet,statistics,topicDetails",
        'maxResults': 50,
        'chart': "mostPopular",
        'regionCode': region,
        'videoCategoryId': category,
    }
    # Only send pageToken when continuing from a previous page.
    if token:
        request_params['pageToken'] = token

    request = youtube.videos().list(**request_params)
    return request.execute()


In [5]:
import time
from googleapiclient.errors import HttpError

def retrieve_200_videos(target_count=200, max_retries=3, retry_delay=1.0):
    """
    Collect up to `target_count` videos by paging through `retrieve_randomized_videos`.

    Pages are requested one after another, following `nextPageToken`, until
    the target is reached or the API reports no further page. A request that
    raises `HttpError` is retried (same page token) after `retry_delay`
    seconds; after more than `max_retries` consecutive failures collection
    stops and whatever has been gathered so far is returned.

    Args:
        target_count (int): Maximum number of videos to return. Defaults to 200.
        max_retries (int): Consecutive failed requests tolerated before giving up.
        retry_delay (float): Seconds to sleep before retrying a failed request.

    Returns:
        list: The raw video items, at most `target_count` of them. Items are
        appended as received, so a video can appear more than once.
    """
    all_videos = []  # To store all retrieved videos
    page_token = None  # None requests the first page
    consecutive_failures = 0

    # The list length is the single source of truth for progress, so a failed
    # request can never desynchronize a separate counter from the data.
    while len(all_videos) < target_count:
        try:
            response = retrieve_randomized_videos(page_token)
        except HttpError as e:
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                print(f"Stopping after {consecutive_failures} failed requests: {e}")
                break
            time.sleep(retry_delay)
            continue  # retry the same page; nothing was appended for it

        consecutive_failures = 0

        # Take only as many items as are still needed to hit the target.
        remaining = target_count - len(all_videos)
        all_videos.extend(response.get('items', [])[:remaining])
        if len(all_videos) >= target_count:
            break  # target reached; do not spend another request

        # Check for nextPageToken
        page_token = response.get('nextPageToken')
        if not page_token:
            print("No more pages available.")
            break  # Exit the loop if no more pages

    # Display the number of collected videos
    print(f"Total videos collected: {len(all_videos)}")
    return all_videos


In [6]:
import pandas as pd

def get_video_details(all_videos):
    """
    Extract relevant video details from a list of video data and return a structured DataFrame.

    Each video contributes one row. `video_id` comes from the item's top-level
    `id`; every other column is read from the matching part of the item
    (`snippet`, `statistics`, `contentDetails`, `topicDetails`). A missing
    part or field yields None for that cell. Values are stored exactly as
    they appear in the input (no type conversion).

    Args:
        all_videos (list): List of video data dictionaries.

    Returns:
        pd.DataFrame: DataFrame containing relevant video details. The column
        set and order are fixed, so an empty input produces an empty frame
        that still has every expected column.
    """
    # Define the fields to extract from each part of the API response
    stats_to_keep = {
        'snippet': ['channelTitle', 'title', 'tags', 'publishedAt'],
        'statistics': ['viewCount', 'likeCount', 'dislikeCount', 'favoriteCount', 'commentCount'],
        'contentDetails': ['duration', 'definition', 'caption'],
        'topicDetails': ['topicCategories']
    }
    # Fixed schema: video_id first, then the fields in declaration order.
    columns = ['video_id'] + [
        field for fields in stats_to_keep.values() for field in fields
    ]

    all_video_info = []
    for video in all_videos:
        video_info = {'video_id': video.get('id')}

        for part, fields in stats_to_keep.items():
            part_data = video.get(part)
            # A part that is absent, None, or not a dict contributes only Nones.
            if not isinstance(part_data, dict):
                part_data = {}
            for field in fields:
                video_info[field] = part_data.get(field)

        all_video_info.append(video_info)

    # Passing `columns` keeps the schema stable even when no rows were collected.
    return pd.DataFrame(all_video_info, columns=columns)

# Example usage: fetch one batch of videos and preview the extracted fields.
# `retrieve_200_videos()` issues live API requests each time this cell runs.
all_videos = retrieve_200_videos()
df = get_video_details(all_videos)
df.head()


No more pages available.
Total videos collected: 150


Unnamed: 0,video_id,channelTitle,title,tags,publishedAt,viewCount,likeCount,dislikeCount,favoriteCount,commentCount,duration,definition,caption,topicCategories
0,JQHZL3KefNk,Date,Kai Cenat Got PAYBACK The Helmet Game AGAIN! 😭💀,"[Kai Cenat, Kai, Cenat, Kai Cenat Live, Kai Ce...",2024-11-19T19:39:52Z,67783623,2105665,,0,2469,PT40S,hd,False,[https://en.wikipedia.org/wiki/Sport]
1,MkoGUJFtt0Q,Vazho,The World's Most Hilarious and Hostile Mascot,,2024-11-17T19:20:51Z,17693526,446088,,0,1461,PT16S,hd,False,[https://en.wikipedia.org/wiki/Sport]
2,6enUpPF-WA4,Rowan University,Hey Coach! 👋 How many can Coach Jespersen gues...,,2024-11-14T21:23:59Z,21672109,1592856,,0,1701,PT1M,hd,False,[https://en.wikipedia.org/wiki/Sport]
3,xUokiJu4rUE,Autumn Nations Series,HIGHLIGHTS | ITALY V NEW ZEALAND | AUTUMN NATI...,,2024-11-23T22:48:19Z,461939,3772,,0,758,PT5M48S,hd,False,[https://en.wikipedia.org/wiki/Sport]
4,U2UQ7Io4OuU,Battleground MMA,🕵️‍♂️Rampage Jackson Admits Cheating🧴,"[ufc, jaxxon podcast, mma, mma shorts, ufc sho...",2024-11-17T20:49:36Z,6572493,360710,,0,1206,PT22S,hd,False,[https://en.wikipedia.org/wiki/Mixed_martial_a...


In [7]:
# Repeat the randomized collection and stack the per-run frames into `df`.
# Frames are gathered in a list and concatenated once; calling pd.concat inside
# the loop would re-copy every previously collected row on each iteration.
# Runs can return the same videos, so `df` may contain repeated video_id values.
N_COLLECTION_RUNS = 100

batch_frames = []
for _ in range(N_COLLECTION_RUNS):
    all_videos = retrieve_200_videos()
    batch_frames.append(get_video_details(all_videos))

df = pd.concat(batch_frames, axis=0, ignore_index=True)

print(df)

No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 183
No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 40
No more pages available.
Total videos collected: 150
No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 100
No more pages available.
Total videos collected: 150
No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 150
No more pages available.
Total videos collected: 40
No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 50
No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 50
No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 150
No more pages available.
Total videos collected: 200
No more pages available.
Total videos collected: 3

In [8]:
# Number of distinct video ids collected (a missing id counts as one value).
df["video_id"].nunique(dropna=False)

2578

In [21]:
# Summarize view counts. The values are coerced to numbers first so the summary
# is numeric even if the column still holds the string values delivered by the
# API; an already numeric column passes through unchanged.
pd.to_numeric(df["viewCount"], errors="coerce").describe()

count    1.359800e+04
mean     5.373338e+06
std      1.084820e+07
min      6.958000e+03
25%      7.280870e+05
50%      2.552836e+06
75%      6.193342e+06
max      2.172830e+08
Name: viewCount, dtype: float64

In [24]:
# Label a video "high" when it has more than 6,000,000 views, otherwise "low".
# In the recorded describe() output the median is ~2.55M and the 75th percentile
# ~6.19M, so this cut-off marks roughly the top quarter of videos as "high".
# Counts are coerced to numbers so the comparison also works on string values;
# missing or unparseable counts compare False and are labelled "low".
POPULARITY_VIEW_THRESHOLD = 6000000
view_counts = pd.to_numeric(df["viewCount"], errors="coerce")
df["popularity"] = np.where(view_counts > POPULARITY_VIEW_THRESHOLD, "high", "low")


In [26]:
from pathlib import Path

# Write the collected data to the raw-data folder, creating the folder first
# because to_csv raises an error when the target directory does not exist.
raw_data_path = Path("../../data/raw-data/youtube_data_raw.csv")
raw_data_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_data_path)

{{< include closing.qmd >}} 