# Setup

In [183]:
import os
import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd

In [184]:
# Read the API key from the first line of the local secrets file;
# surrounding whitespace (including the trailing newline) is stripped.
with open('secret/secret.txt') as secret_file:
    api_key = secret_file.readline().strip()

# Build the YouTube Data API v3 client that the fetch helpers in this
# notebook reference through the module-level name `youtube`.
youtube = googleapiclient.discovery.build(
    "youtube",
    "v3",
    developerKey=api_key,
)

# Load and Cache

In [185]:
def get_and_add_playlist_id_to_dataframe(dataframe, channel_ids):
    """Look up each channel's "uploads" playlist ID and store it on ``dataframe``.

    One ``channels().list`` request (part ``contentDetails``) is issued per
    entry of ``channel_ids`` through the module-level ``youtube`` client.

    Args:
        dataframe: DataFrame that receives a new ``playlist_id`` column. It is
            modified in place and also returned.
        channel_ids: Iterable of channel IDs. Its length must match the number
            of rows in ``dataframe``, because the collected IDs are assigned
            as a whole column.

    Returns:
        The same ``dataframe`` object with the ``playlist_id`` column set.
        Channels for which the response carries no usable item get ``None``
        instead of aborting the whole lookup.
    """
    playlist_ids = []
    for channel_id in channel_ids:
        response = youtube.channels().list(
            part="contentDetails",
            id=channel_id,
        ).execute()

        # A response may come back without any items (e.g. unknown channel);
        # record a missing value rather than raising KeyError/IndexError.
        items = response.get("items") or []
        if items:
            uploads = (
                items[0]
                .get("contentDetails", {})
                .get("relatedPlaylists", {})
                .get("uploads")
            )
        else:
            uploads = None
        playlist_ids.append(uploads)

    dataframe['playlist_id'] = playlist_ids
    return dataframe

In [186]:
import ast

def _lookup(video, *keys, default=float('nan')):
    """Follow ``keys`` through nested mappings and return the value found.

    Returns ``default`` (NaN unless overridden) when any key on the path is
    absent, so optional API fields do not abort the fetch.
    """
    value = video
    try:
        for key in keys:
            value = value[key]
    except KeyError:
        return default
    return value


def get_video_data(video_data_path, channel_data):
    """Load per-video metadata from a CSV cache, or fetch it from the API.

    If ``video_data_path`` exists it is read with ``pd.read_csv`` and returned
    without any API call. Otherwise one ``videos().list`` request is issued
    per row of ``channel_data`` through the module-level ``youtube`` client,
    and every returned item becomes one row of the result.

    Args:
        video_data_path: Path of the CSV cache file.
        channel_data: Table with ``video_id`` and ``channel_id`` columns that
            are iterated in parallel. Each ``video_id`` value is passed
            unchanged as the ``id`` filter of the request.

    Returns:
        DataFrame with the columns channel_id, video_id, video_title,
        publish_date, view_count, like_count, dislike_count, comment_count,
        default_language, duration and has_paid_product_placement. Fields
        missing from an API item are stored as NaN.

    Notes:
        Any exception raised while fetching is printed and ends the loop; the
        rows collected up to that point are still returned and, if there is at
        least one, written to ``video_data_path``. A later call will then load
        that (possibly partial) cache.
    """
    if os.path.exists(video_data_path):
        return pd.read_csv(video_data_path)

    columns = [
        "channel_id", "video_id", "video_title", "publish_date",
        "view_count", "like_count", "dislike_count", "comment_count",
        "default_language", "duration", "has_paid_product_placement",
    ]
    missing = object()  # sentinel so a legitimate False is not mistaken for "absent"
    records = []  # collected as plain dicts; the frame is built once at the end

    try:
        for video_ids, channel_id in zip(channel_data["video_id"], channel_data["channel_id"]):
            video_response = youtube.videos().list(
                # paidProductPlacementDetails is the part that carries
                # hasPaidProductPlacement; without it the field is never returned.
                part="snippet,statistics,contentDetails,paidProductPlacementDetails",
                id=video_ids,
            ).execute()

            # A response without "items" simply contributes no rows.
            for video in video_response.get("items", []):
                paid = _lookup(
                    video, "paidProductPlacementDetails", "hasPaidProductPlacement",
                    default=missing,
                )
                if paid is missing:
                    # Fall back to the location the previous implementation read.
                    paid = _lookup(video, "contentDetails", "hasPaidProductPlacement")

                records.append({
                    "video_id": _lookup(video, "id"),
                    "channel_id": channel_id,
                    "video_title": _lookup(video, "snippet", "title"),
                    "view_count": _lookup(video, "statistics", "viewCount"),
                    "like_count": _lookup(video, "statistics", "likeCount"),
                    "dislike_count": _lookup(video, "statistics", "dislikeCount"),
                    "comment_count": _lookup(video, "statistics", "commentCount"),
                    "publish_date": _lookup(video, "snippet", "publishedAt"),
                    "default_language": _lookup(video, "snippet", "defaultLanguage"),
                    "duration": _lookup(video, "contentDetails", "duration"),
                    "has_paid_product_placement": paid,
                })
    except Exception as e:
        # Best effort: report the failure and keep whatever was fetched so far.
        print(f"Error {e}")

    video_data = pd.DataFrame(records, columns=columns)

    # Only create the cache file when there is something to store.
    if len(video_data) > 0:
        video_data.to_csv(video_data_path, index=False)

    return video_data

In [187]:
# Locations of the two CSV files this notebook works from.
channel_data_path = "ChannelID_Data.csv"
video_data_path = "video_data.csv"

# Both tables are read straight from disk here.
channel_data = pd.read_csv(channel_data_path)
video_data = pd.read_csv(video_data_path)
 

# Clean Data

In [None]:
# Remove columns that are not used in the analysis. errors='ignore' keeps this
# cell re-runnable: on a second run the columns are already gone and a plain
# drop would raise KeyError.
channel_data = channel_data.drop(columns=['Unnamed: 0'], errors='ignore')
video_data = video_data.drop(columns=['dislike_count'], errors='ignore')

# Join channel-level and video-level rows on channel_id (inner join by
# default); identically named columns receive the _x (channel) / _y (video)
# suffixes.
merged_data = pd.merge(channel_data, video_data, on="channel_id")
display(merged_data.head(1))

In [None]:
# Disambiguate the two view_count columns produced by the merge:
# the _x suffix comes from the left (channel) frame, _y from the right (video) frame.
merged_data = merged_data.rename(columns={
    'view_count_x': 'channel_view_count',
    'view_count_y': 'video_view_count',
})

# Duration in seconds.
# total_seconds() is used instead of .seconds because .seconds only holds the
# sub-day remainder and silently truncates videos lasting 24 hours or more.
# The dtype guard keeps the cell re-runnable: an already converted numeric
# column would otherwise be reinterpreted by to_timedelta as nanoseconds.
if not pd.api.types.is_numeric_dtype(merged_data["duration"]):
    merged_data["duration"] = merged_data["duration"].apply(
        lambda x: pd.to_timedelta(x).total_seconds()
    )

# Convert publish_date to datetime.
merged_data["publish_date"] = pd.to_datetime(merged_data["publish_date"])

# A missing default_language becomes the literal "none" so that these rows
# survive the dropna below.
merged_data["default_language"] = merged_data["default_language"].fillna("none")

# Drop has_paid_product_placement before dropna so that its missing values do
# not remove rows; errors="ignore" keeps the cell re-runnable.
merged_data = merged_data.drop(columns=["has_paid_product_placement"], errors="ignore")

# Drop every row that still has a missing value in any remaining column.
merged_data = merged_data.dropna()

# Convert the numeric columns to ints (safe only after dropna, since NaN
# cannot be represented in an int column).
merged_data = merged_data.astype({
    "video_view_count": int,
    "channel_view_count": int,
    "like_count": int,
    "comment_count": int,
    "subscriber_count": int,
    "video_count": int,
    "duration": int,
})

display(merged_data.head(3))

In [None]:
# Derived features, all added in a single assign call:
#   is_in_covid                       1 when the video was published in 2020 or 2021, else 0
#   engagement_comments               comment_count / video_view_count
#   engagement_likes                  like_count / video_view_count
#   engagement_subscribers            subscriber_count / video_view_count
#   view_per_subscriber               channel_view_count / subscriber_count
#   video_view_per_total_channel_view video_view_count / channel_view_count
#   video_views_per_video_count       channel_view_count / video_count
#                                     (channel-level views, not the single video's views)
merged_data = merged_data.assign(
    is_in_covid=merged_data["publish_date"].apply(
        lambda published: 1 if 2020 <= published.year <= 2021 else 0
    ),
    engagement_comments=merged_data["comment_count"].div(merged_data["video_view_count"]),
    engagement_likes=merged_data["like_count"].div(merged_data["video_view_count"]),
    engagement_subscribers=merged_data["subscriber_count"].div(merged_data["video_view_count"]),
    view_per_subscriber=merged_data["channel_view_count"].div(merged_data["subscriber_count"]),
    video_view_per_total_channel_view=merged_data["video_view_count"].div(
        merged_data["channel_view_count"]
    ),
    video_views_per_video_count=merged_data["channel_view_count"].div(merged_data["video_count"]),
)

display(merged_data.head(3))