### YouTube Data Engagement
- How does engagement of live videos differ from non-live videos?
- What is the completion rate of educational/series type playlists?
- How does engagement rate of shorts differ from non-shorts?

In [None]:
# -*- coding: utf-8 -*-

# Sample Python code for youtube.channels.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/code-samples#python

import os

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import seaborn as sns

import numpy as np
import pandas as pd
import isodate

In [None]:
#scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

In [None]:
# Read the API key from the `youtube_API_KEY` environment variable.
# os.environ[...] raises KeyError if the variable is not set.
api_key = os.environ["youtube_API_KEY"]
#channel_user_name = 'AlexTheAnalyst'

In [None]:
# Build the API client for the "youtube" service, version "v3",
# authenticating with the developer key read earlier.
api_service_name, api_version = "youtube", "v3"
youtube = googleapiclient.discovery.build(
    api_service_name,
    api_version,
    developerKey=api_key,
)

In [None]:
## function to get channel statistics 

def get_channel_stats (youtube, channel_user_name):
    """Fetch headline statistics for the channel that belongs to a username.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_user_name: username passed to the API as ``forUsername``.

    Returns:
        dict with keys ``channel_name``, ``subscribers``, ``views`` and
        ``total_videos``. Statistic values are passed through unchanged from
        the response; a statistic missing from the response is ``None``.

    Raises:
        KeyError: if the response contains no channel for the username.
    """
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        forUsername=channel_user_name
    )
    response = request.execute()

    # A lookup that matches nothing may come back with no 'items' at all;
    # fail with a message that names the username instead of an opaque
    # KeyError('items') / IndexError.
    items = response.get('items') or []
    if not items:
        raise KeyError(f"no YouTube channel found for username {channel_user_name!r}")

    channel = items[0]
    # Individual statistics can be absent from the response, so read them
    # with .get() rather than assuming every key is present.
    statistics = channel.get('statistics', {})

    data = dict(
        channel_name = channel['snippet']['title'],
        subscribers = statistics.get('subscriberCount'),
        views = statistics.get('viewCount'),
        total_videos = statistics.get('videoCount'),
    )
    return data

In [None]:
# Look up channel statistics by username ('michellechoii').
channel_stats = get_channel_stats(youtube, 'michellechoii')

In [None]:
channel_stats

In [None]:
# Wrap the stats dict in a one-row DataFrame (row label 0).
channel_data = pd.DataFrame(channel_stats, index = [0])

In [None]:
channel_data

In [None]:
# Hard-coded YouTube channel IDs.
# NOTE(review): presumably the channels the variable names suggest — confirm.
Alex_the_analyst_channel_id = 'UC7cs8q-gJRlGwj4A8OmCmXg'
tech_tfq_channel_id = 'UCnz-ZXXER4jOvuED5trXfEA'
lawlumberacct = 'UCtbp5yMSXR3BKR3x8oCRZdQ'

In [None]:
## function to get the playlists of a channel

def get_playlist_stats (youtube, channel_id):
    """Return one row per playlist of ``channel_id`` as a DataFrame.

    Every result page is fetched by following ``nextPageToken``, and rows are
    built from the items actually present on each page. Indexing by
    ``pageInfo.totalResults`` would overrun the first page whenever a channel
    has more playlists than fit in a single response.

    Args:
        youtube: API client exposing ``playlists().list(...).execute()``.
        channel_id: ID of the channel whose playlists are listed.

    Returns:
        pandas.DataFrame with columns ``channel_name``, ``playlist_id``,
        ``playlist_publiushed_at``, ``playlist_title``, ``description``,
        ``video_count`` and ``privacy_status``; empty if there are no playlists.
    """
    all_data = []
    page_token = None

    while True:
        params = dict(
            part = "contentDetails, status, player, snippet",
            channelId = channel_id,
            maxResults = 50)
        # Only send pageToken on follow-up requests.
        if page_token:
            params['pageToken'] = page_token

        playlist_json = youtube.playlists().list(**params).execute()

        for item in playlist_json.get('items', []):
            snippet = item['snippet']
            all_data.append(dict(
                channel_name = snippet['channelTitle'],
                playlist_id = item['id'],
                # Key spelling kept as-is: it becomes a column name that
                # existing code may already reference.
                playlist_publiushed_at = snippet['publishedAt'],
                playlist_title = snippet['title'],
                description = snippet['description'],
                video_count = item['contentDetails']['itemCount'],
                privacy_status = item['status']['privacyStatus']
            ))

        page_token = playlist_json.get('nextPageToken')
        if not page_token:
            break

    return pd.DataFrame(all_data)

In [None]:
# Playlist table for channel 'UC3p6NndPDqX8bwQFT3p8_Qw'.
RA_playlist = get_playlist_stats(youtube, 'UC3p6NndPDqX8bwQFT3p8_Qw')

In [None]:
RA_playlist.head()

In [None]:
# Pull the playlist IDs out as a NumPy array.
RA_playlist_ids = np.array(RA_playlist['playlist_id'])

In [None]:
RA_playlist_ids

In [None]:
#get_playlist_stats(youtube, lawlumberacct)

In [None]:
#get_playlist_stats(youtube, tech_tfq_channel_id)

In [None]:
# Display the playlists of this channel; the literal ID is the same value
# as Alex_the_analyst_channel_id.
get_playlist_stats(youtube, 'UC7cs8q-gJRlGwj4A8OmCmXg')

In [None]:
def parse_video_json(playlistitems_json, num_vids):
    """Flatten the first ``num_vids`` playlist items into a list of dicts.

    Only items whose ``status.privacyStatus`` is ``'public'`` are kept; all
    other items are skipped without reading their remaining fields.

    Args:
        playlistitems_json: response dict with an ``'items'`` list.
        num_vids: number of leading items to inspect.

    Returns:
        list of dicts, one per public item, in the original item order.
    """
    public_rows = []
    for idx in range(num_vids):
        entry = playlistitems_json['items'][idx]
        visibility = entry['status']['privacyStatus']
        if visibility != 'public':
            # Non-public items are ignored entirely.
            continue

        snippet = entry['snippet']
        public_rows.append({
            'channel_name': snippet['channelTitle'],
            'vid_id': entry['contentDetails']['videoId'],
            'vid_position': snippet['position'],
            'vid_published_at': snippet['publishedAt'],
            'vid_title': snippet['title'],
            'vid_description': snippet['description'],
            'channel_id': snippet['videoOwnerChannelId'],
            'privacy_status': visibility,
        })
    return public_rows

In [None]:
## function to get the videos (playlist items) of a playlist

def get_playlistitems_stats (youtube, playlist_id):
    """Collect the public items of a playlist into a DataFrame.

    Requests pages of up to 50 items and keeps requesting while a response
    carries a ``nextPageToken``. Each page is flattened by
    ``parse_video_json``.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_id: ID of the playlist to read.

    Returns:
        pandas.DataFrame built from the rows of all pages.
    """
    query = dict(
        part = "contentDetails, status, id, snippet",
        playlistId = playlist_id,
        maxResults = 50,
    )
    collected = []

    while True:
        page = youtube.playlistItems().list(**query).execute()
        collected = collected + parse_video_json(page, len(page['items']))

        # Stop on the first page that carries no continuation token.
        if 'nextPageToken' not in page:
            break
        query['pageToken'] = page.get('nextPageToken')

    return pd.DataFrame(collected)

In [None]:
# Video IDs of the public items in playlist 'PLUaB-1hjhk8FE_XZ87vPPSfHqb6OcM0cF', as a NumPy array.
analyticsBootCampVideos = np.array(get_playlistitems_stats(youtube, 'PLUaB-1hjhk8FE_XZ87vPPSfHqb6OcM0cF')['vid_id'])

In [None]:
#get_playlistitems_stats(youtube, 'PLUaB-1hjhk8FE_XZ87vPPSfHqb6OcM0cF')

In [None]:
#get_playlistitems_stats (youtube, 'PLUaB-1hjhk8FE_XZ87vPPSfHqb6OcM0cF')

In [None]:
#get_playlistitems_stats (youtube, 'PLjAQjmBL37WtLOD7js9ECkmEbDZ6LCUjD')

In [None]:
def get_video_stats (youtube, video_ids=None):
    """Fetch per-video details and statistics for a collection of video IDs.

    IDs are requested in comma-separated batches of up to 50 instead of one
    request per video. Rows come back in the order of ``video_ids``
    (duplicates included). IDs for which the API returns no item are skipped
    rather than raising, and statistics missing from a response are ``None``.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: iterable of video ID strings (list, NumPy array, ...);
            ``None`` or empty yields an empty DataFrame.

    Returns:
        pandas.DataFrame with columns ``videoId``, ``channelId``,
        ``publishedAt``, ``viewCcount``, ``likeCount``, ``favoriteCount``,
        ``commentCount``, ``liveBroadcastContent`` and ``duration`` (no
        columns when there are no rows).
    """
    batch_size = 50
    ids = [] if video_ids is None else [str(v) for v in video_ids]
    # Request each distinct ID once, even if it appears several times.
    unique_ids = list(dict.fromkeys(ids))

    items_by_id = {}
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        # maxResults is deliberately not sent: it is documented as
        # unsupported in combination with the id filter.
        results = youtube.videos().list(
            part = "contentDetails, id, liveStreamingDetails, localizations, player, snippet, statistics, status, topicDetails",
            id = ','.join(batch)).execute()
        for item in results.get('items', []):
            items_by_id[item['id']] = item

    full_data = []
    for vid_id in ids:
        item = items_by_id.get(vid_id)
        if item is None:
            # The API returned nothing for this ID; skip it.
            continue
        snippet = item['snippet']
        statistics = item.get('statistics', {})
        vidDetails = dict(
            videoId = item['id'],
            channelId = snippet['channelId'],
            publishedAt = snippet['publishedAt'],
            # Key spelling kept: changeColumnDataType reads 'viewCcount'.
            viewCcount = statistics.get('viewCount'),
            likeCount = statistics.get('likeCount'),
            favoriteCount = statistics.get('favoriteCount'),
            commentCount = statistics.get('commentCount'),
            liveBroadcastContent = snippet['liveBroadcastContent'],
            duration = item['contentDetails']['duration'],
        )
        full_data.append(vidDetails)

    return pd.DataFrame(full_data)

In [None]:
def changeColumnDataType(df):
    """Convert the string columns of a video-stats frame to typed columns.

    The frame is modified in place and also returned.

    - adds ``viewCount`` as the numeric version of ``viewCcount``
    - converts ``likeCount`` to numeric
    - converts ``commentCount`` to numeric when that column exists (frames
      from ``get_livebroadcast_stats`` do not have it)
    - converts ``publishedAt`` to datetime
    - converts ``duration`` from ISO 8601 duration strings to seconds, unless
      it is already numeric, so the function is safe to call twice

    Args:
        df: DataFrame with at least ``viewCcount``, ``likeCount``,
            ``publishedAt`` and ``duration`` columns.

    Returns:
        The same DataFrame object, after conversion.
    """
    df['viewCount'] = pd.to_numeric(df['viewCcount'])
    df['likeCount'] = pd.to_numeric(df['likeCount'])
    if 'commentCount' in df.columns:
        df['commentCount'] = pd.to_numeric(df['commentCount'])
    df['publishedAt'] = pd.to_datetime(df['publishedAt'])
    # isodate.parse_duration only accepts strings; skip if already converted.
    if not pd.api.types.is_numeric_dtype(df['duration']):
        df['duration'] = [isodate.parse_duration(i).total_seconds() for i in df['duration']]
    return df

**TODO**
- Add video name.
- Add description (can separate the SQL, Python, etc. from each other).
- Include a subscriber count table.
- Include a playlists table.
- Include a playlist items table.
- Most viewed.
- Least viewed.
- Group by content type.
- Number of shares.
- Shorts / thumbnails.
- Engagement of live videos.

Another YouTube channel to check out:
https://www.youtube.com/watch?v=ZML_EJrBhnY&list=PLavw5C92dz9Ef4E-1Zi9KfCTXS_IN8gXZ&index=18

In [None]:
#get_video_stats(youtube ,['sQX1jdrTIhE'])
#get_video_stats(youtube ,['NUkOHjo4THo']) #live
# Fetch and display stats for every video ID in analyticsBootCampVideos.
get_video_stats(youtube , analyticsBootCampVideos) #notlive


In [None]:
# Playlist table for the channel ID stored in Alex_the_analyst_channel_id.
playlists = get_playlist_stats(youtube, Alex_the_analyst_channel_id)

In [None]:
playlists.head()

In [None]:
# Per-video stats for the IDs in analyticsBootCampVideos.
bootcamp_video_stats = get_video_stats(youtube, analyticsBootCampVideos)

In [None]:
bootcamp_video_stats.dtypes

In [None]:
# Convert the string columns to numeric / datetime dtypes.
# 'viewCount' is added as a new column; the original 'viewCcount' is kept.
bootcamp_video_stats['viewCount'] = pd.to_numeric(bootcamp_video_stats['viewCcount'])
bootcamp_video_stats['likeCount'] = pd.to_numeric(bootcamp_video_stats['likeCount'])
bootcamp_video_stats['commentCount'] = pd.to_numeric(bootcamp_video_stats['commentCount'])
bootcamp_video_stats['publishedAt'] = pd.to_datetime(bootcamp_video_stats['publishedAt'])
# Keep the cell re-runnable: isodate.parse_duration only accepts strings, so
# skip the conversion once 'duration' already holds seconds.
if not pd.api.types.is_numeric_dtype(bootcamp_video_stats['duration']):
    bootcamp_video_stats['duration'] = [isodate.parse_duration(i).total_seconds() for i in bootcamp_video_stats['duration']]

In [None]:
bootcamp_video_stats.head()

In [None]:
# Line plot of view count against publish date for the bootcamp videos.
sns.set(rc = {'figure.figsize' :(10,8)})
ax = sns.lineplot(x='publishedAt', y = 'viewCount', data = bootcamp_video_stats)
# Title and readable axis labels so the figure stands on its own;
# the trailing ';' suppresses the repr of ax.set(...) in the cell output.
ax.set(title='View count by publish date', xlabel='Published at', ylabel='View count');

In [None]:
# Save the bootcamp video stats as CSV. The target directory can be overridden
# with the YOUTUBE_OUTPUT_DIR environment variable; the default is the
# original hard-coded location.
bootcamp_video_stats.to_csv(path_or_buf = os.path.join(
    os.environ.get('YOUTUBE_OUTPUT_DIR', '/Users/rebeccan/Desktop/Analytics Projects'),
    'bootcampvids.csv'))

In [None]:
# Video IDs of the public items in playlist 'PL-SflFiIo_nUIs9XvpamPgjn1_-HKXTvE', as a NumPy array.
TakeCareOfMaya = np.array(get_playlistitems_stats(youtube, 'PL-SflFiIo_nUIs9XvpamPgjn1_-HKXTvE')['vid_id'])

In [None]:
def get_livebroadcast_stats (youtube, video_ids=None):
    """Fetch per-video details and statistics, without the comment count.

    Same lookup as ``get_video_stats`` but the result has no ``commentCount``
    column. IDs are requested in comma-separated batches of up to 50. Rows
    come back in the order of ``video_ids`` (duplicates included). IDs for
    which the API returns no item are skipped, and statistics missing from a
    response are ``None``.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: iterable of video ID strings (list, NumPy array, ...);
            ``None`` or empty yields an empty DataFrame.

    Returns:
        pandas.DataFrame with columns ``videoId``, ``channelId``,
        ``publishedAt``, ``viewCcount``, ``likeCount``, ``favoriteCount``,
        ``liveBroadcastContent`` and ``duration`` (no columns when there are
        no rows).
    """
    batch_size = 50
    ids = [] if video_ids is None else [str(v) for v in video_ids]
    # Request each distinct ID once, even if it appears several times.
    unique_ids = list(dict.fromkeys(ids))

    items_by_id = {}
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        # maxResults is deliberately not sent: it is documented as
        # unsupported in combination with the id filter.
        results = youtube.videos().list(
            part = "contentDetails, id, liveStreamingDetails, localizations, player, snippet, statistics, status, topicDetails",
            id = ','.join(batch)).execute()
        for item in results.get('items', []):
            items_by_id[item['id']] = item

    full_data = []
    for vid_id in ids:
        item = items_by_id.get(vid_id)
        if item is None:
            # The API returned nothing for this ID; skip it.
            continue
        snippet = item['snippet']
        statistics = item.get('statistics', {})
        vidDetails = dict(
            videoId = item['id'],
            channelId = snippet['channelId'],
            publishedAt = snippet['publishedAt'],
            # Key spelling kept: changeColumnDataType reads 'viewCcount'.
            viewCcount = statistics.get('viewCount'),
            likeCount = statistics.get('likeCount'),
            favoriteCount = statistics.get('favoriteCount'),
            liveBroadcastContent = snippet['liveBroadcastContent'],
            duration = item['contentDetails']['duration'],
        )
        full_data.append(vidDetails)

    return pd.DataFrame(full_data)

In [None]:
# Per-video stats (without commentCount) for the IDs in TakeCareOfMaya.
takecareofmayavideos = get_livebroadcast_stats(youtube, TakeCareOfMaya)

In [None]:
# Save the stats as CSV. The target directory can be overridden with the
# YOUTUBE_OUTPUT_DIR environment variable; the default is the original
# hard-coded location.
takecareofmayavideos.to_csv(path_or_buf = os.path.join(
    os.environ.get('YOUTUBE_OUTPUT_DIR', '/Users/rebeccan/Desktop/Analytics Projects'),
    'takecareofmayavideos.csv'))

In [None]:
# Hard-coded channel ID used by the per-channel walkthrough below.
heyItsOLiviaChannel = 'UCn2Kt4RZMcvF9w1g6-4Ar9A'

In [None]:
# Convert column dtypes; changeColumnDataType modifies the frame in place and returns it.
takecareofmayavideos = changeColumnDataType(takecareofmayavideos)

In [None]:
takecareofmayavideos.dtypes

For each channel:
1. get all playlists in channel.
2. get all videos in each playlist
3. get all video data for each video

In [None]:
#get all playlists
# Keep the playlist table instead of only displaying it: the loop over
# OliviasPlaylists['playlist_id'] further down needs it to exist.
OliviasPlaylists = get_playlist_stats(youtube, heyItsOLiviaChannel)
OliviasPlaylists

---

# Collect the public items of every playlist, tagging each row with the
# playlist it came from. OliviasPlaylists must be a DataFrame with a
# 'playlist_id' column.
playlist_frames = []
for playlist in OliviasPlaylists['playlist_id']:
    df = get_playlistitems_stats(youtube, playlist)
    df['playlist_id'] = playlist
    playlist_frames.append(df)

# DataFrame.append was removed in pandas 2.0; concatenate once instead of
# growing the frame inside the loop. Row labels are left as-is, as before.
all_videos = pd.concat(playlist_frames) if playlist_frames else pd.DataFrame()

# Fetch stats for every collected video ID, then convert the column dtypes.
videos = np.array(all_videos['vid_id'])
channel_vid_info  = get_video_stats(youtube, videos)
channel_vid_info = changeColumnDataType(channel_vid_info)