In [11]:
# Install the Google API Python client (one-time setup; uncomment to run)
# %pip install --upgrade google-api-python-client

In [1]:
import os

from apiclient.discovery import build
from googleapiclient.errors import HttpError

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dotenv import load_dotenv

In [23]:
# read the dataset
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that exact directory exists. Consider a path relative to a
# configurable data directory so the notebook can be re-run elsewhere.
df = pd.read_csv('D:\\repositories\\YouTubeAPIV3-SMA\\datasets\\vdoLinks.csv')

# GENERAL DESCRIPTION OF THE DATASET

In [24]:
# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)


In [25]:
# (rows, columns) of the loaded frame.
df.shape

(25623, 3)

In [26]:
# Number of missing values per column.
df.isnull().sum()

youtubeId    0
movieId      0
title        0
dtype: int64

In [27]:
# True if any row is an exact duplicate of an earlier row (all columns compared).
# Note this does not detect a youtubeId that is repeated with a different movieId/title.
df.duplicated().any()

False

In [28]:
# Column dtypes as inferred by read_csv.
df.dtypes

youtubeId    object
movieId       int64
title        object
dtype: object

In [29]:
# Cast the two text columns to str in a single astype call; movieId is left untouched.
df = df.astype({'youtubeId': 'str', 'title': 'str'})

In [30]:
# Re-check the column dtypes after the str cast.
df.dtypes

youtubeId    object
movieId       int64
title        object
dtype: object

# Fetching data using the YouTube Data API

In [31]:
# global class for all methods
class YoutubeData():
    """
    GLOBAL CLASS:
    ------------
    A class containing all the methods used in this script: it builds the
    API client and downloads per-video comments and statistics.

    Arguments:
    ------------
    API_KEY (str): The Developer Key
    API_NAME (str): Name of the service
    API_VERSION (str): Version of the API service

    Attributes:
    ------------
    apikey (str): The Developer Key passed as API_KEY
    apiname (str): The service name passed as API_NAME
    apiversion (str): The service version passed as API_VERSION

    """

    # Keys of the per-video record filled from videos().list, in storage order.
    # The 'comments' key is appended after these by get_video_data.
    _VIDEO_FIELDS = (
        'description',
        'duration',
        'view_count',
        'like_count',
        'comment_count',
        'favorite_count',
    )

    def __init__(self, API_KEY, API_NAME, API_VERSION):
        self.apikey = API_KEY
        self.apiname = API_NAME
        self.apiversion = API_VERSION

    def establish_connection(self):
        """
        Description:
        ------------
        Method to establish connection using the youtube Dev Key

        Returns:
        ------------
        A connection object

        Raises:
        ------------
        Whatever exception `build` raised. The failure is printed first and
        then re-raised, so the caller sees the real cause instead of an
        UnboundLocalError from returning a name that was never assigned.

        """
        try:
            return build(self.apiname, self.apiversion, developerKey=self.apikey)
        except Exception as e:
            print(f'Failed to establish connection: {e}')
            raise

    def _fetch_comments(self, obj, video_id):
        """Return the top-level comment texts of one video, or NaN if the request fails."""
        try:
            # Only the first result page is requested (at most 100 comment threads).
            response = obj.commentThreads().list(
                part='snippet,replies',
                videoId=video_id,
                maxResults=100).execute()

            return [
                thread['snippet']['topLevelComment']['snippet']['textOriginal']
                for thread in response['items']
            ]

        except HttpError as e:
            print(f'Error fetching comments for videoID {video_id}: Error Code {e.status_code}')
            print('Appending NAN instead.')

        except Exception as e:
            print(f"An unexpected error occurred for videoID {video_id} during comment retrieval: {str(e)}")

        # Reached only when one of the handlers above ran.
        return np.nan

    def _fetch_video_details(self, obj, video_id):
        """Return a dict with the _VIDEO_FIELDS of one video; unavailable values are NaN."""
        try:
            response = obj.videos().list(
                part='statistics,snippet,contentDetails',
                id=video_id,
                maxResults=1
            ).execute()

            # Start from all-NaN so an empty result or a missing part keeps NaN.
            details = dict.fromkeys(self._VIDEO_FIELDS, np.nan)

            items = response.get('items') or []
            if items:
                item = items[0]
                snippet = item.get('snippet', {})
                content_details = item.get('contentDetails', {})
                statistics = item.get('statistics', {})

                details['description'] = snippet.get('description', np.nan)
                details['duration'] = content_details.get('duration', np.nan)
                details['view_count'] = statistics.get('viewCount', np.nan)
                details['like_count'] = statistics.get('likeCount', np.nan)
                details['comment_count'] = statistics.get('commentCount', np.nan)
                details['favorite_count'] = statistics.get('favoriteCount', np.nan)

            return details

        except HttpError as e:
            print(f'Error fetching video data for videoID {video_id}: Error Code {e.status_code}')
            print('Appending NAN instead.')

        except Exception as e:
            print(f"An unexpected error occurred for videoID {video_id} during data retrieval: {str(e)}")

        # Reached only when one of the handlers above ran: discard partial data.
        return dict.fromkeys(self._VIDEO_FIELDS, np.nan)

    def get_video_data(self, obj, df):
        """
        Arguments:
        ------------
        obj (object): Connection Object from YouTube API
        df (pd.dataframe): DataFrame containing YouTube video IDs in a 'youtubeId' column

        Description:
        ------------
        Function to retrieve statistical data and comments data for every video ID.
        Two requests are made per video (comments, then video details). If a
        request fails, the error is printed and NaN is stored for the affected
        fields instead; the loop always continues with the next video.

        Returns:
        ------------
        data_dict (dictionary): Maps each video ID to a dict with the keys
        'description', 'duration', 'view_count', 'like_count', 'comment_count',
        'favorite_count' and 'comments' (list of comment texts, or NaN).
        """

        self.obj = obj
        self.df = df

        # Dictionary to store retrieved data
        data_dict = {}

        # Iterate over each videoId in the DataFrame
        for idx, video_id in self.df['youtubeId'].items():
            # The result is keyed by video ID, so a repeated ID can only
            # overwrite its own entry; skip it rather than spend two more requests.
            if video_id in data_dict:
                continue

            print(f'Processing videoID: {video_id} (Index: {idx})')

            print('Processing comments data fetching..')
            comments = self._fetch_comments(obj, video_id)

            print('Processing video data fetching..')
            record = self._fetch_video_details(obj, video_id)

            # Comments are stored even when the details request failed, and vice versa.
            record['comments'] = comments
            data_dict[video_id] = record

        return data_dict

In [32]:
# Read the developer key from the environment instead of committing it to the notebook.
# load_dotenv() additionally loads variables from a local `.env` file, if one exists.
# NOTE: the key that used to be hard-coded in this cell is exposed in the notebook's
# history and should be revoked/rotated in the Google Cloud console.
load_dotenv()
key = os.environ.get('YOUTUBE_API_KEY')
if not key:
    raise RuntimeError(
        'Set YOUTUBE_API_KEY (environment variable or .env entry) before running this cell.'
    )
name = 'youtube'
version = 'v3'
youtube = YoutubeData(key, name, version)
connection = youtube.establish_connection()

In [None]:
# Download comments and statistics for every row of `df` (two API requests per video;
# progress is printed for each step).
# NOTE(review): `df` has 25,623 rows, so this is a very long run that presumably exceeds
# the default daily API quota — consider trying it on `df.head(n)` first and saving
# `data` to disk so the download does not have to be repeated.
data = youtube.get_video_data(connection, df)

Processing videoID: K26_sDKnvMU (Index: 0)
Processing comments data fetching..
Error fetching comments for videoID K26_sDKnvMU: Error Code 403
Appending NAN instead.
Processing video data fetching..
Processing videoID: 3LPANjHlPxo (Index: 1)
Processing comments data fetching..
Error fetching comments for videoID 3LPANjHlPxo: Error Code 403
Appending NAN instead.
Processing video data fetching..
Processing videoID: rEnOoWs3FuA (Index: 2)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: j9xml1CxgXI (Index: 3)
Processing comments data fetching..
Error fetching comments for videoID j9xml1CxgXI: Error Code 403
Appending NAN instead.
Processing video data fetching..
Processing videoID: ltwvKLnj1B4 (Index: 4)
Processing comments data fetching..
Error fetching comments for videoID ltwvKLnj1B4: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: 2GfZl4kuVNI (Index: 5)
Processing comments data fetching..
Processing vi

In [None]:
# description = stat_data['items'][0]['snippet']['description']
# duration = stat_data['items'][0]['contentDetails']['duration']
# view_count = stat_data['items'][0]['statistics']['viewCount']
# like_count = stat_data['items'][0]['statistics']['likeCount']
# comment_count = stat_data['items'][0]['statistics']['commentCount']
# favorite_count = stat_data['items'][0]['statistics']['favoriteCount']