In [11]:
# install api py client
# !pip install --upgrade google-api-python-client

In [4]:
from apiclient.discovery import build
from googleapiclient.errors import HttpError

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv
load_dotenv()
import warnings                    
warnings.filterwarnings('ignore')  

In [5]:
# read the dataset
# The CSV location can be overridden with the VDO_LINKS_CSV environment variable
# (e.g. in .env); the default is the path the notebook was originally run with.
csv_path = os.getenv(
    'VDO_LINKS_CSV',
    'C:\\Users\\rohan aryan\\Desktop\\lambton\\Temp_REPOS\\GoogleNewsData-Social_Media_Analytics\\Dataset\\vdoLinks.csv',
)
df = pd.read_csv(csv_path)

# GENERAL DESCRIPTION OF THE DATASET

In [6]:
# Preview the first five rows of the loaded link table.
df.head()

Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)


In [7]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(25623, 3)

In [8]:
# Number of missing values in each column.
df.isna().sum()

youtubeId    0
movieId      0
title        0
dtype: int64

In [9]:
# True if any row is a complete duplicate of an earlier row (all columns equal).
# A youtubeId that repeats with a different movieId/title is NOT flagged by this check.
# NOTE(review): the fetched frames further down have fewer rows than their input
# chunks (e.g. 4999 vs 5000), which suggests repeated youtubeId values do exist — confirm.
df.duplicated().any()

False

In [10]:
# Column dtypes as inferred when the CSV was read.
df.dtypes

youtubeId    object
movieId       int64
title        object
dtype: object

In [11]:
# Cast the two text columns to str so every ID/title is handled as a string.
str_cols = ['youtubeId', 'title']
df[str_cols] = df[str_cols].astype('str')

In [12]:
# Re-check the column dtypes after the string cast.
df.dtypes

youtubeId    object
movieId       int64
title        object
dtype: object

# Fetching data using the YouTube API

In [13]:
# global class for all methods
class YoutubeData():
    """
    GLOBAL CLASS:
    ------------
    A class containing all the methods used in this script.

    Attributes:
    ------------
    apikey (str): The Developer Key (from the API_KEY argument)
    apiname (str): Name of the service (from the API_NAME argument)
    apiversion (str): Version of the API service (from the API_VERSION argument)

    """

    # Fields read from the videos().list response, in the order they appear in each result row.
    _VIDEO_FIELDS = (
        'description',
        'duration',
        'view_count',
        'like_count',
        'comment_count',
        'favorite_count',
    )

    def __init__(self, API_KEY, API_NAME, API_VERSION):
        self.apikey = API_KEY
        self.apiname = API_NAME
        self.apiversion = API_VERSION

    def establish_connection(self):
        """
        Description:
        ------------
        Method to establish connection using the youtube Dev Key

        Returns:
        ------------
        A connection object

        Raises:
        ------------
        Whatever `build` raised. The failure is printed and then re-raised, so the
        caller sees the real cause instead of an UnboundLocalError on the return value.

        """
        try:
            return build(self.apiname, self.apiversion, developerKey = self.apikey)
        except Exception as e:
            print(f'Failed to establish connection: {e}')
            raise

    def _blank_video_fields(self):
        """Return a fresh row in which every video field is NaN."""
        return dict.fromkeys(self._VIDEO_FIELDS, np.nan)

    def _fetch_comments(self, obj, video_id):
        """Return the top-level comment texts from one commentThreads page, or NaN on any failure."""
        try:
            # API call to get comments for the video (a single page of up to 100 threads)
            response = obj.commentThreads().list(
                part='snippet,replies',
                videoId=video_id,
                maxResults=100).execute()

            return [
                thread['snippet']['topLevelComment']['snippet']['textOriginal']
                for thread in response['items']
            ]

        except HttpError as e:
            print(f'Error fetching comments for videoID {video_id}: Error Code {e.status_code}')
            print('Appending NAN instead.')

        except Exception as e:
            print(f"An unexpected error occurred for videoID {video_id} during comment retrieval: {str(e)}")

        # Either handler above falls through to here: comments are recorded as NaN.
        return np.nan

    def _fetch_video_fields(self, obj, video_id):
        """Return the description/duration/statistics row for one video; all-NaN on any failure."""
        try:
            print('Processing video data fetching..')
            # Start from NaN so that anything missing in the response stays NaN
            fields = self._blank_video_fields()

            # API call to get data for the video
            # NOTE(review): maxResults may be unnecessary when filtering by id — confirm against the API docs.
            response = obj.videos().list(
                part='statistics,snippet,contentDetails',
                id=video_id,
                maxResults=1
            ).execute()

            # Retrieve relevant data from the API response
            if 'items' in response and len(response['items']) > 0:
                item = response['items'][0]

                # Retrieve snippet data
                fields['description'] = item['snippet'].get('description', np.nan)

                # Retrieve contentDetails if present
                if 'contentDetails' in item:
                    fields['duration'] = item['contentDetails'].get('duration', np.nan)

                # Retrieve statistics if present; each counter falls back to NaN
                if 'statistics' in item:
                    stats = item['statistics']
                    fields['view_count'] = stats.get('viewCount', np.nan)
                    fields['like_count'] = stats.get('likeCount', np.nan)
                    fields['comment_count'] = stats.get('commentCount', np.nan)
                    fields['favorite_count'] = stats.get('favoriteCount', np.nan)

            return fields

        except HttpError as e:
            print(f'Error fetching video data for videoID {video_id}: Error Code {e.status_code}')
            print('Appending NAN instead.')

        except Exception as e:
            print(f"An unexpected error occurred for videoID {video_id} during data retrieval: {str(e)}")

        # In case of an error, discard partial values and store NaN for every field
        return self._blank_video_fields()

    def get_video_data(self, obj, df):
        """
        Arguments:
        ------------
        obj (object): Connection Object from YouTube API
        df (pd.dataframe): DataFrame containing YouTube video IDs in a 'youtubeId' column

        Description:
        ------------
        Function to retrieve statistical data and comments data for every video ID.
        If either request fails, NaN is stored for the affected values instead and
        processing continues with the next video.

        Returns:
        ------------
        data_dict (dictionary): Maps each video ID to a dictionary with the keys
        'description', 'duration', 'view_count', 'like_count', 'comment_count',
        'favorite_count' and 'comments'. Because the result is keyed by video ID,
        an ID that occurs more than once in df yields a single entry (the last fetch wins).
        """

        self.obj = obj
        self.df = df

        # Dictionary to store retrieved data
        data_dict = {}

        # Iterate over each videoId in the DataFrame
        for idx, video_id in self.df['youtubeId'].items():
            print(f'Processing videoID: {video_id} (Index: {idx})')
            print('Processing comments data fetching..')

            comments = self._fetch_comments(obj, video_id)
            row = self._fetch_video_fields(obj, video_id)

            # Comments are kept even when the video-data request failed
            row['comments'] = comments
            data_dict[video_id] = row

        return data_dict

In [14]:
# API KEY 1 - rohanspam --------USED
key = os.getenv('API_1')
if not key:
    # os.getenv returns None when the variable is missing; stop here instead of
    # passing None on as the developer key and reporting a successful connection.
    raise RuntimeError("Environment variable 'API_1' is not set; cannot create the YouTube client.")
name = 'youtube'
version = 'v3'
youtube = YoutubeData(key, name, version)
connection_1 = youtube.establish_connection()
print('connection established, API LOADED')

connection established, API LOADED


In [16]:
# API KEY 2- chandrika.singh99 --USED
key = os.getenv('API_2')
if not key:
    # os.getenv returns None when the variable is missing; stop here instead of
    # passing None on as the developer key and reporting a successful connection.
    raise RuntimeError("Environment variable 'API_2' is not set; cannot create the YouTube client.")
name = 'youtube'
version = 'v3'
youtube = YoutubeData(key, name, version)
connection_2 = youtube.establish_connection()
print('connection established, API LOADED')

connection established, API LOADED


In [17]:
# API KEY 3 - schandrika1827 -- used
key = os.getenv('API_3')
if not key:
    # os.getenv returns None when the variable is missing; stop here instead of
    # passing None on as the developer key and reporting a successful connection.
    raise RuntimeError("Environment variable 'API_3' is not set; cannot create the YouTube client.")
name = 'youtube'
version = 'v3'
youtube = YoutubeData(key, name, version)
connection_3 = youtube.establish_connection()
print('connection established, API LOADED')

connection established, API LOADED


In [18]:
# API KEY 4- chandrikasingh1827 -- used
key = os.getenv('API_4')
if not key:
    # os.getenv returns None when the variable is missing; stop here instead of
    # passing None on as the developer key and reporting a successful connection.
    raise RuntimeError("Environment variable 'API_4' is not set; cannot create the YouTube client.")
name = 'youtube'
version = 'v3'
youtube = YoutubeData(key, name, version)
connection_4 = youtube.establish_connection()
print('connection established, API LOADED')

connection established, API LOADED


In [19]:
# API KEY 5 - rohanca27 -- used
key = os.getenv('API_5')
if not key:
    # os.getenv returns None when the variable is missing; stop here instead of
    # passing None on as the developer key and reporting a successful connection.
    raise RuntimeError("Environment variable 'API_5' is not set; cannot create the YouTube client.")
name = 'youtube'
version = 'v3'
youtube = YoutubeData(key, name, version)
connection_5 = youtube.establish_connection()
print('connection established, API LOADED')

connection established, API LOADED


In [20]:
# API KEY 6 - spam12342024
key = os.getenv('API_6')
if not key:
    # os.getenv returns None when the variable is missing; stop here instead of
    # passing None on as the developer key and reporting a successful connection.
    raise RuntimeError("Environment variable 'API_6' is not set; cannot create the YouTube client.")
name = 'youtube'
version = 'v3'
youtube = YoutubeData(key, name, version)
connection_6 = youtube.establish_connection()
print('connection established, API LOADED')

connection established, API LOADED


In [107]:
# Split the dataset into six consecutive chunks, one per API connection.
# Every boundary is a multiple of `limit`, so changing `limit` keeps the chunks
# contiguous and non-overlapping; the last chunk takes whatever rows remain.
# limit is 5k
limit = 5000
df_1 = df.iloc[:limit].copy()
df_2 = df.iloc[limit:2 * limit].copy()
df_3 = df.iloc[2 * limit:3 * limit].copy()
df_4 = df.iloc[3 * limit:4 * limit].copy()
df_5 = df.iloc[4 * limit:5 * limit].copy()
df_6 = df.iloc[5 * limit:].copy()

In [108]:
# Show the (rows, columns) of every chunk, in order.
for chunk in (df_1, df_2, df_3, df_4, df_5, df_6):
    print(chunk.shape)

(5000, 3)
(5000, 3)
(5000, 3)
(5000, 3)
(5000, 3)
(623, 3)


In [77]:
# Fetch comments and video details for chunk 1 through the API_1 connection.
# The connection is passed explicitly, so the key held by `youtube` itself is not used here.
data_1 = youtube.get_video_data(obj=connection_1, df=df_1)

Processing videoID: K26_sDKnvMU (Index: 0)
Processing comments data fetching..
Error fetching comments for videoID K26_sDKnvMU: Error Code 403
Appending NAN instead.
Processing video data fetching..
Processing videoID: 3LPANjHlPxo (Index: 1)
Processing comments data fetching..
Error fetching comments for videoID 3LPANjHlPxo: Error Code 403
Appending NAN instead.
Processing video data fetching..
Processing videoID: rEnOoWs3FuA (Index: 2)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: j9xml1CxgXI (Index: 3)
Processing comments data fetching..
Error fetching comments for videoID j9xml1CxgXI: Error Code 403
Appending NAN instead.
Processing video data fetching..
Processing videoID: ltwvKLnj1B4 (Index: 4)
Processing comments data fetching..
Error fetching comments for videoID ltwvKLnj1B4: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: 2GfZl4kuVNI (Index: 5)
Processing comments data fetching..
Processing vi

In [89]:
# Fetch comments and video details for chunk 2 through the API_2 connection.
# The connection is passed explicitly, so the key held by `youtube` itself is not used here.
data_2 = youtube.get_video_data(obj=connection_2, df=df_2)

Processing videoID: 3xi85-hb_J0 (Index: 5000)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: rEbM3tjV-3Y (Index: 5001)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: jmwzGOKIkyQ (Index: 5002)
Processing comments data fetching..
Error fetching comments for videoID jmwzGOKIkyQ: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: dqox5M8HHdw (Index: 5003)
Processing comments data fetching..
Error fetching comments for videoID dqox5M8HHdw: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: CoP_8gTnjuY (Index: 5004)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: CZShn0PZiYU (Index: 5005)
Processing comments data fetching..
Error fetching comments for videoID CZShn0PZiYU: Error Code 403
Appending NAN instead.
Processing video data fetching..
Processing videoID: Bkc9c7VXmn8 (Index: 5006)
Pro

In [91]:
# Fetch comments and video details for chunk 3 through the API_3 connection.
# The connection is passed explicitly, so the key held by `youtube` itself is not used here.
data_3 = youtube.get_video_data(obj=connection_3, df=df_3)

Processing videoID: 6hWV6cOHOyA (Index: 10000)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: _7SRE9Pw-ts (Index: 10001)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: GMMGg_idxqE (Index: 10002)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: uTfbEQsnJSk (Index: 10003)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: ODwt-fjh0nw (Index: 10004)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: 7fRLWoyLFBk (Index: 10005)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: c25GKl5VNeY (Index: 10006)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: o8JvMdfNOrg (Index: 10007)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: 8fKjZDgat_0 (Index: 10008)
Processing comments data 

In [113]:
# Fetch comments and video details for chunk 4 through the API_4 connection.
# The connection is passed explicitly, so the key held by `youtube` itself is not used here.
data_4 = youtube.get_video_data(obj=connection_4, df=df_4)

Processing videoID: 6yG4oBdWONM (Index: 15000)
Processing comments data fetching..
Error fetching comments for videoID 6yG4oBdWONM: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: cf2H39kSor4 (Index: 15001)
Processing comments data fetching..
Error fetching comments for videoID cf2H39kSor4: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: ZxTYasgU5XM (Index: 15002)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: MzgOvvMi8Lg (Index: 15003)
Processing comments data fetching..
Error fetching comments for videoID MzgOvvMi8Lg: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: uGyLFdzhw-c (Index: 15004)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: D6WOoUG1eNo (Index: 15005)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: oNEfN2R4oRc (Index: 150

In [115]:
# Fetch comments and video details for chunk 5 through the API_5 connection.
# The connection is passed explicitly, so the key held by `youtube` itself is not used here.
data_5 = youtube.get_video_data(obj=connection_5, df=df_5)

Processing videoID: r8xPCUXHOl0 (Index: 20000)
Processing comments data fetching..
Error fetching comments for videoID r8xPCUXHOl0: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: 0-r-fB7qY-s (Index: 20001)
Processing comments data fetching..
Error fetching comments for videoID 0-r-fB7qY-s: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: zA92Rw6kNWw (Index: 20002)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: 1ESRWuw4LoE (Index: 20003)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: zJGleGyahC8 (Index: 20004)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: qCa6aUZCi28 (Index: 20005)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: sGf_WdlHTqs (Index: 20006)
Processing comments data fetching..
Processing video data fetching..
Processing vid

In [120]:
# Fetch comments and video details for chunk 6 through the API_6 connection.
# The connection is passed explicitly, so the key held by `youtube` itself is not used here.
data_6 = youtube.get_video_data(obj=connection_6, df=df_6)

Processing videoID: TKwhBk4d7M0 (Index: 25000)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: mmDAx3e7KyI (Index: 25001)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: a8Ji8YfftNE (Index: 25002)
Processing comments data fetching..
Error fetching comments for videoID a8Ji8YfftNE: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing videoID: znp1dNaPp3k (Index: 25003)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: G4O_kgSEcH0 (Index: 25004)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: LLm-Ht3cFvs (Index: 25005)
Processing comments data fetching..
Processing video data fetching..
Processing videoID: oDE6J3iut28 (Index: 25006)
Processing comments data fetching..
Error fetching comments for videoID oDE6J3iut28: Error Code 404
Appending NAN instead.
Processing video data fetching..
Processing vid

In [121]:
def _as_video_frame(fetched):
    """Turn a {video_id: row_dict} mapping into a frame with a 'video_id' column."""
    return (
        pd.DataFrame.from_dict(fetched, orient='index')
        .reset_index()
        .rename(columns={'index': 'video_id'})
    )


# One frame per fetched chunk, in the same order as the chunks were requested.
df_a, df_b, df_c, df_d, df_e, df_f = (
    _as_video_frame(fetched)
    for fetched in (data_1, data_2, data_3, data_4, data_5, data_6)
)

In [125]:
# Show the (rows, columns) of every fetched frame, in order.
for fetched_frame in (df_a, df_b, df_c, df_d, df_e, df_f):
    print(fetched_frame.shape)

(4999, 8)
(4996, 8)
(4999, 8)
(4994, 8)
(4996, 8)
(623, 8)


In [123]:
# Stack the six per-chunk frames into one table. ignore_index=True gives the result
# a fresh 0..n-1 index; without it each part keeps its own 0-based labels, so the
# same label would refer to several rows.
final_df = pd.concat([df_a, df_b, df_c, df_d, df_e, df_f], ignore_index=True)

In [124]:
# Dimensions of the combined table as (rows, columns).
final_df.shape

(25607, 8)

In [130]:
# Write the combined results to youtube_data.csv; index=False leaves the row index out of the file.
# NOTE(review): the 'comments' column holds Python lists (or NaN), so it is presumably
# written as its string representation — confirm how downstream code parses it.
final_df.to_csv('youtube_data.csv', index=False)

In [None]:
# description = stat_data['items'][0]['snippet']['description']
# duration = stat_data['items'][0]['contentDetails']['duration']
# view_count = stat_data['items'][0]['statistics']['viewCount']
# like_count = stat_data['items'][0]['statistics']['likeCount']
# comment_count = stat_data['items'][0]['statistics']['commentCount']
# favorite_count = stat_data['items'][0]['statistics']['favoriteCount']