In [18]:
%load_ext cudf.pandas
import pandas as pd

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


In [19]:
df = pd.read_csv('Dataset/vdoLinks.csv')
df

Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)
...,...,...,...
25618,-oB6DN5dYWo,131252,Forklift Driver Klaus: The First Day on the Jo...
25619,DK7KQ-gEdl4,131256,"Feuer, Eis & Dosenbier (2002)"
25620,v29P-wchMZQ,131258,The Pirates (2014)
25621,dAz-nZ65jYU,131260,Rentun Ruusu (2001)


In [28]:
# Google API.
from googleapiclient.discovery import build
from googleapiclient.http import HttpError

# Displaying User Image.
from IPython.display import Image, display

In [29]:
with open('Key.txt', 'r') as text_file:
    API_KEY = text_file.read().strip()

In [30]:
# Create YouTube resource object
myYoutube = build('youtube',
                  'v3',
                  developerKey = API_KEY)

In [31]:
responses = list()
video_responses = list()

for vid in df.youtubeId:
    # try:
        # Retrive youtube video comments
        video_responses = myYoutube.commentThreads().list(
            part = 'snippet, replies',
            videoId = vid
        ).execute()
        print(video_responses)
        break

    # except HttpError:
    #     print('Connection Error!')
    #     break
    
    # finally:
        responses.append(video_responses)

HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2C+replies&videoId=K26_sDKnvMU&key=AIzaSyB1OU2e8dlxXDBQIz7ohADz7M-_GfieybM&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

In [12]:
responses

[[]]

In [13]:
# Iterate over the comments
count = 0

for video_responses in responses:
    for item in video_responses['items']:
        comment = item['snippet']
        print(comment)

TypeError: list indices must be integers or slices, not str

In [14]:
# Iterate over the comments
count = 0

for video_responses in responses:
    for item in video_responses['items']:
        count += 1
        print(f'\nComment - { count }:')
        comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
        print(comment)

TypeError: list indices must be integers or slices, not str

In [None]:
%%time
%load_ext cudf.pandas
import pandas as pd
import json
import random
import time
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt


class YouTubeDataExtractor:
    def __init__(self, api_keys):
        self.api_keys = api_keys
        self.youtube = None
        self.current_key_index = 0
        self.analyzer = SentimentIntensityAnalyzer()
        self.update_youtube_client()

    def update_youtube_client(self):
        # Update the YouTube client with a new API key
        self.youtube = build('youtube', 'v3', developerKey=self.api_keys[self.current_key_index])

    def switch_api_key(self):
        # Switch to the next API key and update the YouTube client
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        self.update_youtube_client()

    def get_video_details(self, video_id):
        try:
            request = self.youtube.videos().list(part='snippet,statistics,contentDetails', id=video_id)
            response = request.execute()
            if not response['items']:
                print(f"No details found for {video_id}")
                return None
            return response['items'][0]
        except HttpError as e:
            error_message = e.content.decode('utf-8')
            if 'quotaExceeded' in error_message:
                print(f"Quota exceeded for the current API key. Switching keys...")
                self.switch_api_key()
                return self.get_video_details(video_id)
            else:
                print(f"Error fetching details for {video_id}: {e}")
                return None

    def get_comments(self, video_id, max_results=100):
        comments = []
        try:
            request = self.youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=max_results,
                textFormat='plainText'
            )
            response = request.execute()

            for item in response.get('items', []):
                comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
            return comments
        except HttpError as e:
            error_message = e.content.decode('utf-8')
            if 'quotaExceeded' in error_message:
                print(f"Quota exceeded for the current API key: {self.api_keys[self.current_key_index]}. Switching keys...")
                self.switch_api_key()
                return self.get_comments(video_id, max_results)
            else:
                print(f"Error fetching comments for {video_id}: {e}")
                return []

    def analyze_sentiments(self, comments):
        sentiment_scores = []
        for comment in comments:
            score = self.analyzer.polarity_scores(comment)['compound']
            sentiment_scores.append(score)
        
        mean_sentiment = sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0.0
        return mean_sentiment

    def extract_data(self, video_id):
        video_details = self.get_video_details(video_id)
        if not video_details:
            return {
                'videoId': video_id,
                'title': 'Unavailable',
                'description': 'Unavailable',
                'viewCount': 0,
                'likeCount': 0,
                'dislikeCount': 0,
                'commentCount': 0,
                'duration': '0S',
                'favoriteCount': 0,
                'sentimentScore': 0.0
            }

        snippet = video_details['snippet']
        statistics = video_details['statistics']
        content_details = video_details['contentDetails']

        data = {
            'videoId': video_id,
            'title': snippet.get('title', 'Unavailable'),
            'description': snippet.get('description', 'Unavailable'),
            'viewCount': int(statistics.get('viewCount', 0)),
            'likeCount': int(statistics.get('likeCount', 0)),
            'dislikeCount': int(statistics.get('dislikeCount', 0)),  # May be unavailable in newer API versions
            'commentCount': int(statistics.get('commentCount', 0)),
            'duration': content_details.get('duration', '0S'),
            'favoriteCount': int(statistics.get('favoriteCount', 0)),
        }

        comments = self.get_comments(video_id)
        sentiment_score = self.analyze_sentiments(comments)
        data['sentimentScore'] = sentiment_score

        return data


def main():
    # Load API keys from a JSON file
    with open('Keys.json', 'r') as f:
        api_keys = json.load(f)['api_keys']
    
    video_ids = pd.read_csv('Dataset/vdoLinks.csv')['youtubeId'].tolist()
    extractor = YouTubeDataExtractor(api_keys)
    
    data = []
    for video_id in video_ids:
        print(f"Processing video: {video_id}")
        video_data = extractor.extract_data(video_id)
        data.append(video_data)
        time.sleep(0.5)  # Small delay to respect API limits
    
    df = pd.DataFrame(data)
    
    df['viewCount'] = pd.to_numeric(df['viewCount'], errors='coerce').fillna(0)
    df['likeCount'] = pd.to_numeric(df['likeCount'], errors='coerce').fillna(0)

    top_10_videos = df.nlargest(10, 'viewCount')
    bottom_10_videos = df.nsmallest(10, 'viewCount')
    
    most_liked_video = df.loc[df['likeCount'].idxmax()]['title']
    least_liked_video = df.loc[df['likeCount'].idxmin()]['title']
    highest_duration_video = df.loc[df['duration'].idxmax()]['title']

    print("Most liked video:", most_liked_video)
    print("Least liked video:", least_liked_video)
    print("Video with the highest duration:", highest_duration_video)
    
    plt.figure(figsize=(25.6, 16))
    plt.bar(top_10_videos['title'], top_10_videos['viewCount'])
    plt.xticks(rotation=45, ha='right')
    plt.title('Top 10 Videos Based on Views')
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(25.6, 16))
    plt.bar(bottom_10_videos['title'], bottom_10_videos['viewCount'])
    plt.xticks(rotation=45, ha='right')
    plt.title('Bottom 10 Videos Based on Views')
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    main()

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas
Processing video: K26_sDKnvMU


Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded for the current API key. Switching keys...
Quota exceeded