In [1]:
import pandas as pd

In [6]:
# Load the table of YouTube video ids, movie ids and titles, then display it.
df = pd.read_csv(filepath_or_buffer='Dataset/vdoLinks.csv')
df

Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)
...,...,...,...
25618,-oB6DN5dYWo,131252,Forklift Driver Klaus: The First Day on the Jo...
25619,DK7KQ-gEdl4,131256,"Feuer, Eis & Dosenbier (2002)"
25620,v29P-wchMZQ,131258,The Pirates (2014)
25621,dAz-nZ65jYU,131260,Rentun Ruusu (2001)


In [9]:
# Google API.
from googleapiclient.discovery import build
from googleapiclient.http import HttpError

# Displaying User Image.
from IPython.display import Image, display

In [1]:
# The API key lives in a local text file rather than in the notebook itself;
# surrounding whitespace/newlines are stripped before use.
with open('Google-Cloud-YT-Data-V3-API-Key.txt') as text_file:
    API_KEY = text_file.read().strip()

In [5]:
# Build the YouTube Data API v3 client, authenticated with the developer key.
myYoutube = build(serviceName='youtube', version='v3', developerKey=API_KEY)

In [None]:
responses = list()
# Video ids whose request failed, mapped to the HttpError that was raised, so
# skipped videos stay visible instead of silently disappearing.
failed_videos = dict()

for vid in df.youtubeId:
    try:
        # Retrieve the first page of comment threads for this video.
        # `part` is a comma-separated list and must not contain spaces.
        video_responses = myYoutube.commentThreads().list(
            part='snippet,replies',
            videoId=vid
        ).execute()

    except HttpError as err:
        failed_videos[vid] = err
        # Once the quota is exhausted every remaining request would fail the
        # same way, so stop instead of looping over the rest of the ids.
        if b'quotaExceeded' in (err.content or b''):
            print(f'API quota exceeded at video {vid}; stopping early.')
            break
        continue

    responses.append(video_responses)

print(f'Fetched {len(responses)} responses, {len(failed_videos)} videos failed.')

In [None]:
# Preview a few raw API responses; echoing the whole list would flood the output.
responses[:3]

In [None]:
# Print the raw `snippet` dict of every comment thread in every response.
for video_responses in responses:
    for item in video_responses['items']:
        print(item['snippet'])

In [None]:
# Print the text of every top-level comment, numbered continuously across videos.
count = 0

for video_responses in responses:
    # Resume numbering right after the last comment of the previous video.
    for count, item in enumerate(video_responses['items'], start=count + 1):
        print(f'\nComment - { count }:')
        comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
        print(comment)

In [3]:
import cudf
import asyncio
import aiohttp
from googleapiclient.discovery import build
import json
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from dask import delayed, compute

# Service name and version passed to `build()` when creating the API client.
# NOTE: the API key itself is not defined here; `API_KEY` must already exist
# in the namespace before `build_youtube_client()` is called.
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

# Factory for the YouTube API client
def build_youtube_client():
    """Create the API client for API_SERVICE_NAME / API_VERSION, authenticated with API_KEY."""
    client = build(
        serviceName=API_SERVICE_NAME,
        version=API_VERSION,
        developerKey=API_KEY,
    )
    return client

# Asynchronous function to get video details
async def get_video_details(session, youtube, video_id):
    """Fetch the `videos().list` entry for one video without blocking the event loop.

    Args:
        session: Unused by this function; kept so existing callers keep working.
        youtube: API client exposing `videos().list(...)`.
        video_id: Id of the video to look up.

    Returns:
        The first element of the response's "items", or None when the response
        has no items or the request fails (the error is printed, not raised).
    """
    try:
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_id
        )
        # `request.execute` blocks, so it is handed to the loop's default
        # executor. `run_in_executor` is a method of the event loop, not of the
        # session object passed in.
        # NOTE(review): the same `youtube` client is used from several executor
        # threads at once; confirm its HTTP transport is safe for that.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, request.execute)
        items = response.get("items")
        return items[0] if items else None
    except Exception as e:
        print(f"Error fetching details for {video_id}: {e}")
        return None

# Asynchronous function to get comments
async def get_comments(session, youtube, video_id):
    """Fetch the text of up to 100 top-level comments for one video.

    Only the first page of `commentThreads().list` is requested.

    Args:
        session: Unused by this function; kept so existing callers keep working.
        youtube: API client exposing `commentThreads().list(...)`.
        video_id: Id of the video whose comments are fetched.

    Returns:
        A list of comment strings. On failure the error is printed and the
        comments collected so far (possibly none) are returned.
    """
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            textFormat="plainText"
        )
        # `request.execute` blocks, so it is handed to the loop's default
        # executor. `run_in_executor` is a method of the event loop, not of the
        # session object passed in.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, request.execute)
        for item in response.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
        return comments
    except Exception as e:
        print(f"Error fetching comments for {video_id}: {e}")
        return comments

# Gather the data of all videos concurrently
async def extract_video_data(youtube, video_ids):
    """Run `fetch_video_data` for every id concurrently and keep the truthy results.

    Args:
        youtube: API client forwarded to `fetch_video_data`.
        video_ids: Iterable of video ids to fetch.

    Returns:
        A list with one entry per video for which `fetch_video_data` returned
        a truthy value; falsy results (e.g. None) are dropped.
    """
    async with aiohttp.ClientSession() as session:
        # One scheduled task per video; all of them are awaited together.
        pending = [
            asyncio.ensure_future(fetch_video_data(session, youtube, video_id))
            for video_id in video_ids
        ]
        results = await asyncio.gather(*pending)
    return list(filter(None, results))

async def fetch_video_data(session, youtube, video_id):
    """Combine one video's details and comments into a flat record.

    Args:
        session: Forwarded unchanged to `get_video_details` / `get_comments`.
        youtube: API client forwarded to the same helpers.
        video_id: Id of the video to fetch.

    Returns:
        A dict with the video's id, title, description, integer statistics
        (missing counters default to 0), duration string and comment list, or
        None when no details could be fetched.
    """
    video_details = await get_video_details(session, youtube, video_id)
    if not video_details:
        return None

    comments = await get_comments(session, youtube, video_id)
    snippet = video_details['snippet']

    def stat(name):
        # Counters may be absent from the statistics dict; treat those as 0.
        return int(video_details['statistics'].get(name, 0))

    return {
        'video_id': video_id,
        'title': snippet['title'],
        'description': snippet['description'],
        'view_count': stat('viewCount'),
        'like_count': stat('likeCount'),
        'dislike_count': stat('dislikeCount'),
        'comment_count': stat('commentCount'),
        'duration': video_details['contentDetails']['duration'],
        'favorite_count': stat('favoriteCount'),
        'comments': comments,
    }

# Average VADER compound sentiment of a list of comments
def analyze_sentiments(comments):
    """Return the mean VADER 'compound' score of `comments`, or 0 when there are none."""
    analyzer = SentimentIntensityAnalyzer()
    compound_scores = [
        analyzer.polarity_scores(text)['compound'] for text in comments
    ]
    if not compound_scores:
        return 0
    return sum(compound_scores) / len(compound_scores)

# Helper: convert an ISO 8601 duration string (e.g. "PT1H2M3S") to seconds
def _duration_to_seconds(duration):
    """Return the length in seconds of an ISO 8601 duration string, or 0 if it cannot be parsed."""
    import re  # local import: `re` is not among the notebook's imports

    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        str(duration),
    )
    if match is None:
        return 0
    weeks, days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return (((weeks * 7 + days) * 24 + hours) * 60 + minutes) * 60 + seconds


# Function to generate the report using cuDF
def generate_report(video_data):
    """Rank the fetched videos, plot the rankings and print the headline titles.

    Args:
        video_data: List of per-video dicts with at least the keys 'title',
            'view_count', 'like_count', 'duration' (ISO 8601 string) and
            'comments' (list of strings).

    Shows three horizontal bar charts (top 10 and bottom 10 by view count, top
    10 by sentiment score) and prints the most liked, least liked and longest
    video titles. Prints a notice and returns early when `video_data` is empty.
    """
    if not video_data:
        print("No video data available; nothing to report.")
        return

    # Sentiment analysis using Dask: one delayed task per video, evaluated in
    # a single `compute` call that returns the scores in input order.
    sentiment_scores = compute(
        *[delayed(analyze_sentiments)(video['comments']) for video in video_data]
    )

    # Build flat, scalar-only records: the comment lists are replaced by their
    # score, and the duration string gets a numeric companion so durations
    # compare by length instead of alphabetically.
    records = []
    for video, sentiment_score in zip(video_data, sentiment_scores):
        record = {key: value for key, value in video.items() if key != 'comments'}
        record['duration_seconds'] = _duration_to_seconds(video['duration'])
        record['sentiment_score'] = sentiment_score
        records.append(record)

    # Convert the records into a cuDF DataFrame
    df = cudf.DataFrame(records)

    # Top-10 and bottom-10 videos based on views
    top_10_videos = df.nlargest(10, 'view_count')
    bottom_10_videos = df.nsmallest(10, 'view_count')

    # Most and least liked videos
    most_liked_video = df.nlargest(1, 'like_count')['title'].to_pandas().iloc[0]
    least_liked_video = df.nsmallest(1, 'like_count')['title'].to_pandas().iloc[0]

    # Video with the highest duration (compared in seconds)
    video_with_highest_duration = (
        df.nlargest(1, 'duration_seconds')['title'].to_pandas().iloc[0]
    )

    # Top-10 videos by positive sentiment
    top_10_positive_sentiment = df.nlargest(10, 'sentiment_score')

    # Plotting the data: the whole frame is converted so that both the `x`
    # and the `y` column exist when pandas draws the bars.
    charts = (
        (top_10_videos, 'view_count', 'Top 10 Videos by View Count'),
        (bottom_10_videos, 'view_count', 'Bottom 10 Videos by View Count'),
        (top_10_positive_sentiment, 'sentiment_score', 'Top 10 Videos by Positive Sentiment Score'),
    )
    for frame, value_column, chart_title in charts:
        frame.to_pandas().plot(kind='barh', x='title', y=value_column, title=chart_title)
        plt.show()

    print(f"Most Liked Video: {most_liked_video}")
    print(f"Least Liked Video: {least_liked_video}")
    print(f"Video with the Highest Duration: {video_with_highest_duration}")

# Main function to run the script
if __name__ == "__main__":
    # Read video IDs from the CSV file
    video_ids_df = cudf.read_csv('Dataset/vdoLinks.csv')
    video_ids = video_ids_df['youtubeId'].to_pandas().tolist()

    youtube = build_youtube_client()

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread (plain script), so
        # `asyncio.run` can create and own one.
        video_data = asyncio.run(extract_video_data(youtube, video_ids))
    else:
        # An event loop is already running in this thread (as in a Jupyter
        # kernel) and `asyncio.run` refuses to nest, so the coroutine is run
        # on a fresh loop in a worker thread and this thread waits for it.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            video_data = pool.submit(
                asyncio.run, extract_video_data(youtube, video_ids)
            ).result()

    generate_report(video_data)

ModuleNotFoundError: No module named 'cudf'