### A. Imports
You need to import the following libraries. Install the libraries using "conda install ... or pip install ..." if they have not been installed on your machine. For example you can install google api python client by executing "conda install google-api-python-client"

In [54]:
import importlib
import subprocess

# The following lines should install all libraries you need - you can install the libraries manually if the script did not work 
required_modules = ['pandas', 'seaborn', 'matplotlib', 'google-api-python-client', 'datetime', 'configparser', 'nltk', 'langdetect', 'textblob', 'prettytable', 'tabulate', 'numpy']
for module in required_modules:
    try:
        importlib.import_module(module)
    except ImportError:
        print(f"{module} module not found. Installing...")
        subprocess.check_call(['pip', 'install', module])

print("All required modules are installed.")

# import the installed libraries ...
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime
import os
from configparser import ConfigParser
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from langdetect import detect
import langdetect
from textblob import TextBlob
import calendar
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
from prettytable import PrettyTable
from tabulate import tabulate
import numpy as np
from collections import defaultdict
import warnings

google-api-python-client module not found. Installing...
All required modules are installed.


[nltk_data] Downloading package stopwords to /Users/krish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### B. Settings
This section specifies the settings for connecting to the YouTube API and collecting the data about YouTube videos and their corresponding comments.

In [55]:
VIDEOS_FILE = "videos.csv"
COMMENTS_FILE = "comments.csv"
CREDENTIALS_FILE = 'credentials.ini'
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2023, 1, 1)
KEYWORDS =['coronavirus', 'covid', 'covid-19', 'pandemic']
# You can use functin get_channel_info() to extract the channel ID of a sample video from a news publisher ...
CHANNELS = {
    'UCXIJgqnII2ZOINSWNOGFThA' : 'Fox News',
    'UC16niRr50-MSBwiO3YDb3RA' : 'BBC News',
    'UCupvZG-5ko_eiXAupbDfxWw' : 'CNN',
    'UCaXkIU1QidjPwiAYu6GcHjg' : 'MSNBC'
}
MAX_VIDEOS = 50 # the maximum number of video that should be returned for each request. Acceptable values are 0 to 50
QUERY= f"intitle:{','.join(KEYWORDS)}"

### C. Load the credentials for authentication

In [56]:
def load_credentials():
    try:
        config = ConfigParser(interpolation=None)
        config.read(CREDENTIALS_FILE)
        developer_key = config.get('credentials_youtube', 'developer_key', fallback=None)
        service_name = config.get('credentials_youtube', 'youtube_api_service_name', fallback=None)
        service_version = config.get('credentials_youtube', 'youtube_api_version', fallback=None)
        if not developer_key or not service_name or not service_version:
            raise ValueError("Invalid credentials file")

        return {
            'developer_key' : developer_key,
            'service_name' : service_name,
            'service_version' : service_version
        }
    except Exception as e:
        raise ValueError("Failed to load credentials: {}".format(str(e)))

### D. Extract the channel_id and channel_title of a sample video

In [57]:
# This function takes a video ID and a YouTube Object and returns the video's channel ID
# See Section H (call the functions) to learn how to use this function
def get_channel_info(video_id, youtube):
    request = youtube.videos().list(
        part="snippet",
        id=video_id
    )
    response = request.execute()
    channel_id = response['items'][0]['snippet']['channelId']
    channel_title = response['items'][0]['snippet']['channelTitle']
    return channel_id, channel_title

### E. Search for the videos from the channel

In [58]:
def get_channel_videos(channel_ids, start_date, end_date, query, video_categories={}, max_videos=10):
    df_list = []
    for channel_id in channel_ids: 
        print(f"-> collecting videos for channel: {CHANNELS[channel_id]}")
        try:
            request = youtube.search().list(
                part="snippet",
                type='video',
                channelId=channel_id,
                maxResults=max_videos, # specifies the maximum number of items that should be returned in the result set. Acceptable values are 0 to 50, inclusive.
                q=query,
                publishedAfter=start_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
                publishedBefore=end_date.strftime("%Y-%m-%dT%H:%M:%SZ")
            )
            response = request.execute()
            videos = response['items']
            data = []
            for video in videos:
                video_id = video['id']['videoId']
                video_details = youtube.videos().list(
                    part="snippet,statistics,contentDetails",
                    id=video_id
                ).execute()
                video_data = {
                    'video_id' : video_id,
                    'channel_id' : channel_id,
                    'video_title': video_details['items'][0]['snippet']['title'],
                    'channel_title': video_details['items'][0]['snippet']['channelTitle'],
                    'category_name': video_categories.get(str(video_details['items'][0]['snippet']['categoryId']), 'Unknown'),
                    'live_upcoming_none' : video_details['items'][0]['snippet']['liveBroadcastContent'],
                    'view_count': video_details['items'][0]['statistics'].get('viewCount', 0),
                    'like_count': video_details['items'][0]['statistics'].get('likeCount', 0),
                    'dislike_count': video_details['items'][0]['statistics'].get('dislikeCount', 0),
                    'comment_count': video_details['items'][0]['statistics'].get('commentCount', 0),
                    'published_at': video_details['items'][0]['snippet']['publishedAt'],
                    'tags': ','.join(video_details['items'][0]['snippet'].get('tags', [])),
                    'duration': video_details['items'][0]['contentDetails'].get('duration', ''),
                    'definition': video_details['items'][0]['contentDetails'].get('definition', 'unknown'),
                    'caption': video_details['items'][0]['contentDetails'].get('caption', 'false'),
                    'thumbnail' : video_details['items'][0]['snippet']['thumbnails']['default'].get('url'),
                    'url': 'https://www.youtube.com/watch?v={}'.format(video_id)
                }
                data.append(video_data)
            df = pd.DataFrame(data)
            df_list.append(df)
        except HttpError as e:
            print(f'An HTTP error {e.resp.status} occurred:\n{e.content}')
        except Exception as e:
            print(f'An error occurred:\n{str(e)}')
    df_concatenated = pd.concat(df_list, axis=0)
    df_concatenated.to_csv(VIDEOS_FILE, mode='w', index=False)
    return df_concatenated

print(get_channel_videos())

### F. Retrieve comments for a list of videos

In [59]:
def get_videos_comments():
    videos = pd.read_csv(VIDEOS_FILE)
    video_ids = videos['video_id'].tolist()
    df_list =[]    
    # Loop through all the video IDs and retrieve the comments
    for video_id in video_ids:
        print(f"-> collecting comments for video: {video_id}")
        comments_list = []
        try:
            response = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                textFormat='plainText'
            ).execute()

            # Loop through all the comments and extract the relevant information
            for item in response['items']:
                comment_id = item['snippet']['topLevelComment']['id']
                comment_text = item['snippet']['topLevelComment']['snippet']['textDisplay']
                comment_author = item['snippet']['topLevelComment']['snippet']['authorDisplayName']
                comment_date = item['snippet']['topLevelComment']['snippet']['publishedAt']
                like_count = item['snippet']['topLevelComment']['snippet']['likeCount']
                reply_count = item['snippet']['totalReplyCount']
                comments_list.append([video_id, comment_id, comment_text, comment_author, comment_date, like_count, None])
                
                if reply_count > 0:
                    # Retrieve the replies to the top-level comment
                    reply_response = youtube.comments().list(
                        part='snippet',
                        parentId=comment_id,
                        textFormat='plainText'
                    ).execute()
                    
                    # Loop through all the replies and extract the relevant information
                    for reply_item in reply_response['items']:
                        reply_id = reply_item['id']
                        reply_text = reply_item['snippet']['textDisplay']
                        reply_author = reply_item['snippet']['authorDisplayName']
                        reply_date = reply_item['snippet']['publishedAt']
                        reply_like_count = reply_item['snippet']['likeCount']
                        comments_list.append([video_id, reply_id, reply_text, reply_author, reply_date, reply_like_count, comment_id])

        except HttpError as error:
            if error.resp.status == 403:
                print(f'Comments are disabled for video ID {video_id}. Skipping...')
            else:
                raise error
        
        df = pd.DataFrame(comments_list, columns=['video_id', 'comment_id', 'comment_text', 'comment_author', 'comment_date', 'comment_like_count', 'parent_comment_id'])
        df_list.append(df)
    df_concatenated = pd.concat(df_list, axis=0)
    df_concatenated.to_csv(COMMENTS_FILE, mode='w', index=False)
    return df_concatenated

### G. Clean the Data

In [60]:
def clean_data(VIDEOS_FILE, COMMENTS_FILE, stopwords):
    # Load videos data
    videos = pd.read_csv(VIDEOS_FILE)

    # Clean videos data
    videos['video_title'] = videos['video_title'].apply(lambda x: re.sub(r'[^\w\s]', '', x) if isinstance(x, str) else x) # remove punctuation
    videos['video_title'] = videos['video_title'].apply(lambda x: re.sub(r'\d+', '', x) if isinstance(x, str) else x) # remove digits
    videos['video_title'] = videos['video_title'].apply(lambda x: x.lower() if isinstance(x, str) else x) # convert to lowercase

    # Save cleaned videos data to new CSV file, replacing the existing file
    videos.to_csv(VIDEOS_FILE, index=False)

    # Load comments data
    comments = pd.read_csv(COMMENTS_FILE)

    # Clean comments data
    comments['comment_text'] = comments['comment_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x) if isinstance(x, str) else x)  # remove punctuation
    comments['comment_text'] = comments['comment_text'].apply(lambda x: re.sub(r'\d+', '', x) if isinstance(x, str) else x)  # remove digits
    comments['comment_text'] = comments['comment_text'].apply(lambda x: x.lower() if isinstance(x, str) else x)  # convert to lowercase

    # Remove duplicates
    comments = comments.drop_duplicates()

    # Remove rows with missing comment_text
    comments = comments.dropna(subset=['comment_text'])

    # Filter out comments that are not in English
    try:
        comments = comments[comments['comment_text'].apply(lambda x: langdetect.detect(x) == 'en')]
    except langdetect.LangDetectException as e:
        print(f"non-english comment skipped ... {e}")
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    comments['comment_text'] = comments['comment_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    # Save cleaned comments data to new CSV file, replacing the existing file
    comments.to_csv(COMMENTS_FILE, index=False)

## H. Call the functions ...

In [61]:
# Read the developer_key, service_name, and service_version from credentials.ini 
credentials = load_credentials()

# Build a youtube object using the build function
youtube = build(credentials['service_name'], credentials['service_version'],developerKey=credentials['developer_key'])

# Exctract the video categories
response = youtube.videoCategories().list(part='snippet', regionCode='UK').execute()
VIDEO_CATEGORIES = {category['id']: category['snippet']['title'] for category in response['items']}

#The following line shows how to extract the channel_id and channel_title of a video with video_id "OOrW82pHlMQ"
# channel_id, channel_title = get_channel_info('OOrW82pHlMQ', youtube)
# print(f'{channel_id}, {channel_title}')

#  ------------  Get the data -------------------------------------
get_channel_videos(list(CHANNELS.keys()), START_DATE, END_DATE, QUERY, VIDEO_CATEGORIES, max_videos = MAX_VIDEOS).head()
print ("-> Videos have been Collected ---------------------------")
get_videos_comments().head()
print ("-> Comments have been Collected -------------------------")
# -------------- Clean the data -----------------------------------
clean_data(VIDEOS_FILE, COMMENTS_FILE, stopwords)
print ("-> Data Cleaning has been Completed ---------------------")

ValueError: Failed to load credentials: Invalid credentials file