In [6]:
import csv
import os
import re
import warnings
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import streamlit as st
import yt_dlp
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from pytube import YouTube
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')

# API keys config

In [7]:
# The API key is read from the environment so it never ends up in the notebook,
# its saved outputs, or version control. Any key that was previously committed
# in this cell should be treated as leaked and revoked.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY")
if not DEVELOPER_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key"
    )
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'
# Shared API client used by the helper functions defined further down.
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

2023-06-05 22:53:04.367 INFO    googleapiclient.discovery_cache: file_cache is only supported with oauth2client<4.0.0


# Define Channel Link

In [8]:
# Channel to analyse; get_channel_info() expects a handle-style URL containing "/@<handle>".
channel_link = "https://www.youtube.com/@beebomco"

# Get Basic Channel Info

In [9]:
def _format_channel_date(published_at):
    """Render an API timestamp as 'Month DD, YYYY'.

    Timestamps are accepted with or without fractional seconds; a value that
    matches neither layout is returned unchanged rather than raising.
    """
    for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
        try:
            return datetime.strptime(published_at, fmt).strftime("%B %d, %Y")
        except ValueError:
            continue
    return published_at


def get_channel_info(channel_link):
    """Look up basic metadata for the channel behind a handle-style URL.

    The handle is extracted from a ``/@<handle>`` segment of ``channel_link``,
    resolved to a channel ID through the Data API search endpoint, and then
    the channel's snippet and statistics are fetched.

    Args:
        channel_link: URL containing ``/@<handle>``.

    Returns:
        dict with keys ``channel_id``, ``channel_title``, ``video_count``,
        ``channel_logo_url``, ``channel_created_date``, ``subscriber_count``
        and ``channel_description``. ``None`` is returned (after printing a
        message) when the link has no handle, the search finds no channel, or
        the channel lookup raises ``HttpError``.
    """
    # Stop at '?' and '#' as well as '/', so share links with a query string still resolve.
    match = re.search(r'/@([^/?#]+)', channel_link)
    if not match:
        print('Invalid channel link')
        return None
    username = match.group(1)

    # Passing the query through ``params`` lets requests URL-encode the handle.
    search_response = requests.get(
        "https://youtube.googleapis.com/youtube/v3/search",
        params={
            'part': 'snippet',
            'q': username,
            'type': 'channel',
            'key': DEVELOPER_KEY,
        },
        timeout=30,
    ).json()

    # An error payload or an empty result has no usable 'items' entry.
    items = search_response.get('items') or []
    if not items:
        print(f'No channel found for handle: {username}')
        return None
    channel_id = items[0]['id']['channelId']

    channel_info = {
        'channel_id': channel_id,
        'channel_title': '',
        'video_count': 0,
        'channel_logo_url': '',
        'channel_created_date': '',
        'subscriber_count': 0,
        'channel_description': ''
    }

    try:
        response = youtube.channels().list(
            part='snippet,statistics',
            id=channel_id
        ).execute()

        if response.get('items'):
            channel = response['items'][0]
            snippet = channel['snippet']
            statistics = channel.get('statistics', {})
            channel_info['channel_title'] = snippet['title']
            channel_info['video_count'] = int(statistics.get('videoCount', 0))
            channel_info['channel_logo_url'] = snippet['thumbnails']['default']['url']
            channel_info['channel_created_date'] = _format_channel_date(snippet['publishedAt'])
            # The subscriber count can be absent from the statistics; keep the 0 default then.
            channel_info['subscriber_count'] = int(statistics.get('subscriberCount', 0))
            channel_info['channel_description'] = snippet['description']

        return channel_info

    except HttpError as e:
        print(f'An HTTP error occurred: {e}')
        return None

# Get Latest 20 Videos of the channel with complete info

In [10]:
def _format_duration(duration):
    """Format a duration in seconds as 'MM:SS'; returns '' when it is unknown."""
    if duration is None:
        return ''
    # int() guards the ':02d' format against float durations.
    minutes, seconds = divmod(int(duration), 60)
    return f'{minutes:02d}:{seconds:02d}'


def _format_upload_date(upload_date):
    """Turn a 'YYYYMMDD' string into 'Month DD, YYYY'; returns '' when it is missing."""
    if not upload_date:
        return ''
    return datetime.strptime(upload_date, "%Y%m%d").strftime("%B %d, %Y")


def get_latest20(channel_id):
    """Collect metadata for up to 20 entries of a channel's /videos page via yt-dlp.

    Nothing is downloaded; only metadata is extracted.

    Args:
        channel_id: YouTube channel ID used to build the /videos URL.

    Returns:
        list of dicts, one per extracted video, with keys ``channel_id``,
        ``video_id``, ``title``, ``url``, ``description``, ``duration``
        ('MM:SS' or ''), ``published_at`` ('Month DD, YYYY' or ''),
        ``thumbnails`` (URL of the last listed thumbnail, or ''),
        ``view_count``, ``like_count`` and ``comment_count``. Entries that
        failed to extract are skipped; the list is empty when the channel
        page itself could not be read.
    """
    channel_url = f'https://www.youtube.com/channel/{channel_id}/videos'

    ydl_opts = {
        'format': 'best',
        'quiet': True,
        'no_warnings': True,
        'ignoreerrors': True,
        'skip_download': True,
        # NOTE(review): the three 'get*' keys below look like CLI flag names rather
        # than YoutubeDL options; kept as-is, confirm whether they have any effect.
        'getduration': True,
        'getdescription': True,
        'getuploaddate': True,
        'playlistend': 20
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(channel_url, download=False)

    # Because errors are ignored rather than raised, both the overall result and
    # individual entries may come back as None; treat those as "nothing to report".
    videos = (info_dict or {}).get('entries') or []

    video_data = []
    for video in videos:
        if not video:
            continue
        thumbnails = video.get('thumbnails') or []

        video_data.append({
            'channel_id': channel_id,
            'video_id': video['id'],
            'title': video['title'],
            'url': video['webpage_url'],
            'description': video.get('description', ''),
            'duration': _format_duration(video.get('duration')),
            'published_at': _format_upload_date(video.get('upload_date', '')),
            'thumbnails': thumbnails[-1]['url'] if thumbnails else '',
            'view_count': video.get('view_count', 0),
            'like_count': video.get('like_count', 0),
            'comment_count': video.get('comment_count', 0)
        })

    return video_data


In [11]:
# Resolve the handle URL to channel metadata. The result is None when the link
# has no "/@<handle>" part or the channel lookup raises an HttpError.
channel_info = get_channel_info(channel_link)
channel_info

{'channel_id': 'UCvpfclapgcuJo0M_x65pfRw',
 'channel_title': 'Beebom',
 'video_count': 1016,
 'channel_logo_url': 'https://yt3.ggpht.com/ytc/AGIKgqNyTSv58cVWlODIvhYc54XpXgIIMe8bUI_cn2Sc7A=s88-c-k-c0x00ffffff-no-rj',
 'channel_created_date': 'February 09, 2016',
 'subscriber_count': 2760000,
 'channel_description': "Beebom's official YouTube Channel.\n\nOfficial Twitter Handle: https://twitter.com/beebomco/\n"}

In [14]:
# Fetch metadata for the channel's latest uploads and tabulate it, one row per video.
video_data = get_latest20(channel_info['channel_id'])
df = pd.DataFrame(video_data)

In [15]:
# Display the per-video table built in the previous cell.
df

Unnamed: 0,channel_id,video_id,title,url,description,duration,published_at,thumbnails,view_count,like_count,comment_count
0,UCvpfclapgcuJo0M_x65pfRw,J3igzbaOF_M,7 Dangerous Android Settings You Need to Turn ...,https://www.youtube.com/watch?v=J3igzbaOF_M,"If you are an Android user, this video is a mu...",04:18,"June 03, 2023",https://i.ytimg.com/vi_webp/J3igzbaOF_M/maxres...,184690,11416,468
1,UCvpfclapgcuJo0M_x65pfRw,LuThmQCAnZc,QLED vs LED TV at 30K: Don't Make This Mistake!,https://www.youtube.com/watch?v=LuThmQCAnZc,Everyone has a perception that QLED TVs are ex...,05:29,"May 30, 2023",https://i.ytimg.com/vi_webp/LuThmQCAnZc/maxres...,102948,4137,338
2,UCvpfclapgcuJo0M_x65pfRw,dPLj_lId_zY,"Lava Agni 2 Review: The Good, The Bad, The Best!",https://www.youtube.com/watch?v=dPLj_lId_zY,The Lava Agni 2 is here and this is one hyped ...,07:31,"May 26, 2023",https://i.ytimg.com/vi_webp/dPLj_lId_zY/maxres...,159995,8637,621
3,UCvpfclapgcuJo0M_x65pfRw,QOm4okp1J5c,Moto Edge 40: A Good Option Under 30K?,https://www.youtube.com/watch?v=QOm4okp1J5c,Motorola has launched the new Edge 40 and this...,05:58,"May 23, 2023",https://i.ytimg.com/vi_webp/QOm4okp1J5c/maxres...,263729,9403,1000
4,UCvpfclapgcuJo0M_x65pfRw,0YoPpEIS3o4,8 Super Useful Gadgets for Students!,https://www.youtube.com/watch?v=0YoPpEIS3o4,The best gadgets for students! These are 8 sup...,05:09,"May 20, 2023",https://i.ytimg.com/vi_webp/0YoPpEIS3o4/maxres...,348326,12424,454
5,UCvpfclapgcuJo0M_x65pfRw,MDigRlLPlgc,ASUS ROG Strix G18: Core i9 + 4070 at Great Pr...,https://www.youtube.com/watch?v=MDigRlLPlgc,ASUS has launched of number of new ROG laptops...,05:33,"May 17, 2023",https://i.ytimg.com/vi_webp/MDigRlLPlgc/maxres...,73560,2672,202
6,UCvpfclapgcuJo0M_x65pfRw,Q65Fn1FZwdE,"Dell G16 (2023): High Performance, Better Price!",https://www.youtube.com/watch?v=Q65Fn1FZwdE,The new Dell G16 7630 has arrived and it bring...,05:03,"May 15, 2023",https://i.ytimg.com/vi_webp/Q65Fn1FZwdE/maxres...,100145,3562,280
7,UCvpfclapgcuJo0M_x65pfRw,lUDozITit6w,Pixel 7a: Google Fixed It!,https://www.youtube.com/watch?v=lUDozITit6w,Google has just launched the Pixel 7a and this...,06:31,"May 10, 2023",https://i.ytimg.com/vi_webp/lUDozITit6w/maxres...,319032,11241,1600
8,UCvpfclapgcuJo0M_x65pfRw,0d5vOvvP8qk,POCO F5: Snapdragon 7+ Gen 2 is 🔥,https://www.youtube.com/watch?v=0d5vOvvP8qk,The new POCO F5 is here and it's the first pho...,06:12,"May 09, 2023",https://i.ytimg.com/vi_webp/0d5vOvvP8qk/maxres...,304704,10872,1400
9,UCvpfclapgcuJo0M_x65pfRw,JdgWV-e-RR8,ASUS Zenbook Pro 14 Duo (2023): Dual Screens G...,https://www.youtube.com/watch?v=JdgWV-e-RR8,This is the new ASUS Zenbook Pro 14 Duo OLED 2...,05:53,"May 08, 2023",https://i.ytimg.com/vi_webp/JdgWV-e-RR8/maxres...,82431,3002,120


# Downloading High Level Comments

In [39]:
def get_HighLvlcomments(video_id):
    """Collect the text of every top-level comment on a video.

    Pages through ``commentThreads.list`` 100 threads at a time, requesting
    plain-text bodies, until no ``nextPageToken`` is returned.

    Args:
        video_id: ID of the video whose comment threads are read.

    Returns:
        list of comment texts (str), or ``None`` if the API raised
        ``HttpError`` (the error is printed first).
    """
    comments = []
    page_token = None

    try:
        while True:
            request_args = {
                'part': 'snippet',
                'videoId': video_id,
                'maxResults': 100,
                'textFormat': 'plainText',
            }
            # Only follow-up requests carry a page token.
            if page_token:
                request_args['pageToken'] = page_token

            response = youtube.commentThreads().list(**request_args).execute()

            comments.extend(
                item['snippet']['topLevelComment']['snippet']['textDisplay']
                for item in response.get('items', [])
            )

            page_token = response.get('nextPageToken')
            if not page_token:
                break

    except HttpError as e:
        print(f'An HTTP error occurred: {e}')
        return None

    return comments

            

In [40]:
# ID of the video whose comments are downloaded in the cells below.
videoId = "lUDozITit6w"

In [41]:
# Top-level comment texts for the selected video (None if the API call failed).
HighLvlcomments = get_HighLvlcomments(videoId)

In [42]:
# Seed for the pseudo-random comment IDs, so re-running on the same comments gives the same IDs.
HIGH_LEVEL_ID_SEED = 42

HighLvldf = pd.DataFrame(HighLvlcomments, columns=['Comments'])
# Draw one integer in [1, 1000000) per comment from a seeded generator.
HighLvldf['Comment ID'] = np.random.default_rng(HIGH_LEVEL_ID_SEED).integers(1, 1000000, size=len(HighLvldf))
# Append an occurrence counter per drawn number so colliding draws still yield distinct IDs.
HighLvldf['Comment ID'] = HighLvldf['Comment ID'].astype(str) + '_' + HighLvldf.groupby('Comment ID').cumcount().add(1).astype(str)
HighLvldf

Unnamed: 0,Comments,Comment ID
0,"Pixel 7a at ₹43,999 (₹39,999 after card discou...",309718_1
1,I am in group 1.5 Looking forward for sale price,965018_1
2,Camera bar is not aluminium,20467_1
3,Can somebody tell me if this phone has heating...,835314_1
4,Group 2,741056_1
...,...,...
1016,So you upload videos late night,410410_1
1017,Yo 1,812443_1
1018,First,837435_1
1019,First,140084_1


In [43]:
# Persist the unfiltered top-level comments; the video ID is embedded in the file name.
HighLvldf.to_csv(f'HighLevel_{videoId}_master.csv')

# Downloading Both High-level and Low-Level Comments

In [None]:
def getAllTopLevelCommentReplies(topCommentId, token):
    """Append the text of every reply under one top-level comment to ``all_comments``.

    Reads the module-level ``youtube`` client and appends to the module-level
    ``all_comments`` list, which must already exist (``get_comments`` creates
    it). Replies are requested as plain text, matching ``get_HighLvlcomments``.

    Args:
        topCommentId: ID of the parent (top-level) comment.
        token: page token for the first request, passed through unchanged.

    Returns:
        Always an empty list; the replies are delivered via ``all_comments``.
    """
    page_token = token

    # Walk the pages in a loop rather than recursing once per page.
    while True:
        replies_response = youtube.comments().list(
            part='snippet',
            maxResults=100,
            parentId=topCommentId,
            textFormat='plainText',
            pageToken=page_token,
        ).execute()

        for reply in replies_response['items']:
            all_comments.append(reply['snippet']['textDisplay'])

        if "nextPageToken" not in replies_response:
            return []
        page_token = replies_response['nextPageToken']

def _fetch_reply_texts(youtube, parent_id):
    """Return the plain-text body of every reply under one top-level comment, across all pages."""
    texts = []
    page_token = None
    while True:
        request_args = {
            'part': 'snippet',
            'maxResults': 100,
            'parentId': parent_id,
            'textFormat': 'plainText',
        }
        if page_token:
            request_args['pageToken'] = page_token
        replies_response = youtube.comments().list(**request_args).execute()
        texts.extend(reply['snippet']['textDisplay'] for reply in replies_response['items'])
        page_token = replies_response.get('nextPageToken')
        if not page_token:
            return texts


def get_comments(youtube, video_id, token):
    """Collect top-level comments and their replies for a video into ``all_comments``.

    Comment threads are read in relevance order, 100 per page; each thread
    that reports replies is followed by all of its replies. Texts are
    requested as plain text, matching ``get_HighLvlcomments``. Pages are
    walked in a loop, so the number of pages is not bounded by the
    interpreter's recursion limit.

    Args:
        youtube: API client exposing ``commentThreads()`` and ``comments()``.
        video_id: ID of the video to scrape.
        token: page token to resume from. An empty or whitespace-only string
            starts from the first page and resets ``all_comments``.

    Returns:
        Always an empty list. The collected texts, with empty strings
        removed, are left in the module-level ``all_comments`` list.
    """
    global all_comments

    page_token = token.strip() or None
    if page_token is None:
        all_comments = []

    while True:
        request_args = {
            'part': 'snippet',
            'maxResults': 100,
            'videoId': video_id,
            'order': 'relevance',
            'textFormat': 'plainText',
        }
        if page_token:
            request_args['pageToken'] = page_token
        video_response = youtube.commentThreads().list(**request_args).execute()

        for item in video_response['items']:
            all_comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
            # Replies are fetched separately, and only when the thread reports any.
            if item['snippet']['totalReplyCount'] > 0:
                all_comments.extend(_fetch_reply_texts(youtube, item['id']))

        page_token = video_response.get('nextPageToken')
        if not page_token:
            break

    all_comments = [x for x in all_comments if len(x) > 0]
    print("Scraping Comments Completed")
    return []


In [None]:
# Video whose top-level comments and replies are scraped.
videoId = 'lUDozITit6w'

# An empty token starts from the first page; results land in the global all_comments list.
get_comments(youtube,videoId,'')
print("All total comments obtained: "  + str(len(all_comments)))

In [None]:
# Seed for the pseudo-random comment IDs, so re-running on the same comments gives the same IDs.
ALL_LEVEL_ID_SEED = 43

AllLvldf = pd.DataFrame(all_comments, columns=['Comments'])
# Draw one integer in [1, 1000000) per comment from a seeded generator.
AllLvldf['Comment ID'] = np.random.default_rng(ALL_LEVEL_ID_SEED).integers(1, 1000000, size=len(AllLvldf))
# Append an occurrence counter per drawn number so colliding draws still yield distinct IDs.
AllLvldf['Comment ID'] = AllLvldf['Comment ID'].astype(str) + '_' + AllLvldf.groupby('Comment ID').cumcount().add(1).astype(str)
AllLvldf

In [None]:
# Persist the unfiltered comments and replies; the video ID is embedded in the file name.
AllLvldf.to_csv(f'All_{videoId}_master.csv')

In [None]:
# Preview only the first few entries; displaying the whole list floods the notebook output.
all_comments[:10]

# Filtering and cleaning

In [44]:
def filterDF(df):
    """Return a cleaned copy of a comments frame, leaving the input untouched.

    The ``Comments`` column is cleaned in this order: emoji and pictographic
    symbols are removed, then hyperlinks, then every character that is not an
    ASCII letter or whitespace. The last pass already removes digits and
    punctuation, so no separate number removal is needed. The text is then
    stripped and lower-cased. Rows whose comment is empty after cleaning, and
    rows with missing values, are dropped.

    Args:
        df: DataFrame with a string ``Comments`` column; other columns are
            carried through unchanged.

    Returns:
        A new DataFrame holding the surviving rows under their original index
        labels.
    """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # Emoticons
                               u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               u"\U0001F1E0-\U0001F1FF"  # Regional indicators (flags)
                               u"\U00002500-\U00002BEF"  # Box drawing, shapes, dingbats, arrows
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # Variation Selectors
                               u"\u3030"
                               "]+", flags=re.UNICODE)

    hyperlink_pattern = re.compile(r"http\S+|www\S+")
    # Keeps only ASCII letters and whitespace; digits and punctuation are removed too.
    unwanted_chars_pattern = re.compile(r"[^a-zA-Z\s]")

    # Work on a copy so the caller's frame still shows the raw comments.
    cleaned = df.copy()
    comments = cleaned['Comments']
    for pattern in (emoji_pattern, hyperlink_pattern, unwanted_chars_pattern):
        comments = comments.str.replace(pattern, '', regex=True)
    comments = comments.str.strip().str.lower()

    # Cleaning leaves '' (not NaN) for comments with no letters; mark them missing so dropna removes them.
    cleaned['Comments'] = comments.mask(comments.eq(''))

    return cleaned.dropna()

In [45]:
# Clean the top-level comments (emoji, links, non-letter characters removed; lower-cased).
HighLvldf_filtered = filterDF(HighLvldf)

In [46]:
# Display the cleaned top-level comments.
HighLvldf_filtered

Unnamed: 0,Comments,Comment ID
0,pixel a at after card discount so are you in...,309718_1
1,i am in group looking forward for sale price,965018_1
2,camera bar is not aluminium,20467_1
3,can somebody tell me if this phone has heating...,835314_1
4,group,741056_1
...,...,...
1016,so you upload videos late night,410410_1
1017,yo,812443_1
1018,first,837435_1
1019,first,140084_1


In [None]:
# Clean the combined comments and replies with the same filter used for the top-level set.
AllLvldf_filtered = filterDF(AllLvldf)

In [None]:
# Display the cleaned comments and replies.
AllLvldf_filtered

In [38]:
# Build the file name from videoId, as the master CSV cells do, so the export follows the selected video.
# Requires the cell that defines AllLvldf_filtered to have been run first.
AllLvldf_filtered.to_csv(f"{videoId}_Afiltered.csv")

NameError: name 'AllLvldf_filtered' is not defined

In [47]:
# Build the file name from videoId, as the master CSV cells do, so the export follows the selected video.
HighLvldf_filtered.to_csv(f"{videoId}_Hfiltered.csv")