##### 1. Scrape all playlist IDs from the channel and store them in a DataFrame called 'playlist_id'.

In [None]:
import datetime as dt
import html
import html.parser as htmlparser
import json
import os
import time

import googleapiclient.discovery
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex as re
from tqdm.notebook import trange

# Show full cell contents (long titles / comments) instead of a truncated preview.
pd.set_option('display.max_colwidth', None)


# YouTube Data API client configuration.
api_service_name = "youtube"
api_version = "v3"
# Read the key from the YOUTUBE_API_KEY environment variable so it does not have
# to be pasted into the notebook; the original placeholder is kept as fallback.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", "Your google developer key")

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=DEVELOPER_KEY)

# Request every playlist of the channel, 50 per page (the value used for maxResults).
# `nextPageToken` has to be part of the `fields` filter: without it the partial
# response carries no token and list_next() cannot build the following request.
request2 = youtube.playlists().list(
        fields = 'nextPageToken,items/snippet,items/id',
        channelId = 'UCi8xUU_lg3zr8UcBXLdEheQ',
        maxResults = 50,
        part = 'snippet'
)

# One [playlist id, playlist title] row per playlist (variable name kept as-is).
platlists = []

# Follow the pagination until list_next() returns None (no further page).
while request2 is not None:
    response2 = request2.execute()

    for item2 in response2.get('items', []):
        platlists.append([item2['id'], item2['snippet']['title']])

    request2 = youtube.playlists().list_next(request2, response2)

playlist_id = pd.DataFrame(platlists, columns=['playlist_id','playlist_title'])
playlist_id

In [None]:
# Plain Python list of the playlist ids collected above.
play_list = playlist_id['playlist_id'].tolist()
play_list

##### 2. Scrape every video ID from all the playlists and store them in a DataFrame called 'video_id_data'.

In [None]:
# Collect [playlist id, video id, video title] rows for every playlist, then
# build the DataFrame once at the end instead of concatenating inside the loop.
video_rows = []

for i in play_list:
    try:
        request3 = youtube.playlistItems().list(
                fields = 'nextPageToken,items/snippet/resourceId,items/snippet/title',
                playlistId = i,
                maxResults = 50,
                part = 'snippet'
        )

        # Rows of the current playlist only; they are kept only if every page
        # of the playlist was fetched without an error.
        videos = []

        # Follow nextPageToken so playlists with more than one page are complete.
        while request3 is not None:
            response3 = request3.execute()

            for item3 in response3.get('items', []):
                video = item3['snippet']
                videos.append([str(i),
                               video['resourceId']['videoId'],
                               video['title']])

            request3 = youtube.playlistItems().list_next(request3, response3)

        video_rows.extend(videos)
    except Exception as exc:
        # Best effort: report the playlist and the cause, wait, then move on.
        print(f'fail to scrape videos from playlist_id:{i}')
        print(f'    reason: {exc!r}')
        time.sleep(5)

video_id_data = pd.DataFrame(video_rows, columns=['playlist_id','videoId','video_Title'])

In [None]:
# Save every video id into a list.
# A video can appear in several playlists; de-duplicate (keeping first-seen
# order) so its comments are not scraped and stored more than once.
video_list = video_id_data['videoId'].drop_duplicates().to_list()
video_list

##### 3. Scrape every top-level comment from every video ID and store them in a DataFrame called 'comment_data'.

In [None]:
import time
# Collect one row per top-level comment across all videos and build the
# DataFrame once at the end.
comment_rows = []

for i in video_list:
    try:
        request = youtube.commentThreads().list(
            part = 'snippet,replies',
            videoId = i
        )

        # Rows of the current video only; they are kept only if every page of
        # the video's comment threads was fetched without an error.
        comments = []

        while request:
            response = request.execute()

            for item in response.get('items', []):
                comment = item['snippet']['topLevelComment']['snippet']
                comments.append([
                    comment['videoId'],
                    comment['authorDisplayName'],
                    comment['publishedAt'],
                    comment['updatedAt'],
                    comment['likeCount'],
                    html.unescape(comment['textDisplay']),
                    item['snippet']['totalReplyCount']
                ])

            # Advance once per page, outside the item loop: a page without
            # items must still move on (or end), otherwise `while request`
            # would re-execute the same request forever.
            request = youtube.commentThreads().list_next(request, response)

        comment_rows.extend(comments)
    except Exception as exc:
        # Best effort: report the video and the cause, wait, then move on.
        print(f'fail to scrape comment from video_id:{i}')
        print(f'    reason: {exc!r}')
        time.sleep(5)

comment_data = pd.DataFrame(
    comment_rows,
    columns=['videoId','author', 'published_at', 'updated_at', 'like_count', 'comment','Reply_count'])

##### 4. Perform data cleaning to keep only the hashtag text instead of the embedded HTML.

In [None]:
# NOTE(review): `comment_data1` is not created in the visible cells (the scraping
# cell builds `comment_data`) - confirm where it is defined before running.
#
# Both substitutions are chained so the second one works on the result of the
# first; previously the '<br>' removal was applied to the untouched text and
# overwrote the URL removal.
# NOTE(review): 'https.{0,500}' also removes up to 500 characters that follow
# the link on the same line - confirm this is the intended amount of text.
comment_data1['comment'] = (
    comment_data1['comment']
    .str.replace('https.{0,500}', '', regex=True)
    .str.replace('<br>', '', regex=False)
)
comment_data1

##### 5. Check if there's any comment containing empty string

In [None]:
# Report how many comments are empty strings, then drop them.
# The previous filter (`== ''`) kept ONLY the empty comments, discarding every
# real comment before the merge / translation / sentiment steps.
empty_mask = comment_data1['comment'] == ''
print(f'{int(empty_mask.sum())} empty comment(s) found')

comment_data1 = comment_data1[~empty_mask].reset_index(drop=True)
comment_data1

##### 6. Merging various dataframes.

In [None]:
# Attach video metadata (via videoId) and then playlist metadata (via
# playlist_id) to every comment; inner joins keep only rows matched in both.
comment_data1 = (
    comment_data1
    .merge(video_id_data, on='videoId', how='inner')
    .merge(playlist_id, on='playlist_id', how='inner')
)

##### 7. Since some comments are not in English, I used the Google Translate API to translate them all into English for better sentiment analysis accuracy, adding one extra column, 'comment_translated'.

In [None]:
from deep_translator import GoogleTranslator

# One translator instance is enough; it does not depend on the row.
translator = GoogleTranslator(source='auto', target='en')

# Create the column up front so it exists even if no row gets translated.
comment_data1['comment_translated'] = None

# Iterate over every row position (len, not count(): count() skips nulls and
# would leave the last rows untranslated if any comment is missing).
# Assumes the default 0..n-1 index produced by the preceding merge.
for i in trange(len(comment_data1)):
    text = comment_data1.loc[i, 'comment']
    # Non-string values (e.g. NaN) cannot be translated; leave them as None.
    if isinstance(text, str):
        comment_data1.loc[i, 'comment_translated'] = translator.translate(text)

##### 8. Apply sentiment analysis using VADER to the translated comments and transform the scores into a sentiment label for each comment.

In [None]:
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Score each translated comment with VADER.
sid = SentimentIntensityAnalyzer()

# 'scores' holds the full polarity dict returned by VADER for the comment
# (the value is passed through str() first, so non-string cells are scored too).
comment_data1['scores'] = comment_data1['comment_translated'].map(
    lambda translated: sid.polarity_scores(str(translated))
)

# 'compound' is the single 'compound' entry pulled out of each polarity dict.
comment_data1['compound'] = comment_data1['scores'].map(
    lambda polarity: polarity['compound']
)
comment_data1

def x(row):
    """Return the sentiment label for a row based on its ``compound`` score.

    ``'positive'`` when ``row.compound`` is above zero, ``'negative'`` when it
    is below zero, and ``'neutral'`` in every other case.
    """
    score = row.compound
    if score > 0:
        return 'positive'
    return 'negative' if score < 0 else 'neutral'
    
# Label every row by applying x() row-wise, then store the labels as a column.
sentiment_labels = comment_data1.apply(x, axis=1)
comment_data1['sentiment_label'] = sentiment_labels
comment_data1

#####  9. Utilize NLTK for sentiment-based filtering of comments, removing stop words, and tokenizing documents. Additionally, eliminate emojis from the text.

In [None]:
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

# Keep only the rows labelled 'positive', re-indexed from 0.
is_positive = comment_data1['sentiment_label'] == 'positive'
positive_comment = comment_data1.loc[is_positive].reset_index(drop=True)
positive_comment

def remove_emojis(data):
    """Return ``data`` with every run of characters from the ranges below removed.

    The character class is assembled from individual code-point ranges and
    single code points. Note that some ranges are very wide (for example
    U+24C2..U+1F251 and U+10000..U+10FFFF), so far more than emoji is removed,
    including CJK text.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
        "\u3030",
    )
    pattern = re.compile("[" + "".join(code_point_ranges) + "]+", re.UNICODE)
    return pattern.sub('', data)

# Stop words and punctuation to discard; built once because it does not depend
# on the comment being processed.
stop = set(stopwords.words('english') + list(string.punctuation))

# Translated text of every positive comment (missing values are skipped).
positive_texts = positive_comment['comment_translated'].dropna().astype(str).tolist()

# Tokenise each comment: strip emojis, lower-case, drop stop words/punctuation.
# The loop variable is not called `x`, so the labelling function x() defined in
# the sentiment cell is not overwritten.
vocab_list_F = []
for idx in trange(len(positive_texts)):
    text = remove_emojis(positive_texts[idx])
    vocab_list = [token for token in word_tokenize(text.lower()) if token not in stop]
    vocab_list_F.append(vocab_list)

# Flatten the per-comment token lists into one list of words.
flat_list = [token for tokens in vocab_list_F for token in tokens]

#Turn list into dataframe
positive_word_count = pd.DataFrame(flat_list, columns =['Vocabulary'])

#Group by word count in descending order
# (previously grouped the undefined `negative_word_count`, raising NameError)
positive_word_count = (
    positive_word_count
    .groupby('Vocabulary')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='word_count')
)
positive_word_count

In [None]:
# Keep only the rows labelled 'negative', re-indexed from 0.
is_negative = comment_data1['sentiment_label'] == 'negative'
negative_comment = comment_data1.loc[is_negative].reset_index(drop=True)
negative_comment


def remove_emojis(data):
    """Return ``data`` with every run of characters from the ranges below removed.

    This cell re-defines the function with the same character class as the
    definition in the positive-comment cell. Some ranges are very wide (for
    example U+24C2..U+1F251 and U+10000..U+10FFFF), so far more than emoji is
    removed, including CJK text.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
        "\u3030",
    )
    pattern = re.compile("[" + "".join(code_point_ranges) + "]+", re.UNICODE)
    return pattern.sub('', data)

# Stop words and punctuation to discard; built once because it does not depend
# on the comment being processed.
stop = set(stopwords.words('english') + list(string.punctuation))

# Translated text of every NEGATIVE comment (missing values are skipped).
# The loop previously read from `positive_comment`, so the "negative" word
# counts were computed from the positive comments.
negative_texts = negative_comment['comment_translated'].dropna().astype(str).tolist()

# Tokenise each comment: strip emojis, lower-case, drop stop words/punctuation.
vocab_list_F = []
for idx in trange(len(negative_texts)):
    text = remove_emojis(negative_texts[idx])
    vocab_list = [token for token in word_tokenize(text.lower()) if token not in stop]
    vocab_list_F.append(vocab_list)

# Flatten the per-comment token lists into one list of words.
flat_list = [token for tokens in vocab_list_F for token in tokens]

#Turn list into dataframe
negative_comment_word_count = pd.DataFrame(flat_list, columns =['Vocabulary'])

#Group by word count in descending order
# (previously grouped the undefined `negative_word_count`, raising NameError)
negative_comment_word_count = (
    negative_comment_word_count
    .groupby('Vocabulary')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='word_count')
)
negative_comment_word_count

##### 10. Extract every unique video title and concatenate them into a string for video category classification using ChatGPT

In [None]:
# NOTE(review): `df` is not created in the visible cells - confirm which frame
# it refers to before running.
# Remove this exact sequence of Unicode tag characters from the titles.
tag_sequence = '\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f'
df['video_Title'] = df['video_Title'].str.replace(tag_sequence, '')

# Unique titles in first-seen order (this re-binds `video_list`, which earlier
# held the video ids).
video_list = df['video_Title'].drop_duplicates().tolist()

#concatenate unique video title into string
video_string = ','.join(video_list)

#concatenate unique video title into dataframe
df1 = pd.DataFrame({'video_Title': video_list})
df1

##### 11. Use the OpenAI API to classify each video into a category and merge the categories back into the original DataFrame

In [None]:
import openai
import os
import time
from time import sleep

# Read the key from the OPENAI_API_KEY environment variable so it does not have
# to be pasted into the notebook; the original placeholder is kept as fallback.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'Your API key')

def analyze_gpt35(text, max_tokens=20):
    """Classify a video title into one of the seven categories listed in the prompt.

    Args:
        text: Video title to classify; it is appended to the user message.
        max_tokens: Upper bound on the length of the model's answer. The
            previous hard-coded value of 1 cut every answer down to a single
            token, so a multi-word category name could never be returned.

    Returns:
        The model's answer, stripped of surrounding whitespace and lower-cased.

    NOTE(review): `openai.ChatCompletion.create` is the legacy (pre-1.0) SDK
    interface - confirm the installed openai version.
    """
    # The example answer in the user message now names a category that exists
    # in the system message ("Scenic ..."); it previously said "Beautiful ...",
    # which is not one of the seven categories.
    messages = [
        {"role": "system", "content": """You are trained to classify the given video title from Qatar Airways official 
        YouTube channel into one of the 7 following different categories based on the video title. 1.Scenic Landings and
        Take-offs 2.Travel Adventures and Destinations 3.Qatar Airways Milestones and Achievements 4.Cultural and Holiday Celebrations 
        5.Behind-the-Scenes and Educational Content 6.Corporate Responsibility and Sustainability 
        7.Sports and Football Partnerships. For example: Video Title "Digital Menus" belongs to "Corporate Responsibility and Sustainability".
        "ITM 2021 – Chief Commercial Officer, Thierry Antinori" belongs to "Qatar Airways Milestones and Achievements". If you are unsure
        about the answer, classify them as "Corporate Responsibility and Sustainability" """},
        {"role": "user", "content": f"""Analyze the given video title and return answer with above given category only. For example, if the answer
        is "Scenic Landings and Take-offs", please then return "Scenic Landings and Take-offs" only. Do not include any other words in the 
        answer. This is very important for me, please take more time to consider. You can make it!: {text}"""}
        ]

    response = openai.ChatCompletion.create(
                      model="gpt-3.5-turbo",
                      messages=messages,
                      max_tokens=max_tokens,
                      n=1,
                      stop=None,
                      temperature=0)

    response_text = response.choices[0].message.content.strip().lower()

    return response_text

In [None]:
# Classify every title in df1 and store the answer in df1['category'].
#
# The loop previously mixed `df1` with `youtubedata` (not defined in the visible
# cells) and read a 'text' column, while df1 is built with a single
# 'video_Title' column; it now reads and writes df1 consistently.
# NOTE(review): the loop starts at row 326, presumably the resume point of an
# earlier interrupted run - set START_ROW to 0 to classify every title.
START_ROW = 326
REQUESTS_PER_PAUSE = 3   # pause before every 3rd row ...
PAUSE_SECONDS = 70       # ... for 70 seconds (values taken from the original loop)

for i in trange(START_ROW, len(df1)):
    if i % REQUESTS_PER_PAUSE == 0:
        time.sleep(PAUSE_SECONDS)
    text = df1.loc[i, 'video_Title']
    df1.loc[i, 'category'] = analyze_gpt35(text)

In [None]:
# Left-join the comment data onto df2 by video title, keeping every df2 row.
# NOTE(review): `df2` is not created in the visible cells - confirm which frame
# (presumably the one holding the categories) it refers to.
df2 = pd.merge(df2, comment_data1, on='video_Title', how='left')
df2