In [3]:
import os
import csv 
import pickle
import google.oauth2.credentials
 
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
 
# OAuth 2.0 client configuration file for this application; it holds the
# client_id and client_secret used to start the authorization flow.
CLIENT_SECRETS_FILE = "client_secret.json"

# Identity of the Google API to build a client for.
API_SERVICE_NAME, API_VERSION = 'youtube', 'v3'

# Requested OAuth 2.0 scope: full read/write access to the authenticated
# user's account, with requests required to go over an SSL connection.
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
 
 
def get_authenticated_service():
    """Return an authorized API client, reusing cached credentials when possible.

    Credentials are looked up in ``token.pickle`` in the working directory.
    If they are missing or invalid they are either refreshed (when expired and
    a refresh token is available) or obtained through a new interactive OAuth
    flow based on ``CLIENT_SECRETS_FILE`` and ``SCOPES``. Freshly obtained or
    refreshed credentials are written back to ``token.pickle``.

    Returns:
        The client object produced by ``build(API_SERVICE_NAME, API_VERSION, ...)``.
    """
    credentials = None
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. token.pickle must only ever be a file written by this function.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            credentials = pickle.load(token)

    # Nothing cached, or the cached credentials can no longer be used.
    if not credentials or not credentials.valid:
        if credentials and credentials.expired and credentials.refresh_token:
            # Expired but refreshable: no user interaction needed.
            credentials.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CLIENT_SECRETS_FILE, SCOPES)
            # run_console() depended on the out-of-band (copy/paste) OAuth
            # flow, which is no longer available in current
            # google-auth-oauthlib releases. The loopback flow replaces it;
            # port=0 lets the OS pick any free local port for the redirect.
            credentials = flow.run_local_server(port=0)

        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(credentials, token)

    return build(API_SERVICE_NAME, API_VERSION, credentials=credentials)
 
 
def get_video_comments(service, **kwargs):
    """Collect the display text of every top-level comment across all pages.

    Args:
        service: API client exposing ``commentThreads().list(...).execute()``.
        **kwargs: Request parameters forwarded to ``commentThreads().list``.

    Returns:
        A list with the ``textDisplay`` value of each top-level comment, in
        the order the pages returned them.
    """
    collected = []
    page = service.commentThreads().list(**kwargs).execute()

    while page:
        collected.extend(
            thread['snippet']['topLevelComment']['snippet']['textDisplay']
            for thread in page['items']
        )

        # No continuation token means this was the last page.
        if 'nextPageToken' not in page:
            break

        kwargs['pageToken'] = page['nextPageToken']
        page = service.commentThreads().list(**kwargs).execute()

    return collected
 
 
def write_to_csv(comments):
    """Write the collected comment rows to ``comments.csv`` in the working directory.

    Args:
        comments: Iterable of ``(video_id, title, comment)`` rows; each row
            may be any iterable of field values.

    The file is overwritten on every call and starts with the header row
    ``Video ID, Title, Comment``.
    """
    # newline='' is required by the csv module: the writer emits its own
    # '\r\n' line endings, and without it text-mode newline translation on
    # Windows turns them into '\r\r\n', producing a blank line after each row.
    with open('comments.csv', 'w', encoding="utf-8", newline='') as comments_file:
        comments_writer = csv.writer(comments_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        comments_writer.writerow(['Video ID', 'Title', 'Comment'])
        comments_writer.writerows(comments)
 
 
def get_videos(service, max_pages=3, **kwargs):
    """Run a search request and gather the items from up to ``max_pages`` pages.

    Args:
        service: API client exposing ``search().list(...).execute()``.
        max_pages: Maximum number of result pages to request (default 3).
            Values below 1 return an empty list without calling the API.
        **kwargs: Request parameters forwarded to ``search().list``.

    Returns:
        A list with the ``items`` entries of every page that was fetched.
    """
    request_args = dict(kwargs)
    final_results = []
    pages_fetched = 0

    # Check the page budget *before* each request so that no page is fetched
    # and then thrown away once the limit has been reached.
    while pages_fetched < max_pages:
        results = service.search().list(**request_args).execute()
        pages_fetched += 1
        if not results:
            break

        final_results.extend(results.get('items', []))

        # Check if another page exists
        if 'nextPageToken' not in results:
            break
        request_args['pageToken'] = results['nextPageToken']

    return final_results
 
 
def search_videos_by_keyword(service, **kwargs):
    """Search for videos, collect their top-level comments and write them to CSV.

    Args:
        service: API client passed through to ``get_videos`` and
            ``get_video_comments``.
        **kwargs: Search parameters forwarded to ``get_videos``.

    Every comment becomes one ``(video_id, title, comment)`` row handed to
    ``write_to_csv``. Search results that are not videos, and videos whose
    comments are disabled, are skipped so that one such result does not
    discard everything collected so far.

    Raises:
        HttpError: For any API failure other than disabled comments.
    """
    final_result = []
    for item in get_videos(service, **kwargs):
        # Results without a videoId carry no comment threads to fetch
        # (presumably channel/playlist hits when the search is not limited
        # to videos).
        video_id = item['id'].get('videoId')
        if video_id is None:
            continue

        title = item['snippet']['title']
        try:
            comments = get_video_comments(service, part='snippet', videoId=video_id, textFormat='plainText')
        except HttpError as err:
            # Only the "comments disabled" failure is tolerated; anything
            # else (quota, auth, ...) is re-raised.
            if b'commentsDisabled' not in err.content:
                raise
            print('Skipping video {}: comments are disabled'.format(video_id))
            continue

        # One row per comment: (video id, title, comment)
        final_result.extend((video_id, title, comment) for comment in comments)

    write_to_csv(final_result)
 
 
if __name__ == '__main__':
    # Local runs only: switch off OAuthlib's HTTPS verification.
    # This option must *not* stay enabled in production.
    os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'

    service = get_authenticated_service()
    keyword = input('Enter a keyword: ')

    # Search for videos matching the keyword and dump their comments to CSV.
    search_videos_by_keyword(
        service,
        q=keyword,
        part='id,snippet',
        eventType='completed',
        type='video',
    )

Enter a keyword: geeks for geeks


In [4]:
import pandas as pd

# Load the rows written by write_to_csv (header: Video ID, Title, Comment)
# and preview the first five.
data = pd.read_csv("comments.csv")
data.head(n=5)

Unnamed: 0,Video ID,Title,Comment
0,k4286q_Tovc,Webinar | How to Begin with Competitive Progra...,He looks like a bit of younger version of Abhi...
1,k4286q_Tovc,Webinar | How to Begin with Competitive Progra...,Hey can u tell us the name of the dsa course t...
2,k4286q_Tovc,Webinar | How to Begin with Competitive Progra...,sir I am currently doing cp on hackerrank how ...
3,k4286q_Tovc,Webinar | How to Begin with Competitive Progra...,🤔🤨
4,k4286q_Tovc,Webinar | How to Begin with Competitive Progra...,what is his name?


In [8]:
import nltk

# Uncomment on a fresh environment if the NLTK resources are missing
# (assumption: the tokenizer/corpus data must be downloaded once; confirm).
# nltk.download('all')

# Split the comment stored at row label 2 into word tokens.
word_tokens = nltk.word_tokenize(data.loc[2, "Comment"])
word_tokens

['sir',
 'I',
 'am',
 'currently',
 'doing',
 'cp',
 'on',
 'hackerrank',
 'how',
 'much',
 'time',
 'shall',
 'i',
 'devote',
 'to',
 'it']

In [10]:
from nltk.corpus import stopwords

# English stop words, collected into a set for membership checks.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stop words.
# NOTE(review): the match is case-sensitive. The recorded output still
# contains 'I', presumably because the stop-word list is lower-case;
# compare against token.lower() if that is unintended.
filtered_sentence = [token for token in word_tokens if token not in stop_words]
print(filtered_sentence)

['sir', 'I', 'currently', 'cp', 'hackerrank', 'much', 'time', 'shall', 'devote']


In [11]:
from nltk.stem import PorterStemmer

# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
StemmedWords = list(map(stemmer.stem, filtered_sentence))

# Side-by-side table: original token next to its stem.
df = pd.DataFrame({
    'OriginalWords': pd.Series(filtered_sentence),
    'StemmedWords': pd.Series(StemmedWords),
})
df

Unnamed: 0,OriginalWords,StemmedWords
0,sir,sir
1,I,I
2,currently,current
3,cp,cp
4,hackerrank,hackerrank
5,much,much
6,time,time
7,shall,shall
8,devote,devot


In [12]:
from nltk.stem import WordNetLemmatizer

# Lemmatize the same tokens for comparison with the stems. No part-of-speech
# argument is passed, so the lemmatizer's default applies
# (presumably noun; confirm against the NLTK documentation).
lemmatizer = WordNetLemmatizer()
LemmatizedWords = list(map(lemmatizer.lemmatize, filtered_sentence))

# Add the lemmas as a third column of the comparison table.
df['Lemmatizer'] = pd.Series(LemmatizedWords)
df

Unnamed: 0,OriginalWords,StemmedWords,Lemmatizer
0,sir,sir,sir
1,I,I,I
2,currently,current,currently
3,cp,cp,cp
4,hackerrank,hackerrank,hackerrank
5,much,much,much
6,time,time,time
7,shall,shall,shall
8,devote,devot,devote
