# Configuration and module installation

In [None]:
pip install google-api-python-client

In [None]:
pip install google-auth google-auth-oauthlib google-auth-httplib2

# Authenticating and collecting YouTube comments

In [3]:
# Service name and version handed to the API client builder.
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'
# OAuth scope requested when the user authorises the application.
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
# Path of the OAuth client-secrets file read when a new login is needed.
CLIENT_SECRETS_FILE = "client_secret3.json"

In [4]:
import os
import pickle
import google.oauth2.credentials
 
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
 


def get_authenticated_service():
    """Build an authorised YouTube Data API client.

    Credentials are cached in ``token.pickle`` in the current working
    directory. A cached token is reused while it is valid, refreshed when it
    has expired and carries a refresh token, and otherwise replaced by
    running the interactive OAuth flow again.

    Returns:
        The client object returned by ``googleapiclient.discovery.build``.
    """
    credentials = None
    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file, so token.pickle must only ever be a file this program wrote.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            credentials = pickle.load(token)

    # Missing or no-longer-valid credentials: refresh or log in again.
    if not credentials or not credentials.valid:
        if credentials and credentials.expired and credentials.refresh_token:
            credentials.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CLIENT_SECRETS_FILE, SCOPES)
            # run_console() used the out-of-band copy/paste flow and was
            # removed in google-auth-oauthlib 1.0. run_local_server() is the
            # supported replacement; port=0 picks any free local port.
            # It needs a browser on the machine running this code.
            credentials = flow.run_local_server(port=0)

        # Save the credentials for the next run.
        with open('token.pickle', 'wb') as token:
            pickle.dump(credentials, token)

    return build(API_SERVICE_NAME, API_VERSION, credentials=credentials)

In [5]:
if __name__ == '__main__':
    # Local-run convenience only: turns off OAuthlib's HTTPS verification.
    # This switch must *not* stay enabled in a production deployment.
    os.environ.update(OAUTHLIB_INSECURE_TRANSPORT='1')
    service = get_authenticated_service()

In [6]:
def get_videos(service, *, max_pages=5, **kwargs):
    """Collect search results across several result pages.

    Args:
        service: API client exposing ``search().list(**kwargs).execute()``.
        max_pages: Maximum number of result pages to request (default 5).
            With ``max_pages=0`` no request is made and ``[]`` is returned.
        **kwargs: Parameters forwarded unchanged to ``search().list``.

    Returns:
        A list with the ``items`` of every fetched page, in page order.
    """
    final_results = []

    # Request each page only when it will actually be consumed. Fetching
    # ahead would spend one extra API call on a page that is thrown away
    # once the page limit is reached.
    for _ in range(max_pages):
        results = service.search().list(**kwargs).execute()
        if not results:
            break
        final_results.extend(results.get('items', []))

        # Continue only while the API reports another page.
        next_token = results.get('nextPageToken')
        if not next_token:
            break
        kwargs['pageToken'] = next_token

    return final_results

In [7]:
def get_video_comments(service, **kwargs):
    """Return the text of every top-level comment, following all pages.

    ``kwargs`` are passed to ``service.commentThreads().list``; the
    ``pageToken`` entry is overwritten while paging through the results.
    """
    collected = []
    response = service.commentThreads().list(**kwargs).execute()

    while response:
        # Pull the display text out of each comment thread on this page.
        collected.extend(
            thread['snippet']['topLevelComment']['snippet']['textDisplay']
            for thread in response['items']
        )

        # Stop as soon as the API no longer advertises a further page.
        if 'nextPageToken' not in response:
            break
        kwargs['pageToken'] = response['nextPageToken']
        response = service.commentThreads().list(**kwargs).execute()

    return collected

In [8]:
def search_videos_by_keyword(service, **kwargs):
    """Gather the top-level comments of every video matching a search.

    Args:
        service: API client passed on to ``get_videos`` and
            ``get_video_comments``.
        **kwargs: Search parameters forwarded to ``get_videos``.

    Returns:
        A flat list with the comment texts of all videos whose comments
        could be fetched.
    """
    list_comms = []
    for item in get_videos(service, **kwargs):
        # Skip results that carry no video id instead of raising KeyError.
        video_id = item['id'].get('videoId')
        if not video_id:
            continue

        try:
            comments = get_video_comments(
                service, part='snippet', videoId=video_id,
                textFormat='plainText')
        except HttpError:
            # Only API errors are tolerated here; a bare ``except`` would
            # also hide programming errors and swallow KeyboardInterrupt.
            # NOTE(review): any HTTP error lands here (presumably quota or
            # permission errors too), so the message below may be
            # inaccurate for some failures.
            print('disabled comments')
        else:
            list_comms.extend(comments)

    return list_comms

In [9]:
def isEnglish(s):
    """Return *s* unchanged if it is pure ASCII, otherwise the string "false".

    Despite the name, the only check performed is ``str.isascii``.
    """
    return s if s.isascii() else "false"

In [10]:
import csv
import pandas as pd

def write_to_csv(comments, path='out.csv'):
    """Write the ASCII-only comments to a single-column CSV file.

    Args:
        comments: Iterable of comment values. Entries that are not strings
            (for example NaN values read back from a CSV) or that contain
            non-ASCII characters are dropped.
        path: Destination file; defaults to ``'out.csv'``.

    The file is written without header and without index column.
    """
    # Filter with a real predicate rather than a "false" sentinel string,
    # so a comment whose text is literally "false" is kept.
    kept = [c for c in comments if isinstance(c, str) and c.isascii()]
    # dtype=object keeps an empty result well-defined.
    pd.Series(kept, dtype=object).to_csv(path, index=False, header=False)



In [11]:
keyword = input('Enter a keyword: ')
comments = search_videos_by_keyword(
    service,
    q=keyword,
    part='id,snippet',
    type='video',
    # NOTE(review): per the YouTube Data API reference, eventType restricts
    # the search to broadcast events - confirm that limiting the results
    # to completed broadcasts is intended.
    eventType='completed',
)

Enter a keyword: algerie


In [12]:
# Filter the collected comments and save them to out.csv.
write_to_csv(comments)

## Assembling all the outputs in one dataset

In [16]:
import pandas as pd 

# Names of the CSV files to merge: out1.csv through out4.csv.
all_csv = [f'out{number}.csv' for number in range(1, 5)]
    

In [17]:
# Load every file listed in all_csv as a header-less table of strings.
# - on_bad_lines='skip' replaces error_bad_lines=False, which was removed
#   in pandas 2.0 (requires pandas >= 1.3).
# - dtype=str with keep_default_na=False keeps comments such as "NA" or
#   "null" as text instead of converting them to NaN.
li = [
    pd.read_csv(
        filename,
        index_col=None,
        header=None,
        dtype=str,
        keep_default_na=False,
        on_bad_lines='skip',
    )
    for filename in all_csv
]

# Stack all tables into one frame with a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [21]:
# Run the first column of the combined frame through the same filter and
# write the result to out.csv.
write_to_csv(frame[0].tolist())

# Merging the new output into the current dataset and removing duplicate rows

In [22]:
# Read both files as plain strings. With pandas' default NA handling,
# comments such as "NA", "null" or "nan" would be converted to NaN and
# written back as empty values.
df1 = pd.read_csv("dataset.csv", header=None, dtype=str, keep_default_na=False)
df2 = pd.read_csv("out.csv", header=None, dtype=str, keep_default_na=False)

li = [df1, df2]

# Append the new rows to the existing ones, then drop exact duplicates.
df = pd.concat(li, axis=0, ignore_index=True)
df = df.drop_duplicates()

In [23]:
# Overwrite dataset.csv with the merged, de-duplicated rows (no index, no header).
df.to_csv('dataset.csv', index=False, header=False)  