In [2]:
# library used for gathering youtube data
import json
import os
import time

import networkx as nx
import pandas as pd
import requests

import youtubeAPI as api_key

In [10]:
# Set up the endpoint URLs and the API key used for data gathering.
# Endpoint passed to make_request when searching for videos.
SEARCH_URL = "https://www.googleapis.com/youtube/v3/search"
# Endpoint passed to make_request when paging through a video's comment threads.
COMMENTS_URL = "https://www.googleapis.com/youtube/v3/commentThreads"
# The key is read from the `api_key` attribute of the local `youtubeAPI` module
# (imported under the alias `api_key`), so it is not hard-coded in this notebook.
API_KEY = api_key.api_key


In [1]:
# function used for storing data
def store_data(data, filename):
    """Serialize ``data`` as JSON into ``filename`` without risking a partial file.

    The JSON is first written to ``<filename>.tmp`` and only moved over
    ``filename`` once the dump has completed. If serialization fails (or the
    run is interrupted) part-way through, the previous contents of
    ``filename`` are left untouched, so a checkpoint written by an earlier
    call can still be loaded.

    Args:
        data: Any object ``json.dump`` can serialize.
        filename: Destination path of the JSON file.

    Raises:
        TypeError: If ``data`` contains something ``json.dump`` cannot serialize.
        OSError: If the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{filename}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(data, f)
        # Swap the finished file into place in a single rename step.
        os.replace(tmp_path, filename)
    except BaseException:
        # Do not leave a half-written temporary file behind; then re-raise.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
# function used to load data
def load_data(filename):
    """Read ``filename`` and return the JSON document it contains.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded Python object (whatever ``json`` produces for the file).

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    with open(filename, 'r') as source:
        raw_text = source.read()
    return json.loads(raw_text)

# function that requests data from the given url
def make_request(url, params, timeout=None, retry_delay=3600):
    """GET ``url`` with ``params``, waiting and retrying while the quota is exhausted.

    A response counts as "quota exhausted" when its status code is 403 and its
    body contains the text ``quota``. In that case the function sleeps for
    ``retry_delay`` seconds and issues the same request again; it keeps doing
    so until a response that is not a quota error arrives. Every other
    response (including other error statuses) is returned to the caller as is.

    Args:
        url: Endpoint to request.
        params: Query parameters forwarded to ``requests.get``.
        timeout: Forwarded to ``requests.get``. The default ``None`` keeps the
            behaviour of the original call, which passed no timeout.
        retry_delay: Seconds to sleep after a quota error before retrying.
            Defaults to 3600, the value that was previously hard-coded.

    Returns:
        The first response that is not a quota error.
    """
    while True:
        response = requests.get(url, params=params, timeout=timeout)
        quota_exhausted = response.status_code == 403 and 'quota' in response.text
        if not quota_exhausted:
            return response
        print(f"Rate limit hit. Sleeping for {retry_delay} seconds...")
        time.sleep(retry_delay)

# Load previously stored data if it exists.
# Each file is loaded on its own: a missing df_data.json must not discard an
# already cached videos.json (which would trigger a fresh, quota-consuming
# search), and vice versa.
try:
    videos = load_data('videos.json')
except FileNotFoundError:
    videos = []

try:
    df_data = load_data('df_data.json')
except FileNotFoundError:
    df_data = []

# Only hit the search endpoint when no cached video list was loaded.
if not videos:
    # Query sent to the search endpoint; 'pageToken' is added while paging.
    search_params = dict(
        part='snippet',
        q='Andrew Tate',
        type='video',
        key=API_KEY,
        maxResults=50,  # This is the maximum allowed per request
    )

    # Walk the result pages until a response carries no nextPageToken.
    more_pages = True
    while more_pages:
        response = make_request(SEARCH_URL, search_params)
        results = response.json()
        videos += results.get('items', [])

        page_token = results.get('nextPageToken')
        if page_token:
            search_params['pageToken'] = page_token
        else:
            more_pages = False

    # Cache the search results so later runs can skip the search.
    store_data(videos, 'videos.json')
# data processing
# gather all useful information that could be used for analysis

# Videos that already have rows in df_data (loaded from a previous run).
# Top-level comment rows store the video ID in 'ParentID', and df_data is only
# written to disk after a video has been processed completely, so any video ID
# found here is finished. Fetching it again would append duplicate rows.
processed_video_ids = {row['ParentID'] for row in df_data if row['Type'] == 'comment'}

for video in videos:
    video_id = video['id']['videoId']
    video_name = video['snippet']['title']

    if video_id in processed_video_ids:
        continue

    # Page through every comment thread of this video.
    comments = []
    page_token = None
    while True:
        params = {
            'part': 'snippet,replies',
            'videoId': video_id,
            'key': API_KEY,
            # Ask for the largest page the endpoint allows so fewer requests
            # (and less quota) are needed per video.
            'maxResults': 100,
            'pageToken': page_token
        }
        try:
            comments_response = make_request(COMMENTS_URL, params)
            comments_data = comments_response.json()
            comments.extend(comments_data.get('items', []))

            page_token = comments_data.get('nextPageToken')
            if not page_token:
                break
        except Exception as e:
            # page_token is unchanged here, so the same page is requested
            # again after the pause.
            print(f"Encountered an error: {e}. Pausing for an hour...")
            time.sleep(3600)  # Sleep for an hour

    # Flatten each thread into one row for the top-level comment plus one row
    # per reply included in the thread.
    for comment in comments:
        topLevelComment = comment['snippet']['topLevelComment']['snippet']
        comment_id = comment['snippet']['topLevelComment']['id']
        comment_likes = topLevelComment['likeCount']
        # A top-level comment points at its video through ParentID.
        df_data.append({
            'ID': comment_id,
            'ParentID': video_id,
            'Type': 'comment',
            'Content': topLevelComment['textDisplay'],
            'Upvotes': comment_likes,
            'Date': topLevelComment['publishedAt'],
            'VideoName': video_name
        })
        # Threads without replies have no 'replies' key, hence the defaults.
        replies = comment.get('replies', {}).get('comments', [])
        for reply in replies:
            reply_snippet = reply['snippet']
            reply_id = reply['id']
            reply_likes = reply_snippet['likeCount']
            # A reply points at its top-level comment through ParentID.
            df_data.append({
                'ID': reply_id,
                'ParentID': comment_id,
                'Type': 'reply',
                'Content': reply_snippet['textDisplay'],
                'Upvotes': reply_likes,
                'Date': reply_snippet['publishedAt'],
                'VideoName': video_name
            })

    # Checkpoint after every video so an interrupted run can be resumed.
    store_data(df_data, 'df_data.json')
# Turn the collected comment/reply records into a DataFrame.
df = pd.DataFrame(df_data)

# Directed graph of the discussion, built from the rows gathered above.
G = nx.DiGraph()

# First pass: one node per row, carrying the row's metadata as attributes.
G.add_nodes_from(
    (
        row['ID'],
        {
            'type': row['Type'],
            'content': row['Content'],
            'upvotes': row['Upvotes'],
            'date': row['Date'],
        },
    )
    for _, row in df.iterrows()
)

# Second pass: an edge parent -> child for every row with a truthy ParentID,
# weighted by the child's upvotes.
G.add_edges_from(
    (row['ParentID'], row['ID'], {'weight': row['Upvotes']})
    for _, row in df.iterrows()
    if row['ParentID']
)

# Export the table as CSV and the graph as GraphML.
df.to_csv("TATE_youtube.csv", index=False)
nx.write_graphml(G, "Tate_youtube.graphml")


In [43]:
# Count how many rows point at each ParentID.
df['ParentID'].value_counts()

ParentID
m1S9oOgnGp0                   52005
HUYp5Gkomng                   51327
2rhslYeQyaM                   23039
B3OCjKcXs-w                   22371
OY9_7kNRJME                   12799
                              ...  
Ugy-tVxjRFasSXmv4hB4AaABAg        1
UgwcrlB6Ir8RfLzl79B4AaABAg        1
UgyCxl9sKbesf6nTLrB4AaABAg        1
UgwmhE_Ja1mz7o0_kFd4AaABAg        1
UgzmbFOO-Pj-NOtS-ch4AaABAg        1
Name: count, Length: 35090, dtype: int64

In [12]:
# preprocessing the youtube data further
# NOTE(review): the export cell writes "TATE_youtube.csv" while this cell reads
# 'Tate_youtube.csv'; on a case-sensitive filesystem these are different
# files. Confirm which name is intended.
df = pd.read_csv('Tate_youtube.csv')
# 'Unnamed: 0' only exists when the CSV was saved together with its index.
# errors='ignore' lets this cell also run on a CSV written with index=False,
# where the column is absent and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [70]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,ID,ParentID,Type,Content,Upvotes,Date,VideoName
0,UgzcO3bVC_-SqlfnIux4AaABAg,tgNGrT3B1ik,comment,This girl is 100% wrong,0,2023-10-04 13:41:28+00:00,HEATED Debate: Is Andrew Tate Good For Society?
1,UgyK1FqoMMcyoTnwI3F4AaABAg,tgNGrT3B1ik,comment,Not even Candace Owens couldn’t articulate tho...,0,2023-10-04 13:41:22+00:00,HEATED Debate: Is Andrew Tate Good For Society?
2,UgyfqLmMbxkgpmm1NXx4AaABAg,tgNGrT3B1ik,comment,Guys 100% right? She’s totally wrong.,0,2023-10-04 13:41:08+00:00,HEATED Debate: Is Andrew Tate Good For Society?
3,Ugw6q4S5KYcIOVJ93pJ4AaABAg,tgNGrT3B1ik,comment,Does she keep this same energy with the Kardas...,0,2023-10-04 13:41:02+00:00,HEATED Debate: Is Andrew Tate Good For Society?
4,UgymTqI0qab6pfv1P1l4AaABAg,tgNGrT3B1ik,comment,She’s spot on. Adam had TERRIBLE points and th...,0,2023-10-04 13:40:51+00:00,HEATED Debate: Is Andrew Tate Good For Society?


In [61]:
# Group the rows by ParentID and sort each group into one of two dicts:
#   video_no_reply -> groups made up of 'comment' rows only
#   video_reply    -> groups containing at least one non-'comment' row
# NOTE(review): the scraping cell gives top-level comments ParentID = video ID
# and replies ParentID = comment ID, so groups keyed by a video ID land in
# video_no_reply even when their comments do have replies. Confirm that this
# is the intended split.
videogroup = df.groupby('ParentID')

video_reply = {}
video_no_reply = {}

for parent_id, rows in videogroup:
    comment_rows = rows[rows['Type'] == "comment"]

    if len(comment_rows) != len(rows):
        video_reply[parent_id] = rows
        continue
    video_no_reply[parent_id] = comment_rows


In [66]:
# Stack every group collected in video_no_reply into a single frame,
# discarding the original row labels.
df_no_reply = pd.concat(list(video_no_reply.values()), ignore_index=True)
# df_yes_reply = pd.concat(video_reply.values(), ignore_index=True)

In [82]:
# Goal: get the ParentID of the top-level comments.
# Preview the first rows of the frame built from video_no_reply.
df_no_reply.head()

Unnamed: 0,ID,ParentID,Type,Content,Upvotes,Date,VideoName
0,Ugzdbk-tdiG_yY9a9ld4AaABAg,-LWj3g_geR8,comment,What do you think about marriage?,9,2023-09-14 12:59:43+00:00,Andrew Tate ANNOUNCED His Marriage
1,Ugw_srvaF0Jn5zNBeYZ4AaABAg,-LWj3g_geR8,comment,From him saying &quot;I have 8 wifes and 18 ki...,0,2023-10-04 10:50:47+00:00,Andrew Tate ANNOUNCED His Marriage
2,UgwwNV2tCcMdTNy2fF14AaABAg,-LWj3g_geR8,comment,He is foolish person. He change decision alway...,0,2023-10-03 15:56:45+00:00,Andrew Tate ANNOUNCED His Marriage
3,Ugwq_fn-t1_dBkA63694AaABAg,-LWj3g_geR8,comment,bro said he will have children he’s muslim and...,1,2023-10-03 15:08:26+00:00,Andrew Tate ANNOUNCED His Marriage
4,UgxXqjxx9BVsvdfXkkJ4AaABAg,-LWj3g_geR8,comment,pimp,0,2023-10-03 05:19:43+00:00,Andrew Tate ANNOUNCED His Marriage


In [77]:
# Collect one video's complete discussion: its top-level comments plus every
# reply whose ParentID is one of those comments.
video_id = "-LWj3g_geR8"

is_top_level = df['ParentID'].eq(video_id) & df['Type'].eq("comment")
top_level_comments = df.loc[is_top_level]

top_level_comment_ids = top_level_comments['ID'].tolist()

# A row belongs to the discussion if it is one of the top-level comments
# itself or points at one of them through ParentID.
in_thread = df['ID'].isin(top_level_comment_ids) | df['ParentID'].isin(top_level_comment_ids)
all_comments_replies = df.loc[in_thread]


In [78]:
# Display the selected video's top-level comments together with their replies.
all_comments_replies

Unnamed: 0,ID,ParentID,Type,Content,Upvotes,Date,VideoName
41534,Ugzdbk-tdiG_yY9a9ld4AaABAg,-LWj3g_geR8,comment,What do you think about marriage?,9,2023-09-14 12:59:43+00:00,Andrew Tate ANNOUNCED His Marriage
41535,Ugzdbk-tdiG_yY9a9ld4AaABAg.9ud_wHcSH_b9vGwAEZEkto,Ugzdbk-tdiG_yY9a9ld4AaABAg,reply,tree is honestly the way to go,0,2023-09-30 05:03:36+00:00,Andrew Tate ANNOUNCED His Marriage
41536,Ugzdbk-tdiG_yY9a9ld4AaABAg.9ud_wHcSH_b9uf3qvTApz7,Ugzdbk-tdiG_yY9a9ld4AaABAg,reply,With a Prenup,1,2023-09-15 02:49:06+00:00,Andrew Tate ANNOUNCED His Marriage
41537,Ugzdbk-tdiG_yY9a9ld4AaABAg.9ud_wHcSH_b9ueru0KxvLh,Ugzdbk-tdiG_yY9a9ld4AaABAg,reply,It shouldn’t cost a lot,0,2023-09-15 00:55:55+00:00,Andrew Tate ANNOUNCED His Marriage
41538,Ugw_srvaF0Jn5zNBeYZ4AaABAg,-LWj3g_geR8,comment,From him saying &quot;I have 8 wifes and 18 ki...,0,2023-10-04 10:50:47+00:00,Andrew Tate ANNOUNCED His Marriage
...,...,...,...,...,...,...,...
153568,UgwRATQetvxXL2DFG7J4AaABAg.9udb4rLMFXv9vProwAsatR,UgwRATQetvxXL2DFG7J4AaABAg,reply,@KAVINRAJ R and no he didn’t start a drug casi...,0,2023-10-03 16:18:46+00:00,Andrew Tate ANNOUNCED His Marriage
153569,UgwRATQetvxXL2DFG7J4AaABAg.9udb4rLMFXv9vPrD_1oao0,UgwRATQetvxXL2DFG7J4AaABAg,reply,@KAVINRAJ R well haven&#39;t seen or heard tha...,0,2023-10-03 16:13:32+00:00,Andrew Tate ANNOUNCED His Marriage
153570,UgwRATQetvxXL2DFG7J4AaABAg.9udb4rLMFXv9vPqYK8giOC,UgwRATQetvxXL2DFG7J4AaABAg,reply,@KAVINRAJ R he is a false prophet but meaningl...,0,2023-10-03 16:07:38+00:00,Andrew Tate ANNOUNCED His Marriage
153571,UgwRATQetvxXL2DFG7J4AaABAg.9udb4rLMFXv9vPpqpx0sL6,UgwRATQetvxXL2DFG7J4AaABAg,reply,He is running drug casino,0,2023-10-03 16:01:33+00:00,Andrew Tate ANNOUNCED His Marriage


In [None]:
# Filter the videos to find each parent comment (only video_no_reply is needed)

In [1]:
jupyter nbconvert --to pdf Youtube-scrape.ipynb

[NbConvertApp] Converting notebook Youtube-scrape.ipynb to pdf
[NbConvertApp] Writing 46486 bytes to notebook.tex
[NbConvertApp] Building PDF
Traceback (most recent call last):
  File "/Users/amayiyer/Downloads/anaconda3/bin/jupyter-nbconvert", line 11, in <module>
    sys.exit(main())
             ^^^^^^
  File "/Users/amayiyer/Downloads/anaconda3/lib/python3.11/site-packages/jupyter_core/application.py", line 280, in launch_instance
    super().launch_instance(argv=argv, **kwargs)
  File "/Users/amayiyer/Downloads/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/Users/amayiyer/Downloads/anaconda3/lib/python3.11/site-packages/nbconvert/nbconvertapp.py", line 412, in start
    self.convert_notebooks()
  File "/Users/amayiyer/Downloads/anaconda3/lib/python3.11/site-packages/nbconvert/nbconvertapp.py", line 590, in convert_notebooks
    self.convert_single_notebook(notebook_filename)
  File "/Users/amayiyer/Dow