# Scraping public data on 9gag profiles

In [64]:
import pandas as pd
import numpy as np
import string
import json
import time
import nltk
import os
from PIL import Image
from pathlib import Path
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from wordcloud import WordCloud
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service

nltk.download('popular')

In [65]:
# This function allows to understand different kinds of date displayed by the app
# Returns a date object

def transform_date(date):
    if date[-1] == 'd':
        nb_days = int(date[:-1])
        date_object = datetime.now() + timedelta(days = -nb_days)
    
    elif date[-1] == 'h':
        nb_hours = int(date[:-1])
        date_object = datetime.now() + timedelta(hours = -nb_hours)
    
    elif date[-1] == 'm':
        nb_minutes = int(date[:-1])
        date_object = datetime.now() + timedelta(minutes = -nb_minutes)
    
    else:
        date_list = date.split(' ')
        if len(date_list) == 2:
            actual_year = datetime.strftime(datetime.now(),'%y')
            date += ' '+actual_year
        date_object = datetime.strptime(date, '%d %b %y')
    return(date_object)

In [63]:
# Defining user name to scrap
user = 'bananacoco'
# user = 'i_just_farted'
# user = 'we_bang_ok'
# user = 'l33tlolkbi'
# user = 'radicaledward'
# user = 'loomyboomy'
# user = 'jonnypetaurus56'

# SCROLLING THROUGH POSTS

In [None]:
# Opening Edge navigator driver that should have been dowloaded and stored previously
service = Service('./edgedriver_win64/msedgedriver.exe')
driver = webdriver.Edge(service=service)

# Displaying user profile on web browser
driver.get(f'https://9gag.com/u/{user}/posts')

In [66]:
# Click to accept cookies
button = driver.find_element(By.XPATH, "/html/body/div/div/div/div/div/div/button[1]")
button.click()

In [67]:
# Scrolling down 800px at a time, until finding the <a.btn.end> element
# Saving data at each step otherwise it might disappear
# posts are packed into <stream-n> elements going from 0 to n

scroll_step = 0
stream_nb = 0
df_posts = pd.DataFrame()

while True:
    # starting with retreiving <stream> element
    try:
        stream = driver.find_element(By.ID, 'stream-'+str(stream_nb))
    
    # if <stream> element is not available
    except:
        # checking if this is the end of the whole list, if so: quit
        try:
            driver.find_element(By.CSS_SELECTOR, 'a.btn.end')
            break
        
        # if not the end, scrolling to load more, and going back to retreiving <stream> element
        except:
            scroll_step += 800 # Tuning tip : You might want to scroll more, but will possibly miss data
            driver.execute_script(f"window.scrollTo(0, {scroll_step});")
            time.sleep(1) # Tuning tip : You might want to wait less,  but will possibly miss data
            continue
    
    # when <stream> element is retreived, saving raw data and going up for the next <stream> element
    else:
        posts = stream.find_elements(By.CSS_SELECTOR, 'article')
        df_stream_posts = pd.DataFrame({'content':posts})
        df_stream_posts['header'] = df_stream_posts['content'].apply(lambda post: post.text)
        df_stream_posts['link'] = df_stream_posts['content'].apply(lambda post: post.find_element(By.CSS_SELECTOR, 'a.badge-evt.badge-track').get_attribute('href'))
        df_posts = pd.concat([df_posts,df_stream_posts[['header','link']]], axis=0)
        stream_nb += 1

In [45]:
# Extracting data from raw [header], splitting and saving it in dedicated columns

df_posts['section'] = df_posts['header'].apply(lambda head: head.split(' · ')[0])
df_posts['date'] = df_posts['header'].apply(lambda head: head.split(' · ')[1].split('\n')[0])
df_posts['title'] = df_posts['header'].apply(lambda head: head.split(' · ')[1].split('\n')[1])
df_posts['nb_up'] = df_posts['header'].apply(lambda head: head.split(' · ')[1].split('\n')[-5])
df_posts['nb_down'] = df_posts['header'].apply(lambda head: head.split(' · ')[1].split('\n')[-4])
df_posts['nb_comments'] = df_posts['header'].apply(lambda head: head.split(' · ')[1].split('\n')[-3])
df_posts.head()

Unnamed: 0,header,link,section,date,title,nb_up,nb_down,nb_comments
0,Funny · 25 Dec 21\nWhat's your flirt move?\n11...,https://9gag.com/gag/abgpEA8,Funny,25 Dec 21,What's your flirt move?,11K,135,180
1,Funny · 28 Nov 21\nAre you struggling currentl...,https://9gag.com/gag/a31Bq27,Funny,28 Nov 21,Are you struggling currently in your life? Me:,5.8K,226,91
0,Funny · 28 Nov 21\nClose one\nGIF\n1.3K\n47\n2...,https://9gag.com/gag/aBnYWQx,Funny,28 Nov 21,Close one,1.3K,47,27
1,Funny · 27 Jun 21\nAllez OPI OMI\n4.4K\n120\n1...,https://9gag.com/gag/aqjA5yP,Funny,27 Jun 21,Allez OPI OMI,4.4K,120,106
2,"Funny · 6 May 21\nDamnit, focus grandma\nGIF\n...",https://9gag.com/gag/aB2LxO1,Funny,6 May 21,"Damnit, focus grandma",3.3K,111,88


In [46]:
# former 9gag layout

# df_posts['section'] = df_posts['header'].apply(lambda head: head.split(' · ')[0])
# df_posts['date'] = df_posts['header'].apply(lambda head: head.split(' · ')[1])
# df_posts['title'] = df_posts['header'].apply(lambda head: head.split(' · ')[2].split('\n')[1])
# df_posts['nb_up'] = df_posts['header'].apply(lambda head: head.split(' · ')[2].split('\n')[2])
# df_posts['nb_down'] = df_posts['header'].apply(lambda head: head.split(' · ')[2].split('\n')[3])
# df_posts['nb_comments'] = df_posts['header'].apply(lambda head: head.split(' · ')[2].split('\n')[4])
# df_posts

In [47]:
# post-treating date and numbers

df_posts['date'] = df_posts['date'].apply(transform_date)
df_posts['nb_up'] = df_posts['nb_up'].apply(lambda x: int(float(x.replace('K','')) * 1000) if 'K' in x else x)
df_posts['nb_down'] = df_posts['nb_down'].apply(lambda x: int(float(x.replace('K','')) * 1000) if 'K' in x else x)
df_posts['nb_comments'] = df_posts['nb_comments'].apply(lambda x: int(float(x.replace('K','')) * 1000) if 'K' in x else x)
df_posts.head()

Unnamed: 0,header,link,section,date,title,nb_up,nb_down,nb_comments
0,Funny · 25 Dec 21\nWhat's your flirt move?\n11...,https://9gag.com/gag/abgpEA8,Funny,2021-12-25,What's your flirt move?,11000,135,180
1,Funny · 28 Nov 21\nAre you struggling currentl...,https://9gag.com/gag/a31Bq27,Funny,2021-11-28,Are you struggling currently in your life? Me:,5800,226,91
0,Funny · 28 Nov 21\nClose one\nGIF\n1.3K\n47\n2...,https://9gag.com/gag/aBnYWQx,Funny,2021-11-28,Close one,1300,47,27
1,Funny · 27 Jun 21\nAllez OPI OMI\n4.4K\n120\n1...,https://9gag.com/gag/aqjA5yP,Funny,2021-06-27,Allez OPI OMI,4400,120,106
2,"Funny · 6 May 21\nDamnit, focus grandma\nGIF\n...",https://9gag.com/gag/aB2LxO1,Funny,2021-05-06,"Damnit, focus grandma",3300,111,88


In [48]:
# Retreiving user info

pseudo = driver.find_element(By.CLASS_NAME, 'info-name').text
profile_pic_url = driver.find_element(By.CLASS_NAME, 'avatar-container').find_element(By.TAG_NAME, 'img').get_attribute('src')
age = driver.find_element(By.CLASS_NAME, 'info-additional').find_element(By.XPATH, '//span[2]').text.split(' ')[0].replace(',','')

In [49]:
# Exporting collected data to local folder data_save/username

df_export = df_posts.drop(['header'], axis=1)
user_folder = f'./data_save/{user}'
if not os.path.exists(user_folder):
    os.makedirs(user_folder)
filepath = Path(f'./data_save/{user}/{user}_posts.csv')
df_export.to_csv(filepath,index=False)

In [11]:
# Importing and displaying data to check if everything went ok

# df = pd.read_csv(f'./data_save/{user}/{user}_posts.csv', parse_dates=['date'])
# df

# CREATING WORD CLOUD FROM TITLES

In [50]:
punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words("english")
stopwords.extend(["n't","'s","'m","...","''"])
lemmatizer = nltk.WordNetLemmatizer()

# Putting all titles' words into one big list after cleaning
def clean(string):
  words = nltk.word_tokenize(string.lower())
  no_stopunc = [lemmatizer.lemmatize(w) for w in words if w not in punctuation and w not in stopwords]
  return no_stopunc

all_words = []
df_export['title'].apply(lambda x: all_words.extend(clean(x)))

# Creating a proper mask with 255 and 0
# Custom tip : You might want to pick a different picture, but make sure it is saved as a monochrome format
mask = np.array(Image.open("./9gag_monochrome.bmp"))

def transform_format(val):
    if val == True:
        return 0
    else:
        return 255

transformed_mask = np.ndarray((mask.shape[0],mask.shape[1]), np.int32)

for i in range(len(mask)):
    transformed_mask[i] = list(map(transform_format, mask[i]))

# Create a word cloud object
wc = WordCloud(background_color="black", mask=transformed_mask, contour_width=0)

# Generate a wordcloud
wc.generate(' '.join(all_words))

# store to file
wc.to_file(f'./data_save/{user}/{user}_wcloud.png')

<wordcloud.wordcloud.WordCloud at 0x2273f430a30>

# SAVING USER INFO IN JSON

In [51]:
json_content = {
    'handle':user ,
    'alias': pseudo,
    'avatar_link': profile_pic_url,
    'nb_days': age,
    'scrap_date': datetime.strftime(datetime.now(),'%d-%m-%Y')
}

with open(f'./data_save/{user}/{user}_meta.json', 'w', encoding='utf-8') as f:
    json.dump(json_content, f, ensure_ascii=False, indent=4)

# CHASING COMMENTS
Same process as for posts

In [53]:
# Loading comments page on the same web browser instance
driver.get(f'https://9gag.com/u/{user}/comments')

In [54]:
# Scrolling through comments and saving data along the way

scroll_step = 0
stream_nb = 0
df_reactions = pd.DataFrame()

while True:
    try:
        stream = driver.find_element(By.ID, 'stream-'+str(stream_nb))
    
    except:
        try:
            driver.find_element(By.CSS_SELECTOR, 'a.btn.end')
            break
        
        except:
            scroll_step += 800
            driver.execute_script(f"window.scrollTo(0, {scroll_step});")
            time.sleep(1)
            continue
    
    else:#no error
        reactions = stream.find_elements(By.CSS_SELECTOR, 'article')
        df_stream_posts = pd.DataFrame({'content':reactions})
        df_stream_posts['header'] = df_stream_posts['content'].apply(lambda post: post.text)
        df_stream_posts['link'] = df_stream_posts['content'].apply(lambda post: post.find_element(By.CSS_SELECTOR, 'a.badge-evt.badge-track').get_attribute('href'))
        df_reactions = pd.concat([df_reactions,df_stream_posts[['header','link']]], axis=0)
        stream_nb += 1

In [55]:
# Splitting data into proper columns

df_reactions['section'] = df_reactions['header'].apply(lambda head: head.split(' · ')[0])
df_reactions['date'] = df_reactions['header'].apply(lambda head: head.split(' · ')[1].split('\n')[0])
df_reactions['title'] = df_reactions['header'].apply(lambda head: head.split(' · ')[1].split('\n')[1])
df_reactions['nb_up'] = df_reactions['header'].apply(lambda head: head.split(' · ')[1].split('\n')[-5])
df_reactions['nb_down'] = df_reactions['header'].apply(lambda head: head.split(' · ')[1].split('\n')[-4])
df_reactions['nb_comments'] = df_reactions['header'].apply(lambda head: head.split(' · ')[1].split('\n')[-3])
df_reactions.head()

Unnamed: 0,header,link,section,date,title,nb_up,nb_down,nb_comments
0,"Funny · 14h\nWhen the time hits, you gotta do ...",https://9gag.com/gag/aRr8Qnq,Funny,14h,"When the time hits, you gotta do it",8.9K,199,163
1,Funny · 15h\nJust boy things\nPlay\n7.4K\n341\...,https://9gag.com/gag/aWgeZNn,Funny,15h,Just boy things,7.4K,341,242
0,Funny · 3d\nKnow your unicorn\n5.4K\n152\n131\...,https://9gag.com/gag/awzPYPD,Funny,3d,Know your unicorn,5.4K,152,131
1,Funny · 4d\nFootball players when they reached...,https://9gag.com/gag/aVx42md,Funny,4d,Football players when they reached the pitch a...,8.6K,219,146
2,WTF · 5d\nThe personification of toxicity\nGIF...,https://9gag.com/gag/aE8ypXx,WTF,5d,The personification of toxicity,12K,849,1.3K


In [56]:
# Post-processing date and numbers

df_reactions['date'] = df_reactions['date'].apply(transform_date)
df_reactions['nb_up'] = df_reactions['nb_up'].apply(lambda x: int(float(x.replace('K','')) * 1000) if 'K' in x else x)
df_reactions['nb_down'] = df_reactions['nb_down'].apply(lambda x: int(float(x.replace('K','')) * 1000) if 'K' in x else x)
df_reactions['nb_comments'] = df_reactions['nb_comments'].apply(lambda x: int(float(x.replace('K','')) * 1000) if 'K' in x else x)
df_reactions.head()

Unnamed: 0,header,link,section,date,title,nb_up,nb_down,nb_comments
0,"Funny · 14h\nWhen the time hits, you gotta do ...",https://9gag.com/gag/aRr8Qnq,Funny,2022-06-12 22:02:55.509770,"When the time hits, you gotta do it",8900,199,163
1,Funny · 15h\nJust boy things\nPlay\n7.4K\n341\...,https://9gag.com/gag/aWgeZNn,Funny,2022-06-12 21:02:55.509770,Just boy things,7400,341,242
0,Funny · 3d\nKnow your unicorn\n5.4K\n152\n131\...,https://9gag.com/gag/awzPYPD,Funny,2022-06-10 12:02:55.509770,Know your unicorn,5400,152,131
1,Funny · 4d\nFootball players when they reached...,https://9gag.com/gag/aVx42md,Funny,2022-06-09 12:02:55.509770,Football players when they reached the pitch a...,8600,219,146
2,WTF · 5d\nThe personification of toxicity\nGIF...,https://9gag.com/gag/aE8ypXx,WTF,2022-06-08 12:02:55.509770,The personification of toxicity,12000,849,1300


In [57]:
# Saving all data locally

df_export = df_reactions.drop(['header'], axis=1)
user_folder = f'./data_save/{user}'
if not os.path.exists(user_folder):
    os.makedirs(user_folder)
filepath = Path(f'./data_save/{user}/{user}_reactions.csv')
df_export.to_csv(filepath,index=False)