In [3]:
#importing libraries used
import requests
import pandas as pd
import time

# Data Collection with Pushshift API

In [4]:
#pushshift api
url = 'https://api.pushshift.io/reddit/search/submission'

In [5]:
#data collection for The Onion subreddit

#creating empty dataframe to house data scrapped
onion_df = pd.DataFrame() 

#parameters for pushshift api, epoch value of 1635120000 is 25/10/2021 00:00
params_onion = {"subreddit" : "theonion",
                  "fields": ['subreddit', 'title', 'removed_by_category', 'created_utc'],
                  "size": 100,
                  "before": 1635120000}

#loop to collect at least 2000 data points as pushshift is limited to 100 posts per run
while len(onion_df) < 2000:
    onion_res = requests.get(url, params_onion)
    
    #checks if code is good to go
    if onion_res.status_code == 200:
        onion_add_df = pd.DataFrame(onion_res.json()['data'])
        
        #checks and removes deleted posts
        onion_df = onion_df.append(onion_add_df[onion_add_df['removed_by_category'].isnull()])
        
        #checks and removes duplicate posts
        onion_df.drop_duplicates(subset=['title'],inplace=True)
        
        #updates parameters's date to last post's
        params_onion.update({"before":(onion_df["created_utc"].iloc[-1])})
        
#if api is overloaded, i.e, res status code = 493, rests for 1 second before re-attempting collection        
    else:  
        time.sleep(1)

In [6]:
#data collection for News subreddit

#creating empty dataframe to house data scrapped
news_df = pd.DataFrame()

#parameters for pushshift api, epoch value of 1635120000 is 25/10/2021 00:00
params_news = {"subreddit" : "news",
                  "fields": ['subreddit', 'title', 'removed_by_category', 'created_utc',''],
                  "size": 100,
                  "before": 1635120000}

#loop to collect at least 2000 data points as pushshift is limited to 100 posts per run
while len(news_df) < 2000:
    news_res = requests.get(url, params_news)
    
    #checks if code is good to go
    if news_res.status_code == 200:
        news_add_df = pd.DataFrame(news_res.json()['data'])
        
        #checks and removes for deleted posts
        news_df = news_df.append(news_add_df[news_add_df['removed_by_category'].isnull()])
        
        #checks and removes duplicate posts
        news_df.drop_duplicates(subset=['title'],inplace=True)
        
        #updates parameters's date to last post's
        params_news.update({"before":(news_df["created_utc"].iloc[-1])})
        
#if api is overloaded, i.e, res status code = 493, rests for 1 second before re-attempting collection 
    else: 
        time.sleep(1)

In [7]:
#check no. of posts scrapped 
print("No. of The Onion title: " + str(len(onion_df)))
print("No. of News Title: " + str(len(news_df)))

No. of The Onion title: 2018
No. of News Title: 2011


# Basic Data Cleaning

In [8]:
#check for duplicate posts
print("The Onion duplicates: " + str(onion_df[['title']].duplicated().sum()))
print("News duplicates: " + str(news_df[['title']].duplicated().sum()))
print("\n")
 
#check for null entries
print("No. of null entries for The Onion: " + "\n"  + str(onion_df.isnull().sum()))
print("No. of null entries for News: " + "\n" + str(news_df.isnull().sum()))

The Onion duplicates: 0
News duplicates: 0


No. of null entries for The Onion: 
created_utc               0
subreddit                 0
title                     0
removed_by_category    2018
dtype: int64
No. of null entries for News: 
created_utc               0
removed_by_category    2011
subreddit                 0
title                     0
dtype: int64


In [9]:
#preliminary look at data
print(onion_df.info)
print(news_df.info)

<bound method DataFrame.info of     created_utc subreddit                                              title  \
0    1635090051  TheOnion  Smithsonian Acquires Arms Of Kermit The Frog P...   
1    1634958726  TheOnion  Less Popular Friend Only Included In Suicide P...   
3    1634940672  TheOnion  Crypto-Averse Man Would Prefer Investing In Tr...   
4    1634910709  TheOnion  God Loses Pouch Filled With Crystals That Give...   
7    1634872463  TheOnion  Retired NFL Player Touts Sports Betting App As...   
..          ...       ...                                                ...   
95   1580708304  TheOnion  The Onion is made a podcast called "The Topica...   
96   1580691176  TheOnion  ‘I’m Just Here For The Commercials,’ Jokes Man...   
97   1580669936  TheOnion     My god how much Budget did they have for this?   
98   1580662117  TheOnion      Snacks Distract Lawmakers From Horrors of War   
99   1580589888  TheOnion  Onion Talks: Hypothetically It Would Be Okay T...   

   remo

Index seems to be messed up, will reset index. Since 'remove_by_category' already helped sifted deleted posts, it is no longer needed and column can be dropped.

In [10]:
#reset index for both data sets
onion_df.reset_index(inplace=True)
news_df.reset_index(inplace=True)

In [11]:
#removal of old index and removed_by_category columns
onion_df.drop(["index", "removed_by_category"], axis=1, inplace=True)
news_df.drop(["index", "removed_by_category"], axis=1, inplace=True)

In [12]:
#check if columns were dropped
print(onion_df.columns)
print(news_df.columns)

Index(['created_utc', 'subreddit', 'title'], dtype='object')
Index(['created_utc', 'subreddit', 'title'], dtype='object')


In [13]:
#exporting collected data
onion_df.to_csv('theonion', index=False)
news_df.to_csv('news', index=False)