In [4]:
#import libraries: standard library first, then third-party
import datetime as dt
import time # to delay the scrape time
from pathlib import Path # to create the output directory before export

import pandas as pd
import requests # to call the Pushshift API

## Function to query the Pushshift API for a subreddit's submissions

In [64]:
def _utc_date(epoch_seconds):
    """Return the UTC calendar date for a Unix timestamp in seconds (NaT if missing)."""
    if pd.isna(epoch_seconds):
        return pd.NaT
    # Convert explicitly in UTC so the result does not depend on the machine's timezone.
    return dt.datetime.fromtimestamp(epoch_seconds, tz=dt.timezone.utc).date()


def inquire(subreddit, days, n, size=500, pause=2, timeout=30):
    """Collect self-post submissions for a subreddit from the Pushshift API.

    Sends ``n`` requests. Request ``i`` (1-based) asks for up to ``size``
    submissions created after ``days * i`` days ago. The batches are combined,
    reduced to a fixed set of columns, de-duplicated and filtered to rows
    where ``is_self`` is True.

    Parameters
    ----------
    subreddit : str
        Subreddit name passed to the API as the ``subreddit`` parameter.
    days : int
        Step, in days, between the ``after`` cut-offs of consecutive requests.
    n : int
        Number of requests to send. ``0`` sends none and returns an empty frame.
    size : int, optional
        Value of the ``size`` request parameter (default 500, as before).
    pause : float, optional
        Seconds to sleep after each request (default 2, as before).
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``title, selftext, subreddit, created_utc, author,
        num_comments, score, is_self`` plus ``time_date`` (the UTC calendar
        date of ``created_utc``). The index runs from 0 to ``len - 1``.
        A column missing from the API payload is present but filled with NaN.

    Raises
    ------
    AssertionError
        If any response has a status code other than 200.
    """
    col = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']

    #base url and parameters:
    base_url = 'https://api.pushshift.io/reddit/search/submission'

    #one DataFrame per non-empty response
    frames = []

    #repeat n times, starting from 1 and including n itself
    for i in range(1, n + 1):

        #send url request; the timeout keeps a stalled connection from hanging the notebook
        response = requests.get(base_url,
                                params = {
                                    'subreddit' : subreddit,
                                    'size' : size,
                                    'after' : f'{days*i}d'
                                },
                                timeout = timeout)

        #fail loudly on a bad request, before reporting the request as completed
        assert response.status_code == 200, (
            f'Request for {subreddit} after {days*i}d returned HTTP {response.status_code}'
        )

        #status message, only printed once the response is known to be good
        print(f'Completed requet for {subreddit} for {days*i} day(s)')

        #keep just the data section of the json payload
        batch = pd.DataFrame(response.json()['data'])

        #skip empty batches so they cannot disturb the dtypes of the combined frame
        if not batch.empty:
            frames.append(batch)

        #pause between requests
        time.sleep(pause)

    #combine every batch into one frame; no batches at all gives an empty frame
    if frames:
        combined = pd.concat(frames, sort = False, ignore_index = True)
    else:
        combined = pd.DataFrame()

    #reindex (instead of combined[col]) keeps the column set stable even when the
    #API omitted a column or returned nothing; then remove duplicated rows
    combined = combined.reindex(columns = col).drop_duplicates()

    #only keep is_self posts, so no repost; element-wise comparison also treats NaN as False.
    #reset_index returns a fresh frame, so the column assignment below is not on a slice
    combined = combined.loc[combined['is_self'] == True].reset_index(drop = True)

    #add the converted date column for each post
    combined['time_date'] = combined['created_utc'].map(_utc_date)

    #indicator for finish
    print('Request Completed')

    return combined
        

In [67]:
# r/e46: 10 requests, with the 'after' cut-off stepping back 60 days each time (60d ... 600d)
e_46 = inquire(subreddit='e46', days=60, n=10)

Completed requet for e46 for 60 day(s)
Completed requet for e46 for 120 day(s)
Completed requet for e46 for 180 day(s)
Completed requet for e46 for 240 day(s)
Completed requet for e46 for 300 day(s)
Completed requet for e46 for 360 day(s)
Completed requet for e46 for 420 day(s)
Completed requet for e46 for 480 day(s)
Completed requet for e46 for 540 day(s)
Completed requet for e46 for 600 day(s)
Request Completed


In [68]:
# number of rows returned for r/e46
e_46.shape[0]

2062

In [78]:
# r/E90: 15 requests, with the 'after' cut-off stepping back 60 days each time (60d ... 900d)
e_90 = inquire(subreddit='E90', days=60, n=15)

Completed requet for E90 for 60 day(s)
Completed requet for E90 for 120 day(s)
Completed requet for E90 for 180 day(s)
Completed requet for E90 for 240 day(s)
Completed requet for E90 for 300 day(s)
Completed requet for E90 for 360 day(s)
Completed requet for E90 for 420 day(s)
Completed requet for E90 for 480 day(s)
Completed requet for E90 for 540 day(s)
Completed requet for E90 for 600 day(s)
Completed requet for E90 for 660 day(s)
Completed requet for E90 for 720 day(s)
Completed requet for E90 for 780 day(s)
Completed requet for E90 for 840 day(s)
Completed requet for E90 for 900 day(s)
Request Completed


In [79]:
# number of rows returned for r/E90
e_90.shape[0]

1906

In [None]:
#export to CSV for EDA, and modeling.

#to_csv does not create missing folders, so make sure ./data exists first
Path('./data').mkdir(parents = True, exist_ok = True)

e_46.to_csv('./data/e_46.csv', index = False)
e_90.to_csv('./data/e_90.csv', index = False)