In [1]:
import math
import json
import requests
import itertools
import numpy as np
import pandas as pd
import time
from datetime import datetime, timedelta, timezone
import re

In [2]:
def make_request(uri, max_retries = 5, timeout = 30):
    """GET ``uri`` and return the decoded JSON body, retrying on failure.

    Parameters
    ----------
    uri : str
        Fully formed URL to request.
    max_retries : int
        Total number of attempts. The first ``max_retries - 1`` attempts are
        each preceded by a one-second pause and, on failure, followed by
        another one-second pause; the final attempt is made without a
        preceding pause and its error is allowed to propagate.
    timeout : float
        Seconds passed to ``requests.get`` so that a stalled connection
        counts as a failed attempt instead of blocking forever.

    Returns
    -------
    The object produced by ``json.loads`` on the response body.

    Raises
    ------
    AssertionError
        If the last attempt returns a status code other than 200.
    Exception
        Whatever the last attempt raises (connection error, timeout,
        invalid JSON, ...).
    """

    def fire_away(uri):
        response = requests.get(uri, timeout=timeout)
        # Raised explicitly (not via `assert`) so the check is still
        # performed when Python runs with -O; the exception type is kept
        # so existing callers observe the same failure as before.
        if response.status_code != 200:
            raise AssertionError(
                'unexpected HTTP status {} for {}'.format(response.status_code, uri))
        return json.loads(response.content)

    current_tries = 1
    while current_tries < max_retries:
        try:
            time.sleep(1)
            return fire_away(uri)
        except Exception:
            # `Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit stop the pull immediately
            # instead of being treated as a retryable failure.
            time.sleep(1)
            current_tries += 1
    # Last attempt: let any error surface to the caller.
    return fire_away(uri)

In [3]:
# Column names for the rows produced by `formulate`, in the same order.
columns = ['postid', 'author', 'created_utc', 'permalink', 'score', 'title', 'selftext']

# Any run of characters other than ASCII letters and digits.
_NON_ALNUM = re.compile("[^0-9a-zA-Z]+")

def formulate(p):
    """Flatten one post dict into a list ordered like ``columns``.

    Parameters
    ----------
    p : dict
        A post record. The keys read are ``id``, ``author_fullname``,
        ``created_utc``, ``permalink``, ``score``, ``title`` and
        ``selftext``; any of them may be absent.

    Returns
    -------
    list
        Seven values. The first five are copied through unchanged (``None``
        when the key is missing). ``title`` and ``selftext`` have every run
        of non-alphanumeric characters collapsed to a single space, and are
        ``None`` when the key is missing or its value is ``None``.
    """
    def clear(s):
        # A key that is present but holds None would make re.sub raise
        # TypeError; treat it the same as a missing key.
        if s is None:
            return None
        return _NON_ALNUM.sub(" ", s)

    res = [p.get(f) for f in ['id', 'author_fullname', 'created_utc', 'permalink', 'score']]
    res.append(clear(p.get('title')))
    res.append(clear(p.get('selftext')))
    return res



In [4]:
def pull_posts_for(subreddit, start_at, end_at, file_prefix='May_Jul_'):
    """Page through Pushshift submissions for ``subreddit`` and dump each page to CSV.

    Each page of up to 100 posts is requested with ``make_request``, turned
    into rows with ``formulate`` and written to
    ``<file_prefix><cursor>.csv`` in the current working directory, where
    ``cursor`` is the ``after`` value used for that request. Paging stops at
    the first page that does not contain exactly 100 posts.

    Parameters
    ----------
    subreddit : str
        Subreddit name inserted into the query string as-is.
    start_at, end_at : int
        Epoch seconds sent as the ``after`` / ``before`` query parameters.
    file_prefix : str
        Prefix for every CSV file written. Previously the first page was
        written under a different, stale prefix ('Jan_March_') than all
        later pages; all pages now share this one prefix.

    Returns
    -------
    list
        The raw post dicts of the LAST page fetched only (not the
        accumulated result); the complete pull lives in the CSV files.

    Notes
    -----
    The next cursor is the last post's ``created_utc`` minus one, so the
    boundary post is presumably fetched again on the following page
    (assuming ``after`` is exclusive -- TODO confirm against the API);
    de-duplicate on ``postid`` when concatenating the files.
    """
    SIZE = 100
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'
    URI_TEMPLATE += '&fields=selftext,title,created_utc,id,permalink,score,author_fullname'

    span = end_at - start_at
    cursor = start_at
    while True:
        page = make_request(
            URI_TEMPLATE.format(subreddit, cursor, end_at, SIZE))['data']

        rows = [formulate(p) for p in page]
        pd.DataFrame(rows, columns=columns).to_csv(file_prefix + str(cursor) + '.csv')

        # A page that is not completely full means there is nothing left.
        if len(page) != SIZE:
            return page

        # Step back one second from the last post so posts sharing its
        # timestamp are not skipped by the next request.
        next_cursor = page[-1]['created_utc'] - 1
        # If a whole page did not move the cursor forward, the same request
        # would repeat forever; force strict progress instead.
        if next_cursor <= cursor:
            next_cursor = cursor + 1
        cursor = next_cursor

        # Fraction of the requested time range covered so far.
        if span:
            print((cursor - start_at) / span)


In [5]:
# Time window for the pull, as UTC epoch seconds: 2021-07-01 up to 2021-08-01.
start_at = int(datetime(2021, 7, 1, tzinfo=timezone.utc).timestamp())
end_at = int(datetime(2021, 8, 1, tzinfo=timezone.utc).timestamp())

subreddit = 'wallstreetbets'
# pull_posts_for writes one CSV per page and prints the fraction of the
# window covered; the value it returns is only the last page fetched.
posts = pull_posts_for(subreddit, start_at, end_at)


0.002396580047789725
0.004120370370370371
0.006813022700119475
0.010393891875746715
0.013953853046594982
0.016953031660692952
0.018437126642771803
0.01926000597371565
0.02016465053763441
0.02138702210274791
0.022424208482676223
0.023272849462365592
0.024027031063321387
0.024940636200716845
0.025869922341696536
0.027134856630824374
0.02840725806451613
0.02981332138590203
0.03128024193548387
0.033096624850657105
0.035221400836320194
0.03699970131421745
0.03909460872162485
0.04356518817204301
0.04777292413381123
0.05036327658303465
0.05142323775388292
0.05229577359617682
0.053136200716845876
0.05395908004778972
0.0551008064516129
0.05608833632019116
0.057112455197132615
0.05829898446833931
0.059611708482676225
0.0612313321385902
0.06379517622461171
0.06638440860215054
0.06952620967741935
0.074723715651135
0.08218974014336917
0.09046482974910394
0.09257168458781362
0.0967827807646356
0.10076762246117085
0.10641278375149343
0.11358348267622462
0.11805667562724015
0.12312014635603345
0.12696