In [42]:
import pandas as pd
import requests
import os

In [43]:
# Parameters
# The two subreddits whose submissions are pulled and later written to CSV.
subreddit_1 = 'Buddhism'
subreddit_2 = 'Stoicism'
# Passed to get_posts() as its stopping threshold: the number of posts with
# usable selftext (not removed/deleted/empty) to collect per subreddit.
# The returned frame can hold more rows, since whole batches are kept.
num_posts = 10000

# Pushshift search endpoints. submission_url matches the default `url` of
# get_posts(); neither variable is referenced by the cells visible here.
# NOTE(review): comment_url looks reserved for a comment pull — confirm or remove.
submission_url = 'https://api.pushshift.io/reddit/search/submission'
comment_url = 'https://api.pushshift.io/reddit/search/comment'

# Function to Pull from Pushshift Reddit API

In [44]:
def get_posts(subreddit, num_posts, before='1654041600',
              url='https://api.pushshift.io/reddit/search/submission',
              selftext_missing_list=('[removed]', '', '[deleted]'),
              timeout=30):
    """Page backwards through a subreddit's posts via the Pushshift API.

    Requests batches of up to 100 posts, each strictly older than the
    previous batch's last ``created_utc``, until at least ``num_posts`` posts
    with usable ``selftext`` have been seen or the API returns no more data.

    Parameters
    ----------
    subreddit : str
        Subreddit name sent as the ``subreddit`` query parameter.
    num_posts : int
        Stop once this many posts with usable selftext have been counted.
        Whole batches are kept, so the result may contain more rows than
        this, including rows whose selftext is missing or removed.
    before : str or int, default '1654041600'
        Epoch-seconds upper bound for the first request
        (1654041600 is 2022-06-01 00:00:00 UTC).
    url : str
        Search endpoint to query.
    selftext_missing_list : sequence of str
        ``selftext`` values that do not count toward ``num_posts``.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per retrieved post with every field the API returned, plus
        a ``readable_time`` column (``created_utc`` parsed as epoch seconds).
        Empty, without ``readable_time``, when the API returned nothing.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (e.g. rate limiting).
    """
    data = []
    count = 0

    while True:
        # query parameters for this page of results
        params = {'subreddit': subreddit,
                  'size': '100',
                  'before': before}

        # Fetch one batch; fail loudly on HTTP errors instead of hitting an
        # opaque JSON/KeyError further down.
        res = requests.get(url, params=params, timeout=timeout)
        res.raise_for_status()
        batch = res.json()['data']

        if not batch:
            print(f'No more posts to retrieve for {subreddit}')
            break

        # Count posts with usable text. Posts with no 'selftext' key at all
        # are simply not counted.
        count += sum(
            1 for post in batch
            if 'selftext' in post
            and post['selftext'] not in selftext_missing_list
        )

        # keep the whole batch, including posts without usable text
        data.extend(batch)

        # print verification of execution
        print(f'{subreddit}, status={res.status_code}, text posts pulled={count}, before={before}')

        # Next request starts from the last post of this batch.
        # NOTE(review): assumes the API returns newest-first — confirm.
        before = batch[-1]['created_utc']

        if count >= num_posts:
            break

    df = pd.DataFrame(data)
    # An empty result has no 'created_utc' column; skip the conversion
    # rather than raising KeyError.
    if not df.empty:
        df['readable_time'] = pd.to_datetime(df['created_utc'], unit='s')

    print(f'{len(df)} total posts pulled from r/{subreddit}\n')

    return df


A note on request rate: calling `sleep()` with a random delay of 0 to 5 seconds between requests can be used to stay under the API rate limit.

In testing, however, this function never hit the API rate limit, so no delay was added.

# Data Retrieval from Reddit

In [45]:
# Pull submissions for the first subreddit until enough text posts are found
df_a = get_posts(subreddit=subreddit_1, num_posts=num_posts)


Buddhism, status=200, text posts pulled=53, before=1654041600
Buddhism, status=200, text posts pulled=102, before=1653929137
Buddhism, status=200, text posts pulled=161, before=1653780819
Buddhism, status=200, text posts pulled=221, before=1653614037
Buddhism, status=200, text posts pulled=274, before=1653451537
Buddhism, status=200, text posts pulled=326, before=1653330058
Buddhism, status=200, text posts pulled=384, before=1653219077
Buddhism, status=200, text posts pulled=433, before=1653100997
Buddhism, status=200, text posts pulled=489, before=1652962533
Buddhism, status=200, text posts pulled=544, before=1652795521
Buddhism, status=200, text posts pulled=598, before=1652664118
Buddhism, status=200, text posts pulled=657, before=1652516010
Buddhism, status=200, text posts pulled=712, before=1652382690
Buddhism, status=200, text posts pulled=766, before=1652233203
Buddhism, status=200, text posts pulled=815, before=1652094570
Buddhism, status=200, text posts pulled=859, before=1651

In [46]:
# Pull submissions for the second subreddit until enough text posts are found
df_b = get_posts(subreddit=subreddit_2, num_posts=num_posts)

Stoicism, status=200, text posts pulled=96, before=1654041600
Stoicism, status=200, text posts pulled=187, before=1653576079
Stoicism, status=200, text posts pulled=282, before=1653171850
Stoicism, status=200, text posts pulled=377, before=1652821896
Stoicism, status=200, text posts pulled=466, before=1652457964
Stoicism, status=200, text posts pulled=564, before=1652028689
Stoicism, status=200, text posts pulled=659, before=1651625520
Stoicism, status=200, text posts pulled=751, before=1651284038
Stoicism, status=200, text posts pulled=843, before=1650917548
Stoicism, status=200, text posts pulled=935, before=1650600658
Stoicism, status=200, text posts pulled=1027, before=1650248292
Stoicism, status=200, text posts pulled=1124, before=1649807138
Stoicism, status=200, text posts pulled=1214, before=1649373617
Stoicism, status=200, text posts pulled=1303, before=1648910737
Stoicism, status=200, text posts pulled=1392, before=1648538787
Stoicism, status=200, text posts pulled=1484, befor

In [47]:
# Store the raw pulls as CSV files. exist_ok=True creates the directory only
# when needed, without a separate (racy) existence check.
os.makedirs('output', exist_ok=True)

df_a.to_csv(f'output/{subreddit_1}_raw_data.csv')
df_b.to_csv(f'output/{subreddit_2}_raw_data.csv')


In [48]:
# Write a slimmed-down copy of each pull containing only the columns of interest
fields = ['subreddit', 'id', 'author', 'title', 'selftext', 'readable_time']

df_a.loc[:, fields].to_csv(f'output/{subreddit_1}_select.csv')
df_b.loc[:, fields].to_csv(f'output/{subreddit_2}_select.csv')