# pushshift.io

https://pushshift.io/ keeps a copy of Reddit posts and comments for big-data and social-media ingest and analysis.

It provides free APIs for accessing subreddit data, and it has much higher limits on how much data can be requested at once: 1000 elements vs. Reddit's 100.

No developer registration is required.

# pushshift.io API Documentation

https://pushshift.io/api-parameters/

In [1]:
import requests
import time
import pprint

# Get List of Posts

`before` - epoch time in seconds, e.g., 1633107473

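`limit` - number of entries per request, e.g., 1000 (in the code below, `get_posts` sends this value as `size` and `get_comments` sends it as `limit`)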

In [11]:
def get_posts(pushshift_url, subreddit_name, before_time, max_size=100):
    """Fetch one page of submissions for a subreddit, newest first.

    Args:
        pushshift_url: Base URL of the submission search endpoint.
        subreddit_name: Name of the subreddit, e.g. ``'migraine'``.
        before_time: Epoch time sent as the ``before`` query parameter.
        max_size: Number of entries to request, sent as ``size``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # ``size`` must come from the ``max_size`` argument; reading a
    # module-level variable here would ignore what the caller passed in.
    query = {
        'subreddit': subreddit_name,
        'sort': 'desc',
        'sort_type': 'created_utc',
        'before': before_time,
        'size': max_size,
    }
    # Let requests assemble and URL-encode the query string.
    req = requests.get(f'{pushshift_url}/', params=query)
    # Fail loudly on HTTP errors instead of decoding an error page.
    req.raise_for_status()
    return req.json()

def get_comments(pushshift_url, comment_id, max_size=100):
    """Fetch comments attached to a submission.

    Args:
        pushshift_url: Base URL of the comment search endpoint.
        comment_id: Identifier sent as the ``link_id`` query parameter
            (the notebook passes a submission's ``id`` here).
        max_size: Number of entries to request, sent as ``limit``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    query = {
        'link_id': comment_id,
        'limit': max_size,
    }
    # Let requests assemble and URL-encode the query string.
    req = requests.get(f'{pushshift_url}/', params=query)
    # Fail loudly on HTTP errors instead of decoding an error page.
    req.raise_for_status()
    return req.json()

In [3]:
# Query settings for the submission search endpoint.
pushshift_url = 'https://api.pushshift.io/reddit/search/submission'
subreddit_name = 'migraine'
max_entries = 1000
before_time = int(time.time())  # start paging from "now" (epoch seconds)

# Fetch the first page; fall back to an empty list if 'data' is missing.
posts = get_posts(
    pushshift_url=pushshift_url,
    subreddit_name=subreddit_name,
    before_time=before_time,
    max_size=max_entries,
)
data = posts.get('data', [])

# Summarise the page and show its two boundary entries.
print(f'Number of entries: {len(data)}')
if data:
    print('First entry:')
    pprint.pprint(data[0])
    print('')
    print('Last entry:')
    pprint.pprint(data[-1])

Number of entries: 100
First entry:
{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'anthopleuraxantho',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_u0p53',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1633106400,
 'domain': 'self.migraine',
 'full_link': 'https://www.reddit.com/r/migraine/comments/pzbtkq/day_after_migraine_symptoms/',
 'gildings': {},
 'id': 'pzbtkq',
 'is_created_from_ads_ui': False,
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follo

# Request Next Page

To request the next page, find the `created_utc` field in the last entry of the current page and use it as the `before` field in the next request.

In [4]:
# Results are sorted newest first, so the last entry of the current page is
# its oldest one; its timestamp becomes the upper bound for the next page.
before_time = data[-1]['created_utc']

# Fetch the next page; fall back to an empty list if 'data' is missing.
posts = get_posts(
    pushshift_url=pushshift_url,
    subreddit_name=subreddit_name,
    before_time=before_time,
    max_size=max_entries,
)
data = posts.get('data', [])

# Summarise the page and show its two boundary entries.
print(f'Number of entries: {len(data)}')
if data:
    print('First entry:')
    pprint.pprint(data[0])
    print('')
    print('Last entry:')
    pprint.pprint(data[-1])

Number of entries: 100
First entry:
{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'unmaredDlite',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_8ofzwrq6',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1632875739,
 'domain': 'self.migraine',
 'full_link': 'https://www.reddit.com/r/migraine/comments/pxjvma/rant_dad_thinks_scent_triggers_are_bs/',
 'gildings': {},
 'id': 'pxjvma',
 'is_created_from_ads_ui': False,
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 '

# Get Comments

To retrieve comments, first get the post's `id` field and pass it as `link_id` to the `/reddit/comment/search` API.

In [12]:
# Switch to the comment search endpoint.
pushshift_url = 'https://api.pushshift.io/reddit/comment/search'

# Look up comments for the first post of the current page; fall back to an
# empty list if 'data' is missing from the response.
comments = get_comments(pushshift_url=pushshift_url, comment_id=data[0]['id'])
comment_data = comments.get('data', [])

# Summarise the result and show its two boundary comments.
print(f'Number of entries: {len(comment_data)}')
if comment_data:
    print('First comment:')
    pprint.pprint(comment_data[0])
    print('')
    print('Last comment:')
    pprint.pprint(comment_data[-1])

Number of entries: 10
First comment:
{'all_awardings': [],
 'approved_at_utc': None,
 'associated_award': None,
 'author': 'PoppyRyeCranberry',
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_36dcnhor',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'banned_at_utc': None,
 'body': 'I just went down a rabbit hole of articles - depending on where in '
         'the spectrum of internet article to scientific writing they would '
         'best respond, there\'s lots out there.  Searching "smell trigger '
         'migraine" gets you lots of general confirmation, but searching '
         '"olfactory stimulation nervous system" or "olfactory stimulation '
         'migraine" gets you to the more medical/scientific writing.\n'
     