# Pushshift API for Reddit Data

Imports and Function Definitions for Reddit Data Extraction

In [97]:
# Import necessary libraries
from psaw import PushshiftAPI
import datetime as dt
import pandas as pd
import codecs

# Initialize an instance of the Pushshift API client (used by get_dict_data_from_sub below)
api = PushshiftAPI()

# Get word count
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Words are delimited by any run of whitespace (spaces, tabs, newlines),
    so leading/trailing whitespace and repeated separators do not inflate
    the result. An empty or whitespace-only string yields 0.

    Args:
        text: The string to count words in.

    Returns:
        The number of words as an int.
    """
    # str.split() with no arguments splits on whitespace runs and drops
    # empty strings; counting space characters instead would report one
    # word too few for ordinary text and miscount repeated spaces.
    return len(text.split())

# Gathers desired data from subreddit
def get_dict_data_from_sub(subreddit, keyword, searchstartdate, limitofresults):
    """Search a subreddit through Pushshift and dump the matches to a TSV file.

    Queries the module-level ``api`` client for submissions in ``subreddit``
    that match ``keyword``, keeps one record per submission id, and writes
    them to ``<subreddit>_submissions.tsv`` in the current working directory
    (UTF-8; an existing file is overwritten). Each row holds four
    tab-separated columns: submission id, creation timestamp, word count and
    the submission text.

    Args:
        subreddit: Name of the subreddit to search.
        keyword: Search term, passed to the API as ``q``.
        searchstartdate: Lower bound of the search, passed to the API as
            ``after`` (the caller in this file passes an integer epoch
            timestamp).
        limitofresults: Maximum number of results to fetch; ``None`` fetches
            as many as are available.

    Returns:
        None. The result is the file written to disk.
    """
    submissions = api.search_submissions(
        after=searchstartdate,      # Start date of search
        subreddit=subreddit,        # Subreddit to search
        filter=['id', 'selftext'],  # Filter by fields of interest
        limit=limitofresults,       # Number of desired results (set limit = None when you want to get as many as needed)
        q=keyword,                  # Will only return results that contain this keyword
    )

    # Keyed by submission id, so a repeated id keeps only its latest record.
    # Values are (creation date, word count, text).
    data_dict = {}

    for submission in submissions:
        # Read fields by name instead of by tuple position: the position of a
        # field depends on which fields the API returned for that submission.
        # NOTE(review): assumes psaw exposes the returned fields (plus the
        # `created_utc` it requests itself) as attributes — confirm against
        # the installed psaw version.
        raw_text = getattr(submission, 'selftext', None) or ''

        # Collapse every whitespace run (newlines, tabs, carriage returns)
        # into one space: this keeps words on adjacent lines from being glued
        # together and keeps the text from breaking the one-row-per-record,
        # tab-separated layout of the output file.
        text = ' '.join(str(raw_text).split())

        data_dict[submission.id] = (submission.created_utc, word_count(text), text)

    with codecs.open(f'{subreddit}_submissions.tsv', 'w', encoding='utf8') as f:
        for submission_id, (creation_date, n_words, text) in data_dict.items():
            f.write(f"{submission_id}\t{creation_date}\t{n_words}\t{text}\n")

Now go through the subreddits and get the data for each of them. (So far only `careerguidance` is actually fetched below.)

In [98]:
# Subreddits of interest, each mapped to a (year, month, day) tuple that is
# used as the search start date for that subreddit.
# The empty-string placeholder entries were dropped: duplicate keys in a dict
# literal silently collapse into a single '' -> () entry, which cannot be
# turned into a date. Add further subreddits here as real entries.
# NOTE(review): 'carreerchange' looks like a misspelling of 'careerchange' —
# confirm the subreddit name before fetching it.
subs_and_dates = {
    'careerguidance': (2011, 12, 24),
    'careeradvice': (2010, 1, 12),
    'cscareerquestions': (2011, 3, 19),
    'carreerchange': (2013, 8, 3),
    'FinancialCareers': (2011, 3, 4),
    'LifeProTips': (2010, 10, 25),
    'ADHD': (2008, 10, 28),
    'productivity': (2008, 1, 25),
}

# Fetch every 'burnout' submission for r/careerguidance (limit=None means no
# cap on the number of results). The start date is read from the table above
# so the date is defined in one place only.
# NOTE: the datetime is naive, so timestamp() interprets it in the local time
# zone of the machine running this cell.
start_date = int(dt.datetime(*subs_and_dates['careerguidance']).timestamp())
get_dict_data_from_sub('careerguidance', 'burnout', start_date, None)

