# Pushshift API for Reddit Data

Imports and Function Definitions for Reddit Data Extraction

Now go through the subreddits and get data for all of them

In [4]:
from re import sub
from psaw import PushshiftAPI
import datetime as dt
import pandas as pd
import codecs
import os.path
from os import path


# Initialize API
api = PushshiftAPI()

# Get word count
def word_count(text):
    """Return the number of whitespace-separated words in *text*.

    Words are runs of non-whitespace characters, so repeated, leading or
    trailing whitespace does not inflate the count, and an empty or
    whitespace-only string yields 0.

    Args:
        text: The string to count words in.

    Returns:
        The number of words as an int.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # empty tokens, which is what a word count needs (counting ' ' characters
    # would report one fewer than the real count for ordinary text).
    return len(text.split())

# Gathers desired data from subreddit
def get_dict_data_from_sub(subreddit, keyword, search_start_date, limit_of_results, override):
    """Search a subreddit for submissions matching a keyword and save them to disk.

    Two files are written to the current working directory:

    * ``{subreddit}_submissions_raw.txt`` - one raw submission object per line.
    * ``{subreddit}_submissions.tsv`` - one row per submission with the columns
      id, creation date, word count and text, separated by tabs.

    Args:
        subreddit: Name of the subreddit to search.
        keyword: Only submissions matching this query string are returned.
        search_start_date: Start of the search window, passed as ``after``
            to the API.
        limit_of_results: Maximum number of results (``None`` for no limit).
        override: When true, existing output files are overwritten. When
            false, a file is only written if it does not exist yet.

    Returns:
        None. The results are only written to the files described above.
    """
    submissions = list(api.search_submissions(
        after=search_start_date,   # Start date of search
        subreddit=subreddit,       # Subreddit to search
        filter=['id','selftext'],  # Filter by fields of interest
        limit=limit_of_results,    # Number of desired results (set limit = None when you want to get as many as needed)
        q=keyword))                # Will only return results that contain this keyword

    # Save raw data files from above.
    # Write when the caller asked to override OR when there is nothing to
    # override yet; comparing `path.exists(...) == override` skipped the write
    # on a first run with override=True.
    raw_filename = f'{subreddit}_submissions_raw.txt'
    if override or not path.exists(raw_filename):
        with codecs.open(raw_filename, 'w', encoding='utf8') as f:
            for element in submissions:
                f.write(f"{element}\n")

    data_dict = {}

    # Gets the data from submissions and puts in dictionary
    # (keys = submission id: values = (submission date, word count, text/selftext)).
    # NOTE(review): the positional layout (0 = creation date, 1 = id,
    # 2 = selftext) is assumed from the original code - confirm against the
    # objects the API actually returns, e.g. when a submission has no selftext.
    for submission in submissions:
        submission_id = submission[1]
        text = str(submission[2]).replace('\n', '') # removing \n characters in text
        creation_date = submission[0]
        data_dict[submission_id] = (creation_date, word_count(text), text)

    # NOTE(review): tab characters inside the text are not escaped and would
    # add extra columns to the TSV row - verify whether that matters downstream.
    tsv_filename = f'{subreddit}_submissions.tsv'
    if override or not path.exists(tsv_filename):
        with codecs.open(tsv_filename, 'w', encoding='utf8') as f:
            for key, (creation_date, n_words, text) in data_dict.items():
                f.write(f"{key}\t{creation_date}\t{n_words}\t{text}\n")


# Subreddits to scrape, each mapped to the (year, month, day) from which the
# search should start.
subs_and_dates = {
    'careerguidance': (2010, 1, 1),
    'careeradvice': (2010, 1, 1),
    'cscareerquestions': (2010, 1, 1),
}
# Further candidates, currently disabled:
                # 'FinancialCareers': (2011, 3, 4),
                # 'LifeProTips': (2010, 10, 25),
                # 'ADHD': (2008, 10, 28),
                # 'productivity': (2008, 1, 25),
                # 'selfimprovement': (2008, 9, 10),
                # 'depression': (2000, 1, 1),
                # 'AskReddit': (2008, 1, 25),
                # 'college': (2008, 1, 25),
                # 'mentalhealth': (2008, 6, 12),
                # 'Anxiety': (2008, 9, 15),
                # 'Advice': (2008, 6, 26),
                # 'UCSD': (2009, 9, 11),
                # 'ucla': (2009, 1, 31),
                # 'berkeley': (2008, 2, 27),
                # 'princeton': (2009, 9, 12),
                # 'stanford': (2009, 9, 21)


# Fetch up to 10 'burnout' submissions per subreddit, overwriting any
# previously saved output files.
for key, date in subs_and_dates.items():
    subreddit = key
    # The (year, month, day) tuple becomes an integer epoch timestamp.
    start_date = int(dt.datetime(*date).timestamp())
    get_dict_data_from_sub(subreddit, 'burnout', start_date, 10, True)


# start_date = int(dt.datetime(2021, 1, 1).timestamp())

