In [1]:
import sys
import datetime
import urllib.request
import time
import json
import os
import pandas
import multiprocessing as mp
import threading_jobs as tj

In [None]:
# for the following, string replace these values:
# %subreddit%
# %before_epoch%
# %after_epoch%
# %size%

# only difference is submission and num_comments
def getBaseSubmissionURL():
    """Return the Pushshift submission-search URL template.

    The template still contains the %subreddit%, %after_epoch%,
    %before_epoch% and %size% placeholders; results are sorted by
    num_comments.
    """
    template = (
        "https://api.pushshift.io/reddit/search/submission/"
        "?subreddit=%subreddit%"
        "&after=%after_epoch%"
        "&before=%before_epoch%"
        "&sort_type=num_comments"
        "&size=%size%"
    )
    return template

# only difference is comment and score
def getBaseCommentURL():
    """Return the Pushshift comment-search URL template.

    The template still contains the %subreddit%, %after_epoch%,
    %before_epoch% and %size% placeholders; results are sorted by score.
    """
    template = (
        "https://api.pushshift.io/reddit/search/comment/"
        "?subreddit=%subreddit%"
        "&after=%after_epoch%"
        "&before=%before_epoch%"
        "&sort_type=score"
        "&size=%size%"
    )
    return template

# build a request url with given input
def buildBasicRequest(baseUrl, subreddit, before_epoch, after_epoch, size):
    """Fill the placeholders of a URL template and return the finished URL.

    Every value must already be a string. Note the parameter order:
    before_epoch comes before after_epoch.
    """
    # applied in the same order as the placeholders are listed here
    substitutions = (
        ("%subreddit%", subreddit),
        ("%after_epoch%", after_epoch),
        ("%before_epoch%", before_epoch),
        ("%size%", size),
    )
    request_url = baseUrl
    for placeholder, value in substitutions:
        request_url = request_url.replace(placeholder, value)
    return request_url

# the city subreddits that every request loop iterates over
subreddits = "nyc losangeles chicago houston phoenix".split()

In [None]:
# epochs
def getEpochForDate(year, month, day):
    """Return the epoch seconds for midnight of the given date, as a string.

    The datetime is naive, so timestamp() interprets it in the machine's
    local time zone.
    NOTE(review): the API's timestamps are named created_utc - confirm
    that local midnight is the intended window boundary.
    """
    midnight = datetime.datetime(year, month, day, 0, 0)
    # midnight has no fractional seconds, so dropping the ".0" is lossless
    whole_seconds = int(midnight.timestamp())
    return str(whole_seconds)

# quick check: epoch string for local midnight on 2010-12-25
print(getEpochForDate(year=2010, month=12, day=25))

In [None]:
# here is how we check what file to write to
# this is also going to initialize the files that we need if it isn't there

def getFileName(subreddit, year, base_dir=None):
    """Return the CSV path for a subreddit/year, creating the file if needed.

    The path is
    {base_dir}/{subreddit}/{year}_{subreddit}_submissions_and_comments.csv.
    If the file does not exist yet, the subreddit directory is created and
    the file is initialised with the CSV header row.

    Parameters:
        subreddit: subreddit name, used as the directory and in the file name.
        year: year of the data (int or str), used in the file name.
        base_dir: root output directory; defaults to
            D:\\social_media_analytics\\reddit_content.

    Returns:
        The path of the CSV file as a string.
    """
    if base_dir is None:
        # raw string: "\s" and "\%" are not valid escape sequences and
        # trigger warnings on newer Python versions when written unescaped
        base_dir = r'D:\social_media_analytics\reddit_content'
    subreddit_dir = os.path.join(base_dir, subreddit)
    file_name = os.path.join(
        subreddit_dir,
        '%s_%s_submissions_and_comments.csv' % (year, subreddit),
    )
    if not os.path.isfile(file_name):
        print("writing new file for " + subreddit + ":" + str(year))
        # open() cannot create missing parent directories by itself
        os.makedirs(subreddit_dir, exist_ok=True)
        with open(file_name, "a", encoding="utf-8") as f:
            f.write("created_utc,author,body,type\n")
    return file_name


# print(getFileName("houston", 2010))

In [None]:
# here is the meat, here is where the calls happen

# the approach is to enter this with the timeframe,
# then we're going to make both calls for each of our five cities

# for each city we request up to 200 comments (sorted by score)
# and up to 200 submissions (sorted by most comments)

# write everything to a per-subreddit file for the given year
# we are writing to {year}_{subreddit}_submissions_and_comments.csv

# here is the data we care about in our results:
# "created_utc" : epoch time
# "author"      : string, could be repeats or "[deleted]"
# "body"        : string, the comment body or the submission title
# "type"        : string, could be submission or comment

def parsePosts(subreddit, posts, content_name, comment_or_submission):
    """Convert a raw search response into rows ready to be written as CSV.

    Parameters:
        subreddit: name attached to every returned row.
        posts: UTF-8 encoded JSON bytes with the posts under the "data" key.
        content_name: key holding each post's text ("body" or "title").
        comment_or_submission: label stored in the last column of each row.

    Returns:
        A list of [subreddit, [created_utc, author, body, label]] entries.
        Carriage returns, newlines, literal backslash-n sequences and commas
        are removed from the text so each row stays on one CSV line.

    Raises:
        KeyError: if "data", "author", "created_utc" or content_name is missing.
    """
    payload = json.loads(posts.decode("utf-8"))
    results = []
    for post in payload["data"]:
        author = post["author"]
        created_utc = post["created_utc"]
        body = post[content_name]
        if isinstance(body, str):
            for unwanted in ("\r", "\n", "\\n", ","):
                body = body.replace(unwanted, "")
        else:
            # Non-string content (e.g. null) is kept as-is. An explicit type
            # check replaces the former bare except, which also swallowed
            # KeyboardInterrupt.
            print("error replacing new lines")
        results.append([subreddit, [created_utc, author, body, comment_or_submission]])
    return results

def getHistoricalData(after_epoch, before_epoch, size=200, timeout=60):
    """Fetch comments and submissions for every subreddit in a time window.

    Parameters:
        after_epoch: start of the window (epoch seconds, str or int).
        before_epoch: end of the window (epoch seconds, str or int).
        size: maximum number of results requested per call (default 200).
        timeout: seconds to wait on each HTTP request before giving up, so
            one stalled connection cannot hang the whole scrape.

    Returns:
        A list of [subreddit, [created_utc, author, body, type]] entries,
        comments first and then submissions for each subreddit.

    Raises:
        Whatever urlopen or parsePosts raise (network errors, timeouts,
        malformed responses); nothing is caught here.
    """
    final_posts = []
    for subreddit in subreddits:
        # pause before each subreddit's requests (presumably rate limiting)
        time.sleep(2)
        comment_url = buildBasicRequest(
            getBaseCommentURL(), subreddit, str(before_epoch), str(after_epoch), str(size))
        submission_url = buildBasicRequest(
            getBaseSubmissionURL(), subreddit, str(before_epoch), str(after_epoch), str(size))
        # "with" closes each HTTP response even if read() fails
        with urllib.request.urlopen(comment_url, timeout=timeout) as response:
            raw_comments = response.read()
        with urllib.request.urlopen(submission_url, timeout=timeout) as response:
            raw_submissions = response.read()
        final_posts.extend(parsePosts(subreddit, raw_comments, "body", "comment"))
        final_posts.extend(parsePosts(subreddit, raw_submissions, "title", "submission"))
    return final_posts


In [None]:
def daterange(start_date, end_date):
    """Yield each day from start_date up to, but not including, end_date."""
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + datetime.timedelta(days=offset)
        offset += 1


# first day to fetch (inclusive): january 1st 2010
start_date = datetime.date(2010, 1, 1)

# end of the range (exclusive), so the last day fetched is december 31st 2017
end_date = datetime.date(2018, 1, 1)

for single_date in daterange(start_date, end_date):

    # Each request window is exactly the calendar day being processed,
    # [single_date, next_date), so a day's rows always land in the file for
    # that day's own year (december 31st no longer spills into the next year).
    next_date = single_date + datetime.timedelta(days=1)

    # get the epochs
    after_epoch = getEpochForDate(single_date.year, single_date.month, single_date.day)
    before_epoch = getEpochForDate(next_date.year, next_date.month, next_date.day)

    try:
        # get that data
        result = getHistoricalData(after_epoch, before_epoch)
    except Exception as exc:
        # Exception rather than a bare except, so interrupting the kernel
        # still stops the loop; a failed day is reported and skipped
        print('error for current date: ' + str(single_date) + ' - ' + repr(exc))
        continue

    try:
        # now we have all of the content for all 5 subreddits for the day;
        # group the csv lines per subreddit so each file is opened only once
        lines_by_subreddit = {}
        for post_group in result:
            content = post_group[1]
            data_to_write = ",".join(str(field) for field in content) + "\n"
            lines_by_subreddit.setdefault(post_group[0], []).append(data_to_write)

        for target_subreddit, lines in lines_by_subreddit.items():
            file_to_write_data = getFileName(target_subreddit, single_date.year)
            # explicit utf-8: the platform default encoding can reject
            # characters that appear in reddit text
            with open(file_to_write_data, "a", encoding="utf-8") as f:
                f.writelines(lines)
    except Exception as exc:
        print("error writing results to file: " + repr(exc))
        print(result)
