# Grabbing data from the subreddit r/WritingPrompts using the PushShift API

Pushshift documentation: https://github.com/pushshift/api

Essentially, you build a URL that describes the data you want (subreddit, number of results, time window, etc.) and then download the JSON that the API returns for that URL.

In [1]:
# IMPORTS
import pandas as pd
import requests
import time

In [2]:
# Base Pushshift submission-search URL for r/WritingPrompts, asking for
# 100 posts per request (limit=100). The scraping loop appends a
# "&before=<timestamp>" parameter to this string on every request.

url = "https://api.pushshift.io/reddit/search/submission/?subreddit=WritingPrompts&limit=100"

In [3]:
# Function to fetch posts from a url and return them in a list
def save_url(url, timeout=30):
    """Download one page of posts from `url` and keep only the fields we need.

    Args:
        url: Full search URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests will wait indefinitely on a stalled connection.

    Returns:
        A list with one entry per post, each of the form
        [title, created_utc, num_comments, score].

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a non-success HTTP status (e.g. when the API throttles us).
        ValueError: If the response body is not valid JSON.
        KeyError: If the payload has no "data" key or a post lacks one of the
            four fields above.
    """
    response = requests.get(url, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON
    response.raise_for_status()

    # The payload is a dictionary whose "data" value is a list of post dicts
    posts = response.json()["data"]

    return [
        [post["title"], post["created_utc"], post["num_comments"], post["score"]]
        for post in posts
    ]

In [5]:
# Set initial timestamp as current unix time; this is the paging cursor
timestamp = int(time.time())

# Accumulates every [title, created_utc, num_comments, score] row fetched so far
main_list = []

# Number of pages fetched successfully - just for troubleshooting
count = 0


# Page backwards through the subreddit's history, one request per iteration
while True:

    try:
        # Ask only for posts created before the current cursor
        before_url = url + f"&before={timestamp}"

        # Get the list of posts from the url
        posts_list = save_url(before_url)

        # If there aren't any more posts, our job is done and we can break from the loop
        if not posts_list:
            break

        # Add the new posts to the main list
        main_list.extend(posts_list)

        # Move the cursor to the creation time of the last post on this page
        # NOTE(review): assumes the API returns posts newest-first - confirm
        timestamp = posts_list[-1][1]

        # Append the page to a txt file (one value per line) so the data
        # survives if something breaks mid-program
        with open("master_list_file.txt", "a", encoding="utf-8") as master_list_file:
            master_list_file.writelines(
                str(val) + "\n" for post in posts_list for val in post
            )

        # Increment count to track how many loops have run
        count += 1

    # Handle exceptions - prevent entire program from breaking, fine if we skip a few posts out of the hundreds of thousands
    except Exception as e:
        # Print exception
        print("AN EXCEPTION OCCURED")
        print(e)

        # Wait a few seconds in case the API was throttling us
        time.sleep(5)

        # Skip an hour just in case. The cursor walks BACKWARDS in time, so
        # skipping means subtracting: adding would re-request posts we already
        # saved (duplicates) and lead straight back to the page that failed.
        timestamp -= 3600

    # Give the API time to recover so it doesn't throttle us
    time.sleep(.5)

# Create a dataframe out of all the posts
df = pd.DataFrame(main_list, columns=["title", "utc", "num_comments", "score"])

In [8]:
# Sanity check: print the dataframe's (rows, columns) shape, then preview
# its first rows.
# NOTE(review): df.head() is a bare expression, so it presumably relies on
# the notebook displaying a cell's last value - it shows nothing if this is
# run as a plain script.
print(df.shape)
df.head()

(876845, 4)


Unnamed: 0,title,utc,num_comments,score
0,"[WP] Waking up from a hangover, you find yours...",1624815759,1,1
1,[WP] You are born a pied piper. Playing your i...,1624815706,1,1
2,[WP] you see a floating countdown over every p...,1624815642,1,1
3,"[WP] You are a traffic light, witnessing human...",1624814841,1,1
4,Newbie,1624814205,1,1


In [10]:
# Save dataframe to a csv file named WPData.csv (relative path).
# NOTE(review): no index=False is passed, so pandas also writes the row index
# as an unnamed first column - confirm that whatever reads this file expects it.
df.to_csv("WPData.csv")