In [6]:
# Imports

# Standards
import numpy as np
import pandas as pd

# API
import requests

# Automating
import time
import datetime as dt
import warnings
import sys

In [10]:
# Defines get_comments, which pages backwards through a subreddit's comments via the pushshift api
def get_comments(base_url, subreddit, n_iter, epoch_right_now, wait_seconds=5, timeout=30):
    """Scrape comments from one subreddit by paging backwards in time.

    Every request asks the endpoint for up to 100 comments created before a
    cursor timestamp. After each page the cursor moves to the smallest
    ``created_utc`` on that page, so the next request continues further back.

    Parameters
    ----------
    base_url : str
        URL of the comment search endpoint.
    subreddit : str
        Subreddit name sent as the ``subreddit`` query parameter.
    n_iter : int
        Maximum number of requests to issue. Paging stops earlier if the
        endpoint returns an empty page.
    epoch_right_now : int
        Epoch timestamp used as the ``before`` cursor of the first request.
    wait_seconds : float, default 5
        Pause between two consecutive requests. No pause follows the final
        request.
    timeout : float, default 30
        Timeout in seconds handed to ``requests.get`` so a stalled connection
        cannot hang the scrape indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit``, ``created_utc``, ``body``, ``author`` and
        ``permalink``. Each page keeps its own row index, so index labels
        repeat across pages. If nothing was fetched, an empty frame with
        those columns is returned.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    KeyError
        If the JSON payload has no ``data`` key, or a non-empty page lacks
        one of the expected columns.
    """
    columns = ['subreddit', 'created_utc', 'body', 'author', 'permalink']
    pages = []
    current_time = epoch_right_now

    for page_number in range(n_iter):

        # NOTE(review): 'lang': True is forwarded unchanged from the original
        # code; confirm the API actually honours this parameter.
        params = {
            'subreddit': subreddit,
            'size': 100,
            'lang': True,
            'before': current_time
        }

        res = requests.get(base_url, params=params, timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to parse an error body.
        res.raise_for_status()

        records = res.json()['data']
        if not records:
            # Nothing older is available: an empty page has no columns to
            # select and no timestamp to continue from, so stop paging.
            break

        df = pd.DataFrame(records)[columns]
        pages.append(df)

        # The oldest comment on this page becomes the next 'before' cursor.
        current_time = df['created_utc'].min()

        # Wait only when another request will actually follow.
        if page_number < n_iter - 1:
            time.sleep(wait_seconds)

    if not pages:
        # pd.concat raises on an empty list; hand back an empty, typed-by-name frame.
        return pd.DataFrame(columns=columns)

    return pd.concat(pages)

In [11]:
# Pushshift endpoint queried for reddit comment search results
reddit_url = "https://api.pushshift.io/reddit/search/comment"

In [12]:
# Names the tolkienfans subreddit and scrapes its comments with get_comments,
# paging backwards for 100 requests from the fixed starting epoch
tolk_subreddit = "tolkienfans"
tolkien_comments = get_comments(
    base_url=reddit_url,
    subreddit=tolk_subreddit,
    n_iter=100,
    epoch_right_now=1616998184,
)

In [13]:
# Names the harrypotter subreddit and scrapes its comments with get_comments,
# using the same request count and starting epoch as the tolkienfans scrape
hp_subreddit = "harrypotter"
hp_comments = get_comments(
    base_url=reddit_url,
    subreddit=hp_subreddit,
    n_iter=100,
    epoch_right_now=1616998184,
)

In [14]:
# Stacks the two subreddit scrapes row-wise into one frame:
# tolkienfans rows first, harrypotter rows after them
comments = pd.concat(
    objs=[tolkien_comments, hp_comments],
)

In [15]:
# Writes the combined comments frame to a csv file in the data folder
# (the row index is written too, since to_csv is called with its defaults)
comments.to_csv(path_or_buf='../../data/reddit_comments.csv')