# Function Definitions and Imports

In [1]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd

from datetime import datetime, timedelta, timezone

In [2]:
def get_submissions(subreddit, after, before, session=None):
    """Get all the submissions for a specific subreddit in the given timeframe.

    Pushshift returns at most 1000 submissions per request, so the function
    keeps requesting pages, moving the lower time bound to the ``created_utc``
    of the last row received, until a page comes back with fewer than 1000 rows.

    NOTE(review): this paging scheme assumes each page is returned oldest-first
    and that ``after`` is exclusive; submissions sharing the exact timestamp of
    a page's last row could be skipped otherwise -- confirm against the API.

    Parameters
    ----------
    subreddit : str
        Subreddit to scrape.
    after, before : int or str
        Earliest / latest creation time. The value is inserted into the URL
        as-is, so epoch seconds or a date string such as '2018-05-01' both work.
    session : object, optional
        Anything with a ``get(url)`` method returning an object with a ``text``
        attribute (e.g. ``requests.Session()``). Defaults to the ``requests``
        module itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns subreddit, id, created_time,
        created_utc, title, selftext, num_comments, score and gilded. Fields
        absent from the API response are filled with NaN. The frame is empty
        (but has all columns) when no submission matches.

    Raises
    ------
    Exception
        Whatever the first request raises (network error, invalid JSON, missing
        'data' key). Failures on later pages stop the paging and return what
        was collected so far instead of raising.
    """
    page_size = 1000  # Can get at max 1000 submissions from pushshift at a time
    http = requests if session is None else session

    # Columns to include in submissions dataframe
    submission_columns = ['subreddit',     # Subreddit name
                          'id',            # Post ID
                          'created_utc',   # UTC time post was created
                          'title',         # Post title
                          'selftext',      # Post body
                          'num_comments',  # Number of comments on post
                          'score',         # Number of upvotes
                          'gilded'         # Number of silver/gold/platinum badges
                         ]

    frames = []           # One dataframe per non-empty page; concatenated once at the end
    cursor = after        # Lower time bound of the next request
    first_request = True

    while True:
        url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
               str(subreddit) +                 # Subreddit to scrape
               '&after=' + str(cursor) +        # Scrape posts after this time
               '&before=' + str(before) +       # Scrape posts before this time
               '&limit=' + str(page_size))
        print(url)

        try:
            response = http.get(url)
            batch = json.loads(response.text)['data']  # Only key of interest is 'data'
            # reindex (instead of .loc) tolerates pages where a column is missing
            page = pd.DataFrame(batch).reindex(columns=submission_columns)
        except Exception as error:  # not a bare except, so Ctrl-C still interrupts the scrape
            if first_request:
                raise
            # Retrying the same URL forever helps nobody: keep what we have and stop
            print('Stopping early, request failed: ' + repr(error))
            break

        first_request = False
        print(len(batch))
        if not page.empty:
            frames.append(page)

        # Fewer rows than the page size means all submissions were queried
        if len(batch) != page_size:
            break

        # Query from latest time of previous query
        next_cursor = page['created_utc'].iloc[-1]
        if pd.isna(next_cursor) or str(next_cursor) == str(cursor):
            print('Stopping early, paging cursor did not advance')
            break
        cursor = next_cursor

    if frames:
        submissions_df = pd.concat(frames, ignore_index=True)
    else:
        submissions_df = pd.DataFrame(columns=submission_columns)

    # Convert UTC epoch seconds to a 'YYYY-MM-DD HH:MM:SS' string (still in UTC)
    created_time = [datetime.fromtimestamp(utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
                    for utc in submissions_df['created_utc']]
    submissions_df.insert(2, 'created_time', created_time)

    return submissions_df

In [3]:
def get_submission_comments(submission_id, session=None):
    """Get list of comments for a submission.

    NOTE(review): no size/limit parameter is sent, so the API's default page
    size applies -- confirm that this returns every comment of a submission.

    Parameters
    ----------
    submission_id : str
        Submission id, used as the ``link_id`` query parameter.
    session : object, optional
        Anything with a ``get(url)`` method returning an object with a ``text``
        attribute (e.g. ``requests.Session()`` to reuse one connection across
        many calls). Defaults to the ``requests`` module itself.

    Returns
    -------
    list of str or None
        The 'body' of each returned comment, or None when the response cannot
        be parsed (invalid JSON, no 'data' key, or a comment without 'body').
        Errors raised by the HTTP request itself are not caught.
    """
    http = requests if session is None else session

    url = ('https://api.pushshift.io/reddit/comment/search/?link_id=' +
           str(submission_id))

    print(url)
    response = http.get(url)

    # Sometimes there is no data, or there's a moderator comment. In that case
    # report the problem and return None so one bad submission does not abort the run.
    try:
        comments_all_data = json.loads(response.text)['data']  # List of dicts

        # Get only the comment ('body') for each comment block
        # Comment block includes extraneous info (author, score, etc.)
        return [comment_block['body'] for comment_block in comments_all_data]
    except (ValueError, KeyError, TypeError) as error:
        print('No usable comments for ' + str(submission_id) + ': ' + repr(error))
        return None

In [4]:
def combine_comments(comments):
    """Given list of comments, combine them into a single string.

    Each element is converted with ``str`` and the pieces are joined with a
    single space.

    Returns None when ``comments`` is None (as produced for submissions whose
    comments could not be fetched) or is otherwise not iterable.
    """
    if comments is None:
        return None
    try:
        return ' '.join(str(elem) for elem in comments)
    except TypeError:
        # Not iterable (e.g. a NaN float in the column): nothing to combine
        return None

# Data Manipulation

__To scrape a different subreddit or date range, change the arguments in the following cell. The pickle filename in the last cell hard-codes the same dates, so update it to match:__

In [5]:
# Scrape the r/gadgets submissions for the window below. The bounds are passed
# as 'YYYY-MM-DD' date strings; the 'comments' column is added in a later cell.
submissions = get_submissions(
    subreddit='gadgets',
    after='2018-05-01',
    before='2018-06-01',
)


https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=2018-05-01&before=2018-06-01&limit=1000
1526905950
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1526905950&before=2018-06-01&limit=1000
526


In [6]:
# Peek at the first scraped row to sanity-check the columns
submissions.head(1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded
0,gadgets,8g45yv,2018-05-01 00:02:35,1525132955,"The Ataribox Gets Real, Atari VCS Pre-Order Sa...",,1,1,0


In [7]:
# Look up the comments of every submission id (one API request per id) and
# store each resulting list in a new 'comments' column
submissions['comments'] = submissions['id'].apply(get_submission_comments)

https://api.pushshift.io/reddit/comment/search/?link_id=8g45yv
https://api.pushshift.io/reddit/comment/search/?link_id=8g47z0
https://api.pushshift.io/reddit/comment/search/?link_id=8g4jmp
https://api.pushshift.io/reddit/comment/search/?link_id=8g4ot4
https://api.pushshift.io/reddit/comment/search/?link_id=8g515l
https://api.pushshift.io/reddit/comment/search/?link_id=8g5e3s
https://api.pushshift.io/reddit/comment/search/?link_id=8g5io8
https://api.pushshift.io/reddit/comment/search/?link_id=8g64tn
https://api.pushshift.io/reddit/comment/search/?link_id=8g64wk
https://api.pushshift.io/reddit/comment/search/?link_id=8g6dq9
https://api.pushshift.io/reddit/comment/search/?link_id=8g6iwv
https://api.pushshift.io/reddit/comment/search/?link_id=8g6kqu
https://api.pushshift.io/reddit/comment/search/?link_id=8g6niz
https://api.pushshift.io/reddit/comment/search/?link_id=8g6nmu
https://api.pushshift.io/reddit/comment/search/?link_id=8g6nqu
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8gnezo
https://api.pushshift.io/reddit/comment/search/?link_id=8go7z8
https://api.pushshift.io/reddit/comment/search/?link_id=8go9p6
https://api.pushshift.io/reddit/comment/search/?link_id=8goj6h
https://api.pushshift.io/reddit/comment/search/?link_id=8gonzj
https://api.pushshift.io/reddit/comment/search/?link_id=8gosly
https://api.pushshift.io/reddit/comment/search/?link_id=8gp1z5
https://api.pushshift.io/reddit/comment/search/?link_id=8gp27n
https://api.pushshift.io/reddit/comment/search/?link_id=8gp6hs
https://api.pushshift.io/reddit/comment/search/?link_id=8gp9dg
https://api.pushshift.io/reddit/comment/search/?link_id=8gpa32
https://api.pushshift.io/reddit/comment/search/?link_id=8gpfuk
https://api.pushshift.io/reddit/comment/search/?link_id=8gpngk
https://api.pushshift.io/reddit/comment/search/?link_id=8gpq90
https://api.pushshift.io/reddit/comment/search/?link_id=8gpt82
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8h8l5d
https://api.pushshift.io/reddit/comment/search/?link_id=8h8lxf
https://api.pushshift.io/reddit/comment/search/?link_id=8h8zpc
https://api.pushshift.io/reddit/comment/search/?link_id=8h935p
https://api.pushshift.io/reddit/comment/search/?link_id=8h9876
https://api.pushshift.io/reddit/comment/search/?link_id=8h9h8i
https://api.pushshift.io/reddit/comment/search/?link_id=8h9p6i
https://api.pushshift.io/reddit/comment/search/?link_id=8h9zyy
https://api.pushshift.io/reddit/comment/search/?link_id=8ha3hw
https://api.pushshift.io/reddit/comment/search/?link_id=8ha81u
https://api.pushshift.io/reddit/comment/search/?link_id=8hafus
https://api.pushshift.io/reddit/comment/search/?link_id=8hag29
https://api.pushshift.io/reddit/comment/search/?link_id=8hap55
https://api.pushshift.io/reddit/comment/search/?link_id=8haq10
https://api.pushshift.io/reddit/comment/search/?link_id=8hawic
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8hwhd3
https://api.pushshift.io/reddit/comment/search/?link_id=8hwlrh
https://api.pushshift.io/reddit/comment/search/?link_id=8hwnik
https://api.pushshift.io/reddit/comment/search/?link_id=8hwnz4
https://api.pushshift.io/reddit/comment/search/?link_id=8hww2r
https://api.pushshift.io/reddit/comment/search/?link_id=8hwywv
https://api.pushshift.io/reddit/comment/search/?link_id=8hx2yk
https://api.pushshift.io/reddit/comment/search/?link_id=8hx3v2
https://api.pushshift.io/reddit/comment/search/?link_id=8hx6e8
https://api.pushshift.io/reddit/comment/search/?link_id=8hx7in
https://api.pushshift.io/reddit/comment/search/?link_id=8hxbvc
https://api.pushshift.io/reddit/comment/search/?link_id=8hxf2k
https://api.pushshift.io/reddit/comment/search/?link_id=8hxn5r
https://api.pushshift.io/reddit/comment/search/?link_id=8hxrnq
https://api.pushshift.io/reddit/comment/search/?link_id=8hxrnw
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8ii2af
https://api.pushshift.io/reddit/comment/search/?link_id=8iil8q
https://api.pushshift.io/reddit/comment/search/?link_id=8iio9r
https://api.pushshift.io/reddit/comment/search/?link_id=8iisc6
https://api.pushshift.io/reddit/comment/search/?link_id=8ij1jq
https://api.pushshift.io/reddit/comment/search/?link_id=8ijmid
https://api.pushshift.io/reddit/comment/search/?link_id=8iken9
https://api.pushshift.io/reddit/comment/search/?link_id=8ikfpr
https://api.pushshift.io/reddit/comment/search/?link_id=8ikhnq
https://api.pushshift.io/reddit/comment/search/?link_id=8ikk2i
https://api.pushshift.io/reddit/comment/search/?link_id=8iklml
https://api.pushshift.io/reddit/comment/search/?link_id=8ikriw
https://api.pushshift.io/reddit/comment/search/?link_id=8ikv2r
https://api.pushshift.io/reddit/comment/search/?link_id=8iky2d
https://api.pushshift.io/reddit/comment/search/?link_id=8il27a
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8jbi0a
https://api.pushshift.io/reddit/comment/search/?link_id=8jble9
https://api.pushshift.io/reddit/comment/search/?link_id=8jbsnr
https://api.pushshift.io/reddit/comment/search/?link_id=8jbuh8
https://api.pushshift.io/reddit/comment/search/?link_id=8jbw5m
https://api.pushshift.io/reddit/comment/search/?link_id=8jc10y
https://api.pushshift.io/reddit/comment/search/?link_id=8jcha6
https://api.pushshift.io/reddit/comment/search/?link_id=8jck3u
https://api.pushshift.io/reddit/comment/search/?link_id=8jck7n
https://api.pushshift.io/reddit/comment/search/?link_id=8jcos1
https://api.pushshift.io/reddit/comment/search/?link_id=8jcpx5
https://api.pushshift.io/reddit/comment/search/?link_id=8jcu11
https://api.pushshift.io/reddit/comment/search/?link_id=8jcul2
https://api.pushshift.io/reddit/comment/search/?link_id=8jcvf7
https://api.pushshift.io/reddit/comment/search/?link_id=8jcxq3
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8jwu2m
https://api.pushshift.io/reddit/comment/search/?link_id=8jwuf2
https://api.pushshift.io/reddit/comment/search/?link_id=8jwujt
https://api.pushshift.io/reddit/comment/search/?link_id=8jwweo
https://api.pushshift.io/reddit/comment/search/?link_id=8jwx67
https://api.pushshift.io/reddit/comment/search/?link_id=8jwxuc
https://api.pushshift.io/reddit/comment/search/?link_id=8jxg59
https://api.pushshift.io/reddit/comment/search/?link_id=8jxgum
https://api.pushshift.io/reddit/comment/search/?link_id=8jy4nf
https://api.pushshift.io/reddit/comment/search/?link_id=8jywy1
https://api.pushshift.io/reddit/comment/search/?link_id=8jz5o8
https://api.pushshift.io/reddit/comment/search/?link_id=8jzb9w
https://api.pushshift.io/reddit/comment/search/?link_id=8jzjlx
https://api.pushshift.io/reddit/comment/search/?link_id=8jzoiz
https://api.pushshift.io/reddit/comment/search/?link_id=8jzuj4
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8kkj5c
https://api.pushshift.io/reddit/comment/search/?link_id=8kkpiq
https://api.pushshift.io/reddit/comment/search/?link_id=8kkqud
https://api.pushshift.io/reddit/comment/search/?link_id=8kkrh4
https://api.pushshift.io/reddit/comment/search/?link_id=8kl1bu
https://api.pushshift.io/reddit/comment/search/?link_id=8kl3db
https://api.pushshift.io/reddit/comment/search/?link_id=8kl3rk
https://api.pushshift.io/reddit/comment/search/?link_id=8kl538
https://api.pushshift.io/reddit/comment/search/?link_id=8kl6tw
https://api.pushshift.io/reddit/comment/search/?link_id=8klfcm
https://api.pushshift.io/reddit/comment/search/?link_id=8klizq
https://api.pushshift.io/reddit/comment/search/?link_id=8klyu7
https://api.pushshift.io/reddit/comment/search/?link_id=8km61m
https://api.pushshift.io/reddit/comment/search/?link_id=8kmh8u
https://api.pushshift.io/reddit/comment/search/?link_id=8kmp46
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8l9fcw
https://api.pushshift.io/reddit/comment/search/?link_id=8l9gk5
https://api.pushshift.io/reddit/comment/search/?link_id=8l9naf
https://api.pushshift.io/reddit/comment/search/?link_id=8l9ncl
https://api.pushshift.io/reddit/comment/search/?link_id=8l9t91
https://api.pushshift.io/reddit/comment/search/?link_id=8l9x57
https://api.pushshift.io/reddit/comment/search/?link_id=8lad1p
https://api.pushshift.io/reddit/comment/search/?link_id=8lae44
https://api.pushshift.io/reddit/comment/search/?link_id=8lafqh
https://api.pushshift.io/reddit/comment/search/?link_id=8lapqd
https://api.pushshift.io/reddit/comment/search/?link_id=8lat7v
https://api.pushshift.io/reddit/comment/search/?link_id=8laylw
https://api.pushshift.io/reddit/comment/search/?link_id=8lb1ij
https://api.pushshift.io/reddit/comment/search/?link_id=8lb4hk
https://api.pushshift.io/reddit/comment/search/?link_id=8lb98b
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8lw5fu
https://api.pushshift.io/reddit/comment/search/?link_id=8lwa39
https://api.pushshift.io/reddit/comment/search/?link_id=8lwiek
https://api.pushshift.io/reddit/comment/search/?link_id=8lwrpx
https://api.pushshift.io/reddit/comment/search/?link_id=8lx1ij
https://api.pushshift.io/reddit/comment/search/?link_id=8lx2uh
https://api.pushshift.io/reddit/comment/search/?link_id=8lxgqq
https://api.pushshift.io/reddit/comment/search/?link_id=8ly4yj
https://api.pushshift.io/reddit/comment/search/?link_id=8ly5w7
https://api.pushshift.io/reddit/comment/search/?link_id=8lyadi
https://api.pushshift.io/reddit/comment/search/?link_id=8lycll
https://api.pushshift.io/reddit/comment/search/?link_id=8lyffx
https://api.pushshift.io/reddit/comment/search/?link_id=8lyr4i
https://api.pushshift.io/reddit/comment/search/?link_id=8lz4n0
https://api.pushshift.io/reddit/comment/search/?link_id=8lzgin
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8mo0ql
https://api.pushshift.io/reddit/comment/search/?link_id=8mo7ka
https://api.pushshift.io/reddit/comment/search/?link_id=8mo7o0
https://api.pushshift.io/reddit/comment/search/?link_id=8mo7wh
https://api.pushshift.io/reddit/comment/search/?link_id=8mo8s2
https://api.pushshift.io/reddit/comment/search/?link_id=8mo9mh
https://api.pushshift.io/reddit/comment/search/?link_id=8moh9g
https://api.pushshift.io/reddit/comment/search/?link_id=8mopez
https://api.pushshift.io/reddit/comment/search/?link_id=8morxa
https://api.pushshift.io/reddit/comment/search/?link_id=8mouaa
https://api.pushshift.io/reddit/comment/search/?link_id=8moyif
https://api.pushshift.io/reddit/comment/search/?link_id=8mp10m
https://api.pushshift.io/reddit/comment/search/?link_id=8mp6ae
https://api.pushshift.io/reddit/comment/search/?link_id=8mp8g1
https://api.pushshift.io/reddit/comment/search/?link_id=8mp9aj
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8n7xat
https://api.pushshift.io/reddit/comment/search/?link_id=8n7xta
https://api.pushshift.io/reddit/comment/search/?link_id=8n8ce3
https://api.pushshift.io/reddit/comment/search/?link_id=8n8iqt
https://api.pushshift.io/reddit/comment/search/?link_id=8n8okk
https://api.pushshift.io/reddit/comment/search/?link_id=8n8olp
https://api.pushshift.io/reddit/comment/search/?link_id=8n8osa
https://api.pushshift.io/reddit/comment/search/?link_id=8n8tzi
https://api.pushshift.io/reddit/comment/search/?link_id=8n8w9m
https://api.pushshift.io/reddit/comment/search/?link_id=8n8xw3
https://api.pushshift.io/reddit/comment/search/?link_id=8n923d
https://api.pushshift.io/reddit/comment/search/?link_id=8n96b9
https://api.pushshift.io/reddit/comment/search/?link_id=8n9e8l
https://api.pushshift.io/reddit/comment/search/?link_id=8n9j1g
https://api.pushshift.io/reddit/comment/search/?link_id=8n9n9l
https://api.pushshift.io/reddit/comment/search/?link_id

In [8]:
# Make list of comments into a single string
# (currently disabled: the 'comments' column is kept as a list)
# submissions.comments = submissions.comments.apply(combine_comments)

In [9]:
# Check that the first row now carries the 'comments' column
submissions.head(1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,8g45yv,2018-05-01 00:02:35,1525132955,"The Ataribox Gets Real, Atari VCS Pre-Order Sa...",,1,1,0,"[Hello, /u/ChickenTeriyakiBoy1! Thanks for con..."


In [10]:
# Construct output_df, which includes all the info we need from submissions
#output_df = submissions[['subreddit', 'id', 'created_time', 'created_utc', 'num_comments', 'score', 'gilded']]

# Combine the submission title, body, and comments into a single column called submission_text
#output_df['submission_text'] = (submissions['title'].map(str) + 
#                                submissions['selftext'].map(str) + 
#                                submissions['comments'].map(str))

# Give the 'id' column the clearer name 'submission_id'
submissions.rename({'id': 'submission_id'}, axis='columns', inplace=True)

In [11]:
# Check that 'id' now shows up as 'submission_id'
submissions.head(1)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,8g45yv,2018-05-01 00:02:35,1525132955,"The Ataribox Gets Real, Atari VCS Pre-Order Sa...",,1,1,0,"[Hello, /u/ChickenTeriyakiBoy1! Thanks for con..."


In [12]:
# Persist the scraped dataframe with pickle so another project file can load it
import pickle

# NOTE: the filename repeats the date range passed to get_submissions; keep them in sync
with open('2018-05-01_to_2018-06-01', 'wb') as out_file:
    pickle.dump(submissions, out_file)