# Function Definitions and Imports

In [1]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd

from datetime import datetime, timedelta, timezone

In [2]:
# Get all the submissions for a specific subreddit in the given timeframe
def get_submissions(subreddit, after, before):
    """Download the Pushshift submissions of one subreddit in a time window.

    Queries are issued repeatedly, 1000 submissions at a time, restarting from
    the `created_utc` of the last submission received, until a query returns
    fewer than 1000 submissions.

    Parameters
    ----------
    subreddit : object
        Subreddit to scrape; rendered with str() into the query URL.
    after : object
        Lower time bound (earliest submissions); rendered with str() into the
        `after` query parameter.
    before : object
        Upper time bound (latest submissions); rendered with str() into the
        `before` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns subreddit, id, created_time,
        created_utc, title, selftext, num_comments, score and gilded.
        `created_time` is `created_utc` formatted as 'YYYY-MM-DD HH:MM:SS' in
        UTC. A column missing from the API response is filled with NaN, and an
        empty response yields an empty frame with these columns.

    Raises
    ------
    Exception
        Any error raised by the first request or by decoding its response
        propagates to the caller. A failure on a later page does not raise:
        it is reported with print() and pagination stops, returning what was
        collected so far.
    """
    page_size = 1000  # Can get at max 1000 submissions from pushshift at a time

    # Columns to include in submissions dataframe
    submission_columns = ['subreddit',     # Subreddit name
                          'id',            # Post ID
                          'created_utc',   # UTC time post was created
                          'title',         # Post title
                          'selftext',      # Post body
                          'num_comments',  # Number of comments on post
                          'score',         # Number of upvotes
                          'gilded'         # Number of silver/gold/platinum badges
                         ]

    def fetch_page(after_value):
        # Build, print and run one search query; return the list stored under 'data'
        url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
               str(subreddit) +      # Subreddit to scrape
               '&after=' +
               str(after_value) +    # Scrape posts after this time
               '&before=' +
               str(before) +         # Scrape posts before this time
               '&limit=' +
               str(page_size))
        print(url)
        response = requests.get(url)
        return json.loads(response.text)['data']  # Only key in the response is 'data'

    # First query: errors here are not caught, so a bad request is visible immediately
    page = fetch_page(after)
    records = list(page)

    # A full page means there may be more submissions left to query
    while len(page) == page_size:
        try:
            after = page[-1]['created_utc']  # Query from latest time of previous query
            print(after)
            next_page = fetch_page(after)
            print(len(next_page))  # Loop won't occur again if < 1000 submissions were found
                                   # since that means all submissions were queried
            records.extend(next_page)
        except Exception as err:
            # Retrying the identical query after a failure would loop forever,
            # so report the problem and keep what has been collected so far.
            print('Stopping pagination after a failed query: ' + repr(err))
            break
        page = next_page

    # Create dataframe once, where each row contains a submission.
    # reindex() tolerates an empty result or a column absent from the response.
    submissions_df = pd.DataFrame(records).reindex(columns=submission_columns)

    # Convert UTC epoch seconds to a formatted UTC datetime string
    created_time = [datetime.fromtimestamp(utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
                    for utc in submissions_df['created_utc']]
    submissions_df.insert(2, 'created_time', created_time)

    return submissions_df

In [3]:
# Get list of comments for a submission
def get_submission_comments(submission_id):
    """Fetch the comment bodies Pushshift returns for one submission.

    Parameters
    ----------
    submission_id : object
        Submission ID; rendered with str() into the `link_id` query parameter.

    Returns
    -------
    list or None
        The 'body' value of every comment block under the response's 'data'
        key. None when the response is not valid JSON or does not have the
        expected shape (no 'data' key, a comment block without 'body', ...).

    Raises
    ------
    Exception
        Errors raised by the HTTP request itself are not caught here.
    """
    url = ('https://api.pushshift.io/reddit/comment/search/?link_id=' +
           str(submission_id))

    print(url)
    response = requests.get(url)

    # Sometimes there is no data, or there's a moderator comment. In that case
    # return None. Only parse/shape errors are treated this way, so unrelated
    # failures (and Ctrl-C) are no longer silently swallowed.
    try:
        comments_all_data = json.loads(response.text)['data']  # List of dicts

        # Get only the comment ('body') for each comment block
        # Comment block includes extraneous info (author, score, etc.)
        return [comment_block['body'] for comment_block in comments_all_data]
    except (ValueError, KeyError, TypeError):
        return None

In [4]:
# Given list of comments, combine them into a single string
def combine_comments(comments):
    """Join the comments into one space-separated string.

    Parameters
    ----------
    comments : iterable
        Items to combine; each one is converted with str() first.

    Returns
    -------
    str or None
        The joined text ('' for an empty iterable). None when `comments`
        cannot be iterated, e.g. None or a float NaN.
    """
    try:
        return ' '.join(str(elem) for elem in comments)
    except TypeError:
        # Not iterable (no comments were retrieved for this submission)
        return None

# Data Manipulation

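__The following cell is the only thing that has to be changed in this file: set the subreddit and the `after`/`before` date range. Note that the pickle filename in the last cell is hard-coded to these dates, so update it as well if you change the range.__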

In [5]:
# Build the submissions DataFrame for one subreddit and time window.
# The window bounds are given here as 'YYYY-MM-DD' strings.
# Comments have not been added yet
submissions = get_submissions(
    subreddit='gadgets',
    after='2018-03-01',
    before='2018-04-01',
)


https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=2018-03-01&before=2018-04-01&limit=1000
1521324269
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1521324269&before=2018-04-01&limit=1000
933


In [6]:
# Preview the first submission row
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded
0,gadgets,810z2h,2018-03-01 00:09:01,1519862941,Michael Kors smartwatch is killing it!,,1,1,0


In [7]:
# Add a 'comments' column: for every submission id, the value returned by
# get_submission_comments (one request per submission)
submissions['comments'] = submissions['id'].apply(get_submission_comments)

https://api.pushshift.io/reddit/comment/search/?link_id=810z2h
https://api.pushshift.io/reddit/comment/search/?link_id=811m5q
https://api.pushshift.io/reddit/comment/search/?link_id=8120o6
https://api.pushshift.io/reddit/comment/search/?link_id=81257g
https://api.pushshift.io/reddit/comment/search/?link_id=8126uw
https://api.pushshift.io/reddit/comment/search/?link_id=812a49
https://api.pushshift.io/reddit/comment/search/?link_id=812lwv
https://api.pushshift.io/reddit/comment/search/?link_id=812qx2
https://api.pushshift.io/reddit/comment/search/?link_id=812x6o
https://api.pushshift.io/reddit/comment/search/?link_id=8131we
https://api.pushshift.io/reddit/comment/search/?link_id=8133qj
https://api.pushshift.io/reddit/comment/search/?link_id=8138qj
https://api.pushshift.io/reddit/comment/search/?link_id=813bcb
https://api.pushshift.io/reddit/comment/search/?link_id=813cb4
https://api.pushshift.io/reddit/comment/search/?link_id=813cme
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=81oa2g
https://api.pushshift.io/reddit/comment/search/?link_id=81od86
https://api.pushshift.io/reddit/comment/search/?link_id=81ok4i
https://api.pushshift.io/reddit/comment/search/?link_id=81pg78
https://api.pushshift.io/reddit/comment/search/?link_id=81pm9o
https://api.pushshift.io/reddit/comment/search/?link_id=81ppcl
https://api.pushshift.io/reddit/comment/search/?link_id=81pxwt
https://api.pushshift.io/reddit/comment/search/?link_id=81q5ze
https://api.pushshift.io/reddit/comment/search/?link_id=81qglk
https://api.pushshift.io/reddit/comment/search/?link_id=81qjdh
https://api.pushshift.io/reddit/comment/search/?link_id=81qkrw
https://api.pushshift.io/reddit/comment/search/?link_id=81qp5b
https://api.pushshift.io/reddit/comment/search/?link_id=81qpjv
https://api.pushshift.io/reddit/comment/search/?link_id=81r3sj
https://api.pushshift.io/reddit/comment/search/?link_id=81ra6s
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=828coi
https://api.pushshift.io/reddit/comment/search/?link_id=828des
https://api.pushshift.io/reddit/comment/search/?link_id=828hv2
https://api.pushshift.io/reddit/comment/search/?link_id=828l7i
https://api.pushshift.io/reddit/comment/search/?link_id=8296jm
https://api.pushshift.io/reddit/comment/search/?link_id=829aee
https://api.pushshift.io/reddit/comment/search/?link_id=829u7m
https://api.pushshift.io/reddit/comment/search/?link_id=82bmx9
https://api.pushshift.io/reddit/comment/search/?link_id=82boco
https://api.pushshift.io/reddit/comment/search/?link_id=82bur3
https://api.pushshift.io/reddit/comment/search/?link_id=82bwmm
https://api.pushshift.io/reddit/comment/search/?link_id=82c1qh
https://api.pushshift.io/reddit/comment/search/?link_id=82ch0c
https://api.pushshift.io/reddit/comment/search/?link_id=82ck3v
https://api.pushshift.io/reddit/comment/search/?link_id=82crdq
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=82t567
https://api.pushshift.io/reddit/comment/search/?link_id=82tiry
https://api.pushshift.io/reddit/comment/search/?link_id=82toch
https://api.pushshift.io/reddit/comment/search/?link_id=82toxk
https://api.pushshift.io/reddit/comment/search/?link_id=82tw7b
https://api.pushshift.io/reddit/comment/search/?link_id=82ucx1
https://api.pushshift.io/reddit/comment/search/?link_id=82ueyk
https://api.pushshift.io/reddit/comment/search/?link_id=82ukk7
https://api.pushshift.io/reddit/comment/search/?link_id=82umg7
https://api.pushshift.io/reddit/comment/search/?link_id=82v6sj
https://api.pushshift.io/reddit/comment/search/?link_id=82v8a6
https://api.pushshift.io/reddit/comment/search/?link_id=82va4z
https://api.pushshift.io/reddit/comment/search/?link_id=82vjnz
https://api.pushshift.io/reddit/comment/search/?link_id=82vowk
https://api.pushshift.io/reddit/comment/search/?link_id=82vp3e
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=83detv
https://api.pushshift.io/reddit/comment/search/?link_id=83dhw0
https://api.pushshift.io/reddit/comment/search/?link_id=83dle9
https://api.pushshift.io/reddit/comment/search/?link_id=83dly8
https://api.pushshift.io/reddit/comment/search/?link_id=83ds7c
https://api.pushshift.io/reddit/comment/search/?link_id=83dspb
https://api.pushshift.io/reddit/comment/search/?link_id=83e4be
https://api.pushshift.io/reddit/comment/search/?link_id=83e7vn
https://api.pushshift.io/reddit/comment/search/?link_id=83e8ue
https://api.pushshift.io/reddit/comment/search/?link_id=83egdr
https://api.pushshift.io/reddit/comment/search/?link_id=83ei00
https://api.pushshift.io/reddit/comment/search/?link_id=83ekcn
https://api.pushshift.io/reddit/comment/search/?link_id=83eqck
https://api.pushshift.io/reddit/comment/search/?link_id=83f0wc
https://api.pushshift.io/reddit/comment/search/?link_id=83ff1s
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=841vpo
https://api.pushshift.io/reddit/comment/search/?link_id=841zbx
https://api.pushshift.io/reddit/comment/search/?link_id=8420lq
https://api.pushshift.io/reddit/comment/search/?link_id=8420qv
https://api.pushshift.io/reddit/comment/search/?link_id=8422h7
https://api.pushshift.io/reddit/comment/search/?link_id=84263s
https://api.pushshift.io/reddit/comment/search/?link_id=8426n7
https://api.pushshift.io/reddit/comment/search/?link_id=8427nk
https://api.pushshift.io/reddit/comment/search/?link_id=842c4u
https://api.pushshift.io/reddit/comment/search/?link_id=842fga
https://api.pushshift.io/reddit/comment/search/?link_id=842ngm
https://api.pushshift.io/reddit/comment/search/?link_id=842qvz
https://api.pushshift.io/reddit/comment/search/?link_id=842u40
https://api.pushshift.io/reddit/comment/search/?link_id=842utk
https://api.pushshift.io/reddit/comment/search/?link_id=842vrq
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=84elia
https://api.pushshift.io/reddit/comment/search/?link_id=84en5b
https://api.pushshift.io/reddit/comment/search/?link_id=84ene4
https://api.pushshift.io/reddit/comment/search/?link_id=84eply
https://api.pushshift.io/reddit/comment/search/?link_id=84et7o
https://api.pushshift.io/reddit/comment/search/?link_id=84etpb
https://api.pushshift.io/reddit/comment/search/?link_id=84fgiu
https://api.pushshift.io/reddit/comment/search/?link_id=84fqke
https://api.pushshift.io/reddit/comment/search/?link_id=84fuja
https://api.pushshift.io/reddit/comment/search/?link_id=84fvqs
https://api.pushshift.io/reddit/comment/search/?link_id=84fyj3
https://api.pushshift.io/reddit/comment/search/?link_id=84fz1l
https://api.pushshift.io/reddit/comment/search/?link_id=84g35j
https://api.pushshift.io/reddit/comment/search/?link_id=84g6uj
https://api.pushshift.io/reddit/comment/search/?link_id=84g79z
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=84u3lb
https://api.pushshift.io/reddit/comment/search/?link_id=84u4g6
https://api.pushshift.io/reddit/comment/search/?link_id=84u6v7
https://api.pushshift.io/reddit/comment/search/?link_id=84udze
https://api.pushshift.io/reddit/comment/search/?link_id=84uj6n
https://api.pushshift.io/reddit/comment/search/?link_id=84ukg6
https://api.pushshift.io/reddit/comment/search/?link_id=84ulu4
https://api.pushshift.io/reddit/comment/search/?link_id=84um56
https://api.pushshift.io/reddit/comment/search/?link_id=84uv2w
https://api.pushshift.io/reddit/comment/search/?link_id=84v9ut
https://api.pushshift.io/reddit/comment/search/?link_id=84vab5
https://api.pushshift.io/reddit/comment/search/?link_id=84vcv7
https://api.pushshift.io/reddit/comment/search/?link_id=84vj1q
https://api.pushshift.io/reddit/comment/search/?link_id=84vyhi
https://api.pushshift.io/reddit/comment/search/?link_id=84vywf
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=85fj47
https://api.pushshift.io/reddit/comment/search/?link_id=85fmde
https://api.pushshift.io/reddit/comment/search/?link_id=85fw0q
https://api.pushshift.io/reddit/comment/search/?link_id=85g215
https://api.pushshift.io/reddit/comment/search/?link_id=85g5el
https://api.pushshift.io/reddit/comment/search/?link_id=85g8wf
https://api.pushshift.io/reddit/comment/search/?link_id=85gcfx
https://api.pushshift.io/reddit/comment/search/?link_id=85gno8
https://api.pushshift.io/reddit/comment/search/?link_id=85gxjv
https://api.pushshift.io/reddit/comment/search/?link_id=85h35m
https://api.pushshift.io/reddit/comment/search/?link_id=85hd4z
https://api.pushshift.io/reddit/comment/search/?link_id=85hfyn
https://api.pushshift.io/reddit/comment/search/?link_id=85hksq
https://api.pushshift.io/reddit/comment/search/?link_id=85hmdi
https://api.pushshift.io/reddit/comment/search/?link_id=85hmi3
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=85uhx8
https://api.pushshift.io/reddit/comment/search/?link_id=85ur7k
https://api.pushshift.io/reddit/comment/search/?link_id=85viam
https://api.pushshift.io/reddit/comment/search/?link_id=85vp4f
https://api.pushshift.io/reddit/comment/search/?link_id=85vr1a
https://api.pushshift.io/reddit/comment/search/?link_id=85vwt9
https://api.pushshift.io/reddit/comment/search/?link_id=85wfxk
https://api.pushshift.io/reddit/comment/search/?link_id=85x1vz
https://api.pushshift.io/reddit/comment/search/?link_id=85x24i
https://api.pushshift.io/reddit/comment/search/?link_id=85x2ey
https://api.pushshift.io/reddit/comment/search/?link_id=85xp49
https://api.pushshift.io/reddit/comment/search/?link_id=85xq9l
https://api.pushshift.io/reddit/comment/search/?link_id=85xxrq
https://api.pushshift.io/reddit/comment/search/?link_id=85y29u
https://api.pushshift.io/reddit/comment/search/?link_id=85ymte
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=86cpop
https://api.pushshift.io/reddit/comment/search/?link_id=86cqmq
https://api.pushshift.io/reddit/comment/search/?link_id=86cvh8
https://api.pushshift.io/reddit/comment/search/?link_id=86d184
https://api.pushshift.io/reddit/comment/search/?link_id=86d1ng
https://api.pushshift.io/reddit/comment/search/?link_id=86d1rc
https://api.pushshift.io/reddit/comment/search/?link_id=86d71e
https://api.pushshift.io/reddit/comment/search/?link_id=86d78d
https://api.pushshift.io/reddit/comment/search/?link_id=86dmju
https://api.pushshift.io/reddit/comment/search/?link_id=86do2k
https://api.pushshift.io/reddit/comment/search/?link_id=86dprq
https://api.pushshift.io/reddit/comment/search/?link_id=86drtr
https://api.pushshift.io/reddit/comment/search/?link_id=86duyq
https://api.pushshift.io/reddit/comment/search/?link_id=86dz2o
https://api.pushshift.io/reddit/comment/search/?link_id=86e0f3
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=86sa1g
https://api.pushshift.io/reddit/comment/search/?link_id=86scla
https://api.pushshift.io/reddit/comment/search/?link_id=86sdln
https://api.pushshift.io/reddit/comment/search/?link_id=86sfii
https://api.pushshift.io/reddit/comment/search/?link_id=86sh34
https://api.pushshift.io/reddit/comment/search/?link_id=86stdd
https://api.pushshift.io/reddit/comment/search/?link_id=86suw3
https://api.pushshift.io/reddit/comment/search/?link_id=86swx0
https://api.pushshift.io/reddit/comment/search/?link_id=86syu8
https://api.pushshift.io/reddit/comment/search/?link_id=86t3fd
https://api.pushshift.io/reddit/comment/search/?link_id=86tmpq
https://api.pushshift.io/reddit/comment/search/?link_id=86tp3j
https://api.pushshift.io/reddit/comment/search/?link_id=86tpng
https://api.pushshift.io/reddit/comment/search/?link_id=86tqxp
https://api.pushshift.io/reddit/comment/search/?link_id=86truf
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=879m60
https://api.pushshift.io/reddit/comment/search/?link_id=879pdb
https://api.pushshift.io/reddit/comment/search/?link_id=879qbv
https://api.pushshift.io/reddit/comment/search/?link_id=879y9f
https://api.pushshift.io/reddit/comment/search/?link_id=87a4xl
https://api.pushshift.io/reddit/comment/search/?link_id=87a6z5
https://api.pushshift.io/reddit/comment/search/?link_id=87a8ji
https://api.pushshift.io/reddit/comment/search/?link_id=87ab0y
https://api.pushshift.io/reddit/comment/search/?link_id=87af3i
https://api.pushshift.io/reddit/comment/search/?link_id=87ak0f
https://api.pushshift.io/reddit/comment/search/?link_id=87arcg
https://api.pushshift.io/reddit/comment/search/?link_id=87b5lr
https://api.pushshift.io/reddit/comment/search/?link_id=87b90o
https://api.pushshift.io/reddit/comment/search/?link_id=87b9lx
https://api.pushshift.io/reddit/comment/search/?link_id=87bfnm
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=87pkf0
https://api.pushshift.io/reddit/comment/search/?link_id=87pmco
https://api.pushshift.io/reddit/comment/search/?link_id=87pmdm
https://api.pushshift.io/reddit/comment/search/?link_id=87psee
https://api.pushshift.io/reddit/comment/search/?link_id=87pu42
https://api.pushshift.io/reddit/comment/search/?link_id=87pvuv
https://api.pushshift.io/reddit/comment/search/?link_id=87q06n
https://api.pushshift.io/reddit/comment/search/?link_id=87q18i
https://api.pushshift.io/reddit/comment/search/?link_id=87q2yo
https://api.pushshift.io/reddit/comment/search/?link_id=87q34m
https://api.pushshift.io/reddit/comment/search/?link_id=87q3k3
https://api.pushshift.io/reddit/comment/search/?link_id=87q4ls
https://api.pushshift.io/reddit/comment/search/?link_id=87q54p
https://api.pushshift.io/reddit/comment/search/?link_id=87q5u4
https://api.pushshift.io/reddit/comment/search/?link_id=87q6jr
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=887ecv
https://api.pushshift.io/reddit/comment/search/?link_id=887iyd
https://api.pushshift.io/reddit/comment/search/?link_id=887oxu
https://api.pushshift.io/reddit/comment/search/?link_id=887ts5
https://api.pushshift.io/reddit/comment/search/?link_id=888121
https://api.pushshift.io/reddit/comment/search/?link_id=8886sn
https://api.pushshift.io/reddit/comment/search/?link_id=8888db
https://api.pushshift.io/reddit/comment/search/?link_id=888dyi
https://api.pushshift.io/reddit/comment/search/?link_id=888eq4
https://api.pushshift.io/reddit/comment/search/?link_id=888f22
https://api.pushshift.io/reddit/comment/search/?link_id=888h44
https://api.pushshift.io/reddit/comment/search/?link_id=888l1g
https://api.pushshift.io/reddit/comment/search/?link_id=888lk5
https://api.pushshift.io/reddit/comment/search/?link_id=888n7p
https://api.pushshift.io/reddit/comment/search/?link_id=888twv
https://api.pushshift.io/reddit/comment/search/?link_id

In [8]:
# Make list of comments into a single string
# submissions.comments = submissions.comments.apply(combine_comments)

In [9]:
# Preview the first row, now including the 'comments' column
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,810z2h,2018-03-01 00:09:01,1519862941,Michael Kors smartwatch is killing it!,,1,1,0,"[Hello, /u/ReeceAzza! Thanks for contributing..."


In [10]:
# Construct output_df, which includes all the info we need from submissions
#output_df = submissions[['subreddit', 'id', 'created_time', 'created_utc', 'num_comments', 'score', 'gilded']]

# Combine the submission title, body, and comments into a single column called submission_text
#output_df['submission_text'] = (submissions['title'].map(str) + 
#                                submissions['selftext'].map(str) + 
#                                submissions['comments'].map(str))

# Use the clearer column name 'submission_id' in place of 'id'
submissions.rename({'id': 'submission_id'}, axis='columns', inplace=True)

In [11]:
# Preview the first row with the renamed 'submission_id' column
submissions.head(n=1)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,810z2h,2018-03-01 00:09:01,1519862941,Michael Kors smartwatch is killing it!,,1,1,0,"[Hello, /u/ReeceAzza! Thanks for contributing..."


In [12]:
# Pickle
import pickle

# Serialize the submissions dataframe to disk so another project file can load it
with open('2018-03-01_to_2018-04-01', 'wb') as out_file:
    pickle.dump(submissions, out_file)