# Function Definitions and Imports

In [1]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd

from datetime import datetime, timedelta, timezone

In [2]:
# Get all the submissions for a specific subreddit in the given timeframe
# subreddit = string of subreddit to scrape
# after = epoch time (earliest submissions)
# before = epoch time (latest submissions)
def get_submissions(subreddit, after, before):
    
    url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
           str(subreddit) +  # Subreddit to scrape
           '&after=' + 
           str(after) +  # Scrape posts after (UTC format)
           '&before=' + 
           str(before) +  # Scrape posts after (UTC format)
           '&limit=1000')  # Can get at max 1000 submissions from pushshift at a time

    print(url)
    response = requests.get(url)
    submissions = json.loads(response.text)

    submissions = submissions['data']  # Only key in submissions is 'data'
    
    # Columns to include in submissions dataframe
    submission_columns = ['subreddit',     # Subreddit name
                          'id',            # Post ID
                          'created_utc',   # UTC time post was created
                          'title',         # Post title
                          'selftext',      # Post body
                          'num_comments',  # Number of comments on post
                          'score',         # Number of upvotes
                          'gilded'         # Number of silver/gold/platinum badges
                         ]
    
    # Create dataframe, where each row contains a submission
    submissions_df = pd.DataFrame(submissions).loc[:, submission_columns]

    while len(submissions) == 1000:
        
        after = submissions_df.created_utc.iloc[-1]  # Query from latest time of previous query
        print(after)
        
        url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
       str(subreddit) +  # Subreddit to scrape
       '&after=' + 
       str(after) +  # Scrape posts after (UTC format)
       '&before=' + 
       str(before) +  # Scrape posts after (UTC format)
       '&limit=1000')  # Can get at max 1000 submissions from pushshift at a time
        
        print(url)
        
        # Sometimes there is no data, or there's a moderator comment. In that case, pass
        try:
            response = requests.get(url)
            submissions = json.loads(response.text)
        

            submissions = submissions['data']  # Only key in submissions is 'data'

            print(len(submissions))  # Loop won't occur again if < 1000 submissions were found
                                     # since that means all submissions were queried

            # Append data to dataframe
            submissions_df = pd.concat([submissions_df, 
                                        pd.DataFrame(submissions).loc[:, submission_columns]],
                                      ignore_index=True)
        except:
            pass
        
    
    submissions_df.insert(2, 'created_time', np.nan)  # Create new column for time, with values initialized to NaN
    
    # Convert UTC time to datetime
    submissions_df['created_time'] = [datetime.utcfromtimestamp(utc).strftime('%Y-%m-%d %H:%M:%S') 
                                      for utc in submissions_df['created_utc']]
    
    return submissions_df

In [3]:
# Get list of comments for a submission
def get_submission_comments(submission_id):
    
    url = ('https://api.pushshift.io/reddit/comment/search/?link_id=' + 
           str(submission_id))
    
    print(url)
    response = requests.get(url)
    
    # Sometimes there is no data, or there's a moderator comment. In that case, pass
    try:
        comments_all_data = json.loads(response.text)['data']  # List of dicts

        # Get only the comment ('body') for each comment block
        # Comment block includes extraneous info (author, score, etc.)
        comments = [comment_block['body'] for comment_block in comments_all_data]

        return comments # Return list of comments
    except:
        pass

In [4]:
# Given list of comments, combine them into a single string
def combine_comments(comments):
    try:
        return ' '.join([str(elem) for elem in comments]) 
    except:
        pass

# Data Manipulation

__The following line of code is the only thing that has to be changed in this file:__

In [5]:
# Create DF with submissions between the following epoch times 
# Comments have not been added yet
submissions = get_submissions(subreddit='gadgets', 
                              after='2018-02-01', 
                              before='2018-03-01')


https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=2018-02-01&before=2018-03-01&limit=1000


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


1518904986
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1518904986&before=2018-03-01&limit=1000
682


In [6]:
submissions.head(1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded
0,gadgets,7uevfi,2018-02-01 00:52:11,1517446331,"J. Cole, Future, Cardi B, Migos",,0,1,


In [7]:
# Create column 'comments' which contains list of comments for the submission
submissions['comments'] = submissions.id.apply(get_submission_comments)

https://api.pushshift.io/reddit/comment/search/?link_id=7uevfi
https://api.pushshift.io/reddit/comment/search/?link_id=7ufq82
https://api.pushshift.io/reddit/comment/search/?link_id=7ufrsy
https://api.pushshift.io/reddit/comment/search/?link_id=7uftrl
https://api.pushshift.io/reddit/comment/search/?link_id=7ufx9n
https://api.pushshift.io/reddit/comment/search/?link_id=7ufxix
https://api.pushshift.io/reddit/comment/search/?link_id=7ug1si
https://api.pushshift.io/reddit/comment/search/?link_id=7ug4ct
https://api.pushshift.io/reddit/comment/search/?link_id=7ug9u7
https://api.pushshift.io/reddit/comment/search/?link_id=7ugan6
https://api.pushshift.io/reddit/comment/search/?link_id=7ugcm5
https://api.pushshift.io/reddit/comment/search/?link_id=7ugftr
https://api.pushshift.io/reddit/comment/search/?link_id=7ugi31
https://api.pushshift.io/reddit/comment/search/?link_id=7ugltu
https://api.pushshift.io/reddit/comment/search/?link_id=7ugrg3
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7uwf7b
https://api.pushshift.io/reddit/comment/search/?link_id=7uwsul
https://api.pushshift.io/reddit/comment/search/?link_id=7ux24k
https://api.pushshift.io/reddit/comment/search/?link_id=7ux33y
https://api.pushshift.io/reddit/comment/search/?link_id=7ux808
https://api.pushshift.io/reddit/comment/search/?link_id=7ux9ql
https://api.pushshift.io/reddit/comment/search/?link_id=7uxbz1
https://api.pushshift.io/reddit/comment/search/?link_id=7uxj1e
https://api.pushshift.io/reddit/comment/search/?link_id=7uxphs
https://api.pushshift.io/reddit/comment/search/?link_id=7uxtde
https://api.pushshift.io/reddit/comment/search/?link_id=7uxyai
https://api.pushshift.io/reddit/comment/search/?link_id=7uy3qw
https://api.pushshift.io/reddit/comment/search/?link_id=7uy8rz
https://api.pushshift.io/reddit/comment/search/?link_id=7uyb7u
https://api.pushshift.io/reddit/comment/search/?link_id=7uybum
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7vezh4
https://api.pushshift.io/reddit/comment/search/?link_id=7vf0o3
https://api.pushshift.io/reddit/comment/search/?link_id=7vf3r1
https://api.pushshift.io/reddit/comment/search/?link_id=7vfj6c
https://api.pushshift.io/reddit/comment/search/?link_id=7vflrs
https://api.pushshift.io/reddit/comment/search/?link_id=7vfmk2
https://api.pushshift.io/reddit/comment/search/?link_id=7vfsbl
https://api.pushshift.io/reddit/comment/search/?link_id=7vfuju
https://api.pushshift.io/reddit/comment/search/?link_id=7vfvik
https://api.pushshift.io/reddit/comment/search/?link_id=7vfy2v
https://api.pushshift.io/reddit/comment/search/?link_id=7vg7u3
https://api.pushshift.io/reddit/comment/search/?link_id=7vgaoi
https://api.pushshift.io/reddit/comment/search/?link_id=7vgi6k
https://api.pushshift.io/reddit/comment/search/?link_id=7vgne9
https://api.pushshift.io/reddit/comment/search/?link_id=7vhfym
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7vwkog
https://api.pushshift.io/reddit/comment/search/?link_id=7vwumw
https://api.pushshift.io/reddit/comment/search/?link_id=7vww78
https://api.pushshift.io/reddit/comment/search/?link_id=7vwzli
https://api.pushshift.io/reddit/comment/search/?link_id=7vx8mu
https://api.pushshift.io/reddit/comment/search/?link_id=7vx9eg
https://api.pushshift.io/reddit/comment/search/?link_id=7vxen4
https://api.pushshift.io/reddit/comment/search/?link_id=7vxju7
https://api.pushshift.io/reddit/comment/search/?link_id=7vxmfq
https://api.pushshift.io/reddit/comment/search/?link_id=7vxumu
https://api.pushshift.io/reddit/comment/search/?link_id=7vxv8d
https://api.pushshift.io/reddit/comment/search/?link_id=7vxvvq
https://api.pushshift.io/reddit/comment/search/?link_id=7vxx8n
https://api.pushshift.io/reddit/comment/search/?link_id=7vxzkc
https://api.pushshift.io/reddit/comment/search/?link_id=7vyktz
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7wdmr0
https://api.pushshift.io/reddit/comment/search/?link_id=7wdod6
https://api.pushshift.io/reddit/comment/search/?link_id=7wdpqs
https://api.pushshift.io/reddit/comment/search/?link_id=7wdv0t
https://api.pushshift.io/reddit/comment/search/?link_id=7we1xc
https://api.pushshift.io/reddit/comment/search/?link_id=7we4c1
https://api.pushshift.io/reddit/comment/search/?link_id=7we6ks
https://api.pushshift.io/reddit/comment/search/?link_id=7we6s6
https://api.pushshift.io/reddit/comment/search/?link_id=7we7b9
https://api.pushshift.io/reddit/comment/search/?link_id=7wefq2
https://api.pushshift.io/reddit/comment/search/?link_id=7wekat
https://api.pushshift.io/reddit/comment/search/?link_id=7weq3v
https://api.pushshift.io/reddit/comment/search/?link_id=7wessg
https://api.pushshift.io/reddit/comment/search/?link_id=7wf7cm
https://api.pushshift.io/reddit/comment/search/?link_id=7wf7ph
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7wzvx5
https://api.pushshift.io/reddit/comment/search/?link_id=7x00pm
https://api.pushshift.io/reddit/comment/search/?link_id=7x06b0
https://api.pushshift.io/reddit/comment/search/?link_id=7x07b9
https://api.pushshift.io/reddit/comment/search/?link_id=7x07gc
https://api.pushshift.io/reddit/comment/search/?link_id=7x07ue
https://api.pushshift.io/reddit/comment/search/?link_id=7x0dr0
https://api.pushshift.io/reddit/comment/search/?link_id=7x0l86
https://api.pushshift.io/reddit/comment/search/?link_id=7x0va2
https://api.pushshift.io/reddit/comment/search/?link_id=7x0yp8
https://api.pushshift.io/reddit/comment/search/?link_id=7x0z28
https://api.pushshift.io/reddit/comment/search/?link_id=7x12ou
https://api.pushshift.io/reddit/comment/search/?link_id=7x1d9x
https://api.pushshift.io/reddit/comment/search/?link_id=7x1ihf
https://api.pushshift.io/reddit/comment/search/?link_id=7x1ngn
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7xg9yu
https://api.pushshift.io/reddit/comment/search/?link_id=7xggvv
https://api.pushshift.io/reddit/comment/search/?link_id=7xgljb
https://api.pushshift.io/reddit/comment/search/?link_id=7xgmdk
https://api.pushshift.io/reddit/comment/search/?link_id=7xgn8z
https://api.pushshift.io/reddit/comment/search/?link_id=7xgte0
https://api.pushshift.io/reddit/comment/search/?link_id=7xgx6p
https://api.pushshift.io/reddit/comment/search/?link_id=7xh15o
https://api.pushshift.io/reddit/comment/search/?link_id=7xhhnr
https://api.pushshift.io/reddit/comment/search/?link_id=7xhkxz
https://api.pushshift.io/reddit/comment/search/?link_id=7xhn7f
https://api.pushshift.io/reddit/comment/search/?link_id=7xhpk4
https://api.pushshift.io/reddit/comment/search/?link_id=7xhrmo
https://api.pushshift.io/reddit/comment/search/?link_id=7xhstr
https://api.pushshift.io/reddit/comment/search/?link_id=7xi99m
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7xxsr9
https://api.pushshift.io/reddit/comment/search/?link_id=7xxv2v
https://api.pushshift.io/reddit/comment/search/?link_id=7xxxri
https://api.pushshift.io/reddit/comment/search/?link_id=7xy79a
https://api.pushshift.io/reddit/comment/search/?link_id=7xy8ix
https://api.pushshift.io/reddit/comment/search/?link_id=7xy967
https://api.pushshift.io/reddit/comment/search/?link_id=7xyai6
https://api.pushshift.io/reddit/comment/search/?link_id=7xycac
https://api.pushshift.io/reddit/comment/search/?link_id=7xyerl
https://api.pushshift.io/reddit/comment/search/?link_id=7xyh2i
https://api.pushshift.io/reddit/comment/search/?link_id=7xyp8v
https://api.pushshift.io/reddit/comment/search/?link_id=7xypw3
https://api.pushshift.io/reddit/comment/search/?link_id=7xyt38
https://api.pushshift.io/reddit/comment/search/?link_id=7xyy5o
https://api.pushshift.io/reddit/comment/search/?link_id=7xz8pb
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7ykexd
https://api.pushshift.io/reddit/comment/search/?link_id=7ykkv3
https://api.pushshift.io/reddit/comment/search/?link_id=7ykxh9
https://api.pushshift.io/reddit/comment/search/?link_id=7yl1rn
https://api.pushshift.io/reddit/comment/search/?link_id=7yl6mu
https://api.pushshift.io/reddit/comment/search/?link_id=7yl9pn
https://api.pushshift.io/reddit/comment/search/?link_id=7ylamy
https://api.pushshift.io/reddit/comment/search/?link_id=7ylan6
https://api.pushshift.io/reddit/comment/search/?link_id=7ylbjj
https://api.pushshift.io/reddit/comment/search/?link_id=7ylgfl
https://api.pushshift.io/reddit/comment/search/?link_id=7yll0t
https://api.pushshift.io/reddit/comment/search/?link_id=7ylmpv
https://api.pushshift.io/reddit/comment/search/?link_id=7ylmtt
https://api.pushshift.io/reddit/comment/search/?link_id=7ylojl
https://api.pushshift.io/reddit/comment/search/?link_id=7ylowh
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7z2wsh
https://api.pushshift.io/reddit/comment/search/?link_id=7z2z27
https://api.pushshift.io/reddit/comment/search/?link_id=7z34rb
https://api.pushshift.io/reddit/comment/search/?link_id=7z3962
https://api.pushshift.io/reddit/comment/search/?link_id=7z3bd8
https://api.pushshift.io/reddit/comment/search/?link_id=7z3iq9
https://api.pushshift.io/reddit/comment/search/?link_id=7z3lo9
https://api.pushshift.io/reddit/comment/search/?link_id=7z47li
https://api.pushshift.io/reddit/comment/search/?link_id=7z47z7
https://api.pushshift.io/reddit/comment/search/?link_id=7z4jzi
https://api.pushshift.io/reddit/comment/search/?link_id=7z4kkz
https://api.pushshift.io/reddit/comment/search/?link_id=7z4ky9
https://api.pushshift.io/reddit/comment/search/?link_id=7z4lh3
https://api.pushshift.io/reddit/comment/search/?link_id=7z4lnc
https://api.pushshift.io/reddit/comment/search/?link_id=7z4lpb
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7zmu5n
https://api.pushshift.io/reddit/comment/search/?link_id=7zmvor
https://api.pushshift.io/reddit/comment/search/?link_id=7zmwkf
https://api.pushshift.io/reddit/comment/search/?link_id=7zn3jj
https://api.pushshift.io/reddit/comment/search/?link_id=7zn6ek
https://api.pushshift.io/reddit/comment/search/?link_id=7zn7zd
https://api.pushshift.io/reddit/comment/search/?link_id=7zn9mt
https://api.pushshift.io/reddit/comment/search/?link_id=7zncr6
https://api.pushshift.io/reddit/comment/search/?link_id=7znkki
https://api.pushshift.io/reddit/comment/search/?link_id=7znqio
https://api.pushshift.io/reddit/comment/search/?link_id=7znuhc
https://api.pushshift.io/reddit/comment/search/?link_id=7znwel
https://api.pushshift.io/reddit/comment/search/?link_id=7znyqy
https://api.pushshift.io/reddit/comment/search/?link_id=7zo0zx
https://api.pushshift.io/reddit/comment/search/?link_id=7zo3r5
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=805wdj
https://api.pushshift.io/reddit/comment/search/?link_id=805wlw
https://api.pushshift.io/reddit/comment/search/?link_id=805wyw
https://api.pushshift.io/reddit/comment/search/?link_id=805yj3
https://api.pushshift.io/reddit/comment/search/?link_id=805z46
https://api.pushshift.io/reddit/comment/search/?link_id=8061c9
https://api.pushshift.io/reddit/comment/search/?link_id=8061hu
https://api.pushshift.io/reddit/comment/search/?link_id=80631c
https://api.pushshift.io/reddit/comment/search/?link_id=806345
https://api.pushshift.io/reddit/comment/search/?link_id=8066bp
https://api.pushshift.io/reddit/comment/search/?link_id=80673f
https://api.pushshift.io/reddit/comment/search/?link_id=8069ux
https://api.pushshift.io/reddit/comment/search/?link_id=806a6t
https://api.pushshift.io/reddit/comment/search/?link_id=806ajj
https://api.pushshift.io/reddit/comment/search/?link_id=806bys
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=80mu0o
https://api.pushshift.io/reddit/comment/search/?link_id=80mv1r
https://api.pushshift.io/reddit/comment/search/?link_id=80mxmt
https://api.pushshift.io/reddit/comment/search/?link_id=80n35u
https://api.pushshift.io/reddit/comment/search/?link_id=80n68f
https://api.pushshift.io/reddit/comment/search/?link_id=80n9zq
https://api.pushshift.io/reddit/comment/search/?link_id=80naf5
https://api.pushshift.io/reddit/comment/search/?link_id=80ndh6
https://api.pushshift.io/reddit/comment/search/?link_id=80nfd7
https://api.pushshift.io/reddit/comment/search/?link_id=80njds
https://api.pushshift.io/reddit/comment/search/?link_id=80nltb
https://api.pushshift.io/reddit/comment/search/?link_id=80nmur
https://api.pushshift.io/reddit/comment/search/?link_id=80o6td
https://api.pushshift.io/reddit/comment/search/?link_id=80oauy
https://api.pushshift.io/reddit/comment/search/?link_id=80oc5z
https://api.pushshift.io/reddit/comment/search/?link_id

In [8]:
# Make list of comments into a single string
# submissions.comments = submissions.comments.apply(combine_comments)

In [10]:
# Construct output_df, which includes all the info we need from submissions
#output_df = submissions[['subreddit', 'id', 'created_time', 'created_utc', 'num_comments', 'score', 'gilded']]

# Combine the submission title, body, and comments into a single column called submission_text
#output_df['submission_text'] = (submissions['title'].map(str) + 
#                                submissions['selftext'].map(str) + 
#                                submissions['comments'].map(str))

# Rename id to submission_id (for clarity)
submissions.rename(columns={'id':'submission_id'}, inplace=True)

In [11]:
submissions.head(1)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,7uevfi,2018-02-01 00:52:11,1517446331,"J. Cole, Future, Cardi B, Migos",,0,1,,[]


In [12]:
# Pickle 
import pickle

# Pickle dataframe to use in other project file
with open('2018-02-01_to_2018-03-01', 'wb') as picklefile:
    pickle.dump(submissions, picklefile)