# Function Definitions and Imports

In [6]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd

from datetime import datetime, timedelta, timezone

In [7]:
# Build the Pushshift search URL for one page of submissions
def _build_submissions_url(subreddit, after, before, limit):
    return ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
            str(subreddit) +  # Subreddit to scrape
            '&after=' +
            str(after) +      # Scrape posts after this time
            '&before=' +
            str(before) +     # Scrape posts before this time
            '&limit=' +
            str(limit))       # Maximum number of submissions per request


# Download one page of results and return the list stored under 'data'
def _fetch_submissions_page(url):
    response = requests.get(url)
    return json.loads(response.text)['data']  # Only key in the payload is 'data'


# Get all the submissions for a specific subreddit in the given timeframe
def get_submissions(subreddit, after, before, limit=1000):
    """Scrape the submissions of one subreddit between two points in time.

    Results are fetched one page at a time. While a page comes back full
    (exactly `limit` rows), another request is issued whose `after` value is
    the `created_utc` of the last row received so far.

    Parameters
    ----------
    subreddit : string of subreddit to scrape
    after : earliest submissions. Rendered into the URL with str(); the
        notebook passes 'YYYY-MM-DD' strings and pagination passes epoch
        seconds.
    before : latest submissions, rendered the same way as `after`.
    limit : page size requested from the API, and the "page was full"
        threshold that triggers another request. Defaults to 1000, which the
        original code describes as the most Pushshift returns at a time.

    Returns
    -------
    pandas.DataFrame with one row per submission and the columns subreddit,
    id, created_time, created_utc, title, selftext, num_comments, score,
    gilded. `created_time` is `created_utc` formatted as a
    'YYYY-MM-DD HH:MM:SS' UTC string. Columns absent from the API response
    are filled with NaN, and an empty frame (same columns) is returned when
    nothing matched.

    Raises
    ------
    Whatever the first request raises (a requests exception, ValueError for a
    non-JSON body, KeyError when 'data' is missing). A failure on a later
    page does not raise: pagination stops and the rows collected so far are
    returned.
    """
    # Columns to include in submissions dataframe
    submission_columns = ['subreddit',     # Subreddit name
                          'id',            # Post ID
                          'created_utc',   # UTC time post was created
                          'title',         # Post title
                          'selftext',      # Post body
                          'num_comments',  # Number of comments on post
                          'score',         # Number of upvotes
                          'gilded'         # Number of silver/gold/platinum badges
                         ]

    url = _build_submissions_url(subreddit, after, before, limit)
    print(url)
    submissions = _fetch_submissions_page(url)

    # Accumulate raw records and build the dataframe once at the end,
    # instead of concatenating a new dataframe per page
    all_submissions = list(submissions)

    while len(submissions) == limit:

        after = submissions[-1]['created_utc']  # Query from latest time of previous query
        print(after)

        url = _build_submissions_url(subreddit, after, before, limit)
        print(url)

        # Sometimes there is no data, or the response is not the expected JSON.
        # Retrying the identical request would loop forever on a persistent
        # failure, so stop paginating and keep what was already collected.
        try:
            submissions = _fetch_submissions_page(url)
        except (requests.RequestException, ValueError, KeyError, TypeError) as error:
            print('Stopping pagination early: ' + repr(error))
            break

        print(len(submissions))  # Loop won't occur again if < limit submissions were found
                                 # since that means all submissions were queried

        all_submissions.extend(submissions)

    # Create dataframe, where each row contains a submission.
    # reindex (unlike .loc) tolerates an empty result or a missing column.
    submissions_df = pd.DataFrame(all_submissions).reindex(columns=submission_columns)

    # Convert UTC epoch seconds to a readable UTC timestamp string
    created_time = [datetime.fromtimestamp(utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
                    for utc in submissions_df['created_utc']]
    submissions_df.insert(2, 'created_time', created_time)

    return submissions_df

In [8]:
# Get list of comments for a submission
def get_submission_comments(submission_id):
    """Fetch the comment bodies Pushshift returns for one submission.

    Parameters
    ----------
    submission_id : id of the submission; rendered with str() into the
        `link_id` query parameter.

    Returns
    -------
    List of comment body strings (possibly empty), or None when the response
    is not JSON, has no 'data' entry, or a comment has no 'body'.

    A failure of the HTTP request itself is not caught and propagates.

    NOTE(review): no page-size parameter is sent, so only the API's default
    number of comments per request is retrieved - confirm that this is enough.
    """
    url = ('https://api.pushshift.io/reddit/comment/search/?link_id=' +
           str(submission_id))

    print(url)
    response = requests.get(url)

    # Sometimes there is no data, or the response is not the expected JSON.
    # Only parsing/shape errors are treated as "no comments"; anything else
    # (including KeyboardInterrupt during a long scrape) is allowed through.
    try:
        comments_all_data = json.loads(response.text)['data']  # List of dicts

        # Get only the comment ('body') for each comment block
        # Comment block includes extraneous info (author, score, etc.)
        return [comment_block['body'] for comment_block in comments_all_data]
    except (ValueError, KeyError, TypeError):
        return None

In [9]:
# Given list of comments, combine them into a single string
def combine_comments(comments):
    """Join the str() of every element of `comments` with single spaces.

    Returns the joined string ('' for an empty list), or None when `comments`
    cannot be iterated (for example None, as returned by a failed comment
    lookup, or a float NaN).
    """
    try:
        return ' '.join(str(elem) for elem in comments)
    except TypeError:
        # Non-iterable input means "no comments available"; other errors
        # (including KeyboardInterrupt) are deliberately not swallowed
        return None

# Data Manipulation

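__The following cell (subreddit and date range) is the main thing that has to be changed in this file. The pickle filename in the last cell is hard-coded to the same date range, so update it to match:__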

In [10]:
# Scrape the submissions posted to the chosen subreddit inside the date window.
# The window bounds are passed as date strings; comments are attached later.
target_subreddit = 'gadgets'
window_start = '2018-06-01'
window_end = '2018-07-01'

submissions = get_submissions(target_subreddit, window_start, window_end)


https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=2018-06-01&before=2018-07-01&limit=1000
1529513006
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1529513006&before=2018-07-01&limit=1000
556


In [11]:
# Peek at the first scraped row to confirm the expected columns are present
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded
0,gadgets,8nntc5,2018-06-01 01:09:11,1527815351,SearchCap: DoubleClick bid manager and Google ...,,0,1,0


In [12]:
# Look up the comments of every submission (one request per id) and store
# the resulting list in a new 'comments' column
submissions['comments'] = submissions['id'].apply(get_submission_comments)

https://api.pushshift.io/reddit/comment/search/?link_id=8nntc5
https://api.pushshift.io/reddit/comment/search/?link_id=8nnugc
https://api.pushshift.io/reddit/comment/search/?link_id=8no7oe
https://api.pushshift.io/reddit/comment/search/?link_id=8no7t7
https://api.pushshift.io/reddit/comment/search/?link_id=8no8ce
https://api.pushshift.io/reddit/comment/search/?link_id=8noaoc
https://api.pushshift.io/reddit/comment/search/?link_id=8nob2t
https://api.pushshift.io/reddit/comment/search/?link_id=8nofwd
https://api.pushshift.io/reddit/comment/search/?link_id=8noq6t
https://api.pushshift.io/reddit/comment/search/?link_id=8np1ra
https://api.pushshift.io/reddit/comment/search/?link_id=8npaqz
https://api.pushshift.io/reddit/comment/search/?link_id=8nph82
https://api.pushshift.io/reddit/comment/search/?link_id=8npi86
https://api.pushshift.io/reddit/comment/search/?link_id=8npinz
https://api.pushshift.io/reddit/comment/search/?link_id=8npllb
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8oes38
https://api.pushshift.io/reddit/comment/search/?link_id=8oeti2
https://api.pushshift.io/reddit/comment/search/?link_id=8oeu3l
https://api.pushshift.io/reddit/comment/search/?link_id=8oexvo
https://api.pushshift.io/reddit/comment/search/?link_id=8ofcth
https://api.pushshift.io/reddit/comment/search/?link_id=8ofosu
https://api.pushshift.io/reddit/comment/search/?link_id=8ofp6c
https://api.pushshift.io/reddit/comment/search/?link_id=8ofpi5
https://api.pushshift.io/reddit/comment/search/?link_id=8ofu57
https://api.pushshift.io/reddit/comment/search/?link_id=8ofza7
https://api.pushshift.io/reddit/comment/search/?link_id=8og0a4
https://api.pushshift.io/reddit/comment/search/?link_id=8og7yq
https://api.pushshift.io/reddit/comment/search/?link_id=8ogd64
https://api.pushshift.io/reddit/comment/search/?link_id=8ogony
https://api.pushshift.io/reddit/comment/search/?link_id=8ogpag
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8p18og
https://api.pushshift.io/reddit/comment/search/?link_id=8p1dp4
https://api.pushshift.io/reddit/comment/search/?link_id=8p1hxn
https://api.pushshift.io/reddit/comment/search/?link_id=8p1l2a
https://api.pushshift.io/reddit/comment/search/?link_id=8p1p5r
https://api.pushshift.io/reddit/comment/search/?link_id=8p1whv
https://api.pushshift.io/reddit/comment/search/?link_id=8p20qv
https://api.pushshift.io/reddit/comment/search/?link_id=8p2990
https://api.pushshift.io/reddit/comment/search/?link_id=8p2chc
https://api.pushshift.io/reddit/comment/search/?link_id=8p2im5
https://api.pushshift.io/reddit/comment/search/?link_id=8p2pbm
https://api.pushshift.io/reddit/comment/search/?link_id=8p2qt9
https://api.pushshift.io/reddit/comment/search/?link_id=8p2sgd
https://api.pushshift.io/reddit/comment/search/?link_id=8p2vkx
https://api.pushshift.io/reddit/comment/search/?link_id=8p2wdn
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8ppguy
https://api.pushshift.io/reddit/comment/search/?link_id=8ppqqn
https://api.pushshift.io/reddit/comment/search/?link_id=8ppu79
https://api.pushshift.io/reddit/comment/search/?link_id=8ppxpr
https://api.pushshift.io/reddit/comment/search/?link_id=8pq64p
https://api.pushshift.io/reddit/comment/search/?link_id=8pqj2h
https://api.pushshift.io/reddit/comment/search/?link_id=8pqqxt
https://api.pushshift.io/reddit/comment/search/?link_id=8pqr2b
https://api.pushshift.io/reddit/comment/search/?link_id=8pqs8r
https://api.pushshift.io/reddit/comment/search/?link_id=8pqu4t
https://api.pushshift.io/reddit/comment/search/?link_id=8pr1lp
https://api.pushshift.io/reddit/comment/search/?link_id=8prf3t
https://api.pushshift.io/reddit/comment/search/?link_id=8prfcy
https://api.pushshift.io/reddit/comment/search/?link_id=8prggi
https://api.pushshift.io/reddit/comment/search/?link_id=8prhs2
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8qe6m8
https://api.pushshift.io/reddit/comment/search/?link_id=8qe7d2
https://api.pushshift.io/reddit/comment/search/?link_id=8qe821
https://api.pushshift.io/reddit/comment/search/?link_id=8qet94
https://api.pushshift.io/reddit/comment/search/?link_id=8qex83
https://api.pushshift.io/reddit/comment/search/?link_id=8qfady
https://api.pushshift.io/reddit/comment/search/?link_id=8qfi2j
https://api.pushshift.io/reddit/comment/search/?link_id=8qg3da
https://api.pushshift.io/reddit/comment/search/?link_id=8qg6qx
https://api.pushshift.io/reddit/comment/search/?link_id=8qgb7e
https://api.pushshift.io/reddit/comment/search/?link_id=8qgm11
https://api.pushshift.io/reddit/comment/search/?link_id=8qgnip
https://api.pushshift.io/reddit/comment/search/?link_id=8qgsqm
https://api.pushshift.io/reddit/comment/search/?link_id=8qgzci
https://api.pushshift.io/reddit/comment/search/?link_id=8qh3r9
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8quedg
https://api.pushshift.io/reddit/comment/search/?link_id=8quryi
https://api.pushshift.io/reddit/comment/search/?link_id=8qvdhz
https://api.pushshift.io/reddit/comment/search/?link_id=8qvdn4
https://api.pushshift.io/reddit/comment/search/?link_id=8qvelp
https://api.pushshift.io/reddit/comment/search/?link_id=8qvugc
https://api.pushshift.io/reddit/comment/search/?link_id=8qvxr6
https://api.pushshift.io/reddit/comment/search/?link_id=8qwhd7
https://api.pushshift.io/reddit/comment/search/?link_id=8qx27c
https://api.pushshift.io/reddit/comment/search/?link_id=8qxhxm
https://api.pushshift.io/reddit/comment/search/?link_id=8qyhig
https://api.pushshift.io/reddit/comment/search/?link_id=8qykmq
https://api.pushshift.io/reddit/comment/search/?link_id=8qyppi
https://api.pushshift.io/reddit/comment/search/?link_id=8qyq19
https://api.pushshift.io/reddit/comment/search/?link_id=8qyy19
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8rkxhb
https://api.pushshift.io/reddit/comment/search/?link_id=8rlc5w
https://api.pushshift.io/reddit/comment/search/?link_id=8rlf0p
https://api.pushshift.io/reddit/comment/search/?link_id=8rlryu
https://api.pushshift.io/reddit/comment/search/?link_id=8rm5c4
https://api.pushshift.io/reddit/comment/search/?link_id=8rn4ap
https://api.pushshift.io/reddit/comment/search/?link_id=8rndh8
https://api.pushshift.io/reddit/comment/search/?link_id=8rnvat
https://api.pushshift.io/reddit/comment/search/?link_id=8ro1hc
https://api.pushshift.io/reddit/comment/search/?link_id=8ro5vs
https://api.pushshift.io/reddit/comment/search/?link_id=8rodhj
https://api.pushshift.io/reddit/comment/search/?link_id=8rone8
https://api.pushshift.io/reddit/comment/search/?link_id=8rou9p
https://api.pushshift.io/reddit/comment/search/?link_id=8row0x
https://api.pushshift.io/reddit/comment/search/?link_id=8rp58o
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8s8y8e
https://api.pushshift.io/reddit/comment/search/?link_id=8s92t4
https://api.pushshift.io/reddit/comment/search/?link_id=8s9exp
https://api.pushshift.io/reddit/comment/search/?link_id=8s9iho
https://api.pushshift.io/reddit/comment/search/?link_id=8s9oe6
https://api.pushshift.io/reddit/comment/search/?link_id=8s9wuu
https://api.pushshift.io/reddit/comment/search/?link_id=8s9xue
https://api.pushshift.io/reddit/comment/search/?link_id=8s9yk5
https://api.pushshift.io/reddit/comment/search/?link_id=8s9z8o
https://api.pushshift.io/reddit/comment/search/?link_id=8sa6ew
https://api.pushshift.io/reddit/comment/search/?link_id=8sa9z6
https://api.pushshift.io/reddit/comment/search/?link_id=8saa6d
https://api.pushshift.io/reddit/comment/search/?link_id=8sadkw
https://api.pushshift.io/reddit/comment/search/?link_id=8saffv
https://api.pushshift.io/reddit/comment/search/?link_id=8safk2
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8ssfbn
https://api.pushshift.io/reddit/comment/search/?link_id=8ssftv
https://api.pushshift.io/reddit/comment/search/?link_id=8ssfvb
https://api.pushshift.io/reddit/comment/search/?link_id=8ssl8i
https://api.pushshift.io/reddit/comment/search/?link_id=8ssqfl
https://api.pushshift.io/reddit/comment/search/?link_id=8ssush
https://api.pushshift.io/reddit/comment/search/?link_id=8ssxak
https://api.pushshift.io/reddit/comment/search/?link_id=8ssxds
https://api.pushshift.io/reddit/comment/search/?link_id=8st0v1
https://api.pushshift.io/reddit/comment/search/?link_id=8st10w
https://api.pushshift.io/reddit/comment/search/?link_id=8st32g
https://api.pushshift.io/reddit/comment/search/?link_id=8st5y1
https://api.pushshift.io/reddit/comment/search/?link_id=8staag
https://api.pushshift.io/reddit/comment/search/?link_id=8steyc
https://api.pushshift.io/reddit/comment/search/?link_id=8stnjq
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8tbzco
https://api.pushshift.io/reddit/comment/search/?link_id=8tc1xv
https://api.pushshift.io/reddit/comment/search/?link_id=8tdrda
https://api.pushshift.io/reddit/comment/search/?link_id=8te1vo
https://api.pushshift.io/reddit/comment/search/?link_id=8tejdr
https://api.pushshift.io/reddit/comment/search/?link_id=8teofc
https://api.pushshift.io/reddit/comment/search/?link_id=8tf59s
https://api.pushshift.io/reddit/comment/search/?link_id=8tfgu7
https://api.pushshift.io/reddit/comment/search/?link_id=8tfq9c
https://api.pushshift.io/reddit/comment/search/?link_id=8tfwac
https://api.pushshift.io/reddit/comment/search/?link_id=8tg7io
https://api.pushshift.io/reddit/comment/search/?link_id=8tgnoo
https://api.pushshift.io/reddit/comment/search/?link_id=8tgu2s
https://api.pushshift.io/reddit/comment/search/?link_id=8th1d6
https://api.pushshift.io/reddit/comment/search/?link_id=8th2fw
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8tz2ov
https://api.pushshift.io/reddit/comment/search/?link_id=8tz4if
https://api.pushshift.io/reddit/comment/search/?link_id=8tzah4
https://api.pushshift.io/reddit/comment/search/?link_id=8tzeym
https://api.pushshift.io/reddit/comment/search/?link_id=8tzfq0
https://api.pushshift.io/reddit/comment/search/?link_id=8tzgus
https://api.pushshift.io/reddit/comment/search/?link_id=8tzx8j
https://api.pushshift.io/reddit/comment/search/?link_id=8tzx8m
https://api.pushshift.io/reddit/comment/search/?link_id=8u023u
https://api.pushshift.io/reddit/comment/search/?link_id=8u04yd
https://api.pushshift.io/reddit/comment/search/?link_id=8u08m8
https://api.pushshift.io/reddit/comment/search/?link_id=8u08v4
https://api.pushshift.io/reddit/comment/search/?link_id=8u09c1
https://api.pushshift.io/reddit/comment/search/?link_id=8u0i8a
https://api.pushshift.io/reddit/comment/search/?link_id=8u0kid
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8uj80r
https://api.pushshift.io/reddit/comment/search/?link_id=8ujbtp
https://api.pushshift.io/reddit/comment/search/?link_id=8ujfzs
https://api.pushshift.io/reddit/comment/search/?link_id=8ujh69
https://api.pushshift.io/reddit/comment/search/?link_id=8uji13
https://api.pushshift.io/reddit/comment/search/?link_id=8ujovi
https://api.pushshift.io/reddit/comment/search/?link_id=8ujv9b
https://api.pushshift.io/reddit/comment/search/?link_id=8uk0rn
https://api.pushshift.io/reddit/comment/search/?link_id=8uk8bn
https://api.pushshift.io/reddit/comment/search/?link_id=8uk8pr
https://api.pushshift.io/reddit/comment/search/?link_id=8ukala
https://api.pushshift.io/reddit/comment/search/?link_id=8ukckb
https://api.pushshift.io/reddit/comment/search/?link_id=8ukgcq
https://api.pushshift.io/reddit/comment/search/?link_id=8ul19r
https://api.pushshift.io/reddit/comment/search/?link_id=8ul1f4
https://api.pushshift.io/reddit/comment/search/?link_id

In [13]:
# Make list of comments into a single string
# submissions.comments = submissions.comments.apply(combine_comments)

In [14]:
# Check the first row again now that the 'comments' column exists
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,8nntc5,2018-06-01 01:09:11,1527815351,SearchCap: DoubleClick bid manager and Google ...,,0,1,0,[]


In [15]:
# NOTE: building a separate output_df is currently disabled, so the full
# `submissions` frame (title, selftext and comments kept as separate columns)
# is what the rest of the notebook works with.
#
# Disabled: construct output_df, which includes all the info we need from submissions
#output_df = submissions[['subreddit', 'id', 'created_time', 'created_utc', 'num_comments', 'score', 'gilded']]
#
# Disabled: combine the submission title, body, and comments into a single
# column called submission_text
#output_df['submission_text'] = (submissions['title'].map(str) + 
#                                submissions['selftext'].map(str) + 
#                                submissions['comments'].map(str))

# Use the clearer name submission_id for the post id column
submissions = submissions.rename(columns={'id': 'submission_id'})

In [16]:
# Confirm the id column now appears as submission_id
submissions.head(n=1)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,8nntc5,2018-06-01 01:09:11,1527815351,SearchCap: DoubleClick bid manager and Google ...,,0,1,0,[]


In [17]:
# Serialise the scraped dataframe so another project file can load it
import pickle

# NOTE: the filename is hard-coded; keep it in sync with the scraped date range
pickle_path = '2018-06-01_to_2018-07-01'
with open(pickle_path, 'wb') as pickle_out:
    pickle.dump(submissions, pickle_out)