# Function Definitions and Imports

In [1]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd

from datetime import datetime, timedelta, timezone

In [2]:
# Get all the submissions for a specific subreddit in the given timeframe
# subreddit = string of subreddit to scrape
# after = earliest submissions (epoch seconds, or a date string such as '2018-04-01')
# before = latest submissions (same formats as after)
# session = optional object exposing a requests-style .get(url, timeout=...) method
#           (e.g. a requests.Session); when omitted the requests module is used
def get_submissions(subreddit, after, before, session=None):
    """Collect every submission pushshift returns for a subreddit and time window.

    The API is queried one page at a time. While a page comes back full, the
    next query starts from the `created_utc` of the last submission received.

    Parameters
    ----------
    subreddit : str
        Subreddit to scrape.
    after, before : int or str
        Lower and upper bound of the window. Each value is converted with
        str() and placed verbatim in the query string.
    session : object, optional
        Anything with a requests-compatible ``get(url, timeout=...)`` method.
        Defaults to the ``requests`` module itself.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns in `submission_columns`, plus
        `created_time` ('%Y-%m-%d %H:%M:%S', UTC) inserted at position 2.
        Keys a submission does not carry become NaN. An empty result yields an
        empty frame with the same columns.

    Raises
    ------
    Exception
        Whatever the first request raises (connection error, HTTP error status,
        malformed JSON, missing 'data' key). A failure on a later page does not
        raise: pagination stops and the rows gathered so far are returned.
    """
    page_size = 1000      # Can get at max 1000 submissions from pushshift at a time
    request_timeout = 60  # Seconds; keeps a stalled connection from hanging the notebook

    # Columns to include in submissions dataframe
    submission_columns = ['subreddit',     # Subreddit name
                          'id',            # Post ID
                          'created_utc',   # UTC time post was created
                          'title',         # Post title
                          'selftext',      # Post body
                          'num_comments',  # Number of comments on post
                          'score',         # Number of upvotes
                          'gilded'         # Number of silver/gold/platinum badges
                         ]

    http = requests if session is None else session

    def fetch_page(after_value):
        # Request one page of submissions created after `after_value`
        url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
               str(subreddit) +     # Subreddit to scrape
               '&after=' +
               str(after_value) +   # Scrape posts after this time
               '&before=' +
               str(before) +        # Scrape posts before this time
               '&limit=' +
               str(page_size))
        print(url)
        response = http.get(url, timeout=request_timeout)
        response.raise_for_status()
        return json.loads(response.text)['data']  # Only key in the payload is 'data'

    # A failure on the very first request is allowed to propagate
    page = fetch_page(after)
    records = list(page)

    # A full page means there may be more submissions left in the window
    while len(page) == page_size:

        after = page[-1]['created_utc']  # Query from latest time of previous query
        print(after)

        try:
            page = fetch_page(after)
        except Exception as exc:
            # Retrying the identical request with no limit would loop forever,
            # so stop here and keep what has been collected
            print('Stopping pagination after {} submissions: {!r}'.format(len(records), exc))
            break

        print(len(page))  # Loop won't occur again if < 1000 submissions were found
                          # since that means all submissions were queried
        records.extend(page)

    # Build the frame once; `columns=` selects the wanted keys and tolerates
    # both an empty result and records that lack one of the keys
    submissions_df = pd.DataFrame(records, columns=submission_columns)

    # Convert UTC epoch seconds to a formatted timestamp string
    created_time = [datetime.fromtimestamp(utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
                    for utc in submissions_df['created_utc']]
    submissions_df.insert(2, 'created_time', created_time)

    return submissions_df

In [3]:
# Get list of comments for a submission
# submission_id = post ID, placed in the query as link_id
# session = optional object exposing a requests-style .get(url, timeout=...) method
#           (e.g. a requests.Session); when omitted the requests module is used
def get_submission_comments(submission_id, session=None):
    """Return the comment bodies pushshift reports for one submission.

    Parameters
    ----------
    submission_id : str
        Submission ID; converted with str() and sent as the `link_id` parameter.
    session : object, optional
        Anything with a requests-compatible ``get(url, timeout=...)`` method.
        Defaults to the ``requests`` module itself.

    Returns
    -------
    list or None
        The 'body' value of every comment in the response, in response order.
        None when the request fails, the response is not the expected JSON, or
        a comment entry has no 'body'.
    """
    # NOTE(review): the query sends no size/limit parameter, so the number of
    # comments returned is whatever the API defaults to - confirm long threads
    # are not being truncated.
    http = requests if session is None else session
    request_timeout = 60  # Seconds; keeps a stalled connection from hanging the notebook

    url = ('https://api.pushshift.io/reddit/comment/search/?link_id=' +
           str(submission_id))

    print(url)

    # Sometimes there is no data, or there's a moderator comment. In that case,
    # report it and return None so one bad submission does not abort the batch.
    # Exception (not a bare except) is caught so Ctrl-C still stops a long run.
    try:
        response = http.get(url, timeout=request_timeout)
        response.raise_for_status()
        comments_all_data = json.loads(response.text)['data']  # List of dicts

        # Get only the comment ('body') for each comment block
        # Comment block includes extraneous info (author, score, etc.)
        return [comment_block['body'] for comment_block in comments_all_data]
    except Exception as exc:
        print('No comments retrieved for {}: {!r}'.format(submission_id, exc))
        return None

In [4]:
# Given list of comments, combine them into a single string
def combine_comments(comments):
    """Join the str() of every element of `comments` with single spaces.

    Parameters
    ----------
    comments : iterable or None
        Typically the list produced by get_submission_comments.

    Returns
    -------
    str or None
        The joined text ('' for an empty iterable). None when `comments`
        cannot be iterated, e.g. it is None because the lookup failed.
    """
    try:
        return ' '.join(str(elem) for elem in comments)
    except TypeError:
        # Non-iterable input (such as None) has nothing to combine. Only
        # TypeError is caught, so unrelated errors and Ctrl-C are not hidden.
        return None

# Data Manipulation

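__The `get_submissions(...)` call in the next cell (subreddit and date range) is the main thing to change in this file. The pickle filename in the final cell hard-codes the same date range, so update it to match.__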

In [5]:
# Build the submissions DataFrame for one subreddit and one date window.
# after/before are passed here as 'YYYY-MM-DD' strings, which get_submissions
# places verbatim into the query URL.
# The 'comments' column is not present yet.
submissions = get_submissions(
    subreddit='gadgets',
    after='2018-04-01',
    before='2018-05-01',
)


https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=2018-04-01&before=2018-05-01&limit=1000
1523878912
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1523878912&before=2018-05-01&limit=1000
878


In [6]:
# Preview the first row to check the scraped columns
submissions.head(1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded
0,gadgets,88mv2c,2018-04-01 00:09:34,1522541374,تفسير الاحلام في الصناع وأصحاب الحرف والعملة و...,,0,1,0.0


In [7]:
# Look up the comments of every submission by its id and keep each returned
# list in a new 'comments' column (one request per row)
submissions['comments'] = submissions['id'].apply(get_submission_comments)

https://api.pushshift.io/reddit/comment/search/?link_id=88mv2c
https://api.pushshift.io/reddit/comment/search/?link_id=88mz7k
https://api.pushshift.io/reddit/comment/search/?link_id=88naas
https://api.pushshift.io/reddit/comment/search/?link_id=88nreh
https://api.pushshift.io/reddit/comment/search/?link_id=88ntme
https://api.pushshift.io/reddit/comment/search/?link_id=88nwq9
https://api.pushshift.io/reddit/comment/search/?link_id=88o49e
https://api.pushshift.io/reddit/comment/search/?link_id=88odxq
https://api.pushshift.io/reddit/comment/search/?link_id=88ok5m
https://api.pushshift.io/reddit/comment/search/?link_id=88omdz
https://api.pushshift.io/reddit/comment/search/?link_id=88p1nq
https://api.pushshift.io/reddit/comment/search/?link_id=88p4ue
https://api.pushshift.io/reddit/comment/search/?link_id=88p7sg
https://api.pushshift.io/reddit/comment/search/?link_id=88pb4m
https://api.pushshift.io/reddit/comment/search/?link_id=88pbni
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=89as9h
https://api.pushshift.io/reddit/comment/search/?link_id=89ax0x
https://api.pushshift.io/reddit/comment/search/?link_id=89ax2r
https://api.pushshift.io/reddit/comment/search/?link_id=89axl6
https://api.pushshift.io/reddit/comment/search/?link_id=89b0ij
https://api.pushshift.io/reddit/comment/search/?link_id=89b156
https://api.pushshift.io/reddit/comment/search/?link_id=89b4cz
https://api.pushshift.io/reddit/comment/search/?link_id=89b4og
https://api.pushshift.io/reddit/comment/search/?link_id=89b4wh
https://api.pushshift.io/reddit/comment/search/?link_id=89b6th
https://api.pushshift.io/reddit/comment/search/?link_id=89b7it
https://api.pushshift.io/reddit/comment/search/?link_id=89b9cp
https://api.pushshift.io/reddit/comment/search/?link_id=89bcrj
https://api.pushshift.io/reddit/comment/search/?link_id=89bdil
https://api.pushshift.io/reddit/comment/search/?link_id=89bj2q
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=89qd90
https://api.pushshift.io/reddit/comment/search/?link_id=89qexj
https://api.pushshift.io/reddit/comment/search/?link_id=89qukb
https://api.pushshift.io/reddit/comment/search/?link_id=89qyws
https://api.pushshift.io/reddit/comment/search/?link_id=89qzza
https://api.pushshift.io/reddit/comment/search/?link_id=89r6vt
https://api.pushshift.io/reddit/comment/search/?link_id=89rrxx
https://api.pushshift.io/reddit/comment/search/?link_id=89rwmw
https://api.pushshift.io/reddit/comment/search/?link_id=89s8da
https://api.pushshift.io/reddit/comment/search/?link_id=89sahz
https://api.pushshift.io/reddit/comment/search/?link_id=89sr4a
https://api.pushshift.io/reddit/comment/search/?link_id=89srwb
https://api.pushshift.io/reddit/comment/search/?link_id=89syc5
https://api.pushshift.io/reddit/comment/search/?link_id=89takz
https://api.pushshift.io/reddit/comment/search/?link_id=89u5dh
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8a8s4l
https://api.pushshift.io/reddit/comment/search/?link_id=8a92pf
https://api.pushshift.io/reddit/comment/search/?link_id=8a955n
https://api.pushshift.io/reddit/comment/search/?link_id=8a95l3
https://api.pushshift.io/reddit/comment/search/?link_id=8a96hw
https://api.pushshift.io/reddit/comment/search/?link_id=8a9a01
https://api.pushshift.io/reddit/comment/search/?link_id=8a9b57
https://api.pushshift.io/reddit/comment/search/?link_id=8a9cj0
https://api.pushshift.io/reddit/comment/search/?link_id=8a9g1b
https://api.pushshift.io/reddit/comment/search/?link_id=8a9iaz
https://api.pushshift.io/reddit/comment/search/?link_id=8a9j8z
https://api.pushshift.io/reddit/comment/search/?link_id=8a9ltb
https://api.pushshift.io/reddit/comment/search/?link_id=8a9um5
https://api.pushshift.io/reddit/comment/search/?link_id=8a9wkm
https://api.pushshift.io/reddit/comment/search/?link_id=8a9zer
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8aw37r
https://api.pushshift.io/reddit/comment/search/?link_id=8aw5u0
https://api.pushshift.io/reddit/comment/search/?link_id=8aw7jc
https://api.pushshift.io/reddit/comment/search/?link_id=8aw9q3
https://api.pushshift.io/reddit/comment/search/?link_id=8awagt
https://api.pushshift.io/reddit/comment/search/?link_id=8awanr
https://api.pushshift.io/reddit/comment/search/?link_id=8awb3y
https://api.pushshift.io/reddit/comment/search/?link_id=8awcxh
https://api.pushshift.io/reddit/comment/search/?link_id=8awe1l
https://api.pushshift.io/reddit/comment/search/?link_id=8awfrw
https://api.pushshift.io/reddit/comment/search/?link_id=8awhdu
https://api.pushshift.io/reddit/comment/search/?link_id=8awklq
https://api.pushshift.io/reddit/comment/search/?link_id=8awlen
https://api.pushshift.io/reddit/comment/search/?link_id=8awmrk
https://api.pushshift.io/reddit/comment/search/?link_id=8awst8
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8b8gti
https://api.pushshift.io/reddit/comment/search/?link_id=8b8kaf
https://api.pushshift.io/reddit/comment/search/?link_id=8b8tyg
https://api.pushshift.io/reddit/comment/search/?link_id=8b8ynq
https://api.pushshift.io/reddit/comment/search/?link_id=8b9a0q
https://api.pushshift.io/reddit/comment/search/?link_id=8b9e82
https://api.pushshift.io/reddit/comment/search/?link_id=8b9f92
https://api.pushshift.io/reddit/comment/search/?link_id=8b9rq8
https://api.pushshift.io/reddit/comment/search/?link_id=8bamor
https://api.pushshift.io/reddit/comment/search/?link_id=8ban2o
https://api.pushshift.io/reddit/comment/search/?link_id=8baoos
https://api.pushshift.io/reddit/comment/search/?link_id=8barpz
https://api.pushshift.io/reddit/comment/search/?link_id=8bbakc
https://api.pushshift.io/reddit/comment/search/?link_id=8bbcqe
https://api.pushshift.io/reddit/comment/search/?link_id=8bbtrt
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8bq2ka
https://api.pushshift.io/reddit/comment/search/?link_id=8bq4ej
https://api.pushshift.io/reddit/comment/search/?link_id=8bq6wh
https://api.pushshift.io/reddit/comment/search/?link_id=8bqga7
https://api.pushshift.io/reddit/comment/search/?link_id=8bqm4o
https://api.pushshift.io/reddit/comment/search/?link_id=8bqohp
https://api.pushshift.io/reddit/comment/search/?link_id=8bqozy
https://api.pushshift.io/reddit/comment/search/?link_id=8bqplt
https://api.pushshift.io/reddit/comment/search/?link_id=8bqqw1
https://api.pushshift.io/reddit/comment/search/?link_id=8bqre7
https://api.pushshift.io/reddit/comment/search/?link_id=8bqw7g
https://api.pushshift.io/reddit/comment/search/?link_id=8bqy7c
https://api.pushshift.io/reddit/comment/search/?link_id=8br2ft
https://api.pushshift.io/reddit/comment/search/?link_id=8br5bm
https://api.pushshift.io/reddit/comment/search/?link_id=8br92t
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8caeao
https://api.pushshift.io/reddit/comment/search/?link_id=8caxjt
https://api.pushshift.io/reddit/comment/search/?link_id=8ccvy7
https://api.pushshift.io/reddit/comment/search/?link_id=8cd1lz
https://api.pushshift.io/reddit/comment/search/?link_id=8cd3a1
https://api.pushshift.io/reddit/comment/search/?link_id=8cd81x
https://api.pushshift.io/reddit/comment/search/?link_id=8cddxt
https://api.pushshift.io/reddit/comment/search/?link_id=8ceazm
https://api.pushshift.io/reddit/comment/search/?link_id=8cekkt
https://api.pushshift.io/reddit/comment/search/?link_id=8cemnj
https://api.pushshift.io/reddit/comment/search/?link_id=8ceum9
https://api.pushshift.io/reddit/comment/search/?link_id=8cf03j
https://api.pushshift.io/reddit/comment/search/?link_id=8cf69u
https://api.pushshift.io/reddit/comment/search/?link_id=8cgdfb
https://api.pushshift.io/reddit/comment/search/?link_id=8cgdii
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8ctwi9
https://api.pushshift.io/reddit/comment/search/?link_id=8cu4q8
https://api.pushshift.io/reddit/comment/search/?link_id=8cu7sh
https://api.pushshift.io/reddit/comment/search/?link_id=8cu99c
https://api.pushshift.io/reddit/comment/search/?link_id=8cuf8d
https://api.pushshift.io/reddit/comment/search/?link_id=8cufoh
https://api.pushshift.io/reddit/comment/search/?link_id=8cutbx
https://api.pushshift.io/reddit/comment/search/?link_id=8cuwvl
https://api.pushshift.io/reddit/comment/search/?link_id=8cuxwo
https://api.pushshift.io/reddit/comment/search/?link_id=8cv3x7
https://api.pushshift.io/reddit/comment/search/?link_id=8cv67s
https://api.pushshift.io/reddit/comment/search/?link_id=8cv7na
https://api.pushshift.io/reddit/comment/search/?link_id=8cv9ha
https://api.pushshift.io/reddit/comment/search/?link_id=8cvc97
https://api.pushshift.io/reddit/comment/search/?link_id=8cveao
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8d7uob
https://api.pushshift.io/reddit/comment/search/?link_id=8d7z5z
https://api.pushshift.io/reddit/comment/search/?link_id=8d86sw
https://api.pushshift.io/reddit/comment/search/?link_id=8d88tb
https://api.pushshift.io/reddit/comment/search/?link_id=8d89vc
https://api.pushshift.io/reddit/comment/search/?link_id=8d8apz
https://api.pushshift.io/reddit/comment/search/?link_id=8d8k2c
https://api.pushshift.io/reddit/comment/search/?link_id=8d8pdc
https://api.pushshift.io/reddit/comment/search/?link_id=8d8r4p
https://api.pushshift.io/reddit/comment/search/?link_id=8d916m
https://api.pushshift.io/reddit/comment/search/?link_id=8d9qho
https://api.pushshift.io/reddit/comment/search/?link_id=8d9y9o
https://api.pushshift.io/reddit/comment/search/?link_id=8d9yai
https://api.pushshift.io/reddit/comment/search/?link_id=8da13s
https://api.pushshift.io/reddit/comment/search/?link_id=8dabqk
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8dn9a0
https://api.pushshift.io/reddit/comment/search/?link_id=8dngil
https://api.pushshift.io/reddit/comment/search/?link_id=8dngpq
https://api.pushshift.io/reddit/comment/search/?link_id=8dnjdo
https://api.pushshift.io/reddit/comment/search/?link_id=8dnjxj
https://api.pushshift.io/reddit/comment/search/?link_id=8dnn7n
https://api.pushshift.io/reddit/comment/search/?link_id=8dnnd4
https://api.pushshift.io/reddit/comment/search/?link_id=8dnnic
https://api.pushshift.io/reddit/comment/search/?link_id=8dnpyw
https://api.pushshift.io/reddit/comment/search/?link_id=8dnqg9
https://api.pushshift.io/reddit/comment/search/?link_id=8dnr58
https://api.pushshift.io/reddit/comment/search/?link_id=8dnz55
https://api.pushshift.io/reddit/comment/search/?link_id=8do1s1
https://api.pushshift.io/reddit/comment/search/?link_id=8do3jn
https://api.pushshift.io/reddit/comment/search/?link_id=8do6lp
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8e9sfi
https://api.pushshift.io/reddit/comment/search/?link_id=8e9syu
https://api.pushshift.io/reddit/comment/search/?link_id=8e9yxp
https://api.pushshift.io/reddit/comment/search/?link_id=8ea72q
https://api.pushshift.io/reddit/comment/search/?link_id=8eac4s
https://api.pushshift.io/reddit/comment/search/?link_id=8eac4z
https://api.pushshift.io/reddit/comment/search/?link_id=8eafoy
https://api.pushshift.io/reddit/comment/search/?link_id=8ealxu
https://api.pushshift.io/reddit/comment/search/?link_id=8eaqpa
https://api.pushshift.io/reddit/comment/search/?link_id=8earox
https://api.pushshift.io/reddit/comment/search/?link_id=8easxr
https://api.pushshift.io/reddit/comment/search/?link_id=8eaxjp
https://api.pushshift.io/reddit/comment/search/?link_id=8eb3sf
https://api.pushshift.io/reddit/comment/search/?link_id=8eb9mz
https://api.pushshift.io/reddit/comment/search/?link_id=8eb9q8
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8eok4a
https://api.pushshift.io/reddit/comment/search/?link_id=8eowzp
https://api.pushshift.io/reddit/comment/search/?link_id=8ep1ht
https://api.pushshift.io/reddit/comment/search/?link_id=8ep6po
https://api.pushshift.io/reddit/comment/search/?link_id=8epcug
https://api.pushshift.io/reddit/comment/search/?link_id=8epih1
https://api.pushshift.io/reddit/comment/search/?link_id=8epjmk
https://api.pushshift.io/reddit/comment/search/?link_id=8epmts
https://api.pushshift.io/reddit/comment/search/?link_id=8epnph
https://api.pushshift.io/reddit/comment/search/?link_id=8epoci
https://api.pushshift.io/reddit/comment/search/?link_id=8epovp
https://api.pushshift.io/reddit/comment/search/?link_id=8epru0
https://api.pushshift.io/reddit/comment/search/?link_id=8eptyj
https://api.pushshift.io/reddit/comment/search/?link_id=8epxvz
https://api.pushshift.io/reddit/comment/search/?link_id=8epz0k
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8f8we7
https://api.pushshift.io/reddit/comment/search/?link_id=8f9dfa
https://api.pushshift.io/reddit/comment/search/?link_id=8f9dvx
https://api.pushshift.io/reddit/comment/search/?link_id=8f9gg0
https://api.pushshift.io/reddit/comment/search/?link_id=8f9u0w
https://api.pushshift.io/reddit/comment/search/?link_id=8f9v96
https://api.pushshift.io/reddit/comment/search/?link_id=8f9y48
https://api.pushshift.io/reddit/comment/search/?link_id=8f9yng
https://api.pushshift.io/reddit/comment/search/?link_id=8facc2
https://api.pushshift.io/reddit/comment/search/?link_id=8fagpa
https://api.pushshift.io/reddit/comment/search/?link_id=8fahrw
https://api.pushshift.io/reddit/comment/search/?link_id=8faipz
https://api.pushshift.io/reddit/comment/search/?link_id=8falyn
https://api.pushshift.io/reddit/comment/search/?link_id=8fanok
https://api.pushshift.io/reddit/comment/search/?link_id=8faphk
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8fyuoa
https://api.pushshift.io/reddit/comment/search/?link_id=8fyuse
https://api.pushshift.io/reddit/comment/search/?link_id=8fyx7g
https://api.pushshift.io/reddit/comment/search/?link_id=8fyx7k
https://api.pushshift.io/reddit/comment/search/?link_id=8fyzav
https://api.pushshift.io/reddit/comment/search/?link_id=8fz1gj
https://api.pushshift.io/reddit/comment/search/?link_id=8fz38m
https://api.pushshift.io/reddit/comment/search/?link_id=8fz4nx
https://api.pushshift.io/reddit/comment/search/?link_id=8fz659
https://api.pushshift.io/reddit/comment/search/?link_id=8fzkwt
https://api.pushshift.io/reddit/comment/search/?link_id=8fzqxu
https://api.pushshift.io/reddit/comment/search/?link_id=8fzr1w
https://api.pushshift.io/reddit/comment/search/?link_id=8fzrpn
https://api.pushshift.io/reddit/comment/search/?link_id=8fzv0n
https://api.pushshift.io/reddit/comment/search/?link_id=8g00rl
https://api.pushshift.io/reddit/comment/search/?link_id

In [8]:
# Make list of comments into a single string
# submissions.comments = submissions.comments.apply(combine_comments)

In [9]:
# Preview the first row again, now with the 'comments' column
submissions.head(1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,88mv2c,2018-04-01 00:09:34,1522541374,تفسير الاحلام في الصناع وأصحاب الحرف والعملة و...,,0,1,0.0,[]


In [10]:
# Construct output_df, which includes all the info we need from submissions
#output_df = submissions[['subreddit', 'id', 'created_time', 'created_utc', 'num_comments', 'score', 'gilded']]

# Combine the submission title, body, and comments into a single column called submission_text
#output_df['submission_text'] = (submissions['title'].map(str) + 
#                                submissions['selftext'].map(str) + 
#                                submissions['comments'].map(str))

# Rename id to submission_id (for clarity)
# Reassign the result instead of using inplace=True: rename returns a new
# frame, and a missing 'id' label is ignored by default, so re-running this
# cell is harmless.
submissions = submissions.rename(columns={'id': 'submission_id'})

In [11]:
# Preview the first row to confirm 'id' now appears as 'submission_id'
submissions.head(1)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,88mv2c,2018-04-01 00:09:34,1522541374,تفسير الاحلام في الصناع وأصحاب الحرف والعملة و...,,0,1,0.0,[]


In [12]:
# Serialise the finished frame so another project file can load it
import pickle

# NOTE: the file name repeats the after/before dates passed to get_submissions;
# change both together when scraping a different window.
with open('2018-04-01_to_2018-05-01', 'wb') as out_file:
    pickle.dump(submissions, out_file)