# Function Definitions and Imports

In [16]:
import itertools
import json
import math
import pickle
import time
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
import requests

In [17]:
# Get all the submissions for a specific subreddit in the given timeframe
# subreddit = string of subreddit to scrape
# after = epoch time (earliest submissions)
# before = epoch time (latest submissions)
def get_submissions(subreddit, after, before):
    
    url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
           str(subreddit) +  # Subreddit to scrape
           '&after=' + 
           str(after) +  # Scrape posts after (UTC format)
           '&before=' + 
           str(before) +  # Scrape posts after (UTC format)
           '&limit=1000')  # Can get at max 1000 submissions from pushshift at a time

    print(url)
    response = requests.get(url)
    submissions = json.loads(response.text)

    submissions = submissions['data']  # Only key in submissions is 'data'
    
    # Columns to include in submissions dataframe
    submission_columns = ['subreddit',     # Subreddit name
                          'id',            # Post ID
                          'created_utc',   # UTC time post was created
                          'title',         # Post title
                          'selftext',      # Post body
                          'num_comments',  # Number of comments on post
                          'score',         # Number of upvotes
                          'gilded'         # Number of silver/gold/platinum badges
                         ]
    
    # Create dataframe, where each row contains a submission
    submissions_df = pd.DataFrame(submissions).loc[:, submission_columns]

    while len(submissions) == 1000:
        
        after = submissions_df.created_utc.iloc[-1]  # Query from latest time of previous query
        print(after)
        
        url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
       str(subreddit) +  # Subreddit to scrape
       '&after=' + 
       str(after) +  # Scrape posts after (UTC format)
       '&before=' + 
       str(before) +  # Scrape posts after (UTC format)
       '&limit=1000')  # Can get at max 1000 submissions from pushshift at a time
        
        print(url)
        
        # Sometimes there is no data, or there's a moderator comment. In that case, pass
        try:
            response = requests.get(url)
            submissions = json.loads(response.text)
        

            submissions = submissions['data']  # Only key in submissions is 'data'

            print(len(submissions))  # Loop won't occur again if < 1000 submissions were found
                                     # since that means all submissions were queried

            # Append data to dataframe
            submissions_df = pd.concat([submissions_df, 
                                        pd.DataFrame(submissions).loc[:, submission_columns]],
                                      ignore_index=True)
        except:
            pass
        
    
    submissions_df.insert(2, 'created_time', np.nan)  # Create new column for time, with values initialized to NaN
    
    # Convert UTC time to datetime
    submissions_df['created_time'] = [datetime.utcfromtimestamp(utc).strftime('%Y-%m-%d %H:%M:%S') 
                                      for utc in submissions_df['created_utc']]
    
    return submissions_df

In [18]:
# Get list of comments for a submission
def get_submission_comments(submission_id):
    
    url = ('https://api.pushshift.io/reddit/comment/search/?link_id=' + 
           str(submission_id))
    
    print(url)
    response = requests.get(url)
    
    # Sometimes there is no data, or there's a moderator comment. In that case, pass
    try:
        comments_all_data = json.loads(response.text)['data']  # List of dicts

        # Get only the comment ('body') for each comment block
        # Comment block includes extraneous info (author, score, etc.)
        comments = [comment_block['body'] for comment_block in comments_all_data]

        return comments # Return list of comments
    except:
        pass

In [19]:
# Given list of comments, combine them into a single string
def combine_comments(comments):
    try:
        return ' '.join([str(elem) for elem in comments]) 
    except:
        pass

# Data Manipulation

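__The call in the following cell (subreddit and date range) is the only code that has to be changed in this file; the pickle filename in the last cell should be updated to match the chosen date range:__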

In [20]:
# Build the dataframe of r/gadgets submissions created between the two
# dates below (comments are not part of it yet)
submissions = get_submissions(
    subreddit='gadgets',
    after='2018-01-01',
    before='2018-02-01',
)


https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=2018-01-01&before=2018-02-01&limit=1000
1516102638
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1516102638&before=2018-02-01&limit=1000
955


In [21]:
# Show the first scraped submission
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded
0,gadgets,7ncgte,2018-01-01 01:41:09,1514770869,"Versatile, self-deploying, relocatable buildings",,1,1,


In [22]:
# Look up the comments of every submission by its id and keep the
# resulting list in a new 'comments' column
submissions['comments'] = submissions['id'].apply(get_submission_comments)

https://api.pushshift.io/reddit/comment/search/?link_id=7ncgte
https://api.pushshift.io/reddit/comment/search/?link_id=7ncnmp
https://api.pushshift.io/reddit/comment/search/?link_id=7ncy6w
https://api.pushshift.io/reddit/comment/search/?link_id=7ncyjm
https://api.pushshift.io/reddit/comment/search/?link_id=7ndboh
https://api.pushshift.io/reddit/comment/search/?link_id=7ndbzp
https://api.pushshift.io/reddit/comment/search/?link_id=7ndtjc
https://api.pushshift.io/reddit/comment/search/?link_id=7ndw82
https://api.pushshift.io/reddit/comment/search/?link_id=7ndzl3
https://api.pushshift.io/reddit/comment/search/?link_id=7neest
https://api.pushshift.io/reddit/comment/search/?link_id=7negd6
https://api.pushshift.io/reddit/comment/search/?link_id=7nemuv
https://api.pushshift.io/reddit/comment/search/?link_id=7nes79
https://api.pushshift.io/reddit/comment/search/?link_id=7nevh3
https://api.pushshift.io/reddit/comment/search/?link_id=7nevz5
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7nuvip
https://api.pushshift.io/reddit/comment/search/?link_id=7nuyyo
https://api.pushshift.io/reddit/comment/search/?link_id=7nv0g2
https://api.pushshift.io/reddit/comment/search/?link_id=7nv6jh
https://api.pushshift.io/reddit/comment/search/?link_id=7nv6tf
https://api.pushshift.io/reddit/comment/search/?link_id=7nvd7k
https://api.pushshift.io/reddit/comment/search/?link_id=7nvdhj
https://api.pushshift.io/reddit/comment/search/?link_id=7nvfkq
https://api.pushshift.io/reddit/comment/search/?link_id=7nvgyb
https://api.pushshift.io/reddit/comment/search/?link_id=7nvj0g
https://api.pushshift.io/reddit/comment/search/?link_id=7nvka7
https://api.pushshift.io/reddit/comment/search/?link_id=7nvljk
https://api.pushshift.io/reddit/comment/search/?link_id=7nvoh6
https://api.pushshift.io/reddit/comment/search/?link_id=7nvqoe
https://api.pushshift.io/reddit/comment/search/?link_id=7nvrwy
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7obx5y
https://api.pushshift.io/reddit/comment/search/?link_id=7oc16i
https://api.pushshift.io/reddit/comment/search/?link_id=7oc6lg
https://api.pushshift.io/reddit/comment/search/?link_id=7oc73f
https://api.pushshift.io/reddit/comment/search/?link_id=7ocng0
https://api.pushshift.io/reddit/comment/search/?link_id=7oczax
https://api.pushshift.io/reddit/comment/search/?link_id=7od94l
https://api.pushshift.io/reddit/comment/search/?link_id=7odnt2
https://api.pushshift.io/reddit/comment/search/?link_id=7odt6d
https://api.pushshift.io/reddit/comment/search/?link_id=7oe969
https://api.pushshift.io/reddit/comment/search/?link_id=7oedik
https://api.pushshift.io/reddit/comment/search/?link_id=7oegh8
https://api.pushshift.io/reddit/comment/search/?link_id=7oehkr
https://api.pushshift.io/reddit/comment/search/?link_id=7oekka
https://api.pushshift.io/reddit/comment/search/?link_id=7oeni3
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7oviz7
https://api.pushshift.io/reddit/comment/search/?link_id=7ovkmw
https://api.pushshift.io/reddit/comment/search/?link_id=7ovl2h
https://api.pushshift.io/reddit/comment/search/?link_id=7ovl5q
https://api.pushshift.io/reddit/comment/search/?link_id=7ovllf
https://api.pushshift.io/reddit/comment/search/?link_id=7ovmee
https://api.pushshift.io/reddit/comment/search/?link_id=7ovmh7
https://api.pushshift.io/reddit/comment/search/?link_id=7ovn0y
https://api.pushshift.io/reddit/comment/search/?link_id=7ovpha
https://api.pushshift.io/reddit/comment/search/?link_id=7ovqb9
https://api.pushshift.io/reddit/comment/search/?link_id=7ovqux
https://api.pushshift.io/reddit/comment/search/?link_id=7ovrdk
https://api.pushshift.io/reddit/comment/search/?link_id=7ovrog
https://api.pushshift.io/reddit/comment/search/?link_id=7ovrt1
https://api.pushshift.io/reddit/comment/search/?link_id=7ovs2j
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7p798y
https://api.pushshift.io/reddit/comment/search/?link_id=7p7dhz
https://api.pushshift.io/reddit/comment/search/?link_id=7p7hgn
https://api.pushshift.io/reddit/comment/search/?link_id=7p7lgl
https://api.pushshift.io/reddit/comment/search/?link_id=7p7n1a
https://api.pushshift.io/reddit/comment/search/?link_id=7p7o5f
https://api.pushshift.io/reddit/comment/search/?link_id=7p7q3u
https://api.pushshift.io/reddit/comment/search/?link_id=7p7vdk
https://api.pushshift.io/reddit/comment/search/?link_id=7p7w10
https://api.pushshift.io/reddit/comment/search/?link_id=7p82ra
https://api.pushshift.io/reddit/comment/search/?link_id=7p836o
https://api.pushshift.io/reddit/comment/search/?link_id=7p852q
https://api.pushshift.io/reddit/comment/search/?link_id=7p853v
https://api.pushshift.io/reddit/comment/search/?link_id=7p86yq
https://api.pushshift.io/reddit/comment/search/?link_id=7p89mx
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7pjirs
https://api.pushshift.io/reddit/comment/search/?link_id=7pk6ug
https://api.pushshift.io/reddit/comment/search/?link_id=7pk9i2
https://api.pushshift.io/reddit/comment/search/?link_id=7pke7u
https://api.pushshift.io/reddit/comment/search/?link_id=7pkmw8
https://api.pushshift.io/reddit/comment/search/?link_id=7plobg
https://api.pushshift.io/reddit/comment/search/?link_id=7plpbo
https://api.pushshift.io/reddit/comment/search/?link_id=7pm074
https://api.pushshift.io/reddit/comment/search/?link_id=7pm3bf
https://api.pushshift.io/reddit/comment/search/?link_id=7pm4el
https://api.pushshift.io/reddit/comment/search/?link_id=7pm52b
https://api.pushshift.io/reddit/comment/search/?link_id=7pmaba
https://api.pushshift.io/reddit/comment/search/?link_id=7pmeuu
https://api.pushshift.io/reddit/comment/search/?link_id=7pmhzt
https://api.pushshift.io/reddit/comment/search/?link_id=7pmjbp
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7q1udp
https://api.pushshift.io/reddit/comment/search/?link_id=7q215q
https://api.pushshift.io/reddit/comment/search/?link_id=7q28gj
https://api.pushshift.io/reddit/comment/search/?link_id=7q2ejb
https://api.pushshift.io/reddit/comment/search/?link_id=7q2gqm
https://api.pushshift.io/reddit/comment/search/?link_id=7q2jpv
https://api.pushshift.io/reddit/comment/search/?link_id=7q2pvd
https://api.pushshift.io/reddit/comment/search/?link_id=7q2ria
https://api.pushshift.io/reddit/comment/search/?link_id=7q2s4b
https://api.pushshift.io/reddit/comment/search/?link_id=7q2zo5
https://api.pushshift.io/reddit/comment/search/?link_id=7q30uu
https://api.pushshift.io/reddit/comment/search/?link_id=7q33r4
https://api.pushshift.io/reddit/comment/search/?link_id=7q34xp
https://api.pushshift.io/reddit/comment/search/?link_id=7q352q
https://api.pushshift.io/reddit/comment/search/?link_id=7q3agl
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7qiy5q
https://api.pushshift.io/reddit/comment/search/?link_id=7qiza4
https://api.pushshift.io/reddit/comment/search/?link_id=7qj05m
https://api.pushshift.io/reddit/comment/search/?link_id=7qj47o
https://api.pushshift.io/reddit/comment/search/?link_id=7qj94v
https://api.pushshift.io/reddit/comment/search/?link_id=7qja05
https://api.pushshift.io/reddit/comment/search/?link_id=7qjdrk
https://api.pushshift.io/reddit/comment/search/?link_id=7qjirh
https://api.pushshift.io/reddit/comment/search/?link_id=7qjll5
https://api.pushshift.io/reddit/comment/search/?link_id=7qjmif
https://api.pushshift.io/reddit/comment/search/?link_id=7qjnod
https://api.pushshift.io/reddit/comment/search/?link_id=7qjuza
https://api.pushshift.io/reddit/comment/search/?link_id=7qk0h3
https://api.pushshift.io/reddit/comment/search/?link_id=7qk67w
https://api.pushshift.io/reddit/comment/search/?link_id=7qk8b9
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7qyzab
https://api.pushshift.io/reddit/comment/search/?link_id=7qz0wj
https://api.pushshift.io/reddit/comment/search/?link_id=7qz37a
https://api.pushshift.io/reddit/comment/search/?link_id=7qz5e2
https://api.pushshift.io/reddit/comment/search/?link_id=7qz5u0
https://api.pushshift.io/reddit/comment/search/?link_id=7qz6x4
https://api.pushshift.io/reddit/comment/search/?link_id=7qz7eq
https://api.pushshift.io/reddit/comment/search/?link_id=7qz8ox
https://api.pushshift.io/reddit/comment/search/?link_id=7qz9jt
https://api.pushshift.io/reddit/comment/search/?link_id=7qz9vz
https://api.pushshift.io/reddit/comment/search/?link_id=7qzaig
https://api.pushshift.io/reddit/comment/search/?link_id=7qzb06
https://api.pushshift.io/reddit/comment/search/?link_id=7qzfbw
https://api.pushshift.io/reddit/comment/search/?link_id=7qzhob
https://api.pushshift.io/reddit/comment/search/?link_id=7qzhp8
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7rg1y1
https://api.pushshift.io/reddit/comment/search/?link_id=7rg42s
https://api.pushshift.io/reddit/comment/search/?link_id=7rg5kz
https://api.pushshift.io/reddit/comment/search/?link_id=7rg6by
https://api.pushshift.io/reddit/comment/search/?link_id=7rgb0n
https://api.pushshift.io/reddit/comment/search/?link_id=7rgblz
https://api.pushshift.io/reddit/comment/search/?link_id=7rgdps
https://api.pushshift.io/reddit/comment/search/?link_id=7rgdy2
https://api.pushshift.io/reddit/comment/search/?link_id=7rgeay
https://api.pushshift.io/reddit/comment/search/?link_id=7rgf71
https://api.pushshift.io/reddit/comment/search/?link_id=7rggl4
https://api.pushshift.io/reddit/comment/search/?link_id=7rggr3
https://api.pushshift.io/reddit/comment/search/?link_id=7rgh9z
https://api.pushshift.io/reddit/comment/search/?link_id=7rh0b4
https://api.pushshift.io/reddit/comment/search/?link_id=7rh1k8
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7s55uf
https://api.pushshift.io/reddit/comment/search/?link_id=7s56ch
https://api.pushshift.io/reddit/comment/search/?link_id=7s5d8w
https://api.pushshift.io/reddit/comment/search/?link_id=7s5gc0
https://api.pushshift.io/reddit/comment/search/?link_id=7s5kil
https://api.pushshift.io/reddit/comment/search/?link_id=7s5koy
https://api.pushshift.io/reddit/comment/search/?link_id=7s5sht
https://api.pushshift.io/reddit/comment/search/?link_id=7s5t02
https://api.pushshift.io/reddit/comment/search/?link_id=7s60j6
https://api.pushshift.io/reddit/comment/search/?link_id=7s68p2
https://api.pushshift.io/reddit/comment/search/?link_id=7s6crh
https://api.pushshift.io/reddit/comment/search/?link_id=7s6dze
https://api.pushshift.io/reddit/comment/search/?link_id=7s6ejr
https://api.pushshift.io/reddit/comment/search/?link_id=7s6h5d
https://api.pushshift.io/reddit/comment/search/?link_id=7s6l00
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7slkzv
https://api.pushshift.io/reddit/comment/search/?link_id=7sll60
https://api.pushshift.io/reddit/comment/search/?link_id=7slowm
https://api.pushshift.io/reddit/comment/search/?link_id=7slp3t
https://api.pushshift.io/reddit/comment/search/?link_id=7slss4
https://api.pushshift.io/reddit/comment/search/?link_id=7slvpl
https://api.pushshift.io/reddit/comment/search/?link_id=7slw8m
https://api.pushshift.io/reddit/comment/search/?link_id=7slzvl
https://api.pushshift.io/reddit/comment/search/?link_id=7sm2mu
https://api.pushshift.io/reddit/comment/search/?link_id=7sm3p9
https://api.pushshift.io/reddit/comment/search/?link_id=7sm5te
https://api.pushshift.io/reddit/comment/search/?link_id=7sm62x
https://api.pushshift.io/reddit/comment/search/?link_id=7sm83j
https://api.pushshift.io/reddit/comment/search/?link_id=7smauc
https://api.pushshift.io/reddit/comment/search/?link_id=7smfx4
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7t1ooo
https://api.pushshift.io/reddit/comment/search/?link_id=7t1set
https://api.pushshift.io/reddit/comment/search/?link_id=7t1trg
https://api.pushshift.io/reddit/comment/search/?link_id=7t1x0e
https://api.pushshift.io/reddit/comment/search/?link_id=7t20c2
https://api.pushshift.io/reddit/comment/search/?link_id=7t22h1
https://api.pushshift.io/reddit/comment/search/?link_id=7t2rb4
https://api.pushshift.io/reddit/comment/search/?link_id=7t2vp3
https://api.pushshift.io/reddit/comment/search/?link_id=7t2wul
https://api.pushshift.io/reddit/comment/search/?link_id=7t2y43
https://api.pushshift.io/reddit/comment/search/?link_id=7t2yel
https://api.pushshift.io/reddit/comment/search/?link_id=7t358r
https://api.pushshift.io/reddit/comment/search/?link_id=7t3bmh
https://api.pushshift.io/reddit/comment/search/?link_id=7t3o76
https://api.pushshift.io/reddit/comment/search/?link_id=7t3ual
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7tkbyi
https://api.pushshift.io/reddit/comment/search/?link_id=7tkejw
https://api.pushshift.io/reddit/comment/search/?link_id=7tkgk8
https://api.pushshift.io/reddit/comment/search/?link_id=7tkk62
https://api.pushshift.io/reddit/comment/search/?link_id=7tkob7
https://api.pushshift.io/reddit/comment/search/?link_id=7tkrsa
https://api.pushshift.io/reddit/comment/search/?link_id=7tkusi
https://api.pushshift.io/reddit/comment/search/?link_id=7tl99o
https://api.pushshift.io/reddit/comment/search/?link_id=7tlax1
https://api.pushshift.io/reddit/comment/search/?link_id=7tlbdj
https://api.pushshift.io/reddit/comment/search/?link_id=7tlkao
https://api.pushshift.io/reddit/comment/search/?link_id=7tlmje
https://api.pushshift.io/reddit/comment/search/?link_id=7tlomy
https://api.pushshift.io/reddit/comment/search/?link_id=7tlpvz
https://api.pushshift.io/reddit/comment/search/?link_id=7tlxml
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=7tzk5e
https://api.pushshift.io/reddit/comment/search/?link_id=7tzkty
https://api.pushshift.io/reddit/comment/search/?link_id=7tzmyw
https://api.pushshift.io/reddit/comment/search/?link_id=7tzn7z
https://api.pushshift.io/reddit/comment/search/?link_id=7tzoc8
https://api.pushshift.io/reddit/comment/search/?link_id=7tzvv2
https://api.pushshift.io/reddit/comment/search/?link_id=7tzvxl
https://api.pushshift.io/reddit/comment/search/?link_id=7tzwx1
https://api.pushshift.io/reddit/comment/search/?link_id=7u00bl
https://api.pushshift.io/reddit/comment/search/?link_id=7u0261
https://api.pushshift.io/reddit/comment/search/?link_id=7u0773
https://api.pushshift.io/reddit/comment/search/?link_id=7u0ar2
https://api.pushshift.io/reddit/comment/search/?link_id=7u0bg8
https://api.pushshift.io/reddit/comment/search/?link_id=7u0jha
https://api.pushshift.io/reddit/comment/search/?link_id=7u0jnm
https://api.pushshift.io/reddit/comment/search/?link_id

In [23]:
# Make list of comments into a single string
# submissions.comments = submissions.comments.apply(combine_comments)

In [24]:
# Show the first submission, now including its 'comments' column
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,7ncgte,2018-01-01 01:41:09,1514770869,"Versatile, self-deploying, relocatable buildings",,1,1,,"[Hello, /u/adarkthirty! Thanks for contributin..."


In [25]:
# Construct output_df, which includes all the info we need from submissions
#output_df = submissions[['subreddit', 'id', 'created_time', 'created_utc', 'num_comments', 'score', 'gilded']]

# Combine the submission title, body, and comments into a single column called submission_text
#output_df['submission_text'] = (submissions['title'].map(str) + 
#                                submissions['selftext'].map(str) + 
#                                submissions['comments'].map(str))

# Call the 'id' column 'submission_id' from here on (for clarity)
submissions.rename({'id': 'submission_id'}, axis='columns', inplace=True)

In [26]:
# Show the first submission after the id column was renamed
submissions.head(n=1)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,7ncgte,2018-01-01 01:41:09,1514770869,"Versatile, self-deploying, relocatable buildings",,1,1,,"[Hello, /u/adarkthirty! Thanks for contributin..."


In [27]:
# Pickle the dataframe so it can be used in another project file.
# The filename repeats the date range passed to get_submissions above;
# keep the two in sync when scraping a different period.
output_path = '2018-01-01_to_2018-02-01'
with open(output_path, 'wb') as picklefile:
    pickle.dump(submissions, picklefile)