# Function Definitions and Imports

In [11]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd

from datetime import datetime, timedelta, timezone

In [12]:
# Get all the submissions for a specific subreddit in the given timeframe
# subreddit = string of subreddit to scrape
# after = earliest submissions (epoch seconds, or a date string such as '2018-07-01')
# before = latest submissions (same formats as `after`)
def _submission_search_url(subreddit, after, before, limit):
    """Build the pushshift submission-search URL for one page of results."""
    return ('https://api.pushshift.io/reddit/submission/search/?subreddit=' +  # Basic URL header
            str(subreddit) +                 # Subreddit to scrape
            '&after=' + str(after) +         # Only posts created after this time
            '&before=' + str(before) +       # Only posts created before this time
            '&limit=' + str(limit))          # Page size


def _fetch_submission_page(url, max_retries, retry_wait):
    """Download one page of submissions, retrying failed or malformed responses.

    Returns the list stored under the 'data' key of the JSON response.
    Raises RuntimeError when the first attempt and all `max_retries` retries fail.
    """
    attempts = max(1, max_retries + 1)
    last_error = None

    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=60)
            data = json.loads(response.text)['data']  # Only key we need is 'data'
            if not isinstance(data, list):
                raise ValueError("'data' is not a list")
            return data
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # A non-JSON body (json.JSONDecodeError is a ValueError) or a payload
            # without 'data' ends up here, as do network-level errors.
            last_error = err
            print('Request failed (attempt {} of {}): {!r}'.format(attempt + 1, attempts, err))
            if attempt + 1 < attempts:
                time.sleep(retry_wait * (attempt + 1))  # Linear backoff before retrying

    raise RuntimeError('Giving up on {} after {} attempts'.format(url, attempts)) from last_error


def get_submissions(subreddit, after, before, max_retries=5, retry_wait=2.0):
    """Scrape every submission of `subreddit` created between `after` and `before`.

    Results are fetched in pages of 1000. While a page comes back full, the next
    request starts from the `created_utc` of the last submission received.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to scrape.
    after, before : int or str
        Bounds of the time window; they are inserted into the URL with `str()`.
    max_retries : int, optional
        How many times a failed page request is retried before giving up.
    retry_wait : float, optional
        Base number of seconds to sleep between retries (multiplied by the
        attempt number).

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns subreddit, id, created_time,
        created_utc, title, selftext, num_comments, score and gilded.
        `created_time` is `created_utc` formatted as 'YYYY-MM-DD HH:MM:SS' in UTC.
        Fields missing from a submission are left as NaN. If nothing matches,
        an empty frame with the same columns is returned.

    Raises
    ------
    RuntimeError
        If a page still cannot be downloaded or parsed after all retries.
    """
    page_size = 1000  # Can get at max 1000 submissions from pushshift at a time

    # Columns to include in submissions dataframe
    submission_columns = ['subreddit',     # Subreddit name
                          'id',            # Post ID
                          'created_utc',   # UTC time post was created
                          'title',         # Post title
                          'selftext',      # Post body
                          'num_comments',  # Number of comments on post
                          'score',         # Number of upvotes
                          'gilded'         # Number of silver/gold/platinum badges
                         ]

    records = []    # Raw submission dicts from every page
    cursor = after  # Lower time bound of the next request

    while True:
        url = _submission_search_url(subreddit, cursor, before, page_size)
        print(url)

        page = _fetch_submission_page(url, max_retries, retry_wait)
        print(len(page))
        records.extend(page)

        # A page that is not full means all submissions were queried
        if len(page) != page_size:
            break

        cursor = page[-1]['created_utc']  # Query from latest time of previous query
        print(cursor)

    # Build the dataframe once; reindex (unlike .loc) tolerates an empty result
    # and submissions that lack one of the columns.
    submissions_df = pd.DataFrame(records).reindex(columns=submission_columns)

    # Convert UTC epoch seconds to a readable timestamp, placed right after 'id'
    created_time = [datetime.fromtimestamp(utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
                    for utc in submissions_df['created_utc']]
    submissions_df.insert(2, 'created_time', created_time)

    return submissions_df

In [13]:
# Get list of comments for a submission
def get_submission_comments(submission_id):
    """Fetch the comment bodies pushshift returns for one submission.

    Parameters
    ----------
    submission_id : str
        ID of the submission; it is appended to the URL as the `link_id` value.

    Returns
    -------
    list of str or None
        The 'body' of every comment in the response's 'data' list (an empty
        list when there are no comments). None when the request fails, the
        response is not valid JSON, or it lacks the expected 'data'/'body' keys.
    """
    url = ('https://api.pushshift.io/reddit/comment/search/?link_id=' +
           str(submission_id))

    print(url)

    # Sometimes there is no data, or there's a moderator comment. In that case
    # report the problem and return None so one bad submission does not abort
    # a scrape over many rows. Only the expected failure types are caught, so
    # KeyboardInterrupt and programming errors still propagate.
    try:
        response = requests.get(url, timeout=60)
        comments_all_data = json.loads(response.text)['data']  # List of dicts

        # Get only the comment ('body') for each comment block
        # Comment block includes extraneous info (author, score, etc.)
        return [comment_block['body'] for comment_block in comments_all_data]
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        print('Could not get comments for {}: {!r}'.format(submission_id, err))
        return None

In [14]:
# Given list of comments, combine them into a single string
def combine_comments(comments):
    """Join the string form of every element of `comments` with single spaces.

    Returns None when `comments` is None or not iterable (for example when
    the comments for a submission could not be fetched).
    """
    if comments is None:
        return None

    try:
        return ' '.join(str(elem) for elem in comments)
    except TypeError:
        # Non-iterable input; anything else (e.g. KeyboardInterrupt) propagates
        return None

# Data Manipulation

__To scrape a different subreddit or date range, change the arguments in the following cell (and the hard-coded pickle filename in the last cell, which repeats the same dates):__

In [15]:
# Build the submissions dataframe for the chosen subreddit and date window
# (bounds are passed as date strings here). The comments are added in a later cell.
submissions = get_submissions(
    subreddit='gadgets',
    after='2018-07-01',
    before='2018-08-01',
)


https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=2018-07-01&before=2018-08-01&limit=1000
1532157010
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1532157010&before=2018-08-01&limit=1000
1532157010
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1532157010&before=2018-08-01&limit=1000
1532157010
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1532157010&before=2018-08-01&limit=1000
1532157010
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1532157010&before=2018-08-01&limit=1000
1532157010
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1532157010&before=2018-08-01&limit=1000
1532157010
https://api.pushshift.io/reddit/submission/search/?subreddit=gadgets&after=1532157010&before=2018-08-01&limit=1000
546


In [16]:
# Preview the first scraped submission
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded
0,gadgets,8v7cxb,2018-07-01 03:55:54,1530417354,5つの発明であなたのマインドを吹くこと,,0,1,0


In [17]:
# Look up the comments of every submission (one request per row) and keep
# each resulting list in a new 'comments' column
submissions['comments'] = submissions['id'].apply(get_submission_comments)

https://api.pushshift.io/reddit/comment/search/?link_id=8v7cxb
https://api.pushshift.io/reddit/comment/search/?link_id=8v8tyn
https://api.pushshift.io/reddit/comment/search/?link_id=8v8xha
https://api.pushshift.io/reddit/comment/search/?link_id=8v91o1
https://api.pushshift.io/reddit/comment/search/?link_id=8v926v
https://api.pushshift.io/reddit/comment/search/?link_id=8v95sj
https://api.pushshift.io/reddit/comment/search/?link_id=8v9k14
https://api.pushshift.io/reddit/comment/search/?link_id=8v9orh
https://api.pushshift.io/reddit/comment/search/?link_id=8va3nr
https://api.pushshift.io/reddit/comment/search/?link_id=8va59z
https://api.pushshift.io/reddit/comment/search/?link_id=8vabu6
https://api.pushshift.io/reddit/comment/search/?link_id=8vahn6
https://api.pushshift.io/reddit/comment/search/?link_id=8vanl8
https://api.pushshift.io/reddit/comment/search/?link_id=8varcm
https://api.pushshift.io/reddit/comment/search/?link_id=8vbh3n
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8vzra8
https://api.pushshift.io/reddit/comment/search/?link_id=8w02i8
https://api.pushshift.io/reddit/comment/search/?link_id=8w09l6
https://api.pushshift.io/reddit/comment/search/?link_id=8w0blk
https://api.pushshift.io/reddit/comment/search/?link_id=8w0lx9
https://api.pushshift.io/reddit/comment/search/?link_id=8w0o8r
https://api.pushshift.io/reddit/comment/search/?link_id=8w0ov1
https://api.pushshift.io/reddit/comment/search/?link_id=8w0pxc
https://api.pushshift.io/reddit/comment/search/?link_id=8w0zkj
https://api.pushshift.io/reddit/comment/search/?link_id=8w131u
https://api.pushshift.io/reddit/comment/search/?link_id=8w1ag7
https://api.pushshift.io/reddit/comment/search/?link_id=8w1apq
https://api.pushshift.io/reddit/comment/search/?link_id=8w1bu9
https://api.pushshift.io/reddit/comment/search/?link_id=8w1h7p
https://api.pushshift.io/reddit/comment/search/?link_id=8w1rvk
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8wyedl
https://api.pushshift.io/reddit/comment/search/?link_id=8wykr6
https://api.pushshift.io/reddit/comment/search/?link_id=8wywfd
https://api.pushshift.io/reddit/comment/search/?link_id=8wyxs1
https://api.pushshift.io/reddit/comment/search/?link_id=8wzmt3
https://api.pushshift.io/reddit/comment/search/?link_id=8wzrop
https://api.pushshift.io/reddit/comment/search/?link_id=8wzw9a
https://api.pushshift.io/reddit/comment/search/?link_id=8x00ap
https://api.pushshift.io/reddit/comment/search/?link_id=8x047r
https://api.pushshift.io/reddit/comment/search/?link_id=8x0m85
https://api.pushshift.io/reddit/comment/search/?link_id=8x110c
https://api.pushshift.io/reddit/comment/search/?link_id=8x1n7h
https://api.pushshift.io/reddit/comment/search/?link_id=8x1opq
https://api.pushshift.io/reddit/comment/search/?link_id=8x1oqg
https://api.pushshift.io/reddit/comment/search/?link_id=8x1x2p
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8xr3xh
https://api.pushshift.io/reddit/comment/search/?link_id=8xrftm
https://api.pushshift.io/reddit/comment/search/?link_id=8xro0h
https://api.pushshift.io/reddit/comment/search/?link_id=8xrphr
https://api.pushshift.io/reddit/comment/search/?link_id=8xs0nr
https://api.pushshift.io/reddit/comment/search/?link_id=8xs3qf
https://api.pushshift.io/reddit/comment/search/?link_id=8xs8mb
https://api.pushshift.io/reddit/comment/search/?link_id=8xsmbq
https://api.pushshift.io/reddit/comment/search/?link_id=8xt00q
https://api.pushshift.io/reddit/comment/search/?link_id=8xtcds
https://api.pushshift.io/reddit/comment/search/?link_id=8xtrjq
https://api.pushshift.io/reddit/comment/search/?link_id=8xu7bb
https://api.pushshift.io/reddit/comment/search/?link_id=8xua99
https://api.pushshift.io/reddit/comment/search/?link_id=8xucdl
https://api.pushshift.io/reddit/comment/search/?link_id=8xujfk
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8yc0r5
https://api.pushshift.io/reddit/comment/search/?link_id=8yc0s8
https://api.pushshift.io/reddit/comment/search/?link_id=8yc2p7
https://api.pushshift.io/reddit/comment/search/?link_id=8yc41k
https://api.pushshift.io/reddit/comment/search/?link_id=8yc8ib
https://api.pushshift.io/reddit/comment/search/?link_id=8yca1x
https://api.pushshift.io/reddit/comment/search/?link_id=8ycebn
https://api.pushshift.io/reddit/comment/search/?link_id=8yceip
https://api.pushshift.io/reddit/comment/search/?link_id=8ycfbo
https://api.pushshift.io/reddit/comment/search/?link_id=8yct52
https://api.pushshift.io/reddit/comment/search/?link_id=8yd9nq
https://api.pushshift.io/reddit/comment/search/?link_id=8yd9rb
https://api.pushshift.io/reddit/comment/search/?link_id=8ydvht
https://api.pushshift.io/reddit/comment/search/?link_id=8ye0b6
https://api.pushshift.io/reddit/comment/search/?link_id=8yeu5w
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8z0y7o
https://api.pushshift.io/reddit/comment/search/?link_id=8z18r3
https://api.pushshift.io/reddit/comment/search/?link_id=8z190e
https://api.pushshift.io/reddit/comment/search/?link_id=8z1agj
https://api.pushshift.io/reddit/comment/search/?link_id=8z1bp2
https://api.pushshift.io/reddit/comment/search/?link_id=8z1e1w
https://api.pushshift.io/reddit/comment/search/?link_id=8z1kug
https://api.pushshift.io/reddit/comment/search/?link_id=8z1yh4
https://api.pushshift.io/reddit/comment/search/?link_id=8z2027
https://api.pushshift.io/reddit/comment/search/?link_id=8z21vk
https://api.pushshift.io/reddit/comment/search/?link_id=8z2lya
https://api.pushshift.io/reddit/comment/search/?link_id=8z2t7g
https://api.pushshift.io/reddit/comment/search/?link_id=8z2ztb
https://api.pushshift.io/reddit/comment/search/?link_id=8z37td
https://api.pushshift.io/reddit/comment/search/?link_id=8z3bfb
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=8zlqy2
https://api.pushshift.io/reddit/comment/search/?link_id=8zlv91
https://api.pushshift.io/reddit/comment/search/?link_id=8zlx44
https://api.pushshift.io/reddit/comment/search/?link_id=8zm96p
https://api.pushshift.io/reddit/comment/search/?link_id=8zmhbg
https://api.pushshift.io/reddit/comment/search/?link_id=8zmltf
https://api.pushshift.io/reddit/comment/search/?link_id=8zmuju
https://api.pushshift.io/reddit/comment/search/?link_id=8zmuzw
https://api.pushshift.io/reddit/comment/search/?link_id=8zn6q9
https://api.pushshift.io/reddit/comment/search/?link_id=8znn9v
https://api.pushshift.io/reddit/comment/search/?link_id=8znqvh
https://api.pushshift.io/reddit/comment/search/?link_id=8zntn5
https://api.pushshift.io/reddit/comment/search/?link_id=8zo3q4
https://api.pushshift.io/reddit/comment/search/?link_id=8zok8d
https://api.pushshift.io/reddit/comment/search/?link_id=8zow7s
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=907f1c
https://api.pushshift.io/reddit/comment/search/?link_id=907l29
https://api.pushshift.io/reddit/comment/search/?link_id=907n21
https://api.pushshift.io/reddit/comment/search/?link_id=907qly
https://api.pushshift.io/reddit/comment/search/?link_id=907rgp
https://api.pushshift.io/reddit/comment/search/?link_id=90802x
https://api.pushshift.io/reddit/comment/search/?link_id=9082s3
https://api.pushshift.io/reddit/comment/search/?link_id=9083h9
https://api.pushshift.io/reddit/comment/search/?link_id=908ayo
https://api.pushshift.io/reddit/comment/search/?link_id=908jw5
https://api.pushshift.io/reddit/comment/search/?link_id=908twj
https://api.pushshift.io/reddit/comment/search/?link_id=908y40
https://api.pushshift.io/reddit/comment/search/?link_id=9095xk
https://api.pushshift.io/reddit/comment/search/?link_id=909cl3
https://api.pushshift.io/reddit/comment/search/?link_id=909ghk
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=90ywp7
https://api.pushshift.io/reddit/comment/search/?link_id=90z2b8
https://api.pushshift.io/reddit/comment/search/?link_id=90z8tr
https://api.pushshift.io/reddit/comment/search/?link_id=90zgjd
https://api.pushshift.io/reddit/comment/search/?link_id=90znf5
https://api.pushshift.io/reddit/comment/search/?link_id=910cda
https://api.pushshift.io/reddit/comment/search/?link_id=910ol8
https://api.pushshift.io/reddit/comment/search/?link_id=910xwz
https://api.pushshift.io/reddit/comment/search/?link_id=911yqi
https://api.pushshift.io/reddit/comment/search/?link_id=912yow
https://api.pushshift.io/reddit/comment/search/?link_id=9130lc
https://api.pushshift.io/reddit/comment/search/?link_id=9133cj
https://api.pushshift.io/reddit/comment/search/?link_id=91393d
https://api.pushshift.io/reddit/comment/search/?link_id=913bax
https://api.pushshift.io/reddit/comment/search/?link_id=913iod
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=91kd13
https://api.pushshift.io/reddit/comment/search/?link_id=91kdju
https://api.pushshift.io/reddit/comment/search/?link_id=91kjp5
https://api.pushshift.io/reddit/comment/search/?link_id=91kn7t
https://api.pushshift.io/reddit/comment/search/?link_id=91knym
https://api.pushshift.io/reddit/comment/search/?link_id=91lmdb
https://api.pushshift.io/reddit/comment/search/?link_id=91lnt6
https://api.pushshift.io/reddit/comment/search/?link_id=91lypb
https://api.pushshift.io/reddit/comment/search/?link_id=91mk4u
https://api.pushshift.io/reddit/comment/search/?link_id=91mq85
https://api.pushshift.io/reddit/comment/search/?link_id=91n4mi
https://api.pushshift.io/reddit/comment/search/?link_id=91n6c9
https://api.pushshift.io/reddit/comment/search/?link_id=91nocv
https://api.pushshift.io/reddit/comment/search/?link_id=91nsrp
https://api.pushshift.io/reddit/comment/search/?link_id=91o0gk
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=925sut
https://api.pushshift.io/reddit/comment/search/?link_id=925yfo
https://api.pushshift.io/reddit/comment/search/?link_id=926qr3
https://api.pushshift.io/reddit/comment/search/?link_id=9274av
https://api.pushshift.io/reddit/comment/search/?link_id=9275n4
https://api.pushshift.io/reddit/comment/search/?link_id=928bav
https://api.pushshift.io/reddit/comment/search/?link_id=928c47
https://api.pushshift.io/reddit/comment/search/?link_id=928hf1
https://api.pushshift.io/reddit/comment/search/?link_id=928hgu
https://api.pushshift.io/reddit/comment/search/?link_id=928t5h
https://api.pushshift.io/reddit/comment/search/?link_id=928txt
https://api.pushshift.io/reddit/comment/search/?link_id=928yjk
https://api.pushshift.io/reddit/comment/search/?link_id=92972k
https://api.pushshift.io/reddit/comment/search/?link_id=92a12d
https://api.pushshift.io/reddit/comment/search/?link_id=92a36v
https://api.pushshift.io/reddit/comment/search/?link_id

https://api.pushshift.io/reddit/comment/search/?link_id=931fs2
https://api.pushshift.io/reddit/comment/search/?link_id=931nw4
https://api.pushshift.io/reddit/comment/search/?link_id=931tf4
https://api.pushshift.io/reddit/comment/search/?link_id=931xme
https://api.pushshift.io/reddit/comment/search/?link_id=9323ab
https://api.pushshift.io/reddit/comment/search/?link_id=932932
https://api.pushshift.io/reddit/comment/search/?link_id=932jp8
https://api.pushshift.io/reddit/comment/search/?link_id=932pde
https://api.pushshift.io/reddit/comment/search/?link_id=932taj
https://api.pushshift.io/reddit/comment/search/?link_id=932u5i
https://api.pushshift.io/reddit/comment/search/?link_id=932yga
https://api.pushshift.io/reddit/comment/search/?link_id=9331my
https://api.pushshift.io/reddit/comment/search/?link_id=933475
https://api.pushshift.io/reddit/comment/search/?link_id=933bb7
https://api.pushshift.io/reddit/comment/search/?link_id=933fgl
https://api.pushshift.io/reddit/comment/search/?link_id

In [18]:
# Optional step (currently disabled): collapse each submission's list of comments into a single string
# submissions.comments = submissions.comments.apply(combine_comments)

In [19]:
# Preview the first submission again, now with its 'comments' column
submissions.head(n=1)

Unnamed: 0,subreddit,id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,8v7cxb,2018-07-01 03:55:54,1530417354,5つの発明であなたのマインドを吹くこと,,0,1,0,


In [20]:
# NOTE: the commented-out code below is disabled. It sketches an alternative output
# (output_df) in which title, body and comments are merged into one text column.

# Construct output_df, which includes all the info we need from submissions
#output_df = submissions[['subreddit', 'id', 'created_time', 'created_utc', 'num_comments', 'score', 'gilded']]

# Combine the submission title, body, and comments into a single column called submission_text
#output_df['submission_text'] = (submissions['title'].map(str) + 
#                                submissions['selftext'].map(str) + 
#                                submissions['comments'].map(str))

# Rename id to submission_id (for clarity).
# Reassign the result rather than mutating with inplace=True, so the cell
# reads as "new frame from old frame" and chains like other pandas calls.
submissions = submissions.rename(columns={'id': 'submission_id'})

In [21]:
# Preview the first submission after renaming 'id' to 'submission_id'
submissions.head(n=1)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
0,gadgets,8v7cxb,2018-07-01 03:55:54,1530417354,5つの発明であなたのマインドを吹くこと,,0,1,0,


In [22]:
# Pickle
import pickle

# Save the submissions dataframe to disk so another project file can load it.
# The filename repeats the scraped date range.
with open('2018-07-01_to_2018-08-01', mode='wb') as out_file:
    pickle.dump(submissions, out_file)