In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json

In [2]:
# Pushshift endpoint for searching Reddit submissions.
url = "https://api.pushshift.io/reddit/search/submission/"

In [3]:
def read_post(post):
    """Reduce a raw submission record to the four fields this notebook keeps.

    Parameters
    ----------
    post : mapping
        A single submission record; it is indexed with the keys
        'title', 'selftext', 'author' and 'created_utc'.

    Returns
    -------
    dict or None
        A dict with keys 'title', 'text', 'auth' and 'time' (in that order),
        or None when any of the source keys is missing from ``post``.
    """
    # (output key, source key) pairs, in the order the output dict is built.
    field_map = (
        ('title', 'title'),
        ('text', 'selftext'),
        ('auth', 'author'),
        ('time', 'created_utc'),
    )
    parsed = {}
    for out_key, src_key in field_map:
        try:
            parsed[out_key] = post[src_key]
        except KeyError:
            # An incomplete record is signalled with None rather than a partial dict.
            return None
    return parsed

def pushshift_query(subreddit, num_loops = 5, size = 100):
    """Page backwards in time through a subreddit's submissions.

    Each iteration requests up to ``size`` submissions created before the
    current cursor, keeps those that ``read_post`` can parse and whose text is
    not "[removed]", and then moves the cursor to the oldest timestamp seen on
    that page.

    Parameters
    ----------
    subreddit : str
        Value sent as the 'subreddit' query parameter.
    num_loops : int, default 5
        Maximum number of requests to make. Failed requests count towards it.
    size : int, default 100
        Value sent as the 'size' query parameter (posts requested per page).

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'text', 'auth', 'time'. Pages are concatenated as-is,
        so the index restarts at 0 for every page. If nothing was collected an
        empty frame with those four columns is returned.
    """
    columns = ['title', 'text', 'auth', 'time']
    posts_list = []
    current_time = int(time.time())
    for query in range(num_loops):
        print(f"query # {query}")
        try:
            res = requests.get(
                url,
                {'subreddit': subreddit, 'size': size, 'before': current_time},
                timeout=30,  # never hang the notebook on a stalled connection
            )
        except requests.RequestException as err:
            # A transient network error should not discard pages already collected.
            print(f"query failed - {err}")
            time.sleep(1)
            continue
        if res.status_code != 200:
            print(f"query failed - code {res.status_code}")
            # Pause before retrying so a failing endpoint is not hit in a tight loop.
            time.sleep(1)
            continue
        try:
            raw_posts = res.json().get('data', [])
        except ValueError:
            print("query failed - response was not valid JSON")
            time.sleep(1)
            continue
        if not raw_posts:
            # Nothing older than the cursor: further requests would return nothing.
            print("no more posts returned - stopping early")
            break

        posts = []
        for post in raw_posts:
            post_parsed = read_post(post)
            if post_parsed is not None and post_parsed['text'] != "[removed]":
                posts.append(post_parsed)

        # Advance the cursor from the raw page, not the filtered one, so a page
        # whose posts were all filtered out cannot raise IndexError or stall.
        # min() avoids relying on the order in which the page was returned.
        timestamps = [post['created_utc'] for post in raw_posts if 'created_utc' in post]
        if not timestamps:
            print("no timestamps in response - stopping early")
            break
        current_time = min(timestamps)

        if posts:
            posts_list.append(pd.DataFrame(posts))
        time.sleep(1)  # pause between successive requests
        print(f"posts collected: {len(posts)} / {size}")

    if not posts_list:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=columns)
    return pd.concat(posts_list)

In [4]:
import os

# Create the output directory for the scraped CSVs. exist_ok=True makes the
# cell safe to re-run, while real failures (e.g. permissions) still surface
# instead of being swallowed by a bare except.
os.makedirs('data', exist_ok=True)

In [11]:
# Scrape r/OCPoetry (up to 500 requests) and persist the result as CSV.
data_poetry = pushshift_query(subreddit='OCPoetry', num_loops=500)
data_poetry.to_csv(path_or_buf='data/OCPoetry.csv')

query # 0
posts collected: 51 / 100
query # 1
posts collected: 55 / 100
query # 2
posts collected: 61 / 100
query # 3
posts collected: 50 / 100
query # 4
posts collected: 65 / 100
query # 5
posts collected: 64 / 100
query # 6
posts collected: 61 / 100
query # 7
posts collected: 65 / 100
query # 8
posts collected: 68 / 100
query # 9
posts collected: 63 / 100
query # 10
posts collected: 49 / 100
query # 11
posts collected: 56 / 100
query # 12
posts collected: 71 / 100
query # 13
posts collected: 57 / 100
query # 14
posts collected: 57 / 100
query # 15
posts collected: 55 / 100
query # 16
posts collected: 56 / 100
query # 17
posts collected: 61 / 100
query # 18
posts collected: 65 / 100
query # 19
posts collected: 70 / 100
query # 20
posts collected: 57 / 100
query # 21
posts collected: 66 / 100
query # 22
posts collected: 61 / 100
query # 23
posts collected: 62 / 100
query # 24
posts collected: 61 / 100
query # 25
posts collected: 67 / 100
query # 26
posts collected: 56 / 100
query # 27


In [12]:
# Scrape r/shortscarystories (up to 300 requests) and persist the result as CSV.
data_sss = pushshift_query(subreddit='shortscarystories', num_loops=300)
data_sss.to_csv(path_or_buf='data/shortscarystories.csv')

query # 0
posts collected: 100 / 100
query # 1
posts collected: 100 / 100
query # 2
posts collected: 100 / 100
query # 3
posts collected: 99 / 100
query # 4
posts collected: 100 / 100
query # 5
posts collected: 97 / 100
query # 6
posts collected: 100 / 100
query # 7
posts collected: 93 / 100
query # 8
posts collected: 92 / 100
query # 9
posts collected: 97 / 100
query # 10
posts collected: 93 / 100
query # 11
posts collected: 93 / 100
query # 12
posts collected: 94 / 100
query # 13
posts collected: 96 / 100
query # 14
posts collected: 97 / 100
query # 15
posts collected: 97 / 100
query # 16
posts collected: 99 / 100
query # 17
posts collected: 100 / 100
query # 18
posts collected: 96 / 100
query # 19
posts collected: 98 / 100
query # 20
posts collected: 94 / 100
query # 21
posts collected: 95 / 100
query # 22
posts collected: 92 / 100
query # 23
posts collected: 96 / 100
query # 24
posts collected: 92 / 100
query # 25
posts collected: 87 / 100
query # 26
posts collected: 93 / 100
query

In [13]:
# (rows, columns) of the scraped OCPoetry frame.
data_poetry.shape

(29905, 4)

In [14]:
# (rows, columns) of the scraped shortscarystories frame.
data_sss.shape

(28687, 4)

In [15]:
# Show the final rows of the OCPoetry frame.
data_poetry.tail()

Unnamed: 0,title,text,auth,time
44,CHROMATOPHORES,I realized you could never change\n\nthe secon...,loooofa,1587003038
45,New Summer Moon,"The pill passes fondly between hands, \nOverhe...",TheKingGrizz,1587000164
46,The Realization,"\n\nThe realization of what life truly is,\n\...",Childish_lobino,1586999276
47,“Branches” or “Webs” or “Climbing”,A Thousand Splendid Branches \n \n\n Stret...,RoyaltyFreeAccount,1586997464
48,Love,[deleted],[deleted],1586996710


In [16]:
# Summarize column dtypes and non-null counts for the OCPoetry frame.
data_poetry.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29905 entries, 0 to 48
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   29905 non-null  object
 1   text    29905 non-null  object
 2   auth    29905 non-null  object
 3   time    29905 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.1+ MB


In [18]:
# Fetch one page with default settings so the raw response can be inspected.
res = requests.get(url, params={'subreddit': 'OCPoetry'})

In [25]:
# List the fields present on the first submission in the response.
res.json()['data'][0].keys()

dict_keys(['all_awardings', 'allow_live_comments', 'author', 'author_flair_css_class', 'author_flair_richtext', 'author_flair_text', 'author_flair_type', 'author_fullname', 'author_is_blocked', 'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post', 'content_categories', 'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id', 'is_created_from_ads_ui', 'is_crosspostable', 'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'link_flair_background_color', 'link_flair_richtext', 'link_flair_text_color', 'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'pwls', 'removed_by_category', 'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title', 'total_awards_received', 'treatment_tags', 'upvote_ratio', '

Other possible features:
* upvote_ratio
* url