In [1]:
import math
import json
import requests
import itertools
import numpy as np
import time

from datetime import datetime, timedelta

## Pull from pushshift.io

In [2]:
def get_json(url, timeout = 30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole notebook.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.HTTPError: The response status code is not 200.
        requests.RequestException: The request itself failed (connection
            error, timeout, ...).
        ValueError: The response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)

    # Raise explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, which would let error responses slip through.
    if response.status_code != 200:
        raise requests.HTTPError(
            f'unexpected status {response.status_code} for {url}',
            response=response
        )

    return json.loads(response.content)

def make_request(url, max_retries = 5):
    """Request ``url`` through ``get_json``, retrying when a call fails.

    Args:
        url: Address to request.
        max_retries: Total number of attempts. Values below 1 are treated as
            a single attempt.

    Returns:
        The parsed JSON payload of the first successful attempt.

    Raises:
        Exception: Whatever ``get_json`` raised on the final attempt.
    """
    attempts = max(1, max_retries)

    for attempt in range(1, attempts + 1):
        # Pause before every call so consecutive requests are spaced out.
        time.sleep(1)

        try:
            return get_json(url)
        except Exception:
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working. Re-raise once the retry budget is used up so the caller
            # sees the real failure.
            if attempt == attempts:
                raise

In [3]:
def pull_posts_for(subreddit, start_at, end_at):
    """Collect the submissions of ``subreddit`` created between two timestamps.

    The search endpoint is queried page by page. Whenever a page comes back
    full (``SIZE`` entries) another page is requested, starting slightly
    before the newest submission seen so far.

    Args:
        subreddit: Name of the subreddit to search.
        start_at: Lower bound, passed to the API as ``after``.
        end_at: Upper bound, passed to the API as ``before``.

    Returns:
        A list of dicts with the keys ``id``, ``author``, ``created_utc`` and
        ``prefix``, one per distinct submission id.

    Raises:
        KeyError: A response or a submission lacks an expected field.
        Exception: Whatever ``make_request`` raises when a page cannot be
            fetched.
    """
    # NOTE(review): if the API caps the page size below SIZE, a full page is
    # never detected and pagination silently stops after one request - confirm
    # against the current API limits.
    SIZE = 500
    # Seconds to step back from the newest submission when asking for the next
    # page, so submissions sharing (nearly) the same timestamp are not skipped.
    OVERLAP_IN_SECS = 10
    # The paging cursor below only makes sense for oldest-first results, so the
    # order is requested explicitly instead of relying on the API default.
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}&sort=asc'

    def map_posts(posts):
        # Keep only the fields needed later on.
        # NOTE(review): 't4_' does not look like the usual submission prefix
        # (presumably 't3_') - left unchanged, confirm before relying on it.
        return [
            {
                'id': post['id'],
                'author': post['author'],
                'created_utc': post['created_utc'],
                'prefix': 't4_'
            }
            for post in posts
        ]

    def fetch_page(after):
        response = make_request(URI_TEMPLATE.format(subreddit, after, end_at, SIZE))
        return map_posts(response['data'])

    post_collections = []
    seen_ids = set()

    def keep_new(page):
        # Pages overlap by OVERLAP_IN_SECS, so drop ids that were already kept.
        fresh = []
        for post in page:
            if post['id'] not in seen_ids:
                seen_ids.add(post['id'])
                fresh.append(post)
        post_collections.extend(fresh)
        return len(fresh)

    page = fetch_page(start_at)
    keep_new(page)

    while len(page) == SIZE:
        # Move the cursor forward; previously the first page was requested
        # again and again because the original `start_at` was reused.
        newest = max(post['created_utc'] for post in page)
        new_start_at = newest - OVERLAP_IN_SECS

        page = fetch_page(new_start_at)

        # A page without any unseen submission means no progress is possible;
        # stop instead of looping forever.
        if keep_new(page) == 0:
            break

    return post_collections

In [4]:
def give_me_intervals(start_at, number_of_days_per_interval = 3):
    """Yield consecutive ``(start, end)`` epoch-second windows up to now.

    The first window starts at ``start_at``. Every following window starts one
    second after the previous window's end, so windows never share a second.
    Windows are produced until one ends after the current time, so the last
    window may reach into the future. At least one window is always yielded.

    Args:
        start_at: Epoch seconds at which the first window begins.
        number_of_days_per_interval: Length of each window in days.

    Yields:
        Tuples ``(start, end)`` of ints, in epoch seconds.

    Raises:
        ValueError: ``number_of_days_per_interval`` is not positive, which
            would never advance the window. Raised on first iteration.
    """
    seconds_per_day = 86400
    period = seconds_per_day * number_of_days_per_interval
    if period <= 0:
        raise ValueError('number_of_days_per_interval must be positive')

    # time.time() is seconds since the epoch regardless of the local timezone.
    # datetime.utcnow().timestamp() is not: the naive datetime is interpreted
    # as local time, shifting "now" by the machine's UTC offset.
    end_at = math.ceil(time.time())

    end = start_at + period
    yield int(start_at), int(end)

    padding = 1  # seconds between one window's end and the next one's start
    while end <= end_at:
        start_at = end + padding
        end = (start_at - padding) + period

        yield int(start_at), int(end)

In [5]:
# Name of the subreddit whose submissions are collected by this notebook.
subreddit = 'Siacoin'

In [6]:
# Collect the submissions of `subreddit` from roughly four years ago until now,
# one 5-day window at a time, then store them as JSON for the next section.
posts = []

# time.time() is seconds since the epoch regardless of the local timezone.
# (datetime.utcnow() - delta).timestamp() is not: the naive datetime is
# interpreted as local time, shifting the start by the machine's UTC offset.
offset = time.time() - timedelta(days=365*4).total_seconds()
start_at = math.floor(offset)

for interval_start, interval_end in give_me_intervals(start_at, 5):

    pulled_posts = pull_posts_for(
        subreddit,
        interval_start,
        interval_end
    )

    posts.extend(pulled_posts)

    print(f'collected #{len(pulled_posts)} for {interval_start} - {interval_end})')
    # Short pause between windows to go easy on the API.
    time.sleep(1)

with open('../data/reddit/pushshift_posts.json', 'w') as pushshift_posts:
    pushshift_posts.write(json.dumps(posts))

collected #2 for 1461257850 - 1461689850)
collected #2 for 1461689851 - 1462121850)
collected #2 for 1462121851 - 1462553850)
collected #0 for 1462553851 - 1462985850)
collected #1 for 1462985851 - 1463417850)
collected #7 for 1463417851 - 1463849850)
collected #7 for 1463849851 - 1464281850)
collected #5 for 1464281851 - 1464713850)
collected #9 for 1464713851 - 1465145850)
collected #8 for 1465145851 - 1465577850)
collected #9 for 1465577851 - 1466009850)
collected #10 for 1466009851 - 1466441850)
collected #5 for 1466441851 - 1466873850)
collected #34 for 1466873851 - 1467305850)
collected #18 for 1467305851 - 1467737850)
collected #10 for 1467737851 - 1468169850)
collected #9 for 1468169851 - 1468601850)
collected #9 for 1468601851 - 1469033850)
collected #6 for 1469033851 - 1469465850)
collected #7 for 1469465851 - 1469897850)
collected #3 for 1469897851 - 1470329850)
collected #6 for 1470329851 - 1470761850)
collected #3 for 1470761851 - 1471193850)
collected #2 for 1471193851 - 

## Pull from Reddit

In [8]:
# Read back the submissions that the Pushshift section saved to disk.
# `posts` is bound first so the name exists even if opening the file fails.
posts = []
with open('../data/reddit/pushshift_posts.json', mode='r') as saved_posts:
    posts = json.load(saved_posts)

In [9]:
# Load the Reddit API settings from the JSON file next to the data.
# `config` is bound first so the name exists even if opening the file fails.
config = {}
with open('../data/reddit.json', mode='r') as settings_file:
    config = json.load(settings_file)

In [10]:
import praw

# Create the PRAW client from the three credentials stored in `config`.
credential_keys = ('client_id', 'client_secret', 'user_agent')
reddit = praw.Reddit(**{ key: config[key] for key in credential_keys })

In [11]:
# For every distinct submission id found by Pushshift, fetch the submission
# and its comments through PRAW and flatten them into `reddit_data`.
TIMEOUT_AFTER_COMMENT_IN_SECS = .5
PROGRESS_EVERY = 500


def _author_name(author):
    """Return the author's name, or 'deleted' when no author is attached."""
    return author.name if author is not None else 'deleted'


reddit_data = []

submission_ids = np.unique([ post['id'] for post in posts ])

submission_count = 0
# Count the unique ids, not `posts`: the loop below runs over unique ids, so
# using len(posts) would overstate the total whenever posts has duplicates.
total_submission_count = len(submission_ids)

for submission_id in submission_ids:
    submission = reddit.submission(id=submission_id)

    # NOTE(review): `created` is stored as-is; presumably `created_utc` is the
    # timezone-independent field - confirm which one the analysis expects.
    reddit_data.append({
        'id': submission_id,
        'type': 'submission',
        'post_id': submission_id,
        'author': _author_name(submission.author),
        'text': submission.selftext,
        'created_at': submission.created
    })

    submission.comments.replace_more(limit=None)
    # NOTE(review): iterating `submission.comments` directly presumably visits
    # top-level comments only; if replies are wanted too,
    # `submission.comments.list()` looks like the flattened form - confirm.
    for comment in submission.comments:

        reddit_data.append({
            'id': comment.id,
            'type': 'comment',
            'post_id': submission_id,
            'author': _author_name(comment.author),
            'text': comment.body,
            'created_at': comment.created
        })

        if TIMEOUT_AFTER_COMMENT_IN_SECS > 0:
            time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)

    submission_count += 1

    if submission_count % PROGRESS_EVERY == 0:
        print(submission_count, total_submission_count)

500 9826
1000 9826
1500 9826
2000 9826
2500 9826
3000 9826
3500 9826
4000 9826
4500 9826
5000 9826
5500 9826
6000 9826
6500 9826
7000 9826
7500 9826
8000 9826
8500 9826
9000 9826
9500 9826


In [12]:
# Report how many records were gathered, then store them as a single JSON file.
record_total = len(reddit_data)
print(f'len: {record_total}')

with open('../data/reddit/dataset.json', mode='w') as dataset_file:
    serialized = json.dumps(reddit_data)
    dataset_file.write(serialized)

len: 36873
