In [1]:
import json
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [2]:
# Output column name -> key inside each post's 'data' dict (order = column order)
_POST_FIELDS = {
    'Author': 'author',
    'Title': 'title',
    'Selftext': 'selftext',
    'Created Utc': 'created_utc',
    'Ups': 'ups',
    'Downs': 'downs',
    'Upvote Ratio': 'upvote_ratio',
    'Num Comments': 'num_comments',
    'Total Awards': 'total_awards_received',
}

# headers sent with every request
_REQUEST_HEADERS = {'user-agent': 'Mozilla/5.0'}
# pause (seconds) before every request to avoid getting blocked by Reddit
_REQUEST_DELAY_SECONDS = 2
# give up on a stalled connection instead of hanging the notebook forever
_REQUEST_TIMEOUT_SECONDS = 30


def _fetch_listing(url, after=None):
    '''Request one page of the listing at `url` and return its decoded "data" object.'''
    time.sleep(_REQUEST_DELAY_SECONDS)
    response = requests.get(
        url,
        # passing the token via `params` appends it correctly whether or not
        # `url` already carries a query string
        params={'after': after} if after else None,
        headers=_REQUEST_HEADERS,
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.json()['data']


def _iter_post_rows(listing):
    '''Yield one complete {column name: value} row per post of a listing page.'''
    for post in listing['children']:
        fields = post['data']
        # the whole row is built before it is yielded, so a post with a missing
        # key can never leave the columns with unequal lengths
        yield {column: fields[key] for column, key in _POST_FIELDS.items()}


def get_reddit_stats(url, num_its):
    '''
    Scrape posts from a Reddit JSON listing, following its pagination.

    Parameters:
    url (str): Reddit url where we'll scrape data from
    example: 'http://www.reddit.com/r/cryptocurrency/top.json?sort=top&t=year'

    num_its (int): Maximum number of follow-up pages to request through the
    'after' token, in addition to the first page. Paging stops earlier as soon
    as the listing reports no further 'after' token.

    Returns:
    dfs (DataFrame): One row per post with the columns 'Author', 'Title',
    'Selftext', 'Created Utc', 'Ups', 'Downs', 'Upvote Ratio', 'Num Comments'
    and 'Total Awards'.

    Raises:
    requests.RequestException, ValueError, KeyError: if the FIRST page cannot
    be fetched or decoded. A failure on a later page does not raise: a warning
    is emitted and the posts collected so far are returned.
    '''
    rows = []

    # the first page is mandatory: any failure here is raised to the caller
    listing = _fetch_listing(url)
    rows.extend(_iter_post_rows(listing))

    pages_fetched = 0
    while pages_fetched < num_its:
        after = listing.get('after')
        if not after:
            # no token means there is no next page
            break
        try:
            listing = _fetch_listing(url, after)
            for row in _iter_post_rows(listing):
                rows.append(row)
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            # best effort: keep what was already scraped, but say that the
            # result is truncated; anything else (e.g. KeyboardInterrupt)
            # propagates instead of being silently swallowed
            warnings.warn(
                'Stopped paging after %d posts: %r' % (len(rows), exc),
                stacklevel=2,
            )
            break
        pages_fetched += 1

    dfs = pd.DataFrame(rows, columns=list(_POST_FIELDS))

    return dfs


In [11]:
# Scrape
# Listing to scrape: r/bitcoin, `top.json` with sort=top and t=year.
# https is used so the request is not sent in plaintext.
url = 'https://www.reddit.com/r/bitcoin/top.json?sort=top&t=year'

# Upper bound on follow-up pages; get_reddit_stats stops on its own as soon as
# a page cannot be followed any further, so this is only a safety cap.
MAX_PAGES = 10000

df_rd = get_reddit_stats(url, MAX_PAGES)

In [12]:
# Add a calendar-date column derived from the epoch-seconds timestamps, order
# the posts newest first, then keep only the first row for each distinct title.
df_rd = (
    df_rd
    .assign(Date=pd.to_datetime(df_rd['Created Utc'], unit='s').dt.date)
    .sort_values('Date', ascending=False)
    .drop_duplicates(subset='Title', keep='first')
)

In [13]:
# (rows, columns) after de-duplication; the recorded run shows (999, 10)
df_rd.shape

(999, 10)

In [14]:
# Remove the 'Downs' column. Rebinding the name (instead of inplace=True) and
# errors='ignore' keep this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df_rd = df_rd.drop(columns=['Downs'], errors='ignore')

In [15]:
# Persist the scraped posts to CSV in the working directory, without the index column.
# NOTE(review): 'row' in the file name may be a typo for 'raw' — confirm that
# nothing else reads this path before renaming it.
df_rd.to_csv('df_rd_row.csv', index=False)

# EDA

In [None]:
# Distribution of comment counts per post. An explicit figure/axes pair is used
# so the plot can carry a title and axis labels and stand on its own.
# (matplotlib.pyplot is imported as `plt` in the notebook's import cell.)
fig, ax = plt.subplots()
df_rd.hist(column='Num Comments', bins=100, ax=ax)
ax.set(title='Comments per post', xlabel='Num Comments', ylabel='Number of posts');

In [None]:
# Preview the first rows instead of dumping the whole frame into the output
df_rd.head(10)

# Backup

In [None]:
# Debug helper: print every raw field of the first three posts of the listing,
# to see which keys Reddit returns for a post.
# `r` only ever existed as a local variable inside get_reddit_stats, so this
# cell fetches its own response from `url` instead of relying on leftover state.
r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=30)
r.raise_for_status()

for post in r.json()['data']['children'][:3]:
    print()
    print()
    print('=================')
    for p in post['data'].items():
        print(p)