In [1]:
#Project Objectives:
#Collect posts from two subreddits of my choice (r/oculus and r/Vive).
#Use NLP to train a classifier model on which subreddit a given post came from.

#Problem Statement:
#Predict whether a given post originates from r/oculus or r/Vive. In addition, conduct a sentiment analysis on both subreddits.

In [2]:
import numpy as np
import pandas as pd
import requests
import time
import datetime as dt
import json

#### Define Pushshift search function

In [3]:
from bs4 import BeautifulSoup

In [4]:
#Ref: https://github.com/pushshift/api

def pushshift(subreddit, post_type, loops=1, size=500, skip=20):
    """Download posts from one subreddit through the Pushshift search API.

    Parameters
    ----------
    subreddit : str
        Subreddit name without the ``r/`` prefix, e.g. ``'oculus'`` or ``'Vive'``.
    post_type : str
        Type of post to search for: ``'submission'`` or ``'comment'``.
    loops : int, default 1
        Number of requests to send.
    size : int, default 500
        Number of posts asked for per request (500 is the maximum according
        to the original notes on the Pushshift API).
    skip : int, default 20
        Number of days added to the ``after`` window on every request;
        request ``i`` is sent with ``after=(skip * i + 1)d``.

    Returns
    -------
    pandas.DataFrame or None
        One row per distinct post, restricted to the submission or comment
        fields listed below plus a ``post_type`` column. Fields absent from
        the API response are kept as all-NaN columns, so the column layout is
        the same even when nothing was returned. ``None`` is returned (after
        printing a message) when ``post_type`` is not recognised.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    # check before doing any work, so an invalid post_type never builds a URL
    if post_type not in ['submission', 'comment']:
        print("post_type must be 'submission' or 'comment'")
        return None

    # data fields to return for submissions
    subfields = ['author', 'author_fullname', 'created_utc', 'id', 'num_comments', 'permalink',
                 'score', 'selftext', 'subreddit', 'title', 'url', 'is_self']

    # data fields to return for comments
    comfields = ['author', 'author_fullname', 'body', 'created_utc', 'id', 'parent_id',
                 'permalink', 'score', 'subreddit']

    fields = subfields if post_type == 'submission' else comfields

    # instantiate list for posts data
    list_posts = []
    url_stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(post_type, subreddit, size)

    # the first window always reaches back at least 1 day
    after = 1

    # NOTE(review): Pushshift documents `after` as "return results after this
    # date". If results come back newest-first, successive windows
    # (after=1d, 2d, ...) may overlap heavily rather than page backwards in
    # time -- confirm whether `before` was intended.
    for i in range(loops):
        # widen the `after` window by `skip` days on each pass
        url = '{}&after={}d'.format(url_stem, skip * i + after)

        # monitor status as loops run
        print(i, url)

        # a timeout keeps a stalled connection from hanging the notebook forever
        res = requests.get(url, timeout=30)
        # fail loudly on an error status instead of on a confusing JSON/KeyError later
        res.raise_for_status()

        # add dictionaries for posts to list_posts
        list_posts.extend(res.json()['data'])

        # allow for break in between requests
        time.sleep(1)

    # list_posts is a list of dictionaries, one per post.
    # reindex (rather than df[fields]) tolerates an empty result or a field that
    # is missing from every record; the non-inplace drop_duplicates avoids
    # mutating a column-sliced frame.
    df_posts = pd.DataFrame(list_posts).reindex(columns=fields).drop_duplicates()

    # add a field identifying submissions or comments
    df_posts['post_type'] = post_type

    return df_posts

#### Get subreddit posts and save to csv

In [5]:
# Fetch r/oculus submissions in 20 requests of up to 500 posts each,
# report the frame's dimensions, then persist it to disk.
oculus_sub = pushshift(
    subreddit='oculus',
    post_type='submission',
    loops=20,
    size=500,
    skip=1,
)
print('shape', oculus_sub.shape)
oculus_sub.to_csv('oculus_sub.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=2d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=3d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=4d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=5d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=6d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=7d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=8d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=9d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=10d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=oculus&size=500&after=11d
11 https://api.pushshift.io/r

In [6]:
# Fetch r/Vive submissions in 20 requests of up to 500 posts each,
# report the frame's dimensions, then persist it to disk.
vive_sub = pushshift(
    subreddit='Vive',
    post_type='submission',
    loops=20,
    size=500,
    skip=1,
)
print('shape', vive_sub.shape)
vive_sub.to_csv('vive_sub.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=2d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=3d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=4d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=5d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=6d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=7d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=8d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=9d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=10d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=Vive&size=500&after=11d
11 https://api.pushshift.io/reddit/search/submissio

In [7]:
# Fetch r/oculus comments in 20 requests of up to 500 posts each,
# report the frame's dimensions, then persist it to disk.
oculus_com = pushshift(
    subreddit='oculus',
    post_type='comment',
    loops=20,
    size=500,
    skip=1,
)
print('shape', oculus_com.shape)
oculus_com.to_csv('oculus_com.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=oculus&size=500&after=11d
11 https://api.pushshift.io/reddit/search/comment/?subreddit=o

In [8]:
# Fetch r/Vive comments in 20 requests of up to 500 posts each,
# report the frame's dimensions, then persist it to disk.
vive_com = pushshift(
    subreddit='Vive',
    post_type='comment',
    loops=20,
    size=500,
    skip=1,
)
print('shape', vive_com.shape)
vive_com.to_csv('vive_com.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=11d
11 https://api.pushshift.io/reddit/search/comment/?subreddit=Vive&size=500&after=12d

#### Create csv to analyse comments of subreddits

In [11]:
# Stack the comment text and its subreddit label from both sources into one
# frame with a fresh 0..n-1 index, then write it out without the index column.
df = pd.concat(
    [frame[['body', 'subreddit']] for frame in (oculus_com, vive_com)],
    ignore_index=True,
)
df.to_csv('comments.csv', index=False)