In [70]:
import os
import time

import numpy as np
import pandas as pd
import requests

## Problem Statement:

The problem we are tackling is a binary classification problem. The goal is to build a model that accurately classifies Reddit posts by the subreddit they originated from: r/pcmasterrace or r/mac. We will build and compare two models: a Naive Bayes classifier and a logistic regression model. The aim is a model that generalizes well to new observations and correctly predicts which subreddit each post came from.

## Testing Connection:

In [71]:
# Connection smoke test: request the reddit.com "hot" listing as JSON.
url = "https://www.reddit.com/hot.json"
# Custom User-agent header sent with the request.
headers = {'User-agent':'Bleep blorp bot 0.1'}
res = requests.get(url, headers=headers)
the_json = res.json()
# Display the (sorted) keys of the response's 'data' object.
sorted(the_json['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [72]:
# HTTP status code of the test response `res`.
res.status_code

200

In [73]:
# Number of entries returned in a single page of the listing.
len(the_json['data']['children'])

25

## Functions for scraping data and transforming it into a DataFrame:

In [74]:
def get_posts(url,interactions,header,sleep):
    """Collect raw posts from a Reddit JSON listing, following pagination.

    Parameters
    ----------
    url : str
        Listing endpoint, e.g. 'https://www.reddit.com/r/mac.json'.
    interactions : int
        Maximum number of page requests to make.
    header : dict
        HTTP headers sent with every request.
    sleep : float
        Seconds to pause after each successful request that has a next page.

    Returns
    -------
    list
        The entries of each page's ``data['children']``, in the order fetched.
        Collection stops early on a non-200 response (the status code is
        printed) or when the listing reports no further page.
    """
    posts = []
    after = None
    for i in range(interactions):
        print(i)
        params = {} if after is None else {'after': after}
        # Use the caller-supplied `header`; do not rely on a global `headers`
        # that only exists if another cell happened to run first.
        res = requests.get(url, params=params, headers=header)
        if res.status_code != 200:
            print(res.status_code)
            break
        data = res.json()['data']
        posts.extend(data['children'])
        after = data['after']
        if after is None:
            # No further page: continuing would send empty params, restart
            # from the first page and append duplicate posts.
            break
        time.sleep(sleep)
    return posts

In [75]:
def create_cols(dataframe):
    """Add one column per selected post field, extracted from the 'data' column.

    Each row's 'data' value is indexed by key, and the result is stored in a
    column of the same name: 'subreddit', 'title', 'name', 'selftext' and
    'domain', in that order. The frame is modified in place and returned.
    """
    raw_posts = dataframe['data']
    for field in ('subreddit', 'title', 'name', 'selftext', 'domain'):
        # Bind `field` as a default so each lambda keeps its own key.
        dataframe[field] = raw_posts.map(lambda post, key=field: post[key])
    return dataframe

## Grabbing Data:

In [76]:
# Scrape r/pcmasterrace: up to 40 page requests, pausing 1.5 s after each one.
header = {'User-agent': 'Bleep blorp bot 0.1'}
url = 'https://www.reddit.com/r/pcmasterrace.json'
interations = 40  # number of page requests passed to get_posts
sleep_sec = 1.5  # pause after each request, in seconds
# Build a DataFrame from the raw posts, then add the extracted per-post columns.
pcmasterrace_df = pd.DataFrame(get_posts(url,interations,header,sleep_sec))
pcmasterrace_df = create_cols(pcmasterrace_df)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [77]:
# Scrape r/mac: up to 40 page requests, pausing 1.5 s after each one.
header = {'User-agent': 'Bleep blorp bot 0.1'}
url = 'https://www.reddit.com/r/mac.json'
interations = 40  # number of page requests passed to get_posts
sleep_sec = 1.5  # pause after each request, in seconds
# Build a DataFrame from the raw posts, then add the extracted per-post columns.
mac_df = pd.DataFrame(get_posts(url,interations,header,sleep_sec))
mac_df = create_cols(mac_df)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


## Saving Scraped Data

In [78]:
# Save both scraped frames to CSV under ./data.
# Create the output directory first: to_csv raises if it does not exist.
os.makedirs('./data', exist_ok=True)
pcmasterrace_df.to_csv('./data/pcmasterraceDataFrame.csv', encoding='utf-8')
mac_df.to_csv('./data/macDataFrame.csv', encoding='utf-8')