# Project 3 - Part 1: Gather raw data from Reddit

In [2]:
# Import related libraries
import requests
import json
import pandas as pd
import numpy as np
import time
import random
import re

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score,auc
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Accessing Reddit API

In [11]:
# JSON listing endpoints of the two subreddits compared in this project:
# url_0 -> r/Basketball, url_1 -> r/baseball
url_0, url_1 = (
    'https://www.reddit.com/r/Basketball.json',
    'https://www.reddit.com/r/baseball.json',
)

In [3]:
# Request the first page of the Basketball listing with a custom User-agent header.
# The timeout stops this cell from hanging indefinitely if the server never answers;
# requests raises a Timeout exception instead.
res_0 = requests.get(url_0, headers={'User-agent': 'sat 1.0'}, timeout=10)
# Display the HTTP status code (200 means the request succeeded)
res_0.status_code

200

# Explore data

We are interested in the top/most popular 1000 posts for each subreddit. The JSON response for a subreddit is nested in several layers of keys:

- Top-level keys: 'kind', 'data'
- 'data' key: 'modhash', 'dist', 'children', 'after', 'before'
- 'children' key: the list of posts, which holds the important post information

## Start with the Basketball subreddit

In [4]:
# Parse the JSON body of the Basketball response into a Python dict
dict_0=res_0.json()

In [5]:
# Top-level keys of the parsed response
dict_0.keys()

dict_keys(['kind', 'data'])

In [6]:
# Keys nested one level down, under 'data'
dict_0['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [7]:
# 'children' is a list with one entry per post; each entry keeps the post's
# fields (title, subreddit, ups, name, ...) under its own 'data' key
dict_0['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'Basketball',
   'author_fullname': 't2_aj47j',
   'saved': False,
   'mod_reason_title': None,
   'gilded': 0,
   'clicked': False,
   'title': 'Rule: You cannot Ask for or Post Copyright Material on the Sub',
   'link_flair_richtext': [],
   'subreddit_name_prefixed': 'r/Basketball',
   'hidden': False,
   'pwls': 6,
   'link_flair_css_class': None,
   'downs': 0,
   'thumbnail_height': None,
   'hide_score': False,
   'name': 't3_dk4qe2',
   'quarantine': False,
   'link_flair_text_color': 'dark',
   'author_flair_background_color': None,
   'subreddit_type': 'public',
   'ups': 60,
   'total_awards_received': 0,
   'media_embed': {},
   'thumbnail_width': None,
   'author_flair_template_id': None,
   'is_original_content': False,
   'user_reports': [],
   'secure_media': None,
   'is_reddit_media_domain': False,
   'is_meta': False,
   'category': None,
   'secure_media_embed': {},
   'link_flair_text': None,
   'c

In [8]:
# Number of posts returned by a single request (27 for this page)
len(dict_0['data']['children'])

27

In [9]:
# 'after' holds the fullname of the LAST post on this page; appending it to the
# URL as '?after=<fullname>' requests the page that follows it
dict_0['data']['after']

't3_fdvy1p'

In [10]:
# Overview of the posts: one row per post built from each child's 'data' dict,
# previewing the first five posts transposed so that every field is a row
posts = pd.DataFrame([child['data'] for child in dict_0['data']['children']])
posts.head().T

Unnamed: 0,0,1,2,3,4
all_awardings,[],[],[],[],[]
allow_live_comments,False,False,False,False,False
approved_at_utc,,,,,
approved_by,,,,,
archived,False,False,False,False,False
author,Commandant1,AutoModerator,isaacrespondek,JoaniCreanFanClub,pardonmytake13
author_flair_background_color,,,,,
author_flair_css_class,,ballflair,,,
author_flair_richtext,[],[],[],[],[]
author_flair_template_id,,,,,


In [11]:
# Fullnames ('t3_...') of every post in this extract; the last one is the
# same value that the response reports under 'after'
posts.name

0     t3_dk4qe2
1     t3_fcc4h8
2     t3_ff03x7
3     t3_fewdq3
4     t3_ff1pcq
5     t3_ff0g7c
6     t3_fehyyz
7     t3_ferz6p
8     t3_feeqyp
9     t3_fe3gpl
10    t3_feao0u
11    t3_fec3op
12    t3_fe86ev
13    t3_fe9bfp
14    t3_fe7lo2
15    t3_fe8fxx
16    t3_fe6dmb
17    t3_fe7ycs
18    t3_fe27os
19    t3_fe4pc9
20    t3_fe4gsb
21    t3_fe2639
22    t3_fdzzmv
23    t3_fdzik8
24    t3_fe2px9
25    t3_fdib5q
26    t3_fdvy1p
Name: name, dtype: object

In [12]:
# Page through the Basketball listing: at most 40 requests. The first page
# returned 27 posts, so this collects roughly 1,000 posts in total.
posts_0 = []
after = None

for a in range(40):
    # The first request has no cursor; later requests continue after the last post seen
    if after is None:
        current_url = url_0
    else:
        current_url = url_0 + '?after=' + after
    print(current_url)
    # timeout keeps a stalled connection from hanging the loop forever
    res = requests.get(current_url, headers={'User-agent': 'sat 1.0'}, timeout=10)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts_0.extend(current_posts)
    after = current_dict['data']['after']

    # Without a cursor there is no further page to ask for. Stop here: otherwise
    # the next iteration would request the first page again and append duplicates.
    if after is None:
        break

    # generate a random sleep duration between requests
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/Basketball.json
5
https://www.reddit.com/r/Basketball.json?after=t3_fdvy1p
3
https://www.reddit.com/r/Basketball.json?after=t3_fcoepu
6
https://www.reddit.com/r/Basketball.json?after=t3_fbnpey
3
https://www.reddit.com/r/Basketball.json?after=t3_fawy8t
3
https://www.reddit.com/r/Basketball.json?after=t3_f9qp9u
2
https://www.reddit.com/r/Basketball.json?after=t3_f94hug
6
https://www.reddit.com/r/Basketball.json?after=t3_f7xxx1
6
https://www.reddit.com/r/Basketball.json?after=t3_f75ypg
2
https://www.reddit.com/r/Basketball.json?after=t3_f6232p
4
https://www.reddit.com/r/Basketball.json?after=t3_f4le7s
4
https://www.reddit.com/r/Basketball.json?after=t3_f4eimp
4
https://www.reddit.com/r/Basketball.json?after=t3_f2xtel
5
https://www.reddit.com/r/Basketball.json?after=t3_f21sjc
2
https://www.reddit.com/r/Basketball.json?after=t3_f1bg8t
2
https://www.reddit.com/r/Basketball.json?after=t3_f0gvgo
5
https://www.reddit.com/r/Basketball.json?after=t3_ezl6vm
6
https://www.r

In [14]:
# Save all the raw basketball posts as csv so that the scraping loop above does
# not need to be re-run each time (index=False leaves the row index out of the file)
pd.DataFrame(posts_0).to_csv('basketball.csv', index = False)

## Repeat the above process for the baseball subreddit

In [12]:
# Request the first page of the baseball listing with a custom User-agent header.
# The timeout stops this cell from hanging indefinitely if the server never answers;
# requests raises a Timeout exception instead.
res_1 = requests.get(url_1, headers={'User-agent': 'Ruili 1.0'}, timeout=10)
# Display the HTTP status code (200 means the request succeeded)
res_1.status_code

200

In [21]:
# Parse the JSON body of the baseball response into a Python dict
dict_1 = res_1.json()

In [22]:
# Number of posts returned by a single request to the baseball listing
len(dict_1['data']['children'])

27

In [23]:
# One row per post from the first baseball page, built from each child's 'data' dict
posts_1 = pd.DataFrame([child['data'] for child in dict_1['data']['children']])

In [24]:
# Page through the baseball listing: at most 40 requests (roughly 1,000 posts),
# saving the posts collected so far to 'baseball.csv' after every page.
posts_1 = []
after = None

for a in range(40):
    # The first request has no cursor; later requests continue after the last post seen
    if after is None:
        current_url = url_1
    else:
        current_url = url_1 + '?after=' + after
    print(current_url)
    # timeout keeps a stalled connection from hanging the loop forever
    res = requests.get(current_url, headers={'User-agent': 'molly Inc 1.0'}, timeout=10)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts_1.extend(current_posts)
    after = current_dict['data']['after']

    # Checkpoint everything gathered so far so an interrupted run keeps its pages.
    # (Previously only the first page was written, and later iterations re-read
    # the file into a variable that was never used.)
    pd.DataFrame(posts_1).to_csv('baseball.csv', index = False)

    # Without a cursor there is no further page to ask for. Stop here: otherwise
    # the next iteration would request the first page again and append duplicates.
    if after is None:
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/baseball.json
2
https://www.reddit.com/r/baseball.json?after=t3_ff5wx7
3
https://www.reddit.com/r/baseball.json?after=t3_fewuxk
5
https://www.reddit.com/r/baseball.json?after=t3_fei3ag
6
https://www.reddit.com/r/baseball.json?after=t3_fejowo
6
https://www.reddit.com/r/baseball.json?after=t3_fejwvi
4
https://www.reddit.com/r/baseball.json?after=t3_feht88
4
https://www.reddit.com/r/baseball.json?after=t3_fe9nfa
2
https://www.reddit.com/r/baseball.json?after=t3_fe3frp
4
https://www.reddit.com/r/baseball.json?after=t3_fdfwsp
2
https://www.reddit.com/r/baseball.json?after=t3_fdlid6
5
https://www.reddit.com/r/baseball.json?after=t3_fdg87h
2
https://www.reddit.com/r/baseball.json?after=t3_fd15nr
2
https://www.reddit.com/r/baseball.json?after=t3_fcv5dk
3
https://www.reddit.com/r/baseball.json?after=t3_fcvmyq
6
https://www.reddit.com/r/baseball.json?after=t3_fchm4e
6
https://www.reddit.com/r/baseball.json?after=t3_fd0up8
3
https://www.reddit.com/r/baseball.json?after=t3

In [25]:
# Save all the raw baseball posts gathered by the loop as csv
# (index=False leaves the row index out of the file)
pd.DataFrame(posts_1).to_csv('baseball.csv', index = False)