Set up library imports

In [77]:
import requests, re, json, praw, pprint, time
import pandas as pd
import numpy as np
from psaw import PushshiftAPI    # PSAW recommended by following PRAW errors
import datetime as dt            # PSAW docs
from IPython.display import display
from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Initializing reddit API

reddit = praw.Reddit(client_id='CmKUgfSklwH6Gw',
                     client_secret='WprZwImA7V8TcggsN0GfpZOfl2g',
                     user_agent='ClassProjectBot-PRAW/PSAW',
                     password='dsBaLpQSua2ctCXU2XyupJ',
                     username='refused_dev')

# Set PushShiftAPI
api = PushshiftAPI()

# Set a variable equal to the target subreddit
r_all = reddit.subreddit('all')


# Grab submissions from the subreddit
subs = []
for sub in r_all.hot(limit=None):
    sub_dict = {}
    sub_dict['submissions'] = sub
    subs.append(sub_dict)
subscrape = pd.DataFrame(subs)
subscrape.shape, subscrape.head()

# Save submission pull to csv
subscrape.to_csv("subs.csv")

# Grab features from the submission IDs
sublist = []
for c in subscrape['submissions']:
    subdict = {}
    subdict['title'] = c.title
    subdict['comments'] = c.num_comments
    subdict['crossposts'] = c.num_crossposts
    subdict['score'] = c.score
    subdict['subreddit'] = c.subreddit
    subdict['domain'] = c.domain
    subdict['gilded'] = c.gilded
    subdict['upvote_ratio'] = c.upvote_ratio
    subdict['created'] = c.created
    fri_sublist.append(subdict)

df = pd.DataFrame(sublist)

def get_date(created):
    return dt.datetime.fromtimestamp(created)
stamp = data['created'].apply(get_date)
data = data.assign(timestamp = stamp)

# get the dates
datelist = []
def humantime(data):
    for date in data['timestamp']:
        date = str(date)
        year = int(re.match(r"(\d{4})-(\d{2})-(\d{2})\s(.{8})", date).group(1))
        month = int(re.match(r"(\d{4})-(\d{2})-(\d{2})\s(.{8})", date).group(2))
        day = int(re.match(r"(\d{4})-(\d{2})-(\d{2})\s(.{8})", date).group(3))
        time = re.match(r"(\d{4})-(\d{2})-(\d{2})\s(.{8})", date).group(4)
        
        data['year'] = year
        data['month'] = month
        data['day'] = day
        data['time'] = time

Load up previous scrapes

In [2]:
# Wednesday, May 30, 2018 Scrape : 2500 subs
weds_subs2500 = pd.read_csv("2500subs.csv")
# Thursday, May 31, 2018 Scrape : 9199 subs
thurs_subs9199 = pd.read_csv("9199subs.csv")
# Friday, Jun 1, 2018 Scrape
fri_subs9068 = pd.read_csv("9068subs.csv")

display( weds_subs2500.shape, weds_subs2500.head() )
display( thurs_subs9199.shape, thurs_subs9199.head() )
display( fri_subs9068.shape, fri_subs9068.head())

(2500, 2)

Unnamed: 0.1,Unnamed: 0,subs
0,0,8ncnpf
1,1,8nbvoa
2,2,8ncbjw
3,3,8nbtt0
4,4,8ncwal


(9199, 2)

Unnamed: 0.1,Unnamed: 0,submissions
0,0,8nijjr
1,1,8nids3
2,2,8ni3fe
3,3,8ni82h
4,4,8ni1me


(9068, 2)

Unnamed: 0.1,Unnamed: 0,submissions
0,0,8nt15b
1,1,8ntcdf
2,2,8nsur5
3,3,8nstox
4,4,8nsvmj


In [3]:
# rename subs column to submissions
weds_subs2500.rename(columns={"subs" : "submissions"}, inplace=True)
weds_subs2500.head()

Unnamed: 0.1,Unnamed: 0,submissions
0,0,8ncnpf
1,1,8nbvoa
2,2,8ncbjw
3,3,8nbtt0
4,4,8ncwal


In [4]:
subs = pd.concat([weds_subs2500, thurs_subs9199]) 
#                   fri_subs9068])

In [5]:
display(subs.shape, subs.head())

(11699, 2)

Unnamed: 0.1,Unnamed: 0,submissions
0,0,8ncnpf
1,1,8nbvoa
2,2,8ncbjw
3,3,8nbtt0
4,4,8ncwal


In [6]:
subs.drop(columns='Unnamed: 0', inplace=True)
subs.shape

(11699, 1)

In [64]:
# check for duplicate ids
dupes = subs[subs.duplicated(['submissions'], keep='first')]
display(dupes.shape, dupes.head())

(966, 1)

Unnamed: 0,submissions
282,8nbsou
379,8n9t6h
401,8n8pal
800,8nd36n
805,8n7xtl


In [69]:
subs.drop_duplicates(inplace=True)

In [70]:
subs.shape

(10733, 1)

In [7]:
# Load up DataFrames with the submission content
weds = pd.read_csv("2500dataframe.csv")
thurs = pd.read_csv("9199dataframe.csv")
df = pd.concat([weds,thurs])
display(df.shape, df.head())

(11699, 10)

Unnamed: 0.1,Unnamed: 0,comments,created_utc,crossposts,domain,gilded,score,subreddit,title,upvote_ratio
0,0,124,1527718000.0,0,i.redd.it,0,15854,BikiniBottomTwitter,About to make a splash? More like just about t...,0.93
1,1,2263,1527712000.0,2,thehill.com,0,24831,politics,FBI is reconstructing shredded documents obtai...,0.93
2,2,1614,1527715000.0,3,youtube.com,3,32645,videos,Gamer bet if the new Bethesda reveal was not F...,0.87
3,3,1047,1527711000.0,1,smh.com.au,0,26425,worldnews,"Police faked 258,000 breath tests in shocking ...",0.94
4,4,103,1527719000.0,0,i.redd.it,0,7350,PrequelMemes,"Always 2 there are, no more, no less",0.94


In [22]:
df.drop(columns='Unnamed: 0', inplace=True)

In [38]:
y = df['comments']
# y.tolist()

In [39]:
# y = pd.DataFrame(y)
y.shape

(11699,)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=.20,
                                                    random_state=19)

In [49]:
X_train.shape, y_train.shape

((9359, 9), (9359,))

In [51]:
X_test.shape, y_test.shape

((2340, 9), (2340,))

In [60]:
sns.distplot(df['comments'<10000], bins= 2)

TypeError: '<' not supported between instances of 'str' and 'int'

In [61]:
df['comments'].median()

15.0

In [72]:
zerocoms = df[df['comments'] == 0]
df_w_coms = df[df['comments'] > 0]
df_w_coms['comments'].median()

17.0

In [73]:
df_w_coms[df_w_coms['comments']<10]

Unnamed: 0,comments,created_utc,crossposts,domain,gilded,score,subreddit,title,upvote_ratio
145,8,1.527703e+09,0,i.redd.it,0,3024,NotKenM,Not KenM on spelling,0.96
152,6,1.527712e+09,0,i.redd.it,0,1699,woof_irl,woof_irl,0.98
161,8,1.527702e+09,0,i.redd.it,0,3108,bonehurtingjuice,oof ouch my window,0.97
391,7,1.527718e+09,0,i.redd.it,0,613,carporn,GT3RS on my way to work today!,0.97
392,9,1.527720e+09,0,i.redd.it,0,576,cursedimages,Cursed_Mythbuster,0.96
424,6,1.527708e+09,0,i.imgur.com,0,906,SexyTummies,A little walk in the park,0.97
524,1,1.527719e+09,0,i.imgur.com,0,428,Amateur,as perky as they come,0.97
525,5,1.527712e+09,0,i.redd.it,0,610,gentlemanboners,Emilia Clarke.,0.96
535,5,1.527719e+09,0,i.redd.it,0,419,teefies,Bottom teefies,0.99
560,7,1.527722e+09,0,gfycat.com,0,344,CuteModeSlutMode,"Cute Mode | Slut Mode, Fitness Hottie",0.97


In [None]:
# pipeline stuff
# use a randomforest
tfvect = TfidfVectorizer(stop_words='english')
rf = RandomForestClassifier()
logreg = LogisticRegression()

tube = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('rf', RandomForestClassifier()),
    ('lr', LogisticRegression())
    
])

params = {
    'tfidf__min_df': ,
    'tfidf_max_df': ,
    'rf__max_depth': ,
    'rf__min_samples_',
    'lr__penalty':['l1','l2'],
    'lr__'
}

gs = GridSearchCV(tube, param_grid=params)
gs.fit(X_train, y_train)
gs.score(X_test, y_test)