Set up library imports

In [46]:
import requests, re, json, praw, pprint, time
import pandas as pd
import numpy as np
from psaw import PushshiftAPI    # PSAW recommended by following PRAW errors
import datetime as dt            # PSAW docs
from IPython.display import display
from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from jupyterthemes import jtplot

In [None]:
# Initializing reddit API

reddit = praw.Reddit(client_id='CmKUgfSklwH6Gw',
                     client_secret='WprZwImA7V8TcggsN0GfpZOfl2g',
                     user_agent='ClassProjectBot-PRAW/PSAW',
                     password='dsBaLpQSua2ctCXU2XyupJ',
                     username='refused_dev')

# Set PushShiftAPI
api = PushshiftAPI()

# Set a variable equal to the target subreddit
r_all = reddit.subreddit('all')


# Grab submissions from the subreddit
subs = []
for sub in r_all.hot(limit=None):
    sub_dict = {}
    sub_dict['submissions'] = sub
    subs.append(sub_dict)
subscrape = pd.DataFrame(subs)
subscrape.shape, subscrape.head()

# Save submission pull to csv
subscrape.to_csv("subs.csv")

# Grab features from the submission IDs
sublist = []
for c in subscrape['submissions']:
    subdict = {}
    subdict['title'] = c.title
    subdict['comments'] = c.num_comments
    subdict['crossposts'] = c.num_crossposts
    subdict['score'] = c.score
    subdict['subreddit'] = c.subreddit
    subdict['domain'] = c.domain
    subdict['gilded'] = c.gilded
    subdict['upvote_ratio'] = c.upvote_ratio
    subdict['created'] = c.created
    fri_sublist.append(subdict)

df = pd.DataFrame(sublist)

def get_date(created):
    return dt.datetime.fromtimestamp(created)
stamp = data['created'].apply(get_date)
data = data.assign(timestamp = stamp)

Load up previous scrapes

In [4]:
df = pd.read_csv('df_w_feats.csv')
display(df.shape, df.head().sort_values('score', ascending=False))

(22767, 10)

Unnamed: 0.1,Unnamed: 0,comments,created,crossposts,domain,gilded,score,subreddit,title,upvote_ratio
2,2,3232,1527744000.0,4,youtube.com,5,83372,videos,Gamer bet if the new Bethesda reveal was not F...,0.78
3,3,2439,1527740000.0,6,smh.com.au,0,61556,worldnews,"Police faked 258,000 breath tests in shocking ...",0.89
0,0,268,1527746000.0,1,i.redd.it,0,54696,BikiniBottomTwitter,About to make a splash? More like just about t...,0.82
1,1,3771,1527740000.0,2,thehill.com,0,39344,politics,FBI is reconstructing shredded documents obtai...,0.89
4,4,275,1527748000.0,0,i.redd.it,0,31701,PrequelMemes,"Always 2 there are, no more, no less",0.85


In [None]:
df.drop(columns='Unnamed: 0', inplace=True)

df.info()

# convert unix timecode to human readable

def get_date(created):
    return dt.datetime.fromtimestamp(created)
stamp = df['created'].apply(get_date)
df = df.assign(timestamp = stamp)

df.shape

df.head()

df['datetime'] = df.timestamp.values.astype('datetime64[D]')

df.timestamp.

y = df['comments']
# y.tolist()

# y = pd.DataFrame(y)
y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=.20,
                                                    random_state=19)

In [137]:
def xtrainer(dft, y):
    # calculate percentiles

    p25, p50, p75 = np.percentile(dft.comments, 25), np.percentile(dft.comments, 50), np.percentile(dft.comments, 75)

    print(p25, p50, p75)



    dft['p25'] = dft['comments'] <= p25
    dft['p50'] = dft['comments'] <= p50
    dft['p75'] = dft['comments'] <= p75

    # posts with over the median amount of comments are a success
    dft['success'] = dft['p50'].map({False : 1, True : 0})
    dft['over25p'] = dft['p25'].map({False : 1, True : 0})
    dft['over50p'] = dft['p50'].map({False : 1, True : 0})
    dft['over75p'] = dft['p75'].map({False : 1, True : 0})

    y = dft['success']

In [144]:
# dft.drop(columns='status', inplace=True)

X_train = dft.subreddit

display(X_train.shape, y_train.shape, X_train.head(), y_train.head())

(18213,)

(18213,)

19058           FortNiteBR
15420    ShitPostCrusaders
22607    PropagandaPosters
20418         Ice_Poseidon
10071      rupaulsdragrace
Name: subreddit, dtype: object

19058    1
15420    1
22607    1
20418    0
10071    1
Name: success, dtype: int64

In [145]:
# Created a RandomForest model to predict High/Low number of comments using only
# the subreddit as a feature
cvec = CountVectorizer()

In [146]:
xtrain_counts = cvec.fit_transform(X_train)
print( xtrain_counts.shape, len(cvec.get_feature_names()))

(18213, 3288) 3288


In [148]:
# turn the cvec'd X_train to a dataframe
X_train = pd.DataFrame(xtrain_counts.todense(), columns=cvec.get_feature_names())
X_train.head()

# sum along the columns, and sort to give the most common subs
X_train.sum(axis=0).sort_values(ascending=False).head()

Unnamed: 0,1200isjerky,13or30,13reasonswhy,18_19,195,2007scape,2booty,2busty2hide,2healthbars,2mad4madlads,...,youtube,youtubehaiku,yugioh,yuri,yurop,yuzumiko,zelda,zerowaste,zettairyouiki,zoomies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [154]:
y_train.head()

xtrainer(X_test, y_test)

y_test = X_test['success']

display(X_test.head(), y_test.head())

19058    1
15420    1
22607    1
20418    0
10071    1
Name: success, dtype: int64

In [158]:
xtest_counts = cvec.transform(X_test)
X_test = pd.DataFrame(xtest_counts.todense(), columns=cvec.get_feature_names())

In [159]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

ValueError: Found input variables with inconsistent numbers of samples: [4554, 18]

In [None]:
# pipeline stuff
# use a randomforest
cvec = CountVectorizer()
tfvect = TfidfVectorizer(stop_words='english')
rf = RandomForestClassifier()
logreg = LogisticRegression()

tube = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('rf', RandomForestClassifier()),
    ('lr', LogisticRegression())
    sd
])

params = {
    'tfidf__min_df': ,
    'tfidf_max_df': ,
    'rf__max_depth': ,
    'rf__min_samples_',
    'lr__penalty':['l1','l2'],
    'lr__'
}

gs = GridSearchCV(tube, param_grid=params)
gs.fit(X_train, y_train)
gs.score(X_test, y_test)