In [175]:
import pandas as pd
import numpy as np

In [176]:
Xi = pd.read_csv('Xi.csv', index_col='Unnamed: 0')

In [177]:
Xi.head();

#### We're going to do some quick cleaning

In [178]:
#Creating a Yi that will be put into my K-Means method
Yi = np.sqrt(Xi[[
    'comments'
]])

X = Xi.drop(labels=['score','gilded','ups','downs'], axis=1)
#creating a new X dataframe

In [179]:
X['distinguished'].fillna(value=0, inplace=True)
#correcting the NaNs for 0's to signify non-moderator

X['distinguished'].replace('moderator', 1, inplace=True)
#filling in 1's for moderator status

In [180]:
from sklearn.preprocessing import LabelBinarizer
import scipy

bools = ['subreddit_type','over_18','distinguished',
         'stickied','locked','is_video']

lb = LabelBinarizer()

for column in X[bools]:
    print(column)
    try:
        lb.fit(X[column])
        X[column] = lb.transform(X[column])
    except ValueError:
        print(column+' needs cleaning!')

subreddit_type
over_18
distinguished
stickied
locked
is_video


In [181]:
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(X, Yi,
                                               random_state=1994,
                                                test_size=0.3,
                                                shuffle=True)

In [182]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 2, #2 clusters for binary output
               random_state = 1994,
               )
target_train = kmeans.fit_predict(trainY)
target_test = kmeans.predict(testY)
Xi['target'] = kmeans.predict(Yi)

In [183]:
list(zip(kmeans.cluster_centers_.T, Yi.columns))

[(array([ 4.30142428, 22.19539655]), 'comments')]

In [184]:
from sklearn.ensemble import RandomForestClassifier

In [185]:
trainX.dtypes

comments                   int64
title                     object
subreddit                 object
subreddit_subscribers      int64
subreddit_type             int64
time_since_posted        float64
over_18                    int64
distinguished              int64
stickied                   int64
locked                     int64
num_crossposts             int64
is_video                   int64
dtype: object

In [186]:
trainX_numeric= trainX.drop(labels=['title','subreddit'], axis=1)
testX_numeric = testX.drop(labels=['title','subreddit'], axis=1)

In [187]:
rf = RandomForestClassifier()
rf.fit(trainX_numeric, target_train)
rf.score(testX_numeric, target_test)

1.0

In [188]:
rf.feature_importances_

array([9.40944848e-01, 1.85557843e-02, 1.47036547e-04, 2.64383411e-03,
       2.74844565e-04, 5.71667365e-04, 6.29841552e-04, 2.21877795e-03,
       3.39139269e-02, 9.94382953e-05])

In [189]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = trainX_numeric.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
#script courtesy of Sam because I was too lazy to rewrite my exp_coeff

In [190]:
feature_importances

Unnamed: 0,importance
comments,0.940945
num_crossposts,0.033914
subreddit_subscribers,0.018556
time_since_posted,0.002644
locked,0.002219
stickied,0.00063
distinguished,0.000572
over_18,0.000275
subreddit_type,0.000147
is_video,9.9e-05


#### Because comments is highest, I'm going to try this again without it.

In [191]:
X_2 = Xi.drop(labels=['score','gilded','ups','downs', 'comments','target'], axis=1)

In [192]:
X_2['distinguished'].fillna(value=0, inplace=True)
#correcting the NaNs for 0's to signify non-moderator

X_2['distinguished'].replace('moderator', 1, inplace=True)
#filling in 1's for moderator status

lb = LabelBinarizer()

for column in X_2[bools]:
    print(column)
    try:
        lb.fit(X_2[column])
        X_2[column] = lb.transform(X_2[column])
    except ValueError:
        print(column+' needs cleaning!')

subreddit_type
over_18
distinguished
stickied
locked
is_video


In [193]:
trainX_2, testX_2, trainY_2, testY_2 = train_test_split(X_2, Yi,
                                               random_state=1994,
                                                test_size=0.3,
                                                shuffle=True)

In [194]:
kmeans = KMeans(n_clusters = 2, #2 clusters for binary output
               random_state = 1994,
               )
target_train_2 = kmeans.fit_predict(trainY)
target_test_2 = kmeans.predict(testY)
Xi['target'] = kmeans.predict(Yi)

In [195]:
list(zip(kmeans.cluster_centers_.T, Yi.columns))

[(array([ 4.30142428, 22.19539655]), 'comments')]

In [196]:
trainX_2_numeric= trainX_2.drop(labels=['title','subreddit'], axis=1)
testX_2_numeric = testX_2.drop(labels=['title','subreddit'], axis=1)

In [197]:
rf_2 = RandomForestClassifier()
rf_2.fit(trainX_2_numeric, target_train_2)
rf_2.score(testX_2_numeric, target_test_2)

0.9408065112837588

In [198]:
feature_importances_2 = pd.DataFrame(rf_2.feature_importances_,
                                   index = trainX_2_numeric.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
#script courtesy of Sam because I was too lazy to rewrite my exp_coeff

feature_importances_2

Unnamed: 0,importance
subreddit_subscribers,0.800607
num_crossposts,0.163945
time_since_posted,0.013053
is_video,0.007306
locked,0.004722
over_18,0.004028
stickied,0.003014
distinguished,0.002144
subreddit_type,0.001181


#### My score wasn't great, so I am going to try to run a `GridSearch` on it and see if we can improve

In [199]:
from sklearn.model_selection import GridSearchCV

In [200]:
rf_params = {
    'n_estimators':[40,50,60],
    'max_features':[7,8,9],
    'max_depth':[4,5,6]
}

gs = GridSearchCV(rf_2, param_grid = rf_params)
gs.fit(trainX_2_numeric, target_train_2)
gs.score(testX_2_numeric, target_test_2)
print(gs.best_score_)
print(gs.best_params_)

0.941801458928005
{'max_depth': 6, 'max_features': 8, 'n_estimators': 50}


### Let's look at our importances with a model optimized by `GridSearch`
#### Also note, I ran a few `GridSearch` and changed the params around

In [201]:
rf_3 = RandomForestClassifier(max_features=9,
                             n_estimators=50,
                             max_depth=6)
rf_3.fit(trainX_2_numeric, target_train_2)
rf_3.score(testX_2_numeric, target_test_2)

0.9411764705882353

### I haven't worked much with trees so I am sure this is not entirely solid in practice, but I'm going to accept this 0.001 improvement on my score and move on.

In [202]:
feature_importances_3 = pd.DataFrame(rf_2.feature_importances_,
                                   index = trainX_2_numeric.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
#script courtesy of Sam because I was too lazy to rewrite my exp_coeff

feature_importances_3

Unnamed: 0,importance
subreddit_subscribers,0.800607
num_crossposts,0.163945
time_since_posted,0.013053
is_video,0.007306
locked,0.004722
over_18,0.004028
stickied,0.003014
distinguished,0.002144
subreddit_type,0.001181


### After this, for fun, I explored a multi-class output. It wasn't a fruitful endeavor so I deleted that portion of the notebook. I know I could have, and should have, developed `Pipelines` considering how many times I ran these models, but I didn't. I was more focused on creating a coherent set of notebooks for now. The `pipeline_development` notebook is empty.
#### That concludes my project 3. Thanks for grading!