In [1]:
import pandas as pd
import numpy as np
from random import *
# import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score

import my_pickle as mp
import my_features as mf
import my_resample as ms
# from my_resample import div_count_pos_neg, undersample, oversample
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the pickled table named 'X' through the project's pickle helper.
X = mp.unpickle_it('X')

# Sanity check: show the most recent timestamp in the data set so the
# snapshot being analysed can be verified by eye.
print("Make sure we're not useing ECT data")
# pd.to_datetime treats a bare number as nanoseconds by default, so the
# column is scaled by 1e6 first -- presumably it holds epoch milliseconds
# (TODO confirm against the code that builds the pickle).
latest_timestamp = X.timestamp.max()
print(pd.to_datetime(latest_timestamp * 1000000))

Make sure we're not useing ECT data
2017-10-02 01:35:58.644000


In [3]:
# Target vector: the boolean `response` column encoded as 1 (True) / 0 (False).
y = X.response.map({True: 1, False: 0})

# Columns removed before modelling: the target itself plus what look like
# conversation / user / message ids and the raw timestamp (judging by name).
non_feature_columns = [
    'conv_id', 'response', 'first_uid', 'second_uid',
    'first_mid', 'second_mid', 'timestamp',
]
# NOTE: this rebinds X, so re-running the cell without reloading X will fail
# because the dropped columns no longer exist.
X = X.drop(non_feature_columns, axis=1)

In [4]:
def do_grid_search(X, y, random_state=None):
    '''
    Grid-search RandomForestClassifier hyper-parameters on an oversampled
    training split.

    Steps:
      1. Split X / y with train_test_split (default proportions).
      2. Pass the training portion through ms.oversample(X_train, y_train, .5).
      3. Fit a GridSearchCV (default cv and scoring, n_jobs=-1) over the
         parameter grid below on the resampled training data.
    The held-out portion of the split is not used by this function.

    PARAMETERS
    X: 2d numpy array of features
    y: 1d numpy array of labels
    random_state: optional seed forwarded to train_test_split and to
        RandomForestClassifier so a search can be repeated.  The default
        (None) leaves both unseeded, matching the previous behaviour.
        NOTE(review): ms.oversample is called without a seed here; if it
        samples randomly, results may still vary between runs -- confirm.

    GRID SEARCHED
    max_features: "sqrt", "log2" or None (None means all features)
    n_estimators: fixed at 100 trees
    max_depth: 10, 50, 100 or 200
    min_samples_leaf: 3, 5 or 10

    RETURNS
    (best_score, best_params): GridSearchCV's best_score_ and best_params_.

    NOTE(review): oversampling happens before cross-validation.  If
    ms.oversample duplicates rows, copies of the same row can fall into both
    the fitting and validation folds, which would make best_score_
    optimistic -- confirm how ms.oversample works.
    '''

    # Only the training portion is needed; the test portion is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=random_state)

    # Resample the training data
    X_train, y_train = ms.oversample(X_train, y_train, .5)

    # Initialize the model to tune
    model = RandomForestClassifier(random_state=random_state)

    # Hyper-parameters being tuned
    param_grid = {'max_features' : ["sqrt","log2",None],
                  'n_estimators' : [100],
                  'max_depth': [10, 50, 100, 200],
                  'min_samples_leaf': [3,5,10]
                  }

    # n_jobs=-1 lets the search use every available core
    gs_cv = GridSearchCV(model, param_grid, n_jobs=-1).fit(X_train, y_train)

    # Return the best score and the parameters that produced it
    return gs_cv.best_score_, gs_cv.best_params_


In [5]:
# %%time
# # best_score, best_grid_params =  do_grid_search(X.as_matrix(),y.as_matrix())
# temp2 = do_grid_search(X.as_matrix(),y.as_matrix())

In [6]:
# Hold out a test split; the split itself is seeded so it is repeatable.
# `.values` replaces the deprecated `.as_matrix()` (removed in pandas 1.0)
# and yields the same NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=17)

# Resample the training split only; the test split is left as drawn.
X_train, y_train = ms.oversample(X_train, y_train, .5)

# Fit the forest with the chosen hyper-parameters and predict the test split.
model = RandomForestClassifier(max_depth=100, max_features=None, min_samples_leaf=3, n_estimators=100, n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute the headline metrics once; they are reused in the comparison table.
model_recall = recall_score(y_test, y_pred)
model_precision = precision_score(y_test, y_pred)

print("\nMETRICS")
print("Model recall: {}".format(model_recall))
print("Model precision: {}".format(model_precision))
print("Model accuracy: {}".format(model.score(X_test, y_test)))

# Naive baselines to compare against: constant predictions and random
# guesses (uniform, and 90% zeros / 10% ones).  These draws are unseeded,
# so their scores change from run to run.
pred_all_0 = [0]*len(y_test)
pred_all_1 = [1]*len(y_test)
pred_50_50 = np.random.choice([0,1], size=len(y_test))
pred_90_10 = np.random.choice([0,1], size=len(y_test), p=[.9,.1])

print("\nCONFUSION MATRIX")
print(confusion_matrix(y_test, y_pred))
print("\nkey:")
print(" TN   FP ")
print(" FN   TP ")

# Rank features from most to least important.  np.argsort is ascending, so
# the loop walks the index array from the end; each importance is looked up
# through that index instead of re-sorting the array on every iteration.
importances = model.feature_importances_
feature_importances = np.argsort(importances)
top_n = len(X.columns)
print("\nFEATURE RANKINGS")
for n in range(top_n):
    column_index = feature_importances[-n-1]
    print(n+1, '\t', X.columns[column_index], '\t', importances[column_index])

# NOTE(review): the heading below says "ACCURACY" but the columns printed are
# recall and precision.  The text is left unchanged so it keeps matching the
# recorded output.
print("\nRECALL AND ACCURACY FOR DIFFERNET MODELS")
print("recall     \t precision   \tmodel")
print(model_recall, '\t', model_precision, "my model")
# Precision is undefined when nothing is predicted positive; scikit-learn
# reports 0.0 and emits a warning for the all-zero baseline.
print(recall_score(y_test, pred_all_0),'\t','\t', precision_score(y_test, pred_all_0), "\t\tpredict all zero")
print(recall_score(y_test, pred_all_1),'\t','\t', precision_score(y_test, pred_all_1), "predict all one")
print(recall_score(y_test, pred_50_50),'\t', precision_score(y_test, pred_50_50), "predict 50-50")
print(recall_score(y_test, pred_90_10), precision_score(y_test, pred_90_10), "predict 90-10")


METRICS
Model recall: 0.5115511551155115
Model precision: 0.4155495978552279
Model accuracy: 0.8661791590493602

CONFUSION MATRIX
[[2214  218]
 [ 148  155]]

key:
 TN   FP 
 FN   TP 

FEATURE RANKINGS
1 	 sender_attractiveness 	 0.58812702093
2 	 age_dif 	 0.105712521564
3 	 rent_overlap 	 0.0700951376247
4 	 receiver_selectivity 	 0.0544935178398
5 	 roommate_num_sim 	 0.0423270438504
6 	 same_clean 	 0.0238367816407
7 	 same_night 	 0.0231371204318
8 	 same_relate 	 0.0223716341465
9 	 same_smoking 	 0.0216630172617
10 	 same_gender 	 0.0198480505567
11 	 same_student 	 0.0145379313071
12 	 same_term 	 0.0138502228464

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.511551155116 	 0.415549597855 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.110786106033 predict all one
0.475247524752 	 0.105340160936 predict 50-50
0.112211221122 0.126865671642 predict 90-10


  'precision', 'predicted', average, warn_for)
