In [None]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
# from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import my_features as mf
import my_resample as ms
# from my_resample import div_count_pos_neg, undersample, oversample
from my_pickle import unpickle_it

In [None]:
# Restore the user and conversation frames through the project's unpickle helper.
user_df = unpickle_it('user_df')
convo_df = unpickle_it('convo_df')

# Sanity check on which data set was loaded: show the newest conversation timestamp.
# The raw value is scaled by 10**6 before conversion; pd.to_datetime reads plain
# integers as nanoseconds, so this presumably turns milliseconds into nanoseconds
# -- TODO confirm the unit of `timestamp`.
print("Make sure we're not useing ECT data")
print(pd.to_datetime(convo_df['timestamp'].max() * 10**6))

In [None]:
# Build the modelling frame from the conversation and user data with the project
# helper mf.feature_time (defined outside this notebook).
X = mf.feature_time(convo_df, user_df)
# List the returned columns; the following cell picks the target and drops
# non-feature columns by name.
print(X.columns)

In [None]:
# Encode the boolean `response` column as the 1/0 target
# (any value other than True/False would map to NaN).
y = X['response'].map({True: 1, False: 0})

# Remove the id columns, the timestamp and the target itself from the feature frame.
# NOTE: this cell overwrites X, so re-running it without rebuilding X first fails,
# because `response` has already been dropped.
X = X.drop(
    ['conv_id', 'response', 'first_uid', 'second_uid',
     'first_mid', 'second_mid', 'timestamp'],
    axis=1,
)

In [None]:
# Row count of the feature frame and of the target; the two should agree.
print(X.shape[0])
print(y.size)
# Sum of the 1/0 target, i.e. the number of positive examples.
print(y.sum())

In [None]:
# Show the features as a NumPy array. DataFrame.as_matrix() was removed in
# pandas 1.0; .values is the replacement and also works on older versions.
X.values

In [None]:
# Show the target as a NumPy array. Series.as_matrix() was removed in
# pandas 1.0; .values is the replacement and also works on older versions.
y.values

In [None]:
def do_grid_search(X, y, random_state=None):
    '''
    Grid-search random forest hyperparameters on an oversampled training split.

    The data is split with train_test_split, the training part is rebalanced with
    ms.oversample(X_train, y_train, .5), and GridSearchCV is fitted on that
    rebalanced training part only. The held-out part of the split is not used.

    ARGUMENTS
    X: features as 2d numpy array
    y: labels as 1d numpy array
    random_state: optional seed forwarded to train_test_split and to the
        RandomForestClassifier so a run can be repeated. The default (None)
        keeps the original unseeded behaviour. ms.oversample is not seeded
        by this argument.

    RETURNS
    (best_score, best_params): GridSearchCV's best_score_ and best_params_.

    TUNED PARAMETERS
    n_estimators: The number of trees in the forest
    max_features: The number of features to consider when looking for the best split
        If "sqrt", then max_features=sqrt(n_features).
        If "log2", then max_features=log2(n_features).
        If None, then max_features=n_features.
    max_depth: The maximum depth of the tree
    min_samples_leaf: The minimum number of samples required at a leaf node

    GridSearchCV is run with n_jobs=-1, i.e. using all available cores.
    '''

    # Only the training part of the split is needed here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=random_state)

    # Rebalance the classes in the training data.
    # NOTE(review): ms.oversample is project code not visible here. If it duplicates
    # rows, copies of one row can land in both the fitting and the validation folds
    # of the cross-validation below, which would inflate best_score_ -- confirm.
    X_train, y_train = ms.oversample(X_train, y_train, .5)

    # Base estimator; the grid below overrides the listed hyperparameters.
    model = RandomForestClassifier(random_state=random_state)

    # Hyperparameter values to try (every combination is evaluated).
    param_grid = {'max_features': ["sqrt", "log2", None],
                  'n_estimators': [100],
                  'max_depth': [10, 50, 100, 200],
                  'min_samples_leaf': [3, 5, 10]
                  }

    # Cross-validated search over the grid, parallelised over all cores.
    gs_cv = GridSearchCV(model, param_grid, n_jobs=-1).fit(X_train, y_train)

    # Best mean cross-validation score and the parameter combination that produced it.
    return gs_cv.best_score_, gs_cv.best_params_


In [None]:
%%time
# do_grid_search returns a (best_score, best_params) tuple; `temp2` holds that tuple.
# .values replaces DataFrame/Series.as_matrix(), which was removed in pandas 1.0.
temp2 = do_grid_search(X.values, y.values)

In [None]:
# NOTE(review): `temp` is never assigned in the visible cells, so on a fresh kernel
# this raises NameError. Presumably a leftover from an earlier grid-search run --
# confirm, then define it or remove this cell.
temp

In [None]:
# Display the (best_score, best_params) tuple returned by do_grid_search.
temp2

In [None]:
# Random forest with fixed hyperparameters (all values appear in the do_grid_search
# grid -- presumably the best combination it reported; confirm).
# The next cell builds an identical model again before fitting.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    max_features=None,
    min_samples_leaf=3,
    n_jobs=-1,
)

In [None]:
# Hold out a test split with a fixed seed. .values replaces as_matrix(),
# which was removed in pandas 1.0.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=17)

# Rebalance the training split only; the test split keeps its original class mix.
X_train, y_train = ms.oversample(X_train, y_train, .5)

print("messages sent: {}".format(len(y_train)))
print("responses:     {}\n".format(y_train.sum()))

# The forest is seeded so the metrics below are repeatable for a given training set
# (ms.oversample is project code and may still add randomness of its own).
model = RandomForestClassifier(max_depth=100, max_features=None, min_samples_leaf=3,
                               n_estimators=100, n_jobs=-1, random_state=17)
model.fit(X_train, y_train)

# Predict once and reuse the result for the preview and for every metric.
y_pred = model.predict(X_test)
print("Prediction:   {}...".format(y_pred[:20]))
print("Actual:       {}...".format(y_test[:20]))
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Score of model: {}".format(model.score(X_test, y_test)))
# Baseline: accuracy of always predicting 0, i.e. the share of zeros in y_test.
print("Score to beat: {}".format(1 - y_test.mean()))

In [None]:

def do_regular_decision_tree(X, y, best_max_depth, random_state=None):
    '''
    Fit a single decision tree limited to best_max_depth and score it on a held-out split.

    ARGUMENTS
    X: features, in any form accepted by train_test_split
    y: labels, same length as X
    best_max_depth: value passed as max_depth to DecisionTreeClassifier
    random_state: optional seed forwarded to train_test_split and to the tree so a
        run can be repeated. The default (None) keeps the original unseeded behaviour.

    RETURNS
    (precision, recall, score): precision_score and recall_score of the predictions
    on the held-out split, and clf.score on that same split.
    '''
    # Split it up into our training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Initialize the tree with the requested depth limit and fit it on the training split
    clf = DecisionTreeClassifier(max_depth=best_max_depth, random_state=random_state)
    clf.fit(X_train, y_train)

    # Predict on the held-out split
    y_pred = clf.predict(X_test)

    # Precision, recall and the classifier's own score, all on the held-out split
    return (precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            clf.score(X_test, y_test))


In [None]:

# def do_regular_decision_tree(X, y, best_max_depth):
#     # Split it up into our training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(X,y)

#     # Initialize our decision tree algo, set the max_depth to best_max_depth
#     clf = DecisionTreeClassifier(max_depth=best_max_depth)
    
#     # Fit our tree
#     clf.fit(X_train, y_train)

#     # Make predictions
#     y_pred = clf.predict(X_test)

#     # Find the precision, recall, and score
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     score = clf.score(X_test, y_test)
#     return precision, recall, score



# best_score, best_grid_params =  do_grid_search(X,y)

# best_max_depth = best_grid_params['max_depth']

# print(do_regular_decision_tree(X, y, best_max_depth))