In [1]:
import sys
sys.path.append("../scripts")

In [4]:
## General
import pandas as pd
import numpy as np

## Data cleaning/setup
from clean_data import load_and_clean_data
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Tuning
from sklearn.model_selection import GridSearchCV

## Evaluation metrics
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

## Other
pd.set_option('display.max_rows', 400)

In [3]:
def upsample(X, y):
    idx_class_1 = np.where(y == 1)[0]
    class_1 = X.iloc[idx_class_1]
    n_resample = len(X) - len(idx_class_1)
    X_upsampled = resample(class_1, n_samples = n_resample, random_state = 42)
    y_upsampled = y.loc[X_upsampled.index]
    X_up = pd.concat([X_upsampled, X.iloc[np.where(y == 0)[0]]])
    y_up = pd.concat([y_upsampled, y.iloc[np.where(y == 0)[0]]])
    
    return X_up, y_up

In [5]:
def run_model(X_train, y_train, X_val, y_val, model = DecisionTreeClassifier()):
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_val, y_pred)
    roc_auc = auc(fpr, tpr)
    
    return model, y_pred, roc_auc, fpr, tpr

## Data

#### Pull and split data

In [6]:
X_train, X_test, y_train, y_test = load_and_clean_data()

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [7]:
selected_features = pd.read_csv('feature_selection_final_columns.csv', names=["feats"], skiprows=1)

In [8]:
selected_features = selected_features.feats.values

X_train = X_train[selected_features]

## Logistic Regression

In [19]:
X_train_up, y_train_up = upsample(X_train, y_train)

In [20]:
lr_model = LogisticRegression(solver='liblinear')
lr_params = {'penalty': ['none', 'l2', 'l1', 'elasticnet'],
            'C': [.01, 0.1, 1, 10],
            'max_iter': [100, 200, 300]}
lr_gs = GridSearchCV(lr_model, lr_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [None]:
lr_gs.fit(X_train_up, y_train_up)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [None]:
lr_params_best = lr_gs.best_params_
lr_score_best = lr_gs.best_score_
print(f'Best score: {lr_score_best}, Best params: {lr_params_best}')

## Decision Tree

In [41]:
dt_model = DecisionTreeClassifier()
dt_params = {'max_depth':[None, 5, 10, 50, 100], 
             'min_samples_split':[2, 10, 100, 1000, 10000, 50000], 
             'min_samples_leaf':[2, 10, 100, 1000, 10000, 50000]}
dt_gs = GridSearchCV(dt_model, dt_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [42]:
dt_gs.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  8.4min finished


GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [None, 5, 10, 50, 100],
                         'min_samples_leaf': [2, 10, 100, 1000, 10000, 50000],
                         'min_samples_split': [2, 10, 100, 1000, 10000, 50000]},
             scoring='roc_auc', verbose=3)

In [43]:
dt_params_best = dt_gs.best_params_
dt_score_best = dt_gs.best_score_
print(f'Best score: {dt_score_best}, Best params: {dt_params_best}')

Best score: 0.817022376813583, Best params: {'max_depth': 50, 'min_samples_leaf': 2, 'min_samples_split': 1000}


In [44]:
dt_df = pd.DataFrame(dt_gs.cv_results_['params'])
dt_df['auc'] = dt_gs.cv_results_['mean_test_score']

In [45]:
dt_df

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,auc
0,,2,2,0.729827
1,,2,10,0.744838
2,,2,100,0.78606
3,,2,1000,0.816474
4,,2,10000,0.806289
5,,2,50000,0.777566
6,,10,2,0.76441
7,,10,10,0.764638
8,,10,100,0.787855
9,,10,1000,0.813815


In [46]:
dt_df.to_csv('dt_hyperparams.csv')

## Random Forest

In [35]:
rf_model = RandomForestClassifier()
rf_params = {
    'max_depth':[None, 5, 10, 50, 100], 
    'min_samples_split':[2, 10, 100, 1000, 10000, 50000], 
    'min_samples_leaf':[2, 10, 100, 1000, 10000, 50000],
    'n_estimators':[10, 50, 100, 200]
    }
rf_gs = GridSearchCV(rf_model, rf_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [36]:
rf_gs.fit(X_train, y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 42.2min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 55.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 60.6min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 71.0min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 82.5min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed: 104.2min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 153.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 200.6min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 214.0min finished


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [None, 5, 10, 50, 100],
                         'min_samples_leaf': [2, 10, 100, 1000, 10000, 50000],
                         'min_samples_split': [2, 10, 100, 1000, 10000, 50000],
                         'n_estimators': [10, 50, 100, 200]},
             scoring='roc_auc', verbose=3)

In [37]:
rf_params_best = rf_gs.best_params_
rf_score_best = rf_gs.best_score_
print(f'Best score: {rf_score_best}, Best params: {rf_params_best}')

Best score: 0.8304663288784431, Best params: {'max_depth': 50, 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 200}


In [38]:
rf_df = pd.DataFrame(rf_gs.cv_results_['params'])
rf_df['auc'] = rf_gs.cv_results_['mean_test_score']

In [39]:
rf_df

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,n_estimators,auc
0,,2,2,10,0.796901
1,,2,2,50,0.819508
2,,2,2,100,0.823513
3,,2,2,200,0.827595
4,,2,10,10,0.804377
...,...,...,...,...,...
715,100.0,50000,10000,200,0.555695
716,100.0,50000,50000,10,0.521904
717,100.0,50000,50000,50,0.554085
718,100.0,50000,50000,100,0.555374


In [40]:
rf_df.to_csv('rf_hyperparams.csv')

## XGBoost

In [17]:
xgb_model = XGBClassifier()
xgb_params = {'max_depth': [2, 6, 10, 20],
              'eta': [0.1, 0.2, 0.5, 0.75, 1], 
              'eval_metric':['auc']}
xgb_gs = GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [18]:
xgb_gs.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 63.0min finished


GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
            

In [19]:
xgb_params_best = xgb_gs.best_params_
xgb_score_best = xgb_gs.best_score_
print(f'Best score: {xgb_score_best}, Best params: {xgb_params_best}')

Best score: 0.8451799972636577, Best params: {'eta': 0.2, 'eval_metric': 'auc', 'max_depth': 6}


In [22]:
xgb_gs.cv_results_

{'mean_fit_time': array([ 59.59231124, 182.66312904, 306.18019161, 776.27782845,
         50.02427506, 155.41358767, 297.53343463, 704.09414616,
         49.97850485, 146.63416457, 263.82119246, 690.34791198,
         43.64460769, 148.39301758, 303.65110302, 697.15628376,
         52.16836004, 162.90340872, 284.6257483 , 446.82943039]),
 'std_fit_time': array([ 0.10767439,  1.83819943, 12.38322255,  5.30562937,  0.59625533,
         3.75554809,  8.57213632, 13.66887472,  0.80048545,  8.49763275,
         2.82595408,  5.14764714,  2.01982458,  2.00844831,  4.50952665,
         4.29625628,  4.25285025,  3.33420535, 11.85665871, 44.00706359]),
 'mean_score_time': array([0.28325348, 0.68361588, 0.62394032, 1.03745027, 0.35537386,
        0.47546816, 0.64677844, 0.88735757, 0.41875134, 0.41612444,
        0.44577174, 0.92092438, 0.2686316 , 0.40999713, 0.65550442,
        0.81555705, 0.42051792, 0.55980196, 0.56535234, 0.4974936 ]),
 'std_score_time': array([0.03146172, 0.05991953, 0.052383

In [30]:
xgb_df = pd.DataFrame(xgb_gs.cv_results_['params'])
xgb_df['auc'] = xgb_gs.cv_results_['mean_test_score']

In [31]:
xgb_df

Unnamed: 0,eta,eval_metric,max_depth,auc
0,0.1,auc,2,0.809783
1,0.1,auc,6,0.841515
2,0.1,auc,10,0.844897
3,0.1,auc,20,0.835604
4,0.2,auc,2,0.828277
5,0.2,auc,6,0.84518
6,0.2,auc,10,0.840671
7,0.2,auc,20,0.81439
8,0.5,auc,2,0.840481
9,0.5,auc,6,0.834285


In [32]:
xgb_df.to_csv('xgb_hyperparams.csv')