In [1]:
import sys
sys.path.append("../scripts")

In [2]:
## General
import pandas as pd
import numpy as np

## Data cleaning/setup
from clean_data import load_and_clean_data
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBoostClassifier

## Tuning
from sklearn.model_selection import GridSearchCV

## Evaluation metrics
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

## Other
pd.set_option('display.max_rows', 400)

In [3]:
def upsample(X, y):
    idx_class_1 = np.where(y == 1)[0]
    class_1 = X.iloc[idx_class_1]
    n_resample = len(X) - len(idx_class_1)
    X_upsampled = resample(class_1, n_samples = n_resample, random_state = 42)
    y_upsampled = y.loc[X_upsampled.index]
    X_up = pd.concat([X_upsampled, X.iloc[np.where(y == 0)[0]]])
    y_up = pd.concat([y_upsampled, y.iloc[np.where(y == 0)[0]]])
    
    return X_up, y_up

In [4]:
def run_model(X_train, y_train, X_val, y_val, model = DecisionTreeClassifier()):
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_val, y_pred)
    roc_auc = auc(fpr, tpr)
    
    return model, y_pred, roc_auc, fpr, tpr

## Data

#### Pull and split data

In [5]:
X_train, X_test, y_train, y_test = load_and_clean_data()

  X_train, X_test, y_train, y_test = load_and_clean_data()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df['insulin'] = df.DIABETES  # Duplicate diabetes column for one-hot encoding


In [6]:
selected_features = pd.read_csv('feature_selection_final_columns.csv', names=["feats"], skiprows=1)

In [7]:
selected_features = selected_features.feats.values

X_train = X_train[selected_features]

## Logistic Regression

In [19]:
X_train_up, y_train_up = upsample(X_train, y_train)

In [20]:
lr_model = LogisticRegression(solver='liblinear')
lr_params = {'penalty': ['none', 'l2', 'l1', 'elasticnet'],
            'C': [.01, 0.1, 1, 10],
            'max_iter': [100, 200, 300]}
lr_gs = GridSearchCV(lr_model, lr_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [None]:
lr_gs.fit(X_train_up, y_train_up)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [None]:
lr_params_best = lr_gs.best_params_
lr_score_best = lr_gs.best_score_
print(f'Best score: {lr_score_best}, Best params: {lr_params_best}')

## Decision Tree

In [15]:
dt_model = DecisionTreeClassifier()
dt_params = {'max_depth':[None, 2, 3, 5, 10], 
             'min_samples_split':[2, 10, 100, 1000, 10000, 50000], 
             'min_samples_leaf':[2, 10, 100, 1000, 10000, 50000]}
dt_gs = GridSearchCV(dt_model, dt_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [16]:
dt_gs.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [None, 2, 3, 5, 10],
                         'min_samples_leaf': [2, 10, 100, 1000, 10000, 50000],
                         'min_samples_split': [2, 10, 100, 1000, 10000, 50000]},
             scoring='roc_auc', verbose=3)

[CV 1/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2;, score=0.736 total time=  15.4s
[CV 1/5] END max_depth=None, min_samples_leaf=2, min_samples_split=100;, score=0.796 total time=  13.2s
[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=1000;, score=0.814 total time=  12.7s
[CV 5/5] END max_depth=None, min_samples_leaf=2, min_samples_split=50000;, score=0.776 total time=   6.0s
[CV 2/5] END max_depth=None, min_samples_leaf=10, min_samples_split=10;, score=0.763 total time=  12.7s
[CV 5/5] END max_depth=None, min_samples_leaf=10, min_samples_split=100;, score=0.784 total time=  11.3s
[CV 4/5] END max_depth=None, min_samples_leaf=10, min_samples_split=10000;, score=0.810 total time=   8.0s
[CV 2/5] END max_depth=None, min_samples_leaf=100, min_samples_split=2;, score=0.788 total time=   7.7s
[CV 5/5] END max_depth=None, min_samples_leaf=100, min_samples_split=10;, score=0.788 total time=   8.0s
[CV 3/5] END max_depth=None, min_samples_leaf=100, min_sam

In [18]:
dt_params_best = dt_gs.best_params_
dt_score_best = dt_gs.best_score_
print(f'Best score: {dt_score_best}, Best params: {dt_params_best}')

Best score: 0.8164791492254946, Best params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 1000}


## Random Forest

In [None]:
rf_model = RandomForest()
rf_params = {
    'max_depth':[None, 2, 3, 5, 10], 
    'min_samples_split':[2, 10, 100, 1000, 10000, 50000], 
    'min_samples_leaf':[2, 10, 100, 1000, 10000, 50000]
    'n_estimators':[10, 50, 100, 200]
    }
rf_gs = GridSearchCV(rf_model, rf_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [None]:
rf_gs.fit(X_train, y_train)

In [None]:
rf_params_best = rf_gs.best_params_
rf_score_best = rf_gs.best_score_
print(f'Best score: {rf_score_best}, Best params: {rf_params_best}')

## XGBoost

In [None]:
xgb_model = XGBClassifier()
xgb_params = {'max_depth': [2, 3, 5, 10], 
              'eta': [0.1, 0.2, 0.5, 0.75, 1], 
              'eval_metric':'auc'}
xgb_gs = GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', n_jobs=-1, verbose=3)

In [None]:
xgb_gs.fit(X_train, y_train)

In [None]:
xgb_params_best = xgb_gs.best_params_
xgb_score_best = xgb_gs.best_score_
print(f'Best score: {xgb_score_best}, Best params: {xgb_params_best}')