In [2]:
import pandas as pd
import logging
import numpy as np
import sys
import matplotlib.pyplot as plt
import time
import operator
import os
from sklearn.cross_validation import train_test_split
from random import shuffle
from sklearn.base import BaseEstimator, RegressorMixin
from scipy.optimize import minimize
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import StratifiedKFold
import tqdm
%matplotlib inline

#PLOT CONFUSION MATRIX
from sklearn.metrics import confusion_matrix
import itertools
import pickle
#matrix inverse
from numpy.linalg import inv
import jj_basic_fn as JJ
from hyperparams import Hyperparams as hp
from patient import patient
import prep
import plot_funcs
import modules
#default size of the graph
plt.rcParams['figure.figsize'] = (10.0, 8.0) 

%load_ext autoreload
%autoreload 2

pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', -1)

n_classifier = 7

col_rs = hp.col_rs
col_es = hp.col_es
col_le = hp.col_le

  from ._conv import register_converters as _register_converters


In [15]:
# Build the four patient objects used by the tuning cells below.
# NOTE(review): the meaning of the two positional arguments (-1, 124) is defined
# inside modules.build_patients and is not visible here -- confirm before editing.
# Earlier single-patient variant, kept for reference:
#p231 = modules.build_patients(231, 90)
p231, p222_1, p222_2, p229 = modules.build_patients(-1,124)
# Variant relying on build_patients' default arguments, followed by an optional
# step that saves every freshly built patient under ../patients/<id>.p:
# p231, p222_1, p222_2, p229 = modules.build_patients()
# pat_list = [p231, p222_1, p222_2, p229]
# for pat in pat_list:
#     JJ.save_object(pat, '../patients/' + pat.id +'.p')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [4]:
def parameter_tuning(pat, X_train, X_test, y_train, y_test, classifier, C_range_num = 30, if_save = 0,
                     nfold = 10, if_show = 1):
    """Tune a single classifier with a stratified, cross-validated grid search.

    Classifiers 3 and 4 have no search grid: they are fitted once on the
    training data and stored in ``pat.estimator[classifier]``. Every other
    classifier is searched with ``GridSearchCV`` (scored by ROC AUC), the best
    score and parameters are printed, and ``best_estimator_`` predicts
    ``X_test``.

    Parameters
    ----------
    pat : object
        Patient record. Its ``estimator``, ``result``, ``score`` and ``params``
        attributes are written by item assignment keyed on ``classifier``.
    X_train, y_train
        Training features and labels. ``X_train.shape[1]`` is read for
        classifier 2; ``len(y_train)`` sizes the placeholder handed to the
        splitter.
    X_test, y_test
        Held-out features and labels; ``X_test`` is predicted with the best
        estimator and both are forwarded to ``JJ.show_result``.
    classifier : int
        1 = logistic regression, 2 = SVM, 3 = Gaussian naive Bayes,
        4 = linear discriminant analysis, 5 = decision tree,
        6 = random forest, 7 = gradient boosting.
    C_range_num : int
        Number of random ``C`` candidates drawn for classifiers 1 and 2
        (``10 ** U(-2, 1)`` and ``10 ** U(-3, 1)`` respectively).
    if_save : int or bool
        When truthy, the CV results, best estimator, best score and best
        parameters are stored on ``pat``. Also forwarded to ``JJ.show_result``.
    nfold : int
        Number of stratified folds used by the grid search.
    if_show : int or bool
        When truthy, ``JJ.show_result`` is called with the test predictions.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the ids listed above.
    """
    # defs holds the fixed (untuned) settings handed to JJ.clf_list.
    defs = {'classifier': classifier}
    # tuned_params stays None for classifiers that are fitted without a search.
    tuned_params = None

    if classifier == 1:
        clf_name = 'Logistic Regression'
        defs['class_weight'] = 'balanced'
        defs['max_iter'] = 200
        C_range = 10 ** np.random.uniform(-2, 1, size = C_range_num)
        tuned_params = dict(penalty=['l1','l2'], C=C_range)
    elif classifier == 2:
        clf_name = 'SVM'
        defs['class_weight'] = 'balanced'
        num_features = X_train.shape[1]
        kernel_list = ['rbf']
        # Only i = 0 is used, i.e. a single gamma of 1 / num_features; widen the
        # range to search over powers of two of that value.
        gamma_list = [2**i*1/num_features for i in range(1)]
        #degree_list = [2,3,4,5]
        C_range = 10 ** np.random.uniform(-3, 1, size = C_range_num)
        tuned_params = dict(kernel=kernel_list,gamma = gamma_list, C=C_range)
    elif classifier == 3:
        clf_name = 'Gaussian Naive Bayes classifier'
        # Equal prior probability for the two classes.
        defs['priors'] = np.ones(2) * (1.0/2)
    elif classifier == 4:
        clf_name = 'Linear Discriminant Analysis'
        defs['solver'] = 'eigen'  # 'svd', 'lsqr', 'eigen'
        defs['shrinkage'] = 'auto'
        # Equal prior probability for the two classes.
        defs['priors'] = np.ones(2) * (1.0/2)
    elif classifier == 5:
        clf_name = 'decision tree'
        mss_list = [5,10,20,40,60]
        criterion_list = ['entropy']
        max_depth_list = [3,4,5,8,12,18]
        tuned_params = dict(criterion=criterion_list, min_samples_split=mss_list, max_depth = max_depth_list)
    elif classifier == 6:
        clf_name = 'random forest'
        defs['n_estimators'] = 600
        mss_list = [20,25,30,40]
        criterion_list = ['entropy']
        max_depth_list = [12,13,14,15,16]
        max_features_list = ['auto']
        tuned_params = dict(criterion=criterion_list, min_samples_split=mss_list, max_depth = max_depth_list, max_features = max_features_list)
    elif classifier == 7:
        clf_name = 'gradient boosting'
        defs['n_estimators'] = 2000
        max_depth_list = [1,2,3]
        subsample_list = [0.1,0.15,0.2, 0.3, 0.4]
        learning_rate_list = [0.02, 0.01,0.005]
        min_samples_leaf_list = [10,20,30]
        #params = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5,
        #  'learning_rate': 0.01, 'min_samples_leaf': 10, 'random_state': 3}
        tuned_params = dict(max_depth=max_depth_list, subsample = subsample_list,learning_rate = learning_rate_list, min_samples_leaf= min_samples_leaf_list)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # tuned_params further down.
        raise ValueError('Unknown classifier id {!r}; expected an integer from 1 to 7'.format(classifier))

    clf_try = JJ.clf_list(defs)

    if tuned_params is None:
        # No grid to search: fit once and keep the estimator.
        # NOTE(review): this is stored regardless of if_save, unlike the
        # grid-searched classifiers below -- kept as in the original behaviour.
        clf_try.fit(X_train, y_train)
        pat.estimator[classifier] = clf_try
        return

    # Fixed seed so the fold assignment is the same on every run; the number of
    # folds now follows the nfold argument instead of a hard-coded 10.
    skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=42)
    # The splitter only needs the labels, so a zero placeholder stands in for X.
    CV = skf.split(np.zeros(len(y_train)), y_train)

    clf_grid = GridSearchCV(clf_try,
                            param_grid=tuned_params,
                            cv=CV,
                            scoring = 'roc_auc',
                            verbose=1,
                            return_train_score = True)
    clf_grid.fit(X_train, y_train)
    print('Best score for validations set: {}'.format(clf_grid.best_score_))
    print('Best parameters: {}'.format(clf_grid.best_params_))

    clf_best = clf_grid.best_estimator_

    y_pred = clf_best.predict(X_test)
    df = pd.DataFrame(clf_grid.cv_results_)
    if if_show:
        JJ.show_result(y_pred, y_test, df, clf_name, if_save = if_save)

    if if_save:
        pat.result[classifier] = df
        pat.estimator[classifier] = clf_best
        pat.score[classifier] = clf_grid.best_score_
        pat.params[classifier] = clf_grid.best_params_



In [5]:
# X_train, X_test, y_train, y_test = modules.get_ml_data(p2, if_remove_icd = 1, if_remove_sleep=1, if_remove_le=0, le_class = 1)
# parameter_tuning(p229,X_train, X_test, y_train, y_test,1, C_range_num = 100, 
#                      nfold = 10, if_save = 0, if_show = 1)

In [27]:
def parameter_tuning_all(pat, C_range_num, if_scaler = 1, if_remove_icd = 1, if_remove_sleep=1, if_remove_le = 1, le_class = None, if_save = 1, if_show = 0, if_remove_delta = 1, save_suffix = '_trained_124_d.p'):
    """Prepare the train/test data for one patient and tune every classifier.

    The data-preparation flags are forwarded to ``modules.get_ml_data`` and
    recorded on ``pat.train_para``; the resulting split is stored on
    ``pat.X_train``, ``pat.X_test``, ``pat.y_train`` and ``pat.y_test``.
    ``parameter_tuning`` is then run for classifier ids
    ``1 .. hp.num_classifier``.

    Parameters
    ----------
    pat : object
        Patient record; mutated in place as described above.
    C_range_num : int
        Forwarded to ``parameter_tuning``.
    if_scaler, if_remove_icd, if_remove_sleep, if_remove_le, le_class, if_remove_delta
        Passed unchanged, by keyword, to ``modules.get_ml_data``.
    if_save : int or bool
        Forwarded to ``parameter_tuning``; when truthy the patient object is
        also saved with ``JJ.save_object`` once all classifiers are tuned.
    if_show : int or bool
        Forwarded to ``parameter_tuning``.
    save_suffix : str
        File-name suffix appended to ``hp.prepath_pat + pat.id`` when saving.
        The default reproduces the previously hard-coded name.

    Returns
    -------
    None
    """
    # Build the settings once so the values handed to get_ml_data and the
    # values recorded on the patient cannot drift apart.
    train_para = {
        'if_scaler': if_scaler,
        'if_remove_icd': if_remove_icd,
        'if_remove_sleep': if_remove_sleep,
        'if_remove_le': if_remove_le,
        'le_class': le_class,
        'if_remove_delta': if_remove_delta,
    }
    X_train, X_test, y_train, y_test = modules.get_ml_data(pat, **train_para)

    pat.train_para = train_para
    pat.X_train, pat.X_test, pat.y_train, pat.y_test = X_train, X_test, y_train, y_test

    for classifier_int in tqdm.trange(1,hp.num_classifier + 1):
        parameter_tuning(pat, X_train, X_test, y_train, y_test, C_range_num = C_range_num, classifier = classifier_int, if_save = if_save, if_show = if_show)

    if if_save:
        JJ.save_object(pat, hp.prepath_pat + pat.id + save_suffix)

In [28]:
# Tune every classifier for patient p229 and save the trained patient object.
parameter_tuning_all(
    p229,
    C_range_num = 100,
    if_scaler = 1,
    if_remove_icd = 1,
    if_remove_sleep = 0,
    if_remove_le = 1,
    le_class = None,
    if_save = 1,
    if_remove_delta = 0
)

TypeError: parameter_tuning_all() got an unexpected keyword argument 'if_remove_outliers'

In [22]:
# Tune every classifier for patient p222_1 and save the trained patient object.
# The former `if_remove_outliers = 0` keyword was dropped: parameter_tuning_all
# as defined in this notebook has no such parameter, so passing it raises
# "TypeError: ... got an unexpected keyword argument 'if_remove_outliers'".
parameter_tuning_all(
    p222_1,
    C_range_num = 100,
    if_scaler = hp.if_scaler,
    if_remove_icd = hp.if_remove_icd,
    if_remove_sleep = 1,
    if_remove_le = 1,
    le_class = None,
    if_save = 1,
    if_remove_delta = 0
)

  0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:   13.5s finished
 14%|█▍        | 1/7 [00:13<01:21, 13.56s/it]

Best score for validations set: 0.6607585081923943
Best parameters: {'penalty': 'l1', 'C': 0.1951653215411029}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   25.6s finished
 29%|██▊       | 2/7 [00:39<01:37, 19.60s/it]

Best score for validations set: 0.6868076921304174
Best parameters: {'C': 5.971226783134906, 'gamma': 0.03571428571428571, 'kernel': 'rbf'}
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    3.4s finished
 71%|███████▏  | 5/7 [00:42<00:17,  8.54s/it]

Best score for validations set: 0.6423380207557707
Best parameters: {'min_samples_split': 10, 'max_depth': 8, 'criterion': 'entropy'}
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  4.8min finished
 86%|████████▌ | 6/7 [05:34<00:55, 55.79s/it]

Best score for validations set: 0.6665981718648684
Best parameters: {'max_features': 'auto', 'min_samples_split': 40, 'max_depth': 12, 'criterion': 'entropy'}
Fitting 10 folds for each of 135 candidates, totalling 1350 fits


[Parallel(n_jobs=1)]: Done 1350 out of 1350 | elapsed: 25.1min finished
100%|██████████| 7/7 [30:40<00:00, 262.92s/it]

Best score for validations set: 0.6512095812857803
Best parameters: {'learning_rate': 0.005, 'subsample': 0.2, 'min_samples_leaf': 30, 'max_depth': 2}





In [23]:
# Tune every classifier for patient p222_2 and save the trained patient object.
parameter_tuning_all(
    p222_2,
    C_range_num = 100,
    if_scaler = hp.if_scaler,
    if_remove_icd = hp.if_remove_icd,
    if_remove_sleep = 1,
    if_remove_le = 1,
    le_class = None,
    if_save = 1,
    if_remove_delta = 0
)

  0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:   19.0s finished
 14%|█▍        | 1/7 [00:19<01:54, 19.02s/it]

Best score for validations set: 0.7203378992647184
Best parameters: {'penalty': 'l2', 'C': 0.04929480254992697}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   18.3s finished
 29%|██▊       | 2/7 [00:37<01:33, 18.68s/it]

Best score for validations set: 0.71920389517653
Best parameters: {'C': 2.127053661141449, 'gamma': 0.03571428571428571, 'kernel': 'rbf'}
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    2.8s finished
 71%|███████▏  | 5/7 [00:40<00:16,  8.03s/it]

Best score for validations set: 0.6727831315713977
Best parameters: {'min_samples_split': 60, 'max_depth': 8, 'criterion': 'entropy'}
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  4.1min finished
 86%|████████▌ | 6/7 [04:44<00:47, 47.46s/it]

Best score for validations set: 0.721667921602273
Best parameters: {'max_features': 'auto', 'min_samples_split': 20, 'max_depth': 15, 'criterion': 'entropy'}
Fitting 10 folds for each of 135 candidates, totalling 1350 fits


[Parallel(n_jobs=1)]: Done 1350 out of 1350 | elapsed: 22.9min finished
100%|██████████| 7/7 [27:43<00:00, 237.58s/it]

Best score for validations set: 0.7397763881248091
Best parameters: {'learning_rate': 0.005, 'subsample': 0.4, 'min_samples_leaf': 20, 'max_depth': 2}





In [23]:
# Run the ensemble model for patient p231 without saving (if_save = 0).
# NOTE(review): X_train, y_train, X_test and y_test are not assigned at notebook
# level by any visible cell (only inside functions and in a commented-out cell),
# so this line presumably relies on leftover kernel state and would fail after
# Restart & Run All -- confirm where these should come from.
JJ.ensemble_model(X_train, y_train, X_test, y_test, p231, if_save = 0)

0.8603351955307262


  if diff:


In [7]:
# Tune every classifier for patient p222_1, leaving the remaining data flags at
# parameter_tuning_all's defaults.
# NOTE(review): with if_save = 1 this saves to the same path (built from
# pat.id) as the other p222_1 tuning cell, so whichever runs last wins.
parameter_tuning_all(
    p222_1,
    C_range_num = 100,
    if_scaler = hp.if_scaler,
    if_remove_icd = hp.if_remove_icd,
    if_save = 1
)

  0%|          | 0/7 [00:00<?, ?it/s]

Total outliers removed: 19
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:   11.9s finished
 14%|█▍        | 1/7 [00:11<01:11, 11.96s/it]

Best score for validations set: 0.6607585081923943
Best parameters: {'C': 0.19517445578083695, 'penalty': 'l1'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   27.6s finished
 29%|██▊       | 2/7 [00:39<01:38, 19.80s/it]

Best score for validations set: 0.6867889496195189
Best parameters: {'C': 4.90241546749882, 'kernel': 'rbf', 'gamma': 0.03571428571428571}
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    3.6s finished
 71%|███████▏  | 5/7 [00:43<00:17,  8.65s/it]

Best score for validations set: 0.6311262313615518
Best parameters: {'min_samples_split': 10, 'criterion': 'entropy', 'max_depth': 8}
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  5.1min finished
 86%|████████▌ | 6/7 [05:50<00:58, 58.35s/it]

Best score for validations set: 0.6664817143862415
Best parameters: {'max_features': 'auto', 'min_samples_split': 30, 'criterion': 'entropy', 'max_depth': 16}
Fitting 10 folds for each of 135 candidates, totalling 1350 fits


[Parallel(n_jobs=1)]: Done 1350 out of 1350 | elapsed: 61.2min finished
100%|██████████| 7/7 [1:07:02<00:00, 574.67s/it]

Best score for validations set: 0.6507336969013348
Best parameters: {'learning_rate': 0.005, 'min_samples_leaf': 20, 'subsample': 0.2, 'max_depth': 1}



