In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
rcParams['figure.figsize'] = 11.7,8.27 # figure size in inches

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500) 
pd.set_option('display.max_columns', 30) 

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%config Completer.use_jedi = False

# Note
* build model based on different months_bin

In [2]:
df_demo = pd.read_csv('T_demo.csv')
df_stage = pd.read_csv('T_stage.csv')
df_agg = pd.read_csv('df_aggregated.csv')

In [3]:
# Change the unknown in df_demo race to the mode
df_demo['race'].value_counts()
df_demo.loc[df_demo['race'] == 'Unknown','race'] = 'White'
df_demo['race'].value_counts()

White       226
Unknown      26
Black        24
Asian        17
Hispanic      7
Name: race, dtype: int64

White       252
Black        24
Asian        17
Hispanic      7
Name: race, dtype: int64

In [4]:
df_stage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id              300 non-null    int64
 1   Stage_Progress  300 non-null    bool 
dtypes: bool(1), int64(1)
memory usage: 2.8 KB


In [22]:
# Change state to 0, 1
df_stage['Stage_Progress'].value_counts()
df_stage['Stage_Progress'] = np.where(df_stage['Stage_Progress'] == True, 1, 0)
df_stage['Stage_Progress'].value_counts()

False    200
True     100
Name: Stage_Progress, dtype: int64

0    200
1    100
Name: Stage_Progress, dtype: int64

In [23]:
df_agg.head()

Unnamed: 0,id,months_bin,sbp,dbp,creatinine,glucose,ldl,hgb,atenolol,atorvastatin,bisoprolol,canagliflozin,carvedilol,dapagliflozin,irbesartan,labetalol,losartan,lovastatin,metformin,metoprolol,nebivolol,olmesartan,pitavastatin,pravastatin,propranolol,rosuvastatin,simvastatin,telmisartan,valsartan
0,0,1,133.93,89.65,1.22,6.72,136.44,13.45,0.0,910.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,91000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,125.08,65.97,1.44,7.01,,12.84,0.0,910.0,0.0,0.0,0.0,0.0,0.0,0.0,27100.0,0.0,151000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,3,133.625,84.9,1.23,6.89,157.9,13.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9100.0,0.0,242000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,4,154.28,84.29,1.155,5.7,,13.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9100.0,0.0,182000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,5,,,,,,12.995,,,,,,,,,,,,,,,,,,,,,


# 6 months data

In [24]:
df_bin1 = df_agg[df_agg['months_bin']==1]
# Check if all 300 subjects are included
len(df_bin1['id'].unique())

300

In [25]:
# Check missing values
df_bin1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 1828
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             300 non-null    int64  
 1   months_bin     300 non-null    int64  
 2   sbp            300 non-null    float64
 3   dbp            300 non-null    float64
 4   creatinine     300 non-null    float64
 5   glucose        300 non-null    float64
 6   ldl            300 non-null    float64
 7   hgb            300 non-null    float64
 8   atenolol       184 non-null    float64
 9   atorvastatin   184 non-null    float64
 10  bisoprolol     184 non-null    float64
 11  canagliflozin  184 non-null    float64
 12  carvedilol     184 non-null    float64
 13  dapagliflozin  184 non-null    float64
 14  irbesartan     184 non-null    float64
 15  labetalol      184 non-null    float64
 16  losartan       184 non-null    float64
 17  lovastatin     184 non-null    float64
 18  metformin

In [26]:
# ALl missing values are at the med, so can just fill with 0
df_bin1 = df_bin1.fillna(0)

In [27]:
# Attach demographic and target
df_bin1 = df_bin1.merge(df_demo, on='id', how='outer')
df_bin1 = df_bin1.merge(df_stage, on='id', how='outer')

In [28]:
# Drop useless columns
df_bin1.drop(['id', 'months_bin'], axis=1, inplace=True)

In [29]:
df_bin1.columns

Index(['sbp', 'dbp', 'creatinine', 'glucose', 'ldl', 'hgb', 'atenolol',
       'atorvastatin', 'bisoprolol', 'canagliflozin', 'carvedilol',
       'dapagliflozin', 'irbesartan', 'labetalol', 'losartan', 'lovastatin',
       'metformin', 'metoprolol', 'nebivolol', 'olmesartan', 'pitavastatin',
       'pravastatin', 'propranolol', 'rosuvastatin', 'simvastatin',
       'telmisartan', 'valsartan', 'race', 'gender', 'age', 'Stage_Progress'],
      dtype='object')

In [30]:
X = df_bin1[['sbp', 'dbp', 'creatinine', 'glucose', 'ldl', 'hgb',
       'atenolol', 'atorvastatin', 'bisoprolol', 'canagliflozin', 'carvedilol',
       'dapagliflozin', 'irbesartan', 'labetalol', 'losartan', 'lovastatin',
       'metformin', 'metoprolol', 'nebivolol', 'olmesartan', 'pitavastatin',
       'pravastatin', 'propranolol', 'rosuvastatin', 'simvastatin',
       'telmisartan', 'valsartan', 'race', 'gender', 'age']]

y = df_bin1['Stage_Progress']

In [31]:
# Split train-test data
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0) # default 75-25 split

# Select categorical columns
categorical_cols = ['race', 'gender']

# Select numerical columns
numerical_cols = ['sbp', 'dbp', 'creatinine', 'glucose', 'ldl', 'hgb',
       'atenolol', 'atorvastatin', 'bisoprolol', 'canagliflozin', 'carvedilol',
       'dapagliflozin', 'irbesartan', 'labetalol', 'losartan', 'lovastatin',
       'metformin', 'metoprolol', 'nebivolol', 'olmesartan', 'pitavastatin',
       'pravastatin', 'propranolol', 'rosuvastatin', 'simvastatin',
       'telmisartan', 'valsartan', 'age']

# Pipeline

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import *

# Preprocess data

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model
# Set weight to prevent bias, 4 to CKD (1), 1 to non-CKD (0)
model = LogisticRegressionCV(cv=5, random_state=0, max_iter=10000, class_weight={0: 1, 1: 4})

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])


## Logistic regression CV

In [32]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.linear_model import LogisticRegressionCV
# from sklearn.metrics import *

# metrics_1 = []

# # Preprocess data

# # Preprocessing for numerical data
# numerical_transformer = StandardScaler()

# # Preprocessing for categorical data
# categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# # Bundle preprocessing for numerical and categorical data
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# # Define model
# # Set weight to prevent bias, 4 to CKD (1), 1 to non-CKD (0)
# model = LogisticRegressionCV(cv=5, random_state=0, max_iter=10000, class_weight={0: 1, 1: 4})

# # Bundle preprocessing and modeling code in a pipeline
# clf = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('model', model)
#                      ])

# # Preprocessing of training data, fit model 
# clf.fit(X_train, y_train)

# # Preprocessing of validation data, get predictions
# y_pred = clf.predict(X_valid)
# y_pred_proba = clf.predict_proba(X_valid)[:,1] # only get the probability of 1

# acc = accuracy_score(y_valid, y_pred)
# f1 = f1_score(y_valid, y_pred)
# prec = precision_score(y_valid, y_pred)
# rec = recall_score(y_valid, y_pred)
# spec = specificity(y_valid, y_pred)
# roc = roc_auc_score(y_valid, y_pred_proba)

# metric_output = [acc, f1, prec, rec, spec, roc]
# metrics_1.append(metric_output)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['sbp', 'dbp', 'creatinine',
                                                   'glucose', 'ldl', 'hgb',
                                                   'atenolol', 'atorvastatin',
                                                   'bisoprolol',
                                                   'canagliflozin',
                                                   'carvedilol',
                                                   'dapagliflozin',
                                                   'irbesartan', 'labetalol',
                                                   'losartan', 'lovastatin',
                                                   'metformin', 'metoprolol',
                                                   'nebivolol', 'olmesartan',
                                                   'pitavastatin',
              

In [33]:
# # C selected
# clf.steps[1][1].C_

# # Coefficient
# clf.steps[1][1].coef_

array([1291.54966501])

array([[-3.40092913e-01, -1.64896755e-01,  1.04440276e-01,
         1.57641318e-01, -3.77814470e-01, -2.06891269e-01,
         1.53852323e-01, -4.05919978e-01, -8.04173051e-01,
         0.00000000e+00,  2.14828106e-01,  0.00000000e+00,
         1.79867958e+00, -7.44410499e-01, -2.21636063e-01,
        -1.77775607e-01,  1.79020702e-01,  5.29482546e-02,
         1.08613055e+00, -2.53864669e+00,  0.00000000e+00,
         5.98442627e-01,  0.00000000e+00, -3.97375437e-01,
        -1.17006548e-01,  2.32563802e+00, -4.28573277e-03,
        -5.47709914e-01, -8.03822110e+00,  1.78243512e+00,
         2.99866916e+00,  2.78394392e+00, -8.00458063e-01,
         3.27285165e-01]])

In [34]:
cols = ['Accuracy', 'F1', 'Precision', 'Recall', 'Specificiy', 'ROC_AUC']
pd.DataFrame(metrics_1, columns = cols)

Unnamed: 0,Accuracy,F1,Precision,Recall,Specificiy,ROC_AUC
0,0.506667,0.412698,0.325,0.565217,0.480769,0.561037


## GridSearchCV - logistic

* Same C and results as the LogisticRegressionCV

In [111]:
def specificity(y_valid, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_valid, y_pred).ravel()
    
    specificity = tn / (tn+fp)
    return specificity

In [112]:
scores = ['accuracy', 'f1', 'precision', 'recall', 'Specificiy']

In [113]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression


for score in scores:
    
    print("# Tuning hyper-parameters for %s" % score)
    print()
    
    # Set weight to prevent bias, 4 to CKD (1), 1 to non-CKD (0)
    model = LogisticRegression(random_state=0, max_iter=10000, class_weight={0: 1, 1: 4})
#     model = LogisticRegression(random_state=0, max_iter=10000,class_weight='balanced')
    
    param_grid = {
    'model__C': np.logspace(-5, 5, 10)
    }
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])
    
    search = GridSearchCV(pipe, param_grid, n_jobs=-1)
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = search.cv_results_['mean_test_score']
    stds = search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_valid, search.predict(X_valid)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for accuracy



GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.600):

Best parameters set found on development set:

{'model__C': 7742.636826811277}

Grid scores on development set:

0.342 (+/-0.022) for {'model__C': 1e-05}
0.342 (+/-0.022) for {'model__C': 0.0001291549665014884}
0.342 (+/-0.022) for {'model__C': 0.0016681005372000592}
0.440 (+/-0.121) for {'model__C': 0.021544346900318846}
0.556 (+/-0.176) for {'model__C': 0.2782559402207126}
0.582 (+/-0.200) for {'model__C': 3.593813663804626}
0.582 (+/-0.236) for {'model__C': 46.41588833612782}
0.591 (+/-0.233) for {'model__C': 599.4842503189421}
0.600 (+/-0.230) for {'model__C': 7742.636826811277}
0.600 (+/-0.230) for {'model__C': 100000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.48      0.57        52
           1       0.33      0.57      0.41        23

    accuracy                   

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.600):

Best parameters set found on development set:

{'model__C': 7742.636826811277}

Grid scores on development set:

0.342 (+/-0.022) for {'model__C': 1e-05}
0.342 (+/-0.022) for {'model__C': 0.0001291549665014884}
0.342 (+/-0.022) for {'model__C': 0.0016681005372000592}
0.440 (+/-0.121) for {'model__C': 0.021544346900318846}
0.556 (+/-0.176) for {'model__C': 0.2782559402207126}
0.582 (+/-0.200) for {'model__C': 3.593813663804626}
0.582 (+/-0.236) for {'model__C': 46.41588833612782}
0.591 (+/-0.233) for {'model__C': 599.4842503189421}
0.600 (+/-0.230) for {'model__C': 7742.636826811277}
0.600 (+/-0.230) for {'model__C': 100000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.48      0.57        52
           1       0.33      0.57      0.41        23

    accuracy                   

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.600):

Best parameters set found on development set:

{'model__C': 7742.636826811277}

Grid scores on development set:

0.342 (+/-0.022) for {'model__C': 1e-05}
0.342 (+/-0.022) for {'model__C': 0.0001291549665014884}
0.342 (+/-0.022) for {'model__C': 0.0016681005372000592}
0.440 (+/-0.121) for {'model__C': 0.021544346900318846}
0.556 (+/-0.176) for {'model__C': 0.2782559402207126}
0.582 (+/-0.200) for {'model__C': 3.593813663804626}
0.582 (+/-0.236) for {'model__C': 46.41588833612782}
0.591 (+/-0.233) for {'model__C': 599.4842503189421}
0.600 (+/-0.230) for {'model__C': 7742.636826811277}
0.600 (+/-0.230) for {'model__C': 100000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.48      0.57        52
           1       0.33      0.57      0.41        23

    accuracy                   

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.600):

Best parameters set found on development set:

{'model__C': 7742.636826811277}

Grid scores on development set:

0.342 (+/-0.022) for {'model__C': 1e-05}
0.342 (+/-0.022) for {'model__C': 0.0001291549665014884}
0.342 (+/-0.022) for {'model__C': 0.0016681005372000592}
0.440 (+/-0.121) for {'model__C': 0.021544346900318846}
0.556 (+/-0.176) for {'model__C': 0.2782559402207126}
0.582 (+/-0.200) for {'model__C': 3.593813663804626}
0.582 (+/-0.236) for {'model__C': 46.41588833612782}
0.591 (+/-0.233) for {'model__C': 599.4842503189421}
0.600 (+/-0.230) for {'model__C': 7742.636826811277}
0.600 (+/-0.230) for {'model__C': 100000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.48      0.57        52
           1       0.33      0.57      0.41        23

    accuracy                   

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.600):

Best parameters set found on development set:

{'model__C': 7742.636826811277}

Grid scores on development set:

0.342 (+/-0.022) for {'model__C': 1e-05}
0.342 (+/-0.022) for {'model__C': 0.0001291549665014884}
0.342 (+/-0.022) for {'model__C': 0.0016681005372000592}
0.440 (+/-0.121) for {'model__C': 0.021544346900318846}
0.556 (+/-0.176) for {'model__C': 0.2782559402207126}
0.582 (+/-0.200) for {'model__C': 3.593813663804626}
0.582 (+/-0.236) for {'model__C': 46.41588833612782}
0.591 (+/-0.233) for {'model__C': 599.4842503189421}
0.600 (+/-0.230) for {'model__C': 7742.636826811277}
0.600 (+/-0.230) for {'model__C': 100000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.48      0.57        52
           1       0.33      0.57      0.41        23

    accuracy                   

In [114]:
search.best_estimator_

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['sbp', 'dbp', 'creatinine',
                                                   'glucose', 'ldl', 'hgb',
                                                   'atenolol', 'atorvastatin',
                                                   'bisoprolol',
                                                   'canagliflozin',
                                                   'carvedilol',
                                                   'dapagliflozin',
                                                   'irbesartan', 'labetalol',
                                                   'losartan', 'lovastatin',
                                                   'metformin', 'metoprolol',
                                                   'nebivolol', 'olmesartan',
                                                   'pitavastatin',
              

In [117]:
import joblib
joblib.dump(search.best_estimator_, 'models/6m/LogisticRegression.joblib')

['models/6m/LogisticRegression.joblib']

## Decision tree

In [118]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

for score in scores:
    
    print("# Tuning hyper-parameters for %s" % score)
    print()
    
    # Set weight to prevent bias, 4 to CKD (1), 1 to non-CKD (0)
    model = DecisionTreeClassifier(random_state=0, class_weight={0: 1, 1: 4}, min_samples_leaf = 30)
    
    param_grid = {
    'model__max_depth': np.linspace(10, 100, 10),
                
    }
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])
    
    search = GridSearchCV(pipe, param_grid, n_jobs=-1)
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = search.cv_results_['mean_test_score']
    stds = search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_valid, search.predict(X_valid)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for accuracy



GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.467):

Best parameters set found on development set:

{'model__max_depth': 10.0}

Grid scores on development set:

0.467 (+/-0.109) for {'model__max_depth': 10.0}
0.467 (+/-0.109) for {'model__max_depth': 20.0}
0.467 (+/-0.109) for {'model__max_depth': 30.0}
0.467 (+/-0.109) for {'model__max_depth': 40.0}
0.467 (+/-0.109) for {'model__max_depth': 50.0}
0.467 (+/-0.109) for {'model__max_depth': 60.0}
0.467 (+/-0.109) for {'model__max_depth': 70.0}
0.467 (+/-0.109) for {'model__max_depth': 80.0}
0.467 (+/-0.109) for {'model__max_depth': 90.0}
0.467 (+/-0.109) for {'model__max_depth': 100.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.15      0.25        52
           1       0.31      0.87      0.46        23

    accuracy                           0.37        75
   macro avg       0.5

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.467):

Best parameters set found on development set:

{'model__max_depth': 10.0}

Grid scores on development set:

0.467 (+/-0.109) for {'model__max_depth': 10.0}
0.467 (+/-0.109) for {'model__max_depth': 20.0}
0.467 (+/-0.109) for {'model__max_depth': 30.0}
0.467 (+/-0.109) for {'model__max_depth': 40.0}
0.467 (+/-0.109) for {'model__max_depth': 50.0}
0.467 (+/-0.109) for {'model__max_depth': 60.0}
0.467 (+/-0.109) for {'model__max_depth': 70.0}
0.467 (+/-0.109) for {'model__max_depth': 80.0}
0.467 (+/-0.109) for {'model__max_depth': 90.0}
0.467 (+/-0.109) for {'model__max_depth': 100.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.15      0.25        52
           1       0.31      0.87      0.46        23

    accuracy                           0.37        75
   macro avg       0.5

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.467):

Best parameters set found on development set:

{'model__max_depth': 10.0}

Grid scores on development set:

0.467 (+/-0.109) for {'model__max_depth': 10.0}
0.467 (+/-0.109) for {'model__max_depth': 20.0}
0.467 (+/-0.109) for {'model__max_depth': 30.0}
0.467 (+/-0.109) for {'model__max_depth': 40.0}
0.467 (+/-0.109) for {'model__max_depth': 50.0}
0.467 (+/-0.109) for {'model__max_depth': 60.0}
0.467 (+/-0.109) for {'model__max_depth': 70.0}
0.467 (+/-0.109) for {'model__max_depth': 80.0}
0.467 (+/-0.109) for {'model__max_depth': 90.0}
0.467 (+/-0.109) for {'model__max_depth': 100.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.15      0.25        52
           1       0.31      0.87      0.46        23

    accuracy                           0.37        75
   macro avg       0.5

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.467):

Best parameters set found on development set:

{'model__max_depth': 10.0}

Grid scores on development set:

0.467 (+/-0.109) for {'model__max_depth': 10.0}
0.467 (+/-0.109) for {'model__max_depth': 20.0}
0.467 (+/-0.109) for {'model__max_depth': 30.0}
0.467 (+/-0.109) for {'model__max_depth': 40.0}
0.467 (+/-0.109) for {'model__max_depth': 50.0}
0.467 (+/-0.109) for {'model__max_depth': 60.0}
0.467 (+/-0.109) for {'model__max_depth': 70.0}
0.467 (+/-0.109) for {'model__max_depth': 80.0}
0.467 (+/-0.109) for {'model__max_depth': 90.0}
0.467 (+/-0.109) for {'model__max_depth': 100.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.15      0.25        52
           1       0.31      0.87      0.46        23

    accuracy                           0.37        75
   macro avg       0.5

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.467):

Best parameters set found on development set:

{'model__max_depth': 10.0}

Grid scores on development set:

0.467 (+/-0.109) for {'model__max_depth': 10.0}
0.467 (+/-0.109) for {'model__max_depth': 20.0}
0.467 (+/-0.109) for {'model__max_depth': 30.0}
0.467 (+/-0.109) for {'model__max_depth': 40.0}
0.467 (+/-0.109) for {'model__max_depth': 50.0}
0.467 (+/-0.109) for {'model__max_depth': 60.0}
0.467 (+/-0.109) for {'model__max_depth': 70.0}
0.467 (+/-0.109) for {'model__max_depth': 80.0}
0.467 (+/-0.109) for {'model__max_depth': 90.0}
0.467 (+/-0.109) for {'model__max_depth': 100.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.15      0.25        52
           1       0.31      0.87      0.46        23

    accuracy                           0.37        75
   macro avg       0.5

In [119]:
joblib.dump(search.best_estimator_, 'models/6m/DecisionTreeClassifier.joblib')

['models/6m/DecisionTreeClassifier.joblib']

## lightgbm

In [120]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier

for score in scores:
    
    print("# Tuning hyper-parameters for %s" % score)
    print()
    
    # Set weight to prevent bias, 4 to CKD (1), 1 to non-CKD (0)
    model = LGBMClassifier(n_estimators=1000, objective='binary',scale_pos_weight=4)
    
    param_grid = {
    'model__learning_rate': [1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1, 10, 100]
                
    }
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])
    
    search = GridSearchCV(pipe, param_grid, n_jobs=-1)
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = search.cv_results_['mean_test_score']
    stds = search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_valid, search.predict(X_valid)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for accuracy



GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.658):

Best parameters set found on development set:

{'model__learning_rate': 1e-05}

Grid scores on development set:

0.658 (+/-0.022) for {'model__learning_rate': 1e-05}
0.658 (+/-0.022) for {'model__learning_rate': 0.0001}
0.564 (+/-0.107) for {'model__learning_rate': 0.001}
0.564 (+/-0.100) for {'model__learning_rate': 0.01}
0.591 (+/-0.067) for {'model__learning_rate': 0.1}
0.569 (+/-0.096) for {'model__learning_rate': 1}
0.627 (+/-0.081) for {'model__learning_rate': 10}
0.453 (+/-0.036) for {'model__learning_rate': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.69      1.00      0.82        52
           1       0.00      0.00      0.00        23

    accuracy                           0.69        75
   macro avg       0.35      0.50      0.41        75
weighted avg       0.48      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.658):

Best parameters set found on development set:

{'model__learning_rate': 1e-05}

Grid scores on development set:

0.658 (+/-0.022) for {'model__learning_rate': 1e-05}
0.658 (+/-0.022) for {'model__learning_rate': 0.0001}
0.564 (+/-0.107) for {'model__learning_rate': 0.001}
0.564 (+/-0.100) for {'model__learning_rate': 0.01}
0.591 (+/-0.067) for {'model__learning_rate': 0.1}
0.569 (+/-0.096) for {'model__learning_rate': 1}
0.627 (+/-0.081) for {'model__learning_rate': 10}
0.453 (+/-0.036) for {'model__learning_rate': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.69      1.00      0.82        52
           1       0.00      0.00      0.00        23

    accuracy                           0.69        75
   macro avg       0.35      0.50      0.41        75
weighted avg       0.48      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.658):

Best parameters set found on development set:

{'model__learning_rate': 1e-05}

Grid scores on development set:

0.658 (+/-0.022) for {'model__learning_rate': 1e-05}
0.658 (+/-0.022) for {'model__learning_rate': 0.0001}
0.564 (+/-0.107) for {'model__learning_rate': 0.001}
0.564 (+/-0.100) for {'model__learning_rate': 0.01}
0.591 (+/-0.067) for {'model__learning_rate': 0.1}
0.569 (+/-0.096) for {'model__learning_rate': 1}
0.627 (+/-0.081) for {'model__learning_rate': 10}
0.453 (+/-0.036) for {'model__learning_rate': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.69      1.00      0.82        52
           1       0.00      0.00      0.00        23

    accuracy                           0.69        75
   macro avg       0.35      0.50      0.41        75
weighted avg       0.48      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.658):

Best parameters set found on development set:

{'model__learning_rate': 1e-05}

Grid scores on development set:

0.658 (+/-0.022) for {'model__learning_rate': 1e-05}
0.658 (+/-0.022) for {'model__learning_rate': 0.0001}
0.564 (+/-0.107) for {'model__learning_rate': 0.001}
0.564 (+/-0.100) for {'model__learning_rate': 0.01}
0.591 (+/-0.067) for {'model__learning_rate': 0.1}
0.569 (+/-0.096) for {'model__learning_rate': 1}
0.627 (+/-0.081) for {'model__learning_rate': 10}
0.453 (+/-0.036) for {'model__learning_rate': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.69      1.00      0.82        52
           1       0.00      0.00      0.00        23

    accuracy                           0.69        75
   macro avg       0.35      0.50      0.41        75
weighted avg       0.48      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['sbp',
                                                                          'dbp',
                                                                          'creatinine',
                                                                          'glucose',
                                                                          'ldl',
                                                                          'hgb',
                                                                          'atenolol',
                                                                          'atorvastatin',
                                                                          'bisoprolol',
            

Best parameter (CV score=0.658):

Best parameters set found on development set:

{'model__learning_rate': 1e-05}

Grid scores on development set:

0.658 (+/-0.022) for {'model__learning_rate': 1e-05}
0.658 (+/-0.022) for {'model__learning_rate': 0.0001}
0.564 (+/-0.107) for {'model__learning_rate': 0.001}
0.564 (+/-0.100) for {'model__learning_rate': 0.01}
0.591 (+/-0.067) for {'model__learning_rate': 0.1}
0.569 (+/-0.096) for {'model__learning_rate': 1}
0.627 (+/-0.081) for {'model__learning_rate': 10}
0.453 (+/-0.036) for {'model__learning_rate': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.69      1.00      0.82        52
           1       0.00      0.00      0.00        23

    accuracy                           0.69        75
   macro avg       0.35      0.50      0.41        75
weighted avg       0.48      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [123]:
joblib.dump(search.best_estimator_, 'models/6m/LGBMClassifier.joblib')

['models/6m/LGBMClassifier.joblib']

## random forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

for score in scores:
    
    print("# Tuning hyper-parameters for %s" % score)
    print()
    
    # Set weight to prevent bias, 4 to CKD (1), 1 to non-CKD (0)
    model = RandomForestClassifier(class_weight={0: 1, 1: 4}, n_estimators=800, n_jobs=-1, verbose=1)
    
    param_grid = {
    'model__max_depth': list(range(1,31))
                
    }
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])
    
    search = GridSearchCV(pipe, param_grid, n_jobs=-1)
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = search.cv_results_['mean_test_score']
    stds = search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_valid, search.predict(X_valid)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for accuracy



# Compare models

In [None]:
metrics_1 = []

clf = search.best_estimator_

# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
y_pred = clf.predict(X_valid)
y_pred_proba = clf.predict_proba(X_valid)[:,1] # only get the probability of 1

acc = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
prec = precision_score(y_valid, y_pred)
rec = recall_score(y_valid, y_pred)
spec = specificity(y_valid, y_pred)
roc = roc_auc_score(y_valid, y_pred_proba)

metric_output = [acc, f1, prec, rec, spec, roc]
metrics_1.append(metric_output)
cols = ['Accuracy', 'F1', 'Precision', 'Recall', 'Specificiy', 'ROC_AUC']
pd.DataFrame(metrics_1, columns = cols)