In [1]:
import pandas as pd
import numpy as np
import os
import sys
# Extend the module search path with ../src/ (relative to the working
# directory) so modules located there can be imported by later cells.
# NOTE(review): presumably this is where `octopus` and `utils` live — confirm.
sys.path.append('../src/')

In [2]:
# Lift the column cap so wide DataFrames are displayed with every column.
pd.set_option('display.max_columns', None)

In [3]:
# Pieces of the raw dataset location: base directory, sub-directory, file name.
path_raw, dirname, filename = '../data/raw/', 'diabetes/', 'diabetes.csv'

## Load data

In [4]:
# Assemble the CSV location from its configured parts, then load it.
raw_csv_path = os.path.join(path_raw, dirname, filename)
raw_df = pd.read_csv(raw_csv_path)

In [5]:
# Report the (rows, columns) shape, then preview the first three records.
n_rows, n_cols = raw_df.shape
print((n_rows, n_cols))
raw_df.head(n=3)

(101766, 50)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO


In [6]:
# Placeholder strings that are treated as "no value" in this notebook.
missing_markers = ['?', 'None', 'Unknown/Invalid', 'Unknown', 'Invalid']

# A single replace call with one mapping instead of five chained calls:
# the same exact-match substitutions, but the frame is scanned and copied
# once rather than five times.
raw_df = raw_df.replace({marker: None for marker in missing_markers})

In [7]:
# Summarise columns, non-null counts and dtypes after the placeholder cleanup.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      99493 non-null   object
 3   gender                    101763 non-null  object
 4   age                       101766 non-null  object
 5   weight                    3197 non-null    object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                61510 non-null   object
 11  medical_specialty         51817 non-null   object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

## Execution

In [8]:
# Binarise the target: 'NO' -> 0, any other label (e.g. '>30') -> 1.
# The column is overwritten in place, so the encoding is applied only while it
# still holds the raw (non-numeric) labels. Without this guard, re-running the
# cell would compare the already-encoded 0/1 values against 'NO' and turn
# every row into 1.
if not pd.api.types.is_numeric_dtype(raw_df['readmitted']):
    raw_df['readmitted'] = np.where(raw_df['readmitted'] == 'NO', 0, 1)

In [9]:
# Input feature names, grouped by how they are declared to the Octopus steps:
# 'qualitative' and 'quantitative'.
features_type = dict(
    qualitative=[
        # encounter / patient descriptors
        'encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
        'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
        'payer_code', 'medical_specialty',
        'diag_1', 'diag_2', 'diag_3',
        'max_glu_serum', 'A1Cresult',
        # per-drug columns
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
        'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
        'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
        'insulin',
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone',
        'change', 'diabetesMed',
    ],
    quantitative=[
        'time_in_hospital', 'num_lab_procedures', 'num_procedures',
        'num_medications', 'number_outpatient', 'number_emergency',
        'number_inpatient', 'number_diagnoses',
    ],
)

# Name of the target column.
y_name = 'readmitted'

# Output directory handed to the Octopus steps and used for the HTML report.
path_output = '../data/output/diabetes/'

# Independent (deep) copy of the cleaned frame for the steps that follow.
data = raw_df.copy(deep=True)

In [10]:
# Settings forwarded to the Octopus process / prepare / train / evaluate steps.
config = {
    'test_size': 0.25,
    'min_missing': 0.25,
    'outliers_method': 'lof',
    'alpha_sta': 0.05,
    'strategy_missing': 'median',
    'method_scale': 'standard',
    'metric_train': 'roc_auc',
    'seed': 42,
    'njobs': -1,
}

In [11]:
from datetime import datetime
from octopus import OctopusProcess
from octopus import OctopusPrepare
from octopus import OctopusTrain
from octopus import OctopusEvaluate

### Octopus Process

In [12]:
# Configure the processing step from the shared settings.
octoProcess = OctopusProcess(
    test_size=config['test_size'],
    min_missing_values=config['min_missing'],
    outliers_method=config['outliers_method'],
    alpha_sta=config['alpha_sta'],
    seed=config['seed'],
)

# NOTE(review): `features_type` is rebound to the value returned by
# process_data, so re-running this cell passes that returned value back in
# instead of the hand-written declaration — re-run the definition cell first.
train, test, features_type, html = octoProcess.process_data(
    data=data,
    y_name=y_name,
    features_type=features_type,
    path_output=path_output,
)

# Each split is unpacked into its two components (X, y).
(X_train, y_train), (X_test, y_test) = train, test

2021-05-29 15:07:50,332 INFO: Started to check the features consistency
2021-05-29 15:07:50,427 INFO: feature: discharge_disposition_idre-categorized
2021-05-29 15:07:50,477 INFO: feature: admission_source_idre-categorized
2021-05-29 15:07:50,488 INFO: feature: payer_codere-categorized
2021-05-29 15:07:50,499 INFO: feature: medical_specialtyre-categorized
2021-05-29 15:07:50,796 INFO: Features: ['encounter_id', 'patient_nbr', 'diag_1', 'diag_2', 'diag_3', 'nateglinide', 'chlorpropamide', 'acetohexamide', 'tolbutamide', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'] were removed because its distribution
2021-05-29 15:07:50,797 INFO: Consistency values finished!
2021-05-29 15:07:50,867 INFO: Feature race was imputer with "other"
2021-05-29 15:07:50,873 INFO: Feature gender was imputer with "other"
2021-05-29 15:07:50,892 INFO: Fea

<Figure size 432x288 with 0 Axes>

### Octopus Prepare

In [13]:
# Configure the preparation step and build the object that is later handed
# to training, based on the current feature declaration.
octoPrepare = OctopusPrepare(strategy_missing=config['strategy_missing'],
                             method_scale=config['method_scale'])
preparessor = octoPrepare.prepare_pipeline(features_type=features_type)

### Octopus Train

In [16]:
# Wall-clock timestamp taken before training starts.
start = datetime.now()

octoTrain = OctopusTrain(
    metric=config['metric_train'],
    njobs=config['njobs'],
    seed=config['seed'],
)
models_trained = octoTrain.train(X_train, y_train, preparessor)

# Report how long the whole training call took.
finish = datetime.now()
elapsed = finish - start
print('Time execution training models:', elapsed)

Logistic Regression is going to be fitted
Logistic Regression fitted
Regularized Logistic Regression is going to be fitted
Regularized Logistic Regression fitted
Random Forest is going to be fitted
|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6701  [0m | [0m 4.996   [0m | [0m 0.9547  [0m | [0m 37.94   [0m | [0m 65.89   [0m | [0m 120.2   [0m |
| [95m 2       [0m | [95m 0.6723  [0m | [95m 3.248   [0m | [95m 0.1522  [0m | [95m 43.98   [0m | [95m 66.09   [0m | [95m 368.6   [0m |
| [0m 3       [0m | [0m 0.6499  [0m | [0m 2.165   [0m | [0m 0.9719  [0m | [0m 42.46   [0m | [0m 33.05   [0m | [0m 131.8   [0m |
| [0m 4       [0m | [0m 0.6693  [0m | [0m 3.467   [0m | [0m 0.3735  [0m | [0m 28.61   [0m | [0m 51.72   [0m | [0m 181.1   [0m |
| [95m 5       [0m | [95m 0.6814  [0m | [95m 6.895

### Octopus Evaluate

In [17]:
# Wall-clock timestamp taken before evaluation starts.
start = datetime.now()

OctoEval = OctopusEvaluate(
    metric=config['metric_train'],
    seed=config['seed'],
)

# Evaluate the trained models on the train and test splits; the call returns
# the selected model together with a metrics table.
best_model, metrics_df = OctoEval.evaluate(
    X_train, y_train,
    X_test, y_test,
    models_trained,
    path_output,
)

# Report how long the whole evaluation call took.
finish = datetime.now()
elapsed = finish - start
print('Time execution testing models:', elapsed)

LR model: roc_auc = 0.683720 (0.004626)
LRR model: roc_auc = 0.683726 (0.004622)
RF model: roc_auc = 0.688278 (0.005443)
XGB model: roc_auc = 0.698660 (0.004856)
Time execution testing models: 0:10:27.415559


<Figure size 432x288 with 0 Axes>

<Figure size 720x288 with 0 Axes>

### Render HTML report

In [18]:
from utils import renderize_html

In [19]:
# Write the HTML produced by the processing step to the output directory.
report_name = 'report.html'
path_html = os.path.join(path_output, report_name)
renderize_html(html, path_html)