In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../src/')

from octopus import OctopusML

In [2]:
pd.options.display.max_columns = None

In [3]:
path_raw    = '../data/raw/'
dirname     = 'diabetes/'
filename    = 'diabetes.csv'

## Load data

In [4]:
raw_df = pd.read_csv(os.path.join(path_raw, dirname, filename))

In [5]:
print(raw_df.shape)
raw_df.head(3)

(101766, 50)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO


## Configurations

In [6]:
raw_df['readmitted'] = np.where(raw_df['readmitted'] == 'NO', 0, 1)

In [7]:
# X features names
features_type = {'qualitative': [
                    'encounter_id',
                    'patient_nbr',
                    'race',
                    'gender',
                    'age',
                    'weight',
                    'admission_type_id',
                    'discharge_disposition_id',
                    'admission_source_id',
                    'payer_code',
                    'medical_specialty',
                    'diag_1',
                    'diag_2',
                    'diag_3',
                    'max_glu_serum',
                    'A1Cresult',

                    'metformin', 
                    'repaglinide',
                    'nateglinide',
                    'chlorpropamide',
                    'glimepiride', 
                    'acetohexamide',
                    'glipizide',
                    'glyburide',
                    'tolbutamide',
                    'pioglitazone',
                    'rosiglitazone',
                    'acarbose',
                    'miglitol',
                    'troglitazone',
                    'tolazamide',
                    'examide',
                    'citoglipton',
                    'insulin',
                    'glyburide-metformin',
                    'glipizide-metformin',
                    'glimepiride-pioglitazone',
                    'metformin-rosiglitazone',
                    'metformin-pioglitazone',
                    'change',
                    'diabetesMed'
                                ],
                'quantitative': [
                    'time_in_hospital',
                    'num_lab_procedures',
                    'num_procedures',
                    'num_medications',
                    'number_outpatient', 
                    'number_emergency',
                    'number_inpatient',
                    'number_diagnoses'
                                ]
                }

# target name
y_name = 'readmitted'

data = raw_df.copy()

path_output = '../data/output/diabetes/'

In [8]:
config = {}
config['test_size']        = 0.25
config['min_missing']      = 0.25
config['outliers_method']  = 'lof'
config['alpha_sta']        = 0.05
config['strategy_missing'] = 'median'
config['method_scale']     = 'standard'
config['metric_train']     = 'roc_auc'
config['seed']             = 42
config['njobs']            = -1

## Octopus Execution

In [10]:
OctoML = OctopusML(
                test_size        = config['test_size'],
                min_missing      = config['min_missing'],
                outliers_method  = config['outliers_method'],
                alpha_sta        = config['alpha_sta'],
                strategy_missing = config['strategy_missing'],
                method_scale     = config['method_scale'],
                metric_train     = config['metric_train'],
                njobs            = config['njobs'],
                seed             = config['seed'])

In [11]:
results = OctoML.autoML(
                   data          = data,
                   y_name        = y_name,
                   features_type = features_type,
                   path_output   = path_output)

2021-05-29 11:29:04,206 INFO: Started to check the features consistency
2021-05-29 11:29:04,358 INFO: feature: discharge_disposition_idre-categorized
2021-05-29 11:29:04,438 INFO: feature: admission_source_idre-categorized
2021-05-29 11:29:04,457 INFO: feature: payer_codere-categorized
2021-05-29 11:29:04,476 INFO: feature: medical_specialtyre-categorized
2021-05-29 11:29:04,983 INFO: Features: ['encounter_id', 'patient_nbr', 'diag_1', 'diag_2', 'diag_3', 'nateglinide', 'chlorpropamide', 'acetohexamide', 'tolbutamide', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'] were removed because its distribution
2021-05-29 11:29:04,984 INFO: Consistency values finished!
2021-05-29 11:29:05,105 INFO: None feature were removed because the missing values
2021-05-29 11:29:05,106 INFO: Check the missing values finished!
2021-05-29 11:29:05,133

Logistic Regression is going to be fitted
Logistic Regression fitted
Regularized Logistic Regression is going to be fitted
Regularized Logistic Regression fitted
Random Forest is going to be fitted
|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6702  [0m | [0m 4.996   [0m | [0m 0.9547  [0m | [0m 37.94   [0m | [0m 65.89   [0m | [0m 120.2   [0m |
| [95m 2       [0m | [95m 0.6729  [0m | [95m 3.248   [0m | [95m 0.1522  [0m | [95m 43.98   [0m | [95m 66.09   [0m | [95m 368.6   [0m |
| [0m 3       [0m | [0m 0.6498  [0m | [0m 2.165   [0m | [0m 0.9719  [0m | [0m 42.46   [0m | [0m 33.05   [0m | [0m 131.8   [0m |
| [0m 4       [0m | [0m 0.6698  [0m | [0m 3.467   [0m | [0m 0.3735  [0m | [0m 28.61   [0m | [0m 51.72   [0m | [0m 181.1   [0m |
| [95m 5       [0m | [95m 0.6822  [0m | [95m 6.895

TypeError: '<' not supported between instances of 'str' and 'int'

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>