In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/diabetic_data.csv')

In [3]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
df.count()

encounter_id                101766
patient_nbr                 101766
race                        101766
gender                      101766
age                         101766
weight                      101766
admission_type_id           101766
discharge_disposition_id    101766
admission_source_id         101766
time_in_hospital            101766
payer_code                  101766
medical_specialty           101766
num_lab_procedures          101766
num_procedures              101766
num_medications             101766
number_outpatient           101766
number_emergency            101766
number_inpatient            101766
diag_1                      101766
diag_2                      101766
diag_3                      101766
number_diagnoses            101766
max_glu_serum               101766
A1Cresult                   101766
metformin                   101766
repaglinide                 101766
nateglinide                 101766
chlorpropamide              101766
glimepiride         

In [5]:
uk_df = df.loc[: , (df == '?').any()]

In [6]:
df = df.replace('?', np.NaN) 

In [7]:
print(df.isna().any())

encounter_id                False
patient_nbr                 False
race                         True
gender                      False
age                         False
weight                       True
admission_type_id           False
discharge_disposition_id    False
admission_source_id         False
time_in_hospital            False
payer_code                   True
medical_specialty            True
num_lab_procedures          False
num_procedures              False
num_medications             False
number_outpatient           False
number_emergency            False
number_inpatient            False
diag_1                       True
diag_2                       True
diag_3                       True
number_diagnoses            False
max_glu_serum               False
A1Cresult                   False
metformin                   False
repaglinide                 False
nateglinide                 False
chlorpropamide              False
glimepiride                 False
acetohexamide 

In [8]:
df.count()

encounter_id                101766
patient_nbr                 101766
race                         99493
gender                      101766
age                         101766
weight                        3197
admission_type_id           101766
discharge_disposition_id    101766
admission_source_id         101766
time_in_hospital            101766
payer_code                   61510
medical_specialty            51817
num_lab_procedures          101766
num_procedures              101766
num_medications             101766
number_outpatient           101766
number_emergency            101766
number_inpatient            101766
diag_1                      101745
diag_2                      101408
diag_3                      100343
number_diagnoses            101766
max_glu_serum               101766
A1Cresult                   101766
metformin                   101766
repaglinide                 101766
nateglinide                 101766
chlorpropamide              101766
glimepiride         

In [9]:
df = df.drop(['weight'], axis=1)

In [10]:
df = df.dropna()

In [11]:
df.count()

encounter_id                26755
patient_nbr                 26755
race                        26755
gender                      26755
age                         26755
admission_type_id           26755
discharge_disposition_id    26755
admission_source_id         26755
time_in_hospital            26755
payer_code                  26755
medical_specialty           26755
num_lab_procedures          26755
num_procedures              26755
num_medications             26755
number_outpatient           26755
number_emergency            26755
number_inpatient            26755
diag_1                      26755
diag_2                      26755
diag_3                      26755
number_diagnoses            26755
max_glu_serum               26755
A1Cresult                   26755
metformin                   26755
repaglinide                 26755
nateglinide                 26755
chlorpropamide              26755
glimepiride                 26755
acetohexamide               26755
glipizide     

In [12]:
df.loc[:, "medicare"] = (df.payer_code == "MC")
df.loc[:, "medicaid"] = (df.payer_code == "MD")

In [13]:
df.drop(['encounter_id', 'patient_nbr', 'payer_code', 'medical_specialty', 'admission_type_id', 
         'repaglinide','nateglinide','chlorpropamide','glimepiride','acetohexamide','glipizide','glyburide','tolbutamide',
        'pioglitazone','rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide','citoglipton', 'metformin',
        'glyburide-metformin','glipizide-metformin','glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone', 'diag_2', 'diag_3', 'change'], axis=1, inplace=True)

In [14]:
df = df.drop(df[df.readmitted ==  '>30'].index)
df['readmitted'] = df['readmitted'].replace({"NO":"not readmitted", "<30":"readmitted"})

In [15]:
df.rename(columns={ 'readmitted':'readmit_status', 'number_outpatient':'prior_outpatient', 'number_emergency': 'prior_emergency', 'number_inpatient':'prior_inpatient', 'diag_1':'primary_diagnosis', 'discharge_disposition_id':'discharge_destination', 'admission_source_id':'admission_source', 'diabetesMed':'diabetes_Med_prescribe'}, inplace=True)

In [16]:
df.count()

race                      18011
gender                    18011
age                       18011
discharge_destination     18011
admission_source          18011
time_in_hospital          18011
num_lab_procedures        18011
num_procedures            18011
num_medications           18011
prior_outpatient          18011
prior_emergency           18011
prior_inpatient           18011
primary_diagnosis         18011
number_diagnoses          18011
max_glu_serum             18011
A1Cresult                 18011
insulin                   18011
diabetes_Med_prescribe    18011
readmit_status            18011
medicare                  18011
medicaid                  18011
dtype: int64

In [17]:
df.head()

Unnamed: 0,race,gender,age,discharge_destination,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,prior_outpatient,...,prior_inpatient,primary_diagnosis,number_diagnoses,max_glu_serum,A1Cresult,insulin,diabetes_Med_prescribe,readmit_status,medicare,medicaid
20446,Caucasian,Female,[70-80),22,7,7,58,2,15,0,...,0,821,9,,,Steady,Yes,not readmitted,True,False
20737,Caucasian,Female,[60-70),1,1,3,59,3,11,0,...,0,V56,6,,,Steady,Yes,not readmitted,True,False
20824,Caucasian,Female,[90-100),1,7,4,56,1,9,0,...,0,532,6,,,No,Yes,not readmitted,True,False
21083,Caucasian,Male,[70-80),2,7,10,68,1,18,0,...,0,682,6,,,Steady,Yes,not readmitted,True,False
23879,Caucasian,Female,[70-80),3,6,12,77,5,19,0,...,0,238,5,,>8,Steady,Yes,not readmitted,False,False


In [18]:

train, test = train_test_split(df, train_size=0.70)

In [19]:
train.to_parquet('data/training_data')
test.to_parquet('data/testing_data')

In [20]:
target_column_name = 'readmit_status'

In [21]:
def get_categorical_index(categorical_fields):
    cat_idx = []
    for col, value in categorical_fields.iteritems():
        if value.dtype == 'object':
            cat_idx.append(categorical_fields.columns.get_loc(col))
    print("col indices: ", cat_idx)  
    return cat_idx       

In [22]:
# Read in data
print("Reading data")
all_training_data = pd.read_parquet('data/training_data')
target = all_training_data[target_column_name]
features = all_training_data.drop([target_column_name], axis = 1)  

# Transform string data to numeric
numerical_selector = selector(dtype_include=np.number)
categorical_selector = selector(dtype_exclude=np.number)

numerical_columns = numerical_selector(features)
categorical_columns = categorical_selector(features)

categorial_encoder = OneHotEncoder(handle_unknown="ignore")
numerical_encoder = StandardScaler()

preprocessor = ColumnTransformer([
('categorical-encoder', categorial_encoder, categorical_columns),
('standard_scaler', numerical_encoder, numerical_columns)])

categorical_indices = get_categorical_index(features)
clf = make_pipeline(preprocessor, LogisticRegression())

X_train, X_test, y_train, y_test = train_test_split(features, target, 
test_size=0.3, random_state=1)

print("Training model...") 
model = clf.fit(X_train, y_train)
print("Accuracy score: ", clf.score(X_test,y_test))

Reading data
col indices:  [0, 1, 2, 12, 14, 15, 16, 17]
Training model...
Accuracy score:  0.8609569125033043


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
