## Task 1: Data Preparation

In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [90]:
df = pd.read_csv('data/diabetic_data.csv')

### Understanding the dataset

In [91]:
df.shape

(101766, 50)

In [92]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


### Checking for special characters

In [93]:
df = df.replace('?', np.NaN) 
df.head()

### Removing null values

In [95]:
df.count()

encounter_id                101766
patient_nbr                 101766
race                         99493
gender                      101766
age                         101766
weight                        3197
admission_type_id           101766
discharge_disposition_id    101766
admission_source_id         101766
time_in_hospital            101766
payer_code                   61510
medical_specialty            51817
num_lab_procedures          101766
num_procedures              101766
num_medications             101766
number_outpatient           101766
number_emergency            101766
number_inpatient            101766
diag_1                      101745
diag_2                      101408
diag_3                      100343
number_diagnoses            101766
max_glu_serum               101766
A1Cresult                   101766
metformin                   101766
repaglinide                 101766
nateglinide                 101766
chlorpropamide              101766
glimepiride         

In [96]:
df = df.drop(['weight'], axis=1)

In [97]:
df = df.dropna()

### Deleting irrelevant columns

In [98]:
df.drop(['encounter_id', 'patient_nbr', 'payer_code', 'medical_specialty', 'admission_type_id', 
         'repaglinide','nateglinide','chlorpropamide','glimepiride','acetohexamide','glipizide','glyburide','tolbutamide',
        'pioglitazone','rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide','citoglipton', 'metformin',
        'glyburide-metformin','glipizide-metformin','glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone', 'diag_2', 'diag_3', 'change'], axis=1, inplace=True)

### Formatting data

In [99]:
df.loc[:, "age"] = df["age"].replace( ["[0-10)", "[10-20)", "[20-30)"], "30 years or younger")
df.loc[:, "age"] = df["age"].replace(["[30-40)", "[40-50)", "[50-60)"], "30-60 years")
df.loc[:, "age"] = df["age"].replace(["[60-70)", "[70-80)", "[80-90)", "[90-100)"], "Over 60 years")

In [100]:
df.rename(columns={ 'readmitted':'readmit_status', 'number_outpatient':'prior_outpatient', 'number_emergency': 'prior_emergency', 'number_inpatient':'prior_inpatient', 'diag_1':'primary_diagnosis', 'discharge_disposition_id':'discharge_destination', 'admission_source_id':'admission_source', 'diabetesMed':'diabetes_Med_prescribe'}, inplace=True)

In [101]:
df = df.drop(df[df.readmit_status ==  '>30'].index)
df['readmit_status'] = df['readmit_status'].replace({"NO":"not readmitted", "<30":"readmitted"})

In [102]:
df.loc[:, "primary_diagnosis"] = df["primary_diagnosis"].replace(
    regex={
        "[7][1-3][0-9]": "Musculoskeltal primary_diagnosis",
        "250.*": "Diabetes",
        "[4][6-9][0-9]|[5][0-1][0-9]|786": "Respitory Issues",
        "[5][8-9][0-9]|[6][0-2][0-9]|788": "Genitourinary Issues"
    }
)
diagnoses = ["Respitory Issues", "Diabetes", "Genitourinary Issues", "Musculoskeltal Issues"]
df.loc[:, "primary_diagnosis"] = df["primary_diagnosis"].apply(lambda x: x if x in diagnoses else "Other")

In [103]:
df = df[((df.discharge_destination != 11) & 
                                          (df.discharge_destination != 13) &
                                          (df.discharge_destination != 14) & 
                                          (df.discharge_destination != 19) & 
                                          (df.discharge_destination != 20) & 
                                          (df.discharge_destination != 21))] 

In [104]:
df.loc[:, "discharge_destination"] = (df.discharge_destination
                                            .apply(lambda x: "Discharged to Home" if x==1 else "Other"))

In [105]:
df.loc[:, "admission_source"] = df["admission_source"].apply(lambda x: x if x in ["Emergency", "Referral"] else "Other")

In [106]:
df.head()

Unnamed: 0,race,gender,age,discharge_destination,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,prior_outpatient,prior_emergency,prior_inpatient,primary_diagnosis,number_diagnoses,max_glu_serum,A1Cresult,insulin,diabetes_Med_prescribe,readmit_status
20446,Caucasian,Female,Over 60 years,Other,Other,7,58,2,15,0,0,0,Other,9,,,Steady,Yes,not readmitted
20737,Caucasian,Female,Over 60 years,Discharged to Home,Other,3,59,3,11,0,0,0,Other,6,,,Steady,Yes,not readmitted
20824,Caucasian,Female,Over 60 years,Discharged to Home,Other,4,56,1,9,0,0,0,Other,6,,,No,Yes,not readmitted
21083,Caucasian,Male,Over 60 years,Other,Other,10,68,1,18,0,0,0,Other,6,,,Steady,Yes,not readmitted
23879,Caucasian,Female,Over 60 years,Other,Other,12,77,5,19,0,0,0,Other,5,,>8,Steady,Yes,not readmitted


## Task 2: Training data

In [107]:
df = df.sample(frac=0.20)

In [108]:
train, test = train_test_split(df, train_size=0.80, random_state=1)

In [109]:
train.to_parquet('data/training_data.parquet')
test.to_parquet('data/testing_data.parquet')

In [112]:
# Split train and test data into features X and targets Y.
target_column_name = 'readmit_status'
Y_train = train[target_column_name]
X_train = train.drop([target_column_name], axis = 1)  
Y_test = test[target_column_name]
X_test = test.drop([target_column_name], axis = 1)  

# Transform string data to numeric one-hot vectors
categorical_selector = selector(dtype_exclude=np.number)
categorical_columns = categorical_selector(X_train)
categorial_encoder = OneHotEncoder(handle_unknown="ignore")

# Standardize numeric data by removing the mean and scaling to unit variance
numerical_selector = selector(dtype_include=np.number)
numerical_columns = numerical_selector(X_train)
numerical_encoder = StandardScaler()

# Create a preprocessor that will preprocess both numeric and categorical data
preprocessor = ColumnTransformer([
('categorical-encoder', categorial_encoder, categorical_columns),
('standard_scaler', numerical_encoder, numerical_columns)])

clf = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))

print("Training model...") 
model = clf.fit(X_train, Y_train)
print("Accuracy score: ", clf.score(X_test,Y_test))


Training model...
Accuracy score:  0.8433908045977011
