In [2]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.sanitize_column_names import sanitize_column_names

In [136]:
load_dotenv()

True

In [None]:
filepath = os.getenv('DATASET_PATH')

In [3]:
df = pd.read_csv(filepath)

In [4]:
df

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,[80-90),14,77,1,30,0,0,0,Missing,Circulatory,Other,Circulatory,no,normal,no,no,yes
24996,[80-90),2,66,0,24,0,0,0,Missing,Digestive,Injury,Other,no,high,yes,yes,yes
24997,[70-80),5,12,0,6,0,1,0,Missing,Other,Other,Other,normal,no,no,no,yes
24998,[70-80),2,61,3,15,0,0,0,Family/GeneralPractice,Respiratory,Diabetes,Other,no,no,yes,yes,no


In [5]:
df.shape

(25000, 17)

In [6]:
df.columns

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency',
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test',
       'A1Ctest', 'change', 'diabetes_med', 'readmitted'],
      dtype='object')

In [7]:
# Checking for missing values

df.isnull().sum()

age                  0
time_in_hospital     0
n_lab_procedures     0
n_procedures         0
n_medications        0
n_outpatient         0
n_inpatient          0
n_emergency          0
medical_specialty    0
diag_1               0
diag_2               0
diag_3               0
glucose_test         0
A1Ctest              0
change               0
diabetes_med         0
readmitted           0
dtype: int64

### Splitting the dataset into training and testing sets

In [8]:
X = df.drop('readmitted', axis=1)
y = df['readmitted']

In [9]:
X

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,[80-90),14,77,1,30,0,0,0,Missing,Circulatory,Other,Circulatory,no,normal,no,no
24996,[80-90),2,66,0,24,0,0,0,Missing,Digestive,Injury,Other,no,high,yes,yes
24997,[70-80),5,12,0,6,0,1,0,Missing,Other,Other,Other,normal,no,no,no
24998,[70-80),2,61,3,15,0,0,0,Family/GeneralPractice,Respiratory,Diabetes,Other,no,no,yes,yes


In [10]:
y

0         no
1         no
2        yes
3        yes
4         no
        ... 
24995    yes
24996    yes
24997    yes
24998     no
24999    yes
Name: readmitted, Length: 25000, dtype: object

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
X_train.shape

(17500, 16)

In [13]:
y_train.shape

(17500,)

In [14]:
X_test.shape

(7500, 16)

In [15]:
y_test.shape

(7500,)

### Analyzing categorical features

In [16]:
# Analyzing age feature

X_train['age'].unique()

array(['[50-60)', '[80-90)', '[60-70)', '[70-80)', '[40-50)', '[90-100)'],
      dtype=object)

In [17]:
X_test['age'].unique()

array(['[50-60)', '[60-70)', '[80-90)', '[70-80)', '[90-100)', '[40-50)'],
      dtype=object)

In [18]:
encoder = OneHotEncoder()
age_encoded_train = encoder.fit_transform(X_train[['age']])
age_encoded_test = encoder.transform(X_test[['age']])

In [19]:
age_encoded_train.toarray()

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], shape=(17500, 6))

In [20]:
age_encoded_test.toarray()

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], shape=(7500, 6))

In [21]:
encoder.get_feature_names_out(['age'])

array(['age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)',
       'age_[80-90)', 'age_[90-100)'], dtype=object)

In [22]:
age_encoded_train_df = pd.DataFrame(age_encoded_train.toarray(), columns=encoder.get_feature_names_out(['age']))

In [23]:
age_encoded_train_df

Unnamed: 0,age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
17495,0.0,0.0,1.0,0.0,0.0,0.0
17496,0.0,1.0,0.0,0.0,0.0,0.0
17497,0.0,0.0,0.0,1.0,0.0,0.0
17498,0.0,0.0,1.0,0.0,0.0,0.0


In [24]:
age_encoded_test_df = pd.DataFrame(age_encoded_test.toarray(), columns=encoder.get_feature_names_out(['age']))

In [25]:
age_encoded_test_df

Unnamed: 0,age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
7495,0.0,0.0,0.0,1.0,0.0,0.0
7496,0.0,0.0,0.0,1.0,0.0,0.0
7497,0.0,0.0,0.0,0.0,1.0,0.0
7498,0.0,0.0,0.0,1.0,0.0,0.0


In [26]:
X_train.drop('age', axis=1, inplace=True)

In [27]:
X_test.drop('age', axis=1, inplace=True)

In [28]:
X_train.reset_index(inplace=True)
X_test.reset_index(inplace=True)

In [29]:
X_train

Unnamed: 0,index,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med
0,4913,9,66,1,27,0,0,0,InternalMedicine,Circulatory,Circulatory,Other,no,normal,yes,yes
1,9338,2,39,1,10,0,1,0,Other,Other,Circulatory,Diabetes,no,no,no,no
2,24211,1,23,0,3,0,0,0,Other,Diabetes,Other,Other,no,high,yes,yes
3,18791,5,54,4,21,0,0,0,Missing,Circulatory,Circulatory,Other,no,normal,no,yes
4,16066,4,54,0,9,0,1,0,Missing,Other,Diabetes,Circulatory,no,high,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,21575,7,44,0,27,4,3,0,Missing,Circulatory,Other,Other,no,no,no,no
17496,5390,4,62,0,33,0,2,0,Missing,Digestive,Respiratory,Respiratory,no,no,yes,yes
17497,860,3,13,1,32,0,0,0,Other,Musculoskeletal,Respiratory,Diabetes,no,no,yes,yes
17498,15795,8,58,4,18,0,0,0,Emergency/Trauma,Other,Circulatory,Other,no,high,yes,yes


In [30]:
X_test

Unnamed: 0,index,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med
0,6868,1,35,0,7,1,2,0,Missing,Circulatory,Circulatory,Other,no,no,no,no
1,24016,2,10,6,18,0,0,0,Missing,Circulatory,Respiratory,Circulatory,no,no,no,yes
2,9668,12,59,1,15,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,no,no
3,13640,2,32,0,19,0,0,0,Missing,Circulatory,Circulatory,Other,no,no,no,no
4,14018,1,48,0,18,0,0,0,InternalMedicine,Circulatory,Diabetes,Circulatory,no,no,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,21156,4,35,0,15,0,0,0,Family/GeneralPractice,Respiratory,Other,Other,no,no,yes,yes
7496,24654,5,40,1,12,0,0,0,Cardiology,Circulatory,Circulatory,Other,no,no,yes,yes
7497,14592,4,47,3,21,0,2,0,InternalMedicine,Circulatory,Respiratory,Diabetes,no,no,no,yes
7498,20160,3,39,0,12,0,0,0,Missing,Circulatory,Circulatory,Respiratory,no,no,no,no


In [31]:
X_train = pd.concat([X_train, age_encoded_train_df], axis=1)

In [32]:
X_train

Unnamed: 0,index,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,...,glucose_test,A1Ctest,change,diabetes_med,age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,4913,9,66,1,27,0,0,0,InternalMedicine,Circulatory,...,no,normal,yes,yes,0.0,1.0,0.0,0.0,0.0,0.0
1,9338,2,39,1,10,0,1,0,Other,Other,...,no,no,no,no,0.0,0.0,0.0,0.0,1.0,0.0
2,24211,1,23,0,3,0,0,0,Other,Diabetes,...,no,high,yes,yes,0.0,0.0,1.0,0.0,0.0,0.0
3,18791,5,54,4,21,0,0,0,Missing,Circulatory,...,no,normal,no,yes,0.0,0.0,0.0,1.0,0.0,0.0
4,16066,4,54,0,9,0,1,0,Missing,Other,...,no,high,no,yes,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,21575,7,44,0,27,4,3,0,Missing,Circulatory,...,no,no,no,no,0.0,0.0,1.0,0.0,0.0,0.0
17496,5390,4,62,0,33,0,2,0,Missing,Digestive,...,no,no,yes,yes,0.0,1.0,0.0,0.0,0.0,0.0
17497,860,3,13,1,32,0,0,0,Other,Musculoskeletal,...,no,no,yes,yes,0.0,0.0,0.0,1.0,0.0,0.0
17498,15795,8,58,4,18,0,0,0,Emergency/Trauma,Other,...,no,high,yes,yes,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
X_test = pd.concat([X_test, age_encoded_test_df], axis=1)

In [34]:
X_test

Unnamed: 0,index,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,...,glucose_test,A1Ctest,change,diabetes_med,age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,6868,1,35,0,7,1,2,0,Missing,Circulatory,...,no,no,no,no,0.0,1.0,0.0,0.0,0.0,0.0
1,24016,2,10,6,18,0,0,0,Missing,Circulatory,...,no,no,no,yes,0.0,0.0,1.0,0.0,0.0,0.0
2,9668,12,59,1,15,0,0,0,Missing,Circulatory,...,no,no,no,no,0.0,0.0,0.0,0.0,1.0,0.0
3,13640,2,32,0,19,0,0,0,Missing,Circulatory,...,no,no,no,no,0.0,0.0,0.0,0.0,1.0,0.0
4,14018,1,48,0,18,0,0,0,InternalMedicine,Circulatory,...,no,no,yes,yes,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,21156,4,35,0,15,0,0,0,Family/GeneralPractice,Respiratory,...,no,no,yes,yes,0.0,0.0,0.0,1.0,0.0,0.0
7496,24654,5,40,1,12,0,0,0,Cardiology,Circulatory,...,no,no,yes,yes,0.0,0.0,0.0,1.0,0.0,0.0
7497,14592,4,47,3,21,0,2,0,InternalMedicine,Circulatory,...,no,no,no,yes,0.0,0.0,0.0,0.0,1.0,0.0
7498,20160,3,39,0,12,0,0,0,Missing,Circulatory,...,no,no,no,no,0.0,0.0,0.0,1.0,0.0,0.0


In [35]:
X_train.drop('index', axis=1, inplace=True)

In [36]:
X_test.drop('index', axis=1, inplace=True)

In [37]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,...,glucose_test,A1Ctest,change,diabetes_med,age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,9,66,1,27,0,0,0,InternalMedicine,Circulatory,Circulatory,...,no,normal,yes,yes,0.0,1.0,0.0,0.0,0.0,0.0
1,2,39,1,10,0,1,0,Other,Other,Circulatory,...,no,no,no,no,0.0,0.0,0.0,0.0,1.0,0.0
2,1,23,0,3,0,0,0,Other,Diabetes,Other,...,no,high,yes,yes,0.0,0.0,1.0,0.0,0.0,0.0
3,5,54,4,21,0,0,0,Missing,Circulatory,Circulatory,...,no,normal,no,yes,0.0,0.0,0.0,1.0,0.0,0.0
4,4,54,0,9,0,1,0,Missing,Other,Diabetes,...,no,high,no,yes,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,7,44,0,27,4,3,0,Missing,Circulatory,Other,...,no,no,no,no,0.0,0.0,1.0,0.0,0.0,0.0
17496,4,62,0,33,0,2,0,Missing,Digestive,Respiratory,...,no,no,yes,yes,0.0,1.0,0.0,0.0,0.0,0.0
17497,3,13,1,32,0,0,0,Other,Musculoskeletal,Respiratory,...,no,no,yes,yes,0.0,0.0,0.0,1.0,0.0,0.0
17498,8,58,4,18,0,0,0,Emergency/Trauma,Other,Circulatory,...,no,high,yes,yes,0.0,0.0,1.0,0.0,0.0,0.0


In [38]:
X_test

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,...,glucose_test,A1Ctest,change,diabetes_med,age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,1,35,0,7,1,2,0,Missing,Circulatory,Circulatory,...,no,no,no,no,0.0,1.0,0.0,0.0,0.0,0.0
1,2,10,6,18,0,0,0,Missing,Circulatory,Respiratory,...,no,no,no,yes,0.0,0.0,1.0,0.0,0.0,0.0
2,12,59,1,15,0,0,0,Missing,Circulatory,Circulatory,...,no,no,no,no,0.0,0.0,0.0,0.0,1.0,0.0
3,2,32,0,19,0,0,0,Missing,Circulatory,Circulatory,...,no,no,no,no,0.0,0.0,0.0,0.0,1.0,0.0
4,1,48,0,18,0,0,0,InternalMedicine,Circulatory,Diabetes,...,no,no,yes,yes,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,4,35,0,15,0,0,0,Family/GeneralPractice,Respiratory,Other,...,no,no,yes,yes,0.0,0.0,0.0,1.0,0.0,0.0
7496,5,40,1,12,0,0,0,Cardiology,Circulatory,Circulatory,...,no,no,yes,yes,0.0,0.0,0.0,1.0,0.0,0.0
7497,4,47,3,21,0,2,0,InternalMedicine,Circulatory,Respiratory,...,no,no,no,yes,0.0,0.0,0.0,0.0,1.0,0.0
7498,3,39,0,12,0,0,0,Missing,Circulatory,Circulatory,...,no,no,no,no,0.0,0.0,0.0,1.0,0.0,0.0


In [39]:
# Analyzing medical_specialty feature

X_train['medical_specialty'].unique()

array(['InternalMedicine', 'Other', 'Missing', 'Emergency/Trauma',
       'Surgery', 'Cardiology', 'Family/GeneralPractice'], dtype=object)

In [40]:
X_test['medical_specialty'].unique()

array(['Missing', 'InternalMedicine', 'Emergency/Trauma', 'Surgery',
       'Cardiology', 'Other', 'Family/GeneralPractice'], dtype=object)

In [41]:
encoder = OneHotEncoder()
medspec_encoded_train = encoder.fit_transform(X_train[['medical_specialty']])
medspec_encoded_test = encoder.transform(X_test[['medical_specialty']])

In [42]:
medspec_encoded_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], shape=(17500, 7))

In [43]:
medspec_encoded_test.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(7500, 7))

In [44]:
encoder.get_feature_names_out(['medical_specialty'])

array(['medical_specialty_Cardiology',
       'medical_specialty_Emergency/Trauma',
       'medical_specialty_Family/GeneralPractice',
       'medical_specialty_InternalMedicine', 'medical_specialty_Missing',
       'medical_specialty_Other', 'medical_specialty_Surgery'],
      dtype=object)

In [45]:
medspec_encoded_train_df = pd.DataFrame(medspec_encoded_train.toarray(), columns=encoder.get_feature_names_out(['medical_specialty']))

In [46]:
medspec_encoded_train_df

Unnamed: 0,medical_specialty_Cardiology,medical_specialty_Emergency/Trauma,medical_specialty_Family/GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Missing,medical_specialty_Other,medical_specialty_Surgery
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
17495,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17496,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17497,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17498,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [47]:
medspec_encoded_test_df = pd.DataFrame(medspec_encoded_test.toarray(), columns=encoder.get_feature_names_out(['medical_specialty']))

In [48]:
medspec_encoded_test_df

Unnamed: 0,medical_specialty_Cardiology,medical_specialty_Emergency/Trauma,medical_specialty_Family/GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Missing,medical_specialty_Other,medical_specialty_Surgery
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
7495,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7496,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7497,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7498,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [49]:
X_train.drop('medical_specialty', axis=1, inplace=True)
X_test.drop('medical_specialty', axis=1, inplace=True)

In [50]:
X_train = pd.concat([X_train, medspec_encoded_train_df], axis=1)

In [51]:
X_test = pd.concat([X_test, medspec_encoded_test_df], axis=1)

In [52]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diag_1,diag_2,diag_3,...,age_[70-80),age_[80-90),age_[90-100),medical_specialty_Cardiology,medical_specialty_Emergency/Trauma,medical_specialty_Family/GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Missing,medical_specialty_Other,medical_specialty_Surgery
0,9,66,1,27,0,0,0,Circulatory,Circulatory,Other,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,39,1,10,0,1,0,Other,Circulatory,Diabetes,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,23,0,3,0,0,0,Diabetes,Other,Other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,54,4,21,0,0,0,Circulatory,Circulatory,Other,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,54,0,9,0,1,0,Other,Diabetes,Circulatory,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,7,44,0,27,4,3,0,Circulatory,Other,Other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17496,4,62,0,33,0,2,0,Digestive,Respiratory,Respiratory,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17497,3,13,1,32,0,0,0,Musculoskeletal,Respiratory,Diabetes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17498,8,58,4,18,0,0,0,Other,Circulatory,Other,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [53]:
X_test

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diag_1,diag_2,diag_3,...,age_[70-80),age_[80-90),age_[90-100),medical_specialty_Cardiology,medical_specialty_Emergency/Trauma,medical_specialty_Family/GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Missing,medical_specialty_Other,medical_specialty_Surgery
0,1,35,0,7,1,2,0,Circulatory,Circulatory,Other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2,10,6,18,0,0,0,Circulatory,Respiratory,Circulatory,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,12,59,1,15,0,0,0,Circulatory,Circulatory,Circulatory,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2,32,0,19,0,0,0,Circulatory,Circulatory,Other,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,48,0,18,0,0,0,Circulatory,Diabetes,Circulatory,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,4,35,0,15,0,0,0,Respiratory,Other,Other,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7496,5,40,1,12,0,0,0,Circulatory,Circulatory,Other,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7497,4,47,3,21,0,2,0,Circulatory,Respiratory,Diabetes,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7498,3,39,0,12,0,0,0,Circulatory,Circulatory,Respiratory,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [54]:
X_train.columns

Index(['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications',
       'n_outpatient', 'n_inpatient', 'n_emergency', 'diag_1', 'diag_2',
       'diag_3', 'glucose_test', 'A1Ctest', 'change', 'diabetes_med',
       'age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)',
       'age_[80-90)', 'age_[90-100)', 'medical_specialty_Cardiology',
       'medical_specialty_Emergency/Trauma',
       'medical_specialty_Family/GeneralPractice',
       'medical_specialty_InternalMedicine', 'medical_specialty_Missing',
       'medical_specialty_Other', 'medical_specialty_Surgery'],
      dtype='object')

In [55]:
# Analyzing diag features

X_train['diag_1'].unique()

array(['Circulatory', 'Other', 'Diabetes', 'Injury', 'Digestive',
       'Respiratory', 'Musculoskeletal', 'Missing'], dtype=object)

In [56]:
X_test['diag_1'].unique()

array(['Circulatory', 'Respiratory', 'Other', 'Digestive',
       'Musculoskeletal', 'Injury', 'Diabetes'], dtype=object)

In [57]:
X_train['diag_2'].unique()

array(['Circulatory', 'Other', 'Diabetes', 'Musculoskeletal', 'Digestive',
       'Respiratory', 'Injury', 'Missing'], dtype=object)

In [58]:
X_test['diag_2'].unique()

array(['Circulatory', 'Respiratory', 'Diabetes', 'Other', 'Digestive',
       'Injury', 'Musculoskeletal', 'Missing'], dtype=object)

In [59]:
X_train['diag_3'].unique()

array(['Other', 'Diabetes', 'Circulatory', 'Respiratory', 'Missing',
       'Digestive', 'Injury', 'Musculoskeletal'], dtype=object)

In [60]:
X_test['diag_3'].unique()

array(['Other', 'Circulatory', 'Diabetes', 'Digestive', 'Respiratory',
       'Injury', 'Missing', 'Musculoskeletal'], dtype=object)

In [61]:
encoder = OneHotEncoder(handle_unknown='ignore')
diag_encoded_train = encoder.fit_transform(X_train[['diag_1', 'diag_2', 'diag_3']])
diag_encoded_test = encoder.transform(X_test[['diag_1', 'diag_2', 'diag_3']])

In [62]:
diag_encoded_train.toarray()

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], shape=(17500, 24))

In [63]:
diag_encoded_test.toarray()

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(7500, 24))

In [64]:
encoder.get_feature_names_out(['diag_1', 'diag_2', 'diag_3'])

array(['diag_1_Circulatory', 'diag_1_Diabetes', 'diag_1_Digestive',
       'diag_1_Injury', 'diag_1_Missing', 'diag_1_Musculoskeletal',
       'diag_1_Other', 'diag_1_Respiratory', 'diag_2_Circulatory',
       'diag_2_Diabetes', 'diag_2_Digestive', 'diag_2_Injury',
       'diag_2_Missing', 'diag_2_Musculoskeletal', 'diag_2_Other',
       'diag_2_Respiratory', 'diag_3_Circulatory', 'diag_3_Diabetes',
       'diag_3_Digestive', 'diag_3_Injury', 'diag_3_Missing',
       'diag_3_Musculoskeletal', 'diag_3_Other', 'diag_3_Respiratory'],
      dtype=object)

In [65]:
diag_encoded_train_df = pd.DataFrame(diag_encoded_train.toarray(), columns=encoder.get_feature_names_out(['diag_1', 'diag_2', 'diag_3']))

In [66]:
diag_encoded_train_df

Unnamed: 0,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Injury,diag_1_Missing,diag_1_Musculoskeletal,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17496,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17497,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17498,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [67]:
diag_encoded_test_df = pd.DataFrame(diag_encoded_test.toarray(), columns=encoder.get_feature_names_out(['diag_1', 'diag_2', 'diag_3']))

In [68]:
diag_encoded_test_df

Unnamed: 0,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Injury,diag_1_Missing,diag_1_Musculoskeletal,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7496,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7497,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [69]:
X_train.drop(['diag_1', 'diag_2', 'diag_3'], axis=1, inplace=True)

In [70]:
X_test.drop(['diag_1', 'diag_2', 'diag_3'], axis=1, inplace=True)

In [71]:
X_train = pd.concat([X_train, diag_encoded_train_df], axis=1)

In [72]:
X_test = pd.concat([X_test, diag_encoded_test_df], axis=1)

In [73]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,9,66,1,27,0,0,0,no,normal,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,39,1,10,0,1,0,no,no,no,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,23,0,3,0,0,0,no,high,yes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,54,4,21,0,0,0,no,normal,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,54,0,9,0,1,0,no,high,no,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,7,44,0,27,4,3,0,no,no,no,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17496,4,62,0,33,0,2,0,no,no,yes,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17497,3,13,1,32,0,0,0,no,no,yes,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17498,8,58,4,18,0,0,0,no,high,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [74]:
X_test

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,1,35,0,7,1,2,0,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,10,6,18,0,0,0,no,no,no,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,59,1,15,0,0,0,no,no,no,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,32,0,19,0,0,0,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,48,0,18,0,0,0,no,no,yes,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,4,35,0,15,0,0,0,no,no,yes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7496,5,40,1,12,0,0,0,no,no,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7497,4,47,3,21,0,2,0,no,no,no,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,3,39,0,12,0,0,0,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [75]:
X_train.shape

(17500, 48)

In [76]:
X_test.shape

(7500, 48)

In [77]:
X_train.columns

Index(['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications',
       'n_outpatient', 'n_inpatient', 'n_emergency', 'glucose_test', 'A1Ctest',
       'change', 'diabetes_med', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)',
       'age_[70-80)', 'age_[80-90)', 'age_[90-100)',
       'medical_specialty_Cardiology', 'medical_specialty_Emergency/Trauma',
       'medical_specialty_Family/GeneralPractice',
       'medical_specialty_InternalMedicine', 'medical_specialty_Missing',
       'medical_specialty_Other', 'medical_specialty_Surgery',
       'diag_1_Circulatory', 'diag_1_Diabetes', 'diag_1_Digestive',
       'diag_1_Injury', 'diag_1_Missing', 'diag_1_Musculoskeletal',
       'diag_1_Other', 'diag_1_Respiratory', 'diag_2_Circulatory',
       'diag_2_Diabetes', 'diag_2_Digestive', 'diag_2_Injury',
       'diag_2_Missing', 'diag_2_Musculoskeletal', 'diag_2_Other',
       'diag_2_Respiratory', 'diag_3_Circulatory', 'diag_3_Diabetes',
       'diag_3_Digestive', 'diag_3_Injury

In [78]:
# Analyzing glucose_test feature

X_train['glucose_test'].unique()

array(['no', 'normal', 'high'], dtype=object)

In [79]:
X_test['glucose_test'].unique()

array(['no', 'normal', 'high'], dtype=object)

In [80]:
encoder = OrdinalEncoder(categories=[['no', 'normal', 'high']])
glc_encoded_train = encoder.fit_transform(X_train[['glucose_test']])
glc_encoded_test = encoder.transform(X_test[['glucose_test']])

In [81]:
glc_encoded_train

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], shape=(17500, 1))

In [82]:
glc_encoded_test

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], shape=(7500, 1))

In [83]:
encoder.categories_

[array(['no', 'normal', 'high'], dtype=object)]

no - 0<br>
normal - 1<br>
high - 2<br>

In [84]:
X_train['glucose_test'] = glc_encoded_train

In [85]:
X_test['glucose_test'] = glc_encoded_test

In [86]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,9,66,1,27,0,0,0,0.0,normal,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,39,1,10,0,1,0,0.0,no,no,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,23,0,3,0,0,0,0.0,high,yes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,54,4,21,0,0,0,0.0,normal,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,54,0,9,0,1,0,0.0,high,no,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,7,44,0,27,4,3,0,0.0,no,no,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17496,4,62,0,33,0,2,0,0.0,no,yes,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17497,3,13,1,32,0,0,0,0.0,no,yes,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17498,8,58,4,18,0,0,0,0.0,high,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [87]:
X_train.columns

Index(['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications',
       'n_outpatient', 'n_inpatient', 'n_emergency', 'glucose_test', 'A1Ctest',
       'change', 'diabetes_med', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)',
       'age_[70-80)', 'age_[80-90)', 'age_[90-100)',
       'medical_specialty_Cardiology', 'medical_specialty_Emergency/Trauma',
       'medical_specialty_Family/GeneralPractice',
       'medical_specialty_InternalMedicine', 'medical_specialty_Missing',
       'medical_specialty_Other', 'medical_specialty_Surgery',
       'diag_1_Circulatory', 'diag_1_Diabetes', 'diag_1_Digestive',
       'diag_1_Injury', 'diag_1_Missing', 'diag_1_Musculoskeletal',
       'diag_1_Other', 'diag_1_Respiratory', 'diag_2_Circulatory',
       'diag_2_Diabetes', 'diag_2_Digestive', 'diag_2_Injury',
       'diag_2_Missing', 'diag_2_Musculoskeletal', 'diag_2_Other',
       'diag_2_Respiratory', 'diag_3_Circulatory', 'diag_3_Diabetes',
       'diag_3_Digestive', 'diag_3_Injury

In [88]:
# Analyzing the A1Ctest feature

X_train['A1Ctest'].unique()

array(['normal', 'no', 'high'], dtype=object)

In [89]:
encoder = OrdinalEncoder(categories=[['no', 'normal', 'high']])
A1C_encoded_train = encoder.fit_transform(X_train[['A1Ctest']])
A1C_encoded_test = encoder.transform(X_test[['A1Ctest']])

In [90]:
X_train['A1Ctest'] = A1C_encoded_train

In [91]:
X_test['A1Ctest'] = A1C_encoded_test

In [92]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,9,66,1,27,0,0,0,0.0,1.0,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,39,1,10,0,1,0,0.0,0.0,no,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,23,0,3,0,0,0,0.0,2.0,yes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,54,4,21,0,0,0,0.0,1.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,54,0,9,0,1,0,0.0,2.0,no,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,7,44,0,27,4,3,0,0.0,0.0,no,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17496,4,62,0,33,0,2,0,0.0,0.0,yes,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17497,3,13,1,32,0,0,0,0.0,0.0,yes,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17498,8,58,4,18,0,0,0,0.0,2.0,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [93]:
X_test

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,1,35,0,7,1,2,0,0.0,0.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,10,6,18,0,0,0,0.0,0.0,no,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,59,1,15,0,0,0,0.0,0.0,no,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,32,0,19,0,0,0,0.0,0.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,48,0,18,0,0,0,0.0,0.0,yes,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,4,35,0,15,0,0,0,0.0,0.0,yes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7496,5,40,1,12,0,0,0,0.0,0.0,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7497,4,47,3,21,0,2,0,0.0,0.0,no,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,3,39,0,12,0,0,0,0.0,0.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [94]:
X_train.columns

Index(['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications',
       'n_outpatient', 'n_inpatient', 'n_emergency', 'glucose_test', 'A1Ctest',
       'change', 'diabetes_med', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)',
       'age_[70-80)', 'age_[80-90)', 'age_[90-100)',
       'medical_specialty_Cardiology', 'medical_specialty_Emergency/Trauma',
       'medical_specialty_Family/GeneralPractice',
       'medical_specialty_InternalMedicine', 'medical_specialty_Missing',
       'medical_specialty_Other', 'medical_specialty_Surgery',
       'diag_1_Circulatory', 'diag_1_Diabetes', 'diag_1_Digestive',
       'diag_1_Injury', 'diag_1_Missing', 'diag_1_Musculoskeletal',
       'diag_1_Other', 'diag_1_Respiratory', 'diag_2_Circulatory',
       'diag_2_Diabetes', 'diag_2_Digestive', 'diag_2_Injury',
       'diag_2_Missing', 'diag_2_Musculoskeletal', 'diag_2_Other',
       'diag_2_Respiratory', 'diag_3_Circulatory', 'diag_3_Diabetes',
       'diag_3_Digestive', 'diag_3_Injury

In [95]:
X_train['change'].unique()

array(['yes', 'no'], dtype=object)

In [96]:
X_test['change'].unique()

array(['no', 'yes'], dtype=object)

In [97]:
X_train['diabetes_med'].unique()

array(['yes', 'no'], dtype=object)

In [98]:
X_test['diabetes_med'].unique()

array(['no', 'yes'], dtype=object)

In [99]:
encoder = LabelEncoder()
encoded_diab_train = encoder.fit_transform(X_train[['diabetes_med']].to_numpy().ravel())
encoded_diab_test = encoder.transform(X_test[['diabetes_med']].to_numpy().ravel())

In [100]:
encoded_diab_train

array([1, 0, 1, ..., 1, 1, 1], shape=(17500,))

In [101]:
encoded_diab_test

array([0, 1, 0, ..., 1, 0, 1], shape=(7500,))

In [102]:
X_train['diabetes_med'] = encoded_diab_train

In [103]:
X_test['diabetes_med'] = encoded_diab_test

In [104]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,9,66,1,27,0,0,0,0.0,1.0,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,39,1,10,0,1,0,0.0,0.0,no,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,23,0,3,0,0,0,0.0,2.0,yes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,54,4,21,0,0,0,0.0,1.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,54,0,9,0,1,0,0.0,2.0,no,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,7,44,0,27,4,3,0,0.0,0.0,no,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17496,4,62,0,33,0,2,0,0.0,0.0,yes,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17497,3,13,1,32,0,0,0,0.0,0.0,yes,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17498,8,58,4,18,0,0,0,0.0,2.0,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [105]:
X_test

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,1,35,0,7,1,2,0,0.0,0.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,10,6,18,0,0,0,0.0,0.0,no,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,59,1,15,0,0,0,0.0,0.0,no,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,32,0,19,0,0,0,0.0,0.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,48,0,18,0,0,0,0.0,0.0,yes,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,4,35,0,15,0,0,0,0.0,0.0,yes,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7496,5,40,1,12,0,0,0,0.0,0.0,yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7497,4,47,3,21,0,2,0,0.0,0.0,no,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,3,39,0,12,0,0,0,0.0,0.0,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [106]:
encoder = LabelEncoder()
encoded_change_train = encoder.fit_transform(X_train[['change']].to_numpy().ravel())
encoded_change_test = encoder.transform(X_test[['change']].to_numpy().ravel())

In [107]:
encoded_change_train

array([1, 0, 1, ..., 1, 1, 1], shape=(17500,))

In [108]:
encoded_change_test

array([0, 0, 0, ..., 0, 0, 1], shape=(7500,))

In [109]:
X_train['change'] = encoded_change_train

In [110]:
X_test['change'] = encoded_change_test

In [111]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,9,66,1,27,0,0,0,0.0,1.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,39,1,10,0,1,0,0.0,0.0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,23,0,3,0,0,0,0.0,2.0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,54,4,21,0,0,0,0.0,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,54,0,9,0,1,0,0.0,2.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,7,44,0,27,4,3,0,0.0,0.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17496,4,62,0,33,0,2,0,0.0,0.0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17497,3,13,1,32,0,0,0,0.0,0.0,1,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17498,8,58,4,18,0,0,0,0.0,2.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [112]:
X_test

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,1,35,0,7,1,2,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,10,6,18,0,0,0,0.0,0.0,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,59,1,15,0,0,0,0.0,0.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,32,0,19,0,0,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,48,0,18,0,0,0,0.0,0.0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,4,35,0,15,0,0,0,0.0,0.0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7496,5,40,1,12,0,0,0,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7497,4,47,3,21,0,2,0,0.0,0.0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,3,39,0,12,0,0,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [113]:
X_train.columns

Index(['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications',
       'n_outpatient', 'n_inpatient', 'n_emergency', 'glucose_test', 'A1Ctest',
       'change', 'diabetes_med', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)',
       'age_[70-80)', 'age_[80-90)', 'age_[90-100)',
       'medical_specialty_Cardiology', 'medical_specialty_Emergency/Trauma',
       'medical_specialty_Family/GeneralPractice',
       'medical_specialty_InternalMedicine', 'medical_specialty_Missing',
       'medical_specialty_Other', 'medical_specialty_Surgery',
       'diag_1_Circulatory', 'diag_1_Diabetes', 'diag_1_Digestive',
       'diag_1_Injury', 'diag_1_Missing', 'diag_1_Musculoskeletal',
       'diag_1_Other', 'diag_1_Respiratory', 'diag_2_Circulatory',
       'diag_2_Diabetes', 'diag_2_Digestive', 'diag_2_Injury',
       'diag_2_Missing', 'diag_2_Musculoskeletal', 'diag_2_Other',
       'diag_2_Respiratory', 'diag_3_Circulatory', 'diag_3_Diabetes',
       'diag_3_Digestive', 'diag_3_Injury

### Analyzing numerical features

In [114]:
X_train['time_in_hospital'].unique()

array([ 9,  2,  1,  5,  4,  3,  7,  6,  8, 14, 12, 13, 11, 10])

In [115]:
X_train['n_lab_procedures'].unique()

array([ 66,  39,  23,  54,  47,  44,  62,  46,  45,   8,  60,  73,  42,
        83,  50,  40,  71,  24,   4,  43,  11,  72,  18,  49,  57,  59,
        31,   1,  70,   2,  33,  61,  17,  21,  36,  29,  69,  67,  56,
        37,  38, 101,  48,  55,  35,  86,  90,  12,  63,  51,  53,  68,
        41,  16,  77,  26,  30,  15,   3,  25,  64,  22,  34,  52,  65,
        10,  32,  28,  85,  93,  20,   7,  58,   6,  79,  74,  19,  13,
        76,  75,   9,  80,  78,  27,  88,  81,  84,  97,  89,  87,  95,
        94,  82,   5,  92,  91,  14, 106,  98, 103, 111,  99, 108, 105,
        96, 100, 113])

In [116]:
X_train['n_procedures'].unique()

array([1, 0, 4, 3, 2, 6, 5])

In [117]:
X_train['n_medications'].unique()

array([27, 10,  3, 21,  9, 14, 12, 11, 13, 16,  6, 17, 31, 26, 19, 23, 15,
        7, 36, 18, 37, 22,  5, 25, 24, 20, 28, 33,  8, 29, 34, 35,  2, 32,
       30, 53, 38, 52, 56, 42,  1,  4, 47, 41, 51, 40, 46, 43, 44, 59, 61,
       39, 45, 50, 60, 69, 48, 49, 58, 63, 57, 62, 55, 68, 54, 75, 79, 72])

In [118]:
X_train['n_outpatient'].unique()

array([ 0,  1, 11,  4,  3,  2, 14,  6,  5, 10, 13,  7,  9, 15,  8, 27, 12,
       21, 18, 16, 23])

In [119]:
X_train['n_inpatient'].unique()

array([ 0,  1,  7,  2,  5,  8,  3,  4, 13,  6,  9, 10, 11, 12, 15, 14])

In [120]:
X_train['n_emergency'].unique()

array([ 0,  1,  3,  2,  5,  4,  6,  7, 11, 64, 18, 12, 10, 16,  9,  8, 28,
       19, 21, 37, 13])

In [121]:
# Applying Min-Max Scaling to numerical features

scaler = MinMaxScaler()
numerical_features = ['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency']
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [122]:
X_train

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,0.615385,0.580357,0.166667,0.333333,0.000000,0.000000,0.0,0.0,1.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.076923,0.339286,0.166667,0.115385,0.000000,0.066667,0.0,0.0,0.0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.196429,0.000000,0.025641,0.000000,0.000000,0.0,0.0,2.0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.307692,0.473214,0.666667,0.256410,0.000000,0.000000,0.0,0.0,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.230769,0.473214,0.000000,0.102564,0.000000,0.066667,0.0,0.0,2.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,0.461538,0.383929,0.000000,0.333333,0.148148,0.200000,0.0,0.0,0.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17496,0.230769,0.544643,0.000000,0.410256,0.000000,0.133333,0.0,0.0,0.0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17497,0.153846,0.107143,0.166667,0.397436,0.000000,0.000000,0.0,0.0,0.0,1,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17498,0.538462,0.508929,0.666667,0.217949,0.000000,0.000000,0.0,0.0,2.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [123]:
X_test

Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,glucose_test,A1Ctest,change,...,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Missing,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory
0,0.000000,0.303571,0.000000,0.076923,0.037037,0.133333,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.076923,0.080357,1.000000,0.217949,0.000000,0.000000,0.0,0.0,0.0,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.846154,0.517857,0.166667,0.179487,0.000000,0.000000,0.0,0.0,0.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.076923,0.276786,0.000000,0.230769,0.000000,0.000000,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000000,0.419643,0.000000,0.217949,0.000000,0.000000,0.0,0.0,0.0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0.230769,0.303571,0.000000,0.179487,0.000000,0.000000,0.0,0.0,0.0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7496,0.307692,0.348214,0.166667,0.141026,0.000000,0.000000,0.0,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7497,0.230769,0.410714,0.500000,0.256410,0.000000,0.133333,0.0,0.0,0.0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,0.153846,0.339286,0.000000,0.141026,0.000000,0.000000,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [125]:
X_train = sanitize_column_names(X_train)
X_test = sanitize_column_names(X_test)

In [None]:
try:
    os.makedirs('../data', exist_ok=True)
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
X_train.to_csv('../data/train_features.csv', index=False)
X_test.to_csv('../data/test_features.csv', index=False)

### Analyzing output feature

In [127]:
y_train.unique()

array(['no', 'yes'], dtype=object)

In [128]:
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [129]:
y_train_encoded

array([0, 1, 0, ..., 0, 0, 1], shape=(17500,))

In [130]:
y_test_encoded

array([1, 0, 0, ..., 1, 1, 0], shape=(7500,))

In [131]:
y_train_series = pd.Series(y_train_encoded, name='readmission_status')
y_test_series = pd.Series(y_test_encoded, name='readmission_status')

In [None]:
y_train_series.to_csv('../data/train_labels.csv', index=False)
y_test_series.to_csv('../data/test_labels.csv', index=False)