In [141]:
# Import the data-handling, plotting and numerical libraries.
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np

import warnings
# Blanket filter: every warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation and convergence warnings from
# the libraries used in later cells; consider narrowing it.
warnings.filterwarnings("ignore")

In [142]:
# Read the raw dataset from the CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer='./diabetic_data.csv')

In [143]:
# Separate the 'readmitted' target column (y) from the remaining feature columns (X).
y = data['readmitted']
X = data.drop(columns=['readmitted'])

In [144]:
# Turn the target into a binary label: both '>30' and '<30' are mapped to
# 'YES'; every other value is left as it is.
y = y.replace(to_replace=['>30', '<30'], value='YES')

In [145]:
# Feature columns that are removed before modelling.
columns_to_drop = [
    'encounter_id',
    'patient_nbr',
    'weight',
    'payer_code',
    'medical_specialty',
    'examide',
    'citoglipton',
]

X = X.drop(columns=columns_to_drop)

In [146]:
# Two-stage stratified split with a fixed seed:
#   1. hold out 20% of the data as the test set (the rest is the dev set),
#   2. hold out 20% of the dev set as the validation set (the rest is train).
from sklearn.model_selection import train_test_split

X_dev, X_test, y_dev, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=10,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev,
    test_size=0.2,
    stratify=y_dev,
    random_state=10,
)

In [147]:
# Replace '?' placeholders with the most frequent value of each column.
# The replacement values are learned from the training split only and then
# applied unchanged to the validation and test splits.
from sklearn.impute import SimpleImputer

feature_names = X.columns

imp = SimpleImputer(strategy='most_frequent', missing_values='?')
imp.fit(X_train)

# The imputer returns plain arrays; wrap each one back into a DataFrame that
# carries the original column names.
X_train, X_val, X_test = (
    pd.DataFrame(imp.transform(split), columns=feature_names)
    for split in (X_train, X_val, X_test)
)

In [148]:
# Encode the class labels as integers. The label mapping is learned from the
# training labels and reused for the validation and test labels.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y_train)

y_train, y_val, y_test = (
    pd.Series(le.transform(labels))
    for labels in (y_train, y_val, y_test)
)

In [149]:
# Build the column-wise preprocessing step:
#   * 'age' is passed through an OrdinalEncoder,
#   * the columns in te_features are passed through a TargetEncoder,
#   * every remaining column is passed through unchanged.
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from category_encoders import TargetEncoder

oe_features = ['age']

te_features = [
    'race', 'gender',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'diag_1', 'diag_2', 'diag_3',
    'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'insulin',
    'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
    'change', 'diabetesMed',
]

preprocess = make_column_transformer(
    (OrdinalEncoder(), oe_features),
    (TargetEncoder(), te_features),
    remainder='passthrough',
)

In [150]:
# Apply the column transformer to every split.
# The encoders are fitted on the training split only (y_train is passed in
# because the transformer is fitted with the training labels); the
# validation and test splits are transformed with the already-fitted
# encoders and are never used for fitting.
X_train = preprocess.fit_transform(X_train, y_train)
X_val = preprocess.transform(X_val)
X_test = preprocess.transform(X_test)

In [151]:
# Standardize the features.
# The scaler's per-feature mean and variance are learned from the training
# split only and then reused for the validation and test splits. Refitting
# the scaler on each held-out split (fit_transform) would standardize every
# split with its own statistics, so the same raw value would map to different
# scaled values depending on the split, and held-out data would influence the
# preprocessing.
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)
X_test = ss.transform(X_test)

In [152]:
# Persist the preprocessed splits as CSV files in the working directory.
# The feature matrices are wrapped in DataFrames first; the label objects
# are written directly. No index column is written.
for _features, _csv_path in (
    (X_train, 'X_train.csv'),
    (X_val, 'X_val.csv'),
    (X_test, 'X_test.csv'),
):
    pd.DataFrame(_features).to_csv(_csv_path, index=False)

for _labels, _csv_path in (
    (y_train, 'y_train.csv'),
    (y_val, 'y_val.csv'),
    (y_test, 'y_test.csv'),
):
    _labels.to_csv(_csv_path, index=False)

Now repeat the same preprocessing, this time fitting on the full dev set (train and validation combined) and applying it to the test set.

In [None]:
# Rebuild the preprocessing on the full dev set (no separate validation
# split) and apply it to the test set. Every fitted step (imputer, label
# encoder, column transformer, scaler) is fitted on the dev split only and
# then applied to the test split.
# Note: this cell overwrites X_test / y_test and the imp, le, preprocess and
# ss objects created by the earlier cells.
X_dev, X_test, y_dev, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 10)

# impute missing values ('?') with the most frequent value of each column
from sklearn.impute import SimpleImputer 

feature_names = X.columns

imp = SimpleImputer(missing_values = '?', strategy = 'most_frequent')

X_dev = pd.DataFrame(imp.fit_transform(X_dev), columns = feature_names)
X_test = pd.DataFrame(imp.transform(X_test), columns = feature_names)

# label encode the target variable
le = LabelEncoder() 

y_dev = pd.Series(le.fit_transform(y_dev))
y_test = pd.Series(le.transform(y_test))

# columns that are target encoded
te_features = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
              'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 
              'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 
              'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 
              'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 
              'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 
              'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

# columns that are ordinal encoded
oe_features = ['age']

# all columns not listed above are passed through unchanged
preprocess = make_column_transformer((OrdinalEncoder(), oe_features), 
                                    (TargetEncoder(), te_features), remainder = 'passthrough')

X_dev = preprocess.fit_transform(X_dev, y_dev)
X_test = preprocess.transform(X_test)

# Scale the data. The scaler is fitted on the dev split only; the test split
# is transformed with the dev statistics. Refitting on the test split would
# standardize it with its own mean/variance and make it inconsistent with the
# feature space the model is trained on.
ss = StandardScaler()

X_dev = ss.fit_transform(X_dev)
X_test = ss.transform(X_test)

# write the final dev/test splits to CSV (no index column)
pd.DataFrame(X_dev).to_csv('X_dev_final.csv', index = False)
pd.DataFrame(X_test).to_csv('X_test_final.csv', index = False)
y_dev.to_csv('y_dev_final.csv', index = False)
y_test.to_csv('y_test_final.csv', index = False)