In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from scipy import sparse

In [2]:
#load data
df = pd.read_csv('diabetic_data.csv')

df = df.replace('?', np.nan)

print("Missing values before handling:")
print(df.isnull().sum())

Missing values before handling:
encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride     

In [3]:
# misssing values
threshold = len(df) * 0.5
df = df.dropna(thresh=threshold, axis=1)

# fill missing values
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna(df[column].mode()[0])
    else:
        df[column] = df[column].fillna(df[column].median())

# remove duplicates
df = df.drop_duplicates()

categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df[column] = df[column].astype('category')

# feature engineering
df['interaction_1'] = df['time_in_hospital'] * df['num_lab_procedures']
df['age_binned'] = pd.cut(df['age'].str.extract(r'(\d+)', expand=False).astype(int), bins=[0, 20, 40, 60, 80, 100], labels=False)

# display
print("Data after cleaning:")
print(df.head())

Data after cleaning:
   encounter_id  patient_nbr             race  gender      age  \
0       2278392      8222157        Caucasian  Female   [0-10)   
1        149190     55629189        Caucasian  Female  [10-20)   
2         64410     86047875  AfricanAmerican  Female  [20-30)   
3        500364     82442376        Caucasian    Male  [30-40)   
4         16680     42519267        Caucasian    Male  [40-50)   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital payer_code  ... glyburide-metformin  glipizide-metformin  \
0                 1         MC  ...                  No                   N

In [4]:
#save data
df.to_csv('processed_data.csv', index=False)

In [5]:
#display
print("Column names:")
print(df.columns)

Column names:
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'readmitted', 'interaction_1', 'age_binned'],
      dtype='object')


In [6]:
# target variable
target_variable = 'readmitted' 

# check if its in dataframe
if target_variable not in df.columns:
    raise KeyError(f"Target variable '{target_variable}' not found in the DataFrame columns")

y = df[target_variable].map({'NO': 0, '>30': 1, '<30': 1})  # Assuming '>30' and '<30' both mean readmitted

# check unique value
print(f"Unique values in the target variable '{target_variable}': {y.unique()}")

Unique values in the target variable 'readmitted': [0 1]


In [7]:
# drop target
X = df.drop(target_variable, axis=1)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False, interaction_only=True))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', categorical_transformer, X.select_dtypes(include=['category']).columns)
    ])
print("Feature columns before splitting:")
print(X.columns)

Feature columns before splitting:
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'interaction_1', 'age_binned'],
      dtype='object')


In [8]:
# split into traing and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training feature columns after splitting:")
print(X_train.columns)

Training feature columns after splitting:
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'interaction_1', 'age_binned'],
      dtype='object')


In [9]:
try:
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)
except ValueError as e:
    print("Error during transformation:", e)
    print("Preprocessor transformers:", preprocessor.transformers)
    print("Numeric columns:", X.select_dtypes(include=['int64', 'float64']).columns)
    print("Categorical columns:", X.select_dtypes(include=['category']).columns)
    raise e

print("Transformed training data:")
print(X_train[:5])



Transformed training data:
  (0, 0)	-0.8127173412879903
  (0, 1)	-1.4006156069663733
  (0, 2)	-0.7081080555791333
  (0, 3)	-0.5136588030061343
  (0, 4)	0.30695872007596015
  (0, 5)	-0.13411891096264028
  (0, 6)	0.24939450137817137
  (0, 7)	-0.7848791681292062
  (0, 8)	-0.6168449207523379
  (0, 9)	-0.29117075054098907
  (0, 10)	-0.21415416149522598
  (0, 11)	-0.5022606446950393
  (0, 12)	0.8159911338807437
  (0, 13)	-0.0823738375821849
  (0, 14)	0.8539356760750139
  (0, 15)	1.1383045922601758
  (0, 16)	0.5754916962748817
  (0, 17)	0.41745941670831704
  (0, 18)	-0.24947067486529878
  (0, 19)	0.10900076473399771
  (0, 20)	-0.20268723609191147
  (0, 21)	0.637884910754298
  (0, 22)	0.5013205639808411
  (0, 23)	0.2366395182405013
  (0, 24)	0.17404680075615897
  :	:
  (4, 115)	0.2550674013235307
  (4, 116)	0.24534028619226755
  (4, 117)	0.26540215457601213
  (4, 118)	0.2552809187761788
  (4, 119)	0.758117298053189
  (4, 132)	1.0
  (4, 141)	1.0
  (4, 169)	1.0
  (4, 448)	1.0
  (4, 1206)	1.0
  (

In [10]:
# save for modeling
sparse.save_npz('preprocessed_train_data.npz', X_train)
sparse.save_npz('preprocessed_test_data.npz', X_test)

np.savetxt('train_labels.csv', y_train, delimiter=",")
np.savetxt('test_labels.csv', y_test, delimiter=",")