# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('diabetic_data.csv')

In [3]:
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


# Handle Missing Values

In [4]:
# max, weight, citoglipton, medical_specialty and A1Cresult has more than 85% null values so reoving it.
# payer_code doesnt affect on the result 
df.drop(columns=['max_glu_serum','A1Cresult','weight','citoglipton','examide','payer_code','medical_specialty'],inplace=True)

In [5]:
df['race'].value_counts()

race
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64

In [6]:
# replace any occurrences of '?' in the 'race' column with 'Other'.
df['race'] = df['race'].apply(lambda x: 'Other' if x == '?' else x)

In [7]:
# removing rows with unkown gender type
df.drop(index=[30506, 75551, 82573],inplace=True)

In [8]:
# removing null values from diag_1,2 and 3.
index=[]
index=list(df[df['diag_1']=='?'].index)
index.extend(df[df['diag_2']=='?'].index)
index.extend(df[df['diag_3']=='?'].index)

In [9]:
df.drop(index=index,inplace=True)

# Features Importance

In [10]:
def replace_age_ranges(feature):
  age_ranges = ['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)','[60-70)', '[70-80)', '[80-90)', '[90-100)']
  values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  
  # The function iterates through the age_ranges and replaces each occurrence in the 'feature' with the corresponding numerical value.
  for i, age_range in enumerate(age_ranges):
    feature = feature.replace(age_range, values[i])

  return feature

In [11]:
df['age']=replace_age_ranges(df['age'])

In [12]:
# It iterates through each value in the 'col' column of the DataFrame 'df' and assigns a cluster value based on certain conditions.

def map_icd9_to_cluster(value):
    if 'V' in value or 'E' in value:
        return 18
    try:
        value = float(value)
        if 1 <= value <= 139:
            return 1
        elif 140 <= value <= 239:
            return 2
        elif 240 <= value <= 279:
            return 3
        elif 280 <= value <= 289:
            return 4
        elif 290 <= value <= 319:
            return 5
        elif 320 <= value <= 389:
            return 6
        elif 390 <= value <= 459:
            return 7
        elif 460 <= value <= 519:
            return 8
        elif 520 <= value <= 579:
            return 9
        elif 580 <= value <= 629:
            return 10
        elif 630 <= value <= 679:
            return 11
        elif 680 <= value <= 709:
            return 12
        elif 710 <= value <= 739:
            return 13
        elif 740 <= value <= 759:
            return 14
        elif 760 <= value <= 779:
            return 15
        elif 780 <= value <= 799:
            return 16
        elif 800 <= value <= 999:
            return 17
    except ValueError:
        # Return NaN for invalid values
        return np.nan

In [13]:
# Apply the function to each column using vectorized operations
for col in ['diag_1', 'diag_2', 'diag_3']:
    df[col] = df[col].apply(map_icd9_to_cluster)

In [14]:
df['change']=df['change'].apply(lambda x: 'Yes' if x=='Ch' else x)
df['change']=df['change'].apply(lambda x: 0 if x=='No' else 1)

# Encoding Categorical Features

In [15]:
# Iterating through each medicine column and converting 'No' to 0 and other values to 1.
medicines = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']

for col in medicines:
    df[col] = df[col].apply(lambda x: 0 if x == 'No' else 1)

In [16]:
df['gender']=df['gender'].apply(lambda x: 0 if x=='Female' else 1)

In [17]:
df['diabetesMed']=df['diabetesMed'].apply(lambda x : 0 if x=='No' else 1)

In [18]:
df['readmitted']=df['readmitted'].apply(lambda x : 1 if x=='<30' else 0)

In [19]:
# we have found that there are many entries of some users in dataset it will make our ML algorithm biased so removing them.
df.drop_duplicates(subset='patient_nbr',keep='first',inplace=True)

In [20]:
df.drop(columns=['encounter_id','patient_nbr'],inplace=True)

In [21]:
# used label encoder for race feature.
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['race']=encoder.fit_transform(df['race'])

In [22]:
df.shape

(70413, 41)

In [23]:
df['readmitted'].value_counts()

readmitted
0    64163
1     6250
Name: count, dtype: int64

In [24]:
unseen_data = df.iloc[-2:].copy()
df = df.iloc[:-2].reset_index(drop=True)
unseen_data = unseen_data.drop(columns=['readmitted'])
unseen_data

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
101758,2,0,9,1,1,7,5,76,1,22,...,0,0,1,0,0,0,0,0,1,1
101765,2,1,8,1,1,7,6,13,3,3,...,0,0,0,0,0,0,0,0,0,0


# Model Building

In [25]:
X=df.drop(columns='readmitted')
y=df['readmitted']

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [27]:
%%time
from imblearn.over_sampling import SMOTEN
sm=SMOTEN(sampling_strategy='minority',random_state=42,n_jobs=-1)
X_res, y_res = sm.fit_resample(X_train, y_train)

CPU times: total: 1min 52s
Wall time: 1min 54s


In [42]:
import tensorflow as tf

class CustomInitializer(tf.keras.initializers.Initializer):
    def __init__(self, mean=0.0, stddev=0.05): # Default Initialization
        self.mean = mean
        self.stddev = stddev

    def __call__(self, shape, dtype=None):
        return tf.random.normal(shape, mean=self.mean, stddev=self.stddev, dtype=dtype)

    def get_config(self):  # To support serialization
        return {"mean": self.mean, "stddev": self.stddev}

In [49]:
model = tf.keras.Sequential()

custom_init = CustomInitializer(mean=0.0, stddev=0.05)

model.add(tf.keras.layers.Dense(128, activation='elu', kernel_initializer=custom_init))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(16, activation='elu', kernel_initializer=custom_init))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(8, activation='elu', kernel_initializer=custom_init))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(4, activation='elu', kernel_initializer=custom_init))
model.add(tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=custom_init))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['acc']
)

model.fit(X_res.to_numpy(), y_res, batch_size=128, epochs=100, validation_split=0.25)

Epoch 1/100
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - acc: 0.6607 - loss: 0.6747 - val_acc: 0.0000e+00 - val_loss: 1.0614
Epoch 2/100
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - acc: 0.6823 - loss: 0.6043 - val_acc: 0.4730 - val_loss: 0.8028
Epoch 3/100
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - acc: 0.7441 - loss: 0.5385 - val_acc: 0.6150 - val_loss: 0.6897
Epoch 4/100
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - acc: 0.7620 - loss: 0.5179 - val_acc: 0.6194 - val_loss: 0.6848
Epoch 5/100
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - acc: 0.7702 - loss: 0.5070 - val_acc: 0.6526 - val_loss: 0.6443
Epoch 6/100
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - acc: 0.7729 - loss: 0.5023 - val_acc: 0.6608 - val_loss: 0.6379
Epoch 7/100
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 

<keras.src.callbacks.history.History at 0x28743d296d0>

In [50]:
model_pred = model.predict(X_test.to_numpy()) > 0.5
print(classification_report(y_test, model_pred))

[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step  
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     16021
           1       0.13      0.12      0.12      1582

    accuracy                           0.85     17603
   macro avg       0.52      0.52      0.52     17603
weighted avg       0.84      0.85      0.85     17603



In [51]:
unseen_predictions = model.predict(unseen_data.to_numpy()) > 0.5
print("Predictions on unseen data:", unseen_predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Predictions on unseen data: [[ True]
 [False]]


In [None]:
# Get the first layer of the model
first_layer = model.layers[0]

# Check if the first layer has weights
if len(first_layer.get_weights()) > 0:
    # Get weights and biases
    weights, biases = first_layer.get_weights()
    
    # Plot the weights of the first layer (if it's a Dense layer)
    if len(weights.shape) == 2:  # Dense layer (weights matrix)
        plt.figure(figsize=(10, 5))
        plt.imshow(weights, aspect='auto', cmap='viridis')
        plt.colorbar()
        plt.title(f'Weights of {first_layer.name}')
        plt.xlabel('Neurons')
        plt.ylabel('Input features')
        plt.show()
    
    # Plot the biases of the first layer (if it's a Dense layer)
    if len(biases.shape) == 1:  # Bias vector
        plt.figure(figsize=(6, 3))
        plt.bar(range(len(biases)), biases)
        plt.title(f'Biases of {first_layer.name}')
        plt.xlabel('Neurons')
        plt.ylabel('Bias Values')
        plt.show()

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

class TensorFlowModelWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model
        self.classes_ = np.array([0, 1])  # Define the target classes explicitly

    def fit(self, X, y):
        # No training required for wrapper
        return self

    def predict(self, X):
        # Return binary predictions
        return (self.model.predict(X.to_numpy()) > 0.5).astype(int).flatten()

    def predict_proba(self, X):
        # Return class probabilities for compatibility
        probabilities = self.model.predict(X.to_numpy())
        return np.hstack([1 - probabilities, probabilities])

# Wrap the TensorFlow model
wrapped_model = TensorFlowModelWrapper(model)

# Permutation Importance
perm_importance = permutation_importance(
    wrapped_model, X_test, y_test, scoring='accuracy', n_repeats=10, random_state=42
)

# Plot Feature Importance
feature_names = X.columns
sorted_idx = perm_importance.importances_mean.argsort()

plt.figure(figsize=(10, 8))
plt.barh(feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx], xerr=perm_importance.importances_std[sorted_idx])
plt.xlabel("Mean Importance Score")
plt.title("Permutation Feature Importance")
plt.show()