In [1]:
import matplotlib.pyplot as plt
import pandas as pd

In [2]:

hospital_readmissions_df = pd.read_excel("../Data/hospital_readmissions.xlsx")

diabetic_data_df = pd.read_csv("../Data/diabetes+130-us+hospitals+for+years+1999-2008/diabetic_data.csv")

Insurance_Outcomes_df = pd.read_excel("../Data/Insurance_Outcomes.xlsx")


# Hospitals Readmissions Bias Analysis

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Assume you have this:
df = hospital_readmissions_df.copy()

# Binarize the target
df['readmitted'] = df['readmitted'].apply(lambda x: 1 if x == 'yes' else 0)

# Store target separately
y = df['readmitted']
X = df.drop(columns=['readmitted'])

# Encode categorical columns
cat_cols = X.select_dtypes(include=['object']).columns
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le  # store if you want to decode later

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

import torch

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

import torch.nn as nn
import torch.optim as optim

class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

model = MLP(input_dim=X_train_tensor.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"Epoch {epoch} - Loss: {loss.item():.4f}")

from sklearn.metrics import roc_auc_score

model.eval()
with torch.no_grad():
    preds = model(X_test_tensor).numpy().flatten()
    auc = roc_auc_score(y_test, preds)
    print(f"\nAUC: {auc:.4f}")


Epoch 0 - Loss: 0.7034
Epoch 100 - Loss: 0.6496
Epoch 200 - Loss: 0.6408
Epoch 300 - Loss: 0.6329
Epoch 400 - Loss: 0.6244
Epoch 500 - Loss: 0.6193
Epoch 600 - Loss: 0.6120
Epoch 700 - Loss: 0.6059
Epoch 800 - Loss: 0.6026
Epoch 900 - Loss: 0.5987

AUC: 0.6318


In [4]:
model.eval()
with torch.no_grad():
    y_preds_nn = model(X_test_tensor).numpy().flatten()
    y_preds_label = (y_preds_nn >= 0.5).astype(int)  # threshold at 0.5


# Get original age values from the unprocessed DataFrame
results_df = X_test.copy()
results_df['age'] = hospital_readmissions_df.loc[X_test.index, 'age']
results_df['true'] = y_test.values
results_df['pred'] = y_preds_label


accuracy_by_age = results_df.groupby('age').apply(lambda g: (g['true'] == g['pred']).mean())
accuracy_by_age = accuracy_by_age.sort_index()

print(accuracy_by_age)


age
[40-50)     0.670124
[50-60)     0.668118
[60-70)     0.609635
[70-80)     0.578358
[80-90)     0.561030
[90-100)    0.555556
dtype: float64


  accuracy_by_age = results_df.groupby('age').apply(lambda g: (g['true'] == g['pred']).mean())


In [5]:
# So for this is a very similar finding we found for decision tree. Open the decision tree notebook and there is my summary as well just looking at these values


# Diabetes analysis for Bias

# Insurance Dataset bias analysis