In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix



### 1. Load the encoded dataset

In [None]:
# Load cleaned + encoded dataset for modeling
df = pd.read_csv("../data/processed/diabetes_encoded.csv")
df.shape

(101763, 67)

In [13]:
df.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,diag2_group_Injury,diag2_group_Other,diag2_group_Respiratory,diag2_group_Unknown,diag3_group_Diabetes,diag3_group_Digestive,diag3_group_Injury,diag3_group_Other,diag3_group_Respiratory,diag3_group_Unknown
0,6,25,1,1,41,0,1,0,0,0,...,False,False,False,True,False,False,False,False,False,True
1,1,1,7,3,59,0,18,0,0,0,...,False,False,False,False,False,False,False,True,False,False
2,1,1,7,2,11,5,13,2,0,1,...,False,False,False,False,False,False,False,True,False,False
3,1,1,7,2,44,1,16,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,1,1,7,1,51,0,8,0,0,0,...,False,True,False,False,True,False,False,False,False,False


In [10]:
df.dtypes.value_counts()

bool     55
int64    12
Name: count, dtype: int64

### 2. Split X and y

In [None]:
# Separate features and target variable
# target
y = df["readmitted_binary"]

# Features
X = df.drop("readmitted_binary",axis=1)

X.shape, y.shape

((101763, 66), (101763,))

### 3. Train/Test split

In [None]:
# Split data into training and test sets (80/20) with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape

((81410, 66), (20353, 66))

### 4. Scaling

In [None]:

# Standardize features (fit on train only, transform on test)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [17]:
X_train_scaled.shape, X_test_scaled.shape

((81410, 66), (20353, 66))

### 5. Baseline: Logistic Regression

we do the baseline supervised model first (so we have something to compare with semi-supervised).

In [19]:

# Train baseline supervised model
lr = LogisticRegression(max_iter=2000, n_jobs=-1)
lr.fit(X_train_scaled, y_train)

# Predict on test set
y_pred = lr.predict(X_test_scaled)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6240357686827495
F1 score: 0.5092984481210722

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.80      0.70     10972
           1       0.64      0.42      0.51      9381

    accuracy                           0.62     20353
   macro avg       0.63      0.61      0.60     20353
weighted avg       0.63      0.62      0.61     20353



In [20]:
import numpy as np

# copy labels
y_train_semi = y_train.copy()

# choose only 10% labeled
labeled_ratio = 0.1
n_labeled = int(len(y_train) * labeled_ratio)

np.random.seed(42)
labeled_indices = np.random.choice(len(y_train), n_labeled, replace=False)

# set rest as unlabeled (-1 required by sklearn)
y_train_semi[:] = -1
y_train_semi.iloc[labeled_indices] = y_train.iloc[labeled_indices]

In [22]:
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import accuracy_score, f1_score

# Semi-supervised model using kNN graph (memory-friendly)
ls_knn = LabelSpreading(kernel="knn", n_neighbors=15, max_iter=30)
ls_knn.fit(X_train_scaled, y_train_semi)

y_pred_knn = ls_knn.predict(X_test_scaled)

print("Label Spreading (kNN, 10% labeled)")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("F1 score:", f1_score(y_test, y_pred_knn))

Label Spreading (kNN, 10% labeled)
Accuracy: 0.5385938190930084
F1 score: 0.4742175690050949


  probabilities /= normalizer


In [23]:
import numpy as np

# y_train_semi should contain real labels for the labeled 10%,
# and -1 for the rest.
assert set(np.unique(y_train_semi)).issubset(set(np.unique(y_train)).union({-1}))
print("Unlabeled fraction:", np.mean(y_train_semi == -1))

Unlabeled fraction: 0.9


In [24]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn, digits=4))

[[6727 4245]
 [5146 4235]]
              precision    recall  f1-score   support

           0     0.5666    0.6131    0.5889     10972
           1     0.4994    0.4514    0.4742      9381

    accuracy                         0.5386     20353
   macro avg     0.5330    0.5323    0.5316     20353
weighted avg     0.5356    0.5386    0.5361     20353



In [25]:
from sklearn.metrics import f1_score
from sklearn.semi_supervised import LabelSpreading

neighbors_list = [5, 10, 15, 20, 30]
results = []

for k in neighbors_list:
    model = LabelSpreading(kernel="knn", n_neighbors=k, max_iter=50)
    model.fit(X_train_scaled, y_train_semi)
    pred = model.predict(X_test_scaled)
    results.append((k, f1_score(y_test, pred)))

for k, f1 in results:
    print(f"k={k:>2}  F1={f1:.4f}")

  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


k= 5  F1=0.4701
k=10  F1=0.4707
k=15  F1=0.4742
k=20  F1=0.4721
k=30  F1=0.4704
