## This notebook applies machine learning to identify patients at high risk of 30-day hospital readmission, enabling proactive care decisions.


In [0]:
# Preview the ML-ready Gold feature table.
# The table is addressed by its schema-qualified name, the same name every
# other cell in this notebook reads, so the preview cannot resolve to a
# same-named table in whatever schema happens to be current.
# Only a handful of rows are rendered: the table holds patient-level records
# and rendered cell output travels with the notebook when it is shared.
PREVIEW_ROWS = 20

df = spark.read.table("default.gold_patient_features")
df.limit(PREVIEW_ROWS).display()


In [0]:
# Import MLflow, Pandas, and Scikit-learn libraries for model training and evaluation

import mlflow
import mlflow.sklearn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

from mlflow.models.signature import infer_signature

In [0]:
# Route every MLflow run started below to the shared
# readmission-prediction experiment.
mlflow.set_experiment(experiment_name="/Shared/readmission_prediction")

In [0]:
# Read the Gold feature table through Spark, then materialise it as a
# pandas DataFrame so scikit-learn can consume it.
df_spark = spark.read.table("default.gold_patient_features")
df = df_spark.toPandas()

In [0]:
# Separate the label from the predictors: y is the 30-day readmission
# column, X is every remaining column of df.
TARGET_COL = "readmit_30d"

y = df[TARGET_COL]
X = df.drop(TARGET_COL, axis="columns")

In [0]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the
# proportion of each target class the same in both partitions, and the
# fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [0]:
# Debug check: confirm the Gold table exists, then inspect its size, its
# columns and the per-column value counts of the modelling columns.
# The Spark DataFrame is kept under its own name so that running this cell
# never replaces the pandas `df` that the feature/target split cell reads.

print("Table exists:", spark.catalog.tableExists("default.gold_patient_features"))

gold_check_sdf = spark.read.table("default.gold_patient_features")

print("Total rows:", gold_check_sdf.count())
print("Columns:", gold_check_sdf.columns)

# summary("count") reports how many non-null values each selected column has,
# which shows at a glance how sparse the features and the label are.
gold_check_sdf.select("utilization_score", "treatment_changed", "readmit_30d") \
  .summary("count") \
  .show()

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import mlflow
import mlflow.sklearn
from pyspark.sql.functions import col

# End-to-end training cell: load the Gold table, prepare two features and the
# label, split, fit a logistic regression, and record the run in MLflow.

# Needed for the model signature logged below; imported here so this cell
# stays runnable on its own like the rest of its imports.
from mlflow.models.signature import infer_signature

FEATURE_COLS = ["utilization_score", "treatment_changed"]
LABEL_COL = "readmit_30d"
MIN_TRAINING_ROWS = 20   # below this the cell refuses to train
TEST_FRACTION = 0.2      # share of rows held out for evaluation
SPLIT_SEED = 42          # fixed seed so the split is repeatable

# -------------------------
# Step 1: load the Gold table
# -------------------------
df_gold = spark.read.table("default.gold_patient_features")

# -------------------------
# Step 2: data prep (missing values are filled with 0, rows are not dropped)
# -------------------------
# NOTE(review): the label is zero-filled as well, so a patient whose
# readmission outcome is missing is trained on as "not readmitted". This
# looks like a deliberate choice to keep enough rows; confirm it is
# acceptable, or exclude unlabeled rows instead.
df_ml = (
    df_gold
    .select(*FEATURE_COLS, LABEL_COL)
    .fillna({column: 0 for column in (*FEATURE_COLS, LABEL_COL)})
)

row_count = df_ml.count()
print("Rows available for ML:", row_count)

if row_count < MIN_TRAINING_ROWS:
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError(f"❌ Still not enough rows for ML training: {row_count}")

# -------------------------
# Step 3: convert to pandas
# -------------------------
pdf = df_ml.toPandas()

X = pdf[FEATURE_COLS]
y = pdf[LABEL_COL]

# Fitting and AUC both need the two outcome classes. Check before any MLflow
# run is opened so a degenerate label column fails with a clear message
# instead of leaving a failed run behind.
class_counts = y.value_counts()
if len(class_counts) < 2 or class_counts.min() < 2:
    raise ValueError(
        "Need at least 2 rows of each readmit_30d class to train and "
        f"evaluate; class counts were {class_counts.to_dict()}"
    )

# -------------------------
# Step 4: train/test split
# -------------------------
# Stratified on the label, like the earlier split cell, so both partitions
# keep the class ratio of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=y
)

# A very rare class can still end up entirely on one side of the split.
for split_name, split_labels in (("train", y_train), ("test", y_test)):
    if split_labels.nunique() < 2:
        raise ValueError(
            f"The {split_name} split contains a single readmit_30d class; "
            "the model cannot be fitted and scored on it"
        )

# -------------------------
# Step 5: MLflow training
# -------------------------
mlflow.set_experiment("/Shared/readmission_prediction")

with mlflow.start_run():

    model = LogisticRegression(max_iter=300)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (the positive class for a 0/1 label).
    y_prob = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    mlflow.log_param("model", "LogisticRegression")
    # Derived from FEATURE_COLS so the logged value cannot drift from the
    # columns actually used for training.
    mlflow.log_param("features", ", ".join(FEATURE_COLS))
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("auc", auc)

    # Record the input/output schema with the model so consumers of the
    # logged artifact know which columns and types it expects.
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(model, "readmission_model", signature=signature)

    print("✅ MODEL TRAINED SUCCESSFULLY")
    print("Accuracy:", round(acc, 4))
    print("AUC:", round(auc, 4))

In [0]:
# Drop rows that have a missing value in any feature or in the target.
# One row mask is built from both X and y: this keeps the two aligned and
# also removes rows whose target is missing, which dropping on X alone
# would leave in place.
# Re-running the cell on already-clean data changes nothing.
complete_rows = X.notna().all(axis=1) & y.notna()
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# Continue with train/test split and model training as before.
# NOTE: the split and training cells sit above this one, so they must be
# re-run after this cell for the filtering to reach the model.

In [0]:
# Show the column names and data types of the Gold feature table
spark.read.table("default.gold_patient_features").printSchema()

In [0]:
# Columns kept for modelling and analysis: one predictor and the label
selected_cols = ["utilization_score", "readmit_30d"]