In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Define number of patients
num_patients = 5000

# Helper functions for generating realistic data
def generate_patient_id(i):
    return f"P{str(i+1).zfill(3)}"

def generate_age():
    # Most dialysis patients are older, weighted distribution
    return int(np.random.normal(65, 15))

def generate_gender():
    return random.choice(["Male", "Female"])

def generate_weight():
    # In kg, normal distribution
    return round(np.random.normal(75, 35), 1)

def generate_bmi(weight, height):
    # BMI = weight(kg) / height(m)^2
    return round(weight / (height ** 2), 1)

def generate_bp():
    # Generate blood pressure as systolic/diastolic
    systolic = int(np.random.normal(140, 20))
    diastolic = int(np.random.normal(85, 15))
    return f"{systolic}/{diastolic}"

def generate_kidney_failure_cause():
    causes = ["Diabetic Nephropathy", "Hypertensive Nephrosclerosis",
              "Glomerulonephritis", "Polycystic Kidney Disease",
              "Interstitial Nephritis", "Obstructive Nephropathy"]
    weights = [0.35, 0.25, 0.15, 0.10, 0.10, 0.05]
    return np.random.choice(causes, p=weights)

def generate_lab_value(mean, std, decimal_places=1):
    return round(np.random.normal(mean, std), decimal_places)


def generate_dialysate_composition():
    k_levels = ["K+:2.0", "K+:2.5", "K+:3.0"]
    ca_levels = ["Ca:2.5", "Ca:3.0", "Ca:3.5"]
    na_levels = ["Na:138", "Na:140", "Na:142"]

    k = random.choice(k_levels)
    ca = random.choice(ca_levels)
    na = random.choice(na_levels)

    return f"{k} {ca} {na}"

def generate_vascular_access():
    access_types = ["AVF", "AVG", "CVC", "Tunneled Catheter"]
    weights = [0.5, 0.2, 0.15, 0.15]
    return np.random.choice(access_types, p=weights)

def generate_dialyzer_type():
    return random.choice(["High-Flux", "Low-Flux", "Medium-Flux"])

def generate_severity():
    return random.choice(["Mild", "Moderate", "Severe", "Critical"])

def generate_yes_no(yes_prob=0.5):
    return "Yes" if random.random() < yes_prob else "No"

def generate_medications():
    meds = ["Amlodipine", "Metoprolol", "Lisinopril", "Carvedilol",
            "Losartan", "Atenolol", "Furosemide", "Hydrochlorothiazide",
            "Nifedipine", "Doxazosin"]

    # Choose 0-3 medications
    num_meds = random.randint(0, 3)
    if num_meds == 0:
        return "None"

    selected_meds = random.sample(meds, num_meds)
    return ";".join(selected_meds)

def generate_phosphate_binders():
    binders = ["Sevelamer", "Calcium Acetate", "Lanthanum Carbonate",
               "Ferric Citrate", "Aluminum Hydroxide", "None"]
    weights = [0.3, 0.25, 0.15, 0.1, 0.05, 0.15]
    return np.random.choice(binders, p=weights)

def generate_comorbidities():
    conditions = ["CAD", "CHF", "Diabetes", "Hypertension", "PVD",
                  "Stroke", "COPD", "Depression", "Gout", "Anemia"]

    # Choose 0-4 comorbidities
    num_conditions = random.randint(0, 4)
    if num_conditions == 0:
        return "None"

    selected_conditions = random.sample(conditions, num_conditions)
    return ";".join(selected_conditions)

def generate_side_effects():
    effects = ["Muscle Cramps", "Hypotension", "Nausea", "Vomiting",
               "Headache", "Dizziness", "Chest Pain", "Itching",
               "Fever", "Chills", "None"]

    # Choose 0-3 side effects
    num_effects = random.randint(0, 3)
    if num_effects == 0:
        return "None"

    selected_effects = random.sample(effects[:10], num_effects)
    return ";".join(selected_effects)

def generate_symptoms():
    symptoms = ["Fatigue", "Nausea", "Shortness of Breath", "Edema",
                "Loss of Appetite", "Itching", "Muscle Cramps", "Insomnia",
                "None"]

    # Choose 0-3 symptoms
    num_symptoms = random.randint(0, 3)
    if num_symptoms == 0:
        return "None"

    selected_symptoms = random.sample(symptoms[:8], num_symptoms)
    return ";".join(selected_symptoms)

def generate_compliance():
    return random.choice(["Poor", "Moderate", "Good", "Excellent"])

def generate_food_intake():
    foods = ["High Sodium Meal", "High Potassium Meal", "High Phosphorus Meal",
             "Low Protein Meal", "Balanced Meal", "Skipped Meal", "Liquid Only"]
    return random.choice(foods)

def generate_timing():
    return random.choice(["First Hour", "Middle Hours", "Last Hour", "Post-Treatment"])






# Create empty DataFrame
data = []

for i in range(num_patients):
    # Generate basic patient info
    patient_id = generate_patient_id(i)
    age = generate_age()
    gender = generate_gender()
    height = round(np.random.normal(1.7, 0.1), 2)  # in meters
    weight = generate_weight()
    bmi = generate_bmi(weight, height)

    # Generate medical conditions
    diabetes = generate_yes_no(0.4)
    hypertension = generate_yes_no(0.7)
    kidney_failure_cause = generate_kidney_failure_cause()

    # Generate blood pressure readings
    pre_dialysis_bp = generate_bp()
    # During dialysis BP typically lower
    systolic_pre, diastolic_pre = map(int, pre_dialysis_bp.split('/'))
    systolic_during = max(90, systolic_pre - random.randint(5, 20))
    diastolic_during = max(60, diastolic_pre - random.randint(5, 15))
    during_dialysis_bp = f"{systolic_during}/{diastolic_during}"

    # Post dialysis BP typically even lower or stabilized
    systolic_post = max(90, systolic_during - random.randint(-5, 15))
    diastolic_post = max(60, diastolic_during - random.randint(-5, 10))
    post_dialysis_bp = f"{systolic_post}/{diastolic_post}"

    # Generate other vital signs and lab values
    heart_rate = int(np.random.normal(80, 15))
    creatinine = generate_lab_value(8.0, 2.0, 1)
    urea = generate_lab_value(120, 30, 0)
    potassium = generate_lab_value(5.0, 0.8, 1)
    hemoglobin = generate_lab_value(10.0, 1.5, 1)
    hematocrit = generate_lab_value(30.0, 5.0, 1)
    albumin = generate_lab_value(3.5, 0.5, 1)
    calcium = generate_lab_value(8.5, 0.5, 1)
    phosphorus = generate_lab_value(5.5, 1.0, 1)

    # Generate dialysis parameters
    dialysis_duration = random.choice([3, 3.5, 4, 4.5, 5])
    dialysis_frequency = random.choice([2, 3, 4])
    dialysate_composition = generate_dialysate_composition()
    vascular_access = generate_vascular_access()
    dialyzer_type = generate_dialyzer_type()
    ktv = generate_lab_value(1.4, 0.2, 1)
    urr = generate_lab_value(70, 5, 0)
    urine_output = int(np.random.normal(200, 150))
    dry_weight = round(weight - random.uniform(1, 5), 1)
    fluid_removal_rate = int(np.random.normal(800, 200))

    # Generate disease and treatment info
    disease_severity = generate_severity()
    pre_dialysis_weight = round(dry_weight + random.uniform(1, 5), 1)
    post_dialysis_weight = round(dry_weight + random.uniform(-0.5, 1), 1)
    recent_medication_changes = generate_yes_no(0.3)
    antihypertensive_meds = generate_medications()
    epo_dose = random.choice([0, 2000, 3000, 4000, 6000])
    iron_supplements = random.choice(["None", "Ferrous Sulfate", "Iron Sucrose", "Ferric Gluconate"])
    phosphate_binders = generate_phosphate_binders()
    blood_transfusion = generate_yes_no(0.1)
    intradialytic_medication = random.choice(["None", "Saline", "Albumin", "Antibiotics", "Antiemetics"])
    recent_infection = generate_yes_no(0.15)
    comorbidities = generate_comorbidities()
    serum_sodium = int(np.random.normal(138, 3))

    # Generate side effect info
    previous_side_effects = generate_side_effects()
    if previous_side_effects == "None":
        days_since_last_side_effect = 0
        time_to_recovery = 0
    else:
        days_since_last_side_effect = random.randint(1, 30)
        time_to_recovery = random.randint(1, 12)

    pre_dialysis_symptoms = generate_symptoms()
    interdialytic_weight_gain = round(random.uniform(0.5, 5.0), 1)
    diet_compliance = generate_compliance()
    fluid_compliance = generate_compliance()
    recent_food_intake = generate_food_intake()

    # Generate current side effect details
    current_side_effect_type = generate_side_effects()
    if current_side_effect_type == "None":
        side_effect_severity = "None"
        side_effect_timing = "None"
        staff_intervention = "No"
    else:
        side_effect_severity = generate_severity()
        side_effect_timing = generate_timing()
        staff_intervention = generate_yes_no(0.6)

    # Create a patient record
    patient = {
        "PatientID": patient_id,
        "Age": age,
        "Gender": gender,
        "Weight": weight,
        "BMI": bmi,
        "Diabetes": diabetes,
        "Hypertension": hypertension,
        "Kidney_Failure_Cause": kidney_failure_cause,
        "Pre_Dialysis_Blood_Pressure": pre_dialysis_bp,
        "During_Dialysis_Blood_Pressure": during_dialysis_bp,
        "Post_Dialysis_Blood_Pressure": post_dialysis_bp,
        "Heart_Rate": heart_rate,
        "Creatinine": creatinine,
        "Urea": urea,
        "Potassium": potassium,
        "Hemoglobin": hemoglobin,
        "Hematocrit": hematocrit,
        "Albumin": albumin,
        "Calcium": calcium,
        "Phosphorus": phosphorus,
        "Dialysis_Duration_Hours": dialysis_duration,
        "Dialysis_Frequency_Per_Week": dialysis_frequency,
        "Dialysate_Composition": dialysate_composition,
        "Vascular_Access_Type": vascular_access,
        "Dialyzer_Type": dialyzer_type,
        "KtV": ktv,
        "URR": urr,
        "Urine_Output_ml_day": urine_output,
        "Dry_Weight_kg": dry_weight,
        "Fluid_Removal_Rate_ml_hour": fluid_removal_rate,
        "Disease_Severity": disease_severity,
        "Pre_Dialysis_Weight_kg": pre_dialysis_weight,
        "Post_Dialysis_Weight_kg": post_dialysis_weight,
        "Recent_Medication_Changes": recent_medication_changes,
        "Antihypertensive_Meds": antihypertensive_meds,
        "EPO_Dose": epo_dose,
        "Iron_Supplements": iron_supplements,
        "Phosphate_Binders": phosphate_binders,
        "Blood_Transfusion_Recent": blood_transfusion,
        "Intradialytic_Medication": intradialytic_medication,
        "Recent_Infection": recent_infection,
        "Comorbidities": comorbidities,
        "Serum_Sodium": serum_sodium,
        "Previous_Side_Effects": previous_side_effects,
        "Days_Since_Last_Side_Effect": days_since_last_side_effect,
        "Time_To_Recovery_Hours": time_to_recovery,
        "Pre_Dialysis_Symptoms": pre_dialysis_symptoms,
        "Interdialytic_Weight_Gain": interdialytic_weight_gain,
        "Diet_Compliance": diet_compliance,
        "Fluid_Restriction_Compliance": fluid_compliance,
        "Recent_Food_Intake": recent_food_intake,
        "Side_Effect_Type": current_side_effect_type,
        "Side_Effect_Severity": side_effect_severity,
        "Side_Effect_Timing": side_effect_timing,
        "Staff_Intervention_Required": staff_intervention
    }

    data.append(patient)

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv("dialysis_patient_dataset.csv", index=False)

print(f"Generated {num_patients} patient records with {len(df.columns)} columns.")
print("First few records:")
print(df.head(2))

Generated 5000 patient records with 55 columns.
First few records:
  PatientID  Age  Gender  Weight   BMI Diabetes Hypertension  \
0      P001   72    Male    97.7  34.2      Yes          Yes   
1      P002   70  Female    57.0  19.5       No          Yes   

   Kidney_Failure_Cause Pre_Dialysis_Blood_Pressure  \
0  Diabetic Nephropathy                      170/89   
1  Diabetic Nephropathy                      127/80   

  During_Dialysis_Blood_Pressure  ... Time_To_Recovery_Hours  \
0                         158/82  ...                      0   
1                         111/66  ...                      6   

   Pre_Dialysis_Symptoms  Interdialytic_Weight_Gain  Diet_Compliance  \
0                   None                        2.2             Good   
1                   None                        1.5             Poor   

   Fluid_Restriction_Compliance  Recent_Food_Intake       Side_Effect_Type  \
0                          Good       Balanced Meal  Muscle Cramps;Itching   
1       

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
import joblib
import requests
import json
import re
import warnings
warnings.filterwarnings('ignore')

# ------------------------------------------
# Step 1: Load and preprocess the data
# ------------------------------------------

print("Loading and preprocessing the dialysis patient data...")

# Load the dataset
df = pd.read_csv("dialysis_patient_dataset.csv")
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

Loading and preprocessing the dialysis patient data...
Dataset loaded with 5000 rows and 55 columns


In [4]:
# Function to preprocess side effects
def preprocess_side_effects(side_effect_column):
    side_effects_list = []

    for effect_str in side_effect_column:
        effects = set()
        if pd.notna(effect_str) and effect_str != "None":
            effect_str = str(effect_str)
            for effect in effect_str.split(";"):
                effects.add(effect)
        side_effects_list.append(list(effects))

    # Convert to binary format
    mlb = MultiLabelBinarizer()
    side_effects_encoded = mlb.fit_transform(side_effects_list)
    side_effect_df = pd.DataFrame(side_effects_encoded, columns=mlb.classes_)

    return side_effect_df, mlb

In [5]:
# Clean up any missing values
df = df.fillna({
    "Side_Effect_Type": "None",
    "Side_Effect_Severity": "None",
    "Side_Effect_Timing": "None",
    "Staff_Intervention_Required": "No"
})

side_effects_df, side_effect_mlb = preprocess_side_effects(df["Side_Effect_Type"])


In [6]:
# Create encoders for the other target variables
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

severity_encoder = LabelEncoder()
df["Side_Effect_Severity_Encoded"] = severity_encoder.fit_transform(df["Side_Effect_Severity"])

timing_encoder = LabelEncoder()
df["Side_Effect_Timing_Encoded"] = timing_encoder.fit_transform(df["Side_Effect_Timing"])

intervention_encoder = LabelEncoder()
df["Staff_Intervention_Encoded"] = intervention_encoder.fit_transform(df["Staff_Intervention_Required"])


In [7]:
# Separate features from targets
X = df.drop([
    "PatientID", "Side_Effect_Type", "Side_Effect_Severity",
    "Side_Effect_Timing", "Staff_Intervention_Required",
    "Side_Effect_Severity_Encoded", "Side_Effect_Timing_Encoded",
    "Staff_Intervention_Encoded"
], axis=1)

In [8]:
#Handle any remaining NaN values in features
for col in X.select_dtypes(include=["int64", "float64"]).columns:
    X[col] = X[col].fillna(X[col].mean())

for col in X.select_dtypes(include=["object"]).columns:
    X[col] = X[col].fillna(X[col].mode()[0])

# Target variables
y_side_effects = side_effects_df
y_severity = df["Side_Effect_Severity_Encoded"]
y_timing = df["Side_Effect_Timing_Encoded"]
y_intervention = df["Staff_Intervention_Encoded"]

In [9]:

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

Numeric features: 26
Categorical features: 24


In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", Pipeline([
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), categorical_features)
    ],
    verbose_feature_names_out=False
)


In [11]:

# Split data into training and testing sets
X_train, X_test, \
y_side_train, y_side_test, \
y_sev_train, y_sev_test, \
y_time_train, y_time_test, \
y_int_train, y_int_test = train_test_split(
    X, y_side_effects, y_severity, y_timing, y_intervention,
    test_size=0.2, random_state=42
)

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print(f"Processed feature count: {X_train_processed.shape[1]}")


Training set size: 4000 samples
Testing set size: 1000 samples
Processed feature count: 9616


In [12]:
# Updated machine learning approach using different algorithms for each prediction task
# ------------------------------------------
# Step 2: Build and train the classifiers (Updated with different algorithms)
# ------------------------------------------

print("\nTraining machine learning classifiers with specialized algorithms...")

# Import additional models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
import lightgbm as lgb
import xgboost as xgb

# You can install these packages with:
# pip install lightgbm xgboost

# 1. Side Effect Type Classifier (Multi-label) - Keep RandomForest for this complex task
side_effect_model = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
)
side_effect_model.fit(X_train_processed, y_side_train)
print("Side Effect Type model trained (RandomForest)")

# 2. Severity Classifier - Use Gradient Boosting for ordinal classification
severity_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
severity_model.fit(X_train_processed, y_sev_train)
print("Severity model trained (GradientBoosting)")

# 3. Timing Classifier - Use LightGBM for multi-class classification
timing_model = lgb.LGBMClassifier(
    n_estimators=100,
    num_leaves=31,
    random_state=42
)
timing_model.fit(X_train_processed, y_time_train)
print("Timing model trained (LightGBM)")

# 4. Intervention Required Classifier - Use Logistic Regression for binary classification
intervention_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42
)
intervention_model.fit(X_train_processed, y_int_train)
print("Intervention model trained (LogisticRegression)")

# ------------------------------------------
# Step 3: Evaluate the classifiers (unchanged, but we'll compare performance)
# ------------------------------------------

print("\nEvaluating classifier performance...")

# 1. Evaluate Side Effect Type predictions (RandomForest)
side_effect_pred = side_effect_model.predict(X_test_processed)
side_effect_classes = side_effect_mlb.classes_

print("\nSide Effect Type Classification Report (RandomForest):")
for i, effect in enumerate(side_effect_classes):
    print(f"\n{effect}:")
    print(classification_report(y_side_test.iloc[:, i], side_effect_pred[:, i], zero_division=0))

# 2. Evaluate Severity predictions (GradientBoosting)
severity_pred = severity_model.predict(X_test_processed)
print("\nSeverity Classification Report (GradientBoosting):")
print(classification_report(y_sev_test, severity_pred, zero_division=0))
print(f"Accuracy: {accuracy_score(y_sev_test, severity_pred):.4f}")

# 3. Evaluate Timing predictions (LightGBM)
timing_pred = timing_model.predict(X_test_processed)
print("\nTiming Classification Report (LightGBM):")
print(classification_report(y_time_test, timing_pred, zero_division=0))
print(f"Accuracy: {accuracy_score(y_time_test, timing_pred):.4f}")

# 4. Evaluate Intervention predictions (LogisticRegression)
intervention_pred = intervention_model.predict(X_test_processed)
print("\nIntervention Required Classification Report (LogisticRegression):")
print(classification_report(y_int_test, intervention_pred, zero_division=0))
print(f"Accuracy: {accuracy_score(y_int_test, intervention_pred):.4f}")

# ------------------------------------------
# Step 4: Save the models and preprocessors
# ------------------------------------------

print("\nSaving models and encoders...")

# Save the preprocessor
joblib.dump(preprocessor, "dialysis_preprocessor.pkl")

# Save the models
joblib.dump(side_effect_model, "side_effect_model.pkl")
joblib.dump(severity_model, "severity_model.pkl")
joblib.dump(timing_model, "timing_model.pkl")
joblib.dump(intervention_model, "intervention_model.pkl")

# Save the encoders
joblib.dump(side_effect_mlb, "side_effect_mlb.pkl")
joblib.dump(severity_encoder, "severity_encoder.pkl")
joblib.dump(timing_encoder, "timing_encoder.pkl")
joblib.dump(intervention_encoder, "intervention_encoder.pkl")

print("Models and encoders saved successfully!")

# ------------------------------------------
# Step 5: Updated prediction function to use the new models
# ------------------------------------------

def predict_side_effects(patient_data):
    """
    Predict side effects for a patient using the specialized algorithms

    Args:
        patient_data: DataFrame with a single patient's data

    Returns:
        Dictionary with predictions
    """
    # Load the models and preprocessors
    preprocessor = joblib.load("dialysis_preprocessor.pkl")
    side_effect_model = joblib.load("side_effect_model.pkl")  # RandomForest
    severity_model = joblib.load("severity_model.pkl")        # GradientBoosting
    timing_model = joblib.load("timing_model.pkl")            # LightGBM
    intervention_model = joblib.load("intervention_model.pkl") # LogisticRegression

    # Load the encoders
    side_effect_mlb = joblib.load("side_effect_mlb.pkl")
    severity_encoder = joblib.load("severity_encoder.pkl")
    timing_encoder = joblib.load("timing_encoder.pkl")
    intervention_encoder = joblib.load("intervention_encoder.pkl")

    # Make sure patient_data is a DataFrame
    if not isinstance(patient_data, pd.DataFrame):
        patient_data = pd.DataFrame([patient_data])

    # Drop target columns if they exist
    cols_to_drop = ["PatientID", "Side_Effect_Type", "Side_Effect_Severity",
                    "Side_Effect_Timing", "Staff_Intervention_Required",
                    "Side_Effect_Severity_Encoded", "Side_Effect_Timing_Encoded",
                    "Staff_Intervention_Encoded"]

    for col in cols_to_drop:
        if col in patient_data.columns:
            patient_data = patient_data.drop(col, axis=1)

    # Process the patient data
    patient_processed = preprocessor.transform(patient_data)

    # Get predictions from each specialized model
    side_effect_pred = side_effect_model.predict(patient_processed)
    severity_pred = severity_model.predict(patient_processed)
    timing_pred = timing_model.predict(patient_processed)
    intervention_pred = intervention_model.predict(patient_processed)

    # Get prediction probabilities for more detailed analysis
    side_effect_probs = []
    for estimator in side_effect_model.estimators_:
        side_effect_probs.append(estimator.predict_proba(patient_processed)[0])

    severity_probs = severity_model.predict_proba(patient_processed)[0]
    timing_probs = timing_model.predict_proba(patient_processed)[0]
    intervention_probs = intervention_model.predict_proba(patient_processed)[0]

    # Convert predictions to original categories
    predicted_effects = []
    for i, effect in enumerate(side_effect_mlb.classes_):
        if side_effect_pred[0][i] == 1:
            predicted_effects.append(effect)

    if not predicted_effects:
        predicted_effects = ["None"]

    predicted_side_effects = ";".join(predicted_effects)
    predicted_severity = severity_encoder.inverse_transform(severity_pred)[0]
    predicted_timing = timing_encoder.inverse_transform(timing_pred)[0]
    predicted_intervention = intervention_encoder.inverse_transform(intervention_pred)[0]

    # Create a confidence score for each prediction
    side_effect_confidence = {}
    for i, effect in enumerate(side_effect_mlb.classes_):
        if side_effect_pred[0][i] == 1:
            # Get the probability of class 1
            confidence = side_effect_probs[i][1]
            side_effect_confidence[effect] = float(confidence)

    # Get the probability of the predicted class for each other model
    severity_idx = severity_pred[0]
    severity_confidence = float(severity_probs[severity_idx])

    timing_idx = timing_pred[0]
    timing_confidence = float(timing_probs[timing_idx])

    intervention_idx = intervention_pred[0]
    intervention_confidence = float(intervention_probs[intervention_idx])

    # Return the predictions with confidence scores
    return {
        "side_effects": predicted_side_effects,
        "severity": predicted_severity,
        "timing": predicted_timing,
        "intervention_required": predicted_intervention,
        "confidence": {
            "side_effects": side_effect_confidence,
            "severity": severity_confidence,
            "timing": timing_confidence,
            "intervention": intervention_confidence
        }
    }

print("\nThe models have been updated with specialized algorithms!")
print("- Side Effect Type: RandomForest (good for multi-label classification)")
print("- Severity: GradientBoosting (handles ordinal data well)")
print("- Timing: LightGBM (efficient for multi-class problems)")
print("- Intervention Required: LogisticRegression (interpretable for binary tasks)")
print("\nYou can now use predict_side_effects() to get predictions with confidence scores.")


Training machine learning classifiers with specialized algorithms...
Side Effect Type model trained (RandomForest)
Severity model trained (GradientBoosting)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000608 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3198
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 175
[LightGBM] [Info] Start training from score -1.619488
[LightGBM] [Info] Start training from score -1.709258
[LightGBM] [Info] Start training from score -1.699636
[LightGBM] [Info] Start training from score -1.407518
[LightGBM] [Info] Start training from score -1.642478
Timing model trained (LightGBM)
Intervention model trained (LogisticRegression)

Evaluating classifier performance...

Side Effect Type Classification Report (RandomForest):

Chest Pain:
              precision    recal