In [5]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 1) Load and prepare data
df = pd.read_csv('data/cleaned_Medicaldataset.csv')

# Encode the text label as 0/1. Series.map() produces NaN for any value that
# is not a key of the mapping, so fail here with a clear message instead of
# letting NaN labels flow into the stratified split and the model fit.
df['Result_binary'] = df['Result'].map({'negative': 0, 'positive': 1})
_unmapped_rows = df['Result_binary'].isna()
if _unmapped_rows.any():
    raise ValueError(
        "Column 'Result' contains values other than 'negative'/'positive': "
        f"{sorted(df.loc[_unmapped_rows, 'Result'].astype(str).unique())}"
    )

# Derived feature: systolic minus diastolic blood pressure.
df['pulse_pressure'] = df['Systolic blood pressure'] - df['Diastolic blood pressure']

# Columns fed to the model, in the order the saved model will expect them
# (the model is trained on a bare array, so column names are not stored).
features = [
    'Age', 'Heart rate', 'Systolic blood pressure', 'Diastolic blood pressure',
    'Blood sugar', 'CK-MB', 'Troponin', 'pulse_pressure'
]
X = df[features].values
y = df['Result_binary'].values

# 2) Hold out 20% of the rows as a test set, stratified on the label so both
#    partitions keep the same class proportions; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3) Fit a shallow, regularized decision tree on the training partition:
#    depth capped at 4 and at least 10 samples required in every leaf.
base_tree = DecisionTreeClassifier(
    random_state=42, max_depth=4, min_samples_leaf=10
).fit(X_train, y_train)

# 4) Calibrate the tree's probabilities with Platt scaling (sigmoid) using
#    5-fold cross-validation on the training partition.
#    NOTE: with an integer `cv`, the calibrator fits its own clones of
#    `base_tree` on each fold; the fit performed in step 3 is not reused.
calibrated_tree = CalibratedClassifierCV(
    estimator=base_tree, method='sigmoid', cv=5
).fit(X_train, y_train)

# 5) Save *this* calibrated model (overwrites the old file)
model_path = 'outputs/models/decision_tree_model.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(calibrated_tree, model_path)
print(f"✅ Calibrated Decision Tree saved to {model_path}")


✅ Calibrated Decision Tree saved to outputs/models/decision_tree_model.pkl
