In [7]:
!pip install -U imbalanced-learn




In [8]:
#!/usr/bin/env python3
"""
liver_disease_improved.py

A self-contained script that:
1. Loads & cleans the liver disease dataset
2. Splits into train & test (no leakage)
3. Imputes missing values (median) & scales features (MinMax)
4. Manually oversamples the minority class in train
5. Runs RandomizedSearchCV on XGBoost (n_jobs=1 to avoid pickling errors)
6. Fits a final XGBoost model with early stopping
7. Evaluates on the hold-out test set
8. Saves the model, imputer, and scaler for later use
"""

import os
import warnings
import joblib

import pandas as pd
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV,
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
)
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")


def main():
    # 1) Load & clean
    DATA_PATH = "../input/liver-disease-patient-dataset/Liver Patient Dataset (LPD)_train.csv"
    df = pd.read_csv(DATA_PATH, encoding="unicode_escape")
    df = df.fillna(method="bfill").drop_duplicates()
    df["Result"] = df["Result"].map({1: 0, 2: 1})
    df["Gender of the patient"] = df["Gender of the patient"].map({"Female": 0, "Male": 1})

    # 2) Train/test split (no leakage!)
    X = df.drop("Result", axis=1)
    y = df["Result"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=42
    )

    # 3) Median imputation + MinMax scaling (fit on train only)
    imputer = SimpleImputer(strategy="median")
    X_train_imp = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index,
    )
    X_test_imp = pd.DataFrame(
        imputer.transform(X_test),
        columns=X_test.columns,
        index=X_test.index,
    )

    scaler = MinMaxScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_imp),
        columns=X_train_imp.columns,
        index=X_train_imp.index,
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test_imp),
        columns=X_test_imp.columns,
        index=X_test_imp.index,
    )

    # 4) Manual oversampling of minority class in TRAIN
    train_bal = pd.concat([X_train_scaled, y_train.rename("Result")], axis=1)
    majority = train_bal[train_bal.Result == 0]
    minority = train_bal[train_bal.Result == 1]
    minority_upsampled = resample(
        minority,
        replace=True,
        n_samples=len(majority),
        random_state=42,
    )
    train_balanced = pd.concat([majority, minority_upsampled])
    X_train_bal = train_balanced.drop("Result", axis=1)
    y_train_bal = train_balanced["Result"]

    # 5) Hyperparameter search on XGBoost (n_jobs=1 avoids pickling issues)
    base_xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
    param_dist = {
        "n_estimators":     [100, 200, 500],
        "max_depth":        [3, 5, 7],
        "learning_rate":    [0.01, 0.1, 0.2],
        "subsample":        [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
    }
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    search = RandomizedSearchCV(
        base_xgb,
        param_distributions=param_dist,
        n_iter=20,
        scoring="roc_auc",
        cv=cv,
        n_jobs=1,             # ← avoid BrokenProcessPool errors
        verbose=1,
        random_state=42,
    )
    print("Starting hyperparameter search…")
    search.fit(X_train_bal, y_train_bal)
    print(f"Best CV ROC-AUC: {search.best_score_:.4f}")
    print("Best params:", search.best_params_)

    # 6) Final model with early stopping
    best_params = search.best_params_
    final_model = XGBClassifier(
        **best_params,
        use_label_encoder=False,
        eval_metric="logloss",
        early_stopping_rounds=10,
        random_state=42,
    )
    final_model.fit(
        X_train_bal,
        y_train_bal,
        eval_set=[(X_test_scaled, y_test)],
        verbose=False,
    )

    # 7) Evaluate on test set
    y_pred = final_model.predict(X_test_scaled)
    y_proba = final_model.predict_proba(X_test_scaled)[:, 1]
    print("\n=== Test Set Performance ===")
    print("Accuracy       :", accuracy_score(y_test, y_pred))
    print("ROC-AUC        :", roc_auc_score(y_test, y_proba))
    print("Confusion Mat. :\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # 8) Save artifacts
    ARTIFACT_DIR = "artifacts"
    os.makedirs(ARTIFACT_DIR, exist_ok=True)
    joblib.dump(final_model, os.path.join(ARTIFACT_DIR, "xgb_model.joblib"))
    joblib.dump(imputer, os.path.join(ARTIFACT_DIR, "imputer.joblib"))
    joblib.dump(scaler, os.path.join(ARTIFACT_DIR, "scaler.joblib"))
    print(f"\nSaved model, imputer & scaler → {ARTIFACT_DIR}/")


if __name__ == "__main__":
    main()


Starting hyperparameter search…
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best CV ROC-AUC: 0.9999
Best params: {'subsample': 1.0, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.2, 'colsample_bytree': 0.8}

=== Test Set Performance ===
Accuracy       : 0.9950235725510739
ROC-AUC        : 0.9995433739472558
Confusion Mat. :
 [[2714    7]
 [  12 1085]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2721
           1       0.99      0.99      0.99      1097

    accuracy                           1.00      3818
   macro avg       0.99      0.99      0.99      3818
weighted avg       1.00      1.00      1.00      3818


Saved model, imputer & scaler → artifacts/
