In [3]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 8.5 MB/s eta 0:00:09
   -- ------------------------------------- 5.0/72.0 MB 16.8 MB/s eta 0:00:04
   ----- ---------------------------------- 10.0/72.0 MB 20.7 MB/s eta 0:00:04
   ------ --------------------------------- 12.1/72.0 MB 21.6 MB/s eta 0:00:03
   ------- -------------------------------- 13.1/72.0 MB 15.8 MB/s eta 0:00:04
   ------- -------------------------------- 14.2/72.0 MB 13.3 MB/s eta 0:00:05
   -------- ------------------------------- 14.9/72.0 MB 11.9 MB/s eta 0:00:05
   ---------- ----------------------------- 19.1/72.0 MB 12.8 MB/s eta 0:00:05
   ----------- ---------------------------- 21.2/72.0 MB 12.7 MB/s eta 0:00:

In [4]:
import pandas as pd

print("Heart columns:")
print(pd.read_csv("datasets/heart.csv").columns)

print("\nDiabetes columns:")
print(pd.read_csv("datasets/diabetes.csv").columns)

print("\nBreast Cancer columns:")
print(pd.read_csv("datasets/breast_cancer.csv").columns)


Heart columns:
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch',
       'exang', 'oldpeak', 'num'],
      dtype='object')

Diabetes columns:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

Breast Cancer columns:
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      

# SECTION 1: IMPORTS

In [5]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# SECTION 2: TRAIN FUNCTION (HIGH ACCURACY)

In [6]:
def train_and_save_model(csv_path, target_col, model_name, drop_cols=None):

    data = pd.read_csv(csv_path)

    if drop_cols:
        data = data.drop(columns=drop_cols, errors="ignore")

In [7]:
def train_and_save_model(csv_path, target_col, model_name, drop_cols=None):

    data = pd.read_csv(csv_path)

    if drop_cols:
        data = data.drop(columns=drop_cols, errors="ignore")

    # Encode categorical columns
    encoders = {}
    for col in data.columns:
        if data[col].dtype == "object":
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
            encoders[col] = le

    # Binary conversion for Heart Disease
    if model_name == "heart":
        data[target_col] = (data[target_col] > 0).astype(int)

    X = data.drop(target_col, axis=1)
    y = data[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    models = {
        "Logistic Regression": (
            LogisticRegression(max_iter=2000),
            {"C": [0.01, 0.1, 1, 10]},
            True
        ),
        "SVM": (
            SVC(probability=True),
            {"C": [0.1, 1, 10], "kernel": ["rbf", "linear"]},
            True
        ),
        "Random Forest": (
            RandomForestClassifier(class_weight="balanced"),
            {
                "n_estimators": [200, 300],
                "max_depth": [None, 10, 20],
                "min_samples_split": [2, 5]
            },
            False
        ),
        "XGBoost": (
            XGBClassifier(
                objective="binary:logistic",
                eval_metric="logloss",
                use_label_encoder=False
            ),
            {
                "n_estimators": [300, 500],
                "max_depth": [3, 5, 7],
                "learning_rate": [0.01, 0.05, 0.1],
                "subsample": [0.8, 1.0],
                "colsample_bytree": [0.8, 1.0]
            },
            False
        )
    }

    best_model = None
    best_acc = 0
    scores = {}

    print(f"\nðŸš€ Training {model_name.upper()} (High Accuracy Mode)")

    for name, (model, params, needs_scaling) in models.items():

        X_tr = X_train_scaled if needs_scaling else X_train
        X_te = X_test_scaled if needs_scaling else X_test

        grid = GridSearchCV(
            model,
            params,
            cv=cv,
            scoring="accuracy",
            n_jobs=-1
        )

        grid.fit(X_tr, y_train)

        preds = grid.predict(X_te)
        acc = accuracy_score(y_test, preds)
        scores[name] = acc

        print(f"{name}: {acc:.4f}")
        print(f"   Best Params â†’ {grid.best_params_}")

        if acc > best_acc:
            best_acc = acc
            best_model = (grid.best_estimator_, needs_scaling)

    # Save best model
    with open(f"{model_name}_model.pkl", "wb") as f:
        pickle.dump({
            "model": best_model[0],
            "scaler": scaler if best_model[1] else None,
            "encoders": encoders
        }, f)

    print("\nðŸ“Š ACCURACY SUMMARY")
    for k, v in scores.items():
        print(f"{k}: {v:.4f}")

    print(f"\nâœ… Saved {model_name}_model.pkl | BEST Accuracy: {best_acc:.4f}")

    return scores

# SECTION 3: Save the Model and Checking the best Accuracy

In [8]:
train_and_save_model(
    csv_path="datasets/heart.csv",
    target_col="num",
    model_name="heart"
)

train_and_save_model(
    csv_path="datasets/diabetes.csv",
    target_col="Outcome",
    model_name="diabetes"
)

train_and_save_model(
    csv_path="datasets/breast_cancer.csv",
    target_col="diagnosis",
    model_name="cancer",
    drop_cols=["id", "Unnamed: 32"]
)



ðŸš€ Training HEART (High Accuracy Mode)
Logistic Regression: 0.8098
   Best Params â†’ {'C': 0.1}
SVM: 0.8370
   Best Params â†’ {'C': 1, 'kernel': 'rbf'}
Random Forest: 0.8098
   Best Params â†’ {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: 0.8261
   Best Params â†’ {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0}

ðŸ“Š ACCURACY SUMMARY
Logistic Regression: 0.8098
SVM: 0.8370
Random Forest: 0.8098
XGBoost: 0.8261

âœ… Saved heart_model.pkl | BEST Accuracy: 0.8370

ðŸš€ Training DIABETES (High Accuracy Mode)
Logistic Regression: 0.7078
   Best Params â†’ {'C': 0.1}
SVM: 0.8377
   Best Params â†’ {'C': 1, 'kernel': 'rbf'}
Random Forest: 0.8766
   Best Params â†’ {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: 0.8831
   Best Params â†’ {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}

ðŸ“Š ACCURACY SUMMARY
Logistic Regression: 0.7078
SVM: 0.8377
Random Forest: 0.8766
XGBoost: 0.8831

âœ… Saved diabetes_model.pkl | BEST Accuracy: 0.8831

ðŸš€ Training CANCER (High Accuracy Mode)
Logistic Regression: 0.9649
   Best Params â†’ {'C': 1}
SVM: 0.9737
   Best Params â†’ {'C': 10, 'kernel': 'rbf'}
Random Forest: 0.9737
   Best Params â†’ {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: 0.9737
   Best Params â†’ {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.8}

ðŸ“Š ACCURACY SUMMARY
Logistic Regression: 0.9649
SVM: 0.9737
Random Forest: 0.9737
XGBoost: 0.9737

âœ… Saved cancer_model.pkl | BEST Accuracy: 0.9737


{'Logistic Regression': 0.9649122807017544,
 'SVM': 0.9736842105263158,
 'Random Forest': 0.9736842105263158,
 'XGBoost': 0.9736842105263158}

In [9]:
# âœ… Saved heart_model.pkl | BEST Accuracy: 0.8370
# âœ… Saved diabetes_model.pkl | BEST Accuracy: 0.8831
# âœ… Saved cancer_model.pkl | BEST Accuracy: 0.9737