In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset
df = pd.read_csv("cardio_train_clean_scaled.csv")
df.head()

Unnamed: 0,gender,height,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,1.0,0.453333,0.277778,0.444444,0.0,0.0,0.0,0.0,1.0,0.0,0.571429,0.181722
1,0.0,0.373333,0.444444,0.555556,1.0,0.0,0.0,0.0,1.0,1.0,0.714286,0.460569
2,0.0,0.433333,0.388889,0.333333,1.0,0.0,0.0,0.0,0.0,1.0,0.628571,0.21487
3,1.0,0.46,0.5,0.666667,0.0,0.0,0.0,0.0,1.0,1.0,0.514286,0.326806
4,0.0,0.373333,0.222222,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.514286,0.204185


## Model Training and Evaluation

In [3]:
from kmodes.kmodes import KModes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier 
import warnings

warnings.filterwarnings('ignore')

In [4]:
# Prepare data for modeling
from sklearn.model_selection import train_test_split

In [5]:
X = df.drop(['cardio'], axis=1)
y = df['cardio']

# Split data into training and testing sets (80:20 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

Training set size: 55572, Testing set size: 13893


In [6]:

# XGBoost model and hyperparameter grids for GridSearchCV
model_name = 'XGBoost'
model = {
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}
param = {
    'XGBoost': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1]
    }
}

In [7]:
print(f"\n--- Tuning and Evaluating {model_name} ---")

grid_search = GridSearchCV(
    model[model_name],
    param[model_name],
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print(f"Best Parameters for {model_name}: {grid_search.best_params_}")

y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")    
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")  


--- Tuning and Evaluating XGBoost ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Accuracy: 0.7355
Precision: 0.7565
Recall: 0.6933
F1-Score: 0.7235
AUC: 0.8006


In [8]:
import joblib
joblib.dump(best_model, 'xgboost_model.pkl')
print("Model saved successfully as 'xgboost_model.pkl'!")

Model saved successfully as 'xgboost_model.pkl'!


In [9]:
# Optional: Verify load
loaded_model = joblib.load('xgboost_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')
print("Verification: Model and scaler loaded successfully.")

FileNotFoundError: [Errno 2] No such file or directory: 'scaler.pkl'