In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cardio CSV from the working directory into `df`,
# then preview the first rows as the cell's output.
df = pd.read_csv(filepath_or_buffer="cardio_train_clean_scaled.csv")
df.head()

## Model Training and Evaluation

In [None]:
from kmodes.kmodes import KModes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier 
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Prepare data for modeling
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: the `cardio` column.
y = df['cardio']
# Feature matrix: every column except the target and the two excluded
# columns (`weight`, `BMI_category_num`).
X = df.drop(columns=['cardio', 'weight', 'BMI_category_num'])

# Hold out 20% of the rows for testing. The split is stratified on `y` so both
# partitions keep the same class proportions, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

In [None]:

# Estimator and search space for GridSearchCV, both stored in dicts keyed by
# the model's display name so the tuning cell can look them up via `model_name`.
model_name = 'XGBoost'

model = {
    model_name: XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
    ),
}

# 2 x 3 x 2 = 12 hyperparameter combinations to evaluate.
param = {
    model_name: dict(
        n_estimators=[100, 200],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1],
    ),
}

In [None]:
print(f"\n--- Tuning and Evaluating {model_name} ---")

# Exhaustive search over the parameter grid: 5-fold cross-validation on the
# training split, candidates ranked by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=model[model_name],
    param_grid=param[model_name],
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search for evaluation and later export.
best_model = grid_search.best_estimator_
print(f"Best Parameters for {model_name}: {grid_search.best_params_}")

# Hard class predictions, plus the probability of the second class
# (column index 1), on the held-out test split.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Label-based metrics use the hard predictions; AUC uses the probabilities.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# Report every metric rounded to four decimal places.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

In [None]:
from sklearn.preprocessing import MinMaxScaler
import joblib

# Persist the tuned classifier together with a *fitted* scaler.
#
# A freshly constructed MinMaxScaler has learned no per-feature min/max, so
# pickling it as-is produces a 'scaler.pkl' whose transform() raises
# NotFittedError as soon as it is loaded and used. Fit it first, on the
# training features only so that nothing from the test split leaks into it.
#
# NOTE(review): the input file name ("..._clean_scaled.csv") suggests the
# features were already scaled in an upstream preprocessing step. If so, the
# scaler fitted on the raw data there is the one that should be saved here
# (a scaler fitted on already-scaled data will not map raw inputs correctly)
# -- confirm against the preprocessing notebook.
scaler = MinMaxScaler().fit(X_train)

joblib.dump(best_model, 'xgboost_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Model and scaler saved successfully as 'xgboost_model.pkl' and 'scaler.pkl'!")

In [None]:
# Optional sanity check: read both artefacts back from disk (model first,
# then scaler) to confirm that the files written above can be deserialized.
loaded_model, loaded_scaler = (
    joblib.load(artefact_path)
    for artefact_path in ('xgboost_model.pkl', 'scaler.pkl')
)
print("Verification: Model and scaler loaded successfully.")