In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cardio dataset from the working directory. The file name suggests the
# data is already cleaned and scaled — that is not verified in this notebook.
df = pd.read_csv(filepath_or_buffer="cardio_train_clean_scaled.csv")
# Keep the preview as the cell's last expression so the notebook renders it.
df.head(n=5)

## Model Training and Evaluation

In [None]:
from kmodes.kmodes import KModes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import warnings

# Ignore every warning from this point on so cell output stays compact.
# NOTE(review): a blanket 'ignore' also hides potentially useful diagnostics
# (e.g. deprecation or convergence warnings) — consider narrowing the filter
# to specific categories or modules.
warnings.filterwarnings('ignore')

In [None]:
# Prepare data for modeling
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the target; the target is the 'cardio' column.
X = df.drop(columns='cardio')
y = df['cardio']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions alike in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

In [None]:
# Candidate classifiers, keyed by the display name used in reports and plots.
# Each estimator is created with random_state=42 for repeatability.
models = dict([
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Multilayer Perceptron', MLPClassifier(max_iter=1000, random_state=42)),
    ('XGBoost', XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)),
])

In [None]:
# Hyperparameter grids for GridSearchCV. The outer keys must match the keys of
# `models`; each inner mapping lists the candidate values per estimator argument.
params = {
    # 2 criteria x 3 depths x 3 leaf sizes
    'Decision Tree': dict(
        criterion=['gini', 'entropy'],
        max_depth=[5, 10, 15],
        min_samples_leaf=[1, 5, 10],
    ),
    # 2 forest sizes x 2 depths x 2 leaf sizes
    'Random Forest': dict(
        n_estimators=[100, 200],
        max_depth=[10, 20],
        min_samples_leaf=[1, 5],
    ),
    # 2 architectures x 2 activations x 1 solver x 2 regularisation strengths
    'Multilayer Perceptron': dict(
        hidden_layer_sizes=[(50, 50), (100,)],
        activation=['tanh', 'relu'],
        solver=['adam'],
        alpha=[0.0001, 0.05],
    ),
    # 2 ensemble sizes x 3 depths x 2 learning rates
    'XGBoost': dict(
        n_estimators=[100, 200],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1],
    ),
}

In [None]:
# Run GridSearchCV and evaluate each model

def tune_and_evaluate_models(models, params, X_train, y_train, X_test, y_test,
                             cv=5, scoring='accuracy'):
    """Grid-search each model, score the winner on the test set, and summarise.

    For every entry in ``models`` a ``GridSearchCV`` is fitted on the training
    data using the grid stored under the same key in ``params``. The best
    estimator found is then evaluated on the test data: a classification
    report is printed, a ROC curve is drawn, and accuracy, precision, recall,
    F1 and ROC AUC are recorded. A summary table of all models is printed at
    the end and also returned.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator. Each estimator must
        provide ``predict`` and ``predict_proba``.
    params : dict
        Maps the same display names to the parameter grid handed to
        ``GridSearchCV``.
    X_train, y_train
        Training features and labels used for the grid search.
    X_test, y_test
        Held-out features and labels used only for the final evaluation.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``GridSearchCV``.
    scoring : str or callable, default 'accuracy'
        Forwarded to ``GridSearchCV``; determines which candidate is "best".

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall``, ``F1-Score``, ``AUC`` and
        ``Best CV Score``. Empty when ``models`` is empty.

    Raises
    ------
    KeyError
        If any model name has no grid in ``params``. This is checked before
        any fitting starts so a typo does not surface only after earlier,
        expensive searches have already completed.

    Notes
    -----
    The positive-class score is read from column 1 of ``predict_proba`` and
    precision/recall/F1 use scikit-learn's default averaging, so a binary
    target is assumed.
    """
    # Fail fast: every model needs a grid before any (slow) search begins.
    missing_grids = [name for name in models if name not in params]
    if missing_grids:
        raise KeyError(f"No hyperparameter grid in `params` for: {missing_grids}")

    results = []

    for model_name, estimator in models.items():
        print(f"\n--- Tuning and Evaluating {model_name} ---")

        grid_search = GridSearchCV(
            estimator,
            params[model_name],
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        print(f"Best Parameters for {model_name}: {grid_search.best_params_}")

        # Hard labels feed the threshold metrics; probabilities feed ROC/AUC.
        y_pred = best_model.predict(X_test)
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_test, y_pred_proba)

        results.append({
            'Model': model_name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1-Score': f1_score(y_test, y_pred),
            'AUC': auc,
            'Best CV Score': grid_search.best_score_
        })

        print(f"\n--- {model_name} Performance on Test Set ---")
        print(classification_report(y_test, y_pred))

        # One explicit figure per model, closed after display so figures do
        # not accumulate across the loop.
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})')
        ax.plot([0, 1], [0, 1], 'k--')  # chance-level reference diagonal
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curve for {model_name}')
        ax.legend(loc='best')
        plt.show()
        plt.close(fig)

    # Final Results Summary
    print("\n--- Summary of Model Performance ---")
    results_df = pd.DataFrame(results)
    print(results_df.to_string())

    # Hand the table back so callers can keep and compare runs.
    return results_df

In [None]:
# Tune and evaluate every candidate on the current train/test split.
# The function prints its own summary; the trailing semicolon keeps the cell
# from echoing any return value.
tune_and_evaluate_models(
    models=models,
    params=params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
);

In [None]:
# Second experiment: restrict the features to five selected columns and keep
# 'cardio' as the target. Note that this rebinds X and y from the first run.
X = df.loc[:, ['BMI', 'ap_lo', 'ap_hi', 'age_years', 'cholesterol']]
y = df['cardio']

In [None]:
# Hold out 20% of the rows for testing, stratified on the target, using the
# same seed (42) as the earlier split so the run is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

In [None]:
# Repeat the tuning and evaluation on the current (reduced-feature) split.
# The function prints its own summary; the trailing semicolon keeps the cell
# from echoing any return value.
tune_and_evaluate_models(
    models=models,
    params=params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
);