In [None]:
import kagglehub

# Kaggle handle of the dataset this notebook works with.
dataset_handle = "sulianova/cardiovascular-disease-dataset"

# Download the latest version of the dataset and keep the returned location
# in `path` so later cells can refer to it.
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
# Read the semicolon-separated training file from the directory returned by
# the kagglehub download cell (`path`) rather than a hard-coded Kaggle-only
# location, so the notebook also runs outside the Kaggle environment.
df = pd.read_csv(Path(path) / "cardio_train.csv", sep=";")


In [None]:

# Quick sanity check of the loaded frame: its dimensions, then the first rows.
print("Shape:", df.shape)
# Use the notebook's rich display so the preview renders as a table instead of
# a wrapped plain-text dump.
display(df.head())

In [None]:
# Build an exploratory profiling report for the loaded frame, titled "DF".
# NOTE(review): the pandas_profiling package appears to have been renamed
# upstream (ydata-profiling) — confirm the runtime still provides this import.
from pandas_profiling import ProfileReport
df_profile = ProfileReport(df, title="DF")


In [None]:
# Render the profiling report object inline in the notebook output.
display(df_profile)

In [None]:
# Keep only rows with plausible blood-pressure readings, add the derived
# `ap_diff` feature and drop the row identifier.
#   * ap_hi must not be lower than ap_lo
#   * ap_hi must lie in [90, 240] and ap_lo in [60, 200] (both bounds inclusive)
# NOTE(review): ap_hi / ap_lo are presumably systolic / diastolic pressure —
# confirm against the dataset description.
plausible_bp = (
    (df["ap_hi"] >= df["ap_lo"])
    & df["ap_hi"].between(90, 240)
    & df["ap_lo"].between(60, 200)
)
df = (
    df.loc[plausible_bp]
    .assign(ap_diff=lambda frame: frame["ap_hi"] - frame["ap_lo"])
    # errors="ignore" keeps the cell re-runnable: on a second execution `id`
    # is already gone and a plain drop would raise KeyError.
    .drop(columns=["id"], errors="ignore")
)


In [None]:
# Show the (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Split the frame into the feature matrix and the "cardio" target column.
X = df.copy()
y = X.pop("cardio")

# Hold out 20% of the rows for testing; the split is seeded and stratified on
# the target so both parts keep the same class proportions.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Baseline XGBoost classifier trained on the training split.
model = XGBClassifier(
    n_estimators=350,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,          # row subsampling per tree
    colsample_bytree=0.9,   # column subsampling per tree
    eval_metric='logloss',
    # Row/column subsampling is stochastic: pin the seed explicitly, matching
    # the seed used for the train/test split and the recall-oriented model.
    random_state=42,
)

model.fit(X_train, y_train)

In [None]:
# Evaluate the baseline XGBoost model on the held-out test split.
pred = model.predict(X_test)                      # predicted class labels
pred_proba = model.predict_proba(X_test)[:, 1]    # probability of class 1

print("Accuracy:", accuracy_score(y_test, pred))
print("F1:", f1_score(y_test, pred))
# ROC-AUC measures ranking quality, so it must be computed from the predicted
# probabilities; passing hard labels collapses the curve to a single point and
# makes the number incomparable with the other models in this notebook.
print("ROC-AUC:", roc_auc_score(y_test, pred_proba))
print("\nClassification report:\n", classification_report(y_test, pred))

In [None]:
# LightGBM classifier trained and evaluated on the same split as the other models.
lgb = LGBMClassifier(
    n_estimators=400,
    max_depth=-1,           # no explicit depth limit
    learning_rate=0.05,
    subsample=0.9,
    # LightGBM only performs row subsampling (bagging) when subsample_freq > 0;
    # without it the subsample=0.9 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.9,
    # Bagging and column sampling are stochastic: pin the seed explicitly.
    random_state=42,
)
lgb.fit(X_train, y_train)

pred_lgb = lgb.predict(X_test)                  # predicted class labels
proba_lgb = lgb.predict_proba(X_test)[:, 1]     # probability of class 1

print("Accuracy:", accuracy_score(y_test, pred_lgb))
print("F1:", f1_score(y_test, pred_lgb))
print("ROC-AUC:", roc_auc_score(y_test, proba_lgb))
print(classification_report(y_test, pred_lgb))

In [None]:
print("\n=== CatBoost ===")

# CatBoost classifier with training output silenced (verbose=0).
cat = CatBoostClassifier(iterations=400, depth=6, learning_rate=0.05, verbose=0)
cat.fit(X_train, y_train)

# Class labels for the threshold-based metrics, class-1 probabilities for ROC-AUC.
pred_cat = cat.predict(X_test)
proba_cat = cat.predict_proba(X_test)[:, 1]

# Print each metric with its label, in the same order as for the other models.
for label, metric, scores in (
    ("Accuracy:", accuracy_score, pred_cat),
    ("F1:", f1_score, pred_cat),
    ("ROC-AUC:", roc_auc_score, proba_cat),
):
    print(label, metric(y_test, scores))
print(classification_report(y_test, pred_cat))

In [None]:
# scale_pos_weight compensates for class imbalance: ratio of negative to
# positive training samples. The counts use the vectorised Series sum instead
# of iterating the Series element by element with the built-in sum().
N0 = int((y_train == 0).sum())
N1 = int((y_train == 1).sum())
scale_pos_weight = N0 / N1

# Recall-oriented XGBoost model: same family as the baseline, with the
# positive class re-weighted and a fixed seed.
xgb_recall = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    random_state=42
)

# Training
xgb_recall.fit(X_train, y_train)

# Predictions with a decision threshold of 0.45 instead of the default 0.5:
# lowering the threshold labels more samples as positive.
# NOTE(review): how 0.45 was chosen is not shown here — confirm it was not
# tuned on the test split.
y_proba = xgb_recall.predict_proba(X_test)[:, 1]
threshold = 0.45
y_pred = (y_proba >= threshold).astype(int)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("Recall (класс 1):", recall_score(y_test, y_pred))
print("Precision (класс 1):", precision_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

## Что это значит для медицинского исследования

Мы лучше выявляем больных людей, что критично для сердечно-сосудистых заболеваний.

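Число ложноположительных результатов (FP) немного выросло, но это менее критично, чем пропуск больного (FN): здоровый человек лишь пройдёт дополнительное обследование, зато больных мы пропускаем реже.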

F1 остался стабильным → сбалансированное улучшение Recall без сильной потери Precision.