<a href="https://www.kaggle.com/code/colewelkins/cardiovascular-example?scriptVersionId=143346666" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, roc_curve
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Read the processed cardiovascular CSV and expand the categorical
# 'bp_category' column into indicator columns (first level dropped).
data_path = "/kaggle/input/cardiovascular-disease/cardio_data_processed.csv"
data = pd.get_dummies(
    pd.read_csv(data_path),
    columns=['bp_category'],
    drop_first=True,
)

In [None]:
# Separate the 'cardio' target from the features and hold out 20% of the
# rows for validation (fixed seed so the split is repeatable).
y = data['cardio']
X = data.drop(columns='cardio')

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Remove 'bp_category_encoded' from both splits when it is present;
# errors='ignore' makes the drop a no-op when the column is absent.
X_train = X_train.drop(columns=['bp_category_encoded'], errors='ignore')
X_val = X_val.drop(columns=['bp_category_encoded'], errors='ignore')

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Candidate classifiers for the benchmark, keyed by display name.
# Every estimator that draws random numbers is seeded with the same value as
# the train/validation split so the results table is reproducible across runs.
# SVC needs probability=True because the evaluation loop calls predict_proba.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42)
}

In [None]:
# Fit each candidate on the scaled training data and score it on the scaled
# validation data: accuracy from the predicted labels, ROC AUC from the
# predicted probability of the second class (column 1 of predict_proba).
results = []

for name, model in models.items():
    fitted = model.fit(X_train_scaled, y_train)
    y_pred = fitted.predict(X_val_scaled)
    positive_scores = fitted.predict_proba(X_val_scaled)[:, 1]
    results.append([
        name,
        accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, positive_scores),
    ])

# One row per model; shown as the cell output.
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "ROC AUC"])
results_df

## 3. Data Cleaning and Preprocessing

In [None]:
# Check for missing values.
# A bare expression is only rendered when it is the last statement of a cell,
# so the per-column null counts are printed explicitly to make them visible.
missing_data = data.isnull().sum()
print(missing_data)

# Handling outliers (Example: 'age_years')
# The series is passed as `x=` rather than positionally: a bare positional
# argument is interpreted differently across seaborn releases.
sns.boxplot(x=data['age_years'])
plt.title('Boxplot of Age in Years')
plt.show()

## 4. Exploratory Data Analysis (Deep Visual Analysis)

In [None]:
# Univariate Analysis
# One histogram (with KDE overlay, 30 bins) per entry:
# (column, extra histplot options, title, x-axis label).
histogram_specs = [
    (
        'age_years',
        {},
        'Distribution of Age in Years',
        'Age (years)',
    ),
    (
        'ap_hi',
        {'color': 'skyblue'},
        'Distribution of Systolic Blood Pressure',
        'Systolic Blood Pressure (mmHg)',
    ),
]

for column, extra_options, title, x_label in histogram_specs:
    sns.histplot(data[column], kde=True, bins=30, **extra_options)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Bivariate Analysis
# Box plot of age grouped by the 'cardio' target, drawn on an explicit axes.
fig, ax = plt.subplots()
sns.boxplot(x=data['cardio'], y=data['age_years'], palette='pastel', ax=ax)
ax.set_title('Age vs Cardiovascular Disease')
ax.set_xlabel('Cardiovascular Disease (0: No, 1: Yes)')
ax.set_ylabel('Age (years)')
plt.show()

In [None]:
# Correlation Analysis
# Work on a copy so the label encoding below never modifies `data`.
data_corr = data.copy()

# Replace every object-dtype column with integer codes from a fresh
# LabelEncoder before computing correlations.
object_columns = data_corr.select_dtypes(include=['object']).columns
for column in object_columns:
    data_corr[column] = LabelEncoder().fit_transform(data_corr[column])

correlation_matrix = data_corr.corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

## 5. Feature Engineering and Selection

In [None]:
# Fit a random forest on the unscaled training features and plot its
# feature_importances_ in ascending order. Seeded so the ranking is repeatable.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
importances = rf.feature_importances_

# Labels must come from the frame the forest was trained on. 'X' can still
# contain 'bp_category_encoded', which is dropped from X_train when present;
# using X.columns would then attach the wrong name to the bars.
features = X_train.columns
indices = np.argsort(importances)

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

## 6. Model Selection and Benchmarking

In [None]:
# Baseline: gradient boosting with default hyperparameters, trained on the
# unscaled training features and summarised on the validation split.
best_model = GradientBoostingClassifier().fit(X_train, y_train)
y_pred = best_model.predict(X_val)

print(classification_report(y_true=y_val, y_pred=y_pred))

## 7. Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold grid search over 3 x 3 x 3 gradient-boosting settings,
# selecting by accuracy.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5]
}

# The base estimator is seeded so the search is repeatable, and n_jobs=-1
# runs the candidate fits in parallel on all available cores.
grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# With the default refit=True the search has already retrained the winning
# configuration on the full training set, so it is reused instead of paying
# for another identical fit.
best_model = grid_search.best_estimator_

## 8. Model Evaluation

In [None]:
# Validation predictions from the current best_model: hard labels and the
# probability of the second class (column 1 of predict_proba).
y_pred_val = best_model.predict(X_val)
y_pred_prob = best_model.predict_proba(X_val)[:, 1]

# Confusion matrix
cm = confusion_matrix(y_val, y_pred_val)
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', ax=ax_cm)
ax_cm.set_xlabel('Predicted label')
ax_cm.set_ylabel('True label')
ax_cm.set_title('Confusion Matrix')
plt.show()

# ROC curve, with the diagonal drawn as the random-guess reference.
fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, label='ROC curve')
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random guess')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")
plt.show()

# Scalar metrics and the per-class report.
validation_accuracy = accuracy_score(y_val, y_pred_val)
validation_auc = roc_auc_score(y_val, y_pred_prob)
validation_report = classification_report(y_val, y_pred_val)

print("Accuracy on validation set:", validation_accuracy)
print("ROC AUC on validation set:", validation_auc)
print("\nClassification Report:\n", validation_report)

## 9. Model Interpretability

In [None]:
import shap

# Compute SHAP values for the training features with a tree explainer built
# on best_model, then show them as a bar-type summary plot.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_train)

shap.summary_plot(
    shap_values,
    features=X_train,
    plot_type="bar",
)

## 10. Addressing Class Imbalance

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.base import clone

# Bar chart of how many rows fall in each 'cardio' class. The series is
# passed as `x=` so it is counted per category; a bare positional argument
# is not treated as `x` by current seaborn releases.
sns.countplot(x=data['cardio'])
plt.title('Class Distribution')
plt.show()

# Oversample the training split only (the validation split is untouched);
# seeded so the synthetic samples are repeatable.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train an unfitted copy with the same hyperparameters instead of refitting
# best_model in place, so the tuned model used by the other cells is not
# silently replaced by this experiment.
resampled_model = clone(best_model)
resampled_model.fit(X_resampled, y_resampled)
y_pred_resampled = resampled_model.predict(X_val)
print("Accuracy on validation set after addressing class imbalance:", accuracy_score(y_val, y_pred_resampled))

## 11. Model Calibration

In [None]:
# Wrap best_model in a sigmoid calibrator using 5-fold cross-validation
# and fit it on the training split.
calibrated = CalibratedClassifierCV(best_model, method='sigmoid', cv=5)
calibrated.fit(X_train, y_train)

# Hard-label accuracy of the calibrated model on the validation split.
y_pred_calibrated = calibrated.predict(X_val)
calibrated_accuracy = accuracy_score(y_val, y_pred_calibrated)
print("Accuracy after calibration:", calibrated_accuracy)

# Histogram of the calibrated probabilities for the second class.
probs = calibrated.predict_proba(X_val)[:, 1]
fig, ax = plt.subplots()
ax.hist(probs, bins=50, histtype="step", lw=2)
ax.set_title("Predicted Probabilities After Calibration")
plt.show()

## 12. Conclusion and Future Work
TODO

## 13. References
1. [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/45/heart+disease)
2. [Kaggle Heart Disease Dataset](https://www.kaggle.com/datasets/yasserh/heart-disease-dataset)