# Improved Cardiovascular Disease Prediction Model

This notebook trains a more robust model using XGBoost and improved data preprocessing techniques.

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance

In [None]:
# Load the raw dataset (semicolon-delimited CSV, path relative to the
# working directory).
try:
    df = pd.read_csv('cardio_train.csv', sep=';')
except FileNotFoundError:
    # Report the problem, then re-raise: if execution continued, `df` would be
    # undefined and every later cell would fail with a misleading NameError
    # instead of pointing at the missing file.
    print("Error: cardio_train.csv not found.")
    raise
else:
    print(f"Original Data Shape: {df.shape}")

## 1. Data Cleaning & Preprocessing

In [None]:
# Drop the identifier column when the dataset has one
if 'id' in df.columns:
    df = df.drop(columns='id')

# Discard exact duplicate rows
df = df.drop_duplicates()

# Express age in whole years: raw `age` divided by 365.25, truncated to int
# (the raw column is presumably stored in days -- confirm against the data source)
age_in_years = df['age'] / 365.25
df['age_years'] = age_in_years.astype(int)

print(f"Shape after basic cleaning: {df.shape}")

## 2. Outlier Removal
Filtering based on realistic medical ranges:
- Systolic BP: 60-240
- Diastolic BP: 30-160 (and strictly below the systolic reading)
- Height: 120-215 cm
- Weight: 40-180 kg

In [None]:
# Blood-pressure sanity filter: keep rows whose systolic (`ap_hi`) and
# diastolic (`ap_lo`) readings fall inside the accepted inclusive bounds and
# where the systolic value is strictly greater than the diastolic one.
systolic_ok = df['ap_hi'].between(60, 240)
diastolic_ok = df['ap_lo'].between(30, 160)
mask_bp = systolic_ok & diastolic_ok & (df['ap_hi'] > df['ap_lo'])

df = df[mask_bp]

# Height / weight sanity filter (inclusive bounds), applied to the rows that
# survived the blood-pressure filter.
height_ok = df['height'].between(120, 215)
weight_ok = df['weight'].between(40, 180)
mask_hw = height_ok & weight_ok

df = df[mask_hw]

print(f"Shape after outlier removal: {df.shape}")

## 3. Feature Engineering

In [None]:
# Body-mass index: weight divided by the square of height, with height
# rescaled by 1/100 first (cm -> m).
height_squared = df['height'].div(100).pow(2)
df['BMI'] = df['weight'].div(height_squared)

# Pulse pressure: systolic reading minus diastolic reading
df['pulse_pressure'] = df['ap_hi'].sub(df['ap_lo'])

# Preview the first rows, including the two derived columns
df.head()

## 4. Train-Test Split

In [None]:
target = 'cardio'
# Every column is a feature except the label and the raw 'age' column
# (the derived 'age_years' column is used in its place).
excluded_columns = (target, 'age')
features = [col for col in df.columns if col not in excluded_columns]

X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling: statistics are learned from the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training Set Shape:", X_train_scaled.shape)
print("Test Set Shape:", X_test_scaled.shape)

## 5. Model Training (XGBoost)

In [None]:
# Hyperparameters for the gradient-boosted classifier, gathered in one place
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": 'logloss',
    "n_jobs": -1,
}

# Build the classifier and fit it on the scaled training split
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train_scaled, y_train)
print("Model trained successfully.")

## 6. Evaluation

In [None]:
# Score the held-out split: overall accuracy plus the per-class report
y_pred = model.predict(X_test_scaled)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_test, y_pred)

print(f"Test Accuracy: {acc*100:.2f}%")
print("\nClassification Report:\n", report)

In [None]:
# Confusion matrix of test-set predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

## 7. Feature Importance

In [None]:
# Plot feature importance labelled with the real column names.
#
# The classifier is fitted on the array produced by the scaler, so the booster
# only knows positional feature keys ("f0", "f1", ...). Translate those keys
# back to the entries of `features` (same column order as the training matrix)
# before plotting; keys that are not positional are passed through unchanged.
importance_by_key = model.get_booster().get_score(importance_type='weight')
key_to_name = {f"f{position}": column for position, column in enumerate(features)}
importance_by_name = {
    key_to_name.get(key, key): score
    for key, score in importance_by_key.items()
}

# `plot_importance` accepts a plain {feature: score} mapping; 'weight' is the
# same importance type it uses by default when handed the model directly.
plot_importance(importance_by_name)
plt.show()

## 8. Save Model

In [None]:
# Bundle the fitted classifier together with the scaler it was trained with,
# so both are written to (and later restored from) a single file.
data_to_save = dict(model=model, scaler=scaler)

with open("cardio_model.pkl", "wb") as out_file:
    pickle.dump(data_to_save, out_file)

print("Model saved to cardio_model.pkl")