<a href="https://colab.research.google.com/github/prem1424/major-project-cardio/blob/main/major_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -----------------------------
# 0. Import libraries
# -----------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------------
# 1. Load dataset
# -----------------------------
# Relative path of the uploaded CSV, resolved against the working directory.
DATAFILE = "cardio data.csv"

# The file uses ';' (not ',') as its field separator.
data = pd.read_csv(DATAFILE, delimiter=";")

# Sanity check: report the dimensions, then preview the first rows.
print("✅ Dataset loaded. Shape:", data.shape)
print("\nFirst 5 rows:")
display(data.head(n=5))

# -----------------------------
# 2. Feature engineering
# -----------------------------
# Convert age from days → whole years.
# Floor division matches truncating age / 365 for non-negative ages, but
# unlike .astype(int) it does not raise when "age" contains missing values:
# a NaN age simply stays NaN and is handled later by the numeric imputer.
data["age_years"] = data["age"] // 365
# NOTE(review): "age" is kept alongside "age_years", so the models receive
# two almost perfectly correlated columns — consider keeping only one.

# Drop id column (not useful); errors="ignore" makes this a no-op when the
# column is absent.
data = data.drop(columns=["id"], errors="ignore")

# Define features (X) and target (y)
target_col = "cardio"

# Rows without a label cannot be used for supervised training.
data = data.dropna(subset=[target_col])

X = data.drop(columns=[target_col])
y = data[target_col]

print("\nFeatures shape:", X.shape)
print("Target shape:", y.shape)
print("Target distribution:\n", y.value_counts())

# -----------------------------
# 3. Train-test split
# -----------------------------
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# -----------------------------
# 4. Preprocessing pipeline
# -----------------------------
# Partition the feature columns by dtype: numeric vs. everything else.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Non-numeric columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps categories that were not seen during
# fitting from raising at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# -----------------------------
# 5. Train models
# -----------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    # NOTE(review): probability=True is only required for predict_proba,
    # which the visible code never calls, and the scikit-learn docs warn it
    # slows down fit. Kept so later cells can still call predict_proba —
    # drop it if that is not needed.
    "SVM": SVC(kernel="rbf", probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
}

results = {}           # model name -> accuracy on the test split
fitted_pipelines = {}  # model name -> fitted preprocessing + classifier pipeline
test_predictions = {}  # model name -> predictions for X_test

# Every pipeline shares the single `preprocessor` object. Each fit() refits
# it on the same X_train, so it always ends up in the same fitted state and
# the pipelines stored below stay valid after later iterations.
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("classifier", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    results[name] = acc
    # Keep the fitted pipeline and its predictions so the best model does
    # not have to be trained a second time in section 7.
    fitted_pipelines[name] = pipe
    test_predictions[name] = y_pred

    print(f"\n🔹 {name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# -----------------------------
# 6. Compare models
# -----------------------------
plt.figure(figsize=(8,5))
sns.barplot(x=list(results.keys()), y=list(results.values()))
plt.ylabel("Accuracy")
plt.title("Model Comparison")
plt.ylim(0,1)  # accuracy is a fraction, so show the full 0–1 range
plt.show()

# -----------------------------
# 7. Confusion Matrix (Best Model)
# -----------------------------
# Highest test accuracy wins; on a tie max() returns the first such entry
# in insertion order.
best_model_name = max(results, key=results.get)
print("\n✅ Best Model:", best_model_name)

# Reuse the pipeline (and predictions) fitted in the training loop instead
# of retraining the winning model from scratch.
best_model = fitted_pipelines[best_model_name]
y_pred_best = test_predictions[best_model_name]

# Rows of cm are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title(f"Confusion Matrix - {best_model_name}")
plt.show()

# -----------------------------
# 8. Predict on new data
# -----------------------------
# Example: enter new patient data.
# The keys must match the feature columns the model was trained on.
sample = {
    "age": 20000,   # in days (~54 years)
    # NOTE(review): the original comment said "1 = male, 2 = female", but in
    # the data preview the gender-2 rows are the taller ones, which suggests
    # 1 = female, 2 = male — confirm against the dataset description.
    "gender": 2,
    "height": 165,
    "weight": 70,
    "ap_hi": 120,
    "ap_lo": 80,
    "cholesterol": 1,
    "gluc": 1,
    "smoke": 0,
    "alco": 0,
    "active": 1,
}
# Derive age_years from the age entry with the same days → whole-years rule
# used for the training data, so the two values cannot drift apart.
sample["age_years"] = sample["age"] // 365

sample_df = pd.DataFrame([sample])
prediction = best_model.predict(sample_df)[0]
print("\n🧑‍⚕️ Prediction for sample patient:", "Disease" if prediction == 1 else "No Disease")

✅ Dataset loaded. Shape: (70000, 13)

First 5 rows:


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0



Features shape: (70000, 12)
Target shape: (70000,)
Target distribution:
 cardio
0    35021
1    34979
Name: count, dtype: int64

🔹 Logistic Regression Accuracy: 0.7138
              precision    recall  f1-score   support

           0       0.70      0.75      0.72      7004
           1       0.73      0.67      0.70      6996

    accuracy                           0.71     14000
   macro avg       0.72      0.71      0.71     14000
weighted avg       0.72      0.71      0.71     14000


🔹 Random Forest Accuracy: 0.7134
              precision    recall  f1-score   support

           0       0.71      0.73      0.72      7004
           1       0.72      0.70      0.71      6996

    accuracy                           0.71     14000
   macro avg       0.71      0.71      0.71     14000
weighted avg       0.71      0.71      0.71     14000

