# **Import Required Libraries**
This cell imports all necessary Python libraries for:
- Data preprocessing (Pandas, NumPy)
- Machine learning models (SVM, KNN, Naïve Bayes)
- Feature selection (Decision Tree)
- Hyperparameter tuning (GridSearchCV)
- Performance evaluation (Accuracy, Precision, Recall, etc.)


In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# **Load the Dataset**
- Fetches the Heart Disease dataset from the UCI Machine Learning Repository.
- Extracts feature variables (`X`) and the target variable (`y`).
- Converts the target variable into **binary classification**:
  - `0` → No disease
  - `1` → Disease (all non-zero values converted to `1`)


In [2]:
# Download the Heart Disease dataset (UCI repository id 45)
heart_disease = fetch_ucirepo(id=45)

# Feature table exactly as delivered by the repository
X = heart_disease.data.features

# Collapse the target table into a 1-D Series named 'target', then binarise:
# 0 stays 0 (no disease) and every value above 0 becomes 1 (disease)
y = pd.Series(heart_disease.data.targets.values.ravel(), name='target')
y = y.gt(0).astype(int)


# **Handle Missing Values & Encode Categorical Features**
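- Fills missing values with a **nearest-neighbor (hot-deck-style) imputation** via scikit-learn's `KNNImputer` (`n_neighbors=5`): each missing entry is replaced by the mean of that feature over the 5 most similar rows.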
- Defines **numerical** and **categorical** features.
- Converts categorical columns to **integer format** after imputation.
- Applies **Label Encoding** to categorical features for model compatibility.


In [3]:
# Fill missing entries with a 5-nearest-neighbour imputer fitted on the whole
# feature table, then rebuild a DataFrame with the original column names.
# NOTE(review): the imputer is fitted before the train/test split that happens
# later in the notebook, so test rows influence the imputed values — confirm
# this is acceptable for the reported metrics.
imputer = KNNImputer(n_neighbors=5)
X_imputed = pd.DataFrame(data=imputer.fit_transform(X), columns=X.columns)

# Column groups used by the preprocessing steps
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Imputed values may be fractional, so round the categorical columns to the
# nearest whole number and store them as integers again
X_imputed[categorical_features] = X_imputed[categorical_features].round().astype(int)

# Label-encode every categorical column and keep the fitted encoder per column
# so the integer codes can be decoded later if needed
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder().fit(X_imputed[col])
    X_imputed[col] = le.transform(X_imputed[col])
    label_encoders[col] = le


# **Standardize Numerical Features**
- Applies **StandardScaler** to numerical features to normalize their values.
- Helps scale-sensitive models (such as SVM and KNN) perform better by ensuring the numerical features have a **mean of 0** and a **standard deviation of 1**; the label-encoded categorical features are left unscaled.


In [4]:
# Fit a StandardScaler on the numeric columns and overwrite those columns with
# their standardized values; the categorical columns are not touched.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows in the notebook, so test-set statistics leak into the scaling —
# confirm whether it should be fitted on the training split only.
scaler = StandardScaler()
scaler.fit(X_imputed[numeric_features])
X_imputed[numeric_features] = scaler.transform(X_imputed[numeric_features])


# **Split Data into Training & Testing Sets**
- Splits the dataset into **70% training** and **30% testing**.
- Uses **stratified sampling** to maintain class balance in both sets.
- Sets `random_state=42` for reproducibility.


In [5]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


# **Feature Selection Using Decision Tree**
- Trains a **Decision Tree classifier** on the training data.
- Selects the **top 7 most important features** based on feature importance scores.
- Creates a **reduced dataset** using only the selected features.


In [6]:
# Fit a decision tree on the training split and read its feature importance scores
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
importances = dt.feature_importances_

# argsort is ascending, so the last 7 positions are the 7 highest-scoring
# features (ordered from least to most important among them)
top_positions = np.argsort(importances)[-7:]
selected_features_dt = X_train.columns[top_positions]

# Restrict both splits to the selected columns
X_train_dt, X_test_dt = (split[selected_features_dt] for split in (X_train, X_test))

# Report which features were kept
print("Decision Tree Selected Features:", selected_features_dt.tolist())


Decision Tree Selected Features: ['age', 'oldpeak', 'ca', 'chol', 'thalach', 'cp', 'thal']


# **Manually Selected Features**
- Uses a **manually chosen subset of 7 features** based on prior knowledge.
- Creates a **reduced dataset** using these features.


In [7]:
# Hand-picked subset of 7 feature columns
manual_selected_features = ['cp', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

# Restrict the training and test splits to those columns
X_train_manual = X_train.loc[:, manual_selected_features]
X_test_manual = X_test.loc[:, manual_selected_features]

# Report which features were kept
print("Manually Selected Features:", manual_selected_features)


Manually Selected Features: ['cp', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


# **Train & Evaluate Models (Baseline Parameters)**
- Trains SVM, KNN, and Naïve Bayes models using parameters from the **baseline study**.
- Evaluates models using **Accuracy, Precision, Recall, F1-score, and Specificity**.
- Runs experiments on:
  - **All features**
  - **Decision Tree-selected features**
  - **Manually selected features**


In [8]:
def evaluate_models(X_train, X_test, feature_type, model_dict):
    """Fit each model on the training features and print its test-set metrics.

    The labels are NOT passed in: they are read from the notebook-level
    ``y_train`` and ``y_test`` variables, which must already exist and line up
    row-for-row with ``X_train`` and ``X_test``.

    Args:
        X_train: Feature table used to fit every model.
        X_test: Feature table used for prediction and scoring.
        feature_type: Name of the feature set, inserted into the header line.
        model_dict: Mapping of display name to an estimator exposing ``fit``
            and ``predict``. Each estimator is fitted in place, so the objects
            in the mapping are modified by this call.

    Returns:
        None. One line of metrics (accuracy, precision, recall, F1-score,
        specificity) is printed per model.
    """
    print(f"\nResults using {feature_type} features:")
    for name, model in model_dict.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Pin the label order so the matrix is always 2x2, even when only one
        # class shows up in y_test/y_pred; otherwise the 4-way unpack fails.
        tn, fp, _fn, _tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

        # Specificity = TN / (TN + FP); undefined when there are no true negatives
        # or false positives at all, so report NaN instead of dividing by zero.
        negatives = tn + fp
        specificity = tn / negatives if negatives else float("nan")

        print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, Specificity: {specificity:.4f}")


In [9]:
# Estimators configured with the baseline study's hyperparameters
models_baseline = {
    "SVM": SVC(kernel='rbf', C=0.25, gamma=0.1268408),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naïve Bayes": GaussianNB(var_smoothing=1e-9),
}

# Evaluate the same estimators on each feature set in turn:
# all columns, the decision-tree selection, and the manual selection
for feature_type, train_features, test_features in (
    ("All Features", X_train, X_test),
    ("Decision Tree", X_train_dt, X_test_dt),
    ("Manual Selection", X_train_manual, X_test_manual),
):
    evaluate_models(train_features, test_features, feature_type, models_baseline)



Results using All Features features:
SVM - Accuracy: 0.7692, Precision: 0.7234, Recall: 0.8095, F1-score: 0.7640, Specificity: 0.7347
KNN - Accuracy: 0.7363, Precision: 0.6875, Recall: 0.7857, F1-score: 0.7333, Specificity: 0.6939
Naïve Bayes - Accuracy: 0.8681, Precision: 0.8125, Recall: 0.9286, F1-score: 0.8667, Specificity: 0.8163

Results using Decision Tree features:
SVM - Accuracy: 0.7582, Precision: 0.7174, Recall: 0.7857, F1-score: 0.7500, Specificity: 0.7347
KNN - Accuracy: 0.7253, Precision: 0.6809, Recall: 0.7619, F1-score: 0.7191, Specificity: 0.6939
Naïve Bayes - Accuracy: 0.8352, Precision: 0.7872, Recall: 0.8810, F1-score: 0.8315, Specificity: 0.7959

Results using Manual Selection features:
SVM - Accuracy: 0.7912, Precision: 0.7556, Recall: 0.8095, F1-score: 0.7816, Specificity: 0.7755
KNN - Accuracy: 0.7582, Precision: 0.7273, Recall: 0.7619, F1-score: 0.7442, Specificity: 0.7551
Naïve Bayes - Accuracy: 0.8791, Precision: 0.8298, Recall: 0.9286, F1-score: 0.8764, Specificity: 0.8367

# **Hyperparameter Tuning with GridSearchCV**
- Optimizes **SVM, KNN, and Naïve Bayes** models using **GridSearchCV**.
- Runs experiments on:
  - **All features**
  - **Decision Tree-selected features**
  - **Manually selected features**


In [16]:
# Hyperparameter search space, keyed by model display name.
# It has to exist before train_with_gridsearch is called, because that
# function reads this module-level dictionary when it runs.
param_grid = {
    "SVM": dict(
        C=[0.25, 0.5, 1.0],
        gamma=[0.1268408, 'scale', 'auto'],
        kernel=['rbf'],
    ),
    "KNN": dict(n_neighbors=[5, 7, 9]),
    "Naïve Bayes": dict(var_smoothing=[1e-9, 1e-8, 1e-7]),
}

def train_with_gridsearch(X_train, X_test, feature_type):
    """Tune each model with GridSearchCV and print the tuned test-set metrics.

    For every entry of the notebook-level ``param_grid`` a fresh estimator is
    created, tuned with 5-fold cross-validation (scored by accuracy) on the
    training data, and the best estimator found is scored on the test data.
    The labels are read from the notebook-level ``y_train`` and ``y_test``
    variables, which must align row-for-row with ``X_train`` and ``X_test``.

    Args:
        X_train: Feature table used for the cross-validated search.
        X_test: Feature table used for the final prediction and scoring.
        feature_type: Name of the feature set, inserted into the header line.

    Returns:
        None. The best parameters and one line of metrics are printed per model.

    Raises:
        ValueError: If ``param_grid`` contains a model name that has no
            estimator registered here.
    """
    # Estimator class for every model name that may appear in param_grid.
    # A lookup table (rather than an if/elif chain) guarantees an unknown name
    # fails loudly instead of reusing the previous iteration's estimator.
    estimator_classes = {
        "SVM": SVC,
        "KNN": KNeighborsClassifier,
        "Naïve Bayes": GaussianNB,
    }

    print(f"\nResults using {feature_type} features with GridSearchCV:")
    for name, param in param_grid.items():
        if name not in estimator_classes:
            raise ValueError(f"No estimator registered for model name {name!r}")
        model = estimator_classes[name]()

        grid_search = GridSearchCV(model, param, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Pin the label order so the matrix is always 2x2, even when only one
        # class shows up in y_test/y_pred; otherwise the 4-way unpack fails.
        tn, fp, _fn, _tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

        # Specificity = TN / (TN + FP); report NaN rather than divide by zero
        negatives = tn + fp
        specificity = tn / negatives if negatives else float("nan")

        print(f"{name} - Best Params: {grid_search.best_params_}")
        print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, Specificity: {specificity:.4f}\n")




In [17]:
# Run the grid search on each feature set in turn:
# all columns, the decision-tree selection, and the manual selection
for feature_type, train_features, test_features in (
    ("All Features", X_train, X_test),
    ("Decision Tree", X_train_dt, X_test_dt),
    ("Manual Selection", X_train_manual, X_test_manual),
):
    train_with_gridsearch(train_features, test_features, feature_type)



Results using All Features features with GridSearchCV:
SVM - Best Params: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
SVM - Accuracy: 0.7912, Precision: 0.7556, Recall: 0.8095, F1-score: 0.7816, Specificity: 0.7755

KNN - Best Params: {'n_neighbors': 7}
KNN - Accuracy: 0.7363, Precision: 0.6957, Recall: 0.7619, F1-score: 0.7273, Specificity: 0.7143

Naïve Bayes - Best Params: {'var_smoothing': 1e-09}
Naïve Bayes - Accuracy: 0.8681, Precision: 0.8125, Recall: 0.9286, F1-score: 0.8667, Specificity: 0.8163


Results using Decision Tree features with GridSearchCV:
SVM - Best Params: {'C': 1.0, 'gamma': 0.1268408, 'kernel': 'rbf'}
SVM - Accuracy: 0.7692, Precision: 0.7333, Recall: 0.7857, F1-score: 0.7586, Specificity: 0.7551

KNN - Best Params: {'n_neighbors': 9}
KNN - Accuracy: 0.7473, Precision: 0.7111, Recall: 0.7619, F1-score: 0.7356, Specificity: 0.7347

Naïve Bayes - Best Params: {'var_smoothing': 1e-09}
Naïve Bayes - Accuracy: 0.8352, Precision: 0.7872, Recall: 0.8810, F1-score: 