In [4]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import os
import traceback

# Import additional models
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

# Import Deep Learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

warnings.filterwarnings("ignore")

# Load Data
def load_data(filepath):
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Error: File '{filepath}' not found. Please upload it.")

    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip()  # Remove extra spaces
    print("\n✅ Columns in dataset:", df.columns)

    # Rename 'target' to 'AHD' if needed
    if 'target' in df.columns:
        df.rename(columns={'target': 'AHD'}, inplace=True)

    print("\n🔹 Updated Columns after Renaming:", df.columns)

    if 'AHD' not in df.columns:
        raise KeyError("Error: Target variable 'AHD' not found in dataset. Check column names.")

    # Drop rows where 'AHD' is NaN
    df = df.dropna(subset=['AHD'])

    # Ensure 'AHD' is numeric
    df['AHD'] = pd.to_numeric(df['AHD'], errors='coerce')

    return df

# Preprocessing Function
def preprocess_data(df):
    if 'thal' in df.columns:
        if not df['thal'].mode().empty:
            df['thal'] = df['thal'].fillna(df['thal'].mode()[0])
        else:
            df['thal'] = df['thal'].fillna(2)  # Default category

    if 'cp' in df.columns:
        df['cp'] = df['cp'].map({'typical': 0, 'asymptomatic': 1, 'nonanginal': 2, 'nontypical': 3})

    # Handle missing values
    if 'ca' in df.columns:
        df['ca'] = df['ca'].fillna(df['ca'].mean())

    return df

# Feature Engineering
def feature_engineering(df):
    if all(col in df.columns for col in ['age', 'chol']):
        df['age*chol'] = df['age'] * df['chol']
    if all(col in df.columns for col in ['trestbps', 'thalach']):
        df['trestbps*thalach'] = df['trestbps'] * df['thalach']
    return df

# Train ML Models
def train_models(x_train, x_test, y_train, y_test):
    # Handle missing values
    imputer = SimpleImputer(strategy="most_frequent")
    x_train_imputed = imputer.fit_transform(x_train)
    x_train = pd.DataFrame(x_train_imputed, columns=x_train.columns[:x_train_imputed.shape[1]])

    x_test_imputed = imputer.transform(x_test)
    x_test = pd.DataFrame(x_test_imputed, columns=x_test.columns[:x_test_imputed.shape[1]])

    # Feature Selection
    selector = SelectKBest(score_func=f_classif, k=10)
    x_train = selector.fit_transform(x_train, y_train)
    x_test = selector.transform(x_test)

    # Apply PCA (Reduce to 5 Features)
    pca = PCA(n_components=5)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)

    # Define models
    models = {
        "Logistic Regression": LogisticRegression(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
        "SVM": SVC(kernel="rbf", probability=True),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "Neural Network (MLP)": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
    }

    best_model = None
    best_f1 = 0

    for name, model in models.items():
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        f1 = f1_score(y_test, pred)

        print(f"\n🔹 {name} Performance:")
        print("Accuracy:", accuracy_score(y_test, pred))
        print("F1 Score:", f1)
        print(classification_report(y_test, pred))

        if f1 > best_f1:
            best_f1 = f1
            best_model = model

    return best_model

# Train Deep Learning Model
def train_deep_learning_model(x_train, x_test, y_train, y_test):
    imputer = SimpleImputer(strategy="most_frequent")
    x_train = imputer.fit_transform(x_train)
    x_test = imputer.transform(x_test)

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    model = Sequential([
        Dense(64, activation="relu", input_shape=(x_train.shape[1],)),
        Dropout(0.2),
        Dense(32, activation="relu"),
        Dropout(0.2),
        Dense(1, activation="sigmoid")
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    history = model.fit(x_train, y_train, epochs=50, batch_size=16, validation_data=(x_test, y_test), verbose=1)

    loss, accuracy = model.evaluate(x_test, y_test)
    print(f"\n✅ Deep Learning Model Accuracy: {accuracy:.4f}")

    return model, history

# Save Model
def save_model(model, filename):
    if isinstance(model, tf.keras.Model):
        filename = filename if filename.endswith((".keras", ".h5")) else filename + ".keras"
        model.save(filename)
    else:
        joblib.dump(model, filename)
    print(f"\n✅ Model saved as {filename}")

# Main Execution
if __name__ == "__main__":
    csv_file = "Heart_dataset.csv"

    try:
        df = load_data(csv_file)
        df = preprocess_data(df)
        df = feature_engineering(df)

        df = df.reset_index(drop=True)

        x = df.drop(columns=['AHD'])
        y = df['AHD']

        print("\n✅ Features selected for training:", x.columns)
        print("\n✅ Target variable:", y.name)

        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=42)

        # Train ML models
        best_ml_model = train_models(x_train, x_test, y_train, y_test)
        save_model(best_ml_model, "best_ml_model.pkl")

        # Train Deep Learning Model
        best_dl_model, history = train_deep_learning_model(x_train, x_test, y_train, y_test)
        save_model(best_dl_model, "heart_disease_dl_model.keras")

    except Exception as e:
        print("\n⚠️ Full Error Message:", repr(e))
        traceback.print_exc()



✅ Columns in dataset: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

🔹 Updated Columns after Renaming: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'AHD'],
      dtype='object')

✅ Features selected for training: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'age*chol',
       'trestbps*thalach'],
      dtype='object')

✅ Target variable: AHD

🔹 Logistic Regression Performance:
Accuracy: 0.7276264591439688
F1 Score: 0.7388059701492538
              precision    recall  f1-score   support

           0       0.77      0.67      0.72       132
           1       0.69      0.79      0.74       125

    accuracy                           0.73       257
   macro avg       0.73      0.73      0.73       257
weighted avg  