In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

# Fetch dataset
heart_disease = fetch_ucirepo(id=45)

# Extract data
X = heart_disease.data.features
y = heart_disease.data.targets

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Remove highly correlated features
corr_matrix = X_imputed.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.90)]
X_imputed.drop(columns=to_drop, inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X_imputed.columns)

# Outlier Removal (Using IQR Method)
Q1 = X_scaled.quantile(0.25)
Q3 = X_scaled.quantile(0.75)
IQR = Q3 - Q1
X_filtered = X_scaled[~((X_scaled < (Q1 - 1.5 * IQR)) | (X_scaled > (Q3 + 1.5 * IQR))).any(axis=1)]
y_filtered = y.loc[X_filtered.index]  # Adjust target variable accordingly

# Adding Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_filtered)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_poly, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered)

# Handle Class Imbalance using SMOTE with fewer neighbors
from imblearn.over_sampling import SMOTE

# Use fewer neighbors to avoid the mismatch error
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=2)  # Reduced neighbors
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)


# Feature Selection using Recursive Feature Elimination (RFE)
base_model = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(base_model, n_features_to_select=15)
X_train_rfe = rfe.fit_transform(X_train_sm, y_train_sm)
X_test_rfe = rfe.transform(X_test)

# Hyperparameter tuning with RandomizedSearchCV
param_grid = {
    'C': np.logspace(-4, 4, 50),                   # Regularization strength
    'penalty': ['l1', 'l2', 'elasticnet'],         # Regularization type
    'solver': ['liblinear', 'lbfgs', 'saga'],      # Optimizers
    'class_weight': ['balanced', None],            # Class balancing
    'max_iter': [500, 1000, 2000],                 # Max iterations
    'tol': [1e-4, 1e-3, 1e-2],                     # Tolerance for stopping criteria
    'l1_ratio': np.linspace(0, 1, 10)              # ElasticNet mixing ratio (only valid with elasticnet penalty)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# RandomizedSearchCV for faster hyperparameter tuning
random_search = RandomizedSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    n_iter=150,             
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit the model
random_search.fit(X_train_rfe, y_train_sm)

# Best model
best_log_reg = random_search.best_estimator_

# **Ensemble Learning**: Combine Logistic Regression with Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
voting_clf = VotingClassifier(estimators=[('lr', best_log_reg), ('rf', rf)], voting='soft')
voting_clf.fit(X_train_rfe, y_train_sm)

# Model evaluation
y_pred = voting_clf.predict(X_test_rfe)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Display results
print("\n✅ Best Parameters:", random_search.best_params_)
print(f"📊 Accuracy: {accuracy:.4f}")
print(f"🎯 F1 Score: {f1:.4f}")
print("\n🔎 Classification Report:\n", classification_report(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 150 candidates, totalling 750 fits

✅ Best Parameters: {'tol': 0.0001, 'solver': 'saga', 'penalty': 'elasticnet', 'max_iter': 500, 'l1_ratio': 0.2222222222222222, 'class_weight': 'balanced', 'C': 51.79474679231202}
📊 Accuracy: 0.5581
🎯 F1 Score: 0.5864

🔎 Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.68      0.77        25
           1       0.30      0.38      0.33         8
           2       0.50      0.50      0.50         4
           3       0.22      0.50      0.31         4
           4       0.00      0.00      0.00         2

    accuracy                           0.56        43
   macro avg       0.38      0.41      0.38        43
weighted avg       0.64      0.56      0.59        43


📊 Confusion Matrix:
 [[17  3  2  2  1]
 [ 2  3  0  3  0]
 [ 0  1  2  1  0]
 [ 0  2  0  2  0]
 [ 0  1  0  1  0]]
