In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from scipy.stats import randint

#**dataset**

In [2]:
data = load_breast_cancer()
X = data.data
y = data.target

# **Split data into training and testing sets**

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Models with pipelines and hyperparameter search spaces**

In [9]:
models = [
    {
        'name': 'Logistic Regression',
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('clf', LogisticRegression(solver='liblinear'))
        ]),
        'search_type': 'grid',
        'params': {
            'clf__C': [0.1, 1, 10],
            'clf__penalty': ['l1', 'l2']
        }
    },
    {
        'name': 'Decision Tree',
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('clf', DecisionTreeClassifier())
        ]),
        'search_type': 'grid',
        'params': {
            'clf__max_depth': [3, 5, 7, None],
            'clf__min_samples_split': [2, 5, 10]
        }
    },
    {
        'name': 'Random Forest',
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('clf', RandomForestClassifier())
        ]),
        'search_type': 'random',
        'params': {
            'clf__n_estimators': randint(50, 200),
            'clf__max_depth': [3, 5, 7, None],
            'clf__min_samples_split': randint(2, 11)
        },
        'n_iter': 10
    },
    {
        'name': 'SVM',
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('clf', SVC(probability=True))
        ]),
        'search_type': 'grid',
        'params': {
            'clf__C': [0.1, 1, 10],
            'clf__kernel': ['linear', 'rbf'],
            'clf__gamma': ['scale', 'auto']
        }
    },
    {
        'name': 'KNN',
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('clf', KNeighborsClassifier())
        ]),
        'search_type': 'grid',
        'params': {
            'clf__n_neighbors': list(range(1, 21))
        }
    }
]

In [10]:
# list to store results
results = []

# **Performing hyperparameter tuning and evaluation for each model**

In [11]:
for model in models:
    if model['search_type'] == 'grid':
        search = GridSearchCV(model['pipeline'], model['params'], cv=5, scoring='accuracy')
    else:
        search = RandomizedSearchCV(model['pipeline'], model['params'], n_iter=model['n_iter'], cv=5, scoring='accuracy', random_state=42)

    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, 'predict_proba') else None

    # Calculate metrics
    metrics = {
        'Model': model['name'],
        'Best Parameters': search.best_params_,
        'Test Accuracy': accuracy_score(y_test, y_pred),
        'Test Precision': precision_score(y_test, y_pred, average='binary'),
        'Test Recall': recall_score(y_test, y_pred, average='binary'),
        'Test F1-Score': f1_score(y_test, y_pred, average='binary'),
        'Test ROC-AUC': roc_auc_score(y_test, y_proba) if y_proba is not None else 'N/A'
    }
    results.append(metrics)

# Convert results to DataFrame and sort by F1-score
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Test F1-Score', ascending=False)

# **Displaying results**

In [12]:
print("\nModel Performance Comparison (sorted by F1-Score):\n")
print(results_df.to_string(index=False))


Model Performance Comparison (sorted by F1-Score):

              Model                                                              Best Parameters  Test Accuracy  Test Precision  Test Recall  Test F1-Score  Test ROC-AUC
Logistic Regression                                        {'clf__C': 0.1, 'clf__penalty': 'l2'}       0.991228        0.986111     1.000000       0.993007      0.998690
                SVM              {'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}       0.982456        0.972603     1.000000       0.986111      0.997380
      Random Forest {'clf__max_depth': 7, 'clf__min_samples_split': 4, 'clf__n_estimators': 124}       0.964912        0.958904     0.985915       0.972222      0.996397
      Decision Tree                          {'clf__max_depth': 3, 'clf__min_samples_split': 10}       0.947368        0.945205     0.971831       0.958333      0.957419
                KNN                                                      {'clf__n_neighbors': 5} 

# **Identify the best model**

In [13]:
best_model = results_df.loc[results_df['Test F1-Score'].idxmax()]
print(f"\nBest Model: {best_model['Model']} with F1-Score of {best_model['Test F1-Score']:.4f}")


Best Model: Logistic Regression with F1-Score of 0.9930
