In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
import pickle
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

warnings.filterwarnings("ignore")

In [2]:
# Loading dataset
df = pd.read_csv("breast_cancer_wisconsin_diagnostic.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis
0,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,M
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,M
2,2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,M
3,3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,M
4,4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,M


In [4]:
# Separating features and labels
X = df.drop(columns=['Diagnosis'])  # Drop target variable
y = df['Diagnosis']

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [7]:
#Feature Selection using Recursive Feature Elimination (RFE)
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=10, step=1)
rfe_selector.fit(X, y)

selected_features = X.columns[rfe_selector.support_]
print("Selected Features:", selected_features)

Selected Features: Index(['radius1', 'concavity1', 'concave_points1', 'texture2', 'perimeter2',
       'radius3', 'compactness3', 'concavity3', 'concave_points3',
       'symmetry3'],
      dtype='object')


In [8]:
# Using only selected features
X_selected = X[selected_features]

In [9]:
# Splitting data into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.25, random_state=42)

In [10]:
# Initialize models with scaling and training in pipelines
models = {
    "Logistic Regression (Elastic Net)": Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(
            penalty='elasticnet',
            solver='saga',
            l1_ratio=0.5,
            C=1.0,
            max_iter=1000
        ))
    ]),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVC": Pipeline([
        ("scaler", StandardScaler()),
        ("model", SVC(random_state=42))
    ])
}

In [11]:
# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    if isinstance(model, Pipeline) and "model" in model.named_steps:
        model_cv = model
    else:
        # For non-pipeline models, create a temporary pipeline for cross-validation
        model_cv = Pipeline([("scaler", StandardScaler()), ("model", model)])

    cv_scores = cross_val_score(model_cv, X_selected, y, cv=5, scoring='accuracy')

    print(f"\n{name}:")
    print(f"Test Accuracy: {acc:.2f}")
    print(f"Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")


Logistic Regression (Elastic Net):
Test Accuracy: 0.96
Cross-Validation Accuracy: 0.96 ± 0.02

Random Forest:
Test Accuracy: 0.96
Cross-Validation Accuracy: 0.96 ± 0.01

Decision Tree:
Test Accuracy: 0.95
Cross-Validation Accuracy: 0.93 ± 0.02

SVC:
Test Accuracy: 0.98
Cross-Validation Accuracy: 0.95 ± 0.02


In [12]:
# Save the best-performing model (Logistic Regression in this case)
best_model = models["Logistic Regression (Elastic Net)"]
pickle.dump(best_model, open("logistic_regression_Breast_Cancer_model.pkl", "wb"))

print("\nModel saved as 'logistic_regression_Breast_Cancer_model.pkl'")


Model saved as 'logistic_regression_Breast_Cancer_model.pkl'
