In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt

# Sklearn core tools
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# XGBoost
from xgboost import XGBClassifier

### Load and Explore Data

In [3]:
# Load dataset
df = pd.read_csv(r"C:\Users\rasik\Downloads\BreastCancerpredictionProject\breast-cancer-data.csv")

# Show first few rows
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


### Distribution of Diagnosis
This helps us to understand class balance.

In [4]:
df.diagnosis.value_counts(normalize=True) * 100

diagnosis
B    62.741652
M    37.258348
Name: proportion, dtype: float64

#### Interpretation:
- The data is roughly balanced between malignant and benign.
- No sampling needed.

### Encode Labels
Convert 'M' (Malignant) to 1 and 'B' (Benign) to 0:

In [5]:
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

### Train-Test Split
Separate predictors and target, then split data:

In [7]:
X = df.drop(columns=['id', 'diagnosis','Unnamed: 32'])
y = df['diagnosis'].values
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

(569, 30)


### Feature Scaling
Scale features to have mean 0 and variance 1:

In [8]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Feature Engineering
We apply:
- RFE (Recursive Feature Elimination)
- PCA (Principal Component Analysis)
- Kernel PCA
- LDA (Linear Discriminant Analysis)

In [9]:
X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)

In [10]:
feature_names = X.columns

def train_and_report(X_train, X_test, y_train, y_test, description):
    """
    Train logistic regression and print evaluation metrics.
    """
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\n=== {description} ===")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=["Malignant", "Benign"]))
# 1. RFE
rfe_selector = RFE(LogisticRegression(max_iter=1000), n_features_to_select=15)
rfe_selector.fit(X_train, y_train)

X_train_rfe = rfe_selector.transform(X_train)
X_test_rfe = rfe_selector.transform(X_test)

selected_features = np.array(feature_names)[rfe_selector.get_support()]
print("\nRFE Selected Features:")
print(selected_features)

train_and_report(X_train_rfe, X_test_rfe, y_train, y_test, "RFE (15 features)")

# 2. PCA
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

train_and_report(X_train_pca, X_test_pca, y_train, y_test, "PCA (10 components)")

# 3. Kernel PCA
kpca = KernelPCA(n_components=10, kernel="rbf")
X_train_kpca = kpca.fit_transform(X_train)
X_test_kpca = kpca.transform(X_test)

train_and_report(X_train_kpca, X_test_kpca, y_train, y_test, "Kernel PCA (10 components)")

# 4. LDA
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

train_and_report(X_train_lda, X_test_lda, y_train, y_test, "LDA (1 component)")


RFE Selected Features:
['concavity_mean' 'concave points_mean' 'radius_se' 'perimeter_se'
 'area_se' 'compactness_se' 'fractal_dimension_se' 'radius_worst'
 'texture_worst' 'perimeter_worst' 'area_worst' 'concavity_worst'
 'concave points_worst' 'symmetry_worst' 'fractal_dimension_worst']

=== RFE (15 features) ===
Confusion Matrix:
[[65  2]
 [ 3 44]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       0.96      0.97      0.96        67
      Benign       0.96      0.94      0.95        47

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


=== PCA (10 components) ===
Confusion Matrix:
[[64  3]
 [ 3 44]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       0.96      0.96      0.96        67
      Benign       0.94      0.94      0.94        47

    accuracy                           0.9

✅ Malignant Recall priority:

1. Kernel PCA (1.00)

2. LDA (0.99)

3. RFE (0.97)

4. PCA (0.96)

✅ Overall balance (Recall + Precision + Accuracy):

LDA has the best trade-off:

1. Very high recall (0.99)

2. Very high precision (0.97)

3. Highest overall accuracy (0.97)

4. Lowest false positives (1)

### Model Comparison on LDA Features
We train multiple classifiers to find the best.

In [11]:
def evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=500),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "SVM": SVC(probability=True),
        "KNN": KNeighborsClassifier(),
        "Naive Bayes": GaussianNB()
    }
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\n=== {name} ===")
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("Classification Report:")
        print(classification_report(y_test, y_pred, target_names=["Malignant", "Benign"]))
evaluate_models(X_train_lda, X_test_lda, y_train, y_test)


=== Logistic Regression ===
Confusion Matrix:
[[66  1]
 [ 2 45]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       0.97      0.99      0.98        67
      Benign       0.98      0.96      0.97        47

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


=== Random Forest ===
Confusion Matrix:
[[65  2]
 [ 3 44]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       0.96      0.97      0.96        67
      Benign       0.96      0.94      0.95        47

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


=== Decision Tree ===
Confusion Matrix:
[[65  2]
 [ 3 44]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       0.96      0.97   

✅ Logistic Regression and SVM are clear standouts:

- Recall 0.99 on malignant (missed only 1 case)

- Very high precision and F1

- Highest accuracy (0.97)

✅ Naive Bayes also very good but slightly more false positives (lower Benign recall).

✅ KNN, Random Forest, Decision Tree had:

- Recall 0.97 (missed 2 cancer cases instead of 1)

- Slightly lower metrics overall

### Boosting Models on RFE Features
We also test ensemble methods:

In [12]:
def evaluate_boosting_models(X_train, X_test, y_train, y_test):
    models = {
        "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\n=== {name} ===")
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("Classification Report:")
        print(classification_report(y_test, y_pred, target_names=["Malignant", "Benign"]))
evaluate_boosting_models(X_train_rfe, X_test_rfe, y_train, y_test)


=== AdaBoost ===
Confusion Matrix:
[[64  3]
 [ 2 45]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       0.97      0.96      0.96        67
      Benign       0.94      0.96      0.95        47

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114


=== Gradient Boosting ===
Confusion Matrix:
[[64  3]
 [ 3 44]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       0.96      0.96      0.96        67
      Benign       0.94      0.94      0.94        47

    accuracy                           0.95       114
   macro avg       0.95      0.95      0.95       114
weighted avg       0.95      0.95      0.95       114


=== XGBoost ===
Confusion Matrix:
[[63  4]
 [ 0 47]]
Classification Report:
              precision    recall  f1-score   support

   Malignant       1.00      0.94      0.97      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBoost Highlights:

- Benign Recall = 1.00 (no false negatives on benign)

- Malignant Recall = 0.94 (missed 4 cancer cases)
⚠️ This is lower than Logistic Regression (0.97) and SVM (0.97).

- High overall accuracy, but recall on malignant is the most important metric here.

✅ AdaBoost:

- Almost identical to Random Forest and Logistic Regression.

- Malignant Recall 0.96

- Accuracy 0.96

✅ Gradient Boosting:

- Malignant Recall 0.96

- Slightly lower accuracy (0.95)


### Hyperparameter Tuning for Logistic Regression
We fine-tune Logistic Regression using Grid Search:


In [13]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_lda, y_train)

print("\nBest Parameters:")
print(grid_search.best_params_)

best_log_reg = grid_search.best_estimator_
y_pred = best_log_reg.predict(X_test_lda)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Malignant", "Benign"]))


Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best Parameters:
{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

Confusion Matrix:
[[66  1]
 [ 2 45]]

Classification Report:
              precision    recall  f1-score   support

   Malignant       0.97      0.99      0.98        67
      Benign       0.98      0.96      0.97        47

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



### Save Models and Scaler

In [14]:
import pickle

# Save scaler, LDA, and final model
with open("scaler.pkl", "wb") as f:
    pickle.dump(sc, f)

with open("lda.pkl", "wb") as f:
    pickle.dump(lda, f)

with open("logistic_regression_best.pkl", "wb") as f:
    pickle.dump(best_log_reg, f)
