# Breast Cancer Classification using 1D CNN + Transfer Learning (Autoencoder)

This notebook:
1. Loads the WDBC dataset (Kaggle CSV if available; otherwise falls back to `sklearn`).
2. Preprocesses features and labels.
3. Trains classic ML baselines.
4. Trains an **autoencoder** on the features, then **transfers** the encoder to a 1D CNN classifier.
5. Evaluates with confusion matrix, ROC/PR, and classification report.

> _Expected columns in the Kaggle CSV: `id`, `diagnosis`, 30 numeric features, and an all-NaN `Unnamed: 32` column._


In [None]:
# Versions & reproducibility
# Print the version of every core library so the outputs of a rerun can be
# matched to the environment that produced them.
import random
import sys

import matplotlib
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf

print("Python", sys.version.split()[0])
print("NumPy", np.__version__)
print("Pandas", pd.__version__)
print("Scikit-learn", sklearn.__version__)
print("TensorFlow", tf.__version__)
# matplotlib was imported for this report but its version was never shown.
print("Matplotlib", matplotlib.__version__)

# One shared seed for the Python, NumPy and TensorFlow random generators.
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)


## 1) Load data

In [None]:
from pathlib import Path
import pandas as pd
from src.utils import load_wdbc_csv_or_sklearn, preprocess

# Location of the Kaggle CSV, relative to the notebook's working directory.
DATA_PATH = Path('..') / 'data' / 'data.csv'

# The loader receives the path as a plain string. Per the notebook intro it
# uses the CSV when available and otherwise falls back to sklearn's copy.
df = load_wdbc_csv_or_sklearn(str(DATA_PATH))

# Preview the first rows of the loaded frame.
df.head()

## 2) Preprocess

In [None]:
# Split the loaded frame into the feature matrix X and the label vector y.
X, y = preprocess(df)

# Show the feature-matrix dimensions next to the per-class label counts.
(X.shape, y.value_counts())

## 3) Train/Val/Test split + scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Stratified 70 / 15 / 15 split: hold out 30 % first, then halve that hold-out
# into validation and test. Both splits use the notebook-wide `seed` (set in
# the first code cell) instead of a second hard-coded copy of the value.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=seed)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=seed)

# Fit the scaler on the training split only, then apply the same statistics
# to validation and test so no information leaks from the held-out data.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

# Append a trailing axis of length 1 to each scaled matrix.
X_train_c = X_train_s[..., None]
X_val_c = X_val_s[..., None]
X_test_c = X_test_s[..., None]

# "Balanced" class weights computed from the training labels. Keys and values
# are cast to built-in int/float so the dict is a plain {class index: weight}
# mapping (and displays without NumPy scalar wrappers).
# NOTE(review): int(label) assumes `preprocess` yields numeric 0/1 labels — confirm.
classes = np.unique(y_train)
class_weights = {
    int(label): float(weight)
    for label, weight in zip(
        classes,
        compute_class_weight(class_weight="balanced", classes=classes, y=y_train),
    )
}
class_weights

## 4) Baselines (LogReg, RandomForest, GradientBoosting)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Classic ML baselines, keyed by the short label used in the printed report.
models = dict(
    LogReg=LogisticRegression(max_iter=500, n_jobs=None),
    RF=RandomForestClassifier(n_estimators=400, random_state=42),
    GB=GradientBoostingClassifier(random_state=42),
)

# Fit each baseline on the scaled training split and report its ROC-AUC on
# the scaled validation split.
for label, estimator in models.items():
    estimator.fit(X_train_s, y_train)
    # Column 1 of predict_proba holds the score for the second class.
    val_scores = estimator.predict_proba(X_val_s)[:, 1]
    print(f"{label}: Val ROC-AUC = {roc_auc_score(y_val, val_scores):.4f}")

## 5) Autoencoder pretraining

In [None]:
from src.models import build_autoencoder
from tensorflow import keras
import matplotlib.pyplot as plt

# Build the autoencoder and its encoder, sized by the second axis of the
# training array, with a 16-dimensional latent space.
ae, enc = build_autoencoder(X_train_c.shape[1], latent_dim=16)

# Stop once validation loss has not improved for 10 epochs and roll the model
# back to the best weights seen.
cb = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=10,
        restore_best_weights=True,
    )
]

# Reconstruction objective: each split serves as both input and target.
hist = ae.fit(
    X_train_c, X_train_c,
    validation_data=(X_val_c, X_val_c),
    epochs=60,
    batch_size=32,
    verbose=2,
    callbacks=cb,
)

# Plot loss
fig, ax = plt.subplots()
ax.plot(hist.history['loss'], label='train_loss')
ax.plot(hist.history['val_loss'], label='val_loss')
ax.legend()
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Autoencoder Training Loss')
plt.show()

## 6) Transfer encoder to CNN classifier (frozen → fine-tuned)

In [None]:
from src.models import build_cnn_classifier

# Stage 1: build the classifier around the pretrained encoder, passing
# freeze_encoder=True, and train it with class weighting.
clf = build_cnn_classifier(X_train_c.shape[1], encoder=enc, freeze_encoder=True)

# Early stopping follows validation AUC (higher is better) and restores the
# best weights.
# NOTE(review): "val_auc" presumes build_cnn_classifier compiles an AUC metric
# named 'auc' — confirm in src.models.
cb = [
    keras.callbacks.EarlyStopping(
        monitor="val_auc",
        mode="max",
        patience=10,
        restore_best_weights=True,
    )
]
hist = clf.fit(
    X_train_c, y_train,
    validation_data=(X_val_c, y_val),
    epochs=60,
    batch_size=32,
    verbose=2,
    class_weight=class_weights,
    callbacks=cb,
)

# Stage 2: mark every layer trainable, then recompile so the change takes
# effect, using a small learning rate for fine-tuning.
for lyr in clf.layers:
    lyr.trainable = True
clf.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC(name='auc'), 'accuracy'],
)
hist_ft = clf.fit(
    X_train_c, y_train,
    validation_data=(X_val_c, y_val),
    epochs=30,
    batch_size=32,
    verbose=2,
    class_weight=class_weights,
    callbacks=cb,
)

## 7) Evaluation on Test Set

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, precision_recall_curve, average_precision_score
import numpy as np
import matplotlib.pyplot as plt

# Flatten the model output to one score per test row and threshold at 0.5
# to obtain hard 0/1 predictions.
y_proba = clf.predict(X_test_c).ravel()
y_pred = (y_proba >= 0.5).astype(int)

print(classification_report(y_test, y_pred, digits=4))

# Only the last expression of a cell is rendered, so a bare `cm` here would be
# silently discarded; print the matrix explicitly instead.
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix (rows = true, columns = predicted):")
print(cm)

# ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure()
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], '--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# Precision-recall curve, with average precision in the title.
prec, rec, _ = precision_recall_curve(y_test, y_proba)
ap = average_precision_score(y_test, y_proba)
plt.figure()
plt.plot(rec, prec)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve (AP={ap:.3f})')
plt.show()

## 8) Save artifacts & simple inference

In [None]:
from pathlib import Path
import joblib

# Directory for the saved artifacts; created on demand so a rerun is safe.
outdir = Path('../outputs')
outdir.mkdir(parents=True, exist_ok=True)

# Save model & scaler
clf.save(outdir / 'cnn_finetuned.keras')
joblib.dump(scaler, outdir / 'scaler.joblib')

# Inference example
import numpy as np
# Keep the row as a one-row DataFrame: the scaler was fitted on a DataFrame,
# so passing a bare array (via .to_numpy()) drops the column names and makes
# scikit-learn warn that X does not have valid feature names.
sample = X_test.iloc[[0]]
# Scale, then add the trailing axis, as was done for the training data.
sample_s = scaler.transform(sample)[..., None]
proba = clf.predict(sample_s).ravel()[0]
print("Sample probability of malignancy:", float(proba))