# In [None]:
# ==========================================================
# 1) IMPORT LIBRARIES
# ==========================================================

# PyTorch core library
import torch

# Neural network module (contains Linear, ReLU, etc.)
import torch.nn as nn

# Optimization algorithms (SGD, Adam, etc.)
import torch.optim as optim

# Classical ML tools for dataset & splitting
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import numpy as np


# ==========================================================
# 2) LOAD DATASET
# ==========================================================

"""
We use a classical ML1 dataset: Breast Cancer Wisconsin dataset.

- 30 numerical features (radius, texture, perimeter, etc.)
- Binary target (0 = malignant, 1 = benign)

This is a supervised learning setting:
We have input X and labels y.
"""

data = load_breast_cancer()

X = data.data      # shape: (569, 30)
y = data.target    # shape: (569,)

print("Dataset shape:", X.shape)


# ==========================================================
# 3) TRAIN / VALIDATION / TEST SPLIT
# ==========================================================

"""
Why do we split?

Training set   → used to learn parameters (W, b)
Validation set → used to tune and detect overfitting
Test set       → completely unseen data for final evaluation

This is VERY important to avoid data leakage.
"""

# First split into training+validation and test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.2,          # 20% test
    random_state=42,
    stratify=y              # keeps class balance
)

# Now split training into train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.2,          # 20% of training becomes validation
    random_state=42,
    stratify=y_train_val
)

print("Train size:", X_train.shape[0])
print("Validation size:", X_val.shape[0])
print("Test size:", X_test.shape[0])


# ==========================================================
# 4) PREPROCESSING (FEATURE SCALING)
# ==========================================================

"""
Neural networks are VERY sensitive to feature scale.

If one feature ranges from 0–1000 and another from 0–1,
gradients will behave badly and training becomes unstable.

Therefore we standardize:

x_scaled = (x - mean) / std

Important:
We FIT scaler ONLY on training data.
Then TRANSFORM validation and test.
This avoids data leakage.
"""

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)


# ==========================================================
# 5) CONVERT TO PYTORCH TENSORS
# ==========================================================

"""
PyTorch works with tensors, not numpy arrays.

Also:
- Features → float32
- Labels → float32 for BCELoss
- Labels reshaped to (N,1) because output is (N,1)
"""

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)


# ==========================================================
# 6) DEFINE NEURAL NETWORK MODEL
# ==========================================================

"""
From ML1 perspective:
Logistic Regression model is:

    y_hat = sigmoid(Wx + b)

That is a SINGLE linear layer.

Now we extend it using Deep Learning:

    x → Linear → ReLU → Linear → ReLU → Linear → Sigmoid

Why ReLU?
Because without nonlinearity, stacking linear layers
is still equivalent to one linear layer.

ReLU allows learning nonlinear decision boundaries.
"""

class BreastCancerNN(nn.Module):
    """Fully connected binary classifier that outputs a probability.

    The network is a stack of ``Linear → ReLU`` hidden layers followed by
    a single-unit ``Linear`` layer and a ``Sigmoid``.  With the default
    arguments the architecture is::

        x(30) → Linear(32) → ReLU → Linear(16) → ReLU → Linear(1) → Sigmoid

    Args:
        input_dim: Number of input features per sample (default 30).
        hidden_dims: Width of each hidden layer, in order (default
            ``(32, 16)``).  An empty sequence yields a single linear layer
            followed by a sigmoid.
    """

    def __init__(self, input_dim=30, hidden_dims=(32, 16)):
        super().__init__()

        layers = []
        in_features = input_dim

        # Hidden layers: each Linear is followed by a ReLU nonlinearity,
        # otherwise the stack would collapse to one linear map.
        for out_features in hidden_dims:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            in_features = out_features

        # Output layer (1 neuron for binary classification)
        layers.append(nn.Linear(in_features, 1))

        # Sigmoid converts output to probability [0,1]
        layers.append(nn.Sigmoid())

        # Kept as a single Sequential under the name "network" so that
        # parameter names (network.0.weight, ...) stay the same.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the predicted probability for each row of ``x``.

        Args:
            x: Float tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with the last dimension replaced by 1, values in [0, 1].
        """
        return self.network(x)


model = BreastCancerNN()


# ==========================================================
# 7) DEFINE LOSS FUNCTION & OPTIMIZER
# ==========================================================

"""
Since this is binary classification,
we use Binary Cross Entropy:

L = -[y log(y_hat) + (1-y) log(1-y_hat)]

This is the SAME loss used in Logistic Regression.

Optimizer:
We use Adam instead of plain Gradient Descent.
Adam adapts learning rate per parameter
and converges faster in practice.
"""

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# ==========================================================
# 8) TRAINING LOOP
# ==========================================================

"""
Training process:

1) Forward pass → compute predictions
2) Compute loss
3) Backpropagation → compute gradients
4) Update parameters

We also compute validation loss
to check if the model is overfitting.
"""

epochs = 150

for epoch in range(epochs):
    
    # -----------------
    # TRAIN MODE
    # -----------------
    model.train()
    
    predictions = model(X_train)
    train_loss = criterion(predictions, y_train)
    
    # Clear previous gradients
    optimizer.zero_grad()
    
    # Backpropagation
    train_loss.backward()
    
    # Update weights
    optimizer.step()
    
    
    # -----------------
    # VALIDATION MODE
    # -----------------
    model.eval()
    
    with torch.no_grad():   # disable gradient computation
        val_predictions = model(X_val)
        val_loss = criterion(val_predictions, y_val)
    
    
    # Print progress every 20 epochs
    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{epochs}] | "
              f"Train Loss: {train_loss.item():.4f} | "
              f"Val Loss: {val_loss.item():.4f}")


# ==========================================================
# 9) FINAL TEST EVALUATION
# ==========================================================

"""
Now we evaluate on completely unseen test data.
This gives the true generalization performance.
"""

model.eval()

with torch.no_grad():
    test_outputs = model(X_test)
    
    # Convert probability → class label using threshold 0.5
    test_predictions = (test_outputs > 0.5).float()
    
    accuracy = accuracy_score(y_test, test_predictions)

print("\n========== FINAL TEST RESULTS ==========")
print("Test Accuracy:", accuracy)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, test_predictions))

print("\nClassification Report:")
print(classification_report(y_test, test_predictions))