In [139]:
import numpy as np 
import matplotlib.pyplot as plt

In [140]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [141]:
np.random.seed(42)

In [142]:
# Q4 part (a)
# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Q4 part (b)
# First, split into train + validation (80%) and test (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then, split train + validation into training (60%) and validation (20%)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])

Training set size: 341
Validation set size: 114
Test set size: 114


In [143]:
# Q4 part (c)
print("Training set:")
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))

print("\nValidation set:")
unique, counts = np.unique(y_val, return_counts=True)
print(dict(zip(unique, counts)))

print("\nTest set:")
unique, counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))

Training set:
{0: 127, 1: 214}

Validation set:
{0: 43, 1: 71}

Test set:
{0: 42, 1: 72}


In [144]:
# Q3
class LogisticRegressionSGD:
    def __init__(self, learning_rate=0.01, batch_size=32, max_iter=1000, random_state=None):
        """
        Initialize the logistic regression model.
        
        Parameters:
        - learning_rate: The fixed learning rate for SGD
        - batch_size: The number of samples per mini-batch
        - max_iter: The maximum number of iterations over the training set
        - random_state: Seed for reproducibility
        """
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.random_state = random_state
        self.weights = None

    def sigmoid(self, z):
        # Clip the values of z to avoid overflow in np.exp
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))


    def fit(self, X, y):
        """
        Fit the logistic regression model using mini-batch SGD.
        
        Parameters:
        - X: Input features
        - y: Binary target values
        
        Returns:
        - self: Fitted model
        """
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape
        
        # Add bias term to features
        X_bias = np.hstack([np.ones((n_samples, 1)), X])
        
        # Initialize weights randomly from a standard Gaussian distribution
        self.weights = np.random.randn(n_features + 1)
        
        # Perform mini-batch SGD for max_iter epochs
        for epoch in range(self.max_iter):
            # Shuffle the training data at the beginning of each epoch
            indices = np.random.permutation(n_samples)
            X_bias_shuffled = X_bias[indices]
            y_shuffled = y[indices]
            
            # Process mini-batches
            for start in range(0, n_samples, self.batch_size):
                end = start + self.batch_size
                X_batch = X_bias_shuffled[start:end]
                y_batch = y_shuffled[start:end]
                
                # Compute predictions for the batch
                predictions = self.sigmoid(np.dot(X_batch, self.weights))
                
                # Compute the gradient of the negative log-likelihood
                gradient = np.dot(X_batch.T, (predictions - y_batch)) / X_batch.shape[0]
                
                # Update weights using the learning rate
                self.weights -= self.learning_rate * gradient
        
        return self

    def predict_proba(self, X):
        """
        Predict probability estimates for input data X
        
        Parameters:
        - X: Input features
        
        Returns:
        - Probabilities for the positive class
        """
        n_samples = X.shape[0]
        X_bias = np.hstack([np.ones((n_samples, 1)), X])
        return self.sigmoid(np.dot(X_bias, self.weights))

    def predict(self, X, threshold=0.5):
        """
        Predict binary class labels for input data X.
        
        Parameters:
        - X: Input features
        - threshold: Decision threshold to classify samples
        
        Returns:
        - Predicted binary class labels
        """
        proba = self.predict_proba(X)
        return (proba >= threshold).astype(int)

In [145]:
# Q4 part (d)
learning_rates = [0.01, 0.001, 0.0001]
results = []

# Loop over each learning rate
for lr in learning_rates:
    model = LogisticRegressionSGD(
        learning_rate=lr,
        batch_size=32,
        max_iter=1000,
        random_state=42
    )
    
    # Train the model on the training set
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Q4 part (e)
    # Scoring metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append to results list
    results.append({'learning_rate': lr, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}) 

# Display the results
for res in results:
    print(f"Learning Rate: {res['learning_rate']}, Accuracy: {res['accuracy']:.4f}, Precision: {res['precision']:.4f}, Recall: {res['recall']:.4f}, F1: {res['f1']:.4f}")

Learning Rate: 0.01, Accuracy: 0.9035, Precision: 0.9420, Recall: 0.9028, F1: 0.9220
Learning Rate: 0.001, Accuracy: 0.9386, Precision: 0.9221, Recall: 0.9861, F1: 0.9530
Learning Rate: 0.0001, Accuracy: 0.9211, Precision: 0.9437, Recall: 0.9306, F1: 0.9371


**Summary:** By experimenting with different learning rates, it seems that the best one is $\boxed{\text{lr} = 0.001}$. 0.01 is too large of a learning rate and 0.0001 is too small, so the best learning rate in this case given the max number of iterations is indeed 0.001.