In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load -------------------------------------------------------------------
# Path to the dataset file; it is read as a header-less, comma-separated table.
data_path = 'wdbc.data'

# Column layout: an ID, the diagnosis label, then ten measurements repeated for
# each of three summary statistics (all the *_mean columns first, then *_se,
# then *_worst).
_measurements = [
    'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
    'Compactness', 'Concavity', 'Concave_points', 'Symmetry', 'Fractal_dimension',
]
_statistics = ['mean', 'se', 'worst']
column_names = ['ID', 'Diagnosis'] + [
    f'{measurement}_{statistic}'
    for statistic in _statistics
    for measurement in _measurements
]

# Read the file and discard the identifier column right away: it carries no
# predictive signal.
df = pd.read_csv(data_path, header=None, names=column_names).drop(columns=['ID'])

# --- Features / target ------------------------------------------------------
y = df['Diagnosis']                  # target label
X = df.drop(columns=['Diagnosis'])   # every remaining column is a feature

# --- Splits -----------------------------------------------------------------
# First hold out 20% of the rows as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ... then carve 30% of the remaining rows off as the validation set
# (roughly 56% train / 24% validation / 20% test overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.3, random_state=42
)

# --- Class balance report ---------------------------------------------------
class_distribution_train = y_train.value_counts()
class_distribution_val = y_val.value_counts()

print("Class distribution in training set:\n", class_distribution_train)
print("\nClass distribution in validation set:\n", class_distribution_val)


Class distribution in training set:
 Diagnosis
B    198
M    120
Name: count, dtype: int64

Class distribution in validation set:
 Diagnosis
B    88
M    49
Name: count, dtype: int64


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class LogisticRegressionSGD:
    """Binary logistic regression trained with mini-batch stochastic gradient descent.

    The model is ``sigmoid(X @ weights)``; there is no separate intercept term,
    so append a constant column to ``X`` if a bias is required.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every mini-batch gradient.
    batch_size : int
        Number of samples per mini-batch (the final batch of an epoch may be
        smaller).
    max_iters : int
        Number of full passes (epochs) over the training data.

    Attributes
    ----------
    weights : numpy.ndarray or None
        Coefficient vector of shape ``(n_features,)``; ``None`` until ``fit``
        has been called.
    """

    # np.exp overflows float64 for arguments just above ~709.78. Clamping the
    # logit to this bound leaves every non-overflowing input untouched while
    # letting extreme logits saturate without a RuntimeWarning.
    _MAX_LOGIT = 709.0

    def __init__(self, learning_rate=0.01, batch_size=32, max_iters=1000):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.max_iters = max_iters
        self.weights = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        ``z`` is clamped to ``[-709, 709]`` first so that ``exp`` cannot
        overflow; results for inputs inside that range are unchanged.
        """
        z = np.clip(z, -self._MAX_LOGIT, self._MAX_LOGIT)
        return 1 / (1 + np.exp(-z))

    def compute_loss(self, X, y):
        """Return the mean binary cross-entropy of the current weights on ``(X, y)``.

        ``y`` is expected to hold 0/1 labels aligned with the rows of ``X``.
        """
        y_pred = self.sigmoid(np.dot(X, self.weights))
        # Predictions can saturate at exactly 0 or 1; the small offset keeps
        # log() finite in that case.
        epsilon = 1e-10
        loss = -np.mean(y * np.log(y_pred + epsilon) + (1 - y) * np.log(1 - y_pred + epsilon))
        return loss

    def compute_gradient(self, X, y):
        """Return the gradient of the mean cross-entropy w.r.t. the weights.

        The result has shape ``(n_features,)`` and is averaged over the rows
        of ``X``.
        """
        y_pred = self.sigmoid(np.dot(X, self.weights))
        gradient = np.dot(X.T, (y_pred - y)) / len(y)
        return gradient

    def fit(self, X, y):
        """Train the weights on ``(X, y)`` with mini-batch SGD.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. NumPy arrays and pandas DataFrames are accepted.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Binary labels encoded as 0/1. NumPy arrays and pandas Series are
            accepted.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``X`` and ``y`` disagree on the
            number of samples.

        Notes
        -----
        Weights are re-initialised from the global NumPy RNG on every call,
        and the loss on the full training set is printed every 100 epochs.
        """
        # Work on plain positional arrays: fancy-indexing a DataFrame/Series
        # with an integer array selects by label rather than by position, and
        # a column-vector y would broadcast (y_pred - y) into a matrix.
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )

        # Initialize weights randomly from a standard Gaussian distribution
        self.weights = np.random.randn(X.shape[1])

        for i in range(self.max_iters):
            # Reshuffle at the start of each epoch. Indexing creates copies,
            # so the caller's arrays are never reordered.
            indices = np.arange(X.shape[0])
            np.random.shuffle(indices)
            X = X[indices]
            y = y[indices]

            # One gradient step per mini-batch
            for batch_start in range(0, X.shape[0], self.batch_size):
                X_batch = X[batch_start:batch_start + self.batch_size]
                y_batch = y[batch_start:batch_start + self.batch_size]

                gradient = self.compute_gradient(X_batch, y_batch)
                self.weights -= self.learning_rate * gradient

            # Progress report on the full training set every 100 epochs
            if i % 100 == 0:
                loss = self.compute_loss(X, y)
                print(f"Iteration {i}, Loss: {loss:.4f}")

    def predict(self, X):
        """Return 0/1 predictions for the rows of ``X``.

        A sample is labelled 1 when its predicted probability is at least 0.5.
        """
        y_pred = self.sigmoid(np.dot(X, self.weights))
        return (y_pred >= 0.5).astype(int)

# Synthetic binary-classification problem: 1000 samples with 20 Gaussian
# features; a sample is positive when its first two features sum to more
# than zero.
np.random.seed(42)
X = np.random.randn(1000, 20)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid to sweep
learning_rates = [0.001, 0.01, 0.1]
batch_sizes = [16, 32, 64]

# Enumerate every (learning rate, batch size) pair, learning rate varying slowest
param_grid = [(rate, size) for rate in learning_rates for size in batch_sizes]

for lr, bs in param_grid:
    print(f"\nTraining model with learning rate = {lr} and batch size = {bs}")

    # Train a fresh model for this configuration
    model = LogisticRegressionSGD(learning_rate=lr, batch_size=bs, max_iters=1000)
    model.fit(X_train, y_train)

    # Score it on the held-out test set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Report the evaluation metrics
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")



Training model with learning rate = 0.001 and batch size = 16
Iteration 0, Loss: 1.8794
Iteration 100, Loss: 0.8664
Iteration 200, Loss: 0.4140
Iteration 300, Loss: 0.2563
Iteration 400, Loss: 0.1972
Iteration 500, Loss: 0.1706
Iteration 600, Loss: 0.1563
Iteration 700, Loss: 0.1472
Iteration 800, Loss: 0.1407
Iteration 900, Loss: 0.1356
Accuracy: 99.00%
Precision: 0.98
Recall: 1.00
F1 Score: 0.99

Training model with learning rate = 0.001 and batch size = 32
Iteration 0, Loss: 2.6316
Iteration 100, Loss: 1.8601
Iteration 200, Loss: 1.2596
Iteration 300, Loss: 0.8237
Iteration 400, Loss: 0.5406
Iteration 500, Loss: 0.3746
Iteration 600, Loss: 0.2839
Iteration 700, Loss: 0.2348
Iteration 800, Loss: 0.2067
Iteration 900, Loss: 0.1895
Accuracy: 96.50%
Precision: 0.95
Recall: 0.98
F1 Score: 0.96

Training model with learning rate = 0.001 and batch size = 64
Iteration 0, Loss: 1.8728
Iteration 100, Loss: 1.5038
Iteration 200, Loss: 1.1919
Iteration 300, Loss: 0.9391
Iteration 400, Loss: 0.