## Question 3 + 4

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [17]:
class LogisticRegressionminiBatchSGD:
    def __init__(self, learning_rate = 0.01, batch_size = 32, max_iter = 1000):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.weights = None
        self.bias = 0

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def loss(self, y, y_pred):
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.random.normal(0, 1, n_features)
        self.bias = 0

        for i in range(self.max_iter):
            indices = np.arange(n_samples)
            np.random.shuffle(indices)
            X, y = X[indices], y[indices]

            for j in range(0, n_samples, self.batch_size):
                X_batch = X[j:j + self.batch_size]
                y_batch = y[j:j + self.batch_size]

                lin_model = np.dot(X_batch, self.weights) + self.bias
                y_pred = self.sigmoid(lin_model)

                # Calculating the gradients
                dw = (1 / len(X_batch)) * np.dot(X_batch.T, (y_pred - y_batch))
                db = (1 / len(X_batch)) * np.sum(y_pred - y_batch)

                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db

    def predict_prob(self, X):
        lin_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(lin_model)

    def predict(self, X):
        y_pred_prob = self.predict_prob(X)
        return [1 if i >+ 0.5 else 0 for i in y_pred_prob]

In [19]:
Wisconsin_data = pd.read_csv("wdbc.data", header = None)
Wisconsin_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [21]:
X = Wisconsin_data.iloc[:, 2:]
y = Wisconsin_data.iloc[:, 1].map({"M" : 1, "B" : 0})

# Splitting into train and test data sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

# Splitting train into train and validation data sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.2, stratify = y_temp)

In [23]:
# Count the class in the train set
train_class_counts = y_train.value_counts()
print(train_class_counts)

# Count the class in the validation set
val_class_counts = y_val.value_counts()
print(val_class_counts)

1
0    228
1    136
Name: count, dtype: int64
1
0    57
1    34
Name: count, dtype: int64


In [25]:
model = LogisticRegressionminiBatchSGD(learning_rate = 0.01, batch_size = 30, max_iter = 1000)

model.fit(X_train.to_numpy(), y_train.to_numpy())

test_pred = model.predict(X_test)

  return 1 / (1 + np.exp(-x))


In [27]:
accuracy = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)

print(accuracy, precision, recall, f1)

0.9298245614035088 0.9047619047619048 0.9047619047619048 0.9047619047619048


Based on my findings, my implementation performs extremely well for accuracy and then about the same for precision, recall, and f1 score (for this random run of the code). This might be a result of the imbalance in classes, but even then the scores are all quite similar. 