In [42]:
import torch
import numpy as np
import torch.nn as nn
from sklearn import datasets
from sklearn.preprocessing import StandardScaler  # for feature scaling
from sklearn.model_selection import train_test_split  # for train/test split

In [43]:
# Prepare the data
# Load the small breast cancer dataset that is already available through sklearn.datasets.
bc = datasets.load_breast_cancer()
X = bc.data
y = bc.target

n_samples, n_features = X.shape
print(f'number of samples: {n_samples}, number of features: {n_features}')

# Split the data: 20% of the rows are held out for testing, the remaining 80% are used
# for training. random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the features. Per the original author's note, the raw features differ a lot in
# magnitude (some in the 100s, some around 0.01); feeding them unscaled into a Linear
# layer gives large, unstable logits and therefore an unstable loss.
# The scaler is fitted on the training split only and then applied to both splits, so
# no statistics from the test rows are used during fitting.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

number of samples: 569, number of features: 30


In [44]:
# 1. Convert data into tensors

def _to_float32_tensor(array_like):
  """Copy ``array_like`` into a float32 NumPy array and wrap it as a torch tensor."""
  as_float32 = np.array(array_like, dtype=np.float32)
  return torch.from_numpy(as_float32)

# Feature matrices keep the layout they already have.
X_train, X_test = (_to_float32_tensor(split) for split in (X_train, X_test))

# Targets are reshaped into a single column so they line up with the model's
# one-value-per-row output when the loss is computed.
y_train, y_test = (_to_float32_tensor(split).view(-1, 1) for split in (y_train, y_test))

In [45]:
# 2. Build the model

class LogisticRegressionModel(nn.Module):
  """Logistic regression written as a single linear layer.

  The forward pass returns raw logits, not probabilities: no sigmoid is applied
  inside the model, so callers that need probabilities must apply one themselves.
  """

  def __init__(self, n_input_features):
    """Build the linear layer.

    Args:
      n_input_features: number of features in each input row.
    """
    super().__init__()

    # One output unit: a single logit per input row.
    self.linear = nn.Linear(in_features=n_input_features, out_features=1)

  def forward(self, x):
    """Return the raw logit produced by the linear layer for each row of ``x``."""
    logits = self.linear(x)
    return logits

# Seed PyTorch's global RNG before building the model: nn.Linear draws its initial
# weights and bias at random, so without a fixed seed the losses and accuracy this
# notebook prints change on every Restart & Run All.
torch.manual_seed(42)

# n_features is passed in because nn.Linear must know how many input features to expect.
model = LogisticRegressionModel(n_features)

In [46]:
# 2.1 Instantiate the loss function
# BCEWithLogitsLoss takes raw logits (it applies the sigmoid internally), which matches
# a model whose forward pass returns logits rather than probabilities.
loss_fn = nn.BCEWithLogitsLoss()

# 2.1 Instantiate the optimizer
# Plain stochastic gradient descent over every parameter of the model.
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [47]:
# 2.2 Training loop

epochs = 100
log_every = 10  # report the loss once every this many epochs

for epoch in range(epochs):

  # Clear the gradients accumulated by the previous iteration
  optimizer.zero_grad()

  # Forward pass over the whole training tensor; the model returns raw logits
  y_preds = model(X_train)

  # Loss between the logits and the training targets
  loss = loss_fn(y_preds, y_train)

  # Backward pass: compute gradients of the loss w.r.t. the model parameters
  loss.backward()

  # Apply one optimizer update using those gradients
  optimizer.step()

  if (epoch + 1) % log_every == 0:
    print(f'epoch: {epoch + 1}, loss = {loss.item():.4f}')


epoch: 10, loss = 0.5116
epoch: 20, loss = 0.4406
epoch: 30, loss = 0.3919
epoch: 40, loss = 0.3563
epoch: 50, loss = 0.3290
epoch: 60, loss = 0.3073
epoch: 70, loss = 0.2896
epoch: 80, loss = 0.2747
epoch: 90, loss = 0.2621
epoch: 100, loss = 0.2511


In [48]:
# 3. Evaluating the model

with torch.no_grad():  # gradients are not needed while scoring the test split
  test_logits = model(X_test)
  y_preds = torch.sigmoid(test_logits)  # logits -> probabilities
  y_preds_cls = torch.round(y_preds)  # probabilities -> binary class labels (0 or 1)

  # Accuracy = number of correct predictions / number of samples in the test set
  n_correct = torch.eq(y_preds_cls, y_test).sum()
  n_total = float(y_test.shape[0])
  acc = n_correct / n_total
  print(f'accuracy = {acc:.4f}')

accuracy = 0.9737
