In [1]:
## Batch Gradient Descent is memory-inefficient
## Convergence is slow too

## Solution: Use batches of data to train the model, mini-batch gradient descent

In [8]:
import torch

In [2]:
from sklearn.datasets import make_classification

In [5]:
# Step-1: Create a synthetica classification dataset using sklearn
X, y = make_classification(
    n_samples = 10,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    random_state=42
)

In [6]:
X

array([[ 1.06833894, -0.97007347],
       [-1.14021544, -0.83879234],
       [-2.8953973 ,  1.97686236],
       [-0.72063436, -0.96059253],
       [-1.96287438, -0.99225135],
       [-0.9382051 , -0.54304815],
       [ 1.72725924, -1.18582677],
       [ 1.77736657,  1.51157598],
       [ 1.89969252,  0.83444483],
       [-0.58723065, -1.97171753]])

In [7]:
y

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [9]:
X = torch.tensor(X, dtype = torch.float32)
y = torch.tensor(y, dtype = torch.long)

In [10]:
from torch.utils.data import Dataset, DataLoader

In [11]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index):
        return self.features[index], self.labels[index]

In [12]:
dataset = CustomDataset(X, y)

In [13]:
len(dataset)

10

In [14]:
dataset[0]

(tensor([ 1.0683, -0.9701]), tensor(1))

In [None]:
dataloader = DataLoader(dataset, batch_size=2, shuffle=True) ## an iterable

In [17]:
for batch_features, batch_labels in dataloader:
    print(batch_features)
    print(batch_labels)
    print('-'*20)

tensor([[ 1.7273, -1.1858],
        [-0.9382, -0.5430]])
tensor([1, 1])
--------------------
tensor([[-2.8954,  1.9769],
        [-0.7206, -0.9606]])
tensor([0, 0])
--------------------
tensor([[ 1.7774,  1.5116],
        [-0.5872, -1.9717]])
tensor([1, 0])
--------------------
tensor([[ 1.0683, -0.9701],
        [ 1.8997,  0.8344]])
tensor([1, 1])
--------------------
tensor([[-1.9629, -0.9923],
        [-1.1402, -0.8388]])
tensor([0, 0])
--------------------


In [None]:
## transforming the data before using:
## transform the data in __getitem__ before returning
## e.g. images: resize, black/white, data augmentation
##      text: lower(), lemmatization, stopwords



## parallelistaion using more workers, set argument num_workers

In [None]:
## customSamplers can be made as well, e.g. if we have a skewed classification problem
## we'd want a the exact skewed ratio in all our batches

In [None]:
## collate_fn in PyTorch's DataLoader is a function that specifies how to combine a list of samples from a dataset into a single batch
## can be customised too!

In [19]:
import pandas as pd
import numpy as np

In [20]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')

In [21]:
df.drop(columns= ['id', 'Unnamed: 32'], inplace=True)

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

In [24]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [25]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [26]:
X_train_tensor = torch.from_numpy(X_train)
y_train_tensor = torch.from_numpy(y_train)
X_test_tensor = torch.from_numpy(X_test)
y_test_tensor = torch.from_numpy(y_test)

In [28]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index):
        return self.features[index], self.labels[index]

In [27]:
import torch.nn as nn

In [29]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

In [30]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [31]:
class MySimpleNN(nn.Module):
    ## Using built-in loss function
    def __init__(self, num_features):
        super().__init__()
        self.linear = nn.Linear(num_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, features):
        out = self.linear(features)
        out = self.sigmoid(out)
        return out

In [32]:
learning_rate = 0.1
epochs = 25

In [33]:
model = MySimpleNN(X_train_tensor.shape[1])
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
loss_function = nn.BCELoss()

In [39]:
## Training Pipeline
for epoch in range(epochs):

    for batch_features, batch_labels in train_loader:

        y_pred = model(batch_features.to(torch.float32))
        loss = loss_function(y_pred, (batch_labels.to(torch.float32)).view(-1, 1))
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()
        print(f'Epoch: {epoch + 1}, Loss: {loss}')

Epoch: 1, Loss: 0.7623339891433716
Epoch: 1, Loss: 0.5542365908622742
Epoch: 1, Loss: 0.4920593500137329
Epoch: 1, Loss: 0.41141289472579956
Epoch: 1, Loss: 0.41087689995765686
Epoch: 1, Loss: 0.36457258462905884
Epoch: 1, Loss: 0.2835494875907898
Epoch: 1, Loss: 0.2606377899646759
Epoch: 1, Loss: 0.22937524318695068
Epoch: 1, Loss: 0.20857632160186768
Epoch: 1, Loss: 0.22648610174655914
Epoch: 1, Loss: 0.3033960163593292
Epoch: 1, Loss: 0.22582444548606873
Epoch: 1, Loss: 0.19090993702411652
Epoch: 1, Loss: 0.24379292130470276
Epoch: 2, Loss: 0.17796224355697632
Epoch: 2, Loss: 0.1549619734287262
Epoch: 2, Loss: 0.1799703687429428
Epoch: 2, Loss: 0.21160843968391418
Epoch: 2, Loss: 0.15130051970481873
Epoch: 2, Loss: 0.15613576769828796
Epoch: 2, Loss: 0.20483456552028656
Epoch: 2, Loss: 0.17692184448242188
Epoch: 2, Loss: 0.22914950549602509
Epoch: 2, Loss: 0.16981317102909088
Epoch: 2, Loss: 0.17333531379699707
Epoch: 2, Loss: 0.19528992474079132
Epoch: 2, Loss: 0.20452536642551422


In [41]:
model.eval()
accuracy_list = []

with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        y_pred = model(batch_features.to(torch.float32))
        y_pred = (y_pred > 0.8).float()

        batch_accuracy = (y_pred.view(-1) == batch_labels).float().mean().item()
        accuracy_list.append(batch_accuracy)
    
    overall_accuracy = sum(accuracy_list)/len(accuracy_list)
    print(f'Accuracy: {overall_accuracy:.4f}')

Accuracy: 0.9549


In [44]:
encoder.classes_

array(['B', 'M'], dtype=object)