In [None]:
# Defines the CancerDataset class, which loads the image data in a form usable by
# PyTorch utilities such as torch.utils.data.DataLoader.

import torch
import os
import pandas as pd
from torchvision.io import read_image
import os
import glob
import numpy as np
from torch.utils.data import Dataset

class CancerDataset(Dataset):
    """
    Abstraction representing a dataset containing the images in the Breast Histopathology Images Dataset
    (https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images).

    Only the paths and labels are stored; each image is read from disk when it is indexed.
    """

    def __init__(self, img_labels, img_paths, transform=None, target_transform=None):
        """
        Stores the labels, paths and optional transforms.

        Parameters:
        img_labels - indexable sequence of labels; img_labels[i] is the label of img_paths[i]
        img_paths - indexable sequence of image file paths
        transform - optional callable applied to every loaded image
        target_transform - optional callable applied to every label

        Raises:
        ValueError - if img_labels and img_paths do not have the same length
        """
        # A length mismatch would otherwise only show up later as an IndexError
        # (too few labels) or as silently ignored labels (too many).
        if len(img_labels) != len(img_paths):
            raise ValueError(
                f"img_labels has {len(img_labels)} entries but img_paths has {len(img_paths)}"
            )
        self.img_labels = img_labels
        self.img_paths = img_paths
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Returns the number of images in the dataset."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """
        Reads the image at position idx from disk and returns it with its label.

        Parameters:
        idx - position of the sample in img_paths / img_labels

        Returns:
        (image, label), each passed through its transform when one was supplied
        """
        img_path = self.img_paths[idx]
        image = read_image(img_path)
        label = self.img_labels[idx]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

In [None]:
import glob
import numpy as np
import pandas as pd
import torch
import torchvision
from torch.utils.data import DataLoader
from torchvision.transforms import Resize
from torchvision.transforms import Lambda
from torchvision.transforms.v2 import ToDtype
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch import nn

# Prefer CUDA, then MPS, and fall back to the CPU when neither is available.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# Collect every PNG below the dataset root (recursive glob).
image_glob_pattern = '/Users/calvindejong/Downloads/cancer_images/IDC_regular_ps50_idx5/**/*.png'
image_paths = np.array(glob.glob(image_glob_pattern, recursive=True))
if len(image_paths) == 0:
    # Fail here with a clear message instead of letting the data-loading code
    # below fail later with an unrelated-looking error.
    raise FileNotFoundError(f"No PNG images matched {image_glob_pattern!r}")

# The label is the single character right before the ".png" extension (index -5).
# NOTE(review): presumably the files are named "...class0.png" / "...class1.png" - confirm.
labels = np.array([int(path[-5]) for path in image_paths], dtype=int)

# Resize every image to 50x50 and convert it to float; scale=True asks ToDtype to
# rescale the values for the new dtype as well.
img_transforms = transforms.Compose([
    transforms.Resize((50, 50)),
    ToDtype(torch.float, scale=True),
])

dataset = CancerDataset(
    img_labels=labels,
    img_paths=image_paths,
    transform=img_transforms,
    # Turn the integer label y into a length-2 one-hot float vector.
    target_transform=Lambda(lambda y: torch.zeros(
    2, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y), value=1))
)

# 80/20 train/test split; the fixed generator seed makes the split reproducible.
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset=dataset,
    lengths=[0.8, 0.2],
    generator=torch.Generator().manual_seed(22)
)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

# Pull a single batch of images; the labels of that batch are discarded.
data, _ = next(iter(train_dataloader))

class NeuralNetwork(nn.Module):
    """
    Feedforward (fully connected) classifier intended to predict whether IDC is
    present in a 50x50x3 pixel image.
    """

    def __init__(self):
        """
        Builds the layers: a flatten step, then linear layers mapping
        7500 -> 1000 -> 100 -> 2 features with a ReLU after each of the first two.
        """
        super().__init__()
        self.flatten = nn.Flatten()

        # Created in input-to-output order and passed positionally, so the
        # Sequential indexes them 0..4.
        input_to_hidden = nn.Linear(50*50*3, 1000)
        hidden_to_hidden = nn.Linear(1000, 100)
        hidden_to_output = nn.Linear(100, 2)
        self.linear_relu_stack = nn.Sequential(
            input_to_hidden,
            nn.ReLU(),  # f(x) = max(0,x)
            hidden_to_hidden,
            nn.ReLU(),
            hidden_to_output,
        )

    def forward(self, x):
        """
        Flattens the input batch and runs it through the linear/ReLU stack.
        Returns the raw logits (no softmax applied).
        """
        return self.linear_relu_stack(self.flatten(x))

# Create the network and move its parameters to the selected device.
model = NeuralNetwork().to(device)

# Hyperparameters
learning_rate = 1e-3  # step size handed to the optimizer
batch_size = 64  # samples per batch; same value the DataLoaders above are built with
epochs = 5  # NOTE(review): reassigned to 10 right before the training loop further down

def train_loop(dataloader, model, loss_fn, optimizer):
    """
    Loops through the entire training dataset once, computing the model's prediction, performing back prop,
    and updating the parameters for each batch. Progress is printed every 100 batches.

    Parameters:
    dataloader - a pytorch DataLoader object holding the training data
    model - a NeuralNetwork object (see above)
    loss_fn - a pytorch loss function (eg nn.CrossEntropyLoss)
    optimizer - a pytorch optimizer (eg torch.optim.SGD)

    Returns:
    nothing
    """
    size = len(dataloader.dataset)

    # Send each batch to the device the model's parameters live on, rather than
    # relying on a module-level variable that may not match the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()

    # Running count of samples processed, so the progress line is correct for
    # any batch size instead of assuming a globally configured one.
    seen = 0
    for batch, (X, y) in enumerate(dataloader):
        X = X.to(target_device)
        y = y.to(target_device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared after the step so they do not
        # accumulate into the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        seen += len(X)
        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    """
    Loops through the entire test dataset, computing the model's predictions and printing the test error,
    accuracy, and average loss the model achieved
    
    Parameters:
    dataloader - a pytorch DataLoader object holding the test data
    model - a NeuralNetwork object that has undergone at least one iteration of train_loop()
    loss_fn - a pytorch loss function (eg nn.CrossEntropyLoss)
    
    Returns:
    nothing
    """
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

# Cross-entropy loss, applied to the raw logits the model returns.
loss_fn = nn.CrossEntropyLoss()
# Plain stochastic gradient descent over all of the model's parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Number of full passes over the training data (replaces the earlier epochs = 5).
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    # One pass over the training set, then an evaluation on the test set.
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done training")

# Save model
# Only the parameters (state_dict) are written, to a path relative to the
# current working directory.

torch.save(model.state_dict(), 'model_weights.pth')
