# iLab Project: Group 7-2

#### Federated Learning for Postoperative Gastric Cancer Detection

David Bain 91082596

In [2]:
!pip -q install torch-vision

In [4]:
!pip install torch

Collecting torch
  Using cached torch-2.4.0-cp311-none-macosx_11_0_arm64.whl.metadata (26 kB)
Using cached torch-2.4.0-cp311-none-macosx_11_0_arm64.whl (62.1 MB)
Installing collected packages: torch
Successfully installed torch-2.4.0


In [3]:
import random
import io

import boto3
from PIL import Image
import pandas as pd
import numpy as np
from typing import List, Tuple
from typing import Optional
from botocore.exceptions import BotoCoreError, ClientError
from matplotlib import pyplot as plt

import torchvision.models as models
import torchvision.transforms as v2
import torch
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

ModuleNotFoundError: No module named 'torch'

Read in the AWS access keys used to access S3

In [None]:
# Load the AWS access keys; the next cell reads row 0, columns 0 and 1, of this frame.
key_df = pd.read_csv(
    '../team_user_accessKeys.csv',
    sep=',',
)

Access the s3 bucket with the images and create an s3_client object

In [None]:
# Credentials come from the first row of key_df: column 0 is used as the access key id,
# column 1 as the secret access key.
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY = key_df.iloc[0, 0], key_df.iloc[0, 1]
region_name = 'ap-southeast-2'

# Build the S3 client from those keys; every later cell talks to S3 through this object.
s3_client = boto3.client(
    's3',
    region_name=region_name,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# Sanity check: print the name of every bucket visible to these credentials.
buckets = s3_client.list_buckets()
for bucket in buckets['Buckets']:
    print(bucket['Name'])

In [None]:
# Bucket that holds the images for all clinics.
bucket_name = 'gastric-cancer-data'

# One prefix per class folder for clinic 0. load_images() parses the third path
# segment ('0' or '1') as the integer label.
clinic_0_train_data = ['0/train_data/0/', '0/train_data/1/']
clinic_0_test_data = ['0/test_data/0/', '0/test_data/1/']

[1] Load data from clinic 0

In [None]:
def list_s3_bucket_images(s3_client: 'boto3.client', bucket_n: str, prefix: str = '') -> List[str]:
    """
    Collect the keys of every image object stored under a prefix in an S3 bucket.

    :param s3_client: Client used to obtain the 'list_objects_v2' paginator
    :param bucket_n: Name of the bucket to list
    :param prefix: Key prefix to restrict the listing to (default '' applies no restriction)
    :return: Keys ending in .png, .jpg or .jpeg (case-insensitive), in listing order
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    listing = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket_n, Prefix=prefix)

    # Pages without a 'Contents' entry contribute nothing.
    return [
        entry['Key']
        for chunk in listing
        for entry in chunk.get('Contents', [])
        if entry['Key'].lower().endswith(image_suffixes)
    ]

[2] Read an image from s3

In [None]:
def read_image_from_s3(s3_client: 'boto3.client', bucket_n: str, image_key: str) -> Optional[Image.Image]:
    """
    Download one object from S3 and decode it as an RGB PIL image.

    :param s3_client: Client used to fetch the object
    :param bucket_n: Name of the bucket holding the object
    :param image_key: Key of the object to download
    :return: The decoded image converted to 'RGB', or None if the download or
             the decoding failed (the error is printed, not raised)
    """
    # The return annotation uses the Image.Image class: `Image` itself is the PIL
    # module, which is not a valid type argument for Optional[...].
    try:
        obj = s3_client.get_object(Bucket=bucket_n, Key=image_key)
        img_data = obj['Body'].read()

        # Convert bytes data to a file-like object
        img_bytes = io.BytesIO(img_data)

        # Use PIL to open the image
        image = Image.open(img_bytes).convert('RGB')

        return image

    # Handle S3 related errors
    except (BotoCoreError, ClientError) as e:
        print(f"Error accessing S3 for image {image_key}: {e}")

    # Handle decoding errors
    except (ValueError, IOError) as e:
        print(f"Error decoding image {image_key}: {e}")

    return None


[3] Load the images from S3 into memory without transformation.

Resizing depends on the desired neural network architecture, which has a minimum input size of 32 x 32

In [None]:
# Read in all images under the given S3 prefixes and store them in a list. Preprocessing is done later

def load_images(s3_client: 'boto3.client', bucket_n: str, prefixes: List[str]) -> List[Tuple[int, Image.Image]]:
    """
    Download every image found under each prefix and pair it with its label.

    The label is the third '/'-separated segment of the object key (index 2)
    parsed as an int, e.g. '0/train_data/1/x.png' -> 1.

    :param s3_client: Client used to list and fetch the objects
    :param bucket_n: Name of the bucket holding the images
    :param prefixes: Key prefixes to scan, processed in the given order
    :return: List of (label, image) tuples; images that fail to load or whose
             key cannot be parsed are reported with a print and skipped
    """
    images = []

    for prefix in prefixes:
        print(f"Processing images for prefix: {prefix}")
        image_keys = list_s3_bucket_images(s3_client, bucket_n, prefix)

        for image_key in image_keys:
            try:
                image = read_image_from_s3(s3_client, bucket_n, image_key)
                if image is not None:
                    bin_image_key = image_key.split('/')[2]
                    images.append((int(bin_image_key), image))

                else:
                    print(f"Failed to load image {image_key}")
            # Also covers keys with fewer than three segments or a non-integer label segment
            except Exception as e:
                print(f"Error processing image {image_key}: {e}")
    return images


Define the s3 'prefix' which relates to the directory structure or clinic: 0, 1, 2, 3

Define image preprocessing

In [None]:
def preprocess_images(s3_client: 'boto3.client', bucket_name: str, prefixes: List[str]) -> List[Tuple[int, Image.Image]]:
    """
    Report how many images sit under each prefix, then load them all into memory.

    Despite the name, no image transformation happens here; the images are
    returned exactly as load_images() produced them.

    :param s3_client: Client used to list and fetch the objects
    :param bucket_name: Name of the bucket holding the images
    :param prefixes: Key prefixes to scan
    :return: List of (label, image) tuples from load_images()
    """
    # List all image keys (count report only; load_images() lists each prefix again itself)
    for prefix in prefixes:
        print(f"Listing images for prefixes: {prefix}")
        image_keys = list_s3_bucket_images(s3_client, bucket_name, prefix)
        print(f"Total number of images: {len(image_keys)}\n")

    # Process images in memory
    loaded_images = load_images(s3_client, bucket_name, prefixes)
    print(f"Total number of loaded images: {len(loaded_images)}")

    return loaded_images


In [None]:
# Load every clinic 0 image under the train and test prefixes; each list entry is a (label, image) tuple.
clinic_0_train_images = preprocess_images(s3_client, bucket_name, clinic_0_train_data)
clinic_0_test_images = preprocess_images(s3_client, bucket_name, clinic_0_test_data)

In [None]:
# Show the first and last (label, image) tuple of the train split, then of the test split
for _split_images in (clinic_0_train_images, clinic_0_test_images):
    print(_split_images[0])
    print(_split_images[-1])

Define function to print a handful of original images

In [None]:
def plot_images(images: List[Image.Image], titles: List[str], rows: int, cols: int) -> None:
    """
    Plot a list of images with their titles using matplotlib.

    Images are placed into a rows x cols grid in order; pairing stops at the
    shortest of images, titles and available grid cells. Grid cells that
    receive no image are left blank.

    :param images: List of images to plot
    :param titles: List of titles corresponding to the images
    :param rows: Number of rows in the plot
    :param cols: Number of columns in the plot
    """
    # squeeze=False always yields a 2-D array of Axes, so flatten() also works
    # when rows == cols == 1 (where subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(rows, cols, figsize=(12, 8), squeeze=False)
    axes = axes.flatten()

    for img, ax, title in zip(images, axes, titles):
        ax.imshow(img)
        ax.set_title(title)

    # Hide ticks and frames on every cell, including those left without an image
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

Read images and plot

In [None]:
# Draw a random sample of up to 10 training images (skipped when nothing was loaded)
if clinic_0_train_images:
    sample_size = min(10, len(clinic_0_train_images))
    random_sample = random.sample(clinic_0_train_images, sample_size)

    # Split the (label, image) tuples into parallel lists: labels become the plot titles
    sampled_titles, sampled_images = (list(column) for column in zip(*random_sample))

    # Display images using matplotlib in a 2-row grid
    num_images = len(sampled_images)
    plot_images(sampled_images, sampled_titles, 2, int(np.ceil(num_images / 2)))


Resolution is not very good

Review image size differences

In [None]:
# Echo the sampled PIL images as the cell output to review their sizes.
# sampled_images only exists if the sampling cell above found training images.
sampled_images

The images are low resolution and vary in size. This will limit which transfer learning models can be used.

Image Transformation
1) Transform to tensor
2) Normalise
3) Augmentation

In [None]:
def resize_images(images: List[Tuple[int, Image.Image]], size: List[int]) -> List[Tuple[int, Image.Image]]:
    """
    Resize every image to a common size, keeping its label.

    :param images: List of (label, image) tuples
    :param size: Target size handed to the Resize transform, e.g. [32, 32]
    :return: New list of (label, resized image) tuples in the same order
    """
    # `v2` is this notebook's alias for torchvision.transforms
    transform = v2.Resize(size)

    return [(label, transform(image)) for label, image in images]
    

def convert_to_tensors(images: List[Tuple[int, Image.Image]]) -> List[Tuple[int, Tensor]]:
    """
    Convert every PIL image to a tensor, keeping its label.

    :param images: List of (label, image) tuples
    :return: New list of (label, tensor) tuples in the same order
    """
    # `v2` is this notebook's alias for torchvision.transforms
    transform = v2.ToTensor()

    return [(label, transform(image)) for label, image in images]


def compute_mean_std(tensor_label_images: List[Tuple[int, Tensor]]) -> Tuple[Tensor, Tensor]:
    """
    Compute the per-channel mean and standard deviation over a set of images.

    All image tensors must share one shape so they can be stacked; the stacked
    batch is reduced over dimensions 0, 2 and 3, leaving one value per index
    of dimension 1 (the channel dimension).

    :param tensor_label_images: Non-empty list of (label, tensor) tuples
    :return: (mean, std) as 1-D tensors with one entry per channel
    :raises ValueError: If the list is empty
    """
    if not tensor_label_images:
        raise ValueError("compute_mean_std requires at least one image")

    # Stack the image tensors along a new leading batch dimension
    stacked_tensors = torch.stack([image for _, image in tensor_label_images], dim=0)

    # Reduce over batch (0) and the two trailing dimensions (2, 3); the channel dimension (1) is kept
    mean = stacked_tensors.mean(dim=(0, 2, 3))
    std = stacked_tensors.std(dim=(0, 2, 3))

    return mean, std


def normalise_images(tensor_label_images: List[Tuple[int, Tensor]], mean: Tensor, std: Tensor) -> List[Tuple[int, Tensor]]:
    """
    Normalise every image tensor with the given statistics, keeping its label.

    :param tensor_label_images: List of (label, tensor) tuples
    :param mean: Per-channel means, as returned by compute_mean_std()
    :param std: Per-channel standard deviations, as returned by compute_mean_std()
    :return: New list of (label, normalised tensor) tuples in the same order
    """
    # `v2` is this notebook's alias for torchvision.transforms
    transform_normalize = v2.Normalize(mean, std)

    return [
        (label, transform_normalize(tensor_image))
        for label, tensor_image in tensor_label_images
    ]




In [None]:
def transform_images(target_size: List[int], images: List[Tuple[int, Image.Image]]) -> List[Tuple[int, Tensor]]:
    """
    Resize, tensorise and normalise a list of labelled images.

    The mean and standard deviation used for normalisation are computed from
    the images passed in and printed.

    :param target_size: Size handed to resize_images(), e.g. [32, 32]
    :param images: List of (label, image) tuples to transform
    :return: List of (label, normalised tensor) tuples in the same order
    """
    # Transform the images that were passed in, not a notebook-level global,
    # so that train and test splits each produce their own dataset.
    resized_images = resize_images(images, target_size)

    tensor_images = convert_to_tensors(resized_images)

    mean, std = compute_mean_std(tensor_images)

    print("Mean:", mean)
    print("Std Dev:", std)

    normalised_tensor_images = normalise_images(tensor_images, mean, std)

    return normalised_tensor_images
    
   

In [None]:
# Size passed through transform_images() to the resize step.
target_size = [32, 32]

# Each call prints the mean / std it computes and returns a list of normalised (label, tensor) tuples.
clinic_0_train_ds = transform_images(target_size, clinic_0_train_images)
clinic_0_test_ds = transform_images(target_size, clinic_0_test_images)

Create a custom Dataset for the images

In [None]:
# Dataset wrapper around the in-memory list of transformed images
class CustomDataset(Dataset):
    """Dataset over an in-memory, indexable collection of (label, image) pairs."""

    def __init__(self, data):
        """Keep a reference to `data`, the collection of (label, image) pairs."""
        self.data = data

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair stored at `idx`, swapped into (image, label) order."""
        target, sample = self.data[idx]
        return sample, target

Create an instance of the custom dataset

In [None]:
# Create custom datasets
# Wrap the transformed (label, tensor) lists so they can be indexed as (image, label) samples.

train_dataset = CustomDataset(clinic_0_train_ds)
test_dataset = CustomDataset(clinic_0_test_ds)

Create a DataLoader

In [None]:
# Number of samples per mini-batch, shared by both loaders.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# NOTE(review): shuffle=True on the test loader is probably unnecessary for evaluation — consider shuffle=False.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

Make sure this runs on Mac GPU

In [None]:
# Use the MPS backend when PyTorch reports it as available; otherwise fall back to the CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

Define the first Torchvision model: VGG16

In [None]:
# VGG16 architecture with no pretrained weights loaded (weights=None)
model = models.vgg16(weights=None)

# Input width of the stock final classifier layer (kept for reference; not used below)
num_features = model.classifier[6].in_features

# Replace the classifier head with a single-output head for binary classification.
# The head ends in a plain Linear layer with no Sigmoid: the training cell uses
# nn.BCEWithLogitsLoss and evaluate() applies torch.sigmoid itself, so the model
# must emit raw logits. A Sigmoid here would be applied twice.
model.classifier = nn.Sequential(
    nn.Flatten(),
    # 25088 input features: presumably 512 x 7 x 7 from VGG16's pooled feature map — confirm
    nn.Linear(25088, 4096),
    nn.ReLU(),
    nn.Linear(4096, 512),
    nn.ReLU(),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 1),
)

model = model.to(device)
print(model)


Training Loop

In [None]:
# Loss function: binary cross-entropy computed on raw logits.
# NOTE(review): BCEWithLogitsLoss applies the sigmoid itself — confirm the model's last layer emits logits.
criterion = nn.BCEWithLogitsLoss()

# Optimiser: SGD with momentum over all model parameters
optimiser = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Train loop: one full pass over train_loader per epoch
num_epochs = 20
for epoch in range(num_epochs):
    
    # Sum of per-sample losses accumulated over this epoch
    running_loss = 0.0
    model.train() # Set model to train mode
    for inputs, labels in train_loader:
        
        # Move inputs and labels to the selected device and convert to float;
        # unsqueeze(1) turns the label batch into a column to match the model's single output
        inputs = inputs.to(device).float()
        labels = labels.to(device).float().unsqueeze(1)
        
        # Zero the parameter gradients left over from the previous step
        optimiser.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimise
        loss.backward()
        optimiser.step()
        
        # Update running loss, weighting the batch loss by the batch size
        running_loss += loss.item() * inputs.size(0)
        
    # Average loss per training sample for this epoch
    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {epoch_loss:.4f}")
    
    
print("Training complete!")

Test Classification results

In [None]:
def evaluate(model, test_loader, criterion=None, device=None):
    """
    Evaluate a binary classifier that outputs one logit per sample.

    Prints the number of correct and incorrect predictions.

    :param model: Module mapping a float image batch to logits of shape (batch, 1)
    :param test_loader: Iterable of (images, labels) batches; labels are 0 / 1
    :param criterion: Loss with mean reduction computed on logits; defaults to
                      nn.BCEWithLogitsLoss(), the loss used by the training cell
    :param device: Device to run on; defaults to the device of the model's parameters
    :return: (average loss per sample, accuracy); both are NaN when the loader
             yields no samples
    """
    if criterion is None:
        criterion = nn.BCEWithLogitsLoss()
    if device is None:
        # Run where the model already lives instead of relying on a notebook global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss = 0.0
    total = 0
    correct_predictions = 0
    incorrect_predictions = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device).float()
            labels = labels.to(device).float().unsqueeze(1)

            outputs = model(images)
            n_samples = labels.size(0)

            # Weight each batch loss by its size so a smaller final batch does not
            # skew the average (same convention as the training loop)
            total_loss += criterion(outputs, labels).item() * n_samples

            # Logit -> probability -> hard 0 / 1 prediction
            predicted = torch.sigmoid(outputs).round()
            total += n_samples
            correct_predictions += (predicted == labels).sum().item()
            incorrect_predictions += (predicted != labels).sum().item()

    print(f"Correct predictions: {correct_predictions}")
    print(f"Incorrect predictions: {incorrect_predictions}")

    if total == 0:
        # Nothing to average over; avoid a ZeroDivisionError
        return float("nan"), float("nan")

    return total_loss / total, correct_predictions / total




In [None]:
# Test the model for accuracy and loss
# evaluate() also prints the correct / incorrect prediction counts before returning.

test_loss, test_accuracy = evaluate(model, test_loader)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Check for duplicate images

In [None]:
def check_for_duplicates(train_loader, test_loader):
    """
    Find images that appear in both the training and the test loader.

    Two images count as the same when their flattened values are equal.

    :param train_loader: Iterable of (images, labels) batches for training
    :param test_loader: Iterable of (images, labels) batches for testing
    :return: (number of shared images, set of the shared images as flat tuples)
    """
    def _flattened_images(loader):
        # A tuple of the flattened values is hashable, so it can live in a set
        return {
            tuple(image.numpy().flatten())
            for images, _ in loader
            for image in images
        }

    train_images_set = _flattened_images(train_loader)
    test_images_set = _flattened_images(test_loader)

    duplicates = train_images_set & test_images_set
    return len(duplicates), duplicates




In [None]:
# Count the images shared between the train and test loaders and report the result
num_duplicates, duplicates = check_for_duplicates(train_loader, test_loader)
print(f"Number of duplicates: {num_duplicates}")
if num_duplicates:
    print("Duplicates found!")