# Diabetic Retinopathy Detection

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision.transforms as T
import torchvision
import os

from torchvision.models import resnet152, ResNet152_Weights
from torch.utils.data import Dataset, DataLoader
from sklearn.utils import class_weight # Estimates class weights for unbalanced datasets.


## Preprocess the Data

In [None]:
# Get device for training
def get_device():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using {device.upper()} device!')
    return device

# Addign device
device = get_device()

Using CPU device!


In [None]:
# Load images names and labels
# path = '/notebooks/train.csv'
# imgs_dir = '/notebooks/data/train_images'

path = '../data/raw/train.csv'
imgs_dir = '../data/raw/train_images/'

# Read csv file
df = pd.read_csv(path)
print(f'Available images:\t{len(df)}')

# As the data is imbalanced, let's calculate the weights for each class.
class_weights = class_weight.compute_class_weight(class_weight = 'balanced',
                                                  classes = np.unique(df.diagnosis),
                                                  y = df.diagnosis.values)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print(class_weights)

# Create custom class to create dataloaders
class CustomImageDataset(Dataset):
    """
    Create a custom dataset for your files.
    It must implement three functions: 
    __init__, __len__, __getitem__
    
    Attributes:
    df: DataFrame with names and labels.
    img_dir: Path to the images' folder.
    transforms: Transforms to be applied to each image.
    """
    
    def __init__(self, df, img_dir, transform=None):
        self.img_labels = df.values
        self.img_dir = img_dir
        self.transform = transform
        
    def __len__(self):
        return len(self.img_labels)
    
    def __getitem__(self, idx):
        img_name, label = self.img_labels[idx]
        img_path = os.path.join(self.img_dir, f'{img_name}.png')
        image = torchvision.io.read_image(img_path)
        if self.transform:
            image = self.transform(image)
        return image, label

# Transfors for train set
transforms = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225))
])

# Create train data
dataset = CustomImageDataset(df, imgs_dir, transforms)

# Set a BATCH size
BATCH_SIZE = 32

# Train test validation split
train_size = 0.70
valid_size = 0.15
test_size = 0.15
total_count = len(dataset)
train_count = int(train_size * total_count)
valid_count = int(valid_size * total_count)
test_count = total_count - train_count - valid_count
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(dataset, (train_count, valid_count, test_count))

# Create Dataloader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

Available images:	3662
tensor([0.4058, 1.9795, 0.7331, 3.7948, 2.4827])


In [None]:
for idx, batch in enumerate(train_dataloader):
    print('Batch: ', idx),
    print('Batch size: ', batch[0].size())
    print('Batch label: ', batch[1])
    break

Batch:  0
Batch size:  torch.Size([32, 3, 224, 224])
Batch label:  tensor([2, 2, 0, 2, 0, 0, 1, 4, 2, 2, 2, 1, 0, 2, 0, 0, 0, 3, 0, 4, 2, 2, 0, 4,
        2, 0, 3, 2, 4, 0, 2, 2])


### Build the Model

In [None]:
# Transfer Learning Model

# Since I do not have a lot of data, I'll use Transfer Learning
# Downloads the resnet152 model
model = resnet152(weights=ResNet152_Weights.DEFAULT)

# Replace fully connected layer with 4 new layers
model.fc = torch.nn.Sequential(torch.nn.Linear(2048, 256),
                                  torch.nn.ReLU(inplace=True),
                                  torch.nn.Linear(256, 128), 
                                  torch.nn.ReLU(inplace=True),
                                  torch.nn.Linear(128, 64),
                                  torch.nn.ReLU(inplace=True),
                                  torch.nn.Linear(64, 5))

In [None]:
# Function to save the model
def saveModel():
    path = './classifier.pth'
    torch.save(model.state_dict(), path)

In [None]:
# Create functions for training and validation
def train(dataloader, model, criterion, optimizer):
    """
    Args: 
        dataloader: Iterator for batches of images
        model: Given an input produces an output by multiplying the input with model weights
        criterion: loss function
        optimizer: updates the model weights
    Returns:
        Average loss per batch which is calculated by dividing the losses for all
        the batches with the number of batches
    """

    # Sets the model for training
    model.train()
    device = get_device()

    # Move model to device
    model.to(device)

    total = 0
    correct = 0 
    running_loss = 0
    for batch, (images, labels) in enumerate(dataloader):
        # Move to CUDA if available
        images = images.to(device)
        labels = labels.to(device)

        # Model predictions
        output = model(images)

        # Loss calculation
        loss = criterion(output, labels)

        running_loss += loss.item()
        total += labels.size(0)
        predictions = output.argmax(dim=1)
        correct += (predictions == labels).sum().item()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Backpropagate the loss
        loss.backward()

        # Adjust parametes based on the calculated gradients
        optimizer.step()
    
    # Average loss for a single batch
    avg_loss = running_loss / len(dataloader)

    # Print results
    print(f'Training Loss:\t{avg_loss:.6f}')
    print(f'Accuracy on Training set:\t{100 * (correct / total):.6f}% [{correct} / {total}]')
    return avg_loss

def validate(dataloader, model, criterion):
    """
    Calculate the average loss per batch and the accuracy of the model's predictions.
    Args: 
        dataloader: Iterator for the batches in the dataset
        model:
        criterion: Loss function
    
    Returns:
        Average loss per batch which is calculated by dividing the losses for all the batches
        with the number of batches
    """        

    # Sets the model for evaluation
    model.eval()
    total = 0.0
    correct = 0.0
    running_loss = 0.0
    device = get_device()

    # No need to calculate gradients
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            model  = model.to(device)

            output = model(images)
            loss = criterion(output, labels).item()
            running_loss += loss

            total += labels.size(0)
            predictions = output.argmax(dim=1)
            correct += (predictions == labels).sum().item()
    
    avg_loss = running_loss / len(dataloader)
    print(f'Validation Loss:\t{avg_loss:.6f}')
    print(f'Accuracy on Validation set: {100 * (correct / total):.6f}% [{correct} / {total}]')
    return avg_loss

### Optimize the Model

In [None]:
def optimize(train_dataloader, valid_dataloader, model, criterion, optimizer, EPOCHS=5):
    """
    Optimize function calls the train & validate functions for the number of EPOCHS
    Args: 
        train_dataloader: 
        test_dataloader:
        model:
        criterion:
        optimizer: Updates the model
        EPOCHS: Number of epochs
    Retuns:
        Tuple of lists containing losses for all the epochs
    """

    # Store losses for all the epochs
    train_losses = []
    valid_losses = []
    best_accuracy = 0.0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1} / {EPOCHS}')
        print(25 * '-----')
        train_loss = train(train_dataloader, model, criterion, optimizer)
        train_losses.append(train_loss)
        valid_loss = validate(valid_dataloader, model, criterion)
        valid_losses.append(valid_loss)

        # Save the model if accuracy is the best
        if valid_loss > best_accuracy:
            saveModel()
            best_accuracy = valid_loss

    print(25 * '#\nDone training!')
    return train_losses, valid_losses

In [None]:
# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
EPOCHS = 50


In [None]:
# Train model step
train_losses, valid_losses = optimize(train_dataloader, valid_dataloader, model, criterion, optimizer, EPOCHS)

Epoch 1 / 50
-----------------------------------------------------------------------------------------------------------------------------
Using CPU device!
Training Loss:	1.613760
Accuracy on Training set:	7.608272% [195 / 2563]
Using CPU device!
Validation Loss:	1.613496
Accuracy on Validation set: 8.014572% [44.0 / 549.0]


UnboundLocalError: local variable 'best_accuracy' referenced before assignment