# Faster-RCNN Training Regimine for Polyp Detection
This notebook is setup to train a Faster-RCNN model for Polyp Detection on a paperspace machine wiht a GPU.   
The current outputs were generated on an A100 GPU.

### Install any dependencies using pip on the paperspace instance

In [1]:
!pip install albumentations

Collecting albumentations
  Downloading albumentations-1.3.0-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting opencv-python-headless>=4.1.1
  Downloading opencv_python_headless-4.7.0.72-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting qudida>=0.0.4
  Downloading qudida-0.0.4-py3-none-any.whl (3.5 kB)
Installing collected packages: opencv-python-headless, qudida, albumentations
Successfully installed albumentations-1.3.0 opencv-python-headless-4.7.0.72 qudida-0.0.4
[0m

## Imports and setup

In [2]:
import os
from src.config import DEVICE, NUM_CLASSES, NUM_EPOCHS, OUTPUT_DIR
from src.config import VISUALIZE_AFTER_TRANSFORM, SAVE_PLOTS_EPOCH, SAVE_MODEL_EPOCH, NUM_WORKERS
from src.model import *
from src.utils import Averager
from tqdm import tqdm_notebook as tqdm
from src.PolypDataset import get_dataloaders
import torch
import matplotlib.pyplot as plt
import time
plt.style.use('ggplot')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### Tell Jupyter to reload the source files if they change

In [3]:
%load_ext autoreload
%autoreload 2


## Instantiate the model, optimizer and dataloaders
This code cell also initializes the data structures for storing losses as well as the model name. 

***change the model name prior to running a new experiment***

In [4]:
 # Initialize the model and move to GPU (if available)
model = create_model(num_classes=NUM_CLASSES)
model = model.to(DEVICE)
#model = torch.nn.DataParallel(_model)


# Obtain model parameters to be optimized/updated in this run.
params = filter(lambda p: p.requires_grad, model.parameters())

# Define the optimizer
# TODO: Try out alternatives to SGD --> Maybe use the ABC algorithm 
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)

# Initialize training loss tracking variables for plotting
train_loss_hist = Averager()
train_iter = 1
train_losses = []
# Initialize validation loss tracking variables for plotting
val_loss_hist = Averager()
val_iter = 1
val_losses = []

# Give the model a name :-)
MODEL_NAME = 'polyps_model_1'


train_loader, valid_loader = get_dataloaders()

# Show transformed images if VISUALIZE_AFTER_TRANSFORM is True
# TODO: Don't use this until we have rewritten the show_transformed_images function
# to work with pyplot instead of cv2
if VISUALIZE_AFTER_TRANSFORM:
    from src.utils import show_transformed_image
    show_transformed_image(train_loader, model)



Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

Train dataset size: 28773
Valid dataset size: 4254



### The training loop function defined

In [5]:
# The training loop function
def train(train_data_loader, model):
    print('Training...')
    global train_iter
    global train_losses

    # THE LOOP w/Beautiful progress bar
    with tqdm(train_data_loader) as pbar:
        for data in pbar:
            # Get the images and targets from the data loader
            images, targets = data

            # Move the images and targets to the GPU
            images = list(image.to(DEVICE) for image in images)
            for target in targets:
                target['boxes'] = target['boxes'].to(DEVICE)
                target['labels'] = target['labels'].to(DEVICE)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            loss_dict = model(images, targets)

            # Get the loss
            losses = sum(loss for loss in loss_dict.values())

            # Backward pass
            losses.backward()

            # Update the weights
            optimizer.step()

            # Update the losses
            train_loss_hist.send(losses.item())
            train_iter += 1
            
            # Update the progress bar
            pbar.set_postfix(loss=losses.item())
            # pbar.update(1)

    return train_losses 


### The validation loop function definded

In [6]:

# The validation loop function
def validate(val_data_loader, model):
    print('Validating...')
    global val_iter 
    global val_losses

    # THE LOOP w/Beautiful progress bar
    with tqdm(val_data_loader) as pbar:
        for data in pbar:
            # Get the images and targets from the data loader
            images, targets = data

            # Move the images and targets to the GPU
            images = list(image.to(DEVICE) for image in images)
            targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]

            # Forward pass
            with torch.no_grad():
                loss_dict = model(images, targets)

            # Get the loss
            losses = sum(loss for loss in loss_dict.values())
            val_losses = losses.item()


            # Update the losses
            val_loss_hist.send(losses.item())
            val_iter += 1
            
            # Update the progress bar
            pbar.set_postfix(loss=losses.item())
            # pbar.update(1)

    return val_losses

## Train the Model
Model weights are saved after `SAVE_MODEL_EPOCH` epochs.

Loss plots are saved after `SAVE_PLOTS_EPOCH` epochs. ***This doesn't work for some reason***

In [None]:

# The MAIN Training Loop
for epoch in range(0, NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')

    # Reset training and validation loss histories
    train_loss_hist.reset()
    val_loss_hist.reset()

    # Prepare training and validation plots:
    figure_1, train_ax = plt.subplots()
    figure_2, val_ax = plt.subplots()

    # Start the timer and begin training and validation
    start = time.time()

    # The training loop
    train_losses = train(train_loader, model)

    # The validation loop
    val_losses = validate(valid_loader, model)

    # Print the training and validation loss
    print(f'Epoch {epoch} train loss: {train_loss_hist.value:.3f} val loss: {val_loss_hist.value:.3f}')
    end = time.time()
    print(f'Training time: {((end - start) / 60):.3f}min for {train_iter} iterations')


    if (epoch % SAVE_MODEL_EPOCH == 0) or (epoch == NUM_EPOCHS):
        # Save the model
        torch.save(model.state_dict(), os.path.join(OUTPUT_DIR,f'model{epoch}.pth'))
        print(f'Saved model to {os.path.join(OUTPUT_DIR,f"model{epoch}.pth")}')
    
    if (epoch % SAVE_PLOTS_EPOCH == 0) or (epoch == NUM_EPOCHS):
        # Generate plots
        train_ax.plot(train_losses, color='blue')
        train_ax.set_xlabel('Iterations')
        train_ax.set_ylabel('Training Loss')
        val_ax.plot(val_losses, color='red')
        val_ax.set_xlabel('Iterations')
        val_ax.set_ylabel('Validation Loss')
        figure_1.savefig(os.path.join(OUTPUT_DIR,f'train_loss{epoch}.png'))
        figure_2.savefig(os.path.join(OUTPUT_DIR,f'val_loss{epoch}.png'))
        print(f'Saved plots to {os.path.join(OUTPUT_DIR,f"[train or val]_loss{epoch}.png")}')

    plt.close('all')

Epoch 1/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 0 train loss: 0.120 val loss: 0.104
Training time: 12.675min for 900 iterations
Saved model to output/model0.pth
Saved plots to output/[train or val]_loss0.png
Epoch 2/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 1 train loss: 0.090 val loss: 0.104
Training time: 12.570min for 1799 iterations
Epoch 3/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 2 train loss: 0.084 val loss: 0.100
Training time: 12.592min for 2698 iterations
Saved plots to output/[train or val]_loss2.png
Epoch 4/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 3 train loss: 0.081 val loss: 0.102
Training time: 12.668min for 3597 iterations
Epoch 5/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 4 train loss: 0.079 val loss: 0.100
Training time: 12.546min for 4496 iterations
Saved plots to output/[train or val]_loss4.png
Epoch 6/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 5 train loss: 0.076 val loss: 0.098
Training time: 12.590min for 5395 iterations
Saved model to output/model5.pth
Epoch 7/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 6 train loss: 0.074 val loss: 0.099
Training time: 12.516min for 6294 iterations
Saved plots to output/[train or val]_loss6.png
Epoch 8/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 7 train loss: 0.073 val loss: 0.098
Training time: 12.588min for 7193 iterations
Epoch 9/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 8 train loss: 0.072 val loss: 0.100
Training time: 12.639min for 8092 iterations
Saved plots to output/[train or val]_loss8.png
Epoch 10/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 9 train loss: 0.070 val loss: 0.102
Training time: 12.554min for 8991 iterations
Epoch 11/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 10 train loss: 0.070 val loss: 0.100
Training time: 12.713min for 9890 iterations
Saved model to output/model10.pth
Saved plots to output/[train or val]_loss10.png
Epoch 12/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 11 train loss: 0.068 val loss: 0.102
Training time: 12.598min for 10789 iterations
Epoch 13/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 12 train loss: 0.068 val loss: 0.100
Training time: 12.545min for 11688 iterations
Saved plots to output/[train or val]_loss12.png
Epoch 14/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

Validating...


  0%|          | 0/132 [00:00<?, ?it/s]

Epoch 13 train loss: 0.067 val loss: 0.101
Training time: 12.591min for 12587 iterations
Epoch 15/15
Training...


  0%|          | 0/899 [00:00<?, ?it/s]

# The machine was auto-shutdown before the training loop could finish. 
It doesn't look like the performance from one epoch to the next had been improving, so the 10th Epoch is likely sufficient. 