# Training Cascade RCNN

In this notebook, we train a Cascade RCNN ResNet 101 model. The config for this model is in the `src` folder, in the file `training_crcnn_5k_r101.py`.
Some notable points that we used in the config are as follows:
1. We are using Cascade RCNN - ResNet 101 model.
2. The number of training epochs is set to 24, with the learning rate decreasing by a factor of 10 at epochs 12 and 22.
3. Image Scale is set to (5000,5000).
4. Max bounding box predictions per image is set to 400.
5. 90% of the data is used for training and the remaining 10% for validation. To split the data, we randomly selected 10% of the files from each stain for the validation set.

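**We do NOT need a GPU for this notebook.** The notebook only submits the training job; the training itself runs on a separate SageMaker GPU instance.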

In [2]:
import boto3           # For interacting with S3
import numpy as np
import pandas as pd
import sys             # Python system library needed to load custom functions
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# Imports to run Sagemaker training jobs
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
from sagemaker.session import Session

In [3]:
# Add the source directory to the module search path so that local functions
# and modules can be imported by the following cells.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

In [4]:
from config import DEFAULT_BUCKET, DEFAULT_REGION  # The name of the S3 bucket that contains the training data
from detection_util import create_predictions
from gdsc_util import download_and_extract_model, set_up_logging, extract_hyperparams, PROJECT_DIR
from training_crcnn_5k_r101 import load_config
from gdsc_util import load_sections_df
from PredictionEvaluator import PredictionEvaluator

# Initialise logging for this notebook session via the project helper.
# Log output goes to the console and to a .log file (the file's location is
# decided inside the helper and is not visible from this notebook).
set_up_logging()

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


### Load the config file

In [5]:
# The config is built against the project's local `data` directory.
data_folder = str(PROJECT_DIR / 'data')

# load_config hands back a pair: the config object (printed in the next cell)
# and the base file, which is later passed to the training job as a hyperparameter.
config_pair = load_config(data_folder)
cfg, base_file = config_pair

In [6]:
# Show the resolved config text so the training settings can be reviewed.
print('Config:', cfg.pretty_text, sep='\n')

Config:
checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
custom_hooks = [dict(type='NumClassCheckHook')]
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = 'faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth'
resume_from = None
workflow = [('train', 1)]
opencv_num_threads = 0
mp_start_method = 'fork'
auto_scale_lr = dict(enable=True, base_batch_size=2)
dataset_type = 'OnchoDataset'
data_root = '/home/sagemaker-user/gdsc5-tutorials-public/data'
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Resize',
        img_scale=(5000, 5000),
        multiscale_mode='value',
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=(0.3, 0.3),
        crop_type='relative',
        allow_negative_crop=True),
    dic

### Start a sagemaker training instance

In [9]:
# The training script; its file name is also the basis for the experiment name.
entry_point = 'training_crcnn_5k_r101.py'
# Keep the part before the first '.' and swap underscores for hyphens,
# because AWS does not allow '.' or '_' in experiment names.
exp_name = entry_point.partition('.')[0].replace('_', '-')

# Account id of the current AWS caller (not referenced again in this cell)
# and the execution role the training job will run under.
account_id = boto3.client('sts').get_caller_identity().get('Account')
role = get_execution_role()

# SageMaker session bound to the project's default region.
sm_client = boto3.client("sagemaker", region_name=DEFAULT_REGION)
sess = Session(sagemaker_client=sm_client)

# Job output is written under the session's default bucket, whereas the
# training data channel points at the project's data bucket.
s3_output_location = "s3://{}/{}".format(sess.default_bucket(), exp_name)
input_channels = {"train": "s3://{}".format(DEFAULT_BUCKET)}

# Project helper that parses the training script and extracts its config as
# hyperparameters; the base file returned by load_config is added on top.
hyperparameters = extract_hyperparams(entry_point)
hyperparameters['base_file'] = base_file

# Metric definitions handed to the SageMaker estimator. Each "Regex" is matched
# against the training job's log output and its capture group supplies the
# value recorded under "Name".
#
# Raw strings are used so that the regex escapes (\., \[, \() reach the regex
# engine as written instead of relying on Python passing unknown string
# escapes through, which raises an invalid-escape warning on recent versions.
metrics = [
    {"Name": "train:loss_rpn_cls", "Regex": r"loss_rpn_cls: ([0-9\.]+)"},
    {"Name": "train:loss_rpn_bbox", "Regex": r"loss_rpn_bbox: ([0-9\.]+)"},
    {"Name": "train:loss_cls", "Regex": r"loss_cls: ([0-9\.]+)"},
    {"Name": "train:loss_bbox", "Regex": r"loss_bbox: ([0-9\.]+)"},
    {"Name": "train:loss", "Regex": r"loss: ([0-9\.]+)"},
    {"Name": "train:accuracy", "Regex": r"acc: ([0-9\.]+)"},
    # For the epoch counters the capture group sits inside the brackets, so the
    # captured value is the bare number ("3") rather than "[3]", which would
    # not parse as a numeric metric value.
    {"Name": "train:epoch", "Regex": r"Epoch \[([0-9\.]+)\]"},
    {"Name": "val:epoch", "Regex": r"Epoch\(val\) \[([0-9]+)\]"},
    {"Name": "val:mAP", "Regex": r"mAP: ([0-9\.]+)"},
]

# Custom training image, pulled from the ECR registry of the configured region.
training_image_uri = (
    f"954362353459.dkr.ecr.{DEFAULT_REGION}.amazonaws.com/"
    "sm-training-custom:torch-1.8.1-cu111-noGPL"
)

# All estimator settings gathered in one place before the estimator is built.
estimator_settings = dict(
    entry_point=entry_point,            # script the training job will call
    source_dir="../src",                # every file in this folder is copied over to the job
    image_uri=training_image_uri,
    role=role,
    output_path=s3_output_location,
    container_log_level=20,             # 10 = debug, 20 = info
    base_job_name=exp_name,
    instance_count=1,
    instance_type="ml.g4dn.xlarge",     # a GPU instance
    volume_size=45,
    metric_definitions=metrics,
    hyperparameters=hyperparameters,
)

estimator = PyTorch(**estimator_settings)

# Submit the training job to SageMaker.
estimator.fit(
    input_channels,
    wait=False,           # Do not block the notebook: the job runs on another machine while we keep working here.
)

# Save the name of the training job to the filesystem so that it can be used
# later (e.g. to locate the model output).
# The name is read through the estimator's public `latest_training_job` handle
# rather than its private `_hyperparameters` dict, whose contents are an
# implementation detail of the SageMaker SDK.
experiment_name = estimator.latest_training_job.name

# Plain write mode is enough here: the file is only written, never read back
# through this handle.
with open(f'{PROJECT_DIR}/experiment_crcnn_5k_r101_epoch_24.txt', 'w') as f:
    f.write(experiment_name)

2022-07-20 08:53:07,588 - sagemaker.image_uris - INFO - Defaulting to the only supported framework/algorithm version: latest.
2022-07-20 08:53:07,606 - sagemaker.image_uris - INFO - Ignoring unnecessary instance type: None.
2022-07-20 08:53:47,906 - sagemaker - INFO - Creating training-job with name: training-5k-r101-2022-07-20-08-53-07-587


### Download the model and predictions

In [9]:
# Read the training job name back from the filesystem. Surrounding whitespace
# is stripped so that a stray trailing newline in the file cannot end up inside
# the S3 key built below.
with open(f'{PROJECT_DIR}/experiment_crcnn_5k_r101_epoch_24.txt', 'r') as f:
    experiment_name = f.read().strip()

# NOTE: `s3_output_location` is defined in the training cell, so that cell must
# have been run in the current kernel session before this one.
# The job was started with wait=False; the model archive is expected to be
# available only once the training job has finished.
model_location = f'{s3_output_location}/{experiment_name}/output/model.tar.gz'
local_model_dir = download_and_extract_model(model_uri=model_location, local_dir='data')