## 1. Setup

### 1.1. Importing the required libraries

In [None]:
import sys
import gc
import random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
!pip install optuna &> /dev/null

In [None]:
!pip install kaleido &> /dev/null

In [None]:
!pip install mlflow &> /dev/null

In [None]:
# Image Augmentation
import albumentations as A

# Hyperparameter Optimization
import optuna

# Experiment Tracking
import mlflow

In [None]:
# PyTorch
import torch
from torch.utils.data import Dataset, Subset, DataLoader

# Torchvision
import torchvision
from torchvision.utils import draw_bounding_boxes
from torchvision.ops import box_convert, box_iou
import torchvision.transforms as T
from torchvision.transforms.functional import to_pil_image

# Faster R-CNN (MobileNet)
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [None]:
# Report the versions of the key third-party libraries, one per line
for library in (A, optuna, mlflow, torch, torchvision):
    print(library.__version__)

### 1.2. Setting general constants

In [None]:
# Mount Google Drive at /content/drive; the project paths defined below
# live under this mount point (the `google.colab` package is Colab-specific)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# all CUDA devices) to get partial reproducibility
SEED = 0
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)

In [None]:
# Project root on the mounted Google Drive
PROJECT_PATH = Path('/content/drive/MyDrive/ML_Projects/How_many_Sparrows')
DATA_PATH = PROJECT_PATH / 'data'

# Raw inputs: the image folder and the bounding-box annotation file
IMAGE_PATH = DATA_PATH / 'raw' / 'images'
BBOX_DATA_PATH = DATA_PATH / 'raw' / 'bboxes/bounding_boxes.csv'

# Prepared CSV files for the train and test sets
TRAIN_FILE_PATH = DATA_PATH / 'prepared' / 'train.csv'
TEST_FILE_PATH = DATA_PATH / 'prepared' / 'test.csv'

# Output folders for model weights, MLflow runs and Optuna studies
SAVE_MODEL_PATH = PROJECT_PATH / 'models'
MLRUN_PATH = PROJECT_PATH / 'mlruns'
HYPER_OPT_PATH = PROJECT_PATH / 'hyper_opt'

# Create the output folders if missing (the project root must already exist)
for path in (SAVE_MODEL_PATH, MLRUN_PATH, HYPER_OPT_PATH):
    path.mkdir(exist_ok=True)

In [None]:
MODEL_NAME = 'faster_rcnn_mob'  # used in saved file names, MLflow tags and the registry name
NUM_CLASSES = 2 # 1 class (house sparrow) + background
BATCH_SIZE = 16
EVAL_IOU_THRESH = 0.4  # IoU threshold passed to the evaluation metric
EVAL_BETA = 2  # beta of the F-beta score (beta > 1 weights recall above precision)
EPOCHS = 15  # number of epochs for the fine-tuning run

# Set the device to be used to run the model
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 2. Helper Functions

### 2.1. For data manipulation

In [None]:
def get_image_transforms(box_format, save_aug_file_name=''):
    """Build the Albumentations augmentation pipeline for images with boxes.

    Parameters
    -----------
        box_format (str) -- bounding-box format handed to A.BboxParams
        save_aug_file_name (str) -- when non-empty, the pipeline is also
            serialized with A.save into PROJECT_PATH / 'configs' under this name

    Return
    -----------
        the composed A.Compose object (applied with probability 0.8).
    """
    pipeline = [
        # Resizing
        A.LongestMaxSize(1333, always_apply=True),
        A.SmallestMaxSize(800, always_apply=True),
        # Geometric flips
        A.HorizontalFlip(p=0.6),
        A.VerticalFlip(p=0.4),
        # Photometric changes
        A.ColorJitter(0.5, 0.5, 0.5, 0, p=0.7),
        A.RandomRain(p=0.5),
    ]
    # One of two blur variants
    pipeline.append(
        A.OneOrOther(
            A.Blur(10, p=0.7),
            A.GaussianBlur((11, 21), p=0.3),
            p=0.6,
        )
    )

    box_settings = A.BboxParams(format=box_format, label_fields=['labels'])
    aug = A.Compose(pipeline, box_settings, p=0.8)

    if not save_aug_file_name:
        return aug

    # Persist the pipeline definition next to the other project configs
    config_path = PROJECT_PATH / 'configs'
    config_path.mkdir(exist_ok=True)
    A.save(aug, config_path / save_aug_file_name)
    return aug

In [None]:
def stratified_group_train_test_split(data, stratification_basis, groups,
                                      n_splits=2, random_state=0):
    """Split data in a stratified way into training and test sets,
    taking into account groups, and return the corresponding indices.

    The split is the first fold of a shuffled StratifiedGroupKFold, so the
    test part holds roughly 1 / n_splits of the samples and no group appears
    on both sides.

    Parameters
    -----------
        data -- samples to split (only its length/indexing is used by the splitter)
        stratification_basis -- values to stratify on
        groups -- group label of every sample
        n_splits (int) -- number of folds; the default 2 gives a ~50/50 split
        random_state (int) -- seed of the shuffling (default 0)

    Return
    -----------
        (train_ids, test_ids) -- index arrays of the two parts.
    """
    split = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    train_ids, test_ids = next(split.split(X=data, y=stratification_basis, groups=groups))
    return train_ids, test_ids

In [None]:
def collate_batch(batch):
    """Transpose a list of samples into per-field tuples for a Dataloader.

    A batch [(img0, tgt0), (img1, tgt1)] becomes ((img0, img1), (tgt0, tgt1)),
    so that images of different sizes are not stacked into one tensor.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [None]:
def draw_bboxes_on_image(img, bboxes, scores=None):
    """Plot an image tensor with its bounding boxes drawn in orange.

    Parameters
    -----------
        img (Tensor) -- image tensor; converted to uint8 when it has another dtype
        bboxes (Tensor) -- boxes passed to torchvision's draw_bounding_boxes
        scores (iterable) (optional) -- one score per box; when given, each
            score is written at the first two coordinates of its box
    """
    needs_uint8 = img.dtype != torch.uint8
    if needs_uint8:
        img = T.functional.convert_image_dtype(img, dtype=torch.uint8)

    annotated = draw_bounding_boxes(img.detach(), boxes=bboxes, colors='orange', width=2)
    picture = to_pil_image(annotated.detach())

    plt.figure(figsize=(8, 10))
    plt.imshow(picture)
    plt.axis('off')
    axes = plt.gca()

    if scores is not None:
        for box, score in zip(bboxes, scores):
            left, top = box.tolist()[:2]
            # Score label on a semi-transparent orange background
            axes.text(left, top, f"{score:0.2f}", fontsize=12,
                      bbox=dict(facecolor='orange', alpha=0.5))

    plt.show()

### 2.2. For a model training cycle

In [None]:
def train_one_epoch(dataloader, model, optimizer, device=torch.device('cpu')):
    """Pass a training step in one epoch.

    Parameters
    -----------
        dataloader -- iterable of (images, targets) batches; targets are dicts of tensors
        model (nn.Module) -- a detection model returning a dict of losses in training mode
        optimizer -- a torch optimizer over the model parameters
        device (torch.device or str) -- device the batches are moved to
            (default torch.device('cpu'))

    Return
    -----------
        a dictionary with the per-loss averages ('epoch_dict_losses') and
        the average total loss ('epoch_loss') over the batches.

    Raises
    -----------
        ValueError -- if the dataloader has no batches.
    """
    accum_dict_losses = {}
    accum_model_loss = 0
    num_batches = len(dataloader)

    if num_batches == 0:
        raise ValueError("dataloader is empty: there are no batches to train on")

    # Compare the device type so that indexed devices such as 'cuda:0' also count
    on_cuda = torch.device(device).type == 'cuda'

    # Set a model to the training mode
    model.train()

    for images, targets in dataloader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Compute a model batch losses
        batch_dict_losses = model(images, targets)
        batch_model_loss = sum([loss for loss in batch_dict_losses.values()])

        # Accumulate statistics for computing the average losses per epoch
        accum_dict_losses.update({
            k: accum_dict_losses.get(k, 0) + v.item() for k, v in batch_dict_losses.items()
            })
        accum_model_loss += batch_model_loss.item()

        # Optimize the model parameters
        optimizer.zero_grad()
        batch_model_loss.backward()
        optimizer.step()

        # Free up memory
        del images
        del targets
        gc.collect()

        if on_cuda:
            torch.cuda.empty_cache()

    # Compute the average losses
    epoch_dict_losses = {k: v / num_batches for k, v in accum_dict_losses.items()}
    epoch_model_loss = accum_model_loss / num_batches

    return {'epoch_dict_losses': epoch_dict_losses,
            'epoch_loss': epoch_model_loss}

In [None]:
@torch.inference_mode()
def precision_recall_fbeta_scores(gts, preds, iou_thresh=0.5, beta=1):
    """Calculate the batch precision, recall, and f_beta scores based on IoU thresholds.

    For every image, each ground-truth box selects the predicted box with the
    highest IoU; that prediction counts as a true positive when the IoU is at
    least iou_thresh (a prediction selected by several ground-truth boxes is
    counted once).

    Parameters
    -----------
        gts (sequence of dict) -- ground truths with 'boxes' and 'labels' tensors
        preds (sequence of dict) -- predictions with a 'boxes' tensor
        iou_thresh (float) -- minimum IoU for a correct prediction (default 0.5)
        beta (float) -- beta of the f_beta score (default 1)

    Return
    -----------
        a dictionary with 'precision', 'recall' and 'f_beta'; a score whose
        denominator is zero is reported as 0.

    Raises
    -----------
        ValueError -- if beta or iou_thresh is negative.
    """
    # Both values must be validated; `(beta or iou_thresh) < 0` checked only one of them
    if beta < 0 or iou_thresh < 0:
        raise ValueError("beta and iou_thresh should be >=0")

    num_gt_boxes = 0
    num_pred_boxes = 0
    tp = 0

    for gt, pred in zip(gts, preds):
        num_gt_boxes += gt['labels'].numel()

        if pred['boxes'].numel() == 0:
            # No detections for this image: the missed boxes lower recall only;
            # they are not predictions and must not enter the precision denominator
            continue

        num_pred_img = pred['boxes'].shape[0]
        num_pred_boxes += num_pred_img

        if gt['boxes'].numel() == 0:
            # Nothing to match against: every prediction is a false positive
            continue

        # Box IoU, shape (num_gt, num_pred)
        gt_pred_box_iou = box_iou(gt['boxes'], pred['boxes'])
        max_ious = torch.max(gt_pred_box_iou, dim=1)

        # Mark predictions as true positives based on the given IoU threshold
        matched = torch.zeros(num_pred_img, dtype=torch.bool, device=pred['boxes'].device)
        matched[max_ious.indices[max_ious.values >= iou_thresh]] = True
        tp += int(matched.sum().item())

    # Precision, recall, and f_beta scores (guarded against empty denominators)
    recall = tp / num_gt_boxes if num_gt_boxes else 0.0
    precision = tp / num_pred_boxes if num_pred_boxes else 0.0
    denom = (beta**2 * precision) + recall
    f_beta = ((1 + beta**2) * (precision * recall)) / denom if denom != 0 else 0.0

    return {'precision': precision,
            'recall': recall,
            'f_beta': f_beta}

In [None]:
@torch.inference_mode()
def eval_one_epoch(dataloader, model, iou_thresh=0.5, beta=1, device=torch.device('cpu')):
    """Pass an inference evaluation step in one epoch.

    Parameters
    -----------
        dataloader -- iterable of (images, targets) batches; targets are dicts of tensors
        model (nn.Module) -- a detection model returning one prediction dict per image
        iou_thresh (float) -- IoU threshold forwarded to the metric (default 0.5)
        beta (float) -- beta forwarded to the metric (default 1)
        device (torch.device or str) -- device the batches are moved to
            (default torch.device('cpu'))

    Return
    -----------
        a dictionary with the batch-averaged scores ('epoch_scores') and
        the list of all prediction dicts in dataloader order ('results').
    """
    accum_model_scores = {}
    results = []
    num_batches = len(dataloader)

    # Compare the device type so that indexed devices such as 'cuda:0' also count
    on_cuda = torch.device(device).type == 'cuda'

    # Set a model to the evaluation mode
    model.eval()

    for images, targets in dataloader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Get prediction results
        outputs = model(images)
        results += outputs

        # Compute a model batch statistics
        batch_model_scores = precision_recall_fbeta_scores(
            targets, outputs, iou_thresh=iou_thresh, beta=beta)

        # Accumulate statistics for computing the average values per epoch
        accum_model_scores.update({
            k: accum_model_scores.get(k, 0) + v for k, v in batch_model_scores.items()
            })

        # Free up memory
        del images
        del outputs
        gc.collect()

        if on_cuda:
            torch.cuda.empty_cache()

    # Compute the average scores (an empty dataloader yields an empty dict)
    epoch_model_scores = {k: v / num_batches for k, v in accum_model_scores.items()}

    return {'epoch_scores': epoch_model_scores,
            'results': results}

In [None]:
def save_model_state(model_to_save, filename, ckpt_params_dict=None):
    """Write a model's weights, or a checkpoint, into SAVE_MODEL_PATH.

    Without ckpt_params_dict only the state dictionary is saved. Otherwise a
    checkpoint dictionary is saved: the weights under 'model_state_dict'
    followed by the entries of ckpt_params_dict.
    """
    target = SAVE_MODEL_PATH / filename

    if ckpt_params_dict is None:
        # Plain weights only
        torch.save(model_to_save.state_dict(), target)
        return

    checkpoint = {'model_state_dict': model_to_save.state_dict(),
                  **ckpt_params_dict}
    torch.save(checkpoint, target)

In [None]:
@torch.inference_mode()
def predict(img, model, show_scores=False, device=torch.device('cpu')):
    """Detect house sparrows on one image, plot the result and count them.

    The image is converted to a tensor, the model is moved to `device` and
    switched to evaluation mode, and the predicted boxes (with their scores
    when show_scores is True) are drawn on the image.

    Return
    -----------
        the number of predicted boxes.
    """
    tensor_img = T.ToTensor()(img).to(device)
    model.to(device)
    model.eval()

    detection = model([tensor_img])[0]
    boxes = detection['boxes']
    num_bboxes = len(boxes)
    scores = detection['scores'] if show_scores else None

    print(f"{num_bboxes} house sparrow(s)")
    draw_bboxes_on_image(tensor_img, boxes, scores)
    return num_bboxes

In [None]:
def run_train(train_dataloader, val_dataloader, model, epochs, optimizer_name, optim_params, 
              lr_scheduler_name=None, lr_scheduler_params=None, device=torch.device('cpu'), 
              initial_f_beta_score=0, eval_iou_thresh=0.5, eval_beta=1, 
              model_name='best_model', save_best_ckpt=False, checkpoint=None, 
              log_metrics=False, register_best_log_model=False, 
              show_random_best_model_prediction=False):
    """Run a new training and evaluation cycle of a model for a fixed number of epochs
    or continue if checkpoint is passed, while saving the best model (or checkpoint).
    
    Parameters
    -----------
        train_dataloader (Dataloader) -- images, labels and boxes for a training step
        val_dataloader (Dataloader) -- images, labels and boxes for an evaluation step
        model (nn.Module) -- an object detection model
        epochs (int) -- number of training epochs
        optimizer_name (str) -- an optimizer name from torch.optim
        optim_params (dict) -- relevant parameters for the optimizer
        lr_scheduler_name (str) (optional) -- a learning rate scheduler name 
            from torch.optim.lr_scheduler (default None)
        lr_scheduler_params (dict) (optional) -- relevant parameters for 
            the learning rate scheduler (default None)
        device (torch.device) -- a type of device used: torch.device('cpu' or 'cuda') 
            (default torch.device('cpu'))
        initial_f_beta_score (float) -- an initial f beta score to find the best model (default 0.0)
        eval_iou_thresh (float) -- an iou threshold to determine correct predict boxes (default 0.5)
        eval_beta (int) -- a beta value for f beta score (default 1)
        model_name (str) -- a part of filename to save (default 'best_model')
        save_best_ckpt (bool) -- whether to save a checkpoint of the best model 
            instead of only its weights (default False)
        checkpoint (dict) (optional) -- a checkpoint to continue training (default None)
        log_metrics (bool) -- whether to log metrics into MLflow (default False)
        register_best_log_model (bool) -- whether to log and register the best model 
            into MLflow (default False)
        show_random_best_model_prediction (bool) -- whether to show a random prediction 
            of the best model (default False).

    Return
    -----------
        a dictionary of training and evaluation results of the last epoch
        (both values are None when epochs is 0).
    """ 
    print("Device: ", device)    
    start_epoch = 0
    best_epoch_f_beta_score = initial_f_beta_score
    lr_scheduler = None
    # Defined up front so that the function still returns cleanly when epochs == 0
    train_res = None
    eval_res = None
    # Compare the device type so that indexed devices such as 'cuda:0' also count
    on_cuda = torch.device(device).type == 'cuda'

    # Move the model before the optimizer is built and its state is restored:
    # a restored optimizer state is placed on the device the parameters are on
    # at load time, so loading first and moving afterwards would leave the
    # state and the parameters on different devices.
    model.to(device)

    model_params = [p for p in model.parameters() if p.requires_grad]
    # Construct an optimizer
    optimizer = getattr(torch.optim, optimizer_name)(model_params, **optim_params)

    if lr_scheduler_name is not None:
        if lr_scheduler_params is None:
            lr_scheduler_params = {}
        # Construct a learning rate scheduler
        lr_scheduler = getattr(torch.optim.lr_scheduler, lr_scheduler_name)(optimizer, 
                                                                            **lr_scheduler_params)
    
    if checkpoint is not None:
        # Get state parameters from the checkpoint
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_epoch_f_beta_score = checkpoint['f_beta_score']

    for epoch in range(1, epochs+1):
        current_epoch = start_epoch + epoch
        print(f"EPOCH [{current_epoch}/{start_epoch + epochs}]: ")

        # Training step
        print("TRAIN:")
        train_res = train_one_epoch(train_dataloader, model, optimizer, device)
        print("  epoch loss: {0}:\n    {1}".format(train_res['epoch_loss'], 
                                                   train_res['epoch_dict_losses']))

        if lr_scheduler is not None:
            lr_scheduler.step()        
        
        # Evaluation step
        print("EVAL:")
        eval_res = eval_one_epoch(val_dataloader, model, eval_iou_thresh, eval_beta, device)
        print("\n  epoch scores: {}".format(eval_res['epoch_scores'])) 
        
        # Save a model with the maximum f_beta score
        if best_epoch_f_beta_score < eval_res['epoch_scores']['f_beta']:
            best_epoch_f_beta_score = eval_res['epoch_scores']['f_beta']
            ckpt_dict = None
            filename = model_name + f'_best_f_beta_{eval_beta}_weights'
            
            if register_best_log_model:
                try:
                    # Log and register the best model into MLflow.
                    # Requirements are pinned with '==', the valid pip specifier
                    # (a single '=' is not accepted by pip).
                    mlflow.pytorch.log_model(model, filename, registered_model_name='best_' + MODEL_NAME, 
                                             await_registration_for=30, 
                                             pip_requirements=[f'torch=={torch.__version__}', 
                                                               f'torchvision=={torchvision.__version__}'])
                except NameError: 
                    print("Warning: The Model cannot be registered! -- MLflow module is not imported.")

            if save_best_ckpt:
                ckpt_dict = {'epoch': current_epoch,            
                             'optimizer_state_dict': optimizer.state_dict(),
                             'f_beta_score': best_epoch_f_beta_score}
                filename += '_ckpt'

            save_model_state(model, filename + '.pt', ckpt_dict)
            print(f"Model is saved. --- The best f_beta score: {best_epoch_f_beta_score}")

            with torch.no_grad():
                if show_random_best_model_prediction:
                    # The validation results are in dataloader order, so an index
                    # into the first batch also indexes the matching prediction
                    sample_imgs, _ = next(iter(val_dataloader))
                    sample_idx = random.randint(0, len(sample_imgs)-1)
                    preds = eval_res['results'][sample_idx]
                    draw_bboxes_on_image(sample_imgs[sample_idx], preds['boxes'], preds['scores'])
                    del sample_imgs
                    del preds
                                
        if log_metrics: 
            try:   
                # Log losses and scores into MLflow       
                mlflow.log_metric('train_epoch_loss', train_res['epoch_loss'], step=current_epoch)
                mlflow.log_metrics(train_res['epoch_dict_losses'], step=current_epoch)
                mlflow.log_metrics(eval_res['epoch_scores'], step=current_epoch)
                print("Metrics are logged.")
            except NameError: 
                print("Warning: Metrics cannot be logged! -- MLflow module is not imported.")

        # Free up memory
        gc.collect()
        if on_cuda:
            torch.cuda.empty_cache()

        print("-" * 60)

    print("DONE!")
    return {'train_res': train_res,
            'eval_res': eval_res}

## 3. Data Preparation

In [None]:
class ImageBBoxDataset(Dataset):
    """A Dataset from csv to detect objects in images.

    Every item is an (image, target) pair: the image as a tensor and a target
    dictionary with 'boxes' (float tensor of shape (N, 4)) and 'labels'
    (int64 tensor of N ones, i.e. a single object class).

    Parameters
    -----------
        csv_file_path -- CSV whose first column holds the image file names
        img_dir_path (Path) -- folder containing the image files
        bbox_path -- CSV with the columns 'image_name', 'bbox_x', 'bbox_y',
            'bbox_width' and 'bbox_height'
        img_transforms (optional) -- callable invoked as
            img_transforms(image=..., bboxes=..., labels=...) that returns a
            dict with 'image' and 'bboxes'
        bbox_transform (tuple) (optional) -- (bbox_transform_fn, *bbox_transform_args),
            applied to the box tensor as bbox_transform_fn(boxes, *args)
    """
    def __init__(self, csv_file_path, img_dir_path, bbox_path, 
                 img_transforms=None, bbox_transform=None):
        self.img_dir_path = img_dir_path
        self.img_df = pd.read_csv(csv_file_path)
        self.bbox_df = pd.read_csv(bbox_path)
        self.img_transforms = img_transforms
        self.bbox_transform = bbox_transform # (bbox_transform_fn, *bbox_transform_args) 

    def __len__(self):
        """Return the number of images listed in the CSV file."""
        return self.img_df.shape[0]

    def __getitem__(self, idx):
        """Load the image at position idx together with its boxes and labels.

        Raises FileNotFoundError when the image file cannot be read.
        """
        img_name = self.img_df.iloc[idx, 0]
        img_path = self.img_dir_path / img_name

        # cv2.imread returns None instead of raising for a missing/corrupt file
        bgr_image = cv2.imread(str(img_path))
        if bgr_image is None:
            raise FileNotFoundError(f"Cannot read image file: {img_path}")
        image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

        bboxes = self.bbox_df.loc[(self.bbox_df.image_name == img_name), 
                                  ['bbox_x', 'bbox_y', 'bbox_width', 'bbox_height']].values

        if self.img_transforms:
            # One label per box; a plain list is the documented label field type
            aug = self.img_transforms(image=image, bboxes=bboxes, 
                                      labels=[1] * len(bboxes))
            image = aug['image']
            bboxes = aug['bboxes']
                 
        image = T.ToTensor()(image)
        # reshape keeps the (N, 4) layout even when no box is left (N == 0)
        bboxes = torch.as_tensor(bboxes, dtype=torch.float).reshape(-1, 4)
        # Built after augmentation so that labels always match the remaining boxes
        labels = torch.ones((bboxes.shape[0],), dtype=torch.int64)

        if self.bbox_transform:
            bboxes = self.bbox_transform[0](bboxes, *self.bbox_transform[1:])
             
        target = {'boxes': bboxes,
                  'labels': labels}

        return image, target

In [None]:
def get_train_val_test_dataloaders(batch_size, transform_train_img=False, save_img_transform_file=None):
    """Build the training, validation and test dataloaders.

    All datasets convert their boxes from 'xywh' to 'xyxy'. The training and
    validation loaders are disjoint subsets of the training CSV obtained with
    a stratified group split (stratified on 'Number_HSparrows', grouped by
    'Author'); only the training subset is augmented (when transform_train_img
    is True) and shuffled.

    Parameters
    -----------
        batch_size (int) -- batch size of all three dataloaders
        transform_train_img (bool) -- whether to augment training images (default False)
        save_img_transform_file (str) (optional) -- file name for saving the
            augmentation pipeline (default None)

    Return
    -----------
        (train_dataloader, val_dataloader, test_dataloader).
    """
    common_dataset_kwargs = dict(
        img_dir_path=IMAGE_PATH,
        bbox_path=BBOX_DATA_PATH,
        bbox_transform=(box_convert, 'xywh', 'xyxy'),
    )

    # Boxes are stored in COCO layout before any conversion
    train_augmentation = (
        get_image_transforms('coco', save_img_transform_file)
        if transform_train_img else None
    )

    train_dataset = ImageBBoxDataset(TRAIN_FILE_PATH, img_transforms=train_augmentation,
                                     **common_dataset_kwargs)
    val_dataset = ImageBBoxDataset(TRAIN_FILE_PATH, **common_dataset_kwargs)
    test_dataset = ImageBBoxDataset(TEST_FILE_PATH, **common_dataset_kwargs)

    # Split data into training and validation sets
    image_table = train_dataset.img_df
    train_ids, val_ids = stratified_group_train_test_split(image_table['Name'],
                                                           image_table['Number_HSparrows'],
                                                           image_table['Author'])

    loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_batch)
    train_dataloader = DataLoader(Subset(train_dataset, train_ids), shuffle=True, **loader_kwargs)
    val_dataloader = DataLoader(Subset(val_dataset, val_ids), **loader_kwargs)
    test_dataloader = DataLoader(test_dataset, **loader_kwargs)

    return train_dataloader, val_dataloader, test_dataloader

In [None]:
# Build the dataloaders with training-image augmentation enabled; the
# augmentation pipeline is also saved under the given file name
train_dl, val_dl, test_dl = get_train_val_test_dataloaders(BATCH_SIZE, transform_train_img=True, 
                                                           save_img_transform_file='image_augmentation_params')

In [None]:
# Draw a sample image with bounding boxes on it:
# take the first training batch and pick one of its images at random
sample_imgs, sample_targets = next(iter(train_dl))
sample_idx = random.randint(0, len(sample_imgs)-1)
draw_bboxes_on_image(sample_imgs[sample_idx], bboxes=sample_targets[sample_idx]['boxes'])

## 4. Object Detection Model Preparation

### 4.1. Model loading and preparation

In [None]:
def faster_rcnn_mob_model_for_n_classes(num_classes, print_head=False, **load_model_params):
    """Create a COCO pre-trained Faster R-CNN (MobileNet Large) detector whose
    box predictor is replaced to classify N classes (true classes + background).

    Parameters
    -----------
        num_classes (int) -- number of output classes, background included
        print_head (bool) -- whether to print the box predictor before and
            after the replacement (default False)
        **load_model_params -- extra keyword arguments passed to
            fasterrcnn_mobilenet_v3_large_fpn

    Return
    -----------
        the modified model.
    """
    # Load a Faster R-CNN model pre-trained on COCO
    detector = fasterrcnn_mobilenet_v3_large_fpn(weights='COCO_V1', **load_model_params)
    roi_heads = detector.roi_heads

    if print_head:
        print("The Model's Head - Before: \n", roi_heads.box_predictor)

    # The new head must accept as many input features as the old one
    num_input_features = roi_heads.box_predictor.cls_score.in_features
    roi_heads.box_predictor = FastRCNNPredictor(num_input_features, num_classes=num_classes)

    if print_head:
        print("The Model's Head - After: \n", roi_heads.box_predictor)

    return detector

In [None]:
# Keyword arguments for building the Faster R-CNN model
model_params = {
    # used during training
    'trainable_backbone_layers': 1,
    # used during inference
    'rpn_score_thresh': 0.4,
    'box_score_thresh': 0.5,
    'box_nms_thresh': 0.4,
    'box_detections_per_img': 120,
    # used during training
    'box_positive_fraction': 0.4,
}

In [None]:
# Get a modified model: the COCO pre-trained detector with a new box predictor
# for NUM_CLASSES classes; the head is printed before and after the replacement
faster_rcnn_mob_model = faster_rcnn_mob_model_for_n_classes(NUM_CLASSES, print_head=True, **model_params)

### 4.2. Hyperparameter optimization

In [None]:
# Track the hyperparameter optimization in MLflow (file store under MLRUN_PATH)
mlflow.set_tracking_uri(MLRUN_PATH.as_uri())
hyp_opt_exp = mlflow.get_experiment_by_name('Hyperparameter_Optimization')
general_run_tags = {'model_name': MODEL_NAME, 'tools.training': 'PyTorch'}

# Reuse the experiment when it already exists, otherwise create it
hyp_opt_exp_id = (mlflow.create_experiment('Hyperparameter_Optimization')
                  if hyp_opt_exp is None
                  else hyp_opt_exp.experiment_id)

# Optuna callback that records every trial as an MLflow run
mlc = optuna.integration.mlflow.MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(),
    metric_name='f_beta',
    create_experiment=False,
    mlflow_kwargs=dict(
        experiment_id=hyp_opt_exp_id,
        run_name='HO_TPE_Median',
        tags={'sampler': 'TPESampler',
              'pruner': 'MedianPruner',
              'tools.hyper_opt': 'Optuna',
              **general_run_tags},
    ),
)

In [None]:
@mlc.track_in_mlflow()
def objective(trial):
    """The function to be optimized.

    Builds fresh dataloaders (without training augmentation) and a fresh model,
    samples the optimizer, its parameters and the learning rate scheduler from
    the trial, trains for 10 epochs and reports the validation f_beta score
    after every epoch so that the trial can be pruned.

    Parameters
    -----------
        trial (optuna.Trial) -- the trial that suggests the hyperparameters

    Return
    -----------
        the validation f_beta score of the last epoch.

    Raises
    -----------
        optuna.TrialPruned -- when the pruner decides to stop the trial.
    """
    # Get dataloaders
    train_dl, val_dl, _ = get_train_val_test_dataloaders(BATCH_SIZE)        
  
    # Load a model
    model_params = dict(
        trainable_backbone_layers=1, # during training
        rpn_score_thresh=0.4, # during inference
        box_score_thresh=0.5, # during inference
        box_nms_thresh=0.4, # during inference
        box_detections_per_img=120, # during inference
        box_positive_fraction=0.4 # during training
    )   
    frcnn_mob_model = faster_rcnn_mob_model_for_n_classes(NUM_CLASSES, **model_params)
    frcnn_mob_model.to(DEVICE)  
    
    # Construct a training optimizer and lr_scheduler
    lr_scheduler = None
    lr_scheduler_params = {} 

    optimizer_name = trial.suggest_categorical('optimizer', ['SGD', 'Adam'])
    optim_params = {
        'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 0.0, 0.001, step=0.0001),
    }    

    # Momentum is only sampled for SGD
    if optimizer_name == 'SGD':
        optim_params['momentum'] = trial.suggest_float('momentum', 0.0, 0.9, step=0.3)

    lr_scheduler_name = trial.suggest_categorical('lr_scheduler', [None, 'StepLR', 'LinearLR'])

    # Only StepLR gets sampled parameters; LinearLR is built with its defaults
    if lr_scheduler_name == 'StepLR':
        lr_scheduler_params['step_size'] = trial.suggest_int('step_size', 1, 3)
        lr_scheduler_params['gamma'] = trial.suggest_float('gamma', 0.1, 0.2, log=True)

    train_model_params = [p for p in frcnn_mob_model.parameters() if p.requires_grad]
    optimizer = getattr(torch.optim, optimizer_name)(train_model_params, **optim_params)

    if lr_scheduler_name is not None:
        lr_scheduler = getattr(torch.optim.lr_scheduler, lr_scheduler_name)(optimizer, **lr_scheduler_params)
    
    # Log parameters into MLflow
    mlflow.log_params({'seed': SEED,
                       'device': DEVICE,
                       'num_classes': NUM_CLASSES,
                       'batch_size': BATCH_SIZE})                     
    mlflow.log_params({'eval_iou_thresh': EVAL_IOU_THRESH,
                       'eval_beta': EVAL_BETA,
                       **model_params})
    mlflow.log_params({'optimizer': optimizer_name,
                       'lr_scheduler': lr_scheduler_name,
                       **lr_scheduler_params,
                       **optim_params})

    # Train the model
    for epoch in range(1, 11):
        train_res = train_one_epoch(train_dl, frcnn_mob_model, optimizer, DEVICE)

        if lr_scheduler is not None:
            lr_scheduler.step()        

        eval_res = eval_one_epoch(val_dl, frcnn_mob_model, EVAL_IOU_THRESH , EVAL_BETA, DEVICE)
        f_beta_score = eval_res['epoch_scores']['f_beta']
        # Report the intermediate value so that the pruner can judge the trial
        trial.report(f_beta_score, epoch)
        
        # Log metrics into MLflow
        mlflow.log_metric('train_epoch_loss', train_res['epoch_loss'], step=epoch)
        mlflow.log_metrics(train_res['epoch_dict_losses'], step=epoch)
        mlflow.log_metrics(eval_res['epoch_scores'], step=epoch)

        # Handle pruning
        if trial.should_prune():
            raise optuna.TrialPruned()

    return f_beta_score

In [None]:
# Set study parameters
# Callbacks run after each trial: MLflow tracking, plus a CUDA cache flush on GPU
study_callbacks=[mlc]

if str(DEVICE) == 'cuda':
    study_callbacks.append(lambda study, trial: torch.cuda.empty_cache())

# Persist the study in a SQLite database inside HYPER_OPT_PATH
study_storage = optuna.storages.RDBStorage(url='sqlite:///{}'.format(HYPER_OPT_PATH / 'hyper_opt_studies.db'))
study_name='faster_rcnn_mob_hyper_opt_study'

In [None]:
# Run an optimization session
# load_if_exists=True continues the stored study of the same name if there is one
study = optuna.create_study(direction='maximize', 
                            pruner=optuna.pruners.MedianPruner(n_warmup_steps=3),
                            storage=study_storage, study_name=study_name, 
                            load_if_exists=True)

# Limits for this session: n_trials=100 trials and timeout=2400 seconds
study.optimize(objective, n_trials=100, timeout=2400, callbacks=study_callbacks,
               gc_after_trial=True)

In [None]:
# Report the best trial: its metric value, hyperparameters and duration
print("The best trial:\n  f_beta: {}\n  params: ".format(study.best_value))
for k, v in study.best_params.items():
    print("    {}: {}".format(k, v))
print(f"  duration: {study.best_trial.duration}s")

In [None]:
# View the first 7 trials with their value, duration, parameters and state
trials_df2 = study.trials_dataframe(attrs=('number', 'value', 'duration', 'params', 'state'))
trials_df2.head(7)

In [None]:
# Save every study plot as a JPEG under HYPER_OPT_PATH / 'plots' / <study name>
plots = [getattr(optuna.visualization, plot_name)
         for plot_name in ('plot_optimization_history',
                           'plot_intermediate_values',
                           'plot_parallel_coordinate',
                           'plot_contour',
                           'plot_slice',
                           'plot_param_importances',
                           'plot_edf')]

for plot in plots:
    fig = plot(study)
    # File name is the plot function name without its 'plot_' prefix
    fname = plot.__name__[len('plot_'):]
    save_path = HYPER_OPT_PATH / 'plots' / study_name
    save_path.mkdir(parents=True, exist_ok=True)
    fig.write_image(save_path / (fname + '.jpeg'))

### 4.3. Fine-tuning the model

In [None]:
# Reload the stored study and turn its best hyperparameters into run_train arguments
loaded_study = optuna.load_study(study_name=study_name, storage=study_storage)
best_params = loaded_study.best_params
print("The best training parameters: \n", best_params)

optimized_train_params = dict(optimizer_name=best_params['optimizer'],
                              lr_scheduler_name=best_params['lr_scheduler'])

# Sort the tuned values into optimizer and scheduler parameters, keeping
# only those that were actually sampled in the best trial
for opt, params in (('optim_params', ['lr', 'momentum', 'weight_decay']),
                    ('lr_scheduler_params', ['step_size', 'gamma'])):
    opt_dict = {}
    for p in params:
        if p in best_params:
            opt_dict[p] = best_params[p]
    optimized_train_params[opt] = opt_dict

# Settings that are fixed rather than tuned
add_train_params = dict(epochs=EPOCHS,
                        eval_iou_thresh=EVAL_IOU_THRESH,
                        eval_beta=EVAL_BETA,
                        device=DEVICE)

In [None]:
# Train the model (fine-tuning) with optimized parameters and log metrics 
# and the parameters into MLflow
ftm_exp = mlflow.get_experiment_by_name('Fine-Tuning_Model')
# Registered models are kept in a SQLite database inside MLRUN_PATH
mlflow.set_registry_uri('sqlite:///{}'.format(MLRUN_PATH / 'model_registry.db'))

# Reuse the experiment when it already exists, otherwise create it
if ftm_exp is not None:
    ftm_exp_id = ftm_exp.experiment_id
else:  
    ftm_exp_id = mlflow.create_experiment('Fine-Tuning_Model')

with mlflow.start_run(run_name='fine-tuning_with_optimized_parameters', 
    experiment_id=ftm_exp_id) as mlft_run:
    mlflow.set_tags({'training_process': 'fine_tuning',
                    **general_run_tags})

    # Run model training cycles
    # (a model is saved/registered only when its f_beta score exceeds 0.5)
    faster_rcnn_mob_res = run_train(train_dl, val_dl, faster_rcnn_mob_model, 
                                    initial_f_beta_score=0.5, log_metrics=True, 
                                    save_best_ckpt=True, model_name=MODEL_NAME,
                                    show_random_best_model_prediction=True,
                                    register_best_log_model=True,
                                    **optimized_train_params, **add_train_params)

    # Log the parameters into MLflow
    mlflow.log_params(model_params)
    mlflow.log_params({'seed': SEED,
                       'batch_size': BATCH_SIZE,
                       'num_classes': NUM_CLASSES})
    mlflow.log_params(add_train_params)
    mlflow.log_params(best_params)

### 4.4. Loading the best model and evaluating it on test data


In [None]:
# Load the best model from the MLflow registry
client = mlflow.MlflowClient()
reg_model_name = 'best_' + MODEL_NAME
model_registry_info = client.get_latest_versions(reg_model_name)

if not model_registry_info:
    raise ValueError(f"No registered versions found for the model '{reg_model_name}'")

# Version numbers may be returned as strings depending on the registry backend;
# compare them numerically so that e.g. '10' ranks above '9'
model_latest_version = max((m.version for m in model_registry_info), key=int)

model_uri = 'models:/{}/{}'.format(reg_model_name, model_latest_version)
best_faster_rcnn_mob_model = mlflow.pytorch.load_model(model_uri)

In [None]:
# Evaluate the best model on test data with the same IoU threshold and beta
# as used during training, and print the batch-averaged scores
test_res = eval_one_epoch(test_dl, best_faster_rcnn_mob_model, EVAL_IOU_THRESH, EVAL_BETA, DEVICE)
print(test_res['epoch_scores'])

In [None]:
# Pick a random test image, then show it with the predicted boxes and scores
test_imgs_df = pd.read_csv(TEST_FILE_PATH, usecols=['Name'])
test_sample_idx = random.randint(0, test_imgs_df.size-1)
test_sample_path = IMAGE_PATH / test_imgs_df.iloc[test_sample_idx].Name
# OpenCV loads images as BGR; the model expects RGB
test_sample_img = cv2.cvtColor(cv2.imread(str(test_sample_path)), cv2.COLOR_BGR2RGB)
_ = predict(test_sample_img, best_faster_rcnn_mob_model, show_scores=True)

In [None]:
# # Uncomment to update model version stages
# for m in model_registry_info:    
#     if m.version == model_latest_version:
#         if m.current_stage == 'Production':
#             continue
#         else:
#             m = client.transition_model_version_stage(
#                     name=reg_model_name,
#                     version=m.version,
#                     stage='Production')
#     else:
#         if m.current_stage == 'Production':
#             m = client.transition_model_version_stage(
#                     name=reg_model_name,
#                     version=m.version,
#                     stage='Archived')
            
# # View updated model version stages
# for m in client.get_latest_versions(reg_model_name):
#     print(f'{m.name}: version: {m.version}, current stage: {m.current_stage}')