# PyTorch starter - Faster R-CNN Train
In this notebook I enabled the GPU and Internet access (needed to download the pre-trained weights). We cannot use the Internet during inference, so I'll create another notebook for committing. Stay tuned!

You can find the [inference notebook here](https://www.kaggle.com/pestipeti/pytorch-starter-fasterrcnn-inference)

- FasterRCNN from torchvision
- Use Resnet50 backbone
- Albumentation enabled (simple flip for now)

In [None]:
import pandas as pd
import numpy as np
import cv2
import os
import re

from PIL import Image

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torch
import torchvision

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SequentialSampler

from matplotlib import pyplot as plt

# Kaggle input locations: the competition data root and the train / test
# sub-folders derived from it. DIR_TRAIN is the image directory handed to
# WheatDataset further down.
DIR_INPUT = '/kaggle/input/global-wheat-detection'
DIR_TRAIN = f'{DIR_INPUT}/train'
DIR_TEST = f'{DIR_INPUT}/test'

print("Done with initial imports")

In [None]:
import numba
from numba import jit

@jit(nopython=True)
def calculate_iou(gt, pr, form='pascal_voc') -> float:
    """Compute the Intersection over Union (IoU) of two boxes.

    Every extent is measured as ``max - min + 1``, i.e. the coordinates are
    treated as inclusive pixel indices.

    Args:
        gt: (np.ndarray[Union[int, float]]) coordinates of the ground-truth box
        pr: (np.ndarray[Union[int, float]]) coordinates of the predicted box
        form: (str) gt/pred coordinates format
            - pascal_voc: [xmin, ymin, xmax, ymax]
            - coco: [xmin, ymin, w, h]
    Returns:
        (float) Intersection over union (0.0 <= iou <= 1.0)
    """
    if form == 'coco':
        # Convert [xmin, ymin, w, h] to corner form on copies so the
        # caller's arrays are left untouched.
        gt = gt.copy()
        pr = pr.copy()

        gt[2] += gt[0]
        gt[3] += gt[1]
        pr[2] += pr[0]
        pr[3] += pr[1]

    # Horizontal extent of the intersection; a negative extent means the
    # boxes are separated along x.
    inter_w = min(gt[2], pr[2]) - max(gt[0], pr[0]) + 1
    if inter_w < 0:
        return 0.0

    # Vertical extent of the intersection, same convention.
    inter_h = min(gt[3], pr[3]) - max(gt[1], pr[1]) + 1
    if inter_h < 0:
        return 0.0

    intersection = inter_w * inter_h

    gt_area = (gt[2] - gt[0] + 1) * (gt[3] - gt[1] + 1)
    pr_area = (pr[2] - pr[0] + 1) * (pr[3] - pr[1] + 1)

    # union = area(gt) + area(pr) - intersection
    return intersection / (gt_area + pr_area - intersection)

@jit(nopython=True)
def find_best_match(gts, pred, pred_idx, threshold = 0.5, form = 'pascal_voc', ious=None) -> int:
    """Returns the index of the 'best match' between the
    ground-truth boxes and the prediction. The 'best match'
    is the not-yet-matched ground-truth box with the highest IoU;
    candidates whose IoU is below `threshold` are ignored.

    Args:
        gts: (List[List[Union[int, float]]]) Coordinates of the available ground-truth boxes.
             A box whose first coordinate is negative is treated as already matched.
        pred: (List[Union[int, float]]) Coordinates of the predicted box
        pred_idx: (int) Index of the current predicted box (column into `ious`)
        threshold: (float) Minimum IoU for a ground-truth box to count as a match
        form: (str) Format of the coordinates
        ious: (np.ndarray) len(gts) x len(preds) matrix for storing calculated ious.
              Negative entries mean "not computed yet"; computed values are
              written back so later calls can reuse them.

    Return:
        (int) Index of the best match GT box (-1 if no match above threshold)
    """
    best_match_iou = -np.inf
    best_match_idx = -1

    for gt_idx in range(len(gts)):
        
        if gts[gt_idx][0] < 0:
            # Already matched GT-box
            continue
        
        # -1 acts as the "no cached value" sentinel when no cache is supplied.
        iou = -1 if ious is None else ious[gt_idx][pred_idx]

        if iou < 0:
            # Cache miss: compute the IoU now ...
            iou = calculate_iou(gts[gt_idx], pred, form=form)
            
            # ... and remember it when a cache matrix was supplied.
            if ious is not None:
                ious[gt_idx][pred_idx] = iou

        if iou < threshold:
            continue

        # Strict comparison: on equal IoUs the earlier ground-truth box wins.
        if iou > best_match_iou:
            best_match_iou = iou
            best_match_idx = gt_idx

    return best_match_idx

@jit(nopython=True)
def calculate_precision(gts, preds, threshold = 0.5, form = 'coco', ious=None) -> float:
    """Calculates precision for GT - prediction pairs at one threshold.

    The value returned is TP / (TP + FP + FN). Predictions are processed in
    the given order and each one is matched to at most one ground-truth box.

    NOTE: `gts` is modified in place - every matched row is overwritten with
    -1 - so pass a copy if the ground-truth boxes are needed afterwards.

    Args:
        gts: (List[List[Union[int, float]]]) Coordinates of the available ground-truth boxes
        preds: (List[List[Union[int, float]]]) Coordinates of the predicted boxes,
               sorted by confidence value (descending)
        threshold: (float) Threshold
        form: (str) Format of the coordinates
        ious: (np.ndarray) len(gts) x len(preds) matrix for storing calculated ious.

    Return:
        (float) Precision
    """
    n = len(preds)
    tp = 0
    fp = 0
    
    # for pred_idx, pred in enumerate(preds_sorted):
    for pred_idx in range(n):

        best_match_gt_idx = find_best_match(gts, preds[pred_idx], pred_idx,
                                            threshold=threshold, form=form, ious=ious)

        if best_match_gt_idx >= 0:
            # True positive: The predicted box matches a gt box with an IoU above the threshold.
            tp += 1
            # Remove the matched GT box
            gts[best_match_gt_idx] = -1

        else:
            # No match
            # False positive: indicates a predicted box had no associated gt box.
            fp += 1

    # False negative: indicates a gt box had no associated predicted box.
    # Matched rows were set to -1 above, so only rows whose coordinate sum is
    # still positive are counted here.
    fn = (gts.sum(axis=1) > 0).sum()

    # NOTE(review): with no predictions and no unmatched ground-truth boxes the
    # denominator is 0 - confirm callers never pass such an image.
    return tp / (tp + fp + fn)


@jit(nopython=True)
def calculate_image_precision(gts, preds, thresholds = (0.5, ), form = 'coco') -> float:
    """Average the single-threshold precision over several IoU thresholds.

    Args:
        gts: (List[List[Union[int, float]]]) Coordinates of the available ground-truth boxes
        preds: (List[List[Union[int, float]]]) Coordinates of the predicted boxes,
               sorted by confidence value (descending)
        thresholds: (float) Different thresholds
        form: (str) Format of the coordinates

    Return:
        (float) Precision
    """
    # Pairwise IoU cache shared by every threshold; -1 marks "not computed yet".
    iou_cache = np.full((len(gts), len(preds)), -1.0)
    threshold_count = len(thresholds)

    mean_precision = 0.0
    for iou_threshold in thresholds:
        # calculate_precision overwrites matched rows, so give it a fresh copy.
        mean_precision += calculate_precision(gts.copy(), preds, threshold=iou_threshold,
                                              form=form, ious=iou_cache) / threshold_count

    return mean_precision

print("Done with mAP definition")

In [None]:
# Load the training annotations; the cells below read its 'image_id' and
# 'bbox' columns.
train_df = pd.read_csv(f'{DIR_INPUT}/train.csv')
print(train_df.shape)
print(train_df)

In [None]:
# Create the four coordinate columns with a -1 placeholder; they are filled
# from the 'bbox' strings further down in this cell.
for column in ('x', 'y', 'w', 'h'):
    train_df[column] = -1

print(train_df)

def expand_bbox(x):
    """Extract the numbers contained in a bounding-box string.

    Args:
        x: (str) text containing the box coordinates.

    Returns:
        np.ndarray of the matched number *strings* in order of appearance,
        or the list ``[-1, -1, -1, -1]`` when the text holds no number.
    """
    numbers = re.findall("([0-9]+[.]?[0-9]*)", x)
    if not numbers:
        return [-1, -1, -1, -1]
    return np.array(numbers)

# Split every 'bbox' string into its four numbers and store them in the
# x / y / w / h columns, then drop the now redundant text column.
train_df[['x', 'y', 'w', 'h']] = np.stack(train_df['bbox'].apply(lambda x: expand_bbox(x)))
train_df.drop(columns=['bbox'], inplace=True)

# expand_bbox yields strings, so convert each column to floating point.
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias of `float`).
for column in ('x', 'y', 'w', 'h'):
    train_df[column] = train_df[column].astype(float)

print(train_df)

In [None]:
# Hold out the last 665 unique image ids for validation and train on the rest.
image_ids = train_df['image_id'].unique()
train_ids, valid_ids = image_ids[:-665], image_ids[-665:]

print(valid_ids[:20])
print(train_ids)

In [None]:
# Split the annotation rows by image id. The validation frame must be taken
# first because `train_df` is rebound to the training subset right after.
valid_df = train_df.loc[train_df['image_id'].isin(valid_ids)]
train_df = train_df.loc[train_df['image_id'].isin(train_ids)]
print(valid_df.shape, train_df.shape)

In [None]:
# Added extra stuff here
class WheatDataset(Dataset):
    """Detection dataset yielding ``(image, target, image_id)`` per image.

    Args:
        dataframe: table with one row per box and the columns ``image_id``,
            ``x``, ``y``, ``w`` and ``h`` (top-left corner, width, height).
        image_dir: directory containing the ``<image_id>.jpg`` files.
        transforms: optional callable invoked as
            ``transforms(image=..., bboxes=..., labels=...)`` and returning a
            mapping with at least the keys ``image`` and ``bboxes``.
    """

    def __init__(self, dataframe, image_dir, transforms=None):
        super().__init__()

        self.image_ids = dataframe['image_id'].unique()
        self.df = dataframe
        self.image_dir = image_dir
        self.transforms = transforms

    def __getitem__(self, index: int):
        """Load one image together with its detection target.

        Returns:
            image: RGB image scaled to [0, 1]; whatever ``transforms``
                returns under ``'image'`` when transforms are set, otherwise
                a float32 NumPy array in height x width x channel layout.
            target: dict with ``boxes`` (float32 tensor, N x 4, corner
                format), ``labels``, ``image_id``, ``area`` and ``iscrowd``.
            image_id: the id string of the image.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        image_id = self.image_ids[index]
        records = self.df[self.df['image_id'] == image_id]

        image_path = f'{self.image_dir}/{image_id}.jpg'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports failure by returning None; fail here with a
            # clear message instead of a cryptic cvtColor error.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        # Convert [x, y, w, h] to [xmin, ymin, xmax, ymax]. An explicit copy
        # guarantees the in-place arithmetic never touches the DataFrame.
        boxes = records[['x', 'y', 'w', 'h']].to_numpy(copy=True)
        boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
        boxes[:, 3] = boxes[:, 1] + boxes[:, 3]

        # Box areas (computed before augmentation).
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        area = torch.as_tensor(area, dtype=torch.float32)

        # there is only one class
        labels = torch.ones((records.shape[0],), dtype=torch.int64)

        # suppose all instances are not crowd
        iscrowd = torch.zeros((records.shape[0],), dtype=torch.int64)

        if self.transforms:
            sample = self.transforms(image=image, bboxes=boxes, labels=labels)
            image = sample['image']
            boxes = sample['bboxes']

        # Always hand out a float32 tensor, with or without transforms: the
        # training loop calls `.to(device)` on every target value and
        # torchvision's detection models expect FloatTensor[N, 4] boxes.
        box_array = np.asarray(boxes, dtype=np.float32)
        if box_array.size == 0:
            # Keep a well-formed (0, 4) shape when no box is left.
            box_array = box_array.reshape(0, 4)

        target = {}
        target['boxes'] = torch.as_tensor(box_array)
        target['labels'] = labels
        target['image_id'] = torch.tensor([index])
        target['area'] = area
        target['iscrowd'] = iscrowd

        # NOTE: a Mask R-CNN variant would additionally need target['masks']
        # as a binary UInt8Tensor[N, H, W] (N = number of boxes), built from
        # the boxes AFTER augmentation.

        return image, target, image_id

    def __len__(self) -> int:
        return self.image_ids.shape[0]

print('Done with dataset loader...')

In [None]:
# Albumentations
def get_train_transform():
    """Build the training augmentation pipeline.

    Returns:
        A.Compose applying a random flip with probability 0.5 followed by the
        conversion to a tensor, with boxes in pascal_voc format and their
        labels taken from the ``labels`` field.
    """
    return A.Compose([
        # The probability must be passed by keyword: in albumentations
        # releases where the first positional parameter is `always_apply`,
        # `A.Flip(0.5)` silently flips every single image.
        A.Flip(p=0.5),
        ToTensorV2(p=1.0)
    ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})

def get_valid_transform():
    """Build the validation pipeline: tensor conversion only, no augmentation.

    Boxes use the pascal_voc format and their labels come from ``labels``.
    """
    steps = [ToTensorV2(p=1.0)]
    box_config = {'format': 'pascal_voc', 'label_fields': ['labels']}
    return A.Compose(steps, bbox_params=box_config)

print('Done with augmentation definitions...')

# Load the Faster R-CNN model from torchvision with pretrained weights

In [None]:
# Load torchvision's Faster R-CNN (ResNet-50 + FPN backbone) with pre-trained
# weights (COCO, per the original note); the weights are downloaded, which is
# why this notebook needs Internet access.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` - confirm against the installed version.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

In [None]:
# Two output classes: wheat plus background.
num_classes = 2

# Input width of the classifier layer in the existing box head.
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Swap the pre-trained box head for a freshly initialised two-class one.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

print(in_features)
print('Done with initializing model...')

In [None]:
# Show the full module tree, including the replaced box predictor.
print(model)

In [None]:
class Averager:
    """Running arithmetic mean of the values passed to ``send``."""

    def __init__(self):
        # Sum of all values received so far and how many there were.
        self.current_total = 0.0
        self.iterations = 0.0

    def send(self, value):
        """Add one observation to the running mean."""
        self.iterations += 1
        self.current_total += value

    @property
    def value(self):
        """Mean of the observations so far; 0 when nothing was sent yet."""
        if self.iterations != 0:
            return 1.0 * self.current_total / self.iterations
        return 0

    def reset(self):
        """Forget every observation received so far."""
        self.iterations = 0.0
        self.current_total = 0.0

print('Done with useful functions...')

In [None]:
def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    A list of ``(image, target, image_id)`` samples becomes
    ``(images, targets, image_ids)``, each a tuple with one entry per sample.
    Nothing is stacked, so the entries may differ in size.
    """
    fields = zip(*batch)
    return tuple(fields)

# Both datasets read their images from the training folder; only the
# annotation frame and the transform pipeline differ.
train_dataset = WheatDataset(train_df, DIR_TRAIN, get_train_transform())
valid_dataset = WheatDataset(valid_df, DIR_TRAIN, get_valid_transform())

# Random permutation of the training indices.
# NOTE(review): `indices` is not used by any code visible in this notebook.
indices = torch.randperm(len(train_dataset)).tolist()

# Training batches (batch size changed from 16 to 12).
train_data_loader = DataLoader(train_dataset, batch_size=12, shuffle=False,
                               num_workers=0, collate_fn=collate_fn)

# Validation batches.
valid_data_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False,
                               num_workers=4, collate_fn=collate_fn)
print('Done with data split...')

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('GPU?: ' + str(torch.cuda.is_available()))

# Train

In [None]:
# Load the precomputed spectra and build one binary frequency mask per
# colour channel and threshold.
# NOTE(review): judging by the file name these are mean FFTs over 200 images
# (wheat regions vs. whole images) - confirm against the producing notebook.
all_ffts=np.load('../input/wheat-mean-ffts-200-images/wheat_detection_mean_ffts_200_images.npz')
wheat_freq=all_ffts['wheat_freq']
im_freq=all_ffts['im_freq']
pad2=1024

thr_list=[0.2, 0.4, 0.6, 0.8]
n_thr=len(thr_list)
# Width of the border (in the centred spectrum) that is always masked out.
exclude=120
# mask[:, :, channel, threshold_index]
mask=np.zeros((pad2,pad2,3,n_thr))
for i in range(3):
    plot_wheat=np.log(wheat_freq[:,:,i])
    plot_im=np.log(im_freq[:,:,i])

#     print(np.min(plot_wheat))
#     print(np.min(plot_im))

    # if printed values all positive: blank the first row and column
    if np.min(plot_wheat)>0 and np.min(plot_im)>0:
        plot_wheat[0,:]=0
        plot_wheat[:,0]=0
        plot_im[0,:]=0
        plot_im[:,0]=0

    # Normalise each log-spectrum by its absolute sum so both are comparable.
    plot_wheat=plot_wheat/np.sum(np.abs(plot_wheat))
    plot_im=plot_im/np.sum(np.abs(plot_im))

    # Difference of the two spectra, moved to the centred layout.
    fft_diff=plot_wheat-plot_im
    fft_diff=np.fft.fftshift(fft_diff)

    for kt in range(n_thr):
        f_thr=thr_list[kt]
        mask[:,:,i,kt]=fft_diff>f_thr*1e-7
        # Zero the border of the *current* channel only. The previous code
        # zeroed every channel at this threshold index; channels finished in
        # earlier iterations are already shifted back to the un-centred
        # layout, so that wiped their low frequencies instead of the border.
        mask[:exclude,:,i,kt],mask[-exclude:,:,i,kt]=0,0
        mask[:,:exclude,i,kt],mask[:,-exclude:,i,kt]=0,0

        plt.figure()
        plt.imshow(mask[:,:,i,kt],vmin=-0.2e-7,vmax=2e-7)
        plt.colorbar()
        plt.pause(0.1)

        # Back to the un-centred layout used by np.fft.fft2 (for the even
        # size pad2 used here fftshift is its own inverse).
        mask[:,:,i,kt]=np.fft.fftshift(mask[:,:,i,kt])

In [None]:
model = model.to(device)

# Optimise only the parameters that require gradients.
params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
# No learning-rate schedule for now; a StepLR alternative would be:
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
lr_scheduler = None

num_epochs = 6

# Per-epoch history of every tracked quantity (mask loss not included).
best_map = 0
metrics = {name: [] for name in ('mean_ap', 'loss_cls', 'loss_reg',
                                 'loss_obj', 'loss_rpn', 'loss_tot')}

# The jitted metric functions receive the thresholds as a numba typed list.
iou_thresholds = numba.typed.List()
for iou_threshold in (0.5, 0.55, 0.6, 0.65, 0.7, 0.75):
    iou_thresholds.append(iou_threshold)

print('Done with optim init...')

In [None]:
def mask_input(images,thr_list,pad2,fft_mask=None):
    """Filter every image in the frequency domain with the FFT masks.

    For each colour channel the image spectrum is multiplied by the first
    ``len(thr_list)`` masks of that channel, transformed back, and the real
    parts are added up. The result is then rescaled, jointly over the three
    channels, so that its minimum is 0 and its maximum is 1.

    Args:
        images: iterable of images indexed as ``image[channel, :, :]`` with
            three channels of ``pad2 x pad2`` pixels (CPU tensors or arrays).
        thr_list: thresholds the masks were built with; only its length is
            used, to select how many masks per channel are applied.
        pad2: side length of the square images and masks.
        fft_mask: array indexed as ``[row, col, channel, threshold]``.
            Defaults to the module-level ``mask``.

    Returns:
        list of float32 tensors of shape ``(3, pad2, pad2)``, one per image.
    """
    if fft_mask is None:
        fft_mask = mask

    n_thr = len(thr_list)
    # The filter is linear, so summing the per-threshold results equals one
    # inverse transform with the summed masks; this needs a single FFT pair
    # per channel instead of one per threshold.
    combined_mask = fft_mask[:, :, :, :n_thr].sum(axis=3)

    new_images = []
    for image in images:
        # Fresh buffer for every image: a buffer shared across the batch made
        # each image accumulate on top of the previous (normalised) one.
        im_masked = np.zeros((3, pad2, pad2))
        for j in range(3):
            spectrum = np.fft.fft2(image[j, :, :])
            im_masked[j, :, :] = np.real(np.fft.ifft2(spectrum * combined_mask[:, :, j]))

        # normalize to 0-1
        im_masked -= np.min(im_masked)
        peak = np.max(im_masked)
        if peak > 0:
            # A constant result stays all-zero instead of turning into NaN.
            im_masked /= peak

        new_images.append(torch.from_numpy(im_masked).float())
    return new_images

In [None]:
#print(images[0].dtype,new_images[0].dtype)

In [None]:
# Train
# Running means of the total loss and of each loss component.
loss_hist=Averager()
loss_cls=Averager()
loss_reg=Averager()
loss_obj=Averager()
loss_rpn=Averager()

itr = 1

for epoch in range(num_epochs):
    loss_hist.reset()
    loss_cls.reset()
    loss_reg.reset()
    loss_obj.reset()
    loss_rpn.reset()
    image_precisions=[]

    for images, targets, image_ids in train_data_loader:
        model.train()
        # images and targets are both tuples of length batch_size each

        # mask the input images
        images=mask_input(images,thr_list,pad2)

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        # images and targets are now both lists of length batch_size each, but sent to device

        print(images[0].dtype)

        # In training mode the model returns a dict of loss tensors.
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        loss_value = losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # Store metrics. The components are looked up by name rather than by
        # position so the bookkeeping does not depend on the dict order.
        loss_hist.send(loss_value)
        loss_cls.send(loss_dict['loss_classifier'].item())
        loss_reg.send(loss_dict['loss_box_reg'].item())
        loss_obj.send(loss_dict['loss_objectness'].item())
        loss_rpn.send(loss_dict['loss_rpn_box_reg'].item())

        # Evaluate again (on the same batch) to get the APs
        model.eval()
        with torch.no_grad():
            pred = model(images)
        for idx, image in enumerate(images):
            # Remove detections below threshold
            pred[idx]['boxes'] = pred[idx]['boxes'][pred[idx]['scores'] >= 0.5]
            pred[idx]['scores'] = pred[idx]['scores'][pred[idx]['scores'] >= 0.5]

            preds = pred[idx]['boxes'].cpu().detach().numpy()
            scores = pred[idx]['scores'].cpu().detach().numpy()

            gt_boxes = targets[idx]['boxes'].cpu().numpy()

            # Highest confidence first, as the metric expects.
            preds_sorted_idx = np.argsort(scores)[::-1]
            preds_sorted = preds[preds_sorted_idx]

            # Signature is (gts, preds, ...): ground truth first. The two
            # arguments used to be swapped, which made the greedy matching
            # iterate over ground-truth boxes instead of over predictions in
            # confidence order.
            AP = calculate_image_precision(gt_boxes,
                                           preds_sorted,
                                           thresholds=iou_thresholds,
                                           form='pascal_voc')
            image_precisions.append(AP)
        del pred, image, preds, scores, gt_boxes, preds_sorted_idx, preds_sorted, AP ## for CUDA memory

        if itr % 40 == 0:
            print(f"Iteration #{itr} loss: {loss_value}")

        itr += 1

    # One history entry per epoch.
    metrics['mean_ap'].append(np.mean(image_precisions)) ##
    metrics['loss_cls'].append(loss_cls.value) ##
    metrics['loss_reg'].append(loss_reg.value) ##
    metrics['loss_obj'].append(loss_obj.value) ##
    metrics['loss_rpn'].append(loss_rpn.value) ##
    metrics['loss_tot'].append(loss_hist.value) ##

#         # Save best model to file
#         if mean_ap > best_map:
#             print('mAP improved from {:.4f} to {:.4f}.'.format(best_map, mean_ap))
#             best_map = mean_ap
#             torch.save({
#                 'epoch': epoch,
#                 'model_state_dict': model.state_dict(),
#                 }, 'output/best_weights.pth.tar')
    # update the learning rate
    if lr_scheduler is not None:
        lr_scheduler.step()

    print(f"Epoch #{epoch} loss: {loss_hist.value}")
    print(f"Epoch #{epoch} metrics: {metrics}")

In [None]:
# Per-image precisions collected during the last training epoch.
print(image_precisions)

In [None]:
# Per-epoch metric history and the index of the last epoch that ran.
print(metrics)
print(epoch)

In [None]:
# Get val images
images, targets, image_ids = next(iter(valid_data_loader))

# The network was trained on FFT-masked inputs (see the training loop), so the
# validation images must go through the same preprocessing before inference.
# The masked copies get their own name; `images` keeps the raw pictures for
# the visualisation cells below.
model_inputs = [img.to(device) for img in mask_input(images, thr_list, pad2)]

# Send input im and target to device
images = list(img.to(device) for img in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

# Get prediction
model.eval()
cpu_device = torch.device("cpu")

# Inference only: skip autograd bookkeeping to save memory.
with torch.no_grad():
    outputs = model(model_inputs)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]

In [None]:
# Index (within the validation batch) of the image to inspect.
im_no = 3

# Ground truth for this image: integer boxes and the picture itself,
# rearranged from channel-first to channel-last for plotting.
gt_entry = targets[im_no]
boxes = gt_entry['boxes'].cpu().numpy().astype(np.int32)
# masks = targets[im_no]['masks'].cpu().numpy().astype(np.int32)
sample = images[im_no].permute(1,2,0).cpu().numpy()

# Predictions for the same image.
pred_entry = outputs[im_no]
boxes_out = pred_entry['boxes'].detach().numpy().astype(np.int32)
scores_out = pred_entry['scores'].detach().numpy()
# masks_out = outputs[im_no]['masks'].detach().numpy()
print(scores_out)
# print(masks.sum())
# print(masks_out.sum())

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 8))

# Draw the ground-truth boxes first, then the predicted ones, each set in
# its own colour, directly onto the sample image.
for box_set, colour in ((boxes, (220, 0, 0)), (boxes_out, (0, 50, 250))):
    for box in box_set:
        cv2.rectangle(sample,
                      (box[0], box[1]),
                      (box[2], box[3]),
                      colour, 3)

#ax.set_axis_off()
ax.imshow(sample)

# Number of ground-truth boxes and number of predicted boxes.
i, j = len(boxes), len(boxes_out)
print(i,j)

In [None]:
# Persist the last epoch index, the model weights and the metric history.
torch.save(
    {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'metrics': metrics,
    },
    'fft_faster_rcnn_weights_w_metric_6_epochs.pth.tar',
)