In [None]:
# Install the third-party packages this notebook imports below.
# NOTE(review): no versions are pinned, so a fresh run may pick up newer releases.
!pip install icevision[all]
!pip install torch_optimizer
# -U upgrades wandb if a copy is already installed.
!pip install wandb -U

This notebook uses the [IceVision](https://github.com/airctic/icevision) object detection library.

For the final submission, an efficientdet_d5 model was trained at 512x512 image size and was also pretrained on the separately provided image dataset.

[Inference notebook](https://www.kaggle.com/nikitautin/35th-place-efficientdet-inference)

In [None]:
import os
import functools
import numpy as np
import pandas as pd
import torch_optimizer as optim
import torchvision.transforms as T
import plotly.express as px
import plotly.graph_objects as go
from icevision.all import *
from tqdm.contrib.concurrent import process_map
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, Callback
from kaggle_secrets import UserSecretsClient
from torch.distributions.beta import Beta
from operator import itemgetter
from PIL import Image
from icevision.metrics import Metric
from scipy.optimize import linear_sum_assignment

In [None]:
# Fetch the Weights & Biases API key stored under the secret name "wandb-key",
# so the key itself never appears in the notebook source.
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("wandb-key")

# IPython substitutes $wandb_key with the Python variable before running the shell command.
!wandb login $wandb_key

In [None]:
# Fix the random seed before the shuffled train/validation split and training below.
# NOTE(review): `pl` is not imported explicitly; presumably it is pytorch_lightning
# re-exported by `from icevision.all import *` — confirm.
pl.seed_everything(42)

In [None]:
# Number of frames before and after a labelled impact that are also marked as impact.
FRAME_RANGE = 4
# Fraction of videos held out for validation.
VALID_PERCENT = 0.2
# Target size passed to the resize/crop transforms and to the model (img_size).
SIZE = (256, 256)
# Passed as num_classes when the EfficientDet model is created.
CLASSES_NUM = 2
# Label id of an impact helmet: the 0/1 impact flag is shifted by +1 during label preparation.
IMPACT_CLASS = 2

In [None]:
# Competition input directory and the sub-directory holding the training videos.
path = Path('/kaggle/input/nfl-impact-detection')
train_video_path = path/'train'

In [None]:
# Load the per-frame helmet labels; missing values become 0, so rows without an
# impact flag compare as `impact == 0` in the cells below.
video_labels = pd.read_csv(path/'train_labels.csv').fillna(0)
video_labels.head(2)

Set the impact label for helmets within a range of 4 frames of a labeled impact:

In [None]:
# NOTE: this cell mutates `video_labels` in place (filtering rows and shifting `impact`
# by +1), so it must be run exactly once after the CSV is loaded.

# Propagate every labelled impact to the same helmet (same video and label) in the
# FRAME_RANGE frames before and after it.
video_labels_with_impact = video_labels[video_labels['impact'] > 0]
for index, row in tqdm(video_labels_with_impact.iterrows(), total=len(video_labels_with_impact)):
    frames = np.arange(-FRAME_RANGE, FRAME_RANGE + 1) + row.frame
    indexes = video_labels.query('video == @row.video and frame in @frames and label == @row.label').index
    video_labels.loc[indexes, 'impact'] = 1
# regex=False: '.mp4' is a literal suffix, not a pattern. The default of `regex` differs
# between pandas versions, and as a regex the '.' would match any character.
video_labels['image_name'] = video_labels['video'].str.replace('.mp4', '', regex=False) + '_' + video_labels['frame'].astype(str) + '.png'
# Keep only the frames (images) that contain at least one impact helmet.
video_labels = video_labels[video_labels.groupby('image_name')['impact'].transform('sum') > 0].reset_index(drop=True)
# Shift the labels so that 1 = helmet without impact and 2 = helmet with impact (IMPACT_CLASS).
video_labels['impact'] = video_labels['impact'].astype(int) + 1
video_labels.head()

In [None]:
# Split by video rather than by frame, so frames of one video never end up in both sets.
video_names = np.random.permutation(video_labels.video.unique())
# The first VALID_PERCENT of the shuffled videos form the validation set.
valid_video_len = int(len(video_names) * VALID_PERCENT)
video_valid = video_names[:valid_video_len]
video_train = video_names[valid_video_len:]
# Unique image names per split; these are passed to FixedSplitter later.
images_valid = video_labels[video_labels.video.isin(video_valid)].image_name.unique()
images_train = video_labels[video_labels.video.isin(video_train)].image_name.unique()

In [None]:
def make_images(video_name, video_dir, video_labels, out_dir, only_with_impact=True, impact_cls=IMPACT_CLASS):
    """Extract frames of one video and write them as PNG files.

    Frames are numbered from 1 in the order they are read. Each written file is
    named after the video with '.mp4' replaced by '_<frame>.png' and placed in
    `out_dir`.

    Args:
        video_name: file name of the video inside `video_dir`.
        video_dir: directory containing the video.
        video_labels: DataFrame with at least 'video', 'frame' and 'impact' columns.
        out_dir: directory the PNG files are written to.
        only_with_impact: when True, write only frames that have at least one
            row with `impact == impact_cls` for this video; otherwise write all frames.
        impact_cls: label value that marks an impact row.

    Returns:
        None.
    """
    wanted_frames = None
    if only_with_impact:
        # Look up the wanted frame numbers once instead of querying the whole
        # DataFrame for every decoded frame.
        is_match = (video_labels['video'] == video_name) & (video_labels['impact'] == impact_cls)
        wanted_frames = set(video_labels.loc[is_match, 'frame'].tolist())
        if not wanted_frames:
            # Nothing would be written, so skip decoding the video altogether.
            return

    vidcap = cv2.VideoCapture(str(video_dir/video_name))
    try:
        frame = 0
        while True:
            read, img = vidcap.read()
            if not read:
                break
            frame += 1
            if wanted_frames is not None and frame not in wanted_frames:
                continue
            image_path = f'{out_dir}/{video_name}'.replace('.mp4', f'_{frame}.png')
            _ = cv2.imwrite(image_path, img)
    finally:
        # Always free the decoder handle, even if reading or writing fails.
        vidcap.release()

In [None]:
# Directory that receives the extracted training frames.
train_images_path = Path('/kaggle/working/train_images')
# exist_ok=True keeps the cell re-runnable; a bare mkdir() raises FileExistsError
# when the directory is already there.
train_images_path.mkdir(parents=True, exist_ok=True)

Create images using frames with impact labels:

In [None]:
# Bind the arguments shared by all videos, so process_map only supplies `video_name`.
make_images_part = functools.partial(make_images, video_dir=train_video_path, video_labels=video_labels, out_dir=train_images_path)
# Process the videos with 2 workers; the trailing ';' suppresses the cell output.
process_map(make_images_part, os.listdir(train_video_path), max_workers=2);

In [None]:
# Number of files written to the extracted-frames directory.
len(os.listdir(train_images_path))

In [None]:
class HelmetParser(parsers.FasterRCNN, parsers.FilepathMixin, parsers.SizeMixin):
    """Parser that turns rows of the helmet label DataFrame into detection records.

    Each row passed to the accessor methods is one item of `df.itertuples()`,
    i.e. one labelled helmet box.
    """

    def __init__(self, df, source):
        """Store the label DataFrame and the directory that holds the images.

        Args:
            df: DataFrame whose rows provide `image_name`, `impact`, `left`, `top`,
                `width` and `height` (the fields read by the methods below).
            source: directory joined with `image_name` to locate an image.
        """
        self.df = df
        self.source = source

    def __iter__(self):
        """Yield the DataFrame rows as named tuples, one per box."""
        yield from self.df.itertuples()

    def __len__(self):
        """Return the number of rows (boxes), not the number of images."""
        return len(self.df)

    def imageid(self, o) -> Hashable:
        """Return the image name of the row as the record identifier."""
        return o.image_name

    def filepath(self, o) -> Union[str, Path]:
        """Return the path of the row's image inside `source`."""
        return self.source/o.image_name

    def image_width_height(self, o) -> Tuple[int, int]:
        """Return the image size as reported by `get_image_size` for the image file."""
        return get_image_size(self.filepath(o))

    def labels(self, o) -> List[int]:
        """Return the row's `impact` value as a single-element label list."""
        return [o.impact]

    def bboxes(self, o) -> List[BBox]:
        """Return the row's box, built from left/top/width/height, as a single-element list."""
        return [BBox.from_xywh(o.left, o.top, o.width, o.height)]

In [None]:
# Parse the filtered labels; images are looked up in the extracted-frames directory.
parser = HelmetParser(video_labels, train_images_path)
# Explicit train/valid split by image name, matching the ids returned by HelmetParser.imageid.
data_splitter = FixedSplitter([images_train, images_valid])

In [None]:
# Build the train and validation records, in the order given to FixedSplitter.
# NOTE(review): autofix=True presumably lets icevision repair invalid annotations — confirm.
train_rs, valid_rs = parser.parse(data_splitter=data_splitter, autofix=True)

In [None]:
# Visual sanity check: draw the first training record with its labels.
show_records(train_rs[:1], display_label=True, figsize=(10, 10), ncols=1)

In [None]:
# Training augmentations: flip, colour and brightness jitter, light blur, then either a
# bbox-safe random crop or a plain resize (both targeting SIZE), followed by normalisation.
train_tfms = tfms.A.Adapter([tfms.A.HorizontalFlip(p=0.5),
                             tfms.A.RGBShift(), tfms.A.RandomBrightnessContrast(),
                             tfms.A.Blur(blur_limit=(1, 3), p=0.5),
                             tfms.A.OneOrOther(tfms.A.RandomSizedBBoxSafeCrop(*SIZE), tfms.A.Resize(*SIZE), p=0.5),
                             tfms.A.Normalize()])
# Validation only resizes to SIZE and normalises; no random augmentation is listed.
valid_tfms = tfms.A.Adapter([tfms.A.Resize(*SIZE), tfms.A.Normalize()])

In [None]:
# Pair each record list with its transform pipeline.
train_ds = Dataset(train_rs, train_tfms)
valid_ds = Dataset(valid_rs, valid_tfms)

Sample from train dataset:

In [None]:
# Fetch the same training item six times; any differences between the panels come from
# the random training transforms (presumably applied on every access — confirm).
samples = [train_ds[0] for _ in range(6)]
show_samples(samples, ncols=3, denormalize_fn=denormalize_imagenet, display_label=False, figsize=(30,30))

Sample from validation dataset:

In [None]:
# Fetch the first validation item three times; the validation pipeline lists no random
# transforms, so the panels are expected to look the same.
samples = [valid_ds[0] for _ in range(3)]
show_samples(samples, ncols=3, denormalize_fn=denormalize_imagenet, display_label=False, figsize=(30,30))

In [None]:
# EfficientDet data loaders: batches of 32 with 2 workers; only the training loader shuffles.
train_dl = efficientdet.train_dl(train_ds, batch_size=32, num_workers=2, shuffle=True)
valid_dl = efficientdet.valid_dl(valid_ds, batch_size=32, num_workers=2, shuffle=False)

Sample train batch:

In [None]:
# The loader yields a (batch, samples) pair; show the first six samples of one training batch.
batch, samples = next(iter(train_dl))
show_samples(samples[:6], ncols=3, denormalize_fn=denormalize_imagenet, display_label=False, figsize=(30,30))

In [None]:
# EfficientDet-D3 ("tf_efficientdet_d3") configured for CLASSES_NUM classes at SIZE resolution.
model = efficientdet.model(model_name="tf_efficientdet_d3", num_classes=CLASSES_NUM, img_size=SIZE)

In [None]:
class MixUp:
    """MixUp augmentation for detection batches.

    Each image is blended with a randomly chosen partner image from the same
    batch, and the partner's boxes and classes are appended to its own targets.

    Args:
        alpha: both concentration parameters of the Beta distribution the
            blending weights are drawn from.
        min_w: lower clipping bound for the blending weight.
        max_w: upper clipping bound for the blending weight.

    After a call, `lam` holds the per-image weights (shape `(batch_size,)`) and
    `shuffle` holds the permutation that selected the partner images.
    """

    def __init__(self, alpha=20.0, min_w=0.4, max_w=0.6):
        self.distrib = Beta(tensor(alpha), tensor(alpha))
        self.min_w = min_w
        self.max_w = max_w

    def __call__(self, batch):
        """Mix a batch.

        Args:
            batch: pair `(x, y)` where `x` is an image tensor whose first
                dimension is the batch, and `y` is a dict with per-image
                sequences under the keys 'bbox' and 'cls'.

        Returns:
            `(mixed_x, mixed_y)`; `mixed_y` contains only the 'bbox' and 'cls'
            keys, each a list with one concatenated tensor per image.
        """
        x, y = batch
        batch_size = x.shape[0]
        device = x.device
        # reshape (rather than squeeze) keeps `lam` one-dimensional for a batch of one.
        self.lam = self.distrib.sample((batch_size,)).reshape(batch_size).to(device)
        self.lam = torch.clip(self.lam, self.min_w, self.max_w)
        self.shuffle = torch.randperm(batch_size, device=device)
        # Plain integer indices: itemgetter(*perm) returns a bare item instead of a
        # tuple when the batch has a single element, which broke the concatenation.
        partners = self.shuffle.tolist()
        # Add trailing singleton dimensions so the weights broadcast over each image.
        weight = self.unsqueeze(self.lam, len(x.shape) - 1, -1)
        mixed_x = torch.lerp(x, x[self.shuffle], weight)
        mixed_y = {
            key: [torch.cat((y[key][i], y[key][j])) for i, j in enumerate(partners)]
            for key in ('bbox', 'cls')
        }
        return mixed_x, mixed_y

    def unsqueeze(self, x, n, dim):
        """Apply `x.unsqueeze(dim)` `n` times and return the result."""
        for _ in range(n):
            x = x.unsqueeze(dim)
        return x

[This notebook](https://www.kaggle.com/nvnnghia/evaluation-metrics) was used for the metric implementation.

In [None]:
class F1Metric(Metric):
    """F1 score of impact-class detections, pooled over all accumulated images.

    For every image, ground-truth and predicted boxes labelled `IMPACT_CLASS`
    are matched one-to-one with `linear_sum_assignment`; a matched pair counts
    as a true positive when its IoU reaches `iou_threshold`. True positives,
    false positives and false negatives are summed over all images before
    precision, recall and F1 are computed.

    Args:
        detection_threshold: minimum score a prediction needs to be counted.
        iou_threshold: minimum IoU for a ground-truth/prediction pair to match.
            Defaults to 0.35, the value that used to be hard-coded.
    """

    def __init__(self, detection_threshold, iou_threshold=0.35):
        self._records, self._preds = [], []
        self.detection_threshold = detection_threshold
        self.iou_threshold = iou_threshold

    def _reset(self):
        """Drop all accumulated records and predictions."""
        self._records.clear()
        self._preds.clear()

    def accumulate(self, records, preds):
        """Store one batch of records and their predictions for `finalize`."""
        self._records.extend(records)
        self._preds.extend(preds)

    def finalize(self) -> Dict[str, float]:
        """Compute the pooled F1 score, clear the accumulated state and return `{"f1": value}`."""
        gt_boxes = [self._impact_boxes(s["bboxes"], s["labels"]) for s in self._records]
        pred_boxes = [self._impact_boxes(p["bboxes"], p["labels"], p["scores"]) for p in self._preds]

        tp = fp = fn = 0
        for i, gt in enumerate(gt_boxes):
            # Records and predictions are paired by position.
            img_tp, img_fp, img_fn = self.precision_calc(gt, pred_boxes[i])
            tp += img_tp
            fp += img_fp
            fn += img_fn

        # The small epsilon avoids division by zero when there are no boxes at all.
        precision = tp / (tp + fp + 1e-6)
        recall = tp / (tp + fn + 1e-6)
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-6)

        self._reset()
        return {"f1": f1_score}

    @property
    def name(self) -> str:
        """Class name followed by the detection threshold, e.g. 'F1Metric0.3'."""
        return self.__class__.__name__ + str(self.detection_threshold)

    def _impact_boxes(self, bboxes, labels, scores=None):
        """Return the xyxy arrays of the boxes labelled IMPACT_CLASS.

        When `scores` is given, boxes scoring below `detection_threshold` are
        dropped as well. The boxes are filtered by iterating over them, so the
        box objects are never coerced into a numpy array.
        """
        keep = np.atleast_1d(np.asarray(labels) == IMPACT_CLASS)
        if scores is not None:
            keep = keep & np.atleast_1d(np.asarray(scores) >= self.detection_threshold)
        return [np.array(box.xyxy) for box, flag in zip(bboxes, keep) if flag]

    def iou(self, bbox1, bbox2):
        """Return the intersection-over-union of two (x0, y0, x1, y1) boxes; 0 if they do not overlap."""
        (x0_1, y0_1, x1_1, y1_1) = map(float, bbox1)
        (x0_2, y0_2, x1_2, y1_2) = map(float, bbox2)

        # Overlap rectangle.
        overlap_x0 = max(x0_1, x0_2)
        overlap_y0 = max(y0_1, y0_2)
        overlap_x1 = min(x1_1, x1_2)
        overlap_y1 = min(y1_1, y1_2)

        # No overlap (or a degenerate one) means an IoU of zero.
        if overlap_x1 - overlap_x0 <= 0 or overlap_y1 - overlap_y0 <= 0:
            return 0

        size_1 = (x1_1 - x0_1) * (y1_1 - y0_1)
        size_2 = (x1_2 - x0_2) * (y1_2 - y0_2)
        size_intersection = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
        size_union = size_1 + size_2 - size_intersection

        return size_intersection / size_union

    def precision_calc(self, gt_boxes, pred_boxes):
        """Match the boxes of one image and return `(tp, fp, fn)`.

        The cost matrix is 0 where a ground-truth/prediction pair reaches
        `iou_threshold` and 1 elsewhere, so the minimum-cost assignment
        maximises the number of valid matches.
        """
        cost_matrix = np.ones((len(gt_boxes), len(pred_boxes)))
        for i, gt_box in enumerate(gt_boxes):
            for j, pred_box in enumerate(pred_boxes):
                if self.iou(gt_box, pred_box) >= self.iou_threshold:
                    cost_matrix[i, j] = 0

        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        # Boxes left without any partner are misses / false alarms outright.
        fn = len(gt_boxes) - row_ind.shape[0]
        fp = len(pred_boxes) - col_ind.shape[0]
        tp = 0
        for i, j in zip(row_ind, col_ind):
            if cost_matrix[i, j] == 0:
                tp += 1
            else:
                # Assigned but not overlapping enough: one miss and one false alarm.
                fp += 1
                fn += 1
        return tp, fp, fn

In [None]:
class LightModel(efficientdet.lightning.ModelAdapter):
    """EfficientDet Lightning adapter that trains on MixUp batches.

    Uses RAdam with a cosine-annealed learning rate that is stepped once per
    training batch.

    Args:
        lr: initial learning rate; the schedule decays it down to `lr / 100`.
        epochs: number of training epochs, used to size the schedule.
        dl_len: number of batches per epoch, used to size the schedule.
        *args, **kwargs: forwarded unchanged to the parent adapter.
    """

    def __init__(self, lr, epochs, dl_len, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mixup = MixUp()
        # Makes the three values available as self.hparams.<name>.
        self.save_hyperparameters('lr', 'epochs', 'dl_len')

    def training_step(self, batch, batch_idx):
        """Apply MixUp to the (images, targets) part of the batch, then run the parent step."""
        # The mixed batch is kept on the instance so it can be inspected after training.
        mixed = self.mixup(batch[0])
        self.mixuped = mixed
        records = batch[1]
        return super().training_step((mixed, records), batch_idx)

    def configure_optimizers(self):
        """Return the RAdam optimizer and its per-step cosine annealing schedule."""
        hp = self.hparams
        optimizer = optim.RAdam(self.parameters(), lr=hp.lr, weight_decay=0.1)
        # One scheduler step per batch, so the schedule spans every training step.
        total_steps = hp.dl_len * hp.epochs
        cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=total_steps, eta_min=hp.lr / 100
        )
        scheduler = {
            'scheduler': cosine,
            'interval': 'step',
            'frequency': 1,
        }
        return [optimizer], [scheduler]

In [None]:
# Validation metric: F1 over predictions scoring at least 0.3.
metrics = [F1Metric(0.3)]
# Positional arguments are lr=1e-2, epochs=5, dl_len=len(train_dl); `epochs` sizes the LR
# schedule and should match the Trainer's max_epochs.
light_model = LightModel(1e-2, 5, len(train_dl), model, metrics=metrics)

In [None]:
# Log the run to Weights & Biases as 'effdet_d3' in project 'NFL', with log_model enabled.
wandb_logger = WandbLogger(name='effdet_d3', project='NFL', log_model=True)

In [None]:
# Record the learning rate at every step, matching the per-step scheduler in LightModel.
lr_monitor = LearningRateMonitor(logging_interval='step')

# max_epochs=5 matches the `epochs` value given to LightModel, which the cosine
# schedule length is derived from. Training uses one GPU with 16-bit precision.
trainer = pl.Trainer(max_epochs=5, gpus=1, precision=16,
                     callbacks=[lr_monitor],
                     logger=wandb_logger,
                     log_every_n_steps=1,
                     flush_logs_every_n_steps=10,
                     auto_lr_find=False
                    )

In [None]:
# lr_finder = trainer.tuner.lr_find(light_model, train_dl, valid_dl)
# fig = lr_finder.plot(suggest=True)
# fig.show()

In [None]:
# Train on train_dl and validate on valid_dl.
trainer.fit(light_model, train_dl, valid_dl)

Mixup augmentation samples:

In [None]:
# `mixuped` is the last mixed batch stored by LightModel.training_step; element 0 is the
# image tensor. The permute moves dimension 1 to the end (presumably NCHW -> NHWC for plotting).
imgs = light_model.mixuped[0].permute(0, 2, 3, 1).cpu().numpy()
# Show the first three mixed images side by side after undoing the normalisation.
px.imshow(denormalize_imagenet(imgs[:3]), facet_col=0)

In [None]:
# Delete the extracted frames from the working directory now that training is done.
!rm -rf /kaggle/working/train_images/*