# DFL benchmark - training
This is a simple benchmark script for DFL.  
It classifies each frame image in the video into 4 classes（'background','challenge','play','throwin'） 
It does not use temporal information, so it may not be competitive on its own for this competition, but it could be used as a feature extractor for more advanced models.

In [1]:
!nvidia-smi

Fri Sep 23 10:24:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
|  0%   49C    P8    41W / 350W |    739MiB / 24576MiB |     24%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import numpy as np
import pandas as pd
import random
import gc
import cv2
import matplotlib.pyplot as plt
import time

import timm
from timm import utils

from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, OneCycleLR, CosineAnnealingLR, ReduceLROnPlateau, StepLR, LambdaLR

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import imageio
from PIL import Image
from tqdm.notebook import tqdm

# from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


In [3]:
import warnings
warnings.filterwarnings('ignore')

# setting

In [4]:
DEBUG = False

In [5]:
class CFG:
    EXP = "eff_b5_ap_bce_flowimage_playclass"

    # model
    # model_path = "swint_large224" #effnetでやってみる？
    model_path =  "tf_efficientnet_b5_ap"
    MODEL_SAVE_DIR = f"/workdir/work/output/{EXP}"
    out_features = 3 # output class
    inp_channels = 3 #RGB -> 3
    dropout = 0
    pretrained = True

    # train, valid
    TRAIN_IMG_DIR = "/workdir/work/output/train_images_inferbase"
    TRAIN_CSV = "/workdir/work/output/train_images_inferbase.csv"
    
    random_seed = 42
    batch_size = 64
    num_workers = 8
    n_epoch = 100
    early_stopping_rounds = 5
    # n_fold = 5
    # TRAIN_FOLD = [0, 1, 2, 3, 4]

    img_height = 224
    img_width = 224
        
    #optimizer
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    opt_eps = 1e-5
    lr = 5e-6
    opt_wd_non_norm_bias = 0.01
    opt_wd_norm_bias = 0

    #scheduler
    scheduler_name = "CosineAnnealingLR"
    T_0 = 5
    min_lr = 1e-7
    max_lr = 5e-5
    T_max = 5

    seed = 42

    if DEBUG:
        n_epoch = 1
        TRAIN_FOLD = [0, 1]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
CFG.device = device
print(device)



cuda


# Logger

In [6]:
# save trained model in this dir
if not os.path.exists(CFG.MODEL_SAVE_DIR):
    os.makedirs(CFG.MODEL_SAVE_DIR)

In [7]:
def init_logger(log_file=f'{CFG.MODEL_SAVE_DIR}/train_{CFG.EXP}.log'):
    """Output Log."""
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger
LOGGER = init_logger()
LOGGER.info(f"EXP NAME = {CFG.EXP}")
LOGGER.info(f"Model = {CFG.model_path}, (height, width) = ({CFG.img_height}, {CFG.img_width})")

EXP NAME = eff_b5_ap_bce_flowimage_playclass
Model = tf_efficientnet_b5_ap, (height, width) = (224, 224)


In [8]:
class AverageMeter(object):
    """Computes and stores the average and current value."""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

# Seed

In [9]:
def seed_torch(seed=0):
    """Fixed seed value."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(seed=CFG.seed)

# Read Data

In [10]:
train_df = pd.read_csv(CFG.TRAIN_CSV)
train_df["frame"] = train_df["frame"].astype(int)
display(train_df)

Unnamed: 0,video_id,frame,event,distance,distance_mean,time
0,1606b0e6_0,5007,challenge,0.0,16.049134,200.28
1,1606b0e6_0,5008,challenge,0.0,15.109529,200.32
2,1606b0e6_0,5009,challenge,0.0,17.871048,200.36
3,1606b0e6_0,5010,challenge,0.0,18.790614,200.40
4,1606b0e6_0,5011,challenge,0.0,21.235978,200.44
...,...,...,...,...,...,...
139331,cfbe2e94_1,89390,throwin,0.0,8.433969,3575.60
139332,cfbe2e94_1,89391,throwin,0.0,9.094970,3575.64
139333,cfbe2e94_1,89392,throwin,0.0,7.196483,3575.68
139334,cfbe2e94_1,89393,throwin,0.0,6.319773,3575.72


In [11]:
train_df = train_df[train_df["event"] != "start"]

In [12]:
train_df["event"].unique()

array(['challenge', 'throwin', 'play'], dtype=object)

# Dataset

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
event_encoding = {
    # "background" : 0,
    "challenge" : 0,
    "play" : 1,
    "throwin" : 2,
}


In [15]:
class DFLDataset(Dataset):
    def __init__(self, video_id, frame, targets, transform=None):
        self.video_id = video_id
        self.frame = frame
        self.targets = targets
        # self.transform = transform
    
    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        frame = int(self.frame[idx])
        image_path = f"{CFG.TRAIN_IMG_DIR}/{self.video_id[idx]}_{frame:06}.jpg"
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, dsize=(CFG.img_height, CFG.img_width))
        image = image / 255 # convert to 0-1
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)
        # if self.transform is not None:
        #     image = self.transform(image = image)["image"]
        target_idx = event_encoding[self.targets[idx]]
        target = np.zeros(CFG.out_features).astype(np.float32)
        target[target_idx] = 1

        return image, target

# Model

In [16]:
class DFLNet(nn.Module):
    def __init__(self, model_name=CFG.model_path, 
                 out_features=CFG.out_features, inp_channels=CFG.inp_channels,
                 pretrained=CFG.pretrained):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained, in_chans=inp_channels, num_classes=out_features)
    
    def forward(self, image):
        output = self.model(image)
        return output

# Loss

In [17]:
class FocalLoss(nn.Module):
    def __init__(self, alpha= 0.25, gamma=2.0, eps=1e-7):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.eps = eps
        self.ce = torch.nn.CrossEntropyLoss(reduction="none")

    def forward(self, input, target):
        logp = self.ce(input, target)
        p = torch.exp(-logp)
        loss = self.alpha * ((1 - p) ** self.gamma) * logp
        return loss.mean()

# optimizer

In [18]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

In [19]:
def get_scheduler(optimizer):
    scheduler = None
    if CFG.scheduler_name == 'CosineAnnealingWarmRestarts':
        scheduler = CosineAnnealingWarmRestarts(
            optimizer,
            T_0 = CFG.T_0,
            eta_min = CFG.min_lr,
            last_epoch=-1
        )
    elif CFG.scheduler_name == 'OneCycleLR':
        scheduler = OneCycleLR(
            optimizer,
            max_lr = CFG.max_lr,
            steps_per_epoch = int( ( (CFG.n_fold-1) * train_df.shape[0]) / (CFG.n_fold * CFG.batch_size) ) + 1,
            epochs = CFG.n_epoch,
        )

    elif CFG.scheduler_name == 'CosineAnnealingLR':
        scheduler = CosineAnnealingLR(
            optimizer,
            T_max = CFG.T_max,
            eta_min = CFG.min_lr,
            last_epoch = -1
        )
    
    return scheduler

In [20]:
def divice_norm_bias(model): 
    norm_bias_params = []
    non_norm_bias_params = []
    except_wd_layers = ['norm', '.bias']
    for n, p in model.model.named_parameters():
        if any([nd in n for nd in except_wd_layers]):
            norm_bias_params.append(p)
        else:
            non_norm_bias_params.append(p)
    return norm_bias_params, non_norm_bias_params

# Train function

In [21]:
softmax = nn.Softmax(dim=1)

def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler=None, scaler=None):
    model.train()
    stream = tqdm(train_loader)
    losses = AverageMeter()
    global_step = 0

    for step, (images, targets) in enumerate(stream, start=1):
    # for step, (images, targets) in enumerate(train_loader):
        images = images.to(CFG.device, non_blocking=True)
        targets = targets.to(CFG.device, non_blocking=True)

        preds = model(images)
        preds_softmax = softmax(preds)

        loss = criterion(preds_softmax, targets)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
            loss.backward()
        else:
            loss.backward()
        losses.update(loss.item(), CFG.batch_size) 

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        if scheduler is not None:
            scheduler.step()

    return losses.avg

# Valid function

In [22]:
def valid_fn(val_loader, model, criterion, epoch):
    model.eval()
    # stream = tqdm(val_loader)
    
    losses = AverageMeter()
    
    final_targets = []
    final_preds = []
    
    with torch.no_grad():
        # for i, (images, targets) in enumerate(stream, start=1):
        for i, (images, targets) in enumerate(val_loader):
            images = images.to(CFG.device, non_blocking=True)
            targets = targets.to(CFG.device, non_blocking=True)
            preds = model(images)
            preds_softmax = softmax(preds)

            loss = criterion(preds_softmax, targets)
            losses.update(loss.item(), CFG.batch_size)

            targets_list = (targets.detach().cpu().numpy()).tolist()
            preds_list = torch.argmax(preds, dim=1).tolist()
            
            final_targets.extend(targets_list)
            final_preds.extend(preds_list)
    return losses.avg, final_preds, final_targets

# Train

In [23]:
train_valid_videos = train_df["video_id"].unique()
print(train_valid_videos)
train_videos = train_valid_videos[:2]
valid_videos = train_valid_videos[2:]
if DEBUG:
    train_videos = [train_videos[0]]
    valid_videos = [valid_videos[0]]
LOGGER.info(f"train_videos {train_videos}")
LOGGER.info(f"valid_videos {valid_videos}")

train_videos ['1606b0e6_0' '1606b0e6_1']
valid_videos ['cfbe2e94_0' 'cfbe2e94_1']


['1606b0e6_0' '1606b0e6_1' 'cfbe2e94_0' 'cfbe2e94_1']


In [24]:
# use for scoring
valid_video_files = []
for valid_video in valid_videos:
    valid_video_files.append(f'/workdir/work/input/train/{valid_video}.mp4')
print(valid_video_files)

['/workdir/work/input/train/cfbe2e94_0.mp4', '/workdir/work/input/train/cfbe2e94_1.mp4']


## set dataset

In [25]:
# separate train/valid data 
X_train_videoid = train_df[train_df["video_id"].isin(train_videos)]["video_id"].values
X_train_frame = train_df[train_df["video_id"].isin(train_videos)]["frame"].values
y_train = train_df[train_df["video_id"].isin(train_videos)]["event"].values

X_valid_videoid = train_df[train_df["video_id"].isin(valid_videos)]["video_id"].values
X_valid_frame = train_df[train_df["video_id"].isin(valid_videos)]["frame"].values
y_valid = train_df[train_df["video_id"].isin(valid_videos)]["event"].values

In [26]:
# prepare dataset
train_dataset = DFLDataset(video_id= X_train_videoid, frame=X_train_frame, targets = y_train)
valid_dataset = DFLDataset(video_id= X_valid_videoid, frame=X_valid_frame, targets = y_valid)

# create dataloader
train_loader = DataLoader(train_dataset,
                        batch_size = CFG.batch_size,
                        shuffle = True,
                        num_workers = CFG.num_workers)
valid_loader = DataLoader(valid_dataset,
                        batch_size = CFG.batch_size,
                        shuffle = True,
                        num_workers = CFG.num_workers)

In [27]:
train_df[(train_df["video_id"]=="1606b0e6_0") & (train_df["frame"]==22761)]

Unnamed: 0,video_id,frame,event,distance,distance_mean,time
11774,1606b0e6_0,22761,play,47.082431,17.898882,910.44


# check 

In [28]:
# for step, (images, targets) in enumerate(train_loader):
#     if step > 5:
#         break
#     print(step)

In [29]:
# instantiate model, cost function and optimizer
model = DFLNet()
model = model.to(device)

norm_bias_params, non_norm_bias_params = divice_norm_bias(model)
# criterion = FocalLoss()
# criterion = nn.CrossEntropyLoss()
criterion = nn.BCELoss()

#print(f"norm bias params: {len(norm_bias_params)}, non norm bias params: {len(non_norm_bias_params)}")
optimizer = torch.optim.AdamW(
    [
        {'params': norm_bias_params, 'weight_decay': CFG.opt_wd_norm_bias},
        {'params': non_norm_bias_params, 'weight_decay': CFG.opt_wd_non_norm_bias},
    ],
    eps = CFG.opt_eps,
    lr = CFG.lr,
    amsgrad = False
)

# load scaler
scheduler = get_scheduler(optimizer)
scaler = GradScaler()



In [30]:
# train / valid loop
# best_score = -9999.
best_loss = 1e10
ealry_stopping_count = 0

start_time = time.time()
for epoch in range(1, CFG.n_epoch + 1):
    train_avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, scaler)
    valid_avg_loss, preds, targets = valid_fn(valid_loader, model, criterion, epoch)

    elapsed = time.time() - start_time
    elapsed_min = elapsed/60
    LOGGER.info(f"Epoch {epoch}: Train loss {train_avg_loss:.6f},  Valid loss {valid_avg_loss:.6f}. elapsed time:{elapsed_min:.1f} min.")
    if valid_avg_loss < best_loss:
        LOGGER.info(f"Model is improved.")
        ealry_stopping_count = 0
        best_loss = valid_avg_loss
        model_name = CFG.model_path
        LOGGER.info(f'{CFG.MODEL_SAVE_DIR}/{model_name}.pth is saved.')
        torch.save(model.state_dict(), f'{CFG.MODEL_SAVE_DIR}/{model_name}.pth')

    else:
        ealry_stopping_count += 1
        if ealry_stopping_count >= CFG.early_stopping_rounds:
            LOGGER.info(f"Early stopping. Model is not improved in {CFG.early_stopping_rounds} epochs")
            break
del model, train_loader, train_dataset
gc.collect()

LOGGER.info("Learning finished.")

torch.cuda.empty_cache()

  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 1: Train loss 0.514795,  Valid loss 0.811646. elapsed time:9.8 min.
Model is improved.
/workdir/work/output/eff_b5_ap_bce_flowimage_playclass/tf_efficientnet_b5_ap.pth is saved.


  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 2: Train loss 0.144418,  Valid loss 0.775160. elapsed time:19.6 min.
Model is improved.
/workdir/work/output/eff_b5_ap_bce_flowimage_playclass/tf_efficientnet_b5_ap.pth is saved.


  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 3: Train loss 0.060475,  Valid loss 0.766755. elapsed time:29.4 min.
Model is improved.
/workdir/work/output/eff_b5_ap_bce_flowimage_playclass/tf_efficientnet_b5_ap.pth is saved.


  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 4: Train loss 0.027610,  Valid loss 0.791043. elapsed time:39.1 min.


  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 5: Train loss 0.013620,  Valid loss 0.808218. elapsed time:48.9 min.


  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 6: Train loss 0.007545,  Valid loss 0.837074. elapsed time:58.7 min.


  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 7: Train loss 0.003945,  Valid loss 0.860062. elapsed time:68.5 min.


  0%|          | 0/1308 [00:00<?, ?it/s]

Epoch 8: Train loss 0.002335,  Valid loss 0.907241. elapsed time:78.3 min.
Early stopping. Model is not improved in 5 epochs
Learning finished.


# calculate valid score

In [31]:
class ScoringDataset(Dataset):
    def __init__(self, video_id, frame, targets, transform=None):
        self.video_id = video_id
        self.frame = frame
        self.targets = targets
        # self.transform = transform
    
    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        video_id = self.video_id[idx]
        frame = self.frame[idx]
        image_path = f"{CFG.TRAIN_IMG_DIR}/{self.video_id[idx]}_{self.frame[idx]:06}.jpg"
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, dsize=(CFG.img_height, CFG.img_width))
        image = image / 255 # convert to 0-1
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)
        # if self.transform is not None:
        #     image = self.transform(image = image)["image"]
        target_idx = event_encoding[self.targets[idx]]
        target = np.zeros(CFG.out_features).astype(np.float32)
        target[target_idx] = 1

        return image, target, frame, video_id

In [32]:
# prepare dataset
scoring_dataset = ScoringDataset(video_id= X_valid_videoid, frame=X_valid_frame, targets = y_valid)

# create dataloader
scoring_loader = DataLoader(scoring_dataset,
                        batch_size = CFG.batch_size,
                        shuffle = False,
                        num_workers = CFG.num_workers)

In [33]:
# event_decoding = {
#     0 : "background",
#     1 : "challenge",
#     2 : "play",
#     3 : "throwin",
# }
# event_decoding = {
#     0 : "background",
#     1 : "play",
# }

event_decoding = {
    0 : "challenge",
    1 : "play",
    2 : "throwin",
}

In [34]:
model = DFLNet()
model.load_state_dict(torch.load(f'{CFG.MODEL_SAVE_DIR}/{model_name}.pth'))
model.to(device)
model.eval()

DFLNet(
  (model): EfficientNet(
    (conv_stem): Conv2dSame(3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (act1): SiLU(inplace=True)
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
          (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (act1): SiLU(inplace=True)
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(48, 12, kernel_size=(1, 1), stride=(1, 1))
            (act1): SiLU(inplace=True)
            (conv_expand): Conv2d(12, 48, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          (conv_pw): Conv2d(48, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn2): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running

In [35]:
pred_subformat_list = []
all_pred_logits = []

for i, (images, targets, frames, video_id) in enumerate(scoring_loader):
    images = images.to(CFG.device, non_blocking=True)
    targets = targets.to(CFG.device, non_blocking=True)
    frames = frames.to('cpu').detach().numpy().copy()

    output = model(images)
    output = softmax(output)
    output = output.to('cpu').detach().numpy().copy()
    targets_list = (targets.detach().cpu().numpy()).tolist()

    preds_logits = output
    all_pred_logits.extend(list(preds_logits))
    preds_argmax_idx = np.argmax(preds_logits, axis=1)
    preds_prob = [pred_logits[idx] for idx, pred_logits in zip(preds_argmax_idx, preds_logits)]
    preds_event = [event_decoding[idx] for idx in preds_argmax_idx]
    for idx, pred_argmax_idx in enumerate(preds_argmax_idx):
        if pred_argmax_idx != 0:
            pred_subformat_list.append([video_id[idx], frames[idx]/25, preds_event[idx], preds_prob[idx]])

In [36]:
scoring_df = pd.DataFrame(pred_subformat_list, columns=["video_id", "time", "event", "score"])
display(scoring_df.head(10))
display(scoring_df.tail(10))
scoring_df["event"].value_counts()

Unnamed: 0,video_id,time,event,score
0,cfbe2e94_0,229.36,play,0.994606
1,cfbe2e94_0,229.4,play,0.914265
2,cfbe2e94_0,229.44,play,0.919449
3,cfbe2e94_0,229.48,play,0.880688
4,cfbe2e94_0,229.52,play,0.980615
5,cfbe2e94_0,229.56,play,0.991785
6,cfbe2e94_0,229.6,play,0.995509
7,cfbe2e94_0,229.64,play,0.996643
8,cfbe2e94_0,229.68,play,0.996324
9,cfbe2e94_0,229.72,play,0.998232


Unnamed: 0,video_id,time,event,score
46982,cfbe2e94_1,3575.4,play,0.698282
46983,cfbe2e94_1,3575.44,play,0.674796
46984,cfbe2e94_1,3575.48,play,0.87858
46985,cfbe2e94_1,3575.52,play,0.899038
46986,cfbe2e94_1,3575.56,play,0.890512
46987,cfbe2e94_1,3575.6,play,0.874942
46988,cfbe2e94_1,3575.64,play,0.867121
46989,cfbe2e94_1,3575.68,play,0.548596
46990,cfbe2e94_1,3575.72,play,0.839822
46991,cfbe2e94_1,3575.76,play,0.780337


play       45591
throwin     1401
Name: event, dtype: int64

In [37]:
scoring_df.to_csv(f"/workdir/work/output/{CFG.EXP}/validation.csv", index=False)

In [38]:
# copy from https://www.kaggle.com/code/ryanholbrook/competition-metric-dfl-event-detection-ap

import numpy as np
import pandas as pd
from pandas.testing import assert_index_equal
from typing import Dict, Tuple

tolerances = {
    "challenge": [0.3, 0.4, 0.5, 0.6, 0.7],
    "play": [0.15, 0.20, 0.25, 0.30, 0.35],
    "throwin": [0.15, 0.20, 0.25, 0.30, 0.35],
}

def filter_detections(
        detections: pd.DataFrame, intervals: pd.DataFrame
) -> pd.DataFrame:
    """Drop detections not inside a scoring interval."""
    detection_time = detections.loc[:, 'time'].sort_values().to_numpy()
    intervals = intervals.to_numpy()
    is_scored = np.full_like(detection_time, False, dtype=bool)

    i, j = 0, 0
    while i < len(detection_time) and j < len(intervals):
        time = detection_time[i]
        int_ = intervals[j]

        # If the detection is prior in time to the interval, go to the next detection.
        if time < int_.left:
            i += 1
        # If the detection is inside the interval, keep it and go to the next detection.        
        elif time in int_:
            is_scored[i] = True
            i += 1
        # If the detection is later in time, go to the next interval.
        else:
            j += 1

    return detections.loc[is_scored].reset_index(drop=True)


def match_detections(
        tolerance: float, ground_truths: pd.DataFrame, detections: pd.DataFrame
) -> pd.DataFrame:
    """Match detections to ground truth events. Arguments are taken from a common event x tolerance x video evaluation group."""
    detections_sorted = detections.sort_values('score', ascending=False).dropna()

    is_matched = np.full_like(detections_sorted['event'], False, dtype=bool)
    gts_matched = set()
    for i, det in enumerate(detections_sorted.itertuples(index=False)):
        best_error = tolerance
        best_gt = None

        for gt in ground_truths.itertuples(index=False):
            error = abs(det.time - gt.time)
            if error < best_error and not gt in gts_matched:
                best_gt = gt
                best_error = error
            
        if best_gt is not None:
            is_matched[i] = True
            gts_matched.add(best_gt)

    detections_sorted['matched'] = is_matched

    return detections_sorted


def precision_recall_curve(
        matches: np.ndarray, scores: np.ndarray, p: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    if len(matches) == 0:
        return [1], [0], []

    # Sort matches by decreasing confidence
    idxs = np.argsort(scores, kind='stable')[::-1]
    scores = scores[idxs]
    matches = matches[idxs]
    
    distinct_value_indices = np.where(np.diff(scores))[0]
    threshold_idxs = np.r_[distinct_value_indices, matches.size - 1]
    thresholds = scores[threshold_idxs]
    
    # Matches become TPs and non-matches FPs as confidence threshold decreases
    tps = np.cumsum(matches)[threshold_idxs]
    fps = np.cumsum(~matches)[threshold_idxs]
    
    precision = tps / (tps + fps)
    precision[np.isnan(precision)] = 0
    recall = tps / p  # total number of ground truths might be different than total number of matches
    
    # Stop when full recall attained and reverse the outputs so recall is non-increasing.
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)

    # Final precision is 1 and final recall is 0
    return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl]


def average_precision_score(matches: np.ndarray, scores: np.ndarray, p: int) -> float:
    precision, recall, _ = precision_recall_curve(matches, scores, p)
    # Compute step integral
    return -np.sum(np.diff(recall) * np.array(precision)[:-1])


def event_detection_ap(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        tolerances: Dict[str, float],
) -> float:

    assert_index_equal(solution.columns, pd.Index(['video_id', 'time', 'event']))
    assert_index_equal(submission.columns, pd.Index(['video_id', 'time', 'event', 'score']))

    # Ensure solution and submission are sorted properly
    solution = solution.sort_values(['video_id', 'time'])
    submission = submission.sort_values(['video_id', 'time'])
    
    # Extract scoring intervals.
    intervals = (
        solution
        .query("event in ['start', 'end']")
        .assign(interval=lambda x: x.groupby(['video_id', 'event']).cumcount())
        .pivot(index='interval', columns=['video_id', 'event'], values='time')
        .stack('video_id')
        .swaplevel()
        .sort_index()
        .loc[:, ['start', 'end']]
        .apply(lambda x: pd.Interval(*x, closed='both'), axis=1)
    )

    # Extract ground-truth events.
    ground_truths = (
        solution
        .query("event not in ['start', 'end']")
        .reset_index(drop=True)
    )

    # Map each event class to its prevalence (needed for recall calculation)
    class_counts = ground_truths.value_counts('event').to_dict()

    # Create table for detections with a column indicating a match to a ground-truth event
    detections = submission.assign(matched = False)

    # Remove detections outside of scoring intervals
    detections_filtered = []
    for (det_group, dets), (int_group, ints) in zip(
        detections.groupby('video_id'), intervals.groupby('video_id')
    ):
        assert det_group == int_group
        detections_filtered.append(filter_detections(dets, ints))
    detections_filtered = pd.concat(detections_filtered, ignore_index=True)

    # Create table of event-class x tolerance x video_id values
    aggregation_keys = pd.DataFrame(
        [(ev, tol, vid)
         for ev in tolerances.keys()
         for tol in tolerances[ev]
         for vid in ground_truths['video_id'].unique()],
        columns=['event', 'tolerance', 'video_id'],
    )

    # Create match evaluation groups: event-class x tolerance x video_id
    detections_grouped = (
        aggregation_keys
        .merge(detections_filtered, on=['event', 'video_id'], how='left')
        .groupby(['event', 'tolerance', 'video_id'])
    )
    ground_truths_grouped = (
        aggregation_keys
        .merge(ground_truths, on=['event', 'video_id'], how='left')
        .groupby(['event', 'tolerance', 'video_id'])
    )
    
    # Match detections to ground truth events by evaluation group
    detections_matched = []
    for key in aggregation_keys.itertuples(index=False):
        dets = detections_grouped.get_group(key)
        gts = ground_truths_grouped.get_group(key)
        detections_matched.append(
            match_detections(dets['tolerance'].iloc[0], gts, dets)
        )
    detections_matched = pd.concat(detections_matched)
    
    # Compute AP per event x tolerance group
    event_classes = ground_truths['event'].unique()
    ap_table = (
        detections_matched
        .query("event in @event_classes")
        .groupby(['event', 'tolerance']).apply(
        lambda group: average_precision_score(
        group['matched'].to_numpy(),
                group['score'].to_numpy(),
                class_counts[group['event'].iat[0]],
            )
        )
    )

    # Average over tolerances, then over event classes
    mean_ap = ap_table.groupby('event').mean().mean()

    return mean_ap

In [39]:
solution = pd.read_csv("/workdir/work/input/train.csv", usecols=['video_id', 'time', 'event'])
display(solution.head())

Unnamed: 0,video_id,time,event
0,1606b0e6_0,200.265822,start
1,1606b0e6_0,201.15,challenge
2,1606b0e6_0,202.765822,end
3,1606b0e6_0,210.124111,start
4,1606b0e6_0,210.87,challenge


In [40]:
solution[solution['video_id'].isin(valid_videos)]

Unnamed: 0,video_id,time,event
8652,cfbe2e94_0,229.321518,start
8653,cfbe2e94_0,230.200000,play
8654,cfbe2e94_0,232.520000,play
8655,cfbe2e94_0,234.016200,end
8656,cfbe2e94_0,246.666301,start
...,...,...,...
10233,cfbe2e94_1,3562.660000,play
10234,cfbe2e94_1,3563.835896,end
10235,cfbe2e94_1,3572.500727,start
10236,cfbe2e94_1,3574.340000,throwin


In [41]:
score_just_pred= event_detection_ap(solution[solution['video_id'].isin(valid_videos)], scoring_df, tolerances)
print(score_just_pred)

0.025119637636487935


# scoring with post proccessing

In [42]:
event_names = ['challenge', 'throwin', 'play']
label_dict = {
    'background':0,
    'challenge':1,
    'play':2,
    'throwin':3,
}
event_names_with_background = ['background','challenge','play','throwin']

def make_sub(prob, pred_df):
    
    frame_rate = 25
    window_size = 10
    ignore_width = 10
    group_count = 5

    df = pd.DataFrame(prob,columns=event_names_with_background)
    df['video_id'] = pred_df['video_id']
    df['frame_id'] = pred_df['time']*frame_rate

    train_df = pd.DataFrame()
    for video_id, each_video_df in df.groupby('video_id'):
        for i, event in enumerate(event_names):
            # イベント毎にwindow size分の移動平均を取る-> prob_arrに格納(最初と最後のwindow_sizeがたりない分はNanになるので-100で埋める)
            prob_arr = each_video_df[event].rolling(window=window_size, center=True).mean().fillna(-100).values
            each_video_df['rolling_prob'] = prob_arr
            
            sort_arr = np.argsort(-prob_arr)# 全frameの中で、そのフレームのlogitsが何番目に小さいかの順番を格納したarrayを作成
            rank_arr = np.empty_like(sort_arr) # sort_arrと同じshapeの空の配列を作成(実際は空というものはないのでランダムな値が入っている)
            rank_arr[sort_arr] = np.arange(len(sort_arr)) # 各フレームのlogitsが全フレームのうち何番目に小さいかの順番を格納?
            # index list for detected action
            idx_list = []
            for i in range(len(prob_arr)):
                this_idx = sort_arr[i]
                if this_idx >= 0:
                    # Add maximam index to index_list
                    idx_list.append(this_idx)
                    # parityを組んで、こingnorelistを作って、順番が一定以下のものはpredictからはずす(probが高いところの周辺は最高値を残して消えていく)
                    for parity in (-1,1):
                        # 除外対象を考えるために、-1~1のparityに無視する範囲をかけてex_idxを作る
                        for j in range(1, ignore_width+1):
                            ex_idx = this_idx + j * parity
                            # idxがprobの長さ以内にあるときに処理する
                            if ex_idx >= 0 and ex_idx < len(prob_arr):
                                # Exclude frames near this_idx where the action occurred. 
                                sort_arr[rank_arr[ex_idx]] = -1
            this_df = each_video_df.iloc[idx_list].reset_index(drop=True).reset_index().rename(columns={'index':'rank'})[['rank','video_id','frame_id']]
            this_df['event'] = event
            train_df = train_df.append(this_df)  
    
    train_df['time'] = train_df['frame_id']/frame_rate
    train_df['score'] = 1/(train_df['rank']+1)# rankに応じてスコアをつける検出個数が多いほど後ろのscoreは小さくなっていく
    
    return train_df

In [43]:
pp_df = make_sub(all_pred_logits, scoring_df)
pp_pred_df = pp_df[["video_id", "time", "event",  "score"]]
display(pp_pred_df)
pp_pred_df["event"].value_counts()

ValueError: 4 columns passed, passed data had 3 columns

In [None]:
score_after_pp = event_detection_ap(solution[solution['video_id'].isin(valid_videos)], pp_pred_df, tolerances)


In [None]:
# つくったdataからのvalidationになるので微妙かも
LOGGER.info(f"score just prediction:{score_just_pred}")
LOGGER.info(f"score after pp:{score_after_pp}")