# My baseline 

In [23]:
# !mkdir -p ../work
# !cd ../work && tar xfz ../input/dflfiles/timm.tgz
# import sys
# sys.path.append('../work/timm/pytorch-image-models')

In [24]:
import glob
import os
from tqdm.auto import tqdm
from multiprocessing import Pool, cpu_count
import cv2
import time
import argparse
import logging
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

import timm
from timm.models import create_model, apply_test_time_pool
from timm.data import ImageDataset, create_loader, resolve_data_config
from timm.utils import AverageMeter, setup_default_logging

In [25]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


# Configurations

In [26]:
DEBUG = False

In [27]:
class CFG:
    EXP = "eff_b5_ap"
    # model
    model_type =  "tf_efficientnet_b5_ap"
    trained_model_path = "/workdir/work/output/eff_b5_ap/tf_efficientnet_b5_ap.pth"
    out_features = 4 # output class
    inp_channels = 3 #RGB -> 3
    dropout = 0
    pretrained = False
    batch_size = 4
    
    IMG_SIZE = (456, 456)

In [28]:
event_decoding = {
    0 : "background",
    1 : "challenge",
    2 : "play",
    3 : "throwin",
}

# set inference files

In [29]:
valid_video_files = ['/workdir/work/input/train/cfbe2e94_1.mp4',
                     '/workdir/work/input/train/ecf251d4_0.mp4'
                     ]
print(valid_video_files)

['/workdir/work/input/train/cfbe2e94_1.mp4', '/workdir/work/input/train/ecf251d4_0.mp4']


# Load model

In [30]:
class DFLNet(nn.Module):
    def __init__(self, model_name=CFG.model_type, 
                 out_features=CFG.out_features, inp_channels=CFG.inp_channels,
                 pretrained=CFG.pretrained):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained, in_chans=inp_channels, num_classes=out_features)
    
    def forward(self, image):
        output = self.model(image)
        return output

In [31]:
model = DFLNet()

In [32]:
model.load_state_dict(torch.load(CFG.trained_model_path))
model.to(device)
model.eval()

DFLNet(
  (model): EfficientNet(
    (conv_stem): Conv2dSame(3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (act1): SiLU(inplace=True)
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
          (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (act1): SiLU(inplace=True)
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(48, 12, kernel_size=(1, 1), stride=(1, 1))
            (act1): SiLU(inplace=True)
            (conv_expand): Conv2d(12, 48, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          (conv_pw): Conv2d(48, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn2): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running

In [33]:
def image_read_formodel(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, dsize=CFG.IMG_SIZE)
    image = image / 255 # convert to 0-1
    image = image.reshape(-1, CFG.inp_channels, CFG.IMG_SIZE[0], CFG.IMG_SIZE[1])
    return torch.tensor(image, dtype=torch.float)

# Validation

In [34]:
submission_list = []
softmax = nn.Softmax(dim=1)

for video_path in valid_video_files:
    video_name = os.path.basename(video_path).split('.')[0]
    cam = cv2.VideoCapture(video_path)
    fps = cam.get(cv2.CAP_PROP_FPS)
    video_file = video_path.split("/")[-1]
    video_id = video_file.split(".")[0]
    print("video_id:", video_id)
    frame_count = 1
    preds_prob_video = []
    pred_subformat_list = []
    while True:
        input_img = []
#         for i in range(CFG.batch_size):
#             successed, img = cam.read()
#             if not successed:
#                 break
#             img = image_read_formodel(img)
#             img = img.to(device)
#             if len(input_img) == 0:
#                 input_img = img
#             else:
#                 input_img = torch.cat([input_img, img], dim=0)
        successed, img = cam.read()
        if not successed:
            break
        img = image_read_formodel(img)
        img = img.to(device)
        output = model(img)
        output = softmax(output)
        output = output.to('cpu').detach().numpy().copy()
        pred_logits = output[0]
        # make pred for submission
        time = frame_count/fps
        pred_argmax_idx = np.argmax(pred_logits, axis=0)
        pred_prob = pred_logits[pred_argmax_idx]
        pred_event = event_decoding[pred_argmax_idx]
        if pred_argmax_idx != 0 and frame_count%fps == 0:
            pred_subformat_list.append([video_id, time, pred_event, pred_prob])
        frame_count += 1
        if DEBUG and frame_count > 1000:
            break
    if len(submission_list) == 0:
        submission_list = pred_subformat_list
    else:
        submission_list.extend(pred_subformat_list)
    

video_id: cfbe2e94_1
video_id: ecf251d4_0


In [44]:
valid_df = pd.DataFrame(submission_list, columns=["video_id", "time", "event", "score"])
display(valid_df.head(10))
display(valid_df.tail(10))
valid_df["event"].value_counts()

Unnamed: 0,video_id,time,event,score
0,cfbe2e94_1,699.0,play,0.458633
1,cfbe2e94_1,892.0,challenge,0.529709
2,cfbe2e94_1,940.0,play,0.493017
3,cfbe2e94_1,957.0,play,0.497401
4,cfbe2e94_1,960.0,play,0.48881
5,cfbe2e94_1,963.0,play,0.469729
6,cfbe2e94_1,966.0,play,0.490877
7,cfbe2e94_1,967.0,play,0.622432
8,cfbe2e94_1,968.0,play,0.590542
9,cfbe2e94_1,969.0,play,0.53205


Unnamed: 0,video_id,time,event,score
1111,ecf251d4_0,3411.0,play,0.792697
1112,ecf251d4_0,3412.0,play,0.724956
1113,ecf251d4_0,3413.0,play,0.751927
1114,ecf251d4_0,3414.0,play,0.699007
1115,ecf251d4_0,3415.0,play,0.752983
1116,ecf251d4_0,3416.0,play,0.69602
1117,ecf251d4_0,3417.0,play,0.687544
1118,ecf251d4_0,3418.0,play,0.706604
1119,ecf251d4_0,3419.0,play,0.654514
1120,ecf251d4_0,3420.0,play,0.663463


play         1037
challenge      84
Name: event, dtype: int64

In [45]:
valid_df.to_csv(f"/workdir/work/output/{CFG.EXP}/validation_1sec.csv", index=False)

# validation scoring

In [46]:
# copy from https://www.kaggle.com/code/ryanholbrook/competition-metric-dfl-event-detection-ap

import numpy as np
import pandas as pd
from pandas.testing import assert_index_equal
from typing import Dict, Tuple

tolerances = {
    "challenge": [0.3, 0.4, 0.5, 0.6, 0.7],
    "play": [0.15, 0.20, 0.25, 0.30, 0.35],
    "throwin": [0.15, 0.20, 0.25, 0.30, 0.35],
}

def filter_detections(
        detections: pd.DataFrame, intervals: pd.DataFrame
) -> pd.DataFrame:
    """Drop detections not inside a scoring interval."""
    detection_time = detections.loc[:, 'time'].sort_values().to_numpy()
    intervals = intervals.to_numpy()
    is_scored = np.full_like(detection_time, False, dtype=bool)

    i, j = 0, 0
    while i < len(detection_time) and j < len(intervals):
        time = detection_time[i]
        int_ = intervals[j]

        # If the detection is prior in time to the interval, go to the next detection.
        if time < int_.left:
            i += 1
        # If the detection is inside the interval, keep it and go to the next detection.        
        elif time in int_:
            is_scored[i] = True
            i += 1
        # If the detection is later in time, go to the next interval.
        else:
            j += 1

    return detections.loc[is_scored].reset_index(drop=True)


def match_detections(
        tolerance: float, ground_truths: pd.DataFrame, detections: pd.DataFrame
) -> pd.DataFrame:
    """Match detections to ground truth events. Arguments are taken from a common event x tolerance x video evaluation group."""
    detections_sorted = detections.sort_values('score', ascending=False).dropna()

    is_matched = np.full_like(detections_sorted['event'], False, dtype=bool)
    gts_matched = set()
    for i, det in enumerate(detections_sorted.itertuples(index=False)):
        best_error = tolerance
        best_gt = None

        for gt in ground_truths.itertuples(index=False):
            error = abs(det.time - gt.time)
            if error < best_error and not gt in gts_matched:
                best_gt = gt
                best_error = error
            
        if best_gt is not None:
            is_matched[i] = True
            gts_matched.add(best_gt)

    detections_sorted['matched'] = is_matched

    return detections_sorted


def precision_recall_curve(
        matches: np.ndarray, scores: np.ndarray, p: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    if len(matches) == 0:
        return [1], [0], []

    # Sort matches by decreasing confidence
    idxs = np.argsort(scores, kind='stable')[::-1]
    scores = scores[idxs]
    matches = matches[idxs]
    
    distinct_value_indices = np.where(np.diff(scores))[0]
    threshold_idxs = np.r_[distinct_value_indices, matches.size - 1]
    thresholds = scores[threshold_idxs]
    
    # Matches become TPs and non-matches FPs as confidence threshold decreases
    tps = np.cumsum(matches)[threshold_idxs]
    fps = np.cumsum(~matches)[threshold_idxs]
    
    precision = tps / (tps + fps)
    precision[np.isnan(precision)] = 0
    recall = tps / p  # total number of ground truths might be different than total number of matches
    
    # Stop when full recall attained and reverse the outputs so recall is non-increasing.
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)

    # Final precision is 1 and final recall is 0
    return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl]


def average_precision_score(matches: np.ndarray, scores: np.ndarray, p: int) -> float:
    precision, recall, _ = precision_recall_curve(matches, scores, p)
    # Compute step integral
    return -np.sum(np.diff(recall) * np.array(precision)[:-1])


def event_detection_ap(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        tolerances: Dict[str, float],
) -> float:

    assert_index_equal(solution.columns, pd.Index(['video_id', 'time', 'event']))
    assert_index_equal(submission.columns, pd.Index(['video_id', 'time', 'event', 'score']))

    # Ensure solution and submission are sorted properly
    solution = solution.sort_values(['video_id', 'time'])
    submission = submission.sort_values(['video_id', 'time'])
    
    # Extract scoring intervals.
    intervals = (
        solution
        .query("event in ['start', 'end']")
        .assign(interval=lambda x: x.groupby(['video_id', 'event']).cumcount())
        .pivot(index='interval', columns=['video_id', 'event'], values='time')
        .stack('video_id')
        .swaplevel()
        .sort_index()
        .loc[:, ['start', 'end']]
        .apply(lambda x: pd.Interval(*x, closed='both'), axis=1)
    )

    # Extract ground-truth events.
    ground_truths = (
        solution
        .query("event not in ['start', 'end']")
        .reset_index(drop=True)
    )

    # Map each event class to its prevalence (needed for recall calculation)
    class_counts = ground_truths.value_counts('event').to_dict()

    # Create table for detections with a column indicating a match to a ground-truth event
    detections = submission.assign(matched = False)

    # Remove detections outside of scoring intervals
    detections_filtered = []
    for (det_group, dets), (int_group, ints) in zip(
        detections.groupby('video_id'), intervals.groupby('video_id')
    ):
        assert det_group == int_group
        detections_filtered.append(filter_detections(dets, ints))
    detections_filtered = pd.concat(detections_filtered, ignore_index=True)

    # Create table of event-class x tolerance x video_id values
    aggregation_keys = pd.DataFrame(
        [(ev, tol, vid)
         for ev in tolerances.keys()
         for tol in tolerances[ev]
         for vid in ground_truths['video_id'].unique()],
        columns=['event', 'tolerance', 'video_id'],
    )

    # Create match evaluation groups: event-class x tolerance x video_id
    detections_grouped = (
        aggregation_keys
        .merge(detections_filtered, on=['event', 'video_id'], how='left')
        .groupby(['event', 'tolerance', 'video_id'])
    )
    ground_truths_grouped = (
        aggregation_keys
        .merge(ground_truths, on=['event', 'video_id'], how='left')
        .groupby(['event', 'tolerance', 'video_id'])
    )
    
    # Match detections to ground truth events by evaluation group
    detections_matched = []
    for key in aggregation_keys.itertuples(index=False):
        dets = detections_grouped.get_group(key)
        gts = ground_truths_grouped.get_group(key)
        detections_matched.append(
            match_detections(dets['tolerance'].iloc[0], gts, dets)
        )
    detections_matched = pd.concat(detections_matched)
    
    # Compute AP per event x tolerance group
    event_classes = ground_truths['event'].unique()
    ap_table = (
        detections_matched
        .query("event in @event_classes")
        .groupby(['event', 'tolerance']).apply(
        lambda group: average_precision_score(
        group['matched'].to_numpy(),
                group['score'].to_numpy(),
                class_counts[group['event'].iat[0]],
            )
        )
    )

    # Average over tolerances, then over event classes
    mean_ap = ap_table.groupby('event').mean().mean()

    return mean_ap

In [47]:
solution = pd.read_csv("/workdir/work/input/train.csv", usecols=['video_id', 'time', 'event'])
display(solution.head())

Unnamed: 0,video_id,time,event
0,1606b0e6_0,200.265822,start
1,1606b0e6_0,201.15,challenge
2,1606b0e6_0,202.765822,end
3,1606b0e6_0,210.124111,start
4,1606b0e6_0,210.87,challenge


In [48]:
valid_video_files

['/workdir/work/input/train/cfbe2e94_1.mp4',
 '/workdir/work/input/train/ecf251d4_0.mp4']

In [49]:
valid_video_id = [valid_id.split("/")[-1].split(".")[0] for valid_id in valid_video_files]
print(valid_video_id)

['cfbe2e94_1', 'ecf251d4_0']


In [50]:
solution[solution['video_id'].isin(valid_video_id)]

Unnamed: 0,video_id,time,event
9475,cfbe2e94_1,637.111502,start
9476,cfbe2e94_1,638.300000,play
9477,cfbe2e94_1,639.611502,end
9478,cfbe2e94_1,641.118367,start
9479,cfbe2e94_1,642.060000,play
...,...,...,...
11213,ecf251d4_0,3056.587000,challenge
11214,ecf251d4_0,3058.072895,end
11215,ecf251d4_0,3068.280519,start
11216,ecf251d4_0,3069.547000,throwin


In [51]:
score = event_detection_ap(solution[solution['video_id'].isin(valid_video_id)], valid_df, tolerances)

In [52]:
print(score)

0.009853336937742712


- focalloss : 0.009853336937742712