# NFL Late

In [1]:
import yaml
import os

In [2]:
CFG = {
        "kaggle" : False,
        "DEBUG" : True,
        "wandb" : True,
        # model config
        "model_name" : "swin_s3_tiny_224",
        "out_features" : 20,
        "inp_channels": 3,
        "num_img_feature" : 10,
        "pretrained" : True,
        
        "roll_sum_window_size" : 10,
        "features" : ['x_position_1', 'y_position_1', 'x_position_2', 'y_position_2', 
                      'speed_1', 'distance_1', 'direction_1', 'orientation_1','acceleration_1', 'sa_1', 
                      'speed_2', 'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
                      'players_dis', #'is_ground'
                      ],

        # learning config
        "n_epoch" : 5,
        "n_folds": 3,
        "train_folds" : [0,1,2],
        "lr" : 1e-4,
        "T_max" : 10,
        "min_lr" : 1e-8,
        "weight_decay" : 1e-6,

        # etc
        "print_freq" : 1000,
        "random_seed" : 21,

        # data config    
        "img_size" : [224, 224],
        "batch_size" : 128,
        "num_workers" : 2,
        "masksize_helmet_ratio" : 4, # helmetサイズにこの係数をかけたサイズだけ色を残して後は黒塗りする
        "TRAIN_VIDEO_NUM" : 100,
        "VALID_VIDEO_NUM" : 10,
        "sample_num" : -1, 

        "EXP_CATEGORY" : "late",
        "EXP_NAME" : "refactored",
}

if CFG["DEBUG"]:
    CFG["EXP_CATEGORY"] = "DEBUG"
    CFG["EXP_NAME"] = "DEBUG"
    CFG["n_epoch"] = 2
    CFG["sample_num"] = 1000
    CFG["batch_size"] = 32
    CFG["train_folds"] : [0,1]


CFG["INPUT_DIR"] = "/workspace/input"
CFG["OUTPUT_DIR"] = "/workspace/output"
CFG["TRAIN_HELMET_CSV"] = os.path.join(CFG["INPUT_DIR"], "train_baseline_helmets.csv")
CFG["TRAIN_TRACKING_CSV"] = os.path.join(CFG["INPUT_DIR"], "train_player_tracking.csv")
CFG["TRAIN_VIDEO_META_CSV"] = os.path.join(CFG["INPUT_DIR"], "train_video_metadata.csv")
CFG["TRAIN_LABEL_CSV"] = os.path.join(CFG["INPUT_DIR"], "train_labels.csv")
CFG["SAVED_CONTACT_CSV"] = os.path.join(CFG["INPUT_DIR"], "Saved_contact_frames.csv")
CFG["CONTACT_IMG_DIR"] = os.path.join(CFG["INPUT_DIR"], "contact_images")
CFG["MODEL_DIR"] = os.path.join(CFG["OUTPUT_DIR"], CFG["EXP_NAME"] ,"model")
    
if not CFG["kaggle"] and not CFG["DEBUG"]:
    os.mkdir(os.path.join(CFG["OUTPUT_DIR"], CFG["EXP_NAME"]))
    os.mkdir(CFG["MODEL_DIR"])


In [3]:
cofig_yaml = os.path.join(CFG["INPUT_DIR"] , "Config.yaml")
with open(cofig_yaml, "w") as f:
    yaml.dump(CFG, f)

# import libraries

In [4]:
# general
import os
import gc
import pickle
import glob
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import cv2
import matplotlib.pyplot as plt
import time
import math

import sys
sys.path.append('/kaggle/input/timm-pytorch-image-models/pytorch-image-models-master')


# deep learning
# from torch.utils.data import Dataset, DataLoader
# from torch.optim import SGD, Adam, AdamW
# from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
import torch
# import torchvision
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim

# import albumentations as A
# from albumentations.pytorch import ToTensorV2

# from sklearn.model_selection import GroupKFold

# loss metrics
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score

# import cudf
import polars as pl

import wandb
# warningの表示方法の設定
import warnings
warnings.filterwarnings("ignore")


# my utils
sys.path.append('../nfl_utils')
from nfl_utils.dataset_utils import *
from nfl_utils.train_utils import *

/workspace/nfl_utils


# Set Configurations

In [5]:
if CFG["wandb"]:
    os.environ["WANDB_SILENT"] = "true"
    WANDB_CONFIG = {'competition': 'NFL', '_wandb_kernel': 'taro'}
    if CFG["kaggle"]:
        # Secrets
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb")

        !wandb login $secret_value_0
    #! TODO : logger settings
    wandb.init(project=WANDB_CONFIG["competition"], config=CFG, group=CFG["EXP_CATEGORY"], name=CFG["EXP_NAME"])

# Utils

In [6]:
def seed_everything(seed=CFG["random_seed"]):
    #os.environ['PYTHONSEED'] = str(seed)
    np.random.seed(seed%(2**32-1))
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic =True
    torch.backends.cudnn.benchmark = False
seed_everything()

# device optimization
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


---

# Load Target

In [7]:
target_dtypes = {'contact_id':str, 
                'game_play':str,
                'datetime':str,
                'step':int,
                'nfl_player_id_1':str,
                'nfl_player_id_2':str,
                'contact':int,
                }
target_df = pl.read_csv(CFG["TRAIN_LABEL_CSV"], dtypes=target_dtypes)    

FEATURE_COLS = ["nfl_player_id_1", "nfl_player_id_2", "step"]

# Merge tracking_df

In [8]:
tracking_df = pl.read_csv(CFG["TRAIN_TRACKING_CSV"])
tracking_df, SHIFT_COLS = get_tracking_shift(tracking_df)
target_df, FEATURE_COLS = target_merge_tracking(target_df, tracking_df, FEATURE_COLS, SHIFT_COLS)
# display(tracking_df.filter((pl.col("game_play")=="58580_001136") & (pl.col("nfl_player_id").cast(str)=="44830")))# ちゃんとshiftできてそう
del tracking_df

target_df, FEATURE_COLS = create_trackmerged_ftr(target_df, FEATURE_COLS)

75
original length 4721618


# Exclude distance 2

In [9]:
print(len(target_df))
target_df = target_df.filter(pl.col("players_dis") <= 2)
print(len(target_df))

4721618
660553


# Merge helmet df

In [10]:
helmet_df = pl.read_csv(CFG["TRAIN_HELMET_CSV"])
target_df, FEATURE_COLS = target_merge_helmet(target_df, helmet_df, FEATURE_COLS)
target_df, FEATURE_COLS = create_helmetmerged_ftr(target_df, FEATURE_COLS)
target_df, FEATURE_COLS = get_categorical_ftr(target_df, FEATURE_COLS)
print(len(target_df))

original length 660553
108
original length 660553
660553


# Reduce Data

In [11]:
target_df = target_df.to_pandas()
target_df_E = target_df[target_df["is_E_helmet"]==1]
target_df_E["EorS"] = "Endzone"

target_df_S = target_df[target_df["is_S_helmet"]==1]
target_df_S["EorS"] = "Sideline"

target_df = pd.concat([target_df_E, target_df_S], axis=0).reset_index(drop=True)

if CFG["DEBUG"]:
    target_df = target_df.sample(CFG["sample_num"]).reset_index(drop=True)
elif CFG["sample_num"] != -1:
    target_df = target_df.sample(CFG["sample_num"]).reset_index(drop=True)

print(len(target_df))
print(len(target_df["game_play"].unique()))
display(target_df["contact"].value_counts())

1000
233


0    892
1    108
Name: contact, dtype: int64

In [12]:
target_df[CFG["features"]]

Unnamed: 0,x_position_1,y_position_1,x_position_2,y_position_2,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,players_dis
0,40.83,19.66,40.34,18.40,0.35,0.03,168.82,120.32,0.73,0.27,0.20,0.02,117.42,124.86,0.93,0.52,1.351925
1,108.19,32.73,0.00,0.00,1.47,0.16,4.76,274.63,2.47,-0.14,0.00,0.00,0.00,0.00,0.00,0.00,0.000000
2,48.04,23.49,48.81,23.35,0.25,0.03,46.47,207.34,0.59,-0.55,0.36,0.04,155.09,203.34,0.12,0.11,0.782624
3,83.21,30.11,0.00,0.00,3.03,0.31,220.38,205.74,1.52,-0.89,0.00,0.00,0.00,0.00,0.00,0.00,0.000000
4,103.70,33.78,103.16,35.39,0.62,0.07,253.47,239.03,0.47,-0.39,0.99,0.10,58.72,184.76,0.58,0.29,1.698146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,57.20,23.53,0.00,0.00,1.21,0.11,70.69,59.05,1.93,1.84,0.00,0.00,0.00,0.00,0.00,0.00,0.000000
996,14.49,29.13,14.14,30.64,0.00,0.00,169.94,262.82,0.00,0.00,0.08,0.01,213.29,149.76,0.58,-0.46,1.550032
997,47.07,13.00,0.00,0.00,6.44,0.65,215.62,203.86,2.30,-2.18,0.00,0.00,0.00,0.00,0.00,0.00,0.000000
998,83.00,30.56,83.92,31.50,1.48,0.15,192.74,37.37,1.99,-0.28,0.96,0.10,166.73,275.59,1.64,-0.59,1.315295


In [13]:
oof_df = training_loop(target_df)
wandb.finish()

---
fold 0 start training...
train target contact
0    595
1     71
Name: contact, dtype: int64
valid target contact
0    297
1     37
Name: contact, dtype: int64
	 === epoch: 1: training ===
	 Epoch: [1][0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.7028(0.7028) 
	 Epoch: [1][20/21] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3711(0.3422) 
	 EVAL: [0/11] Elapsed 0m 0s (remain 0m 2s) Loss: 0.2745(0.2745) 
	 EVAL: [10/11] Elapsed 0m 1s (remain 0m 0s) Loss: 0.2482(0.3225) 
	 epoch:1, avg train loss:0.3422, avg valid loss:0.3225
	 score:0.1861(th=0.2) AUC = 0.7032=> time:0.07 min
	 Epoch 1 - Save Best Score: 0.1861. Model is saved.
	 === epoch: 2: training ===
	 Epoch: [2][0/21] Elapsed 0m 0s (remain 0m 7s) Loss: 0.2498(0.2498) 
	 Epoch: [2][20/21] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3447(0.2882) 
	 EVAL: [0/11] Elapsed 0m 0s (remain 0m 2s) Loss: 0.2830(0.2830) 
	 EVAL: [10/11] Elapsed 0m 1s (remain 0m 0s) Loss: 0.2237(0.3378) 
	 epoch:2, avg train loss:0.2882, avg valid loss:0.3378
	 score:0.182

# Save oof_df

In [14]:
display(oof_df)
if CFG["kaggle"]:
    oof_filename = os.path.join(CFG["OUTPUT_DIR"], "oof_df.csv")
    oof_df.to_csv(oof_filename, index=False)
else:
    oof_filename = os.path.join(CFG["OUTPUT_DIR"], CFG["EXP_NAME"], "oof_df.csv")
    oof_df.to_csv(oof_filename, index=False)

Unnamed: 0,contact_id,pred,contact,fold,is_End,img_emb_0,img_emb_1,img_emb_2,img_emb_3,img_emb_4,img_emb_5,img_emb_6,img_emb_7,img_emb_8,img_emb_9
0,58301_000786_33_42474_G,0.037604,0,0,1,-0.546112,-2.193521,0.715826,1.776386,2.185322,0.642115,1.188046,0.393539,1.504683,2.279351
1,58204_002822_28_42388_45033,0.259820,0,0,0,-0.526446,-0.525903,0.262488,0.472419,0.448866,0.237950,0.458530,-0.230513,0.567145,0.951220
2,58526_001849_6_39684_43079,0.260703,0,0,1,-0.247266,-0.587335,0.365002,0.397820,0.510249,0.022756,0.443198,0.015854,0.663336,0.729495
3,58527_000757_85_39984_G,0.036322,0,0,0,-0.866156,-1.987489,0.844865,1.952339,2.367661,0.633730,1.627647,0.604657,1.452764,1.907179
4,58341_002805_66_42414_G,0.034330,0,0,1,-0.741025,-1.978688,0.537359,1.561857,2.601082,0.493723,1.286664,0.638580,1.496845,2.481718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,58306_002362_11_42381_G,0.058961,0,2,1,-1.245600,-0.400236,-3.580675,0.394263,1.361500,4.706434,0.856520,1.974225,-1.325062,-3.468157
329,58506_002821_3_46205_G,0.087315,0,2,1,-0.996156,-0.414261,-2.752366,0.527066,1.476637,4.150136,0.407533,1.683318,-1.067082,-2.828903
330,58217_002635_4_38551_G,0.070863,0,2,1,-0.916825,-0.656434,-3.132491,0.592365,1.570376,4.598554,0.370798,1.749889,-1.214511,-2.983730
331,58203_003141_46_52523_G,0.067070,0,2,0,-1.120295,-0.201938,-3.319584,0.710952,1.663988,4.604896,0.502082,1.612141,-1.626012,-3.163211


In [15]:
for idx in range(1, 10, 1):
    thr = idx*0.1
    valid_targets = (np.array(oof_df["contact"]) > thr).astype(np.int32)
    valid_binary_preds = (np.array(oof_df["pred"]) > thr).astype(np.int32)
    score = matthews_corrcoef(valid_targets, valid_binary_preds)
    print(f"threshold={thr:.3f}, score={score:.5f}")

threshold=0.100, score=0.20094
threshold=0.200, score=0.26399
threshold=0.300, score=0.08686
threshold=0.400, score=0.00000
threshold=0.500, score=0.00000
threshold=0.600, score=0.00000
threshold=0.700, score=0.00000
threshold=0.800, score=0.00000
threshold=0.900, score=0.00000


In [16]:
alldata_df = pd.read_csv("/workspace/input/train_labels.csv")
alldata_df = alldata_df.merge(oof_df[["contact_id", "pred"]], on="contact_id", how="left")

In [17]:
for idx in range(1, 10, 1):
    thr = idx*0.1
    valid_targets = (np.array(alldata_df["contact"]) > thr).astype(np.int32)
    valid_binary_preds = (np.array(alldata_df["pred"]) > thr).astype(np.int32)
    score = matthews_corrcoef(valid_targets, valid_binary_preds)
    print(f"threshold={thr:.3f}, score={score:.5f}")

threshold=0.100, score=0.01380
threshold=0.200, score=0.01467
threshold=0.300, score=0.00460
threshold=0.400, score=0.00000
threshold=0.500, score=0.00000
threshold=0.600, score=0.00000
threshold=0.700, score=0.00000
threshold=0.800, score=0.00000
threshold=0.900, score=0.00000
