# NFL Baseline
- create target_df (distance in tracking_df is lower than threshold=3)
https://www.kaggle.com/code/stgkrtua/nfl-creatatraindataset-targetdf
- create dataset save frames in target_df
https://www.kaggle.com/code/stgkrtua/nfl-createdataset-saveframes
- check saved images
https://www.kaggle.com/code/stgkrtua/nfl-checkdataset-plotsavedimage

# import libraries

In [1]:
# general
import os
import gc
import pickle
import glob
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import cv2
import matplotlib.pyplot as plt
import time
import math

import sys

# deep learning
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD, Adam, AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn.model_selection import GroupKFold

# loss metrics
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score

# import cudf
import polars as pl

# warningの表示方法の設定
import warnings
warnings.filterwarnings("ignore")

# Set Configurations

In [2]:
CFG = {
        "kaggle" : False,
        "DEBUG" : False,
        "EXP_NAME" : "expT010_Trackshift1D",
        # model config
        "num_track_features" : 3,

        "features" : ['x_position_1', 'y_position_1', 'x_position_2', 'y_position_2', 
                      'speed_1', 'orientation_1','acceleration_1', 'distance_1', 'sa_1', 
                      'speed_2', 'orientation_2', 'acceleration_2', 'distance_2','sa_2',
                      'players_dis',
                      ],
        "SHIFT_COLS" : ["x_position", "y_position", 'speed',  'direction', 'orientation', 'acceleration', 
                        'distance','sa',
                       ],
        "SHIFT_NUM" : range(-6, 6, 1),
                                
        # learning config
        "n_epoch" : 10,
        "n_folds": 3,
        "train_folds" : [0,1,2],
        "lr" : 5e-3,
        "T_max" : 10,
        "min_lr" : 1e-8,
        "weight_decay" : 1e-6,

        # etc
        "print_freq" : 1000,
        "random_seed" : 21,

        # data config    
        "batch_size" : 512,
        "num_workers" : 0,
}
CFG["xpos_1"] = [f"x_position_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["ypos_1"] = [f"y_position_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["speed_1"] = [f"speed_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["dire_1"] = [f"direction_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["orie_1"] = [f"orientation_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["acc_1"] = [f"acceleration_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["sa_1"] = [f"sa_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["dis_1"] = [f"distance_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["E_wcenter_1"] = [f"E_w_center_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["E_hcenter_1"] = [f"E_h_center_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["S_wcenter_1"] = [f"S_w_center_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]
CFG["S_hcenter_1"] = [f"S_h_center_shift{shift_num}_1" for shift_num in CFG["SHIFT_NUM"]]

CFG["xpos_2"] = [f"x_position_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["ypos_2"] = [f"y_position_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["speed_2"] = [f"speed_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["dire_2"] = [f"direction_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["orie_2"] = [f"orientation_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["acc_2"] = [f"acceleration_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["sa_2"] = [f"sa_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["dis_2"] = [f"distance_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["E_wcenter_2"] = [f"E_w_center_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["E_hcenter_2"] = [f"E_h_center_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["S_wcenter_2"] = [f"S_w_center_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]
CFG["S_hcenter_2"] = [f"S_h_center_shift{shift_num}_2" for shift_num in CFG["SHIFT_NUM"]]

if CFG["DEBUG"]:
    CFG["EXP_CATEGORY"] = "DEBUG"
    CFG["EXP_NAME"] = "DEBUG"
    CFG["n_epoch"] = 2
    CFG["sample_num"] = 1000
    CFG["batch_size"] = 32
    CFG["train_folds"] : [0,1]


CFG["INPUT_DIR"] = "/workspace/input"
CFG["OUTPUT_DIR"] = "/workspace/output"
CFG["BASE_DIR"] = os.path.join(CFG["INPUT_DIR"])
CFG["TRAIN_HELMET_CSV"] = os.path.join(CFG["BASE_DIR"], "train_baseline_helmets.csv")
CFG["TRAIN_TRACKING_CSV"] = os.path.join(CFG["BASE_DIR"], "train_player_tracking.csv")
CFG["TRAIN_VIDEO_META_CSV"] = os.path.join(CFG["BASE_DIR"], "train_video_metadata.csv")
CFG["TRAIN_LABEL_CSV"] = os.path.join(CFG["BASE_DIR"], "train_labels.csv")
CFG["EXP_DIR"] = os.path.join(CFG["OUTPUT_DIR"], CFG["EXP_NAME"])

if not CFG["kaggle"] and not CFG["DEBUG"]:
    os.mkdir(CFG["EXP_DIR"])


# Utils

In [3]:
def seed_everything(seed=CFG["random_seed"]):
    #os.environ['PYTHONSEED'] = str(seed)
    np.random.seed(seed%(2**32-1))
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic =True
    torch.backends.cudnn.benchmark = False
seed_everything()

# device optimization
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [4]:
def asMinutes(s):
    """Convert Seconds to Minutes."""
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    """Accessing and Converting Time Data."""
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

class AverageMeter(object):
    """Computes and stores the average and current value."""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

# Dataset Functions

In [5]:
def add_feature_cols(df_, FEATURE_COLS,remove_col_list):
    additional_cols = list(df_.columns)
    additional_cols = [col for col in additional_cols if not col in remove_col_list]
    FEATURE_COLS.extend(additional_cols)
    return FEATURE_COLS

## target df func

In [6]:
def create_trackmerged_ftr(target_df, FEATURE_COLS,
                           diff_cols = ['x_position', 'y_position', 'speed', 'distance',
                                        'direction', 'orientation', 'acceleration', 'sa']):
#     target_df = target_df.with_column((np.sqrt(np.square(pl.col("x_position_1")-pl.col("x_position_2")) \
#                                               + np.square(pl.col("y_position_1")-pl.col("y_position_2"))) \
#                                       ).alias("players_dis"))
    target_df = target_df.with_column((np.sqrt(np.square(pl.col("x_position_1")*120. - pl.col("x_position_2")*120.) \
                                              + np.square(pl.col("y_position_1")*60. - pl.col("y_position_2")*60.)) \
                                      ).alias("players_dis"))
    target_df = target_df.with_column(pl.col("players_dis").fill_null(0))
    
    # players distance sum(in shift range time : default(-6~6 frames not step))
    players_distance_sum = 0
    for idx in range(-6,6,1):
        players_distance_sum += np.sqrt((target_df[f"x_position_shift{idx}_1"] - target_df[f"x_position_shift{idx}_2"])**2 \
                                       + (target_df[f"y_position_shift{idx}_1"] - target_df[f"y_position_shift{idx}_2"])**2)
    target_df = target_df.with_column(pl.Series("players_distance_sum", players_distance_sum))
    target_df = target_df.with_column(pl.col("players_distance_sum").fill_null(0))
    FEATURE_COLS.append("players_distance_sum")

    # players each axis distance sum(in shift range time : default(-6~6 frames not step))
    for axis in ["x", "y"]:
        axis_distance_1 = 0
        axis_distance_2 = 0
        for idx in range(-6, 5, 1):
            axis_distance_1 += abs(target_df[f"{axis}_position_shift{idx}_1"] - target_df[f"{axis}_position_shift{idx+1}_1"])
            axis_distance_2 += abs(target_df[f"{axis}_position_shift{idx}_2"] - target_df[f"{axis}_position_shift{idx+1}_2"])
        target_df = target_df.with_column(pl.Series(f"{axis}_move_distance_1", axis_distance_1))
        target_df = target_df.with_column(pl.col(f"{axis}_move_distance_1").fill_null(0))
        target_df = target_df.with_column(pl.Series(f"{axis}_move_distance_2", axis_distance_2))
        target_df = target_df.with_column(pl.col(f"{axis}_move_distance_2").fill_null(0))
        FEATURE_COLS.extend([f"{axis}_move_distance_1", f"{axis}_move_distance_2"])

    # players difference ftr (in each step)
    for col in diff_cols:
        colname = f"{col}_diff"
        target_df = target_df.with_column((abs(pl.col(f"{col}_1") - pl.col(f"{col}_2"))).alias(colname))
        target_df = target_df.with_column(pl.col(colname).fill_null(0))
        FEATURE_COLS.append(colname)
        
    return target_df, FEATURE_COLS

In [7]:
# -------------------------------
# replace polars
# polars groupby rolling (groupby_dynamics)
# https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_dynamic.html
# -------------------------------
def create_roll_ftr(target_df, FEATURE_COLS_,
                    key_cols = ["contact_id", "game_play", "nfl_player_id_1", 
                                "nfl_player_id_2", "datetime", "step"]):
    roll_df = target_df.select(roll_cols+key_cols)

    roll_df = roll_df.with_column(pl.concat_str([pl.col("game_play"),
                                                 pl.col("nfl_player_id_1"),
                                                 pl.col("nfl_player_id_2"),
                                                ], sep='_').alias('key'))
    # cast datetime
    roll_df = roll_df.with_column(pl.col('datetime').str.strptime(pl.Datetime,
                                                                          fmt="%+",
                                                                          strict=False
                                                                         ).alias('datetime'))
    # groupby rolling
    roll_df = roll_df.groupby_dynamic("step", every="1i", period="6i", by="key", closed="both").agg([pl.col(roll_cols).sum().suffix("_rollsum"), pl.col("contact_id")])
    roll_df = roll_df.with_column(pl.col("contact_id").apply(lambda x:x[0]))
    roll_df = roll_df.drop(["key", "step"])
    target_df = target_df.join(roll_df, on="contact_id", how="left")
    additional_cols = [col+"_rollsum" for col in roll_cols]
    FEATURE_COLS_.extend(additional_cols)
    
    del roll_df
    
    return target_df, FEATURE_COLS_

In [8]:
# -------------------------------
# replace polars
# -------------------------------
def create_helmetmerged_ftr(target_df, FEATURE_COLS_):
    # helmet center distance feature
    for view in ["Endzone", "Sideline"]:
        for p_id in ["1", "2"]: 
            # get helmet center
            target_df = target_df.with_column((pl.col(f"{view[0]}_left_{p_id}") + (pl.col(f"{view[0]}_width_{p_id}")//2)).alias(f"{view[0]}_Wcenter_{p_id}"))
            target_df = target_df.with_column((pl.col(f"{view[0]}_top_{p_id}") + (pl.col(f"{view[0]}_height_{p_id}")//2)).alias(f"{view[0]}_Hcenter_{p_id}"))
        # helmet center distance
        target_df = target_df.with_column((np.sqrt(np.square(pl.col(f"{view[0]}_Wcenter_1") - pl.col(f"{view[0]}_Wcenter_2")) \
                                                  + np.square(pl.col(f"{view[0]}_Hcenter_1") - pl.col(f"{view[0]}_Hcenter_2")))
                                          ).alias(f"{view[0]}_helmet_dis"))

        # GがNanになるので0にしておく
        target_df = target_df.with_column(pl.col(f"{view[0]}_helmet_dis").fill_null(0))
        FEATURE_COLS_.append(f"{view[0]}_helmet_dis")
    
    # helmet cols fillna(0) after get helmet distance 
    helmet_cols = ['E_left_1', 'E_width_1', 'E_top_1', 'E_height_1',
                   'E_left_2', 'E_width_2', 'E_top_2', 'E_height_2', 
                   'S_left_1','S_width_1', 'S_top_1', 'S_height_1', 
                   'S_left_2', 'S_width_2', 'S_top_2', 'S_height_2']
    target_df = target_df.with_column(pl.col(helmet_cols).fill_null(0))

    return target_df, FEATURE_COLS_

In [9]:
# -------------------------------
# replace polars
# -------------------------------
def get_categorical_ftr(target_df, FEATURE_COLS_):
    target_df = target_df.with_column((pl.col("nfl_player_id_2")== "G").cast(int).alias("is_ground"))
    
#     target_df["nfl_player_id_2"] = target_df["nfl_player_id_2"].replace("G", "99999").astype(np.int64) # when inference this is after cnn pred
    target_df = target_df.with_column((1 - ((pl.col("E_width_1")==0) & (pl.col("E_width_2")==0) \
                                          & (pl.col("S_width_1")==0) & (pl.col("S_width_2")==0)).cast(int)).alias("is_helmet"))
    target_df = target_df.with_column((1 - ((pl.col("E_width_1")==0) & (pl.col("E_width_2")==0)).cast(int)).alias("is_E_helmet"))
    target_df = target_df.with_column((1 - ((pl.col("S_width_1")==0) & (pl.col("S_width_2")==0)).cast(int)).alias("is_S_helmet"))
    target_df = target_df.with_column(((pl.col("is_E_helmet")==1) & (pl.col("is_S_helmet")==1)).cast(int).alias("both_helmet"))
    # set team 
    target_df = target_df.with_column(((pl.col("team_1")=="home").cast(int)).alias("team_1"))
    target_df = target_df.with_column(((pl.col("team_2")=="home").cast(int)).alias("team_2"))
    
    target_df = target_df.fill_null(0)
    target_df = target_df.fill_nan(0)
    FEATURE_COLS_.extend(["is_ground", "is_helmet"])
    return target_df, FEATURE_COLS_

## tracking df func

In [10]:
# -------------------------------
# replace polars
# -------------------------------
def get_tracking_shift(tracking_df_, shift_cols=CFG["SHIFT_COLS"], shift_nums=CFG["SHIFT_NUM"]):
    # get shift key
    #     tracking_df = tracking_df.with_column((pl.col('step')/10*59.94+5*59.94 + 5000).alias('frame_add'))
    tracking_df_ = tracking_df_.with_column(pl.concat_str([pl.col("game_play"),
                                                         pl.col("nfl_player_id"),
                                                        ], sep='_').alias('shift_key'))
    # get shift features
    SHIFT_COLS_ = []
    for num in shift_nums:
        tracking_df_ = tracking_df_.with_columns(
                            pl.col(shift_cols).shift(periods=num).over("shift_key").suffix(f"_shift{num}"))
    for col in shift_cols:
        colname = [f"{col}_shift{idx}" for idx in shift_nums]
        SHIFT_COLS_.extend(colname)
                    
    return tracking_df_, SHIFT_COLS_

In [11]:
# -------------------------------
# replace polars
# -------------------------------
def target_merge_tracking(target_df, tracking_df, FEATURE_COLS_, SHIFT_COLS_,
                          TRACKING_COLS_ = ["game_play", "nfl_player_id", "step", 
                                           "x_position", "y_position", "datetime",
                                           "speed","distance","direction","orientation",
                                           "acceleration","sa", "team", "jersey_number"]):
    target_df = target_df.with_column(pl.concat_str([pl.col("game_play"),
                                                     pl.col("step").cast(str),
                                                     pl.col("nfl_player_id_1"),
                                                    ], sep='_').alias('game_step_player_1'))
    target_df = target_df.with_column(pl.concat_str([pl.col("game_play"),
                                                     pl.col("step").cast(str),
                                                     pl.col("nfl_player_id_2"),
                                                    ], sep='_').alias('game_step_player_2'))
    
    # Norm cols
    norm_cols = ["x_position", "y_position", "direction", "orientation"]
    Norm_value_list = [120., 60., 360., 360.]
    for col, norm_val in zip(norm_cols, Norm_value_list):
        tracking_df = tracking_df.with_column((pl.col(col)/norm_val).alias(col))

    TRACKING_COLS_.extend(SHIFT_COLS_)
    # print(TRACKING_COLS_)
    tracking_df = tracking_df.select(TRACKING_COLS_)
    tracking_df = tracking_df.with_column(pl.concat_str([pl.col("game_play"),
                                                         pl.col("step").cast(str),
                                                         pl.col("nfl_player_id"),
                                                        ], sep='_').alias('game_step_player'))

    tracking_df = tracking_df.drop(["game_play", "step", "nfl_player_id", "datetime"])

    # merge tracking to target
    for player_id in [1,2]:
        tracking_player = tracking_df.select([pl.all().suffix(f"_{player_id}")])
        target_df = target_df.join(tracking_player, on=[f"game_step_player_{player_id}"], how="left")
        # add features col
        FEATURE_COLS_ = add_feature_cols(tracking_player, FEATURE_COLS_,
                                        [f"game_step_player_{player_id}", f"frame_{player_id}", f"datetime_{player_id}"])
    # drop col
    target_df = target_df.drop(["game_step_player_1", "game_step_player_2"])
    print(len(target_df.columns))
    print("original length", len(target_df))
    return target_df, FEATURE_COLS_

## helmet df func

In [12]:
# -------------------------------
# replace polars
# -------------------------------
def target_merge_helmet(target_df, helmet_df, FEATURE_COLS_, SHIFT_COLS_):
    print("original length", len(target_df))
    # set merge-key (game_frame_player_1,2) to merge helmet_df
    target_df = target_df.with_column(((pl.col("step")/10*59.94+5*59.94).cast(int)+1).alias("frame"))
    target_df = target_df.with_column(pl.concat_str([pl.col("game_play"),
                                                     pl.col("frame").cast(str),
                                                     pl.col("nfl_player_id_1"),
                                                    ], sep='_').alias('game_frame_player_1'))
    target_df = target_df.with_column(pl.concat_str([pl.col("game_play"),
                                                     pl.col("frame").cast(str),
                                                     pl.col("nfl_player_id_2"),
                                                    ], sep='_').alias('game_frame_player_2'))
    # set merge key
    helmet_df = helmet_df.with_column(pl.concat_str([pl.col("game_play"),
                                                     pl.col("frame").cast(str),
                                                     pl.col("nfl_player_id"),
                                                    ], sep='_').alias('game_frame_player'))

    # merge target df & helmet_df
    player_views = [[1, "Endzone"],[2, "Endzone"], [1, "Sideline"],[2, "Sideline"]]
    for player_id, view in player_views:
        helmet_view = helmet_df.filter(pl.col("view")==view)
#         helmet_view = helmet_view[["game_frame_player", "left", "width", "top", "height"]]
        helmet_view = helmet_view[["game_frame_player", "left", "width", "top", "height"]+SHIFT_COLS_]
        helmet_view = helmet_view.select(pl.all().suffix(f"_{player_id}"))
        helmet_view = helmet_view.select([pl.col(helmet_view.columns[0]), pl.col(helmet_view.columns[1:]).prefix(f"{view[0]}_")])
        target_df = target_df.join(helmet_view, on=f"game_frame_player_{player_id}", how="left")  
        # add features col
        FEATURE_COLS_ = add_feature_cols(helmet_view, FEATURE_COLS_, [f"game_frame_player_{player_id}"])
    
    del helmet_view
    print(len(target_df.columns))
    print("original length", len(target_df))
    return target_df, FEATURE_COLS_

In [13]:
def get_helmet_shift(helmet_df_, shift_cols=["w_center", "h_center"], shift_nums=CFG["SHIFT_NUM"]):
    # get shift key
    helmet_df_ = helmet_df_.with_column(pl.concat_str([pl.col("game_play"),
                                                         pl.col("nfl_player_id"),
                                                     ], sep='_').alias('shift_key'))
    
    helmet_df_ = helmet_df_.with_column(((pl.col("left") + (pl.col("width")//2))/640.).alias("w_center"))
    helmet_df_ = helmet_df_.with_column(((pl.col("top") + (pl.col("height")//2))/360.).alias("h_center"))
    # get shift features
    SHIFT_COLS_ = []
    for num in shift_nums:
        helmet_df_ = helmet_df_.with_columns(
                            pl.col(shift_cols).shift(periods=num).over("shift_key").suffix(f"_shift{num}"))
    for col in shift_cols:
        colname = [f"{col}_shift{idx}" for idx in shift_nums]
        SHIFT_COLS_.extend(colname)
                    
    return helmet_df_, SHIFT_COLS_

---

# Load Target

In [14]:
target_dtypes = {'contact_id':str, 
                'game_play':str,
                'datetime':str,
                'step':int,
                'nfl_player_id_1':str,
                'nfl_player_id_2':str,
                'contact':int,
                }
target_df = pl.read_csv(CFG["TRAIN_LABEL_CSV"], dtypes=target_dtypes)    

FEATURE_COLS = ["nfl_player_id_1", "nfl_player_id_2", "step"]
# display(target_df)

# Merge tracking_df

In [15]:
# -------------------------------
# replace polars
# -------------------------------
tracking_df = pl.read_csv(CFG["TRAIN_TRACKING_CSV"])
tracking_df, SHIFT_COLS = get_tracking_shift(tracking_df)
target_df, FEATURE_COLS = target_merge_tracking(target_df, tracking_df, FEATURE_COLS, SHIFT_COLS)
# display(tracking_df.filter((pl.col("game_play")=="58580_001136") & (pl.col("nfl_player_id").cast(str)=="44830")))# ちゃんとshiftできてそう
del tracking_df

target_df, FEATURE_COLS = create_trackmerged_ftr(target_df, FEATURE_COLS)

SHIFT_FEATURES = []
for col in SHIFT_COLS:
    SHIFT_FEATURES.append(f"{col}_1")
for col in SHIFT_COLS:
    SHIFT_FEATURES.append(f"{col}_2")

219
original length 4721618


# Exclude distance 10

In [16]:
print(len(target_df))
target_df = target_df.filter(pl.col("players_dis") <= 10.0)
print(len(target_df))
gc.collect()
torch.cuda.empty_cache()

4721618
2368536


# Merge helmet df

In [17]:
helmet_df = pl.read_csv(CFG["TRAIN_HELMET_CSV"])
helmet_df, SHIFT_COLS = get_helmet_shift(helmet_df)
target_df, FEATURE_COLS = target_merge_helmet(target_df, helmet_df, FEATURE_COLS, SHIFT_COLS)
del helmet_df

target_df, FEATURE_COLS = create_helmetmerged_ftr(target_df, FEATURE_COLS)
target_df, FEATURE_COLS = get_categorical_ftr(target_df, FEATURE_COLS)
print(len(target_df))

original length 2368536
348
original length 2368536
2368536


# Reduce Data

In [18]:
target_df = target_df.to_pandas()

print(len(target_df))
print(len(target_df["game_play"].unique()))
display(target_df["contact"].value_counts())

2368536
240


0    2304014
1      64522
Name: contact, dtype: int64

In [19]:
target_df[SHIFT_FEATURES]

Unnamed: 0,x_position_shift-6_1,x_position_shift-5_1,x_position_shift-4_1,x_position_shift-3_1,x_position_shift-2_1,x_position_shift-1_1,x_position_shift0_1,x_position_shift1_1,x_position_shift2_1,x_position_shift3_1,...,sa_shift-4_2,sa_shift-3_2,sa_shift-2_2,sa_shift-1_2,sa_shift0_2,sa_shift1_2,sa_shift2_2,sa_shift3_2,sa_shift4_2,sa_shift5_2
0,40.74,40.65,40.57,40.49,40.43,40.38,40.33,40.30,40.27,40.25,...,0.71,1.25,1.58,1.62,1.74,1.18,0.86,0.51,0.27,0.15
1,40.74,40.65,40.57,40.49,40.43,40.38,40.33,40.30,40.27,40.25,...,1.47,1.51,1.80,0.90,0.81,0.72,0.60,0.12,-0.09,-0.09
2,40.74,40.65,40.57,40.49,40.43,40.38,40.33,40.30,40.27,40.25,...,3.06,2.71,2.29,1.58,1.10,0.71,0.37,0.21,0.05,0.19
3,40.74,40.65,40.57,40.49,40.43,40.38,40.33,40.30,40.27,40.25,...,1.75,1.81,1.80,1.56,1.23,-1.18,-1.04,-0.87,-0.69,-0.44
4,40.74,40.65,40.57,40.49,40.43,40.38,40.33,40.30,40.27,40.25,...,1.22,1.11,1.33,1.37,1.39,1.20,0.87,0.43,0.02,-0.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2368531,34.81,34.54,34.28,34.02,33.75,33.46,33.18,32.92,32.67,32.42,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2368532,40.37,40.03,39.66,39.33,38.96,38.60,38.25,37.88,37.52,37.15,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2368533,24.87,24.66,24.48,24.28,24.07,23.88,23.69,23.49,23.33,23.20,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2368534,62.85,62.82,62.80,62.77,62.72,62.67,62.63,62.58,62.53,62.48,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


# Dataset

In [20]:
class NFLDataset(Dataset):
    def __init__(self, target_df):
        self.target_df = target_df
        # player 1
        self.xpos_1 = target_df[CFG["xpos_1"]].values
        self.ypos_1 = target_df[CFG["ypos_1"]].values
        self.speed_1 = target_df[CFG["speed_1"]].values
        self.dire_1 = target_df[CFG["dire_1"]].values
        self.orie_1 = target_df[CFG["orie_1"]].values
        self.acc_1 = target_df[CFG["acc_1"]].values
        self.sa_1 = target_df[CFG["sa_1"]].values
        self.dis_1 = target_df[CFG["dis_1"]].values
        self.E_wcen_1 = target_df[CFG["E_wcenter_1"]].values
        self.E_hcen_1 = target_df[CFG["E_hcenter_1"]].values
        self.S_wcen_1 = target_df[CFG["S_wcenter_1"]].values
        self.S_hcen_1 = target_df[CFG["S_hcenter_1"]].values
        # player 2
        self.xpos_2 = target_df[CFG["xpos_2"]].values
        self.ypos_2 = target_df[CFG["ypos_2"]].values
        self.speed_2 = target_df[CFG["speed_2"]].values
        self.dire_2 = target_df[CFG["dire_2"]].values
        self.orie_2 = target_df[CFG["orie_2"]].values
        self.acc_2 = target_df[CFG["acc_2"]].values
        self.sa_2 = target_df[CFG["sa_2"]].values
        self.dis_2 = target_df[CFG["dis_2"]].values
        self.E_wcen_2 = target_df[CFG["E_wcenter_2"]].values
        self.E_hcen_2 = target_df[CFG["E_hcenter_2"]].values
        self.S_wcen_2 = target_df[CFG["S_wcenter_2"]].values
        self.S_hcen_2 = target_df[CFG["S_hcenter_2"]].values

    def __len__(self):
        return len(self.target_df)

    def __getitem__(self, idx):
        target_info = self.target_df.iloc[idx]
        target = target_info.contact
        # player 1
        xpos_1 = self.xpos_1[idx]
        ypos_1 = self.ypos_1[idx]
        speed_1 = self.speed_1[idx]
        dire_1 = self.dire_1[idx]
        orie_1 = self.orie_1[idx]
        acc_1 = self.acc_1[idx]
        sa_1 = self.sa_1[idx]
        dis_1 = self.dis_1[idx]
        E_wcenter_1 = self.E_wcen_1[idx]
        E_hcenter_1 = self.E_hcen_1[idx]
        S_wcenter_1 = self.S_wcen_1[idx]
        S_hcenter_1 = self.S_hcen_1[idx]
        # player 2
        xpos_2 = self.xpos_2[idx]
        ypos_2 = self.ypos_2[idx]
        speed_2 = self.speed_2[idx]
        dire_2 = self.dire_2[idx]
        orie_2 = self.orie_2[idx]
        acc_2 = self.acc_2[idx]
        sa_2 = self.sa_2[idx]
        dis_2 = self.dis_2[idx]
        E_wcenter_2 = self.E_wcen_2[idx]
        E_hcenter_2 = self.E_hcen_2[idx]
        S_wcenter_2 = self.S_wcen_2[idx]
        S_hcenter_2 = self.S_hcen_2[idx]
                
        pos_features = np.concatenate([xpos_1[np.newaxis, :], ypos_1[np.newaxis, :],xpos_2[np.newaxis, :], ypos_2[np.newaxis, :]])
        E_helcenter_features = np.concatenate([E_wcenter_1[np.newaxis, :], E_wcenter_2[np.newaxis, :],
                                               E_hcenter_1[np.newaxis, :], E_hcenter_2[np.newaxis, :]])
        S_helcenter_features = np.concatenate([S_wcenter_1[np.newaxis, :], S_wcenter_2[np.newaxis, :],
                                               S_hcenter_1[np.newaxis, :], S_hcenter_2[np.newaxis, :]])
                
        speed_features = np.concatenate([speed_1[np.newaxis, :], speed_2[np.newaxis, :]]) 
        dire_features = np.concatenate([dire_1[np.newaxis, :], dire_2[np.newaxis, :]])
        orie_features = np.concatenate([orie_1[np.newaxis, :], orie_2[np.newaxis, :]])
        acc_features = np.concatenate([acc_1[np.newaxis, :], acc_2[np.newaxis, :]])
        sa_features = np.concatenate([sa_1[np.newaxis, :], sa_2[np.newaxis, :]])
        dis_features = np.concatenate([dis_1[np.newaxis, :], dis_2[np.newaxis, :]])
        
        
        return pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, E_helcenter_features, S_helcenter_features, target
#         return pos_features, speed_features, dire_features, orie_features, acc_features, target

In [21]:
if CFG["DEBUG"]:
    check_batch = 4
    pick_df = target_df.copy()
    train_dataset = NFLDataset(pick_df)
    train_loader = DataLoader(
        train_dataset,
        batch_size = check_batch,
        shuffle = False,
        num_workers = CFG["num_workers"],
        pin_memory = True
    )
    for batch_idx, (pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, E_helcenter_features, S_helcenter_features, targets) in enumerate(train_loader):
#     for batch_idx, (pos_features, speed_features, dire_features, orie_features, acc_features, targets) in enumerate(train_loader):
        print("pos_features shape =",pos_features.shape)
        print("speed_features shape =",speed_features.shape)
        print("dire_features shape =",dire_features.shape)
        print("orie_features shape =",orie_features.shape)
        print("acc_features shape =",acc_features.shape)
        print("sa_features shape =",sa_features.shape)
        print("dis_features shape =",dis_features.shape)
        print("E_helcenter_features shape =",E_helcenter_features.shape)
        print("S_helcenter_features shape =",S_helcenter_features.shape)
        print("target shape", targets.shape)
        break

# Model

In [22]:
class NFLTrackNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.pos_conv = nn.Sequential(
                        nn.Conv1d(4, 1, 5), # x_pos, y_pos
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )
        self.speed_conv = nn.Sequential(
                        nn.Conv1d(2, 1, 5),
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )
        self.dire_conv = nn.Sequential(
                        nn.Conv1d(2, 1, 5),
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )

        self.orie_conv = nn.Sequential(
                        nn.Conv1d(2, 1, 5),
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )
        self.acc_conv = nn.Sequential(
                        nn.Conv1d(2, 1, 5),
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )
        self.sa_conv = nn.Sequential(
                        nn.Conv1d(2, 1, 5),
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )
        self.dis_conv = nn.Sequential(
                        nn.Conv1d(2, 1, 5),
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )
        self.E_hel_conv = nn.Sequential(
                        nn.Conv1d(4, 1, 5), # wcenter, hcenter
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )
        self.S_hel_conv = nn.Sequential(
                        nn.Conv1d(4, 1, 5), # wcenter, hcenter
                        nn.Linear(len(CFG["SHIFT_NUM"])-4, CFG["num_track_features"]),
                        nn.ReLU(),
                    )

        self.emb = nn.Linear(CFG["num_track_features"]*9, CFG["num_track_features"])
        self.fc = nn.Linear(CFG["num_track_features"], 1)

    def forward(self, pos, speed, dire, orie, acc, sa, dis, e_hel, s_hel):
        pos = self.pos_conv(pos)
        speed = self.speed_conv(speed)
        dire = self.dire_conv(dire)
        orie = self.orie_conv(orie)
        acc = self.acc_conv(acc)
        sa = self.sa_conv(sa)
        dis = self.sa_conv(dis)
        e_hel = self.E_hel_conv(e_hel)
        s_hel = self.S_hel_conv(s_hel)
        
        pos = torch.squeeze(pos, dim=1)
        speed = torch.squeeze(speed, dim=1)
        dire = torch.squeeze(dire, dim=1)
        orie = torch.squeeze(orie, dim=1)
        acc = torch.squeeze(acc, dim=1)
        sa = torch.squeeze(sa, dim=1)
        dis = torch.squeeze(dis, dim=1)
        e_hel = torch.squeeze(e_hel, dim=1)
        s_hel = torch.squeeze(s_hel, dim=1)
        
        embeddings = self.emb(torch.cat([pos, speed, dire, orie, acc, sa, dis, e_hel, s_hel], dim=1))
#         embeddings = self.emb(torch.cat([pos, speed, dire, orie, acc], dim=1))
        output = self.fc(embeddings)
        return output, embeddings

# train fn

In [23]:
def train_fn(train_loader, model, criterion, epoch ,optimizer, scheduler):
    model.train()
    batch_time = AverageMeter()
    losses = AverageMeter()
    start, end = time.time(), time.time()
    for batch_idx, (pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, e_hel_features, s_hel_features,targets) in enumerate(train_loader):
        pos_features = pos_features.to(device, non_blocking = True).float()
        speed_features = speed_features.to(device, non_blocking = True).float()
        dire_features = dire_features.to(device, non_blocking = True).float()
        orie_features = orie_features.to(device, non_blocking = True).float()
        acc_features = acc_features.to(device, non_blocking = True).float()
        sa_features = sa_features.to(device, non_blocking = True).float()
        dis_features = dis_features.to(device, non_blocking = True).float()
        e_hel_features = e_hel_features.to(device, non_blocking = True).float()
        s_hel_features = s_hel_features.to(device, non_blocking = True).float()
        targets = targets.to(device, non_blocking = True).float().view(-1, 1)      
        preds, _ = model(pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, e_hel_features, s_hel_features)

        loss = criterion(preds, targets)
        losses.update(loss.item(), CFG["batch_size"]) 
        targets = targets.detach().cpu().numpy().ravel().tolist()
        preds = torch.sigmoid(preds).detach().cpu().numpy().ravel().tolist()

        loss.backward() # パラメータの勾配を計算
        optimizer.step() # モデル更新
        optimizer.zero_grad() # 勾配の初期化
                
        batch_time.update(time.time() - end)
        end = time.time()
        if batch_idx % CFG["print_freq"] == 0 or batch_idx == (len(train_loader)-1):
            print('\t Epoch: [{0}][{1}/{2}] '
                    'Elapsed {remain:s} '
                    'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                    .format(
                        epoch, batch_idx, len(train_loader), batch_time=batch_time, loss=losses,
                        remain=timeSince(start, float(batch_idx+1)/len(train_loader)),
            ))
        del preds, pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, e_hel_features, s_hel_features, targets
    gc.collect()
    torch.cuda.empty_cache()
    return losses.avg

# valid fn

In [24]:
def valid_fn(model, valid_loader, criterion):
    model.eval()# モデルを検証モードに設定
    test_targets = []
    test_preds = []
    track_embs = []

    batch_time = AverageMeter()
    losses = AverageMeter()
    start, end = time.time(), time.time()
    view_list = []
    for batch_idx, (pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, e_hel_features, s_hel_features, targets) in enumerate(valid_loader):
#     for batch_idx, (pos_features, speed_features, dire_features, orie_features, acc_features, targets) in enumerate(valid_loader):
        pos_features = pos_features.to(device, non_blocking = True).float()
        speed_features = speed_features.to(device, non_blocking = True).float()
        dire_features = dire_features.to(device, non_blocking = True).float()
        orie_features = orie_features.to(device, non_blocking = True).float()
        acc_features = acc_features.to(device, non_blocking = True).float()
        sa_features = sa_features.to(device, non_blocking = True).float()
        dis_features = dis_features.to(device, non_blocking = True).float()
        e_hel_features = e_hel_features.to(device, non_blocking = True).float()
        s_hel_features = s_hel_features.to(device, non_blocking = True).float()
        targets = targets.to(device, non_blocking = True).float().view(-1, 1)      
        with torch.no_grad():
            preds, track_emb = model(pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, e_hel_features, s_hel_features)
            loss = criterion(preds, targets)
        losses.update(loss.item(), CFG["batch_size"])
        batch_time.update(time.time() - end)

        track_emb = track_emb.detach().cpu().numpy()
        track_embs.extend(track_emb)

        targets = targets.detach().cpu().numpy().ravel().tolist()
        preds = torch.sigmoid(preds).detach().cpu().numpy().ravel().tolist()

        test_preds.extend(preds)
        test_targets.extend(targets)
        # score = matthews_corrcoef(preds, targets)
        if batch_idx % CFG["print_freq"] == 0 or batch_idx == (len(valid_loader)-1):
            print('\t EVAL: [{0}/{1}] '
                'Elapsed {remain:s} '
                'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                .format(
                    batch_idx, len(valid_loader), batch_time=batch_time, loss=losses,
                    remain=timeSince(start, float(batch_idx+1)/len(valid_loader)),
                ))
        del preds, pos_features, speed_features, dire_features, orie_features, acc_features, sa_features, dis_features, e_hel_features, s_hel_features, targets
        gc.collect()
        torch.cuda.empty_cache()
    test_preds = np.array(test_preds)
    test_targets = np.array(test_targets)
    return test_targets, test_preds, track_embs, losses.avg

# Train loop

In [25]:
def training_loop(target_df):
    # set model & learning fn
    oof_df = pd.DataFrame()
    kf = GroupKFold(n_splits=CFG["n_folds"])
    for fold, (idx_train, idx_valid) in enumerate(kf.split(target_df, target_df["contact_id"], target_df["game_play"])):
        print("---")
        print(f"fold {fold} start training...")
        model = NFLTrackNet()
        model = model.to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = AdamW(model.parameters(), lr=CFG["lr"], weight_decay=CFG["weight_decay"], amsgrad=False)
        scheduler = CosineAnnealingLR(optimizer, T_max=CFG["T_max"], eta_min=CFG["min_lr"], last_epoch=-1)

        if not fold in CFG["train_folds"]:
            print(f"fold{fold} is skip")
            continue
        # separate train/valid data 
        train_df = target_df.iloc[idx_train]
        valid_df = target_df.iloc[idx_valid]
        print("train target contact")
        print(train_df["contact"].value_counts())
        print("valid target contact")
        print(valid_df["contact"].value_counts())
        # separate train/valid data 
        train_dataset = NFLDataset(train_df)
        valid_dataset = NFLDataset(valid_df)
        train_loader = DataLoader(train_dataset,batch_size=CFG["batch_size"], shuffle = True,
                                    num_workers = CFG["num_workers"], pin_memory = True)
        valid_loader = DataLoader(valid_dataset,batch_size=CFG["batch_size"], shuffle = False,
                                    num_workers = CFG["num_workers"], pin_memory = True)

        # training
        best_score = -np.inf
        best_auc = -np.inf
        start_time, end = time.time(), time.time()
        for epoch in range(1, CFG["n_epoch"] + 1):
            print(f'\t === epoch: {epoch}: training ===')
            train_loss_avg = train_fn(train_loader, model, criterion, epoch, optimizer, scheduler)
            valid_targets, valid_preds, valid_embs, valid_loss_avg = valid_fn(model, valid_loader, criterion)

            valid_score = -np.inf
            valid_threshold = 0
            tn_best, fp_best, fn_best, tp_best = 0, 0, 0, 0
            for idx in range(1, 10, 1):
                thr = idx*0.1
                valid_targets = (np.array(valid_targets) > thr).astype(np.int32)
                valid_binary_preds = (np.array(valid_preds) > thr).astype(np.int32)
                score_tmp = matthews_corrcoef(valid_targets, valid_binary_preds)
                cm = confusion_matrix(valid_targets, valid_binary_preds)
                tn, fp, fn, tp = cm.flatten()
                if score_tmp > valid_score:
                    valid_score = score_tmp 
                    valid_threshold = thr
                    tn_best, fp_best, fn_best, tp_best = tn, fp, fn, tp
            elapsed = (time.time() - start_time)/60
            auc_score = roc_auc_score(valid_targets, valid_preds)
            print(f'\t epoch:{epoch}, avg train loss:{train_loss_avg:.4f}, avg valid loss:{valid_loss_avg:.4f}')
            print(f'\t score:{valid_score:.4f}(th={valid_threshold}) AUC = {auc_score:.4f}=> time:{elapsed:.2f} min')
            scheduler.step()
            # validationスコアがbestを更新したらモデルを保存する
            if valid_score > best_score:
                best_score = valid_score
                torch.save(model.state_dict(), f'track_fold{fold}.pth')
                print(f'\t Epoch {epoch} - Save Best Score: {best_score:.4f}. Model is saved.')
                contact_id = valid_df["contact_id"].values
                _oof_df = pd.DataFrame({
                    "contact_id" : contact_id,
                    "pred" : valid_preds,
                    "contact" : valid_targets,
                    "fold" : fold,
                })
                track_emb_colname = [f"track_emb_{idx}" for idx in range(CFG["num_track_features"])]
                track_emb_df = pd.DataFrame(valid_embs, columns=track_emb_colname)
                _oof_df = pd.concat([_oof_df, track_emb_df], axis=1)
            
        del train_loader, train_dataset, valid_loader, valid_dataset
        oof_df = pd.concat([oof_df, _oof_df], axis = 0)
        del _oof_df
        gc.collect()
        torch.cuda.empty_cache()
    return oof_df

In [26]:
oof_df = training_loop(target_df)

---
fold 0 start training...
train target contact
0    1535600
1      43152
Name: contact, dtype: int64
valid target contact
0    768414
1     21370
Name: contact, dtype: int64
	 === epoch: 1: training ===
	 Epoch: [1][0/3084] Elapsed 0m 1s (remain 63m 14s) Loss: 0.9790(0.9790) 
	 Epoch: [1][1000/3084] Elapsed 2m 32s (remain 5m 17s) Loss: 0.1378(0.1134) 
	 Epoch: [1][2000/3084] Elapsed 4m 58s (remain 2m 41s) Loss: 0.0820(0.1050) 
	 Epoch: [1][3000/3084] Elapsed 7m 22s (remain 0m 12s) Loss: 0.0607(0.1008) 
	 Epoch: [1][3083/3084] Elapsed 7m 35s (remain 0m 0s) Loss: 0.1136(0.1007) 
	 EVAL: [0/1543] Elapsed 0m 0s (remain 2m 49s) Loss: 0.0110(0.0110) 
	 EVAL: [1000/1543] Elapsed 3m 0s (remain 1m 37s) Loss: 0.0505(0.0876) 
	 EVAL: [1542/1543] Elapsed 4m 44s (remain 0m 0s) Loss: 0.0192(0.0911) 
	 epoch:1, avg train loss:0.1007, avg valid loss:0.0911
	 score:0.3300(th=0.1) AUC = 0.8895=> time:12.40 min
	 Epoch 1 - Save Best Score: 0.3300. Model is saved.
	 === epoch: 2: training ===
	 Epoch: 

# Save oof_df

In [27]:
display(oof_df)
oof_filename = os.path.join(CFG["OUTPUT_DIR"], CFG["EXP_NAME"], "oof_df.csv")
oof_df.to_csv(oof_filename, index=False)

Unnamed: 0,contact_id,pred,contact,fold,track_emb_0,track_emb_1,track_emb_2
0,58174_001792_0_47818_47961,0.001198,0,0,5.411310,8.839700,10.320980
1,58174_001792_0_47818_52733,0.000253,0,0,5.754009,10.351372,13.157951
2,58174_001792_0_47818_52450,0.000015,0,0,8.363820,14.156092,17.675667
3,58174_001792_0_47818_47872,0.000187,0,0,4.883364,9.007547,14.712217
4,58174_001792_0_47961_52733,0.000204,0,0,5.418433,9.920938,13.937220
...,...,...,...,...,...,...,...
789378,58579_003527_81_43986_G,0.108529,0,2,0.629529,-0.867118,2.214957
789379,58579_003527_81_35443_G,0.003999,0,2,-3.513778,-7.284797,7.543453
789380,58579_003527_81_40008_G,0.010786,0,2,-2.445926,-5.070010,6.242380
789381,58579_003527_81_47819_G,0.000849,0,2,-5.828184,-8.966326,10.939510


In [28]:
for idx in range(1, 10, 1):
    thr = idx*0.1
    valid_targets = (np.array(oof_df["contact"]) > thr).astype(np.int32)
    valid_binary_preds = (np.array(oof_df["pred"]) > thr).astype(np.int32)
    score = matthews_corrcoef(valid_targets, valid_binary_preds)
    print(f"threshold={thr:.3f}, score={score:.5f}")

threshold=0.100, score=0.37858
threshold=0.200, score=0.40341
threshold=0.300, score=0.36964
threshold=0.400, score=0.30380
threshold=0.500, score=0.22572
threshold=0.600, score=0.15206
threshold=0.700, score=0.08852
threshold=0.800, score=0.04406
threshold=0.900, score=0.00748


In [29]:
alldata_df = pd.read_csv(CFG["TRAIN_LABEL_CSV"])
alldata_df = alldata_df.merge(oof_df[["contact_id", "pred"]], on="contact_id", how="left")

In [30]:
for idx in range(1, 10, 1):
    thr = idx*0.1
    valid_targets = (np.array(alldata_df["contact"]) > thr).astype(np.int32)
    valid_binary_preds = (np.array(alldata_df["pred"]) > thr).astype(np.int32)
    score = matthews_corrcoef(valid_targets, valid_binary_preds)
    print(f"threshold={thr:.3f}, score={score:.5f}")

threshold=0.100, score=0.39182
threshold=0.200, score=0.41277
threshold=0.300, score=0.37672
threshold=0.400, score=0.30910
threshold=0.500, score=0.22952
threshold=0.600, score=0.15465
threshold=0.700, score=0.09015
threshold=0.800, score=0.04501
threshold=0.900, score=0.00807


In [31]:
import yaml

yaml_file = os.path.join(CFG["OUTPUT_DIR"], CFG["EXP_NAME"],"configuration.yaml")
with open(yaml_file, "w") as f:
    yaml.dump(CFG, f)