V6 : 全ての学習データ（4M 721K行）を使用する代わりに、距離が2より大きい行を除外することができます。

V8 : グラウンドの欠損値処理

In [None]:
# Experiment tag: it names the Drive sub-folders created later for this run's
# model weights and out-of-fold predictions.
dir_name = 'xgb24'

In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used by
# later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!nvidia-smi

Wed Mar  1 06:40:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    55W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import torch

class Config:
    # Experiment-level constants (seed value and fold count).
    seed, num_fold = 42, 5

    # XGBoost parameter dictionary; the tree builder is chosen from CUDA
    # availability at class-definition time.
    xgb_params = dict(
        objective='binary:logistic',
        eval_metric='auc',
        learning_rate=0.03,
        tree_method='gpu_hist' if torch.cuda.is_available() else 'hist',
    )

In [None]:
import random

import numpy as np

def seed_torch(seed=1029):
    """Seed every random number generator this notebook touches.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable, seeds
    Python's ``random``, NumPy and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(42)

In [None]:
%%capture
!pip install -q cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install -q cuml-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install -q cugraph-cu11 --extra-index-url=https://pypi.nvidia.com

In [None]:
!pip install -q xgboost==1.6.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import gc
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from IPython.display import Video, display

from scipy.optimize import minimize
import cv2
from glob import glob
from tqdm import tqdm

from sklearn.model_selection import GroupKFold
from sklearn.metrics import (
    roc_auc_score,
    matthews_corrcoef,
)

import xgboost as xgb

import torch

if torch.cuda.is_available():
    import cupy 
    import cudf
    from cuml import ForestInference
    
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import ADASYN
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 500)

In [None]:
# Per-experiment output folders on Drive: one for model weights, one for
# out-of-fold predictions. Each is created if it does not exist yet.
MODELS_PATH = f'/content/drive/MyDrive/kaggle/NFL/weights/{dir_name}'
os.makedirs(MODELS_PATH, exist_ok=True)

OOF_PATH = f'/content/drive/MyDrive/kaggle/NFL/oof/{dir_name}'
os.makedirs(OOF_PATH, exist_ok=True)

def setup(cfg):
    """Attach the runtime device and the project directories to ``cfg``.

    Sets ``device``, ``INPUT``, ``EXP``, ``EXP_MODEL`` and ``EXP_PREDS`` as
    attributes on ``cfg`` (mutating it) and returns the same object.
    """
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    cfg.device = torch.device(device_name)

    # Project root and raw-data folder on Drive.
    cfg.EXP = '/content/drive/MyDrive/kaggle/NFL'
    cfg.INPUT = '/content/drive/MyDrive/kaggle/NFL/dataset/original'

    # Output folders come from the module-level paths.
    cfg.EXP_MODEL, cfg.EXP_PREDS = MODELS_PATH, OOF_PATH

    return cfg

In [None]:
# ==============================
# function
# ==============================
# ref: https://www.kaggle.com/code/robikscube/nfl-player-contact-detection-getting-started
def add_contact_id(df):
    """Add a ``contact_id`` column built as game_play_step_player1_player2.

    The column is written onto ``df`` in place and ``df`` is returned.
    """
    contact_id = df["game_play"]
    # Append the remaining components left to right, each cast to string.
    for component in ("step", "nfl_player_id_1", "nfl_player_id_2"):
        contact_id = contact_id + "_" + df[component].astype("str")
    df["contact_id"] = contact_id
    return df

def expand_contact_id(df):
    """
    Splits out contact_id into separate columns.

    Adds ``game_play``, ``step`` (cast to int), ``nfl_player_id_1`` and
    ``nfl_player_id_2`` (both left as strings) to ``df`` in place and returns
    ``df``. ``contact_id`` is expected to look like
    ``<game_play>_<step>_<player_1>_<player_2>``, where ``game_play`` may
    itself contain underscores.
    """
    # Split once from the right into [game_play, step, player_1, player_2].
    # This keeps the underscore inside game_play and does not assume that
    # game_play is exactly 12 characters wide.
    parts = df["contact_id"].str.rsplit("_", n=3)
    df["game_play"] = parts.str[0]
    df["step"] = parts.str[1].astype("int")
    df["nfl_player_id_1"] = parts.str[2]
    df["nfl_player_id_2"] = parts.str[3]
    return df

# cross validation
def get_groupkfold(train, target_col, group_col, n_splits):
    """Assign a GroupKFold fold id to every row of ``train``.

    Returns a Series whose index holds the positional row numbers produced by
    the splitter and whose values are the fold in which that row is used for
    validation.
    """
    splitter = GroupKFold(n_splits=n_splits)
    splits = splitter.split(train, train[target_col], train[group_col])
    # Only the validation indices matter: each row is validated in one fold.
    per_fold = [
        pd.Series(fold_id, index=valid_positions)
        for fold_id, (_, valid_positions) in enumerate(splits)
    ]
    return pd.concat(per_fold).sort_index()

# xgboost code
def fit_xgboost(cfg, X, y, params, add_suffix=''):
    """Train one XGBoost model per fold and collect out-of-fold predictions.

    Parameters
    ----------
    cfg : object providing ``folds`` (per-row fold ids; rows with fold ``-1``
        are never used for validation), ``EXP_MODEL`` (directory for saved
        models) and ``EXP_PREDS`` (directory for the saved OOF array).
    X : feature table, indexable by a boolean mask and exposing ``.index``.
        Its index labels are used as positions into the OOF array, so a
        default 0..n-1 index is assumed -- TODO confirm at the call site.
    y : labels, indexable by the same boolean mask.
    params : XGBoost parameter dict, for example::

            xgb_params = {
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'learning_rate':0.01,
                'tree_method':'gpu_hist'
            }
    add_suffix : string appended to the saved model and OOF file names.

    Returns
    -------
    numpy.ndarray of float32 with one out-of-fold prediction per row of ``y``.
    """
    use_gpu = torch.cuda.is_available()
    oof_pred = np.zeros(len(y), dtype=np.float32)
    for fold in sorted(cfg.folds.unique()):
        if fold == -1: continue
        idx_train = (cfg.folds!=fold)
        idx_valid = (cfg.folds==fold)
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        display(pd.Series(y_valid).value_counts())
        #################################################
        # Oversample the training split only; validation keeps its original
        # class balance.
        sm = SMOTE(random_state=42)
        x_train, y_train = sm.fit_resample(x_train, y_train)
        #x_valid, y_valid = sm.fit_resample(x_valid, y_valid)
        #################################################
        #weight_train = compute_sample_weight(class_weight={0:2,1:1}, y=y_train)
        #weight_valid = compute_sample_weight(class_weight={0:2,1:1}, y=y_valid)
        #################################################
        xgb_train = xgb.DMatrix(x_train, label=y_train)#,weight=weight_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)#,weight=weight_valid)
        evals = [(xgb_train,'train'),(xgb_valid,'eval')]

        model = xgb.train(
            params,
            xgb_train,
            num_boost_round=10_000,
            early_stopping_rounds=100,
            evals=evals,
            verbose_eval=100,
        )

        model_path = os.path.join(cfg.EXP_MODEL, f'xgb_fold{fold}{add_suffix}.model')
        model.save_model(model_path)

        # Predict with the model reloaded from disk, so the score reflects the
        # saved artefact.
        if use_gpu:
            model = ForestInference.load(model_path, output_class=True, model_type='xgboost')
            pred_i = model.predict_proba(x_valid).values[:, 1]
        else:
            # Booster.load_model() loads in place and returns None, so the
            # Booster must be kept in its own variable. Booster has no
            # predict_proba; predict() on a DMatrix is used instead (for
            # 'binary:logistic' this is the positive-class probability).
            model = xgb.Booster()
            model.load_model(model_path)
            pred_i = model.predict(xgb_valid)
        oof_pred[x_valid.index] = pred_i
        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model; gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred{add_suffix}'), oof_pred)
    score = round(roc_auc_score(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    return oof_pred

def pred_xgboost(X, data_dir, add_suffix=''):
    """Average the predictions of every saved fold model for ``X``.

    Parameters
    ----------
    X : feature table to score.
    data_dir : directory searched for ``xgb_fold*{add_suffix}.model`` files.
    add_suffix : suffix used when the models were saved.

    Returns
    -------
    numpy.ndarray holding the mean prediction across the fold models.

    Raises
    ------
    FileNotFoundError
        If no model file matches; the mean of zero models would otherwise be
        a silent NaN.
    """
    model_paths = sorted(glob(os.path.join(data_dir, f'xgb_fold*{add_suffix}.model')))
    if not model_paths:
        raise FileNotFoundError(
            f'no model files matching xgb_fold*{add_suffix}.model in {data_dir}'
        )

    if torch.cuda.is_available():
        models = [ForestInference.load(path, output_class=True, model_type='xgboost') for path in model_paths]
        preds = np.array([model.predict_proba(X)[:, 1] for model in models])
    else:
        # Booster.load_model() loads in place and returns None, and Booster
        # has no predict_proba; load explicitly and call predict() on a
        # DMatrix instead.
        features = xgb.DMatrix(X)
        fold_preds = []
        for path in model_paths:
            booster = xgb.Booster()
            booster.load_model(path)
            fold_preds.append(booster.predict(features))
        preds = np.array(fold_preds)

    preds = np.mean(preds, axis=0)
    return preds

In [None]:
# ==============================
# read data
# ==============================
cfg = setup(Config)

# Both backends are called with identical read_csv arguments, so pick the
# module once: cuDF when CUDA is available, pandas otherwise.
frame_lib = cudf if torch.cuda.is_available() else pd

#tracking data
tr_tracking = frame_lib.read_csv(os.path.join(cfg.INPUT, 'train_player_tracking.csv'), parse_dates=["datetime"])
te_tracking = frame_lib.read_csv(os.path.join(cfg.INPUT, 'test_player_tracking.csv'), parse_dates=["datetime"])
# helmets data
tr_helmets = frame_lib.read_csv(os.path.join(cfg.INPUT, 'train_baseline_helmets.csv'))
te_helmets = frame_lib.read_csv(os.path.join(cfg.INPUT, 'test_baseline_helmets.csv'))
# video metadata
tr_video_metadata = frame_lib.read_csv(os.path.join(cfg.INPUT, 'train_video_metadata.csv'))
te_video_metadata = frame_lib.read_csv(os.path.join(cfg.INPUT, 'test_video_metadata.csv'))
# The sample submission is always read with pandas, because
# expand_contact_id is applied to it before any conversion.
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

train = frame_lib.read_csv(os.path.join(cfg.INPUT, 'train_labels.csv'), parse_dates=["datetime"])
test = expand_contact_id(sub)
if torch.cuda.is_available():
    test = cudf.DataFrame(test)

In [None]:
# Preview the first rows of the training player-tracking table.
tr_tracking.head()

Unnamed: 0,game_play,game_key,play_id,nfl_player_id,datetime,step,team,position,jersey_number,x_position,y_position,speed,distance,direction,orientation,acceleration,sa
0,58580_001136,58580,1136,44830,2021-10-10 21:08:20.900,-108,away,CB,22,61.59,42.6,1.11,0.11,320.33,263.93,0.71,-0.64
1,58580_001136,58580,1136,47800,2021-10-10 21:08:20.900,-108,away,DE,97,59.48,26.81,0.23,0.01,346.84,247.16,1.29,0.9
2,58580_001136,58580,1136,52444,2021-10-10 21:08:20.900,-108,away,FS,29,72.19,31.46,0.61,0.06,11.77,247.69,0.63,-0.33
3,58580_001136,58580,1136,46206,2021-10-10 21:08:20.900,-108,home,TE,86,57.37,22.12,0.37,0.04,127.85,63.63,0.69,0.62
4,58580_001136,58580,1136,52663,2021-10-10 21:08:20.900,-108,away,ILB,48,63.25,27.5,0.51,0.05,183.62,253.71,0.31,0.31


In [None]:
# Preview the first rows of the training helmet bounding-box table.
tr_helmets.head()

Unnamed: 0,game_play,game_key,play_id,view,video,frame,nfl_player_id,player_label,left,width,top,height
0,58168_003392,58168,3392,Endzone,58168_003392_Endzone.mp4,290,39947,H72,946,25,293,34
1,58168_003392,58168,3392,Endzone,58168_003392_Endzone.mp4,290,37211,H42,151,25,267,33
2,58168_003392,58168,3392,Endzone,58168_003392_Endzone.mp4,290,38590,H70,810,25,293,35
3,58168_003392,58168,3392,Endzone,58168_003392_Endzone.mp4,290,44822,H15,681,26,254,33
4,58168_003392,58168,3392,Endzone,58168_003392_Endzone.mp4,290,41944,V92,680,23,303,33


In [None]:
# Preview the first rows of the training video metadata table.
tr_video_metadata.head()

Unnamed: 0,game_play,game_key,play_id,view,start_time,end_time,snap_time
0,58168_003392,58168,3392,Endzone,2020-09-11T03:01:43.134Z,2020-09-11T03:01:54.971Z,2020-09-11T03:01:48.134Z
1,58168_003392,58168,3392,Sideline,2020-09-11T03:01:43.134Z,2020-09-11T03:01:54.971Z,2020-09-11T03:01:48.134Z
2,58172_003247,58172,3247,Endzone,2020-09-13T19:30:42.414Z,2020-09-13T19:31:00.524Z,2020-09-13T19:30:47.414Z
3,58172_003247,58172,3247,Sideline,2020-09-13T19:30:42.414Z,2020-09-13T19:31:00.524Z,2020-09-13T19:30:47.414Z
4,58173_003606,58173,3606,Endzone,2020-09-13T19:45:07.527Z,2020-09-13T19:45:26.438Z,2020-09-13T19:45:12.527Z


特徴量の作成には、以下のコードを使用します。 

基本的には、player_tracking.csv に含まれる数値特徴量を、player_id_1 と player_id_2 にそれぞれマージしています。

# player tracking feature

In [None]:
# ==============================
# feature engineering
# ==============================
def create_features(df, tr_tracking, merge_col="step", use_cols=["x_position", "y_position"]):
    """Attach per-player tracking columns to each player pair in ``df``.

    Every column in ``use_cols`` is left-joined from ``tr_tracking`` twice:
    once for ``nfl_player_id_1`` (suffix ``_1``) and once for
    ``nfl_player_id_2`` (suffix ``_2``), matching on ``game_play`` and
    ``merge_col``. When both position columns are requested, a ``distance``
    column (Euclidean distance between the two players) is added; it is NaN
    where player 2 has no tracking row. A boolean ``G_flug`` column marks rows
    whose second id is the literal ``"G"``.

    Returns ``(df_combo, output_cols)``: the merged frame sorted by
    game_play / merge_col / player ids with a fresh index, and the list of
    feature column names that were produced.
    """
    on_gpu = torch.cuda.is_available()
    tracking_keys = ["game_play", merge_col, "nfl_player_id"]
    tracking = tr_tracking.astype({"nfl_player_id": "str"})[tracking_keys + use_cols]

    df_combo = df.astype({"nfl_player_id_1": "str"})
    for suffix in ("_1", "_2"):
        df_combo = df_combo.merge(
            tracking,
            left_on=["game_play", merge_col, "nfl_player_id" + suffix],
            right_on=tracking_keys,
            how="left",
        )
        df_combo = df_combo.rename(columns={c: c + suffix for c in use_cols})
        df_combo = df_combo.drop("nfl_player_id", axis=1)

    df_combo = df_combo.sort_values(
        ["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"]
    ).reset_index(drop=True)

    output_cols = [c + "_1" for c in use_cols] + [c + "_2" for c in use_cols]

    if "x_position" in use_cols and "y_position" in use_cols:
        # Only rows where player 2 has tracking data get a distance.
        has_partner = df_combo['x_position_2'].notnull()
        if on_gpu:
            has_partner = has_partner.to_numpy()
        dx = df_combo.loc[has_partner, "x_position_1"] - df_combo.loc[has_partner, "x_position_2"]
        dy = df_combo.loc[has_partner, "y_position_1"] - df_combo.loc[has_partner, "y_position_2"]
        pair_distance = np.sqrt(np.square(dx) + np.square(dy))
        if on_gpu:
            pair_distance = pair_distance.to_numpy()
        distance_arr = np.full(len(has_partner), np.nan)
        distance_arr[has_partner] = pair_distance
        df_combo['distance'] = distance_arr
        output_cols.append("distance")

    df_combo['G_flug'] = (df_combo['nfl_player_id_2']=="G")
    output_cols.append("G_flug")
    return df_combo, output_cols


# Tracking columns copied onto each player of a pair; create_features adds
# the _1 / _2 suffixes.
use_cols = [
    'x_position',
    'y_position',
    'speed',
    'distance',
    'direction',
    'orientation',
    'acceleration',
    'sa',
    'team',
    'position',
]
train, feature_cols = create_features(train, tr_tracking, use_cols=use_cols)
test, feature_cols = create_features(test, te_tracking, use_cols=use_cols)

# Frames built with cuDF are brought back to pandas.
if torch.cuda.is_available():
    train, test = train.to_pandas(), test.to_pandas()

display(train)

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact,x_position_1,y_position_1,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,team_1,position_1,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,team_2,position_2,distance,G_flug
0,58168_003392_0_37084_37211,58168_003392,2020-09-11 03:01:48.100,0,37084,37211,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,away,DE,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,home,FB,3.794232,False
1,58168_003392_0_37084_38556,58168_003392,2020-09-11 03:01:48.100,0,37084,38556,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,away,DE,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,away,OLB,10.530043,False
2,58168_003392_0_37084_38567,58168_003392,2020-09-11 03:01:48.100,0,37084,38567,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,away,DE,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,home,T,1.543017,False
3,58168_003392_0_37084_38590,58168_003392,2020-09-11 03:01:48.100,0,37084,38590,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,away,DE,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,home,G,5.431841,False
4,58168_003392_0_37084_39947,58168_003392,2020-09-11 03:01:48.100,0,37084,39947,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,away,DE,40.11,26.73,0.99,0.09,163.38,90.69,1.68,1.64,home,T,6.886697,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4721613,58582_003121_91_52500_52619,58582_003121,2021-10-12 02:42:29.100,91,52500,52619,0,58.74,40.11,1.34,0.13,204.96,136.56,1.23,-1.20,home,WR,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,away,CB,18.020710,False
4721614,58582_003121_91_52500_G,58582_003121,2021-10-12 02:42:29.100,91,52500,G,0,58.74,40.11,1.34,0.13,204.96,136.56,1.23,-1.20,home,WR,,,,,,,,,,,,True
4721615,58582_003121_91_52609_52619,58582_003121,2021-10-12 02:42:29.100,91,52609,52619,0,60.32,25.93,1.38,0.15,261.77,269.45,0.35,-0.30,home,WR,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,away,CB,4.094142,False
4721616,58582_003121_91_52609_G,58582_003121,2021-10-12 02:42:29.100,91,52609,G,0,60.32,25.93,1.38,0.15,261.77,269.45,0.35,-0.30,home,WR,,,,,,,,,,,,True


# Exclude distance > 2
2人のプレイヤーの距離が2より大きい場合、接触の確率は非常に低いので、0とみなし、学習データは470万行から660K行に減少する

In [None]:
DISTANCE_THRESH = 2

# Both are taken before filtering, so they cover every original training row.
train_y = train['contact'].values
oof_pred = np.zeros(len(train))

# Keep pairs within the threshold. Rows with no distance (NaN) are kept too.
cond_dis_train = (train['distance']<=DISTANCE_THRESH) | (train['distance'].isna())
cond_dis_test = (test['distance']<=DISTANCE_THRESH) | (test['distance'].isna())

# Only train is filtered here; the test mask is computed but not applied in
# this cell.
train = train[cond_dis_train].reset_index(drop=True)

print('number of train data : ',len(train))

_ = gc.collect()

number of train data :  660560


# Helmet track Features

In [None]:
# Bucket counts passed to add_step_pct: each play's step range is rescaled to
# 0..cluster, giving one set of helmet features per resolution.
CLUSTERS = [10, 50, 100, 500]
#CLUSTERS = [10,50,100,150,200,250,300,350,400,450,500]

def add_step_pct(df, cluster):
    """Add ``step_pct``: ``step`` rescaled to 0..``cluster`` and rounded up.

    ``step_pct = ceil(cluster * (step - min) / (max - min))`` stored as int32,
    with min and max taken over ``df``. The column is written onto ``df`` in
    place and ``df`` is returned.

    When every row has the same step (max == min) the ratio is undefined, so
    all rows are placed in bucket 0. Without this guard the NaN result makes
    the int32 cast fail.
    """
    step_min = df['step'].min()
    step_span = df['step'].max() - step_min
    if step_span == 0:
        df['step_pct'] = np.zeros(len(df), dtype=np.int32)
        return df
    df['step_pct'] = np.ceil(cluster * (df['step'] - step_min) / step_span).astype(np.int32)
    return df

# The raw helmet tables depend on neither the cluster nor the camera view, so
# they are read and normalised once instead of once per (cluster, view).
helmet_train_raw = pd.read_csv('/content/drive/MyDrive/kaggle/NFL/dataset/original/train_baseline_helmets.csv')
helmet_test_raw = pd.read_csv('/content/drive/MyDrive/kaggle/NFL/dataset/original/test_baseline_helmets.csv')
for helmet_raw in (helmet_train_raw, helmet_test_raw):
    # Fold the 'Endzone2' view into 'Endzone'.
    helmet_raw.loc[helmet_raw['view']=='Endzone2','view'] = 'Endzone'
    # Video frames play the role of 'step', so add_step_pct can be reused.
    helmet_raw.rename(columns = {'frame': 'step'}, inplace = True)

box_cols = ['left', 'width', 'top', 'height']

for cluster in CLUSTERS:
    # group_keys=False keeps the original row index, so 'game_play' stays an
    # ordinary column whatever the pandas default is.
    train = train.groupby('game_play', group_keys=False).apply(lambda x:add_step_pct(x,cluster))
    test = test.groupby('game_play', group_keys=False).apply(lambda x:add_step_pct(x,cluster))

    # step_pct is computed over all views of a play, so it is done once per
    # cluster, before the per-view filter.
    helmet_train_all = helmet_train_raw.groupby('game_play', group_keys=False).apply(lambda x:add_step_pct(x,cluster))
    helmet_test_all = helmet_test_raw.groupby('game_play', group_keys=False).apply(lambda x:add_step_pct(x,cluster))

    for helmet_view in ['Sideline', 'Endzone']:
        helmet_train = helmet_train_all[helmet_train_all['view']==helmet_view].copy()
        helmet_test = helmet_test_all[helmet_test_all['view']==helmet_view].copy()

        helmet_train['helmet_id'] = helmet_train['game_play'] + '_' + helmet_train['nfl_player_id'].astype(str) + '_' + helmet_train['step_pct'].astype(str)
        helmet_test['helmet_id'] = helmet_test['game_play'] + '_' + helmet_test['nfl_player_id'].astype(str) + '_' + helmet_test['step_pct'].astype(str)

        # Mean bounding box per (play, player, time bucket).
        helmet_train = helmet_train[['helmet_id'] + box_cols].groupby('helmet_id').mean().reset_index()
        helmet_test = helmet_test[['helmet_id'] + box_cols].groupby('helmet_id').mean().reset_index()

        for player_ind in [1, 2]:
            new_names = {i: i+'_'+helmet_view+'_'+str(cluster)+'_'+str(player_ind) for i in box_cols}

            train['helmet_id'] = train['game_play'] + '_' + train['nfl_player_id_'+str(player_ind)].astype(str) + \
                                    '_' + train['step_pct'].astype(str)
            test['helmet_id'] = test['game_play'] + '_' + test['nfl_player_id_'+str(player_ind)].astype(str) + \
                                    '_' + test['step_pct'].astype(str)

            # Join on the explicit key rather than on whatever columns the
            # two frames happen to share.
            train = train.merge(helmet_train, how = 'left', on = 'helmet_id')
            test = test.merge(helmet_test, how = 'left', on = 'helmet_id')

            train.rename(columns = new_names, inplace = True)
            test.rename(columns = new_names, inplace = True)

            del train['helmet_id'], test['helmet_id']
            gc.collect()

            feature_cols += list(new_names.values())
        gc.collect()

# Fill missing values for the ground

In [None]:
# For ground rows (G_flug) there is no second helmet: reuse player 1's box
# position and give the box zero width and height.
for cluster in CLUSTERS:
    for helmet_view in ['Sideline', 'Endzone']:
        tag = helmet_view+'_'+str(cluster)
        for frame in (train, test):
            is_ground = frame['G_flug']==True
            frame.loc[is_ground, 'left_'+tag+'_2'] = frame.loc[is_ground, 'left_'+tag+'_1']
            frame.loc[is_ground, 'top_'+tag+'_2'] = frame.loc[is_ground, 'top_'+tag+'_1']
            frame.loc[is_ground, 'width_'+tag+'_2'] = 0
            frame.loc[is_ground, 'height_'+tag+'_2'] = 0

# team features

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Train and test are encoded together so that one label always maps to one
# integer. Fitting a separate encoder on test numbers the labels from test's
# own label set, which can disagree with the codes the model was trained on.
for col in ['team_1', 'team_2']:
    le = LabelEncoder()
    not_null = train[col][train[col].notnull()]
    train[col] = pd.Series(le.fit_transform(not_null),index=not_null.index).astype(np.int8)

    # Reuse the mapping fitted on train. Missing values, and labels that
    # never occurred in train, become NaN instead of raising.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test[col] = test[col].map(code_of)

# position features

In [None]:
# Train and test are encoded together so that one position always maps to one
# integer. Fitting a separate encoder on test numbers the labels from test's
# own label set, so if test holds only some of the positions the codes shift
# relative to training.
for col in ['position_1', 'position_2']:
    le = LabelEncoder()
    not_null = train[col][train[col].notnull()]
    train[col] = pd.Series(le.fit_transform(not_null),index=not_null.index).astype(np.int8)

    # Reuse the mapping fitted on train. Missing values, and positions that
    # never occurred in train, become NaN instead of raising.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test[col] = test[col].map(code_of)

# diff features

In [None]:
# Base names of every '<name>_1' column except the player id; each is assumed
# to have a matching '<name>_2' column.
cols = [name[:-2] for name in train.columns if name.endswith('_1') and name != 'nfl_player_id_1']
first_cols = [base+'_1' for base in cols]
second_cols = [base+'_2' for base in cols]
diff_cols = [base+'_diff' for base in cols]

# Absolute difference between player 1's and player 2's value.
train[diff_cols] = np.abs(train[first_cols].values - train[second_cols].values)
test[diff_cols] = np.abs(test[first_cols].values - test[second_cols].values)
feature_cols += diff_cols

# add features

In [None]:
#cols = ['x_position', 'y_position', 'speed', 'distance', 'direction', 'orientation', 'acceleration', 'sa']
#train[[i+'_add' for i in cols]] = np.abs(train[[i+'_1' for i in cols]].values + train[[i+'_2' for i in cols]].values)
#test[[i+'_add' for i in cols]] = np.abs(test[[i+'_1' for i in cols]].values + test[[i+'_2' for i in cols]].values)
#feature_cols += [i+'_add' for i in cols]

# nan features

In [None]:
#cols = [i[:-2] for i in train.columns if i[-2:]=='_1' and i!='nfl_player_id_1']
#train[[i+'_nan_1' for i in cols]] = train[[i+'_1' for i in cols]].isnull()
#train[[i+'_nan_2' for i in cols]] = train[[i+'_2' for i in cols]].isnull()
#test[[i+'_nan_1' for i in cols]] = test[[i+'_1' for i in cols]].isnull()
#test[[i+'_nan_2' for i in cols]] = test[[i+'_2' for i in cols]].isnull()
#feature_cols += [i+'_nan_1' for i in cols]
#feature_cols += [i+'_nan_2' for i in cols]

# is zero features

In [None]:
#cols = [i[:-2] for i in train.columns if i[-2:]=='_1' and i!='nfl_player_id_1']
#train[[i+'_iszero_1' for i in cols]] = train[[i+'_1' for i in cols]]==0
#train[[i+'_iszero_2' for i in cols]] = train[[i+'_2' for i in cols]]==0
#test[[i+'_iszero_1' for i in cols]] = test[[i+'_1' for i in cols]]==0
#test[[i+'_iszero_2' for i in cols]] = test[[i+'_2' for i in cols]]==0
#feature_cols += [i+'_iszero_1' for i in cols]
#feature_cols += [i+'_iszero_2' for i in cols]

# is non zero feature

In [None]:
#cols = [i[:-2] for i in train.columns if i[-2:]=='_1' and i!='nfl_player_id_1']
#train[[i+'_isnonzero_1' for i in cols]] = train[[i+'_1' for i in cols]]!=0
#train[[i+'_isnonzero_2' for i in cols]] = train[[i+'_2' for i in cols]]!=0
#test[[i+'_isnonzero_1' for i in cols]] = test[[i+'_1' for i in cols]]!=0
#test[[i+'_isnonzero_2' for i in cols]] = test[[i+'_2' for i in cols]]!=0
#feature_cols += [i+'_isnonzero_1' for i in cols]
#feature_cols += [i+'_isnonzero_2' for i in cols]

# helmets features

In [None]:
#wcols = [i for i in train.columns if i.split('_')[-1]!='diff' and i.split('_')[0]=='width']

In [None]:
#hcols = [i for i in train.columns if i.split('_')[-1]!='diff' and i.split('_')[0]=='height']

In [30]:
#train[[i+'_prod' for i in hcols]] = train[[i for i in hcols]].values * train[[i for i in wcols]].values
#test[[i+'_prod' for i in hcols]] = test[[i for i in hcols]].values * test[[i for i in wcols]].values
#feature_cols += [i+'_prod' for i in hcols]

# count features

In [31]:
"""cols = ['team_1','team_2','position_1','position_2']
for i in cols:
    train[i+'_count'] = train.groupby(i)[i].transform('count')
feature_cols += [i+'_count' for i in cols]"""

"cols = ['team_1','team_2','position_1','position_2']\nfor i in cols:\n    train[i+'_count'] = train.groupby(i)[i].transform('count')\nfeature_cols += [i+'_count' for i in cols]"

# prod features

In [32]:
# Element-wise product of player 1's and player 2's value for each tracking
# quantity.
cols = ['x_position', 'y_position', 'speed', 'distance', 'direction', 'orientation', 'acceleration', 'sa']
prod_cols = [base+'_prod' for base in cols]
lhs_cols = [base+'_1' for base in cols]
rhs_cols = [base+'_2' for base in cols]

train[prod_cols] = train[lhs_cols].values * train[rhs_cols].values
test[prod_cols] = test[lhs_cols].values * test[rhs_cols].values
feature_cols += prod_cols

print('number of features : ',len(feature_cols))
print('number of train data : ',len(train))

number of features :  136
number of train data :  660560


# clipping(1-99)

In [33]:
#feature_cols.remove('G_flug')

In [34]:
#for i in train[feature_cols].columns:
#    upper,lower = np.percentile(train[i],[1,99])
#    train[i] = np.clip(train[i],upper,lower)

In [35]:
#feature_cols.append('G_flug')

# fill na

In [36]:
# Replace every missing value in train with the sentinel -9999. The fill is
# applied to all columns of train (not only feature_cols); this cell does not
# touch test.
train = train.fillna(-9999)

In [37]:
# Display the feature columns of train to eyeball the result of the fill above.
train[feature_cols]

Unnamed: 0,x_position_1,y_position_1,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,team_1,position_1,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,team_2,position_2,distance,G_flug,left_Sideline_10_1,width_Sideline_10_1,top_Sideline_10_1,height_Sideline_10_1,left_Sideline_10_2,width_Sideline_10_2,top_Sideline_10_2,height_Sideline_10_2,left_Endzone_10_1,width_Endzone_10_1,top_Endzone_10_1,height_Endzone_10_1,left_Endzone_10_2,width_Endzone_10_2,top_Endzone_10_2,height_Endzone_10_2,left_Sideline_50_1,width_Sideline_50_1,top_Sideline_50_1,height_Sideline_50_1,left_Sideline_50_2,width_Sideline_50_2,top_Sideline_50_2,height_Sideline_50_2,left_Endzone_50_1,width_Endzone_50_1,top_Endzone_50_1,height_Endzone_50_1,left_Endzone_50_2,width_Endzone_50_2,top_Endzone_50_2,height_Endzone_50_2,left_Sideline_100_1,width_Sideline_100_1,top_Sideline_100_1,height_Sideline_100_1,left_Sideline_100_2,width_Sideline_100_2,top_Sideline_100_2,height_Sideline_100_2,left_Endzone_100_1,width_Endzone_100_1,top_Endzone_100_1,height_Endzone_100_1,left_Endzone_100_2,width_Endzone_100_2,top_Endzone_100_2,height_Endzone_100_2,left_Sideline_500_1,width_Sideline_500_1,top_Sideline_500_1,height_Sideline_500_1,left_Sideline_500_2,width_Sideline_500_2,top_Sideline_500_2,height_Sideline_500_2,left_Endzone_500_1,width_Endzone_500_1,top_Endzone_500_1,height_Endzone_500_1,left_Endzone_500_2,width_Endzone_500_2,top_Endzone_500_2,height_Endzone_500_2,x_position_diff,y_position_diff,speed_diff,distance_diff,direction_diff,orientation_diff,acceleration_diff,sa_diff,team_diff,position_diff,left_Sideline_10_diff,width_Sideline_10_diff,top_Sideline_10_diff,height_Sideline_10_diff,left_Endzone_10_diff,width_Endzone_10_diff,top_Endzone_10_diff,height_Endzone_10_diff,left_Sideline_50_diff,width_Sideline_50_diff,top_Sideline_50_diff,height_Sideline_50_diff,left_Endzone_50_diff,width_Endzone_50_diff,top_Endzone_50_diff,height_Endzone_50_diff,left_Sideline_10
0_diff,width_Sideline_100_diff,top_Sideline_100_diff,height_Sideline_100_diff,left_Endzone_100_diff,width_Endzone_100_diff,top_Endzone_100_diff,height_Endzone_100_diff,left_Sideline_500_diff,width_Sideline_500_diff,top_Sideline_500_diff,height_Sideline_500_diff,left_Endzone_500_diff,width_Endzone_500_diff,top_Endzone_500_diff,height_Endzone_500_diff,x_position_prod,y_position_prod,speed_prod,distance_prod,direction_prod,orientation_prod,acceleration_prod,sa_prod
0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,0,3,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,1.0,25.0,1.543017,False,483.000000,14.000000,469.000000,16.000000,433.000000,15.0,465.000000,19.0,394.000000,22.000000,317.000000,17.000000,384.000000,25.0,288.000000,34.0,483.000000,14.000000,469.000000,16.000000,433.000000,15.0,465.000000,19.0,394.000000,22.000000,317.0,17.000000,384.000000,25.0,288.0,34.0,483.000000,14.000000,469.000000,16.000000,433.000000,15.0,465.000000,19.0,394.000000,22.000000,317.000000,17.000000,384.000000,25.0,288.000000,34.0,483.0,14.0,469.0,16.0,433.0,15.0,465.0,19.0,394.0,22.0,317.0,17.0,384.0,25.0,288.0,34.0,1.53,0.20,0.12,0.01,115.99,173.39,0.02,0.01,1.0,22.0,50.0,1.000000,4.0,3.000000,10.0,3.000000,29.0,17.000000,50.0,1.000000,4.0,3.000000,10.0,3.000000,29.0,17.000000,50.0,1.000000,4.0,3.000000,10.0,3.000000,29.0,17.000000,50.0,1.0,4.0,3.0,10.0,3.0,29.0,17.0,1691.5030,399.1904,0.3564,0.0042,34542.7230,23324.6052,0.8280,0.8010
1,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,0,3,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.000000,True,483.000000,14.000000,469.000000,16.000000,483.000000,0.0,469.000000,0.0,394.000000,22.000000,317.000000,17.000000,394.000000,0.0,317.000000,0.0,483.000000,14.000000,469.000000,16.000000,483.000000,0.0,469.000000,0.0,394.000000,22.000000,317.0,17.000000,394.000000,0.0,317.0,0.0,483.000000,14.000000,469.000000,16.000000,483.000000,0.0,469.000000,0.0,394.000000,22.000000,317.000000,17.000000,394.000000,0.0,317.000000,0.0,483.0,14.0,469.0,16.0,483.0,0.0,469.0,0.0,394.0,22.0,317.0,17.0,394.0,0.0,317.0,0.0,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,0.0,14.000000,0.0,16.000000,0.0,22.000000,0.0,17.000000,0.0,14.000000,0.0,16.000000,0.0,22.000000,0.0,17.000000,0.0,14.000000,0.0,16.000000,0.0,22.000000,0.0,17.000000,0.0,14.0,0.0,16.0,0.0,22.0,0.0,17.0,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000
2,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,1,6,40.34,18.08,1.10,0.10,148.93,92.39,2.03,2.03,1.0,26.0,1.258014,False,372.000000,15.000000,512.000000,18.000000,418.000000,16.0,501.000000,18.0,151.000000,25.000000,267.000000,33.000000,244.000000,25.0,296.000000,32.0,372.000000,15.000000,512.000000,18.000000,418.000000,16.0,501.000000,18.0,151.000000,25.000000,267.0,33.000000,244.000000,25.0,296.0,32.0,372.000000,15.000000,512.000000,18.000000,418.000000,16.0,501.000000,18.0,151.000000,25.000000,267.000000,33.000000,244.000000,25.0,296.000000,32.0,372.0,15.0,512.0,18.0,418.0,16.0,501.0,18.0,151.0,25.0,267.0,33.0,244.0,25.0,296.0,32.0,0.75,1.01,0.57,0.05,14.09,7.66,0.60,0.61,0.0,20.0,46.0,1.000000,11.0,0.000000,93.0,0.000000,29.0,1.000000,46.0,1.000000,11.0,0.000000,93.0,0.000000,29.0,1.000000,46.0,1.000000,11.0,0.000000,93.0,0.000000,29.0,1.000000,46.0,1.0,11.0,0.0,93.0,0.0,29.0,1.0,1597.0606,308.6256,0.5830,0.0050,20081.7212,7828.2047,2.9029,2.8826
3,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,1,6,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.000000,True,372.000000,15.000000,512.000000,18.000000,372.000000,0.0,512.000000,0.0,151.000000,25.000000,267.000000,33.000000,151.000000,0.0,267.000000,0.0,372.000000,15.000000,512.000000,18.000000,372.000000,0.0,512.000000,0.0,151.000000,25.000000,267.0,33.000000,151.000000,0.0,267.0,0.0,372.000000,15.000000,512.000000,18.000000,372.000000,0.0,512.000000,0.0,151.000000,25.000000,267.000000,33.000000,151.000000,0.0,267.000000,0.0,372.0,15.0,512.0,18.0,372.0,0.0,512.0,0.0,151.0,25.0,267.0,33.0,151.0,0.0,267.0,0.0,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,0.0,15.000000,0.0,18.000000,0.0,25.000000,0.0,33.000000,0.0,15.000000,0.0,18.000000,0.0,25.000000,0.0,33.000000,0.0,15.000000,0.0,18.000000,0.0,25.000000,0.0,33.000000,0.0,15.0,0.0,18.0,0.0,25.0,0.0,33.0,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000
4,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,0,17,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.000000,True,544.000000,14.000000,282.000000,18.000000,544.000000,0.0,282.000000,0.0,1215.000000,27.000000,284.000000,23.000000,1215.000000,0.0,284.000000,0.0,544.000000,14.000000,282.000000,18.000000,544.000000,0.0,282.000000,0.0,1215.000000,27.000000,284.0,23.000000,1215.000000,0.0,284.0,0.0,544.000000,14.000000,282.000000,18.000000,544.000000,0.0,282.000000,0.0,1215.000000,27.000000,284.000000,23.000000,1215.000000,0.0,284.000000,0.0,544.0,14.0,282.0,18.0,544.0,0.0,282.0,0.0,1215.0,27.0,284.0,23.0,1215.0,0.0,284.0,0.0,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,0.0,14.000000,0.0,18.000000,0.0,27.000000,0.0,23.000000,0.0,14.000000,0.0,18.000000,0.0,27.000000,0.0,23.000000,0.0,14.000000,0.0,18.000000,0.0,27.000000,0.0,23.000000,0.0,14.0,0.0,18.0,0.0,27.0,0.0,23.0,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660555,33.18,25.26,2.55,0.26,95.39,152.22,0.43,-0.40,1,8,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.000000,True,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,0.0,-9999.000000,0.0,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,0.0,-9999.000000,0.0,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,0.0,-9999.000000,0.0,-9999.000000,-9999.000000,-9999.0,-9999.000000,-9999.000000,0.0,-9999.0,0.0,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,0.0,-9999.000000,0.0,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,0.0,-9999.000000,0.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,-9999.0,0.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,-9999.0,0.0,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000
660556,65.04,38.68,1.31,0.14,168.40,143.47,0.74,-0.69,0,7,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.000000,True,913.375000,20.660714,341.982143,26.267857,913.375000,0.0,341.982143,0.0,186.875000,44.017857,388.535714,40.732143,186.875000,0.0,388.535714,0.0,916.250000,22.583333,351.833333,25.750000,916.250000,0.0,351.833333,0.0,133.416667,42.333333,405.0,39.833333,133.416667,0.0,405.0,0.0,914.833333,23.000000,354.333333,25.500000,914.833333,0.0,354.333333,0.0,124.333333,42.333333,406.666667,39.333333,124.333333,0.0,406.666667,0.0,914.0,23.0,355.5,26.0,914.0,0.0,355.5,0.0,118.5,42.5,407.0,39.0,118.5,0.0,407.0,0.0,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,0.0,20.660714,0.0,26.267857,0.0,44.017857,0.0,40.732143,0.0,22.583333,0.0,25.750000,0.0,42.333333,0.0,39.833333,0.0,23.000000,0.0,25.500000,0.0,42.333333,0.0,39.333333,0.0,23.0,0.0,26.0,0.0,42.5,0.0,39.0,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000
660557,58.74,40.11,1.34,0.13,204.96,136.56,1.23,-1.20,1,27,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.000000,True,531.571429,22.928571,304.375000,23.910714,531.571429,0.0,304.375000,0.0,318.321429,33.357143,173.089286,44.982143,318.321429,0.0,173.089286,0.0,503.666667,23.083333,313.250000,24.083333,503.666667,0.0,313.250000,0.0,264.333333,33.833333,173.5,44.416667,264.333333,0.0,173.5,0.0,499.666667,23.166667,315.166667,24.166667,499.666667,0.0,315.166667,0.0,256.833333,33.833333,173.833333,44.333333,256.833333,0.0,173.833333,0.0,497.5,23.5,316.0,24.5,497.5,0.0,316.0,0.0,252.0,33.5,173.5,44.0,252.0,0.0,173.5,0.0,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,0.0,22.928571,0.0,23.910714,0.0,33.357143,0.0,44.982143,0.0,23.083333,0.0,24.083333,0.0,33.833333,0.0,44.416667,0.0,23.166667,0.0,24.166667,0.0,33.833333,0.0,44.333333,0.0,23.5,0.0,24.5,0.0,33.5,0.0,44.0,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000
660558,60.32,25.93,1.38,0.15,261.77,269.45,0.35,-0.30,1,27,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,-9999.000000,True,660.660714,24.178571,694.857143,23.517857,660.660714,0.0,694.857143,0.0,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,0.0,-9999.000000,0.0,605.916667,24.000000,695.750000,23.333333,605.916667,0.0,695.750000,0.0,-9999.000000,-9999.000000,-9999.0,-9999.000000,-9999.000000,0.0,-9999.0,0.0,597.500000,24.000000,695.500000,23.666667,597.500000,0.0,695.500000,0.0,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,0.0,-9999.000000,0.0,592.0,24.0,695.5,24.0,592.0,0.0,695.5,0.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,-9999.0,0.0,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0,-9999.0,0.0,24.178571,0.0,23.517857,-9999.0,-9999.000000,-9999.0,-9999.000000,0.0,24.000000,0.0,23.333333,-9999.0,-9999.000000,-9999.0,-9999.000000,0.0,24.000000,0.0,23.666667,-9999.0,-9999.000000,-9999.0,-9999.000000,0.0,24.0,0.0,24.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000,-9999.0000


# train

In [38]:
%%time
cfg.folds = get_groupkfold(train, 'contact', 'game_play', cfg.num_fold)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'), index=False)

oof_pred[np.where(cond_dis_train)] = fit_xgboost(cfg, train[feature_cols], train['contact'], 
                                              cfg.xgb_params, add_suffix="_xgb_1st")
np.save(os.path.join(cfg.EXP_PREDS, f'all_oof.npy'),oof_pred)

0    119036
1     13030
Name: contact, dtype: int64

[0]	train-auc:0.93368	eval-auc:0.91817
[100]	train-auc:0.96830	eval-auc:0.94424
[200]	train-auc:0.97995	eval-auc:0.94900
[300]	train-auc:0.98580	eval-auc:0.95044
[400]	train-auc:0.98949	eval-auc:0.95083
[500]	train-auc:0.99203	eval-auc:0.95094
[600]	train-auc:0.99409	eval-auc:0.95153
[700]	train-auc:0.99531	eval-auc:0.95203
[800]	train-auc:0.99607	eval-auc:0.95247
[900]	train-auc:0.99665	eval-auc:0.95290
[1000]	train-auc:0.99708	eval-auc:0.95334
[1100]	train-auc:0.99741	eval-auc:0.95355
[1200]	train-auc:0.99770	eval-auc:0.95382
[1300]	train-auc:0.99797	eval-auc:0.95406
[1400]	train-auc:0.99819	eval-auc:0.95432
[1500]	train-auc:0.99839	eval-auc:0.95443
[1600]	train-auc:0.99855	eval-auc:0.95472
[1700]	train-auc:0.99870	eval-auc:0.95495
[1800]	train-auc:0.99882	eval-auc:0.95504
[1883]	train-auc:0.99892	eval-auc:0.95498
Performance of the prediction: 0.95498



0    119555
1     12629
Name: contact, dtype: int64

[0]	train-auc:0.93520	eval-auc:0.90910
[100]	train-auc:0.96924	eval-auc:0.93942
[200]	train-auc:0.98056	eval-auc:0.94380
[300]	train-auc:0.98604	eval-auc:0.94415
[389]	train-auc:0.98941	eval-auc:0.94365
Performance of the prediction: 0.94365



0    118545
1     13620
Name: contact, dtype: int64

[0]	train-auc:0.93648	eval-auc:0.90111
[100]	train-auc:0.97029	eval-auc:0.93413
[200]	train-auc:0.98142	eval-auc:0.93796
[300]	train-auc:0.98688	eval-auc:0.93822
[351]	train-auc:0.98878	eval-auc:0.93803
Performance of the prediction: 0.93801



0    119615
1     12566
Name: contact, dtype: int64

[0]	train-auc:0.93659	eval-auc:0.89906
[100]	train-auc:0.97035	eval-auc:0.92987
[200]	train-auc:0.98174	eval-auc:0.93292
[300]	train-auc:0.98738	eval-auc:0.93367
[400]	train-auc:0.99072	eval-auc:0.93417
[500]	train-auc:0.99362	eval-auc:0.93516
[600]	train-auc:0.99495	eval-auc:0.93626
[700]	train-auc:0.99587	eval-auc:0.93681
[800]	train-auc:0.99652	eval-auc:0.93773
[900]	train-auc:0.99697	eval-auc:0.93862
[1000]	train-auc:0.99735	eval-auc:0.93942
[1100]	train-auc:0.99765	eval-auc:0.93974
[1200]	train-auc:0.99791	eval-auc:0.94001
[1300]	train-auc:0.99813	eval-auc:0.94030
[1400]	train-auc:0.99834	eval-auc:0.94057
[1500]	train-auc:0.99850	eval-auc:0.94075
[1600]	train-auc:0.99863	eval-auc:0.94103
[1700]	train-auc:0.99879	eval-auc:0.94122
[1800]	train-auc:0.99891	eval-auc:0.94123
[1900]	train-auc:0.99900	eval-auc:0.94127
[2000]	train-auc:0.99911	eval-auc:0.94128
[2100]	train-auc:0.99920	eval-auc:0.94144
[2200]	train-auc:0.99928	eval-auc:0.94166
[2300]	train-auc:0.99935	eval-auc:0.94176
[240

0    119497
1     12467
Name: contact, dtype: int64

[0]	train-auc:0.93452	eval-auc:0.91409
[100]	train-auc:0.96906	eval-auc:0.93870
[200]	train-auc:0.98026	eval-auc:0.94113
[300]	train-auc:0.98632	eval-auc:0.94192
[400]	train-auc:0.98994	eval-auc:0.94231
[500]	train-auc:0.99253	eval-auc:0.94276
[600]	train-auc:0.99426	eval-auc:0.94361
[700]	train-auc:0.99541	eval-auc:0.94380
[800]	train-auc:0.99615	eval-auc:0.94438
[900]	train-auc:0.99670	eval-auc:0.94442
[1000]	train-auc:0.99713	eval-auc:0.94456
[1068]	train-auc:0.99737	eval-auc:0.94439
Performance of the prediction: 0.94439

All Performance of the prediction: 0.93974
CPU times: user 8min 18s, sys: 6min 16s, total: 14min 34s
Wall time: 7min 6s


# slack setting

In [39]:
import json
import os
import warnings

import requests

# Name of the environment variable that holds the Slack incoming-webhook URL.
# The URL embeds a secret token, so it must not be hard-coded in the notebook;
# set it before running, e.g. os.environ['SLACK_WEBHOOK_URL'] = '...'.
SLACK_WEBHOOK_ENV = 'SLACK_WEBHOOK_URL'

# Seconds to wait for Slack before giving up, so a stalled connection cannot
# hang the notebook indefinitely.
SLACK_TIMEOUT_SEC = 10


def _post_to_slack(text):
    """POST `text` as a JSON payload to the configured Slack webhook.

    The webhook URL is read from the environment variable named by
    SLACK_WEBHOOK_ENV. If it is unset or empty, a warning is issued and
    nothing is sent, so a missing notification setup does not abort the run.

    Args:
        text: message body placed under the 'text' key of the payload.

    Raises:
        requests.exceptions.RequestException: propagated from requests.post
            (connection failure, timeout, ...).
    """
    webhook_url = os.environ.get(SLACK_WEBHOOK_ENV)
    if not webhook_url:
        warnings.warn(
            f'{SLACK_WEBHOOK_ENV} is not set; Slack notification skipped.'
        )
        return
    data = json.dumps({'text': text})
    headers = {'content-type': 'application/json'}
    requests.post(webhook_url, data=data, headers=headers, timeout=SLACK_TIMEOUT_SEC)


# Sends an arbitrary message as a notification.
def send_slack_message_notification(message):
    """Send `message` to Slack unchanged."""
    _post_to_slack(message)


# Sends an error notification.
def send_slack_error_notification(message):
    """Send `message` to Slack prefixed with the no-entry emoji."""
    # ':no_entry_sign:' is rendered by Slack as the no-entry emoji.
    _post_to_slack(":no_entry_sign:" + message)

# threshold

In [None]:
def func(x_list):
    """Objective for the threshold search.

    Returns the negated matthews_corrcoef between train_y and oof_pred
    binarised at x_list[0]; negated because `minimize` searches for a minimum.
    """
    return -matthews_corrcoef(train_y, oof_pred > x_list[0])


# Search for the decision threshold starting from 0.5 and keep it on cfg.
x0 = [0.5]
result = minimize(func, x0, method="nelder-mead")
cfg.threshold = result.x[0]

print("score:", round(matthews_corrcoef(train_y, oof_pred > cfg.threshold), 5))
print("threshold", round(cfg.threshold, 5))

In [None]:
# Report the score at the selected threshold and the threshold itself to
# Slack, one message each, tagged with the experiment directory name.
send_slack_message_notification(
    f'{dir_name} score:{round(matthews_corrcoef(train_y, oof_pred>cfg.threshold), 5)}'
)
send_slack_message_notification(
    f'{dir_name} threshold:{round(cfg.threshold, 5)}'
)

# infer

In [None]:
#sub_pred = pred_xgboost(test.loc[cond_dis_test, feature_cols], cfg.EXP_MODEL, add_suffix="_xgb_1st")

# Submission

In [None]:
# ==============================
# optimize
# ==============================
"""
del train
gc.collect()

test = add_contact_id(test)
test['contact'] = 0
test.loc[cond_dis_test, 'contact'] = (sub_pred > cfg.threshold).astype(int)
test[['contact_id', 'contact']].to_csv('submission.csv', index=False)
display(test[['contact_id', 'contact']].head())
"""