# Mixture Density Networks(MDN)

A Mixture Density Network (MDN) (Bishop, 1994) is a neural network whose outputs are the parameters of a Gaussian Mixture Model.  

An MDN has 3 types of output, and the number of outputs depends on the parameter N (the number of Gaussian distribution components).
- π(k): the weight of the k-th Gaussian distribution (∑π(k) = 1)
- μ(k): the mean of the k-th Gaussian distribution
- σ(k): the standard deviation of the k-th Gaussian distribution

If N = 3, there are 9 outputs (3 types x 3 components).

## Why GMM?
My idea is similar to [this kernel](https://www.kaggle.com/kenmatsu4/nn-outputs-gaussian-distribution-directly).

Looking at the target distribution, I think it is more complicated than a single Gaussian distribution.  


## Reference
- thesis: https://publications.aston.ac.uk/id/eprint/373/1/NCRG_94_004.pdf
- implementation reference: https://qiita.com/ctgk/items/19c4a4f205b855cf6a05 (Japanese)

# Implementation(Pytorch)
This kernel uses features from the following kernels.
- https://www.kaggle.com/ryancaldwell/location-eda
- https://www.kaggle.com/coolcoder22/nn-19-features

In [None]:
import gc
import os
import copy
import math
import time
import numba
import warnings
import argparse
import logging
from pathlib import Path
from contextlib import contextmanager
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import (
    train_test_split,
    GroupKFold
)
from kaggle.competitions import nflrush

In [None]:
@contextmanager
def timer(title):
    """Print how long the wrapped ``with`` block took, labelled by ``title``.

    The message is printed only when the block finishes without raising,
    because the report follows a bare ``yield``.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f"{title} - done in {elapsed:.0f}s")

# Config

In [None]:
class Config:
    """Container for every tunable setting used by this notebook.

    All values are plain instance attributes assigned in ``__init__``.
    """

    def __init__(self):
        # --- run mode ---
        self.IS_KERNEL = True
        self.DEBUG = True

        # --- paths ---
        self.DATA_DIR = "../input/nfl-big-data-bowl-2020/"
        self.OUT_PATH = "log"
        self.PROJECT_PATH = "./"
        self.INITIAL_CHECKPOINT = None

        # --- data loading ---
        self.N_WORKERS = 0
        self.BATCH_SIZE = 256

        # --- training length ---
        self.N_EPOCH = 50

        # --- model ---
        self.N_COMPONENTS = 3   # number of Gaussian mixture components
        self.N_EMB_OUT = 8      # embedding width per categorical feature
        self.USE_CAT = True

        # --- optimisation ---
        self.LEARNING_RATE = 0.01
        self.LR_DECAY = 0.9

        # --- reproducibility ---
        self.SEED = 2019

        # --- validation cadence (in epochs) ---
        self.VAL_PERIOD = 1

        # --- early stopping ---
        self.N_EARLY_STOPPPING_PATIENCE = 3

        # --- validation scheme ---
        self.VALID_TYPE = "GroupKFold"
        self.N_FOLDS = 5
        self.TEST_SIZE = 0.15
        self.TARGET_NORM = False

In [None]:
# Module-level settings object; run_train() and prediction() read it as a global.
config = Config()

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Send INFO-and-above records to ./train.log with a timestamped format.
# filemode="w" truncates the log file every time this cell is executed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(filename)s[line:%(lineno)d] "
           + "%(levelname)s %(message)s",
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename="./train.log",
    filemode="w")

# Print to console
# A second handler on the root logger echoes only the bare message text
# (no timestamp or level prefix), so the training table stays readable.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

# Validation

In [None]:
class Validation():
    """Build train/validation splits of ``PlayId`` values, grouped by game.

    Splits are always made on ``GameId`` so that all plays of one game end
    up on the same side. Each split is stored in ``valid_list`` as a
    ``(train_play_ids, valid_play_ids)`` tuple of arrays.

    Supported ``config.VALID_TYPE`` values are ``"random_split"``,
    ``"time_split"`` and ``"GroupKFold"``.
    """

    VALID_TYPES = ("random_split", "time_split", "GroupKFold")

    def __init__(self, config):
        self.valid_type = config.VALID_TYPE
        self.config = config
        self.valid_list = []

    @staticmethod
    def _play_ids_of_games(df, game_ids):
        """Return the unique PlayIds belonging to the given GameIds."""
        return df[df["GameId"].isin(game_ids)]["PlayId"].unique()

    def _holdout_split(self, df, random_state):
        """Split the games once into train/valid and map them to PlayIds."""
        trn_game_ids, val_game_ids = train_test_split(
            df["GameId"].unique(),
            test_size=self.config.TEST_SIZE,
            random_state=random_state
        )
        return (
            self._play_ids_of_games(df, trn_game_ids),
            self._play_ids_of_games(df, val_game_ids),
        )

    def make_validation(self):
        """Read ``train.csv`` from ``config.DATA_DIR`` and fill ``valid_list``.

        Raises:
            ValueError: if ``valid_type`` is not one of ``VALID_TYPES``.
                Checked before the CSV is read so a typo fails immediately.
        """
        if self.valid_type not in self.VALID_TYPES:
            raise ValueError(
                f"Unknown validation type {self.valid_type!r}; "
                f"expected one of {self.VALID_TYPES}")

        df = pd.read_csv(Path(self.config.DATA_DIR) / "train.csv")

        if self.valid_type == "random_split":
            # Seeded hold-out split: reproducible across runs.
            self.valid_list.append(
                self._holdout_split(df, random_state=self.config.SEED))
        elif self.valid_type == "time_split":
            # NOTE(review): despite its name this branch performs an
            # UNSEEDED random split of games, exactly like the original
            # code; it does not order games by time.
            self.valid_list.append(
                self._holdout_split(df, random_state=None))
        else:
            # GroupKFold: one split per fold, games never straddle a fold.
            folds = GroupKFold(n_splits=self.config.N_FOLDS)
            play_ids = df["PlayId"]
            for trn_, val_ in folds.split(df, groups=df["GameId"]):
                # split() yields positional indices, hence iloc.
                self.valid_list.append((
                    play_ids.iloc[trn_].unique(),
                    play_ids.iloc[val_].unique(),
                ))

    def get_split(self, fold_idx=None):
        """Return the split for ``fold_idx`` (the first split when ``None``)."""
        if fold_idx is None:
            return self.valid_list[0]
        return self.valid_list[fold_idx]


# ReadData

In [None]:
def read_train(config):
    """Load the training table from ``config.DATA_DIR``.

    A cached ``train.pkl`` is preferred when it exists. Otherwise
    ``train.csv`` is parsed, and the pickle cache is written only when
    ``config.IS_KERNEL`` is false.
    """
    data_dir = config.DATA_DIR
    cache_file = os.path.join(data_dir, "train.pkl")
    source_file = os.path.join(data_dir, "train.csv")

    if os.path.exists(cache_file):
        return pd.read_pickle(cache_file)

    frame = pd.read_csv(source_file)
    if not config.IS_KERNEL:
        frame.to_pickle(cache_file)
    return frame

# DataSet

In [None]:
class NFLDataSet(Dataset):
    """Row-wise dataset over the rows of ``df`` whose PlayId is in ``play_ids``.

    Each item is a tuple ``(numerical_values, categorical_values, target)``
    taken from one row of the filtered frame; the target column is "Yards".

    Args:
        df: feature table containing "PlayId", the feature columns and "Yards".
        play_ids: PlayIds to keep.
        numerical_features: column names of numerical inputs; NaNs in these
            columns are replaced by 0 and the columns are cast to float32.
        cat_features: column names of categorical inputs.
        mode: stored on the instance; not used inside this class.
        preprocessing: stored on the instance; not used inside this class.
        except_features: column names to leave out of ``use_cols``
            (``None`` means no exclusions).
    """

    def __init__(self,
                 df,
                 play_ids,
                 numerical_features,
                 cat_features,
                 mode="train",
                 preprocessing=None,
                 except_features=None):
        # A None default avoids sharing one mutable list across instances.
        if except_features is None:
            except_features = []
        self.play_ids = play_ids
        self.numerical_features = numerical_features
        self.cat_features = cat_features
        self.df = df[df["PlayId"].isin(play_ids)].reset_index(drop=True)
        self.mode = mode
        self.preprocessing = preprocessing
        self.target_col = "Yards"
        self.use_cols = \
            [c for c in df.columns if c not in except_features]
        self.retype()

    def retype(self):
        """Fill NaNs with 0 and cast every numerical feature to float32."""
        for c in self.numerical_features:
            self.df[c] = self.df[c].fillna(0).astype(np.float32)

    def __len__(self):
        # Items are addressed by row position in the filtered frame, so the
        # length must be the row count. len(play_ids) can be larger when
        # some requested PlayIds are absent from df, which would make
        # __getitem__ raise KeyError for the trailing indices.
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        return \
            self.df.loc[idx, self.numerical_features].values, \
            self.df.loc[idx, self.cat_features].values, \
            row[self.target_col]


# LRScheduler

In [None]:
def adjust_learning_rate(optimizer, lr):
    """Overwrite the 'lr' entry of every parameter group of ``optimizer``."""
    for group in optimizer.param_groups:
        group['lr'] = lr


def get_learning_rate(optimizer):
    """Return the learning rate of the optimizer's single parameter group.

    An assertion fails when the optimizer does not have exactly one group.
    """
    rates = [group['lr'] for group in optimizer.param_groups]
    assert(len(rates) == 1)
    return rates[0]


class NullScheduler:
    """Scheduler that returns the same learning rate for every step."""

    def __init__(self, lr=0.01):
        super().__init__()
        self.lr = lr
        self.cycle = 0

    def __call__(self, time):
        # The step argument is accepted but ignored.
        return self.lr

    def __str__(self):
        return f"NullScheduler\nlr={self.lr:0.5f}"


class ManualScheduler:
    """Exponentially decaying learning-rate table with 100 entries.

    Entry ``i`` is ``lr * lr_decay ** i``; any step at or beyond the end of
    the table yields the last entry.
    """

    def __init__(self, lr=0.01, lr_decay=0.9):
        super().__init__()
        self.lr_list = [lr * (lr_decay ** i) for i in range(100)]
        self.cycle = 0

    def __call__(self, time):
        position = time if time < len(self.lr_list) else -1
        return self.lr_list[position]

    def __str__(self):
        return f"ManualScheduler\nlr={self.lr_list[0]:0.5f}"

# EarlyStopping

In [None]:
class EarlyStopping:
    """Stop training when the validation loss stops improving.

    ref: https://github.com/Bjarten/early-stopping-pytorch

    Each call compares the new validation loss against the best one seen so
    far. A loss that is not worse than the best resets the patience counter,
    stores a deep copy of the model and saves its ``state_dict``; a worse
    loss increments the counter, and ``early_stop`` becomes True once the
    counter reaches ``patience``.

    Args:
        patience: number of consecutive non-improving calls tolerated.
        verbose: print a message each time a checkpoint is saved.
    """

    def __init__(self, patience=2, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.best_model = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model, save_name):
        """Record ``val_loss`` and checkpoint ``model`` to ``save_name`` if it improved."""
        # Scores are negated losses so that "higher is better".
        score = -val_loss
        if self.best_score is not None and score < self.best_score:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return
        # First call, or a loss at least as good as the best so far.
        self.best_score = score
        self.best_model = copy.deepcopy(model)
        self.save_checkpoint(val_loss, model, save_name)
        self.counter = 0

    def save_checkpoint(self, val_loss, model, save_name):
        """Save ``model.state_dict()`` to ``save_name`` and remember ``val_loss``."""
        if self.verbose:
            # Every fragment needs the f prefix; otherwise the braces are
            # printed literally instead of the loss values.
            print(f'Validation loss decreased ('
                  f'{self.val_loss_min:.5f} --> {val_loss:.5f}'
                  f').  Saving model ...')
            print("Save model: {}".format(save_name))
        torch.save(model.state_dict(), save_name)
        self.val_loss_min = val_loss

    def get_best_model(self):
        """Return the deep copy of the best model seen (None before any call)."""
        return self.best_model

# Loss

In [None]:
# Cost-function class
class GaussianMixture:
    """Negative log-likelihood of a one-dimensional Gaussian mixture model.

    Instances are plain callables: ``criterion((sigma, weight, mu), targets)``
    returns a scalar tensor and gradients are obtained through ordinary
    autograd. The class deliberately does not derive from
    ``torch.autograd.Function``, whose instantiation is deprecated and which
    expects static ``forward``/``backward`` methods.

    Args:
        n_components: number of Gaussian components in the mixture.
    """

    def __init__(self, n_components):
        # Number of Gaussian components (3 in the examples so far).
        self.n_components = n_components

    def __call__(self, pred, targets):
        """Return the summed negative log-likelihood of ``targets``.

        Args:
            pred: tuple ``(sigma, weight, mu)`` of already-activated mixture
                parameters, each indexed as (sample, component).
            targets: one target value per sample.
        """
        sigma, weight, mu = pred
        # Component densities N(t | mu, sigma^2); the small constant keeps
        # the logarithm below finite when every density underflows.
        gauss = self.gauss(mu, sigma, targets) + 1e-8
        # Negative log-likelihood of the mixture E(w): PRML eq. (5.153).
        return -torch.log(torch.sum(weight * gauss, dim=1)).sum()

    def gauss(self, mu, sigma, targets):
        """Evaluate the Gaussian density N(target | mu, sigma^2) per component."""
        tmp = (-0.5 * (mu - targets.view(-1, 1))**2) / sigma**2
        return torch.exp(tmp) / torch.sqrt(2 * np.pi * sigma**2)

    def activate(self, X):
        """Turn raw network outputs into ``(sigma, weight, mu)``.

        ``X`` is split along dim 1 into three chunks of ``n_components``
        columns, in the order sigma, weight, mu. Sigma goes through ``exp``
        and the weights through a softmax; mu is left untouched.
        """
        X_sigma, X_weight, X_mu = torch.split(X, self.n_components, dim=1)
        return torch.exp(X_sigma), torch.softmax(X_weight, dim=1), X_mu

    def backward(self, X, targets):
        """Analytic gradient of the loss w.r.t. the raw outputs ``X``.

        Not used by training (autograd is), kept as a reference
        implementation. The result has the same column layout as ``X``:
        sigma, weight, mu.
        """
        sigma, weight, mu = self.activate(X)
        var = sigma**2
        # Reshape targets to a column so they broadcast across components.
        diff = mu - targets.view(-1, 1)
        # Posterior responsibility of each component for each sample.
        gamma = weight * self.gauss(mu, sigma, targets)
        gamma = gamma / torch.sum(gamma, dim=1, keepdim=True)
        # Derivatives with respect to each pre-activation.
        delta_mu = gamma * diff / var
        delta_sigma = gamma * (1 - diff ** 2 / var)
        delta_weight = weight - gamma
        # Concatenate in the same order used by activate().
        return torch.cat([delta_sigma, delta_weight, delta_mu], dim=1)

# Metric

In [None]:
@numba.njit(cache=True)
def crps(y_true, y_pred_cdf):
    """Continuous Ranked Probability Score averaged over plays and yard bins.

    For every play the predicted CDF (199 values, for yards -99..99) is
    compared with the step function that is 1 from the true yardage
    onwards; the squared differences are averaged over all plays and bins.
    """
    n_plays = len(y_pred_cdf)
    acc = 0.
    for play in range(n_plays):
        truth = y_true[play]
        for col in range(199):
            yard = col - 99
            # Heaviside step: 1 once the bin has reached the true yardage.
            step = 1 * (yard - truth >= 0)
            acc += (y_pred_cdf[play][col] - step) ** 2
    return acc / (199 * n_plays)

# Model

In [None]:
class LinearBn(nn.Module):
    """Bias-free linear layer, then BatchNorm1d, then an optional activation.

    Args:
        in_channel: input feature count.
        out_channel: output feature count.
        act: optional callable applied after batch normalisation.
    """

    def __init__(self, in_channel, out_channel, act=None):
        super().__init__()
        self.linear = nn.Linear(in_channel, out_channel, bias=False)
        self.bn = nn.BatchNorm1d(out_channel, eps=1e-05, momentum=0.1)
        self.act = act

    def forward(self, x):
        out = self.linear(x)
        # Apply each optional stage in order, skipping the ones set to None.
        for stage in (self.bn, self.act):
            if stage is not None:
                out = stage(out)
        return out


class NFLNet(nn.Module):
    """Mixture density network predicting a Gaussian mixture over yards.

    Numerical inputs are concatenated with one learned embedding per
    categorical feature, passed through four LinearBn/ReLU stages (with
    dropout between them) and projected to ``output_dim`` values, which are
    split into chunks of ``n_components`` columns: sigma, weight, mu.

    Args:
        numerical_features: names of the numerical input columns.
        cat_features: names of the categorical input columns, in the column
            order of the ``cat_x`` tensor given to ``forward``.
        cat_nuniques: mapping from categorical feature name to the number
            of embedding rows to allocate for it.
        output_dim: width of the output layer; ``forward`` unpacks exactly
            three chunks, so this should be ``3 * n_components``.
        target_mean, target_std: statistics used by ``predict`` to undo
            target normalisation when ``target_norm`` is true.
        n_components: number of mixture components.
        internal_dim: width of the hidden layers.
        n_emb_out: embedding width per categorical feature.
        target_norm: whether the network was trained on a normalised target.
    """

    def __init__(self, numerical_features, cat_features,
                 cat_nuniques, output_dim, target_mean, target_std,
                 n_components=3, internal_dim=128, n_emb_out=4, target_norm=True):
        super().__init__()
        # features
        self.numerical_features = numerical_features
        self.cat_features = cat_features
        self.numerical_input_dim = len(numerical_features)
        self.cat_input_dim = len(cat_features)
        self.cat_nuniques = cat_nuniques
        # components for gmm
        self.n_components = n_components
        # target info
        self.target_mean = target_mean
        self.target_std = target_std
        self.target_norm = target_norm
        # model setting
        input_dim = self.numerical_input_dim + self.cat_input_dim*n_emb_out
        self.process = nn.Sequential(
            LinearBn(input_dim, internal_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            LinearBn(internal_dim, internal_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            LinearBn(internal_dim, internal_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            LinearBn(internal_dim, internal_dim),
            nn.ReLU(inplace=True)
        )
        # ModuleDict (not a plain dict) so the embeddings are registered
        # sub-modules: they appear in parameters() and state_dict() and
        # follow .to(device). With a plain dict they were never trained.
        self.cat_embedding = nn.ModuleDict({
            c: nn.Embedding(int(self.cat_nuniques[c]), n_emb_out)
            for c in self.cat_features
        })
        self.output_layer = nn.Linear(internal_dim, output_dim)
        self.weight_activate = nn.Softmax(dim=1)
        # Yard values at which predict() evaluates the CDF.
        self.pred_x = np.linspace(-99, 99, 199, endpoint=True)

    def forward(self, numerical_x, cat_x):
        """Return ``(sigma, weight, mu)`` tensors, one column per component.

        ``cat_x[:, i]`` must hold the integer codes of ``cat_features[i]``.
        """
        emb_out = [
            self.cat_embedding[c](cat_x[:, i])
            for i, c in enumerate(self.cat_features)
        ]
        # A single concatenation also covers the case of no categorical
        # features, where there is nothing to embed.
        h = torch.cat([numerical_x] + emb_out, dim=1)
        h = self.process(h)
        h = self.output_layer(h)
        X_sigma, X_weight, X_mu = \
            torch.split(h, self.n_components, dim=1)
        # Standard deviation: exponential keeps it strictly positive.
        X_sigma = torch.exp(X_sigma)
        # Mixing coefficients: softmax makes them sum to one.
        X_weight = self.weight_activate(X_weight)
        return X_sigma, X_weight, X_mu

    def predict(self, numerical_x, cat_x):
        """Return the mixture CDF evaluated at ``pred_x`` for each sample.

        The result is a NumPy array with one row per sample and one column
        per entry of ``pred_x``, clipped to [0, 1].
        """
        # model forward
        h_sigma_batch, h_weights_batch, h_mu_batch = \
            self.forward(numerical_x, cat_x)

        # detach() and cpu() make the conversion work for CUDA tensors and
        # when called outside a no-grad context.
        def to_numpy(t):
            return t.detach().cpu().numpy().astype(np.float64)

        sigma = to_numpy(h_sigma_batch)
        weights = to_numpy(h_weights_batch)
        mu = to_numpy(h_mu_batch)
        if self.target_norm:
            # Map the mixture back to the original target scale.
            mu = mu * self.target_std + self.target_mean
            sigma = sigma * self.target_std
        # Broadcast to (sample, component, grid point), then mix components.
        cdf = norm.cdf(
            self.pred_x[None, None, :], mu[:, :, None], sigma[:, :, None])
        pred_arr = (cdf * weights[:, :, None]).sum(axis=1)
        return np.clip(pred_arr, 0., 1.)

# Feature
- https://www.kaggle.com/ryancaldwell/location-eda
- https://www.kaggle.com/coolcoder22/nn-19-features

In [None]:
# Identifier and target columns; run_train() passes this list to NFLDataSet
# as ``except_features``.
EXCEPT_FEATURES = ["PlayId", "GameId", "Yards"]
# NOTE(review): not referenced anywhere in the visible part of this file.
cat_uniques = {}

In [None]:
def create_features(df, deploy=False, outcomes=None):
    """Build one row of play-level features per (GameId, PlayId).

    The input holds one row per player per play. Coordinates and angles of
    plays whose ``PlayDirection`` is 'left' are mirrored, the yard line is
    re-expressed on the 0-120 field axis, and the result combines:

    * rusher features relative to the line of scrimmage,
    * distance statistics of all other players to the rusher,
    * distance statistics of the opposing team to the rusher,
    * the rusher's own tracking values and the play's static attributes,
    * offensive/defensive personnel counts.

    The caller's ``df`` is left untouched; all work happens on a copy.

    Args:
        df: player-level tracking data.
        deploy: when true, the outcome is not merged in.
        outcomes: frame keyed by GameId/PlayId that is inner-joined onto
            the result when ``deploy`` is false.

    Returns:
        A DataFrame with one row per play.

    Raises:
        ValueError: if ``deploy`` is false and ``outcomes`` is None.
    """
    if not deploy and outcomes is None:
        raise ValueError("outcomes must be provided when deploy is False")

    play_keys = ['GameId', 'PlayId']
    dist_aggs = ['min', 'max', 'mean', 'std']

    def rusher_mask(frame):
        # Rows describing the ball carrier of each play.
        return frame['NflId'] == frame['NflIdRusher']

    def mirror_angle(angle, is_left):
        # For left-moving plays use 360 - angle, mapping 360 back to 0.
        flipped = 360.0 - angle
        flipped = flipped.where(flipped != 360.0, 0.0)
        return np.where(is_left, flipped, angle)

    def euclidean_distance(x1, y1, x2, y2):
        return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

    def update_yardline(df):
        rushers = df.loc[
            rusher_mask(df),
            play_keys + ['PossessionTeam', 'FieldPosition', 'YardLine']
        ].copy()
        own_side = rushers['PossessionTeam'] == rushers['FieldPosition']
        rushers['YardLine'] = np.where(
            own_side,
            # offense starting at X = 0 plus the 10 yard endzone plus the
            # line of scrimmage
            10.0 + rushers['YardLine'],
            # half the field plus the yards between midfield and the line
            # of scrimmage
            60.0 + (50 - rushers['YardLine']))
        return rushers[play_keys + ['YardLine']]

    def update_orientation(df, yardline):
        is_left = (df['PlayDirection'] == 'left').to_numpy()
        df['X'] = np.where(is_left, 120.0 - df['X'], df['X'])
        df['Orientation'] = mirror_angle(df['Orientation'], is_left)
        df['Dir'] = mirror_angle(df['Dir'], is_left)

        # Replace the raw yard line by the standardised one.
        df = df.drop('YardLine', axis=1)
        return pd.merge(df, yardline, on=play_keys, how='inner')

    def back_features(df):
        carriers = df.loc[
            rusher_mask(df),
            play_keys + ['NflIdRusher', 'X', 'Y', 'Orientation', 'Dir',
                         'YardLine']
        ].copy()
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        # Flag is 1 when the angle is greater than 180 degrees, else 0.
        carriers['back_oriented_down_field'] = \
            (carriers['Orientation'] > 180.0).astype(int)
        carriers['back_moving_down_field'] = \
            (carriers['Dir'] > 180.0).astype(int)
        carriers = carriers.rename(columns={'X': 'back_X', 'Y': 'back_Y'})
        return carriers[play_keys + [
            'NflIdRusher', 'back_X', 'back_Y', 'back_from_scrimmage',
            'back_oriented_down_field', 'back_moving_down_field']]

    def features_relative_to_back(df, carriers):
        player_distance = pd.merge(
            df[play_keys + ['NflId', 'X', 'Y']],
            carriers, on=play_keys, how='inner')
        not_rusher = \
            player_distance['NflId'] != player_distance['NflIdRusher']
        player_distance = player_distance[not_rusher].copy()
        player_distance['dist_to_back'] = euclidean_distance(
            player_distance['X'], player_distance['Y'],
            player_distance['back_X'], player_distance['back_Y'])

        group_cols = play_keys + [
            'back_from_scrimmage', 'back_oriented_down_field',
            'back_moving_down_field']
        player_distance = player_distance.groupby(group_cols)\
                                         .agg({'dist_to_back': dist_aggs})\
                                         .reset_index()
        player_distance.columns = group_cols + [
            'min_dist', 'max_dist', 'mean_dist', 'std_dist']
        return player_distance

    def defense_features(df):
        rusher = df.loc[rusher_mask(df), play_keys + ['Team', 'X', 'Y']]
        rusher.columns = play_keys + ['RusherTeam', 'RusherX', 'RusherY']

        defense = pd.merge(df, rusher, on=play_keys, how='inner')
        defense = defense.loc[
            defense['Team'] != defense['RusherTeam'],
            play_keys + ['X', 'Y', 'RusherX', 'RusherY']
        ].copy()
        defense['def_dist_to_back'] = euclidean_distance(
            defense['X'], defense['Y'],
            defense['RusherX'], defense['RusherY'])

        defense = defense.groupby(play_keys)\
                         .agg({'def_dist_to_back': dist_aggs})\
                         .reset_index()
        defense.columns = play_keys + [
            'def_min_dist', 'def_max_dist', 'def_mean_dist', 'def_std_dist']
        return defense

    def static_features(df):
        static = df.loc[
            rusher_mask(df),
            play_keys + ['X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir',
                         'YardLine', 'Quarter', 'Down', 'Distance',
                         'DefendersInTheBox']
        ].drop_duplicates()
        # Missing box counts are replaced by the mean over this frame.
        static['DefendersInTheBox'] = static['DefendersInTheBox'].fillna(
            np.mean(static['DefendersInTheBox']))
        return static

    def split_personnel(s):
        return [part.strip() for part in s.split(',')]

    def defense_formation(l):
        dl = 0
        lb = 0
        db = 0
        for position in l:
            sub_string = position.split(' ')
            cnt = int(sub_string[0])
            if sub_string[1] == 'DL':
                dl += cnt
            elif sub_string[1] in ['LB', 'OL']:
                lb += cnt
            else:
                db += cnt
        return (dl, lb, db)

    def offense_formation(l):
        qb = 0
        rb = 0
        wr = 0
        te = 0
        ol = 0

        sub_total = 0
        qb_listed = False
        for position in l:
            sub_string = position.split(' ')
            pos = sub_string[1]
            cnt = int(sub_string[0])
            sub_total += cnt

            if pos == 'QB':
                qb += cnt
                qb_listed = True
            # Assuming LB is a line backer lined up as full back
            elif pos in ['RB', 'LB']:
                rb += cnt
            # Assuming DB is a defensive back and lined up as WR
            elif pos in ['WR', 'DB']:
                wr += cnt
            elif pos == 'TE':
                te += cnt
            # Assuming DL is a defensive lineman lined up as an additional
            # line man
            else:
                ol += cnt

        # If not all 11 players were noted at given positions we need to
        # make some assumptions: if a QB is not listed then there was 1 QB
        # on the play, and every remaining unlisted player is at OL.
        # This might be flawed but it looks like RB, TE and WR are always
        # listed in the personnel.
        if sub_total < 11:
            diff = 11 - sub_total
            if not qb_listed:
                qb += 1
                diff -= 1
            ol += diff

        return (qb, rb, wr, te, ol)

    def personnel_features(df):
        personnel = df[play_keys + ['OffensePersonnel', 'DefensePersonnel']]\
            .drop_duplicates()

        defense_counts = personnel['DefensePersonnel'].apply(
            lambda s: defense_formation(split_personnel(s)))
        for i, col in enumerate(['num_DL', 'num_LB', 'num_DB']):
            personnel[col] = defense_counts.apply(lambda c, i=i: c[i])

        offense_counts = personnel['OffensePersonnel'].apply(
            lambda s: offense_formation(split_personnel(s)))
        for i, col in enumerate(
                ['num_QB', 'num_RB', 'num_WR', 'num_TE', 'num_OL']):
            personnel[col] = offense_counts.apply(lambda c, i=i: c[i])

        # Let's create some features to specify if the OL is covered
        personnel['OL_diff'] = personnel['num_OL'] - personnel['num_DL']
        personnel['OL_TE_diff'] = \
            (personnel['num_OL'] + personnel['num_TE']) - personnel['num_DL']
        # Let's create a feature to specify if the defense is preventing
        # the run: assume 7 or more DL and LB is run prevention
        personnel['run_def'] = \
            (personnel['num_DL'] + personnel['num_LB'] > 6).astype(int)

        return personnel.drop(['OffensePersonnel', 'DefensePersonnel'], axis=1)

    def combine_features(relative_to_back, defense, static, personnel):
        combined = pd.merge(relative_to_back, defense, on=play_keys, how='inner')
        combined = pd.merge(combined, static, on=play_keys, how='inner')
        combined = pd.merge(combined, personnel, on=play_keys, how='inner')
        if not deploy:
            combined = pd.merge(combined, outcomes, on=play_keys, how='inner')
        return combined

    # Work on a copy: the coordinate flips below assign columns in place and
    # must not leak into the caller's frame (a second call on the same frame
    # would otherwise flip the left-moving plays back).
    df = df.copy()
    yardline = update_yardline(df)
    df = update_orientation(df, yardline)
    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    def_feats = defense_features(df)
    static_feats = static_features(df)
    personnel = personnel_features(df)
    return combine_features(rel_back, def_feats, static_feats, personnel)

In [None]:
def process_two(t_):
    """Add the engineered columns fe1, fe5, fe7, fe8, fe10, fe11 to ``t_``.

    The frame is modified in place and also returned. It must contain the
    columns X, Y, S, A, Dis and Dir.
    """
    x = t_['X'].values
    y = t_['Y'].values
    speed = t_['S'].values
    # Assign the raw array: wrapping it in a fresh pd.Series would align on
    # a 0..n-1 index and yield NaN for frames with any other index.
    t_['fe1'] = np.sqrt(np.absolute(np.square(x) - np.square(y)))
    t_['fe5'] = np.square(speed) + 2 * t_['A'].values * t_['Dis'].values  # N
    # The ratio is clipped to arccos' domain [-1, 1].
    t_['fe7'] = np.arccos(np.clip(x / y, -1, 1))  # N
    # Lower-bound the denominator at 0.6 to avoid dividing by ~0.
    t_['fe8'] = speed / np.clip(t_['fe1'].values, 0.6, None)
    radian_angle = (90 - t_['Dir']) * np.pi / 180.0
    t_['fe10'] = np.abs(t_['S'] * np.cos(radian_angle))
    t_['fe11'] = np.abs(t_['S'] * np.sin(radian_angle))
    return t_


In [None]:
def make_train_feature():
    """Build the scaled training feature table.

    Reads the competition's train.csv, derives the play-level features,
    adds the ``process_two`` columns and standardises the selected
    numerical columns.

    Returns:
        ``(X, cat, num, scaler)``: the feature frame, the categorical column
        names, the numerical column names that were scaled, and the fitted
        StandardScaler.
    """
    raw = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})
    yards = raw[['GameId','PlayId','Yards']].drop_duplicates()
    X = create_features(raw, False, outcomes=yards).copy()
    X = process_two(X)

    # Only numerical candidates that are also listed here are kept.
    important = [
        'back_from_scrimmage', 'min_dist', 'max_dist', 'mean_dist',
        'std_dist', 'def_min_dist', 'def_max_dist', 'def_mean_dist',
        'def_std_dist', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir',
        'YardLine',
    ]

    cat = ['back_oriented_down_field', 'back_moving_down_field']

    base_numeric = [
        'back_from_scrimmage', 'min_dist', 'max_dist', 'mean_dist',
        'std_dist', 'def_min_dist', 'def_max_dist', 'def_mean_dist',
        'def_std_dist', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir',
        'YardLine', 'Distance',
    ]
    engineered = ['fe1', 'fe5', 'fe7', 'fe8', 'fe10', 'fe11']
    num = []
    for name in base_numeric + engineered:
        if name in important:
            num.append(name)

    print(len(cat))
    print(len(num))

    scaler = StandardScaler()
    X[num] = scaler.fit_transform(X[num])
    return X, cat, num, scaler

# Train

In [None]:
def run_train(
        train_df, numerical_cols, cat_cols, cat_nuniques,
        target_mean, target_std, validation, fold_idx=None):
    """Train one NFLNet on a train/validation split and return the best model.

    Args:
        train_df: play-level feature table including the "Yards" column.
        numerical_cols: names of the numerical feature columns.
        cat_cols: names of the categorical feature columns.
        cat_nuniques: mapping from categorical column to embedding row count.
        target_mean, target_std: target statistics, used to de-normalise
            predictions and targets when ``config.TARGET_NORM`` is set.
        validation: object whose ``get_split(fold_idx)`` returns
            ``(train_play_ids, valid_play_ids)``.
        fold_idx: fold to train; also used in the checkpoint file name.

    Returns:
        ``(net, best_score)``: the model with the best validation CRPS seen
        (whether or not early stopping fired) and that CRPS. When no
        validation pass ever ran, the last-epoch model and ``None``.
    """
    if fold_idx is not None:
        logging.info(f"======= {fold_idx}th fold training =======")
    trn_play_ids, val_play_ids = validation.get_split(fold_idx)

    def make_loader(play_ids, mode, shuffle):
        dataset = NFLDataSet(
            train_df,
            numerical_features=numerical_cols,
            cat_features=cat_cols,
            play_ids=play_ids,
            mode=mode,
            except_features=EXCEPT_FEATURES
        )
        return DataLoader(
            dataset,
            batch_size=config.BATCH_SIZE,
            shuffle=shuffle,
            num_workers=config.N_WORKERS
        )

    def to_device(numerical_x, cat_x, targets):
        # Cast each batch tensor to the dtype the model/loss expects and
        # move it to the training device.
        return (
            numerical_x.float().to(device),
            cat_x.long().to(device),
            targets.float().to(device),
        )

    train_loader = make_loader(trn_play_ids, "train", shuffle=True)
    valid_loader = make_loader(val_play_ids, "valid", shuffle=False)
    # early stopping
    early_stopping = EarlyStopping(
        patience=config.N_EARLY_STOPPPING_PATIENCE,
        verbose=False)
    # criterion
    criterion = GaussianMixture(config.N_COMPONENTS)
    # model
    net = NFLNet(
        numerical_features=numerical_cols,
        cat_features=cat_cols,
        cat_nuniques=cat_nuniques,
        output_dim=config.N_COMPONENTS*3,
        target_mean=target_mean,
        target_std=target_std,
        n_components=config.N_COMPONENTS,
        n_emb_out=config.N_EMB_OUT,
        target_norm=config.TARGET_NORM
    )
    net.to(device)
    # lr scheduler (constant rate)
    scheduler = NullScheduler(lr=config.LEARNING_RATE)
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, net.parameters()),
        lr=scheduler(0))
    # counter
    iteration = 1
    best_score = None
    # epoch loss
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    num_epochs = config.N_EPOCH
    valid_period = config.VAL_PERIOD
    # log
    logging.info(f"Optimizer\n  {optimizer}")
    logging.info(f"Scheduler\n  {scheduler}")
    logging.info(f"Batchsize\n  {config.BATCH_SIZE}")
    logging.info("** start training here! **")
    logging.info("                      |----- VALID -----|---- TRAIN -----")
    logging.info("rate     iter   epoch |  loss    CRPS   |  loss  | time  ")
    logging.info("---------------------------------------------------------")
    # epoch loop
    for epoch in range(num_epochs+1):
        t_epoch_start = time.time()
        val_pred_list = []
        val_true_list = []
        # train
        lr = scheduler(epoch)
        if lr < 0:
            break
        adjust_learning_rate(optimizer, lr)
        net.train()
        for batch in train_loader:
            numerical_x, cat_x, targets = to_device(*batch)
            optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                y_sigma, y_x_weight, y_mu = net(numerical_x, cat_x)
                loss = criterion(
                    (y_sigma, y_x_weight, y_mu),
                    targets)
                loss.backward()
                optimizer.step()
                epoch_train_loss += loss.item()
                iteration += 1
        ######################
        # valid
        ######################
        # Validate only every ``valid_period`` epochs; the train loss keeps
        # accumulating across skipped epochs.
        if (epoch+1) % valid_period != 0:
            continue
        net.eval()
        for batch in valid_loader:
            numerical_x, cat_x, targets = to_device(*batch)
            with torch.set_grad_enabled(False):
                y_sigma, y_x_weight, y_mu = net(numerical_x, cat_x)
                loss = criterion(
                    (y_sigma, y_x_weight, y_mu),
                    targets)
                epoch_val_loss += loss.item()
                pred = net.predict(numerical_x, cat_x)
                val_pred_list.append(pred)
                # .cpu() first: .numpy() raises on CUDA tensors.
                val_true_list.append(targets.cpu().numpy())
        # valid score
        val_preds = np.concatenate(val_pred_list, axis=0)
        val_true = np.concatenate(val_true_list, axis=0)
        if config.TARGET_NORM:
            val_true = val_true * target_std + target_mean
        val_crps = crps(val_true, val_preds)
        elapsed_time = time.time() - t_epoch_start
        lr_rate = get_learning_rate(optimizer)
        logging.info(
            "{0:1.5f}  {1:4d}    {2:3d}  | {3:4.1f}   {4:1.5f}  {5:4.1f}   {6:4.1f}"
            .format(
                lr_rate,
                iteration,
                epoch,
                epoch_val_loss,
                val_crps,
                epoch_train_loss,
                elapsed_time)
        )
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0
        # early stopping
        model_save_path = \
            os.path.join(config.PROJECT_PATH, f"checkpoint_{fold_idx}.pt")
        early_stopping(val_crps, net, model_save_path)
        if best_score is None or val_crps < best_score:
            best_score = val_crps
        if early_stopping.early_stop:
            best_score = early_stopping.best_score*(-1)
            logging.info("******** Early stopping ********")
            logging.info(f"Best Score: {best_score}")
            break
    # Return the weights that produced ``best_score`` even when training ran
    # to the last epoch; previously the last-epoch weights were returned in
    # that case, which did not match the reported score.
    best_model = early_stopping.get_best_model()
    if best_model is not None:
        net = best_model
    return net, best_score

# Prediction

In [None]:
def prediction(models, numerical_cols, cat_cols, scaler):
    """Predict on the competition test stream with an ensemble and submit.

    For each ``(test_original_df, sample_prediction_df)`` pair yielded by
    ``env.iter_test()``, features are built with ``create_features`` and
    ``process_two``, the numerical columns are scaled, every model's
    ``predict`` output is averaged, and the averaged row is passed to
    ``env.predict``. The submission file is written once the stream ends.

    Args:
        models: Sequence of trained models exposing
            ``predict(numerical_x, cat_x)``; each result must be assignable
            to a row of 199 values.
        numerical_cols: Names of the numerical feature columns.
        cat_cols: Names of the categorical feature columns.
        scaler: Fitted object whose ``transform`` is applied to the
            numerical columns of each test frame.

    Side effects:
        Sets ``pd.options.mode.chained_assignment`` to ``None`` for the whole
        process. When ``config.DEBUG`` is true and at least one test frame
        was processed, writes the concatenated test features to
        ``test_df.csv``.
    """
    env = nflrush.make_env()

    # Process-wide pandas setting: disables the chained-assignment warning.
    pd.options.mode.chained_assignment = None
    test_feats = []
    with torch.set_grad_enabled(False):
        for (test_original_df, sample_prediction_df) in env.iter_test():
            test_df = create_features(test_original_df, deploy=True)
            test_df = process_two(test_df)
            test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])
            cat_x = test_df[cat_cols].values
            numerical_x = test_df[numerical_cols].values
            cat_x = torch.from_numpy(cat_x).to(device).long()
            numerical_x = torch.from_numpy(numerical_x).to(device).float()
            # Only keep the per-step frames when they will actually be
            # dumped; otherwise they would pile up in memory for nothing.
            if config.DEBUG:
                test_feats.append(test_df)
            # One row per model, one column per submission column.
            y_pred = np.zeros([len(models), 199])
            for i, model in enumerate(models):
                y_pred[i, :] = model.predict(numerical_x, cat_x)
            # Ensemble by simple averaging across models.
            y_pred = y_pred.mean(axis=0)
            pred_df = pd.DataFrame(
                data=y_pred.reshape([1, -1]),
                columns=sample_prediction_df.columns)
            env.predict(pred_df)
    env.write_submission_file()
    # pd.concat raises on an empty list, so skip the dump if nothing was seen.
    if config.DEBUG and test_feats:
        all_test_feats = pd.concat(test_feats, axis=0)
        all_test_feats.to_csv("test_df.csv", index=False)

In [None]:
def main():
    """Run the whole pipeline: features, training, CV report, prediction.

    Steps, in order:
      1. Build the training features (timed) and log their sizes.
      2. In DEBUG mode, dump the training frame and column lists to CSV.
      3. Compute ``max + 1`` per categorical column and the mean/std of
         ``Yards``; optionally standardise ``Yards`` in place.
      4. Prepare the validation splits.
      5. Train one model per fold for ``GroupKFold``, otherwise one model.
      6. Log each score and their average.
      7. When running as a kernel, predict on the test stream.
    """
    with timer("make feature..."):
        train_df, cat_cols, numerical_cols, scaler = make_train_feature()

    for message in (
        "train shape: {}".format(train_df.shape),
        "numerical cols: {}".format(len(numerical_cols)),
        "categorical cols: {}".format(len(cat_cols)),
    ):
        logging.info(message)

    if config.DEBUG:
        train_df.to_csv("train_df.csv", index=False)
        for names, csv_name in (
            (cat_cols, "cat_cols.csv"),
            (numerical_cols, "numerical_cols.csv"),
        ):
            pd.Series(names).to_csv(csv_name, index=False)

    # Largest code + 1 for every categorical column.
    cat_nuniques = {c: train_df[c].max() + 1 for c in cat_cols}

    # Target statistics are always computed; they are only applied to the
    # frame when target normalisation is switched on.
    yards = train_df["Yards"]
    target_std = yards.std()
    target_mean = yards.mean()
    if config.TARGET_NORM:
        train_df["Yards"] = (yards - target_mean).values / target_std

    validation = Validation(config)
    validation.make_validation()

    # Arguments shared by every training run.
    shared_kwargs = dict(
        numerical_cols=numerical_cols,
        cat_cols=cat_cols,
        cat_nuniques=cat_nuniques,
        target_mean=target_mean,
        target_std=target_std,
        validation=validation,
    )
    # One run per fold for GroupKFold; otherwise a single run that leaves
    # ``fold_idx`` at run_train's own default.
    if config.VALID_TYPE == "GroupKFold":
        per_run_kwargs = [{"fold_idx": k} for k in range(config.N_FOLDS)]
    else:
        per_run_kwargs = [{}]

    models = []
    cv_scores = []
    for extra in per_run_kwargs:
        model, best_score = run_train(train_df, **shared_kwargs, **extra)
        models.append(model)
        cv_scores.append(best_score)

    for i, score in enumerate(cv_scores):
        logging.info(f"Fold {i}: {score}")
    logging.info("Average Score: {}".format(np.mean(cv_scores)))

    if config.IS_KERNEL:
        prediction(models, numerical_cols, cat_cols, scaler)

In [None]:
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()