In [None]:
# Imports

# Standard library
import collections
import gc
import pickle
import glob
import re
import string
import copy
import time
import os
import random

# Specific imports from standard library
from bisect import bisect
from collections import Counter
from functools import partial

# Basic imports
import numpy as np
import pandas as pd

# Scikit-learn
import sklearn
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline

# SciPy
from scipy.spatial.distance import squareform, pdist
from scipy.spatial import ConvexHull

# LightGBM
import lightgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor, LGBMClassifier

# Graphs
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# HyperOpt
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

# Progress bar
from tqdm.auto import tqdm
tqdm.pandas()

# Keras for NNs
import keras
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, BatchNormalization, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils.vis_utils import plot_model

In [None]:
# Advanced settings for graphs

# Seaborn advanced
sns.set(style='ticks',          # 'ticks', 'darkgrid'
        palette='colorblind',   # 'colorblind', 'pastel', 'muted', 'bright'
        #palette=sns.color_palette('Accent'),   # 'Set1', 'Set2', 'Dark2', 'Accent'
        rc = {
           'figure.autolayout': True,   # Automatically adjust the plot size so that it fits into the figure
           'figure.figsize': (10, 6),    # Figure size - width, height (in inches)
           'legend.frameon': True,      # Frame around the legend
           'patch.linewidth': 2.0,      # Width of the line around the frame
           'lines.markersize': 6,       # Marker size
           'lines.linewidth': 2.0,      # Line thickness
           'font.size': 20,             # Size of the values on the axes
           'legend.fontsize': 20,       # Size of the legend text
           'axes.labelsize': 16,        # Size of the axis labels
           'axes.titlesize': 22,        # Size of the title
           'axes.grid': True,           # Grid
           'grid.color': '0.9',         # Lightness of the grid lines - 1 = white, 0 = black
           'grid.linestyle': '-',       # Dash style of the grid lines
           'grid.linewidth': 1.0,       # Thickness of the grid lines
           'xtick.labelsize': 20,       # Size of the tick labels on the x axis
           'ytick.labelsize': 20,       # Size of the tick labels on the y axis
           'xtick.major.size': 8,       # Length of the tick marks on the x axis
           'ytick.major.size': 8,       # Length of the tick marks on the y axis
           'xtick.major.pad': 10.0,     # Distance of the tick labels from the x axis
           'ytick.major.pad': 10.0,     # Distance of the tick labels from the y axis
           }
       )
# Default colormap for image-like plots
plt.rcParams['image.cmap'] = 'viridis'

In [None]:
# Set random seeds
# Note this is not 100 % reliable, starting weights still differ

from tensorflow import set_random_seed

seed = 312


def set_seeds(seed):
    """
    Seed every random number generator used in this notebook.

    Sets the PYTHONHASHSEED environment variable and seeds the ``random``
    module, NumPy and TensorFlow with the same value.

    Parameters
    ==========
    - seed
        Integer used for all generators
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so this presumably only influences child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, set_random_seed):
        seeder(seed)


set_seeds(seed)

In [None]:
# Helper functions

def is_in_hull(points, hull):
    """
    Determine which of the given points lie inside the convex hull.

    Every row of ``hull.equations`` is split into coefficients ``A`` (all but
    the last column) and an offset ``b`` (the last column). A point ``x`` is
    reported as inside when ``A @ x <= -b`` holds for every row.

    Parameters
    ==========
    - points
        Sequence of points, one point per row
    - hull
        Object with an ``equations`` array (e.g. scipy.spatial.ConvexHull)

    Returns
    =======
    numpy.ndarray
        Boolean array with one entry per point, True means that the point
        is inside the convex hull.
    """
    normals = hull.equations[:, :-1]
    offsets = hull.equations[:, -1]
    # Rows correspond to facets, columns to points.
    facet_values = normals @ np.transpose(points)
    # Broadcasting the offsets as a column compares every point with every facet.
    return np.all(facet_values <= -offsets[:, np.newaxis], axis=0)

# Preprocessing and FE

The dataset we are given is a combination of a dataset of plays and a dataset of players. 
The description of all its features is [here](https://www.kaggle.com/c/nfl-big-data-bowl-2020/data).

Let's first split the dataset, so we can work with both datasets separately.

I have also decided to remove certain features; e.g. wind direction is useless, as I have written [here](https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112050#645868). Also, I don't find the orientation of players (i.e. the direction in which they are looking) to be a reliable feature, because it was measured differently in different seasons and it can also change very quickly, unlike the direction in which they are running (as they have momentum).

In [None]:
%%time
# Load the competition training data; presumably one row per player per play
# (the later code assumes 22 rows for every PlayId) - verify against the data description.
players = pd.read_csv("../input/nfl-big-data-bowl-2020/train.csv")

In [None]:
# Define columns we wish to keep in the DataFrame of plays
basic_plays_columns = ["GameId", "PlayId", "Season", "YardLine", "Quarter", "GameClock", "PossessionTeam", "Down", 
        "Distance", "FieldPosition", "HomeScoreBeforePlay", "VisitorScoreBeforePlay", "NflIdRusher", 
        "OffenseFormation", "OffensePersonnel", "DefendersInTheBox", "DefensePersonnel", "PlayDirection",
        "TimeHandoff", "TimeSnap", "Yards", "HomeTeamAbbr", "VisitorTeamAbbr"]
# Define columns we wish to keep in the DataFrame of players
basic_players_columns = ["GameId", "PlayId", "NflId", "Team", "X", "Y", "S", "A", "Dis", "Dir", "Orientation", "DisplayName",
        "JerseyNumber", "PlayerHeight", "PlayerWeight", "PlayerBirthDate", "PlayerCollegeName", "Position",
        "HomeTeamAbbr", "VisitorTeamAbbr", "PossessionTeam", "FieldPosition", "PlayDirection", "NflIdRusher"]

# Selecting a list of columns already produces a new DataFrame, so the whole
# table does not have to be deep-copied first. Dropping duplicated rows is
# presumably what collapses the per-player rows into one row per play.
plays = players[basic_plays_columns].drop_duplicates()
players = players[basic_players_columns]

Now, there are three types of functions we can use to modify datasets:
1. Functions changing the DataFrame of players
2. Aggregations - Functions which take in DataFrame of players and return data about plays.
3. Functions changing the DataFrame of plays

## Dataset functions

Here, we define functions for modifying DataFrames. The functions are divided into three cells, following the order of the preceding list.

In [None]:
# Functions changing dataframe of players

def standardize_player_data(df):
    """
    Convert player measurements to SI units and derive age and weight features.

    The DataFrame is modified in place and also returned.

    Modified features
    =================
    - PlayerWeight
        Multiplied by 0.45359237 (pounds to kilograms)
    - PlayerHeight
        Parsed from a "feet-inches" string and converted to centimeters
    - PlayerBirthDate
        Transformed to datetime

    New features
    ============
    - PlayerAge
        Age in years on 2019-10-10 (number of days divided by 365)
    - PlayerBMI
        Weight in kilograms divided by squared height in meters
    - PlayerObesityClassification
        BMI category. Note that the majority of players are overweight
        according to the standard scale, the average for NFL players is different.
    """
    # Weight in kilograms
    df["PlayerWeight"] = df["PlayerWeight"] * 0.45359237

    # Height: "feet-inches" -> centimeters
    feet_and_inches = df["PlayerHeight"].str.split("-", expand=True)
    feet = feet_and_inches[0].astype(float)
    inches = feet_and_inches[1].astype(float)
    df["PlayerHeight"] = feet * 30.48 + inches * 2.54

    # Birth date as datetime and age in years relative to a fixed reference date
    df["PlayerBirthDate"] = pd.to_datetime(df['PlayerBirthDate'])
    age_in_days = (pd.Timestamp('20191010') - df["PlayerBirthDate"]).dt.days
    df["PlayerAge"] = age_in_days / 365

    # Body mass index needs the height in meters, hence the division by 100
    height_in_meters = df["PlayerHeight"] / 100
    df["PlayerBMI"] = df["PlayerWeight"] / height_in_meters**2

    bmi_edges = [0, 18.5, 25, 30, 35, 40, 100]
    bmi_labels = ["Underweight", "Normal weight", "Pre-obesity",
                  "Obesity class 1", "Obesity class 2", "Obesity class 3"]
    df["PlayerObesityClassification"] = pd.cut(df["PlayerBMI"], bins=bmi_edges, labels=bmi_labels)
    return df

def get_team_abbreviations(df):
    """
    Attach the abbreviation of his own team to each player.

    Rows with Team equal to 'home' take the value of HomeTeamAbbr,
    rows with Team equal to 'away' take the value of VisitorTeamAbbr.
    The DataFrame is modified in place and also returned.

    New features
    ============
    - TeamAbbr
    """
    sources = (("home", "HomeTeamAbbr"), ("away", "VisitorTeamAbbr"))
    for side, abbreviation_column in sources:
        plays_for_side = df["Team"] == side
        df.loc[plays_for_side, "TeamAbbr"] = df.loc[plays_for_side, abbreviation_column]
    return df

def unify_team_abbreviations(df):
    """
    Unify team abbreviations, because different columns spell some teams differently.

    The spellings ARZ, BLT, CLV and HST are rewritten to ARI, BAL, CLE and HOU.
    The DataFrame is modified in place and also returned.

    Modified features
    =================
    - PossessionTeam
    - FieldPosition
    """
    canonical = {"ARZ": "ARI", "BLT": "BAL", "CLV": "CLE", "HST": "HOU"}
    for column in ("PossessionTeam", "FieldPosition"):
        for alias, abbreviation in canonical.items():
            df.loc[df[column] == alias, column] = abbreviation
    return df

def determine_offense_defense(df):
    """
    Label every player as belonging to the offense or to the defense.

    A player is in the offense when the abbreviation of his team
    (HomeTeamAbbr for 'home', VisitorTeamAbbr for 'away') equals
    PossessionTeam, and in the defense when it differs.
    The DataFrame is modified in place and also returned.

    New features
    ============
    - OffDef
        Whether a player's team is in offense or defense
    """
    home = df["Team"] == "home"
    away = df["Team"] == "away"
    possession = df["PossessionTeam"]

    has_ball = (home & (df["HomeTeamAbbr"] == possession)) | (away & (df["VisitorTeamAbbr"] == possession))
    df.loc[has_ball, "OffDef"] = "offense"

    lacks_ball = (home & (df["HomeTeamAbbr"] != possession)) | (away & (df["VisitorTeamAbbr"] != possession))
    df.loc[lacks_ball, "OffDef"] = "defense"
    return df

def normalize_coordinates_and_directions(plays):
    """
    Normalize coordinates so that the offense is always moving to the right.

    Rows whose PlayDirection is not "right" are mirrored: X becomes 120 - X,
    Y becomes 160/3 - Y and Dir is rotated by 180 degrees. Flipping only X
    would not be enough, because offensive players could then still be
    moving left. Afterwards 10 is subtracted from X in every row.
    Missing Dir values are replaced by 90 for the offense and 270 for the defense.
    The DataFrame is modified in place and also returned.

    Modified features
    =================
    - X
    - Y
    - Dir

    New features
    ============
    - M
        Manhattan distance from bottom left corner of the field, sum of X and Y
    """
    mirrored = plays["PlayDirection"] != "right"

    # Mirror where needed, then shift the origin by 10 in every row.
    plays["X"] = plays["X"].mask(mirrored, 120 - plays["X"]) - 10
    plays["Y"] = plays["Y"].mask(mirrored, 160/3 - plays["Y"])
    plays["M"] = plays["X"] + plays["Y"]

    # A mirrored field also turns every direction around.
    plays["Dir"] = plays["Dir"].mask(mirrored, (plays["Dir"] + 180) % 360)

    # Players without a direction get a default depending on their side.
    dir_missing = plays["Dir"].isnull()
    for side, default_direction in (("offense", 90), ("defense", 270)):
        plays.loc[dir_missing & (plays["OffDef"] == side), "Dir"] = default_direction
    return plays

def compute_horizontal_and_vertical_speeds(players):
    """
    Compute projection of speed to X and Y.

    Dir is handled in degrees elsewhere in this file (it is wrapped with
    ``% 360`` and defaulted to 90/270), while ``np.sin``/``np.cos`` expect
    radians, so the angle is converted before projecting. With this
    convention Dir = 90 gives a purely horizontal speed and Dir = 0 a
    purely vertical one.
    The DataFrame is modified in place and also returned.

    New features
    ============
    - S_horizontal
        S * sin(Dir)
    - S_vertical
        S * cos(Dir)
    """
    direction_in_radians = np.radians(players["Dir"])
    players["S_horizontal"] = np.sin(direction_in_radians) * players["S"]
    players["S_vertical"] = np.cos(direction_in_radians) * players["S"]
    return players

def compute_distance_to_rusher(df):
    """
    Compute how far every player is from the rusher of the same play.

    The rusher is the row whose NflId equals NflIdRusher; his position is
    joined to all rows with the same PlayId. A new DataFrame is returned.

    New features
    ============
    - dXtoRusher
        Rusher's X minus player's X
    - dYtoRusher
        Rusher's Y minus player's Y
    - DistanceToRusher
        Euclidean distance of a player from the rusher
    """
    is_rusher = df["NflIdRusher"] == df["NflId"]
    rusher_positions = df.loc[is_rusher, ["PlayId", "X", "Y"]]
    merged = df.merge(rusher_positions, how="left", on="PlayId", suffixes=("", "_rusher"))

    merged["dXtoRusher"] = merged["X_rusher"] - merged["X"]
    merged["dYtoRusher"] = merged["Y_rusher"] - merged["Y"]
    merged["DistanceToRusher"] = np.sqrt(merged["dXtoRusher"]**2 + merged["dYtoRusher"]**2)

    # The joined helper columns are not needed any more.
    return merged.drop(["X_rusher", "Y_rusher"], axis=1)

def compare_speed_with_rusher(df):
    """
    Relate the speed of the rusher to the speed of every player.

    The rusher is the row whose NflId equals NflIdRusher; his speed is
    joined to all rows with the same PlayId. A new DataFrame is returned.

    New features
    ============
    - RusherSpeedFactorToPlayer
        Rusher's S divided by player's S, i.e. how many times faster
        the rusher is running than the player
    """
    is_rusher = df["NflIdRusher"] == df["NflId"]
    rusher_speeds = df.loc[is_rusher, ["PlayId", "S"]]
    merged = df.merge(rusher_speeds, how="left", on="PlayId", suffixes=("", "_rusher"))
    merged["RusherSpeedFactorToPlayer"] = merged["S_rusher"] / merged["S"]
    return merged.drop(["S_rusher"], axis=1)

def compute_minimum_tackle_time(players):
    """
    Compute the minimum tackle time if the rusher stayed still.

    The time is the player's distance to the rusher divided by the
    player's own speed. The DataFrame is modified in place and also returned.

    New features
    ============
    - MinTackleTime-Basic
        DistanceToRusher / S
    """
    # The speed has to come from the same DataFrame as the distance.
    players["MinTackleTime-Basic"] = players["DistanceToRusher"] / players["S"]
    return players

def compute_distances_to_opponents(players, N_closest_opponents=3, N_closest_teammates=3, N_PCA=3):
    """
    Compute distances between all players of a play and select some of them as features.

    For every play the full pairwise distance matrix is built with NumPy
    broadcasting. You can set how many distances to opponents/teammates to save.
    Distances are taken in ascending order, so the first saved value is always
    the distance to the closest opponent/teammate.
    You can also save PCA components of the distance matrix.
    The approach was taken from CPMP's kernel, which is referenced below;
    there was a bug in his kernel and I am not sure it has been fixed yet.
    His computation works for an arbitrary number of dimensions - in his
    former kernel it is used for distances of atoms in a molecule in 3D,
    so this use is easier by one dimension.

    The code hard-codes 22 rows per play, split into two blocks of 11 rows.
    The returned DataFrame is sorted by GameId, PlayId, OffDef (descending)
    and DistanceToRusher.

    New features
    ============
    - ClosestEnemy-{i+1}th
    - ClosestMate-{i+1}th
    - DistancePCA-{i+1}th

    Parameters
    ==========
    - N_closest_opponents
        How many distances to closest opponents to save
    - N_closest_teammates
        How many distances to closest mates to save
    - N_PCA
        How many PCA components of the distance matrix to save

    References
    ==========
    - https://www.kaggle.com/cpmpml/ultra-fast-distance-matrix-computation (note there was a bug and I am not sure if it was fixed yet)
    """
    # OffDef is sorted in descending order, so "offense" rows come before "defense" rows within a play.
    players = players.sort_values(['GameId','PlayId', "OffDef", "DistanceToRusher"], ascending=[True, True, False, True])
    # One row per player; columns are [opponent distances | teammate distances | PCA components].
    values = np.zeros((len(players), N_closest_opponents+N_closest_teammates+N_PCA))
    xy = players[["X", "Y"]].values
    # Cumulative group sizes give the row offset at which every play ends.
    # NOTE(review): groupby orders groups by PlayId, so this presumably relies on the
    # (GameId, PlayId) sort above yielding the same order of plays - confirm.
    ss = players.groupby("PlayId").size()
    ss = ss.cumsum()
    ssx = np.zeros(len(ss) + 1, 'int')
    ssx[1:] = ss
    for idx in range(players["PlayId"].nunique()):
        start_player, end_player = ssx[idx], ssx[idx+1]
        locs = xy[start_player:end_player]
        # Subtracting the tiled coordinates from their transpose yields all pairwise
        # coordinate differences; summing squares over axis 1 gives squared distances.
        loc_tile = np.tile(locs.T, (22,1,1))
        dist_mat = np.sqrt(((loc_tile - loc_tile.T)**2).sum(axis=1))
        pca = PCA(n_components=N_PCA)
        principal_components = pca.fit_transform(dist_mat)
        for i in range(N_closest_opponents):
            # Off-diagonal blocks hold distances between the first 11 and the last 11 rows;
            # np.partition places the i-th smallest value of every row at position i.
            closest_defenders = np.partition(dist_mat[:11, 11:], i, axis=1)[:, i]
            closest_offenders = np.partition(dist_mat[11:, :11], i, axis=1)[:, i]
            values[start_player:end_player-11, i] = closest_defenders # to offenders
            values[start_player+11:end_player, i] = closest_offenders # to defenders
        for i in range(N_closest_teammates):
            # Diagonal blocks hold distances within the same group of 11 rows; index i+1 skips
            # the smallest entry, which is the zero distance of a player to himself.
            # NOTE(review): given the sort order, the "def"/"off" parts of these two names look
            # swapped (rows :11 are offense), but every value is written back to its own rows.
            closest_def_teammates = np.partition(dist_mat[:11, :11], i+1, axis=1)[:, i+1]
            closest_off_teammates = np.partition(dist_mat[11:, 11:], i+1, axis=1)[:, i+1]
            values[start_player:end_player-11, N_closest_opponents+i] = closest_def_teammates # to offenders
            values[start_player+11:end_player, N_closest_opponents+i] = closest_off_teammates # to defenders
        values[start_player:end_player, (N_closest_opponents+N_closest_teammates):] = principal_components
    # Copy the collected values into named columns of the sorted DataFrame (positional assignment).
    for i in range(N_closest_opponents):
        players[f"ClosestEnemy-{i+1}th"] = values[:, i]
    for i in range(N_closest_teammates):
        players[f"ClosestMate-{i+1}th"] = values[:, N_closest_opponents+i]
    for i in range(N_PCA):
        players[f"DistancePCA-{i+1}th"] = values[:, N_closest_opponents+N_closest_teammates+i]
    return players
            
def add_player_numbers(df):
    """
    Sort players according to Manhattan distance and label them.

    Rows are sorted by GameId, PlayId, OffDef and M (all ascending) and
    numbered 0-21, repeating every 22 rows. Defenders and offenders are
    additionally numbered 0-10 in the same order. The code assumes that
    every play consists of exactly 22 rows, 11 per side.
    A new, sorted DataFrame is returned.

    New features
    ============
    - PlayerNumber
    - DefenderNumber
        Empty for offensive players
    - OffenderNumber
        Empty for defensive players
    """
    df = df.sort_values(['GameId','PlayId', "OffDef", "M"], ascending=True)
    play_count = int(len(df) / 22)

    df["PlayerNumber"] = list(range(22)) * play_count

    numbers_within_side = list(range(11)) * play_count
    for side, column in (("defense", "DefenderNumber"), ("offense", "OffenderNumber")):
        df.loc[df["OffDef"] == side, column] = numbers_within_side
    return df

In [None]:
# Aggregations - Functions which take in data about players and return data about plays.

# Note that new features in this section may have been already calculated in the previous section,
# but these are new features for the DataFrame of plays. Remember these functions are aggregations.

def get_who_is_offense(players):
    """
    For each play find out which team is in offense and which in defense.

    Only the rusher's row of every play (NflId equal to NflIdRusher) is
    inspected. These new features are not used for ML,
    but in other feature-generating functions.

    New features
    ============
    - HomeIsOffense
    - VisitorIsDefense
    - HomeIsDefense
    - VisitorIsOffense
    """
    is_rusher = players["NflIdRusher"] == players["NflId"]
    new = players.loc[is_rusher, ["PlayId", "Team", "OffDef"]]

    home = new["Team"] == "home"
    away = new["Team"] == "away"
    attacking = new["OffDef"] == "offense"
    defending = new["OffDef"] == "defense"
    home_attacks = (home & attacking) | (away & defending)
    visitor_attacks = (away & attacking) | (home & defending)

    # The first pass creates the columns, the second one fills the remaining rows.
    for selected, home_is_offense in ((home_attacks, True), (visitor_attacks, False)):
        new.loc[selected, "HomeIsOffense"] = home_is_offense
        new.loc[selected, "VisitorIsDefense"] = home_is_offense
        new.loc[selected, "HomeIsDefense"] = not home_is_offense
        new.loc[selected, "VisitorIsOffense"] = not home_is_offense

    return new.drop(["Team", "OffDef"], axis=1)

def get_circles(df):
    """
    Draw circles around the rusher and count opponents/teammates inside each circle.

    Circles have radii 1, 2, ..., 20. A player is counted when his
    DistanceToRusher is strictly smaller than the radius.

    New features
    ============
    - DefendersInCircleAroundRusher-{i}
        i is for the circle radius
    - OffendersInCircleAroundRusher-{i}
        i is for the circle radius
    """
    def distances_per_play(number_column):
        # One row per play, one column per numbered player of the given side.
        subset = df[["PlayId", number_column, "DistanceToRusher"]].dropna()
        return subset.pivot(index="PlayId", columns=number_column, values="DistanceToRusher")

    radii = range(1, 21)

    defender_distances = distances_per_play("DefenderNumber")
    new = pd.DataFrame({"PlayId": defender_distances.index})
    for radius in radii:
        inside = defender_distances < radius
        new[f"DefendersInCircleAroundRusher-{radius}"] = inside.sum(axis=1).values

    offender_distances = distances_per_play("OffenderNumber")
    for radius in radii:
        inside = offender_distances < radius
        new[f"OffendersInCircleAroundRusher-{radius}"] = inside.sum(axis=1).values
    return new

def get_rusher_info(players):
    """
    Save information about rusher from the DataFrame of players to the DataFrame of plays.

    The rusher's row of every play (NflId equal to NflIdRusher) is selected
    and its columns are renamed with the "Rusher-" prefix; PlayId keeps its name.

    New features
    ============
    - Rusher-MainRole
    - Rusher-Height
    - Rusher-Weight
    - Rusher-BMI
    - Rusher-Speed
    - Rusher-Acceleration
    - Rusher-JerseyNumber
    - Rusher-X
    - Rusher-Y
    - Rusher-S_vertical
    - Rusher-S_horizontal
    - Rusher-PlayerAge
    """
    source_columns = ["PlayId", "Position", "PlayerHeight", "PlayerWeight", "PlayerBMI", "S", "A", "JerseyNumber",
                      "X", "Y", "S_vertical", "S_horizontal", "PlayerAge"]
    # Columns which get a friendlier name before the prefix is attached.
    friendly_names = {"Position": "MainRole", "PlayerHeight": "Height", "PlayerWeight": "Weight",
                      "PlayerBMI": "BMI", "S": "Speed", "A": "Acceleration"}

    is_rusher = players["NflId"] == players["NflIdRusher"]
    rusher_rows = players.loc[is_rusher, source_columns]

    final_names = {column: "Rusher-" + friendly_names.get(column, column)
                   for column in source_columns if column != "PlayId"}
    return rusher_rows.rename(columns=final_names)

def get_players_features(players):
    """
    Save information about all players (including rusher)
    from DataFrame of players to the DataFrame of plays.

    Every listed feature is pivoted so that each play becomes one row
    and each PlayerNumber one column named "{feature}_{PlayerNumber}".

    In the feature descriptions below, i is the number of the
    current player and j is the rank of the j-th closest player
    fulfilling the condition or of the j-th PCA value (j = 1, 2, 3).

    New features
    ============
    - X_{i}
    - Y_{i}
    - DistanceToRusher_{i}
    - S_{i}
    - A_{i}
    - Dir_{i}
    - JerseyNumber_{i}
    - PlayerWeight_{i}
    - PlayerHeight_{i}
    - PlayerBMI_{i}
    - S_vertical_{i}
    - S_horizontal_{i}
    - PlayerAge_{i}
    - ClosestEnemy-{j}th_{i}
    - ClosestMate-{j}th_{i}
    - DistancePCA-{j}th_{i}
    """
    basic_features = ["X", "Y", "DistanceToRusher", "S", "A", "Dir", "JerseyNumber", "PlayerWeight",
                      "PlayerHeight", "PlayerBMI", "S_vertical", "S_horizontal", "PlayerAge"]
    ranked_features = [f"{family}-{rank}th"
                       for family in ("ClosestEnemy", "ClosestMate", "DistancePCA")
                       for rank in (1, 2, 3)]

    new = pd.DataFrame({"PlayId": players["PlayId"]}).drop_duplicates()
    for feature in basic_features + ranked_features:
        wide = players.pivot(index="PlayId", columns="PlayerNumber", values=feature)
        wide = wide.add_prefix(f"{feature}_")
        new = new.merge(wide, how="left", left_on="PlayId", right_index=True)
    return new

def get_player_types_numbers(players):
    """
    Get information about how many players of each type are playing on each side.

    NFL players have roles in every play.
    Their roles (types) are listed e.g. here:
    https://en.wikipedia.org/wiki/American_football_positions

    Positions are counted per play separately for the offense and the
    defense. Roles from the fixed list below which do not occur in the
    data get a column filled with zeros. All returned columns
    (including PlayId) are cast to int.

    New features
    ============
    - OffensePlayerTypes-{type}
    - DefensePlayerTypes-{type}
    """
    player_roles = ['CB', 'WR', 'G', 'T', 'DE', 'DT', 'OLB', 'TE',
                    'FS', 'C', 'RB', 'QB', 'SS', 'ILB', 'MLB', 'NT', 'LB',
                    'OT', 'FB', 'OG', 'DB', 'S', 'HB', 'SAF', 'DL']

    def count_roles(side, prefix):
        # Rows are plays, columns are positions, cells are counts.
        on_side = players.loc[players["OffDef"] == side]
        counts = on_side.groupby("PlayId")["Position"].value_counts().unstack().fillna(0)
        # Make sure every known role has a column, even if nobody plays it.
        for role in player_roles:
            if role not in counts.columns:
                counts[role] = 0
        return counts.add_prefix(prefix)

    new = pd.DataFrame({"PlayId": players["PlayId"]}).drop_duplicates()
    new = pd.merge(new, count_roles("offense", "OffensePlayerTypes-"), how="left", left_on="PlayId", right_index=True)
    new = pd.merge(new, count_roles("defense", "DefensePlayerTypes-"), how="left", left_on="PlayId", right_index=True)
    return new.astype(int)

def get_advanced_distance_features(players):
    """
    Save information connected with convex hulls.

    Offensive/defensive convex hull in this case is the smallest area
    if you connect all offending/defending players and leave out players
    which are inside the area.
    The first row of every play is left out of the offensive hull. Rows are
    sorted with the offense first and by DistanceToRusher, so this row is
    presumably the rusher - i.e. the rusher is not counted as an edge of
    the offensive convex hull.

    The code hard-codes 11 offensive rows followed by the defensive rows
    for every play.

    For 2-dimensional input ``scipy.spatial.ConvexHull.area`` is the
    perimeter of the hull and ``.volume`` is its area, therefore
    ``.volume`` is stored in the area features.

    New features
    ============
    - OffensiveHull-area
        Area of convex hull of offending team
    - RusherInOffenseHull
        Whether rusher is in convex hull of his team
    - DefendersInOffenseHull
        How many defenders are in the offending convex hull
    - DefensiveHull-area
        Area of convex hull of defending team
    - RusherInDefenseHull
        Whether rusher is in convex hull of opposing team
    - OffendersInDefenseHull
        How many offenders are inside the defending convex hull
    """
    # OffDef is sorted in descending order, so "offense" rows come first within a play.
    players = players.sort_values(['GameId','PlayId', "OffDef", "DistanceToRusher"], ascending=[True, True, False, True])
    ids = players[["PlayId"]].values
    xy = players[["X", "Y"]].values
    # Cumulative group sizes give the row offset at which every play ends.
    ss = players.groupby("PlayId").size()
    ss = ss.cumsum()
    ssx = np.zeros(len(ss) + 1, 'int')
    ssx[1:] = ss
    data = {"PlayId":[],
            "OffensiveHull-area":[], "RusherInOffenseHull":[], "DefendersInOffenseHull":[],
            "DefensiveHull-area":[], "RusherInDefenseHull":[], "OffendersInDefenseHull":[]}
    for idx in range(players["PlayId"].nunique()):
        start_player, end_player = ssx[idx], ssx[idx+1]
        locs = xy[start_player:end_player]
        # Offensive hull is built without the first row of the play.
        off_hull = ConvexHull(locs[1:11])
        rusher_in_offense_hull = is_in_hull(locs[:1], off_hull)
        defenders_in_offense_hull = is_in_hull(locs[11:], off_hull)
        def_hull = ConvexHull(locs[11:])
        rusher_in_defense_hull = is_in_hull(locs[:1], def_hull)
        offenders_in_defense_hull = is_in_hull(locs[:11], def_hull)
        data["PlayId"].append(ids[start_player][0])
        # In 2D, .volume is the enclosed area (.area would be the perimeter).
        data["OffensiveHull-area"].append(off_hull.volume)
        data["RusherInOffenseHull"].append(rusher_in_offense_hull.sum())
        data["DefendersInOffenseHull"].append(defenders_in_offense_hull.sum())
        data["DefensiveHull-area"].append(def_hull.volume)
        data["RusherInDefenseHull"].append(rusher_in_defense_hull.sum())
        data["OffendersInDefenseHull"].append(offenders_in_defense_hull.sum())
    new = pd.DataFrame(data)
    return new

def compute_centroids(players):
    """
    Get average position for every team and extract features.

    In this function we treat a team as a single point or rather
    a distribution which is spread a bit.
    Spread stands for the difference between maximal and minimal position.

    Statistics are computed per play and attached to the result by PlayId,
    so the outcome does not depend on the order of rows in ``players``.
    The result has one row per play, in order of first appearance.

    New features
    ============
    - Centroid-Offense-X-mean
    - Centroid-Offense-X-std
    - Centroid-Offense-X-spread
    - Centroid-Offense-Y-mean
    - Centroid-Offense-Y-std
    - Centroid-Offense-Y-spread
    - Centroid-Defense-X-mean
    - Centroid-Defense-X-std
    - Centroid-Defense-X-spread
    - Centroid-Defense-Y-mean
    - Centroid-Defense-Y-std
    - Centroid-Defense-Y-spread
    """
    new = pd.DataFrame({"PlayId":players["PlayId"]}).drop_duplicates()
    for side in ("offense", "defense"):
        grouped = players.loc[players["OffDef"] == side].groupby("PlayId")
        for axis in ("X", "Y"):
            coordinates = grouped[axis]
            statistics = {
                "mean": coordinates.mean(),
                "std": coordinates.std(),
                "spread": coordinates.max() - coordinates.min(),
            }
            for statistic, per_play in statistics.items():
                # groupby results are indexed (and sorted) by PlayId, so they are looked up
                # by PlayId instead of being assigned positionally.
                new[f"Centroid-{side.capitalize()}-{axis}-{statistic}"] = new["PlayId"].map(per_play)
    return new

def get_global_rusher_features(players):
    """
    Compute minimal distance between rusher and defense team.

    The value is the ClosestEnemy-1th feature of the rusher's row
    (NflId equal to NflIdRusher). It is attached to every play by PlayId,
    so it does not matter at which position within a play the rusher's row is.

    New features
    ============
    - Defense-MinDistanceToRusher
    """
    new = pd.DataFrame({"PlayId":players["PlayId"]}).drop_duplicates()
    rushers = players.loc[players["NflIdRusher"]==players["NflId"], ["PlayId", "ClosestEnemy-1th"]]
    # Look the value up by PlayId; plain assignment would align on the row index,
    # which only matches when the rusher is the first row of his play.
    closest_enemy_by_play = rushers.set_index("PlayId")["ClosestEnemy-1th"]
    new["Defense-MinDistanceToRusher"] = new["PlayId"].map(closest_enemy_by_play)
    return new

def rusher_vs_groups(players, plays):
    """
    Get distance of rusher to team centroids.

    The rusher's position (row with NflId equal to NflIdRusher) is joined
    to ``plays`` by PlayId and compared with the centroid columns
    Centroid-{Offense,Defense}-{X,Y}-mean, which have to be present in ``plays``.
    A new DataFrame of plays is returned.

    New features
    ============
    - Rusher-ToOffenseCentroid-X
    - Rusher-ToOffenseCentroid-Y
    - Rusher-ToOffenseCentroid-Distance
    - Rusher-ToDefenseCentroid-X
    - Rusher-ToDefenseCentroid-Y
    - Rusher-ToDefenseCentroid-Distance
    """
    is_rusher = players["NflIdRusher"] == players["NflId"]
    rusher_positions = players.loc[is_rusher, ["PlayId", "X", "Y"]]
    plays = plays.merge(rusher_positions, how="left", on="PlayId")

    for side in ("Offense", "Defense"):
        delta_x = f"Rusher-To{side}Centroid-X"
        delta_y = f"Rusher-To{side}Centroid-Y"
        plays[delta_x] = plays["X"] - plays[f"Centroid-{side}-X-mean"]
        plays[delta_y] = plays["Y"] - plays[f"Centroid-{side}-Y-mean"]
        plays[f"Rusher-To{side}Centroid-Distance"] = np.sqrt(plays[delta_x]**2 + plays[delta_y]**2)

    # The rusher's raw position was only needed for the computation.
    return plays.drop(["X", "Y"], axis=1)

In [None]:
# Functions changing dataframe of plays

def count_defense_personnel(df):
    """
    Turn the "DefensePersonnel" string into one count column per player type.

    Spaces are stripped from the string and the counts are read with
    a regular expression that expects the types in the order
    DL, LB, DB, OL, each preceded by a single digit.
    A type that is not captured is counted as 0.
    The columns are added to the given DataFrame, which is also returned.

    New features
    ============
    - DefensePersonnel-DL
    - DefensePersonnel-LB
    - DefensePersonnel-DB
    - DefensePersonnel-OL
    """
    pattern = "^(?:(?P<DPDL>[0-9])DL)?,?(?:(?P<DPLB>[0-9])LB)?,?(?:(?P<DPDB>[0-9])DB)?,?(?:(?P<DPOL>[0-9])OL)?$"
    compact = df["DefensePersonnel"].str.replace(" ","")
    counts = compact.str.extract(pattern, expand=True).fillna(0)
    for role in ("DL", "LB", "DB", "OL"):
        df[f"DefensePersonnel-{role}"] = counts[f"DP{role}"].astype(int)
    return df

def count_offense_personnel(df):
    """
    Turn the "OffensePersonnel" string into one count column per player type.

    Spaces are stripped from the string and the counts are read with
    a regular expression that expects the types in the order
    OL, QB, RB, TE, WR, DL, LB, DB, each preceded by a single digit.
    A type that is not captured is counted as 0.
    The columns are added to the given DataFrame, which is also returned.

    New features
    ============
    - OffensePersonnel-OL
    - OffensePersonnel-QB
    - OffensePersonnel-RB
    - OffensePersonnel-TE
    - OffensePersonnel-WR
    - OffensePersonnel-DL
    - OffensePersonnel-LB
    - OffensePersonnel-DB
    """
    pattern = "^(?:(?P<OPOL>[0-9])OL)?,?(?:(?P<OPQB>[0-9])QB)?,?(?:(?P<OPRB>[0-9])RB)?,?(?:(?P<OPTE>[0-9])TE)?,?(?:(?P<OPWR>[0-9])WR)?,?(?:(?P<OPDL>[0-9])DL)?,?(?:(?P<OPLB>[0-9])LB)?,?(?:(?P<OPDB>[0-9])DB)?,?$"
    compact = df["OffensePersonnel"].str.replace(" ","")
    counts = compact.str.extract(pattern, expand=True).fillna(0)
    for role in ("OL", "QB", "RB", "TE", "WR", "DL", "LB", "DB"):
        df[f"OffensePersonnel-{role}"] = counts[f"OP{role}"].astype(int)
    return df

def get_personnel_frequencies(plays):
    """
    Count how often each value occurs in the personnel-related columns.

    Returns a dict keyed by column name ("DefensePersonnel",
    "OffenseFormation", "OffensePersonnel", "Rusher-MainRole");
    each entry is itself a dict mapping a value of that column
    to its number of occurrences in plays.

    Note this function does not change the DataFrame of plays,
    it is the counterpart of 'set_personnel_frequencies'.
    """
    counted_columns = ("DefensePersonnel", "OffenseFormation", "OffensePersonnel", "Rusher-MainRole")
    return {column: dict(plays[column].value_counts()) for column in counted_columns}

def set_personnel_frequencies(df, frequencies):
    """
    Set how often is a formation with given types
    of players played.

    Every value is looked up in the matching dict of frequencies;
    a value that is not present there (e.g. it never occurred
    in the data the frequencies were computed on) gets frequency 0.
    This holds for all four columns, including "Rusher-MainRole".

    Parameters
    ==========
    - df
        DataFrame of plays; it is modified in place and returned.
    - frequencies
        Dict of dicts as returned by 'get_personnel_frequencies'.

    New features
    ============
    - DefensePersonnel-frequency
    - OffenseFormation
        Missing values are replaced by "EMPTY".
    - OffenseFormation-frequency
    - OffensePersonnel-frequency
    - Rusher-MainRole-frequency
    - Rusher-MainRole
        Converted to categorical dtype.
    """
    # Missing formations must be filled before their frequency is looked up
    df["OffenseFormation"] = df["OffenseFormation"].fillna("EMPTY")
    for column in ("DefensePersonnel", "OffenseFormation", "OffensePersonnel", "Rusher-MainRole"):
        counts = frequencies[column]
        df[f"{column}-frequency"] = df[column].apply(lambda value, counts=counts: counts.get(value, 0))
    df["Rusher-MainRole"] = df["Rusher-MainRole"].astype("category")
    return df

def compute_score_difference(df):
    """
    Store the score lead of the offense over the defense.

    Rows flagged as home-on-offense (or visitor-on-defense) get
    home score minus visitor score; rows flagged as visitor-on-offense
    (or home-on-defense) get visitor score minus home score.
    Rows matching neither flag are not assigned.

    New features
    ============
    - OffToDef-Score
        How much more score does the offense have than the defense.
    """
    home_attacks = df["HomeIsOffense"] | df["VisitorIsDefense"]
    visitor_attacks = df["VisitorIsOffense"] | df["HomeIsDefense"]
    cases = (
        (home_attacks, "HomeScoreBeforePlay", "VisitorScoreBeforePlay"),
        (visitor_attacks, "VisitorScoreBeforePlay", "HomeScoreBeforePlay"),
    )
    for is_offense, offense_score, defense_score in cases:
        lead = df.loc[is_offense, offense_score] - df.loc[is_offense, defense_score]
        df.loc[is_offense, "OffToDef-Score"] = lead
    return df

def standardize_game_features(df):
    """
    Convert the game clock to seconds and fill the gaps
    in the DefendersInTheBox column.

    The clock string is split on ":"; the first field is multiplied
    by 60 and added to the second field. Missing DefendersInTheBox
    values are replaced by the mean of that column in the given DataFrame.

    New features
    ============
    - GameClock
        How many seconds till the end of a quarter
    - DefendersInTheBox
    """
    clock_fields = df["GameClock"].str.split(":", expand=True)
    minutes = clock_fields[0].astype(int)
    seconds = clock_fields[1].astype(int)
    df["GameClock"] = minutes * 60 + seconds   # == time remaining in seconds
    box_mean = df["DefendersInTheBox"].mean()
    df["DefendersInTheBox"] = df["DefendersInTheBox"].fillna(box_mean)
    return df

def compute_yards_to_go(df):
    """
    Compute how many yards the offense still has to go.

    When FieldPosition equals PossessionTeam the value is
    100 - YardLine, otherwise it is YardLine itself.
    Rows with YardLine == 50 are set to 50 explicitly,
    as there is no PossessionTeam side for them.

    New features
    ============
    - YardsToGo
    """
    on_own_side = df["FieldPosition"] == df["PossessionTeam"]
    df.loc[on_own_side, "YardsToGo"] = 100 - df.loc[on_own_side, "YardLine"]
    on_other_side = df["FieldPosition"] != df["PossessionTeam"]
    df.loc[on_other_side, "YardsToGo"] = df.loc[on_other_side, "YardLine"]
    at_midfield = df["YardLine"] == 50
    df.loc[at_midfield, "YardsToGo"] = 50
    return df

## Generating features

Now run all the functions defined above to create the features. Also define a function that does the same, as we will need to apply it later when we receive data one play at a time. I was also thinking of writing a faster version of the preceding functions. They are vectorized, so they are really fast for a big dataset.

In [None]:
%%time
players = standardize_player_data(players)
players = get_team_abbreviations(players)
players = unify_team_abbreviations(players)
players = determine_offense_defense(players)
players = normalize_coordinates_and_directions(players)
players = compute_horizontal_and_vertical_speeds(players)
players = add_player_numbers(players)
players = compute_distance_to_rusher(players)
players = compute_distances_to_opponents(players, 3, 3)

plays = pd.merge(plays, get_who_is_offense(players), how="left", left_on="PlayId", right_on="PlayId")
plays = pd.merge(plays, get_circles(players), how="left", left_on="PlayId", right_on="PlayId")
plays = pd.merge(plays, get_rusher_info(players), how="left", left_on="PlayId", right_on="PlayId")
plays = pd.merge(plays, get_players_features(players), how="left", left_on="PlayId", right_on="PlayId")
plays = pd.merge(plays, get_player_types_numbers(players), how="left", left_on="PlayId", right_on="PlayId")
plays = pd.merge(plays, get_advanced_distance_features(players), how="left", left_on="PlayId", right_on="PlayId")
plays = pd.merge(plays, compute_centroids(players), how="left", left_on="PlayId", right_on="PlayId")
plays = pd.merge(plays, get_global_rusher_features(players), how="left", left_on="PlayId", right_on="PlayId")
plays = rusher_vs_groups(players, plays)

plays = count_defense_personnel(plays)
plays = count_offense_personnel(plays)
frequencies = get_personnel_frequencies(plays)
plays = set_personnel_frequencies(plays, frequencies)
plays = compute_score_difference(plays)
plays = standardize_game_features(plays)
plays = compute_yards_to_go(plays)

In [None]:
def prepare_data(players, plays, frequencies):
    """
    Given starting dataframe, generate all features and return DataFrame of plays.

    We also need to provide frequencies of roles (e.g. how often there is a quarterback in a play).
    These frequencies can not be computed on test data as it would be leak.
    Frequencies of occurences always need to be computed on train data.
    The provided frequencies are therefore used as they are; they are only
    computed from the given plays when None is passed.

    Parameters
    ==========
    - players
        Starting DataFrame of players
    - plays
        Starting DataFrame of plays
    - frequencies
        Frequencies of roles as returned by 'get_personnel_frequencies',
        or None to compute them from the given plays.

    Returns
    =======
    - pandas.DataFrame
        DataFrame of plays with a lot of features ready for ML
    """
    # Player-level preprocessing; each step receives the output of the previous one
    player_steps = (
        standardize_player_data,
        get_team_abbreviations,
        unify_team_abbreviations,
        determine_offense_defense,
        normalize_coordinates_and_directions,
        compute_horizontal_and_vertical_speeds,
        add_player_numbers,
        compute_distance_to_rusher,
    )
    for step in player_steps:
        players = step(players)
    players = compute_distances_to_opponents(players, 3, 3)
    # Per-play aggregates computed from players are left-joined onto plays by PlayId
    per_play_builders = (
        get_who_is_offense,
        get_circles,
        get_rusher_info,
        get_players_features,
        get_player_types_numbers,
        get_advanced_distance_features,
        compute_centroids,
        get_global_rusher_features,
    )
    for build in per_play_builders:
        plays = pd.merge(plays, build(players), how="left", left_on="PlayId", right_on="PlayId")
    plays = rusher_vs_groups(players, plays)
    plays = count_defense_personnel(plays)
    plays = count_offense_personnel(plays)
    # Do not overwrite the caller's frequencies: recomputing them here
    # from the passed plays would discard the train statistics.
    if frequencies is None:
        frequencies = get_personnel_frequencies(plays)
    plays = set_personnel_frequencies(plays, frequencies)
    plays = compute_score_difference(plays)
    plays = standardize_game_features(plays)
    plays = compute_yards_to_go(plays)
    return plays

# Preparing model input

We have a dataset of features, but now, we need to extract features to a suitable form ready for machine learning and also prepare training values which the model will learn.

At the start of the competition, very basic ML performed very badly, but the empirical distribution performed very well. So I thought that a delta-learning approach might suit this problem, i.e. teach the model only to correct an empirical distribution. In the end I ended up predicting the full distribution, the delta to the empirical distribution, and the number of yards gained/lost. 

In [None]:
# Alias only: `df` refers to the same DataFrame object as `plays` (no copy is made).
df = plays

In [None]:
# Show the names of all columns currently present in the DataFrame.
print(list(df.columns))

In [None]:
# The first 80 values are always zero, as no team in the train data lost 20+ yards on a rush play.
# An ML model would only predict noise there, so this constant sets how many leading values are skipped.
# Fewer values to predict means that the model can focus more on the important part.
N_zeroed = 80

In [None]:
# Empirical distribution computed as in https://www.kaggle.com/ryches/model-free-benchmark,
# except that the data is not used all at once: plays are grouped by the number of yards to go
# and one cumulative distribution of "Yards" is built per group.
# Consecutive checkpoints delimit a group: lower < YardsToGo <= upper.
checkpoints = [0, 14, 21, 25, 30, 34, 38, 43, 48, 54, 60, 67, 75, 83, 91, 101]
distributions = [
    np.histogram(
        df.loc[(df["YardsToGo"] > lower) & (df["YardsToGo"] <= upper), "Yards"],
        bins=199,
        range=(-99,100),
        density=True,
    )[0].cumsum()
    for lower, upper in zip(checkpoints, checkpoints[1:])
]

In [None]:
%%time
# Build the three training targets, one row per play.
# `dummy` is a 199-wide scratch array that is modified in place, so the order
# of the statements inside the loop matters.
dummy = np.zeros((len(df), 199))
Y_distribution = np.zeros((len(df), 199-N_zeroed))   # Y values for full distribution
# Y values for delta with empirical distribution
# Y_delta = np.zeros((len(df), 199-N_zeroed))   # Predict full delta with empirical distribution
Y_delta = np.zeros((len(df), 30))   # Biggest delta with empirical distribution is between 11 and 40
# Y values for number of yards gained/lost
Y_yards = np.zeros((len(df), 1))
# NOTE(review): `distros` is allocated here but never filled in this cell, it stays all zeros.
distros = np.zeros((len(df), 199-N_zeroed))
for i, (playid, row) in enumerate(df[["YardsToGo", "Yards"]].iterrows()):
    Y_yards[i, :] = row["Yards"]
    # Pick the empirical distribution of the YardsToGo group.
    # NOTE(review): `bisect` is bisect_right, so a YardsToGo exactly equal to a checkpoint
    # selects the following group, while `distributions` was built with groups that include
    # their upper checkpoint. Confirm this is intended and matches the lookup used at inference.
    bisection_index = bisect(checkpoints, row["YardsToGo"]) - 1
    # Step function: 0 before the position of the gained yards, 1 from there on
    dummy[i, (99+int(np.ceil(row["Yards"]))):] = 1.0
    # The full-distribution target is taken before the empirical distribution is subtracted
    Y_distribution[i, :] = dummy[i, N_zeroed:]
    dummy[i, :] -= distributions[bisection_index]
#     Y_distribution[i, :] = dummy[i, N_zeroed:]
    # Only columns 89..118 of the difference are kept as the delta target
    Y_delta[i, :] = dummy[i, 89:119]
#     Y_delta[i, :] = dummy[i, N_zeroed:]
# Order of the targets: full distribution, delta, yards
Y = [Y_distribution, Y_delta, Y_yards]

In [None]:
# My ML model is NN and it is described later.
# Nevertheless, I do not input all features at once, but in groups
# which I think that belong together.
# So these groups need to be extracted from the DataFrame
# and this is the purpose of this cell.

def get_player_features(df):
    """
    Collect the per-player columns of all 22 players.

    Columns are named "<feature>_<i>" for i in 0..21. They are taken
    player by player, with the features of each player in sorted order.

    Returns the array of values and the number of features
    of a single player.
    """
    base_names = ["X", "Y", "DistanceToRusher", "S", "A", "Dir", "JerseyNumber", "PlayerWeight",
                  "PlayerHeight", "PlayerBMI", "S_vertical", "S_horizontal", "PlayerAge"]
    base_names += [f"ClosestEnemy-{rank}th" for rank in range(1, 4)]
    base_names += [f"DistancePCA-{rank}th" for rank in range(1, 4)]
    base_names.sort()
    columns = [f"{name}_{slot}" for slot in range(22) for name in base_names]
    return df[columns].values, len(base_names)

def get_offense_features(df):
    """
    Collect the offense count columns and return them with their number.

    Selected are the columns matching "OffensePersonnel-" followed by
    capital letters and all "OffensePlayerTypes-" columns, in sorted order.
    """
    patterns = ("OffensePersonnel-[A-Z][A-Z]?", "OffensePlayerTypes-.*")
    chosen = sorted(name for name in df.columns
                    if any(re.match(pattern, name) for pattern in patterns))
    return df[chosen].values, len(chosen)

def get_defense_features(df):
    """
    Collect the defense count columns and return them with their number.

    Selected are the columns matching "DefensePersonnel-" followed by
    capital letters and all "DefensePlayerTypes-" columns, in sorted order.
    """
    patterns = ("DefensePersonnel-[A-Z][A-Z]?", "DefensePlayerTypes-.*")
    chosen = sorted(name for name in df.columns
                    if any(re.match(pattern, name) for pattern in patterns))
    return df[chosen].values, len(chosen)

def get_global_features(df):
    """
    Collect the play-level (global) columns and return them with their number.

    The columns are taken in sorted order of their names.
    """
    chosen = ["YardsToGo", "GameClock", "Down", "Quarter", "DefendersInTheBox", "OffToDef-Score",
              "Defense-MinDistanceToRusher"]
    # Currently left out:
    #     "OffensiveHull-area", "RusherInOffenseHull", "DefendersInOffenseHull",
    #     "DefensiveHull-area", "RusherInDefenseHull", "OffendersInDefenseHull"
    chosen.sort()
    return df[chosen].values, len(chosen)

def get_offense_circle_features(df):
    """
    Collect the "OffendersInCircleAroundRusher-" columns (number of
    offenders in the circle around the rusher), in sorted order,
    and return them with their number.
    """
    chosen = [name for name in df.columns if re.match("OffendersInCircleAroundRusher-.*", name)]
    chosen.sort()
    return df[chosen].values, len(chosen)

def get_defense_circle_features(df):
    """
    Collect the "DefendersInCircleAroundRusher-" columns (number of
    defenders in the circle around the rusher), in sorted order,
    and return them with their number.
    """
    chosen = [name for name in df.columns if re.match("DefendersInCircleAroundRusher-.*", name)]
    chosen.sort()
    return df[chosen].values, len(chosen)
    
def get_rusher_features(df):
    """
    Collect the "Rusher-" columns, in sorted order, and return them
    with their number.

    "Rusher-MainRole" and "Rusher-MainRole-frequency" are excluded;
    both have to be present in the DataFrame.
    """
    chosen = [name for name in df.columns if re.match("Rusher-.*", name)]
    chosen.sort()
    for excluded in ("Rusher-MainRole", "Rusher-MainRole-frequency"):
        chosen.remove(excluded)
    return df[chosen].values, len(chosen)

def get_offense_centroid_features(df):
    """
    Collect the "Centroid-Offense-" columns, in sorted order,
    and return them with their number.
    """
    chosen = [name for name in df.columns if re.match("Centroid-Offense-.*", name)]
    chosen.sort()
    return df[chosen].values, len(chosen)

def get_defense_centroid_features(df):
    """
    Collect the "Centroid-Defense-" columns, in sorted order,
    and return them with their number.
    """
    chosen = [name for name in df.columns if re.match("Centroid-Defense-.*", name)]
    chosen.sort()
    return df[chosen].values, len(chosen)

In [None]:
def prepare_for_predictions(df, distros=None, scaler=None):
    """
    Build the input array of the ML model from the DataFrame
    with engineered features.

    The player, rusher, global, circle and centroid groups are
    concatenated and scaled; the offense and defense groups are
    appended unscaled, followed by distros when it is given.

    Parameters
    ==========
    - df
        DataFrame with engineered features.
    - distros
        Possible empirical distribution to add as features.
    - scaler
        If None, new StandardScaler will be created and fitted
        on the scaled groups, else the provided scaler is used as it is.

    Returns
    =======
    - X
        Array of all model inputs.
    - input_lengths
        List with the width of every input group, in the order
        in which the groups are stored in X.
    - Xscaler
        Scaler that was used.
    - input_lengths_dict
        Widths of the groups by name.
    """
    # Pull every feature group out of the DataFrame together with its width
    players_block, n_per_player = get_player_features(df)
    rusher_block, n_rusher = get_rusher_features(df)
    offense_block, n_offense = get_offense_features(df)
    defense_block, n_defense = get_defense_features(df)
    global_block, n_global = get_global_features(df)
    circles_off_block, n_circles_off = get_offense_circle_features(df)
    circles_def_block, n_circles_def = get_defense_circle_features(df)
    centroids_off_block, n_centroids_off = get_offense_centroid_features(df)
    centroids_def_block, n_centroids_def = get_defense_centroid_features(df)

    # Widths by name ...
    input_lengths_dict = {
        "Players": n_per_player,
        "Rusher": n_rusher,
        "Offense": n_offense,
        "Defense": n_defense,
        "Global": n_global,
        "Circles-Off": n_circles_off,
        "Circles-Def": n_circles_def,
        "Centroids-Off": n_centroids_off,
        "Centroids-Def": n_centroids_def,
    }
    # ... and as a list following the column layout of X
    input_lengths = [n_per_player] * 22
    input_lengths.extend([n_rusher, n_global, n_circles_off, n_circles_def,
                          n_centroids_off, n_centroids_def, n_offense, n_defense])

    # Groups that get scaled
    to_scale = np.concatenate((players_block, rusher_block, global_block,
                               circles_off_block, circles_def_block,
                               centroids_off_block, centroids_def_block), axis=1)
    if scaler is None:
        # No scaler given, fit a fresh one on this data
        Xscaler = StandardScaler()
        Xscaler.fit(to_scale)
    else:
        Xscaler = scaler
    X = np.concatenate((Xscaler.transform(to_scale), offense_block, defense_block), axis=1)

    # Optional empirical distribution goes last
    if distros is not None:
        X = np.concatenate((X, distros), axis=1)
        input_lengths.append(199-N_zeroed)
        input_lengths_dict["DistributionLength"] = 199-N_zeroed
    return X, input_lengths, Xscaler, input_lengths_dict

def split_to_inputs(X, feature_lengths):
    """
    Cut the array of all input values into one array per Input layer.

    The NN has several Input layers; feature_lengths gives the width
    of each of them, in the order in which their columns are stored in X.
    Returns a list with the consecutive column slices of X.
    """
    boundaries = np.cumsum([0] + list(feature_lengths))
    return [X[:, begin:stop] for begin, stop in zip(boundaries[:-1], boundaries[1:])]

In [None]:
# No scaler is passed, so a new one is fitted on this data and kept in `Xscaler`.
# `input_lengths` is the list of group widths, `feature_lengths` the dict of widths by group name.
X, input_lengths, Xscaler, feature_lengths = prepare_for_predictions(df)

# Define model

This part of the notebook is rather weak, and I think that a much better model could have been created, e.g. by ensembling models. Anyway, I created a neural network that takes a lot of different inputs and returns multiple outputs. Features for the Input layers are grouped in a way that made sense to me (e.g. features about a player, features about the offending/defending team, global features...). Multiple outputs are predicted, as I wrote earlier. The base of the NN are the layers for players. There are 22 players, so there are 22 groups of player features. These are processed by 22 layers which share weights; later they are concatenated and processed further. Then more features are concatenated, and so on. It is easier to see the architecture of the model in the picture generated by the notebook.

In [None]:
def crps(y_true, y_pred):
    """
    CRPS metric for Keras.

    The prediction is accumulated along axis 1 and compared with y_true;
    the mean squared difference is rescaled by (199-N_zeroed) / 199.

    Returns
    =======
    - Keras scalar
        CRPS loss
    """
    cumulative_prediction = K.cumsum(y_pred, axis=1)
    squared_error = (cumulative_prediction - y_true)**2
    loss = K.mean(squared_error) * (199-N_zeroed) / 199
    return loss

def define_model():
    """
    Define and return neural network.
    
    This is not the most clean part of the code,
    but you can play with it as I did and turn off 
    and on layers and see whether it has better 
    or worse score. 
    The architecture is described above this code cell
    and a figure is saved later. However, it needs to be
    opened in a separate window and zoomed as it is 
    really wide, because there is a separate Input layer
    for every single player.
    
    The widths of the Input layers are read from the module-level
    dict `feature_lengths` and the length of the predicted
    distribution from the module-level constant `N_zeroed`.
    The model is compiled before it is returned and has three outputs,
    in this order: full distribution, delta, yards.
    
    Returns
    =======
    keras.NN
    """
    player_inputs = [Input(shape=(feature_lengths["Players"],), name=f'PlayerInput-{i}') for i in range(22)] 
    # NOTE(review): rusher_input is listed among the model inputs below, but no layer
    # consumes it while the Rusher-Dense lines further down stay commented out.
    rusher_input = Input(shape=(feature_lengths["Rusher"],), name="RusherInput")
    # Player part
    # The same layer objects are applied to each of the 22 player inputs, so the weights are shared
    player_layer_1 = Dense(10, activation='relu', name="Player-Dense-1")
    player_batch_norm_1 = BatchNormalization(name="Player-BatchNorm-1")
#     player_dropout_1 = Dropout(0.2, name="Player-Dropout-1")
#     player_layer_2 = Dense(12, activation='relu', name="Player-Dense-2")
#     player_batch_norm_2 = BatchNormalization(name="Player-BatchNorm-2")
#     player_dropout_2 = Dropout(0.2, name="Player-Dropout-2")
#     player_layer_3 = Dense(10, activation='relu', name="Player-Dense-3")
#     player_batch_norm_3 = BatchNormalization(name="Player-BatchNorm-3")
#     player_dropout_3 = Dropout(0.2, name="Player-Dropout-3")
    player_outputs = list()
    for i in range(22):
        x1 = player_layer_1(player_inputs[i])
        x1 = player_batch_norm_1(x1)
#         x1 = player_dropout_1(x1)
#         x2 = player_layer_2(x1)
#         x2 = player_batch_norm_2(x2)
#         x2 = player_dropout_2(x2)
#         x3 = player_layer_3(x2)
#         x3 = player_batch_norm_3(x3)
#         x3 = player_dropout_3(x3)
        player_outputs.append(x1)
#         player_outputs.append(x2)
#         player_outputs.append(x3)
    #x = Dense(10, activation="relu", name="Rusher-Dense")(rusher_input)
    #x = BatchNormalization(name="Rusher-BatchNorm")(x)
    #player_outputs.append(x)
    player_values = Concatenate(name="ConcatenatePlayerOutputs")(player_outputs)
#     player_values = Dropout(0.4)(player_values)
#     player_values = Dense(400, activation='relu', name="AllPlayers-Dense-1")(player_values)
#     player_values = BatchNormalization(name="AllPlayer-BatchNorm-1")(player_values)
#     player_values = Dense(200, activation='relu', name="AllPlayers-Dense-2")(player_values)
#     player_values = BatchNormalization(name="AllPlayer-BatchNorm-2")(player_values)
#     player_values = Dense(100, activation='relu', name="AllPlayers-Dense-3")(player_values)
#     player_values = BatchNormalization(name="AllPlayer-BatchNorm-3")(player_values)
#     out_delta = Dense(199-N_zeroed, activation=None, name="PredictingDelta")(player_values)
    # The delta output (30 values) is predicted from the concatenated player outputs only
    out_delta = Dense(30, activation=None, name="PredictingDelta")(player_values)
    # Team part - Offense
    offense_input = Input(shape=(feature_lengths["Offense"],), name="OffenseInput")
    # NOTE(review): the first Embedding argument is the number of offense features, not the
    # largest value they can take - confirm that all input values stay below it.
    offense_embedding_1 = Embedding(feature_lengths["Offense"], 5, name="Offense-Embedding")
#     offense_layer_1 = Dense(5, activation="relu", name="Offense-Dense")
    offense_flatten_1 = Flatten(name="OffenseFlatten")
    offense_outputs_1 = offense_flatten_1(offense_embedding_1(offense_input))
#     offense_outputs_1 = offense_layer_1(offense_input)
#     offense_outputs_1 = BatchNormalization(name="Offense-BatchNorm")(offense_outputs_1)
    # Team part - Defense
    defense_input = Input(shape=(feature_lengths["Defense"],), name="DefenseInput")
    # NOTE(review): same question as for the offense embedding above.
    defense_embedding_1 = Embedding(feature_lengths["Defense"], 5, name="Defense-Embedding")
#     defense_layer_1 = Dense(5, activation="relu", name="Defense-Dense")
    defense_flatten_1 = Flatten(name="DefenseFlatten")
    defense_outputs_1 = defense_flatten_1(defense_embedding_1(defense_input))
#     defense_outputs_1 = defense_layer_1(defense_input)
#     defense_outputs_1 = BatchNormalization(name="Defense-BatchNorm")(defense_outputs_1)  
    # Teams together
    team_values = Concatenate(name="Concatenate-OffDef")([offense_outputs_1, defense_outputs_1])
    team_values = Dense(5, activation="relu", name="OffDef-Dense")(team_values)
    team_values = BatchNormalization(name="OffDef-BatchNorm")(team_values)
#     Global part
    global_input = Input(shape=(feature_lengths["Global"],), name="GlobalInput")
    global_values = Dense(15, activation="relu", name="Global-Dense")(global_input)
    global_values = BatchNormalization(name="Global-BatchNorm")(global_values)
    # Global and team
    gt_values = Concatenate(name="Concatenate-TeamAndGlobal")([team_values, global_values])
    gt_values = Dense(20, activation="relu", name="TeamAndGlobal-Dense")(gt_values)
    gt_values = BatchNormalization(name="TeamAndGlobal-BatchNorm")(gt_values)
    # Advanced distances
    # Circular offense features
    offense_circle_input = Input(shape=(feature_lengths["Circles-Off"],), name="CirclesOffenseInput")
    offense_circle_distances = Dense(10, activation="relu", name="CirclesOffense-Dense")(offense_circle_input)
    offense_circle_distances = BatchNormalization(name="CirclesOffense-BatchNorm")(offense_circle_distances)    
    # Circular defense features
    defense_circle_input = Input(shape=(feature_lengths["Circles-Def"],), name="CirclesDefenseInput")
    defense_circle_distances = Dense(10, activation="relu", name="CirclesDefense-Dense")(defense_circle_input)
    defense_circle_distances = BatchNormalization(name="CirclesDefense-BatchNorm")(defense_circle_distances)
    # Centroid offense features
    offense_centroid_input = Input(shape=(feature_lengths["Centroids-Off"],), name="CentroidsOffenseInput")
    offense_centroid_distances = Dense(5, activation="relu", name="CentroidsOffense-Dense")(offense_centroid_input)
    offense_centroid_distances = BatchNormalization(name="CentroidsOffense-BatchNorm")(offense_centroid_distances)    
    # Centroid defense features
    defense_centroid_input = Input(shape=(feature_lengths["Centroids-Def"],), name="CentroidsDefenseInput")
    defense_centroid_distances = Dense(5, activation="relu", name="CentroidsDefense-Dense")(defense_centroid_input)
    defense_centroid_distances = BatchNormalization(name="CentroidsDefense-BatchNorm")(defense_centroid_distances)
    # Distances together
    advanced_distances = Concatenate(name="Concatenate-AdvancedDistances")([
        offense_circle_distances, defense_circle_distances, offense_centroid_distances, defense_centroid_distances])
    advanced_distances = Dense(20, activation="relu", name="AdvancedDistances-Dense")(advanced_distances)
    advanced_distances = BatchNormalization(name="AdvancedDistances-BatchNorm")(advanced_distances)
#     Everything together
#     basic_distribution_input = Input(shape=(feature_lengths["DistributionLength"],), name="EmpiricalDistribution")
#     basic_distribution = Dense(20, activation="relu", name="Distribution-Dense")(basic_distribution_input)
#     basic_distribution = BatchNormalization(name="Distribution-BatchNorm")(basic_distribution)
    x = Concatenate(name="Concatenate-Everything-1")([player_values, gt_values, advanced_distances])#, team_values, global_values, offense_outputs_1, defense_outputs_1])#, advanced_distances, basic_distribution])
    x1 = Dense(200, activation='relu', name="All-Dense-1")(x)
    x1 = BatchNormalization(name="All-BatchNorm-1")(x1)
#     x1 = Dropout(0.1, name="All-Dropout-1")(x1)
    x2 = Dense(100, activation='relu', name="All-Dense-2")(x1)
    x2 = BatchNormalization(name="All-BatchNorm-2")(x2)
    x2 = Dropout(0.1, name="All-Dropout-2")(x2)
    # prex (x1 and x2) is the input of the yards output below
    prex = Concatenate(name="Concatenate-ForPrePrediction")([x1, x2])
    x3 = Dense(100, activation='relu', name="All-Dense-3")(x2)
    x3 = BatchNormalization(name="All-BatchNorm-3")(x3)
    x3 = Dropout(0.1, name="All-Dropout-3")(x3)
    x4 = Dense(100, activation='relu', name="All-Dense-4")(x3)
    x4 = BatchNormalization(name="All-BatchNorm-4")(x4)
    x4 = Dropout(0.1, name="All-Dropout-4")(x4)
    x5 = Dense(100, activation='relu', name="All-Dense-5")(x4)
    x5 = BatchNormalization(name="All-BatchNorm-5")(x5)
    x5 = Dropout(0.1, name="All-Dropout-5")(x5)
    # NOTE(review): the concatenation on the next line is overwritten right away - FinalDense
    # is fed x2 only, so x3, x4 and x5 do not reach any output. Confirm this is intended.
    x = Concatenate(name="Concatenate-Everything-2")([x1, x2, x3, x4, x5])
    x = Dense(100, name="FinalDense")(x2)
    out_distribution = Dense(199-N_zeroed, activation="softmax", name="PredictingFullDistribution")(x)
#     out_delta = Dense(199-N_zeroed, activation=None, name="PredictingDelta")(x)
    out_yards = Dense(1, activation=None, name="PredictingYards")(prex)
#     print(player_inputs)
    # The order of the inputs: 22 players, rusher, global, circles, centroids, offense, defense
    model = Model(inputs=[player_input for player_input in player_inputs]+[rusher_input, global_input,
                          offense_circle_input, defense_circle_input, offense_centroid_input, defense_centroid_input,
                          offense_input, defense_input],#, basic_distribution_input], 
                  outputs=[out_distribution, out_delta, out_yards]) 
    # One loss per output, in the order of the outputs, weighted by loss_weights
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.005, decay=1e-4),
                  loss=[crps, keras.losses.mse, keras.losses.mae],
                  loss_weights=[100.0, 100.0, 0.2])
    return model

# Training

I chose simple KFold validation, however I found out that when I train the same model multiple times with a different seed and then choose the best performing model of them, the score will improve.

In [None]:
def split_Y_for_model(Y, train_indices, test_indices):
    """
    Slice every target array into its training rows and its test rows.

    The network is trained against several targets at once (full
    distribution, delta to the empirical distribution, yards gained
    or lost). KFold only hands out row indices, so this helper applies
    those indices to each target array in turn.

    Returns a pair of lists `(train_Y, test_Y)`; both follow the order
    of the arrays in `Y`.
    """
    # One pass over Y, taking both row selections from each target array.
    row_pairs = [
        (target[train_indices, :], target[test_indices, :])
        for target in Y
    ]
    train_Y = [train_rows for train_rows, _ in row_pairs]
    test_Y = [test_rows for _, test_rows in row_pairs]
    return train_Y, test_Y

In [None]:
models = []

folds = 8   # Define number of folds here
# The folds are contiguous, unshuffled blocks of rows. `random_state` only has
# an effect together with `shuffle=True` (recent scikit-learn versions raise a
# ValueError when it is passed without shuffling), so it is not passed here;
# the resulting splits are the same as before.
kf = KFold(n_splits=folds)
CV_scores = []
# Validation loss of the "PredictingFullDistribution" output; it drives
# checkpointing, early stopping and the choice of the best model per fold.
monitored_loss = 'val_PredictingFullDistribution_loss'
for train_indices, validation_indices in kf.split(X):
    X_train, X_valid = X[train_indices, :], X[validation_indices, :]
    Y_train, Y_valid = split_Y_for_model(Y, train_indices, validation_indices)
    cmodels = []
    clossess = []
    # Training the same model on same data multiple times with different seeds
    # and then selecting the best performing one helps to improve the score.
    while len(cmodels) < 2:
        seed += 1
        set_seeds(seed)
        # Both callbacks minimise the same quantity, so both state mode='min'.
        checkpoint = ModelCheckpoint('ModelCheckpoints.hdf5', monitor=monitored_loss,
                                     verbose=1, save_best_only=True, mode='min')
        early_stopping = EarlyStopping(monitor=monitored_loss, mode='min', verbose=1, patience=5)
        callbacks_list = [checkpoint, early_stopping]
        model = define_model()
        history = model.fit(split_to_inputs(X_train, input_lengths), Y_train,
                            epochs=30, verbose=0, callbacks=callbacks_list,
                            validation_data=(split_to_inputs(X_valid, input_lengths), Y_valid))
        print("HISTORY:", history.history)
        loss = np.min(history.history[monitored_loss])
        print("Score:", loss)
        print("-"*20)
        # Restore the weights of the best epoch saved by the checkpoint callback;
        # `loss` above is the score of exactly that epoch.
        model.load_weights("ModelCheckpoints.hdf5")
        cmodels.append(model)
        clossess.append(loss)
    # Keep only the best of the repeated runs for this fold.
    best_index = int(np.argmin(clossess))
    loss = clossess[best_index]
    model = cmodels[best_index]
    CV_scores.append(loss)
    models.append(model)
CV = np.mean(CV_scores)
print(f"CV: {CV} +- {np.std(CV_scores)}")

In [None]:
# Save a diagram of `model` (with tensor shapes shown) to ModelArchitecture.png.
# At this point `model` is whatever the training cell assigned last.
plot_model(model, to_file='ModelArchitecture.png', show_shapes=True)

# Predictions

There are a few tricks you may have used for predictions. 

1. Most important trick - You need to realize that you do not need to predict all 200 values. You need to predict only 100 values and the rest can be computed analytically. A team cannot have more than 100 yards at the end of a play, and it cannot lose more yards than it currently has. So after your model makes its predictions, you should go through the values and set those which you know for sure. There was a discussion about it in the forum, but almost no one reacted.
2. Second trick - The first eighty values are always set to zero. They do not have to be, as a team can lose more than 20 yards, but that is very rare and your model will only predict noise in this area anyway, so why not set them to zero right away.

In [None]:
def set_analytic_half(output, yards_to_go):
    """
    Overwrite the entries of a predicted CDF that are known for sure.

    The leading `ceil(yards_to_go) - N_zeroed` entries are set to 0.0
    (nothing is written when that count is not positive) and the
    trailing `100 - floor(yards_to_go)` entries are set to 1.0.

    `output` is expected to be the predicted values WITHOUT the first
    `N_zeroed` entries of the full distribution, which is why the number
    of leading zeros is reduced by `N_zeroed`.

    Parameters:
        output: 1-D array of predicted cumulative values; it is modified
            in place.
        yards_to_go: value of the "YardsToGo" feature for the play.

    Returns:
        The same `output` object, after modification.
    """
    zero_count = int(np.ceil(yards_to_go)) - N_zeroed
    if zero_count > 0:
        output[:zero_count] = 0.0
    one_count = int(100 - np.floor(yards_to_go))
    # Guard the tail write: with a count of 0 the slice `output[-0:]` would
    # select the WHOLE array and turn every value into 1.0, and a negative
    # count would become a positive start index.
    if one_count > 0:
        output[-one_count:] = 1.0
    return output

def correct_growth1(output):
    """
    Make the sequence non-decreasing by carrying the last kept value forward.

    This is the simple correction: whenever a value is lower than the
    previously kept value, the previously kept value is used instead.
    The input is left untouched; a new NumPy array is returned.
    """
    corrected = []
    for position, value in enumerate(output):
        # The first element has no predecessor and is always kept as is.
        if position > 0 and value < corrected[-1]:
            value = corrected[-1]
        corrected.append(value)
    return np.array(corrected)

def is_increasing(arr):
    """
    Tell whether no element of `arr` is smaller than its predecessor.

    Equal neighbours are accepted, i.e. the check is for a
    non-decreasing sequence.
    """
    steps = np.diff(arr)
    return np.greater_equal(steps, 0.0).all()

def correct_growth2(output):
    """
    Repeatedly smooth `output` until it is non-decreasing.

    This is more complicated and in the end not used version.

    Passes over the array are repeated until every consecutive
    difference is >= 0. `output` is modified in place and the same
    object is returned.

    NOTE(review): there is no cap on the number of passes and convergence
    is not evident from the code alone - confirm before reusing this.
    """
    diff = np.diff(output)
    while not np.all(diff >= 0.0):
        # Interior elements only; the last element is handled after this loop.
        for i in range(1, len(output)-1):
            if output[i] < output[i-1]:
                # `diff` was computed before this pass started, so it does not
                # reflect changes already made to earlier elements in this pass.
                if 0.0 > diff[i-1] > -1e-10:
                    # Tiny drop (presumably numerical noise): copy the predecessor.
                    output[i] = output[i-1]
                else:
                    # Larger drop: replace the value by the mean of itself and
                    # its predecessor, then move both neighbours 10 % of the
                    # way towards that new value.
                    output[i] = np.mean(output[i-1:i+1])
                    output[i-1] += (output[i]-output[i-1])*0.1
                    output[i+1] += (output[i]-output[i+1])*0.1
        # A drop at the very end is resolved by forcing the last value to 1.0.
        if output[len(output)-1] < output[len(output)-2]:
            output[len(output)-1] = 1.0
        # Re-evaluate the differences for the loop condition and the next pass.
        diff = np.diff(output)
    return output

In [None]:
# Mandatory cell

# `env` drives the submission loop: it yields the test plays (`iter_test`),
# receives each prediction (`predict`) and writes the submission file
# (`write_submission_file`).
from kaggle.competitions import nflrush
env = nflrush.make_env()

In [None]:
%%time
# "Yards" is dropped from the play columns before selecting them from the test
# frames (presumably because the target is not present there). The membership
# check keeps the cell re-runnable: a bare `remove` raises ValueError the
# second time it is executed.
if "Yards" in basic_plays_columns:
    basic_plays_columns.remove("Yards")
for (test_players, sample_prediction_df) in tqdm(env.iter_test()):
    # Process the test dataset.
    # Selecting a list of columns with `.loc` and `drop_duplicates` both return
    # new frames, so the incoming frame does not need to be deep-copied first.
    test_plays = test_players.loc[:, basic_plays_columns].drop_duplicates()
    test_players = test_players.loc[:, basic_players_columns]
    model_input = prepare_data(test_players, test_plays, frequencies)
    # Read once; used for the empirical distribution and both analytic fixes.
    yards_to_go = model_input["YardsToGo"].values[0]
    # Get empirical distribution (only needed by the commented-out delta variant)
    bisection_index = bisect(checkpoints, yards_to_go) - 1
    basic_distribution = distributions[bisection_index]
    # Get feature for the model
    X_test, _, _, _ = prepare_for_predictions(model_input, scaler=Xscaler)
    # Predict delta of empirical distribution (next commented line) or predict full distribution
    input_arrays = split_to_inputs(X_test, input_lengths)
    # Output 0 of every fold model is cumulated along axis 1 and the
    # resulting curves are averaged over the fold models.
    y_predicted = np.mean(
        [np.cumsum(fold_model.predict(input_arrays)[0], axis=1) for fold_model in models],
        axis=0,
    )
    # y_predicted = np.mean([model.predict(input_arrays)[1] for model in models], axis=0)
    # Add empirical distribution if you computed delta
    # y_predicted += basic_distribution[N_zeroed:]
    y_predicted = y_predicted.flatten()
    # Some values can be computed analytically, exactly 100 values are zero or one.
    y_predicted = set_analytic_half(y_predicted, yards_to_go)
    # Fix values which are 0-epsilon or 1+epsilon, so that they fit in zero-one range
    y_predicted = np.clip(y_predicted, 0, 1)
    # Result has to be monotonically increasing
    y_predicted = correct_growth1(y_predicted)
    # Re-apply the analytic values after the monotonicity correction.
    y_predicted = set_analytic_half(y_predicted, yards_to_go)
    # First N_zeroed values are always zero, no need to learn them.
    y_predicted = np.concatenate((np.zeros(N_zeroed), y_predicted))
    final_output = pd.DataFrame(data=y_predicted.reshape(1, -1), columns=sample_prediction_df.columns)
    env.predict(final_output)
env.write_submission_file()

# Mistakes you may have made (or that I made)

- Sort features - If you always sort your features, you can be sure that you provide them to the model in the same order every time.
- Use the same scaler - You need to save the scaler used for training data and use it for the test data, do not generate a new one.