In [None]:
"""
v2: 只保留player 11, 22
v6: 使用model1, 减少训练次数
v7: 删除target mean: 0.01686
v8: use GroupKFold: 0.01692
v12: 改了bug，删了11号player，没加上了target_mean
v15: 改了bug，删了11号player，加上了target_mean: 0.01384
v16: 删了一些线性相关的特征 0.01396
v18: v16 + 正则项 cv:  0.01286 lb: 0.01415
v20: v18 + one-hot Down/Quarter cv: 0.012878 lb: 14.10
"""

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np
import datetime, math
from kaggle.competitions import nflrush
import seaborn as sns
import matplotlib.pyplot as plt
# Competition environment handle; used further down through env.iter_test(),
# env.predict() and env.write_submission_file().
env = nflrush.make_env()

In [None]:
import keras
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras.utils import plot_model
import keras.backend as K
import matplotlib.patches as patches

In [None]:
"""
# missing
Dir - angle of player motion (deg) 
Orientation - orientation of player (deg)
DefendersInTheBox - number of defenders lined up near the line of scrimmage, spanning the width of the offensive line
OffenseFormation - offense formation (the scheme used by the offensive team)
Humidity - humidity
FieldPosition - which side of the field the play is happening on
StadiumType - description of the stadium environment
GameWeather - description of the game weather
Temperature - temperature (deg F)
WindSpeed - wind speed in miles/hour
WindDirection - wind direction
"""

# Name of the label column.
TARGET = 'Yards'

# Columns that must never be fed to the model, grouped by the reason they are
# excluded. 'FieldPosition' is listed twice on purpose to keep the list
# identical to its historical contents; membership tests are unaffected.
dropcols = [
    TARGET,
    # identity information
    'HomeTeamAbbr', 'VisitorTeamAbbr', 'DisplayName', 'PlayerCollegeName',
    "NflId", "GameId", 'PlayId', "Season", "NflIdRusher",
    "JerseyNumber", 'PlayerBirthDate',
    # environment variables
    "GameWeather", 'FieldPosition', 'StadiumType', 'Location', 'Stadium',
    'WindSpeed', "GameClock", 'Humidity', "Temperature",
    'WindDirection', 'Turf',
    # useless without processing; only used as inputs when generating features
    'FieldPosition', 'OffensePersonnel', 'DefensePersonnel',
    'PossessionTeam', 'Week', 'PlayDirection',
    'Team', 'TeamName', 'TimeHandoff', 'TimeSnap', 'OffenseFormation',
]

In [None]:
# Read the full training table in a single pass; low_memory=False makes pandas
# infer each column's dtype from the whole file rather than chunk by chunk.
train_df = pd.read_csv(
    "../input/nfl-big-data-bowl-2020/train.csv",
    low_memory=False,
)
print('Train size: ', train_df.shape)

In [None]:
# Abbreviation of the team each player row belongs to (NaN when `Team` is
# neither 'home' nor 'away').
#
# preprocess() rewrites these four abbreviations in HomeTeamAbbr/VisitorTeamAbbr,
# and feature_engineering() compares TeamName with PossessionTeam. TeamName is
# derived here, *before* preprocess() runs, so it has to be normalised the same
# way; otherwise the training rows of these clubs never compare equal to
# PossessionTeam, and the per-team fill dictionary built from TeamName uses keys
# that differ from the normalised TeamName computed for test frames.
# NOTE(review): presumably PossessionTeam uses the ARZ/BLT/CLV/HST spelling --
# confirm against the data dictionary.
team_abbr_fix = {"ARI": "ARZ", "BAL": "BLT", "CLE": "CLV", "HOU": "HST"}

is_home = train_df['Team'] == 'home'
is_away = train_df['Team'] == 'away'
train_df['TeamName'] = (
    train_df['HomeTeamAbbr']
    .where(is_home, train_df['VisitorTeamAbbr'].where(is_away))
    .replace(team_abbr_fix)
)

In [None]:
# Global objects that are fitted on the training data and reused at test time.

def _mode_by_group(frame, keys, column):
    """Return ``{group key: most frequent value of `column`}`` for `frame`.

    ``groupby(...).apply(pd.Series.mode)`` appends one extra index level holding
    the position of each mode inside its group. That level is always the LAST
    one, so it is dropped with ``droplevel(-1)``; dropping level 1 is only
    correct for a single grouping key and would remove ``GameId`` for the
    ``['Week', 'GameId']`` groupings, producing keys that never match.
    When a group has several modes the last one wins; groups whose values are
    all missing get no entry.
    """
    modes = frame.groupby(keys)[column].apply(pd.Series.mode)
    modes.index = modes.index.droplevel(-1)
    return modes.to_dict()


fillnan_dict_OffenseFormation = _mode_by_group(train_df, 'TeamName', 'OffenseFormation')
fillnan_dict_StadiumType = _mode_by_group(train_df, 'HomeTeamAbbr', 'StadiumType')
fillnan_dict_WindSpeed = _mode_by_group(train_df, ['Week', 'GameId'], 'WindSpeed')
fillnan_dict_WindDirection = _mode_by_group(train_df, ['Week', 'GameId'], 'WindDirection')

# Filled in by feature_engineering() on the training pass.
yardsleft_target_mean = None

# Dense one-hot output; categories unseen during fit encode as all zeros instead
# of raising while predicting. `sparse` was renamed to `sparse_output` in newer
# scikit-learn releases, so try the new spelling first and fall back.
try:
    oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    oh = OneHotEncoder(sparse=False, handle_unknown='ignore')
le = LabelEncoder()
scaler = StandardScaler()

# Every position label present in the training data; preprocess() uses it to
# add the count columns a test frame does not contain.
position_cols = list(train_df['Position'].unique())

# Flag the ball carrier's row; the flag is a helper, not a model input.
train_df['IsRusher'] = train_df.NflId == train_df.NflIdRusher
dropcols += ['IsRusher']

In [None]:
def create_football_field(linenumbers=True,
                          endzones=True,
                          highlight_line=False,
                          highlight_line_number=50,
                          highlighted_name='Line of Scrimmage',
                          fifty_is_los=False,
                          figsize=(12*2, 6.33*2)):
    """
    Function that plots the football field for viewing plays.
    Allows for showing or hiding endzones.

    The drawing uses a 120 x 53.3 coordinate system in which x in [0, 10] and
    x in [110, 120] are the end zones.

    Parameters
    ----------
    linenumbers : bool
        Draw the yard numbers along both sidelines.
    endzones : bool
        Shade the two end zones and restrict hash marks to x in [11, 109].
    highlight_line : bool
        Draw a vertical yellow line at `highlight_line_number` + 10.
    highlight_line_number : number
        Yard line to highlight (the 10-yard end-zone offset is added here).
    highlighted_name : str
        Label written next to the highlighted line.
    fifty_is_los : bool
        Draw a gold line at x = 60 labelled as the player yardline at snap.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes holding the field.
    """
    rect = patches.Rectangle((0, 0), 120, 53.3, linewidth=0.1,
                             edgecolor='r', facecolor='darkgreen', zorder=0,  alpha=0.5)

    fig, ax = plt.subplots(1, figsize=figsize)
    ax.add_patch(rect)

    # One continuous polyline: the 10-yard lines followed by the field outline.
    plt.plot([10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
              80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
             [0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
              53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
             color='white')
    if fifty_is_los:
        plt.plot([60, 60], [0, 53.3], color='gold')
        plt.text(62, 50, '<- Player Yardline at Snap', color='gold')
    # Endzones
    if endzones:
        ez1 = patches.Rectangle((0, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        # Rectangle takes (xy, width, height): the right end zone starts at
        # x = 110 and is 10 wide. A width of 120 would extend it to x = 230.
        ez2 = patches.Rectangle((110, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        ax.add_patch(ez1)
        ax.add_patch(ez2)
    plt.xlim(0, 120)
    plt.ylim(-5, 58.3)
    plt.axis('off')
    if linenumbers:
        for x in range(20, 110, 10):
            # Numbers count up to midfield and back down; subtract the end zone.
            numb = x
            if x > 50:
                numb = 120 - x
            plt.text(x, 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white')
            plt.text(x - 0.95, 53.3 - 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white', rotation=180)
    if endzones:
        hash_range = range(11, 110)
    else:
        hash_range = range(1, 120)

    # Four rows of one-yard hash marks: both sidelines and the two inner rows.
    for x in hash_range:
        ax.plot([x, x], [0.4, 0.7], color='white')
        ax.plot([x, x], [53.0, 52.5], color='white')
        ax.plot([x, x], [22.91, 23.57], color='white')
        ax.plot([x, x], [29.73, 30.39], color='white')

    if highlight_line:
        hl = highlight_line_number + 10
        plt.plot([hl, hl], [0, 53.3], color='yellow')
        plt.text(hl + 2, 50, '<- {}'.format(highlighted_name),
                 color='yellow')
    return fig, ax

def get_dx_dy(radian_angle, dist):
    """Split a length `dist` pointing along `radian_angle` (radians) into its
    x and y components and return them as ``(dx, dy)``."""
    return dist * math.cos(radian_angle), dist * math.sin(radian_angle)


def show_play_std(play_id, train=train_df):
    """Plot one play on the field with a velocity arrow for every player.

    Parameters
    ----------
    play_id : value compared against the ``PlayId`` column.
    train : DataFrame holding at least PlayId, YardLine_std, X, Y, Dir, S, Team,
        NflId, NflIdRusher, Yards and PlayDirection. The default is bound when
        this function is defined, i.e. to the frame `train_df` referred to at
        that moment.

    `Dir` is taken to be in degrees and converted with the same formula
    preprocess() uses for `Dir_rad` (``(90 - Dir) mod 360``, then to radians)
    before being handed to get_dx_dy(), which expects radians.
    """
    df = train[train.PlayId == play_id]
    YardLine = df.YardLine_std.values[0]
    fig, ax = create_football_field(highlight_line=True,
                                    highlight_line_number=YardLine,
                                   )
    ax.scatter(df.X, df.Y, cmap='rainbow', c=~(df.Team == 'home'), s=100)
    rusher_row = df[df.NflIdRusher == df.NflId]
    # Labelled so that plt.legend() below has an artist to show.
    ax.scatter(rusher_row.X, rusher_row.Y, color='black', s=100, label='Rusher')
    yards_covered = rusher_row["Yards"].values[0]

    def _velocity(dir_deg, speed):
        # Degrees -> radians before decomposing the speed.
        return get_dx_dy(math.radians((90 - dir_deg) % 360), speed)

    for (x, y, Dir, S) in zip(df.X, df.Y, df.Dir, df.S):
        dx, dy = _velocity(Dir, S)
        ax.arrow(x, y, dx, dy, length_includes_head=False, width=0.2, color='black', alpha=0.5)

    # Emphasise the rusher's own arrow (previously this reused whatever values
    # the loop above left behind, i.e. the last player in the frame).
    for (x, y, Dir, S) in zip(rusher_row.X, rusher_row.Y, rusher_row.Dir, rusher_row.S):
        dx, dy = _velocity(Dir, S)
        ax.arrow(x, y, dx, dy, length_includes_head=True, width=0.3, color='black')

    left = 'left' if np.any(df.PlayDirection == 'left')  else 'right'
    plt.title(f'Play # {play_id} moving to {left}, yard distance is {yards_covered}', fontsize=20)
    plt.legend()
    plt.show()

def feature_engineering(train_df, test=False):
    """Add play-level aggregate features to a preprocessed player-row frame.

    Parameters
    ----------
    train_df : DataFrame with one row per player per play. It must already
        contain PossessionTeam, TeamName, DefendersInTheBox, Distance,
        PlayerWeight, PlayerHeight, PlayerAge, S, A, Dir, X, Y, PlayId,
        YardsLeft, IsRusher (and Yards when ``test`` is False).
    test : bool
        False: learn the YardsLeft -> mean(Yards) table and store it in the
        global `yardsleft_target_mean`. True: reuse the stored table.

    Returns
    -------
    A new DataFrame (result of merges) with the added columns. The input frame
    is also modified in place for the columns assigned before the first merge.

    Side effects: registers 'IsOnOffence' in the global `dropcols` (once) and,
    on the training pass, overwrites the global `yardsleft_target_mean`.
    """
    global dropcols, yardsleft_target_mean
    train_df['IsOnOffence'] = train_df['PossessionTeam'] == train_df['TeamName']
    # This runs once per test play; only register the helper column once so the
    # global list does not grow with every call.
    if 'IsOnOffence' not in dropcols:
        dropcols.append('IsOnOffence')

    # new features
    train_df['DefendersInTheBox_vs_Distance'] = train_df['DefendersInTheBox'] / train_df['Distance']
    train_df['PlayerBMI'] = 703 * (train_df['PlayerWeight'] / (train_df['PlayerHeight'])**2)

    # Per-play mean of each physical attribute, separately for offence/defence.
    agg = 'mean'
    for c in ['PlayerHeight', 'PlayerWeight', 'PlayerAge', 'PlayerBMI', 'S', 'A']:
        side_means = train_df.pivot_table(index='PlayId', columns='IsOnOffence', values=c, aggfunc=agg)
        # Guarantee both columns exist even if one side has no rows in this
        # frame; the missing side then maps to NaN instead of raising KeyError.
        side_means = side_means.reindex(columns=[True, False])
        side_means = side_means.rename({True: f"Offensive_{c}_{agg}", False: f"Defensive_{c}_{agg}"}, axis=1)
        train_df[f'Offensive_{c}_{agg}'] = train_df['PlayId'].map(side_means[f"Offensive_{c}_{agg}"])
        train_df[f'Defensive_{c}_{agg}'] = train_df['PlayId'].map(side_means[f"Defensive_{c}_{agg}"])

    # Target-mean encoding of YardsLeft (table learned on the training pass).
    if not test:
        yardsleft_target_mean = train_df.groupby('YardsLeft')['Yards'].mean()
    train_df['YardsLeft_target_mean'] = train_df['YardsLeft'].map(yardsleft_target_mean)
    if test:
        # A YardsLeft value never seen in training has no table entry; fall
        # back to the average of the table rather than leaving NaN.
        train_df['YardsLeft_target_mean'] = train_df['YardsLeft_target_mean'].fillna(yardsleft_target_mean.mean())

    # Speed / acceleration components; `Dir` is treated as degrees here.
    dir_rad = train_df['Dir'] * (np.pi/180)
    train_df['S_horizontal'] = train_df['S'] * np.cos(dir_rad)
    train_df['S_vertically'] = train_df['S'] * np.sin(dir_rad)
    train_df['A_horizontal'] = train_df['A'] * np.cos(dir_rad)
    train_df['A_vertically'] = train_df['A'] * np.sin(dir_rad)

    # Broadcast the rusher's own kinematics to every row of the play
    # (columns get the "_rusher" suffix).
    rusher_cols = ['PlayId', 'X', 'Y', 'S', 'A', 'Dir',
                   'S_horizontal', 'S_vertically', 'A_horizontal', 'A_vertically']
    rusher_data = train_df.loc[train_df['IsRusher'], rusher_cols].copy().set_index('PlayId')
    train_df = pd.merge(train_df, rusher_data, left_on='PlayId', right_index=True, how='left', suffixes=("", "_rusher"))

    # Mean SQUARED distance between the rusher and each player not on offence
    # (no square root is taken; the column name is kept for compatibility).
    defensive_data = train_df.loc[~train_df['IsOnOffence'],
                                  ['X_rusher', 'Y_rusher', 'PlayId', 'X', 'Y']].copy().set_index('PlayId')
    defensive_data['rusher_dist_defense'] = (
        (defensive_data['X_rusher'] - defensive_data['X'])**2
        + (defensive_data['Y_rusher'] - defensive_data['Y'])**2
    )
    defensive_dist_stats = (
        defensive_data.groupby('PlayId')['rusher_dist_defense']
        .agg(['mean'])
        .rename({'mean': 'rusher_dist_defense_mean'}, axis=1)
    )
    # Left join so that a play without any defensive row keeps its rows (with
    # NaN) instead of silently disappearing from the frame.
    train_df = pd.merge(train_df, defensive_dist_stats, left_on='PlayId', right_index=True, how='left')
    return train_df

In [None]:
def preprocess(train_df, test=False):
    """Clean a raw player-row frame and derive the model's input columns.

    Parameters
    ----------
    train_df : raw DataFrame with one row per player per play (training CSV or
        a frame yielded by the competition environment).
    test : bool
        False: training pass -- drops plays whose Yards fall outside the range
        implied by YardsLeft and fits the global one-hot encoder `oh`.
        True: inference pass -- derives TeamName, reuses the fitted encoder and
        adds the position / Down / Quarter columns a small frame may lack.

    Returns
    -------
    A new DataFrame sorted by PlayId, Team, IsRusher, JerseyNumber.

    Side effects: the input frame is modified in place up to the first merge
    (including row drops on the training pass); helper columns are registered
    in the global `dropcols`; `oh` is fitted when ``test`` is False.
    """
    global dropcols,position_cols
    import re

    def _register_dropcols(*names):
        # preprocess() runs once per test play: add each helper column only
        # once so the global list does not grow on every call.
        for name in names:
            if name not in dropcols:
                dropcols.append(name)

    # Bring the four differently spelled team abbreviations in line.
    for raw_abbr, fixed_abbr in (("ARI", "ARZ"), ("BAL", "BLT"), ("CLE", "CLV"), ("HOU", "HST")):
        for abbr_col in ('VisitorTeamAbbr', 'HomeTeamAbbr'):
            train_df.loc[train_df[abbr_col] == raw_abbr, abbr_col] = fixed_abbr

    WindMap={'SW':"SW", 'NNE':"NE", 'SE':"SE",'East':"E", np.nan:np.nan, 'NE':"NE", 'North':"N", 'S':"S", 'Northwest':"NW",
           'SouthWest':"SW", 'ENE':"NE", 'ESE':"SE", 'SSW':"SW", 'NW':"NW", 'Northeast':"NE", 'From S':"S", 'W':"W",
           'South':"S", 'West-Southwest':"SW", 'E':"E", '13':'13','N':"N", 'NNW':"NW",
           'South Southeast':"SE", 'SSE':"SE", 'West':"W", 'WSW':"SW", 'From SW':"SW", 'WNW':"NW", 's':"S",
           'NorthEast':"NE", 'from W':"W", 'W-NW':"NW", 'South Southwest':"SW", 'Southeast':"SE",
           'From WSW':"SW", 'West Northwest':"NW", 'Calm':"Calm", 'From SSE':"SE", 'From W':"W",
           'East North East':"NE", 'From ESE':"SE", 'EAST':"E", 'East Southeast':"SE",
           'From SSW':"SW", '8':"8", 'North East':"NE", 'Southwest':"SW", 'North/Northwest':"NW",
           'From NNE':"NE", '1':"NE", 'N-NE':"NE", 'W-SW':"SW", 'From NNW':"NW"}

    Turf_map={"Grass":"Grass","grass":"Grass",'Twenty-Four/Seven Turf':"FieldTurf",
              'DD GrassMaster':'DD GrassMaster','A-Turf Titan':'A-Turf Titan',
              "FieldTurf 360":"FieldTurf","FieldTurf360":"FieldTurf",'UBU Sports Speed S5-M':'UBU Sports Speed S5-M',
             "Field Turf":"FieldTurf","Natural":"Natural", "Natural Grass":"Natural",
             "Natural grass":"Natural","Naturall Grass":"Natural",'SISGrass':'Grass',
              'UBU Speed Series-S5-M':"UBU Speed Series-S5-M","Artifical":"Artifical","Artificial":"Artifical"}

    Type_map={'Outdoor':'Outdoor','Outdoors':'Outdoor','Oudoor':'Outdoor'
             ,'Ourdoor':'Outdoor','Outside':'Outdoor','Outdor':'Outdoor'
             ,'Outddors':'Outdoor','Retractable Roof':'Retractable Roof',
             'Retr. Roof-Closed':'Retractable Roof','Retr. Roof - Closed':'Retractable Roof',
             'Retr. Roof Closed':'Retractable Roof',  "Outdoor Retr Roof-Open":"Outdoor",
             "Indoors":"Indoor","Indoor":"Indoor","Indoor, Roof Closed":"Indoor",
             "Doom":"Doom","Domed, closed":"Doom","Domed, open":"Doom","Closed Dome":"Doom"
             ,"Bowl":"Bowl","Open":"Outdoor","Cloudy":"Outdoor","Domed, Open":"Doom"
             ,'Retr. Roof-Open':'Retractable Roof',"Retr. Roof - Open":"Retractable Roof",
              "Indoor, Open Roof":"Indoor","Heinz Field" :"Heinz Field"}
    if test:
        # Test frames arrive without the TeamName helper column.
        train_df['TeamName'] = np.nan
        train_df.loc[train_df['Team'] == 'home', 'TeamName'] = train_df.loc[train_df['Team'] == 'home', 'HomeTeamAbbr']
        train_df.loc[train_df['Team'] == 'away', 'TeamName'] = train_df.loc[train_df['Team'] == 'away', 'VisitorTeamAbbr']

    # Collapse free-text categories; values missing from a map become NaN.
    train_df.StadiumType = train_df.StadiumType.map(Type_map)
    train_df.Turf = train_df.Turf.map(Turf_map)
    train_df.WindDirection = train_df.WindDirection.map(WindMap)

    # fillna: look the fill value up by team / by (Week, GameId) through a
    # temporary index, then write the values back positionally.
    train_df['OffenseFormation'] = train_df.set_index('TeamName')['OffenseFormation'].fillna(fillnan_dict_OffenseFormation).values
    train_df['WindSpeed'] = train_df.set_index(['Week','GameId'])['WindSpeed'].fillna(fillnan_dict_WindSpeed).values
    train_df['WindDirection'] = train_df.set_index(['Week','GameId'])['WindDirection'].fillna(fillnan_dict_WindDirection).values

    # get a continuous value
    def transform_gameclock(txt):
        txt = txt.split(':')
        ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
        return ans

    train_df['GameClock'] = train_df['GameClock'].apply(transform_gameclock)

    # time related
    transform_time = lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")
    train_df['TimeHandoff'] = train_df['TimeHandoff'].apply(transform_time)
    train_df['TimeSnap'] = train_df['TimeSnap'].apply(transform_time)
    train_df['TimeDelta'] = (train_df['TimeHandoff'] - train_df['TimeSnap']).dt.seconds

    # birth date
    transform_birthdaytime = lambda x: datetime.datetime.strptime(x, "%m/%d/%Y")
    train_df['PlayerBirthDate'] = train_df['PlayerBirthDate'].apply(transform_birthdaytime)
    seconds_in_year = 60*60*24*365.25
    # `.dt.seconds` is only the sub-day remainder of a timedelta (0..86399);
    # an age in years needs the whole duration, i.e. total_seconds().
    train_df['PlayerAge'] = (train_df['TimeHandoff'] - train_df['PlayerBirthDate']).dt.total_seconds() / seconds_in_year

    # height: "feet-inches" -> inches
    transform_playerheight = lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1])
    train_df['PlayerHeight'] = train_df['PlayerHeight'].apply(transform_playerheight)

    # Wind speed clean-up. Only string entries are rewritten, so a column that
    # already arrives numeric passes through untouched instead of raising.
    train_df['WindSpeed'] = train_df['WindSpeed'].apply(
        lambda x: x.lower().replace('mph', '').strip() if isinstance(x, str) else x)
    # "a-b" ranges -> midpoint
    train_df['WindSpeed'] = train_df['WindSpeed'].apply(
        lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if isinstance(x, str) and '-' in x else x)
    # "a gusts up to b" -> midpoint of first and last token
    train_df['WindSpeed'] = train_df['WindSpeed'].apply(
        lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if isinstance(x, str) and 'gusts up to' in x else x)

    def personnel_split(x, pattern):
        # Head-count in front of the unit tag, e.g. "2 RB" -> 2; 0 if absent.
        s = re.findall(f"([1-9]) {pattern}", x)
        if len(s) > 0:
            return int(s[0])
        else:
            return 0

    for unit in ('RB', 'DB', 'DL', 'LB', 'OL', 'QB', 'TE', 'WR'):
        train_df[f'OffensePersonnel_{unit}'] = train_df['OffensePersonnel'].apply(personnel_split, pattern=unit)
    for unit in ('DB', 'DL', 'LB', 'OL'):
        train_df[f'DefensePersonnel_{unit}'] = train_df['DefensePersonnel'].apply(personnel_split, pattern=unit)

    # direction problem: 2017 orientations get a 90 degree correction
    train_df['Orientation'] = np.where(train_df['Season'] == 2017, (360+train_df['Orientation']+90).mod(360), train_df['Orientation'])

    # Raw (un-flipped) direction converted to radians.
    train_df['Dir_rad'] = np.mod(90 - train_df.Dir, 360) * math.pi/180.0

    # Standardise so every play runs towards increasing X.
    train_df['YardLine_std'] = np.where(train_df.FieldPosition == train_df.PossessionTeam, train_df.YardLine, 100 - train_df.YardLine)
    train_df['X'] = np.where(train_df['PlayDirection'] == 'right', train_df['X'], 120-train_df['X'])
    train_df['Y'] = np.where(train_df['PlayDirection'] == 'right', train_df['Y'], 160/3 - train_df['Y'])
    # NOTE(review): X and Y are both mirrored for left plays (a half turn),
    # whereas Orientation is reflected (360 - angle); confirm this is intended.
    train_df['Orientation'] = np.where(train_df['PlayDirection'] == 'right', train_df['Orientation'], (360 - train_df['Orientation']).mod(360))
    # Keep Dir in DEGREES for every row: feature_engineering() multiplies it by
    # pi/180. Mirroring X and Y is a half turn, so left plays rotate by 180
    # degrees. (Writing a radian value here for left plays only would mix
    # units inside one column.)
    train_df['Dir'] = np.where(train_df['PlayDirection'] == 'right', train_df['Dir'], np.mod(180 + train_df['Dir'], 360))

    _register_dropcols('YardLine')

    # NOTE(review): this derives the distance to the goal from HomeTeamAbbr and
    # PlayDirection rather than from PossessionTeam (as YardLine_std does);
    # verify it matches 100 - YardLine_std for all plays.
    tmp = np.where(train_df['FieldPosition'] == train_df['HomeTeamAbbr'], 100 - train_df['YardLine'], train_df['YardLine'])
    train_df['YardsLeft'] = np.where(train_df['PlayDirection'] == 'right', tmp, 100 - tmp)
    if not test:
        # Drop plays whose outcome is outside [YardsLeft - 100, YardsLeft].
        train_df.drop(train_df.index[(train_df['YardsLeft'] < train_df['Yards']) | ((train_df['YardsLeft']-100) > train_df['Yards'])], inplace=True)

    del tmp

    # feature
    train_df['IsRusher'] = (train_df['NflId'] == train_df['NflIdRusher'])
    _register_dropcols('IsRusher')

    # One-hot OffenseFormation into x0, x1, ...; the encoder is fitted on train.
    if not test:
        oh.fit(train_df[['OffenseFormation']])
    val = oh.transform(train_df[['OffenseFormation']])
    for j in range(val.shape[1]):
        train_df[f'x{j}'] = val[:, j]

    # deal with position: number of players per position in each play
    position_counts = train_df.groupby(by=['PlayId', 'Position']).size().unstack(level=1)
    if test:
        # A single play rarely contains every position seen in training.
        for c in position_cols:
            if c not in position_counts.columns:
                position_counts[c] = np.nan
    position_counts = position_counts.fillna(0).astype(int)
    position_counts.columns = [f"Position_{i}" for i in position_counts.columns]
    train_df = pd.merge(train_df, position_counts, left_on='PlayId', right_index=True, how='left')

    _register_dropcols('Position')

    # transform down and quarter to one-hot
    if not test:
        tmp = pd.get_dummies(train_df['Down'])
        tmp.columns = [f"Down_{i}" for i in tmp.columns]
        train_df = pd.concat([train_df, tmp], axis=1)
    else:
        # Emit the full fixed column set row by row; an unexpected value gives
        # all zeros instead of raising.
        for d in [1, 2, 3, 4]:
            train_df[f'Down_{d}'] = (train_df['Down'] == d).astype(int)

    _register_dropcols('Down')

    if not test:
        tmp = pd.get_dummies(train_df['Quarter'])
        tmp.columns = [f"Quarter_{i}" for i in tmp.columns]
        train_df = pd.concat([train_df, tmp], axis=1)
    else:
        for q in [1, 2, 3, 4, 5]:
            train_df[f'Quarter_{q}'] = (train_df['Quarter'] == q).astype(int)

    _register_dropcols('Quarter')

    train_df = feature_engineering(train_df, test)
    # Fixed row order inside each play so downstream [::22] slicing is stable.
    train_df = train_df.sort_values(by=['PlayId', 'Team', 'IsRusher', 'JerseyNumber'])

    return train_df

In [None]:
%%time
# Run cleaning + feature engineering on the training frame. With test=False
# (the default) preprocess() also fits the one-hot encoder and registers its
# helper columns in `dropcols`.
train_df = preprocess(train_df)

In [None]:
# Split the remaining columns into categorical (object dtype) and dense
# features, and impute missing values: most frequent value for categorical
# columns, mean for dense ones.
cat_features = []
dense_features = []
for col in train_df.columns:
    if col in dropcols:
        continue
    has_missing = train_df[col].isnull().sum() > 0
    if train_df[col].dtype =='object':
        cat_features.append(col)
        if has_missing:
            print(f"fillna {col}")
            # mode() returns a Series; passing it to fillna() aligns on the
            # index and would only fill the row labelled 0. Use its first value.
            col_modes = train_df[col].mode()
            if not col_modes.empty:
                train_df[col] = train_df[col].fillna(col_modes.iloc[0])
        print("*cat*", col, len(train_df[col].unique()))
    else:
        dense_features.append(col)
        if has_missing:
            print(f"fillna {col}")
            train_df[col] = train_df[col].fillna(train_df[col].mean())
        print("!dense!", col, len(train_df[col].unique()))

In [None]:
# now we need to compress the data to play-level data
play_train_df = None
## dense features for play
game_features = []
## dense features for each player
player_features = []
for col in train_df.columns:
    if col in dropcols:
        continue
    # A column counts as player-level when it varies inside the first 22 rows.
    # NOTE(review): assumes those rows are exactly one play -- confirm.
    first_play = train_df[col][:22]
    # `np.object` no longer exists in current NumPy; the builtin `object`
    # compares equal to the same dtype on every version.
    if train_df[col].dtype == object:
        varies_within_play = first_play.nunique() != 1
    else:
        varies_within_play = first_play.std() > 1e-10
    if varies_within_play:
        player_features.append(col)

# Everything that is neither dropped nor player-level is constant per play.
game_features = [i for i in train_df.columns if i not in dropcols and i not in player_features]

In [None]:
def transform(train_df, player_features, game_features,test=False):
    """Build the play-level design matrix and the step-function targets.

    Parameters
    ----------
    train_df : preprocessed frame with exactly 22 consecutive rows per play.
    player_features : list of per-player columns. Accepted for interface
        compatibility; the matrix is built from `game_features` only.
    game_features : list of columns that are constant within a play; the value
        of every 22nd row is used.
    test : bool
        False: also build the targets and fit the global `scaler`.
        True: targets stay all zero and the already fitted `scaler` is reused.

    Returns
    -------
    (X, y): X is the scaled (n_plays, len(game_features)) array; y is an
    (n_plays, 199) array in which, for a play with outcome ``yard``, every
    column from index ``yard + 99`` onwards is 1.

    Raises
    ------
    ValueError if the number of rows is not a multiple of 22.
    """
    n_rows = len(train_df)
    # The [::22] slicing below silently misaligns otherwise.
    if n_rows % 22 != 0:
        raise ValueError(f"expected 22 rows per play, got {n_rows} rows")

    # The per-player reshape that used to precede this was overwritten before
    # use, so only the play-level columns are materialised.
    X_train = np.zeros(shape=(n_rows // 22, len(game_features)))
    for i, col in enumerate(game_features):
        X_train[:, i] = train_df[col][::22]

    y_train = np.zeros(shape=(X_train.shape[0], 199))
    if not test:
        for i, yard in enumerate(train_df[TARGET][::22]):
            y_train[i, yard+99:] = 1
        scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    return X_train, y_train

In [None]:
def crps(y_true, y_pred):
    """Per-sample loss: mean squared gap between `y_true` and the running sum
    of `y_pred` taken along axis 1."""
    cumulative_pred = K.cumsum(y_pred, axis=1)
    squared_gap = K.square(y_true - cumulative_pred)
    return K.mean(squared_gap, axis=1)

In [None]:
def get_model():
    """Build the two-headed network.

    The input width is read from the global `X_train`. Three Dense -> ReLU ->
    BatchNormalization stages (512, 1024, 512 units; the first Dense is named
    "dens_1") feed a 199-way softmax head ("real_output") and a linear scalar
    head ("out_reg"). The model is returned uncompiled.
    """
    n_inputs = X_train.shape[1]
    x = keras.layers.Input(shape=[n_inputs])

    hidden_stages = (
        dict(units=512, input_shape=[n_inputs], name="dens_1"),
        dict(units=1024),
        dict(units=512),
    )
    a = x
    for dense_kwargs in hidden_stages:
        a = keras.layers.Dense(**dense_kwargs)(a)
        a = keras.layers.ReLU()(a)
        a = keras.layers.BatchNormalization()(a)

    output = keras.layers.Dense(units=199, activation='softmax', name = "real_output")(a)
    out_reg = keras.layers.Dense(1, activation=None, name = "out_reg")(a)
    return keras.models.Model(inputs=[x], outputs=[output, out_reg])

def train_model(X_train, y_train, X_val, y_val, batch_size=64, epochs=200):
    """Compile and fit a fresh model from get_model().

    Parameters
    ----------
    X_train, X_val : feature arrays.
    y_train, y_val : two-element lists ``[cdf_targets, raw_yards]`` matching the
        model's two outputs.
    batch_size, epochs : forwarded to ``model.fit``.

    Returns
    -------
    The fitted model. Training stops early once ``val_loss`` has not improved
    by 1e-4 for 10 epochs, with ``restore_best_weights=True``.
    """
    model = get_model()
    # CRPS drives the softmax head; the MAE on the scalar head is down-weighted.
    model.compile(optimizer=Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.99), loss=[crps, keras.losses.mae],
                  loss_weights=[1.0, 0.01])
    er = EarlyStopping(patience=10, min_delta=1e-4, restore_best_weights=True, monitor='val_loss')
    # validation_data is documented as a tuple (x_val, y_val); a list is not
    # reliably unpacked into inputs and targets across Keras versions.
    model.fit(X_train, y_train, epochs=epochs, callbacks=[er], validation_data=(X_val, y_val), batch_size=batch_size)
    return model

In [None]:
# Play-level feature matrix and targets; with the default test=False this call
# also fits the global `scaler`.
X_train, y_train = transform(train_df, player_features, game_features)

In [None]:
# Column names for X_train: one "<feature>_<slot>" entry per player slot, then
# the play-level names. The number of slots is derived from the matrix width.
feature_name = [f"{i}_{k}" for k in range(int((X_train.shape[1]-len(game_features)) / len(player_features))) for i in player_features]  + game_features

In [None]:
# Same matrix as a DataFrame with readable column names, for inspection.
X_train_df = pd.DataFrame(X_train, columns=feature_name)

In [None]:
# Pairwise correlation matrix of the input features.
X_train_df_corr = X_train_df.corr()

In [None]:
# Raw yardage taken from every 22nd row; passed to train_model() as the target
# of the model's second output.
y_train_raw = train_df['Yards'].iloc[::22].values

In [None]:
# # train the model
# from sklearn.model_selection import train_test_split, KFold, GroupKFold
# models = []
# losses = []
# batch_size = 64
# epochs = 100
# groups = train_df['GameId'].iloc[::22].values
# kfold = GroupKFold(n_splits=5)
# for tr_inds, val_inds in kfold.split(X_train, y_train, groups):
#     print(tr_inds, val_inds)
#     trainX, valX = X_train[tr_inds], X_train[val_inds]
#     trainY, valY = y_train[tr_inds], y_train[val_inds]
#     y_train_raw_tr = y_train_raw[tr_inds]
#     y_train_raw_val = y_train_raw[val_inds]
#     model = train_model(trainX, [trainY, y_train_raw_tr], valX, [valY, y_train_raw_val], batch_size, epochs)
#     models.append(model)
#     losses.append(model.history.history['val_real_output_loss'][-1])
    
# print("CROSS-VALID: ", np.mean(np.array(losses)))

In [None]:
# Train one model with a fixed hold-out: the plays of five games form the
# validation set and every other play is used for fitting.
models, losses = [], []
batch_size, epochs = 64, 100

groups = train_df['GameId'].iloc[::22].values
holdout_game_ids = [2018123011, 2018123012, 2018123013, 2018123014, 2018123015]
val_inds = np.isin(groups, holdout_game_ids)
tr_inds = ~val_inds

trainX, trainY, y_train_raw_tr = X_train[tr_inds], y_train[tr_inds], y_train_raw[tr_inds]
valX, valY, y_train_raw_val = X_train[val_inds], y_train[val_inds], y_train_raw[val_inds]

model = train_model(
    trainX, [trainY, y_train_raw_tr],
    valX, [valY, y_train_raw_val],
    batch_size, epochs,
)
models.append(model)
# Loss of the softmax head on the validation set at the last epoch that ran.
losses.append(model.history.history['val_real_output_loss'][-1])

print("CROSS-VALID: ", np.mean(np.array(losses)))

In [None]:
def make_pred(df, sample, env, models):
    """Average the first output of every model in `models` on `df`.

    Parameters
    ----------
    df : feature array handed to each ``model.predict``.
    sample, env : unused; kept so existing call sites keep working.
    models : sequence of fitted models whose ``predict`` returns a list; element
        0 of that list is the one that gets averaged.

    Returns
    -------
    The element-wise mean over the models, clipped to [0, 1].
    """
    ## pred
    # np.mean already divides by the number of models; dividing by len(models)
    # again would shrink the ensemble output by that factor.
    pred = np.mean([model.predict(df)[0] for model in models], axis=0)
    pred = np.clip(pred, 0, 1)
    return pred

In [None]:
def post_process(y_pred, df):
    """Zero out bins outside the allowed window and renormalise each row.

    Parameters
    ----------
    y_pred : array of shape (n, 199) with per-bin probabilities; bin ``i``
        stands for a result of ``i - 99`` yards. Modified in place.
    df : frame whose first ``YardsLeft`` value defines the window for every row
        of `y_pred`.

    Returns
    -------
    A new (n, 199) array whose rows sum to 1. A row left with no mass at all is
    returned as zeros rather than NaN.
    """
    yardsleft = int(df['YardsLeft'].iloc[0])
    # NOTE(review): the lower cut equals -(99 - yardsleft) + 99 == yardsleft,
    # i.e. it removes results below yardsleft - 99, while preprocess() keeps
    # training plays down to yardsleft - 100. Confirm which bound is intended.
    max_back = yardsleft
    max_push = yardsleft + 99
    y_pred[:, :max_back] = 0
    # Slice the bin axis (axis 1). Slicing axis 0 with this index selects no
    # rows, so gains beyond the goal line would never be removed.
    y_pred[:, max_push+1:] = 0
    # keepdims makes the division row-wise for any number of rows.
    row_totals = np.sum(y_pred, axis=1, keepdims=True)
    y_pred = y_pred / np.where(row_totals > 0, row_totals, 1.0)
    return y_pred

In [None]:
import tqdm

# Score each frame served by the environment and submit its cumulative
# distribution.
for test, sample in tqdm.tqdm(env.iter_test()):
    test = preprocess(test, test=True)
    df, _ = transform(test, player_features, game_features, test=True)
    # Training features were mean-imputed before the scaler was fitted, so a
    # missing value corresponds to 0 after scaling. Test frames are not imputed
    # anywhere else; without this a single NaN would turn the whole prediction
    # into NaN.
    df = np.where(np.isfinite(df), df, 0.0)
    pred = make_pred(df, sample, env, models)
    pred = post_process(pred, test)
    pred = np.cumsum(pred, axis=1)
    pred = pd.DataFrame(data=pred,columns=sample.columns)
    pred = pred.clip(0, 1)
    env.predict(pred)

env.write_submission_file()

In [None]:
from keras.utils import plot_model
# Draw the layer graph of `model`, the network trained above.
plot_model(model)

In [None]:
# Rough feature importance: add up the kernels of the first Dense layer
# ("dens_1") over all models, then take the sum of squared weights leaving
# each input feature.
beta = None
for model in models:
    dens_1_kernel = model.get_layer('dens_1').get_weights()[0]
    beta = dens_1_kernel if beta is None else beta + dens_1_kernel

feature_importance_ = np.sum(np.square(beta), axis=1)

# Rebuild the column names in the same order as the columns of X_train.
n_player_slots = int((X_train.shape[1]-len(game_features)) / len(player_features))
feature_name = []
for k in range(n_player_slots):
    for i in player_features:
        feature_name.append(f"{i}_{k}")
feature_name = feature_name + game_features

fea = pd.Series(feature_importance_, feature_name).sort_values(ascending=False)
print(fea.head(10))
print(fea.tail(10))

plt.figure(figsize = (8, 80))
plt.scatter(feature_importance_, feature_name)
plt.show()

In [None]:
# Display the derived PlayerAge column for a quick sanity check.
train_df['PlayerAge']