In [63]:
import os
import sys
import timeit
import pandas as pd
import numpy as np
import math
import tensorflow as tf
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Dropout, PReLU
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
from kaggle.competitions import nflrush
import lightgbm as lgb
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 100)
pd.set_option('display.width', 1000)
np.set_printoptions(linewidth=1000)

In [64]:
class PersistentFeatures:

    def __init__(self, nfl_df):
        self.nfl = nfl_df
        self.rushers = self.nfl.loc[self.nfl.NflId == self.nfl.NflIdRusher]
        self.rushers.set_index('PlayId', inplace=True)
        self.rusher_features = pd.DataFrame(self.rushers['NflId'].unique(), columns=['NflId'])
        self.columns = []
        self.scaler = StandardScaler()

    def build_features(self):
        self._rusher_features()

    def attach_columns(self, columns):
        self.columns = columns

    def _rusher_features(self):
        self._rusher_mean_yards()

    def _rusher_mean_yards(self):
        mean_dict = self.rushers.groupby(['Season', 'NflId'])['Yards'].mean()
        for year in range(2017, 2020):
            try:
                self.rusher_features["RusherMeanYards_{}".format(year)] = self.rusher_features['NflId'].map(mean_dict[year])
            except IndexError:
                pass
        mean_dict = self.rushers.groupby('NflId')['Yards'].mean().to_dict()
        self.rusher_features['RusherMeanYards'] = self.rusher_features['NflId'].map(mean_dict)

In [65]:
class FeatureExtractor:
    def __init__(self, nfl_df, persistent_features, is_test=False, model_type='nn'):
        self.nfl = nfl_df
        self.is_test = is_test
        self.persistent_features = persistent_features
        self.model_type = model_type
        self._normalise_starting_df()
        self.rushers = self.nfl.loc[self.nfl.IsRusher]
        self.features = self.rushers[['PlayId', 'GameId', 'NflId']]
        self.rushers.set_index('PlayId', inplace=True)
        self.features.set_index('PlayId', inplace=True)
        self.results = np.zeros((self.features.shape[0], 199))

    def run(self):
        self.attach_persistent_features()
        self.rusher_features()
        self.game_features()
        self.defense_features()
        self.offense_features()
        self.play_features()
        self.features = self.features.fillna(self.features.mean())

        if self.is_test:
            full_columns = self.persistent_features.columns
            missing_cols = set(full_columns) - set(self.features.columns)
            for c in missing_cols:
                self.features[c] = 0
        else:
            self._make_result_set()
            self.persistent_features.scaler.fit(self.features)

    def get_final_features(self):
        return self.persistent_features.scaler.transform(self.features)

    def attach_persistent_features(self):
        self.features.reset_index().merge(self.persistent_features.rusher_features, on='NflId').set_index('PlayId')

    def _make_result_set(self):
        if self.model_type == 'lgbm':
            self.results = np.zeros((self.features.shape[0])) #, 199))
        
        for i, yard in enumerate(self.rushers.Yards):
            if self.model_type == 'lgbm':
                self.results[i] = yard
            else:
                self.results[i, yard + 99:] = np.ones(shape=(1, 100 - yard))

    def play_features(self):
        self.features['DistanceToQB'] = np.sqrt(np.sum(
            self.nfl.loc[(self.nfl.Position == 'QB') | self.nfl.IsRusher, ['PlayId', 'X_std', 'Y_std']].groupby(
                'PlayId').agg(['min', 'max']).diff(axis=1).drop([('X_std', 'min'), ('Y_std', 'min')], axis=1) ** 2,
            axis=1))

    def defense_features(self):
        self.features['Def_DL'] = np.array(self.rushers['DefensePersonnel'].str[:1], dtype='int8')
        self.features['Def_LB'] = np.array(self.rushers['DefensePersonnel'].str[6:7], dtype='int8')
        self.features['Def_DB'] = np.array(self.rushers['DefensePersonnel'].str[12:13], dtype='int8')
        self._defenders_in_the_box()

    def _defenders_in_the_box(self):
        self.features['DefendersInTheBox'] = self.rushers['DefendersInTheBox']
        self.features['DITB_Centroid_X'] = \
            self.nfl.loc[self.nfl.IsDefenderInBox].groupby('PlayId')[['X_std']].mean()
        self.features['DITB_Centroid_Y'] = \
            self.nfl.loc[self.nfl.IsDefenderInBox].groupby('PlayId')[['Y_std']].mean()
        self.features['DITB_Spread_X'] = \
            self.nfl.loc[self.nfl.IsDefenderInBox].groupby('PlayId')['X_std'].agg(['min', 'max']).diff(axis=1)['max']
        self.features['DITB_Spread_Y'] = \
            self.nfl.loc[self.nfl.IsDefenderInBox].groupby('PlayId')['Y_std'].agg(['min', 'max']).diff(axis=1)['max']

    def offense_features(self):
        self.features['Off_RB'] = np.array(self.rushers['OffensePersonnel'].str.extract('(\d) RB'), dtype='int8')
        self.features['Off_TE'] = np.array(self.rushers['OffensePersonnel'].str.extract('(\d) TE'), dtype='int8')
        self.features['Off_WR'] = np.array(self.rushers['OffensePersonnel'].str.extract('(\d) WR'), dtype='int8')
        off_formations = [pd.get_dummies(self.rushers['OffenseFormation'], prefix='Off_Formation')]
        self.features = self.features.join(off_formations)

    def game_features(self):
        self.features['Week'] = self.rushers['Week']
        self.features['Season_2017'] = self.features['Season_2018'] = self.features['Season_2019'] = 0
        self.features.loc[self.rushers.Season == 2017, 'Season_2017'] = 1
        self.features.loc[self.rushers.Season == 2018, 'Season_2018'] = 1
        self.features.loc[self.rushers.Season == 2019, 'Season_2019'] = 1

        self.features['Quarter'] = self.rushers['Quarter']
        self.features['GameClock_std'] = (900.0 - self.rushers['GameClock'].apply(stringtomins)) / 900.0
        self.features['FullGameClock_std'] = (self.features['GameClock_std'] / 4.0) + (
                (self.features['Quarter'] - 1) * 0.25)

        self.features['OffenseScoreDelta'] = self.rushers['HomeScoreBeforePlay'] - self.rushers[
            'VisitorScoreBeforePlay']
        self.features.loc[self.rushers.PossessionTeam != self.rushers.HomeTeamAbbr, 'OffenseScoreDelta'] \
            = -1 * self.features.loc[self.rushers.PossessionTeam != self.rushers.HomeTeamAbbr, 'OffenseScoreDelta']

        self.features['YardLine_std'] = self.rushers['YardLine_std']
        self.features['Down'] = self.rushers['Down']
        self.features['IsFirstAndTen'] = 1
        self.features.loc[(self.rushers.Distance != 10.0) | (self.rushers.Down != 1), 'IsFirstAndTen'] = 0

    def rusher_features(self):
        rushers_features = self.rushers[['S', 'A', 'Dis', 'Orientation', 'Dir', 'PlayerHeight',
                                         'PlayerWeight', 'X_std', 'Y_std', 'Dir_rad', 'Dir_std', 'S_std']].copy(deep=True)
        rushers_features['PlayerHeight'] = rushers_features['PlayerHeight'] \
            .apply(lambda x: 12 * int(x.split('-')[0]) + int(x.split('-')[1]))

        self.features = self.features.join(rushers_features)
        self._rusher_position_ohe()

    def _rusher_position_ohe(self):
        rusher_position = [pd.get_dummies(self.rushers['Position'], prefix='Rusher_Position')]
        self.features = self.features.join(rusher_position)

    def _normalise_starting_df(self):
        self._fix_team_abbr()
        self._fix_orientation()
        self._fix_speed()
        self._add_possession_columns()
        self._flip_left_plays()
        self._distance_to_centers()

    def _fix_team_abbr(self):
        map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
        for abb in self.nfl['PossessionTeam'].unique():
            map_abbr[abb] = abb
        self.nfl['PossessionTeam'] = self.nfl['PossessionTeam'].map(map_abbr)
        self.nfl['HomeTeamAbbr'] = self.nfl['HomeTeamAbbr'].map(map_abbr)
        self.nfl['VisitorTeamAbbr'] = self.nfl['VisitorTeamAbbr'].map(map_abbr)

    def _fix_orientation(self):
        self.nfl.loc[self.nfl['Season'] == 2017, 'Orientation'] \
            = np.mod(90 + self.nfl.loc[self.nfl['Season'] == 2017, 'Orientation'], 360)

    def _fix_speed(self):
        self.nfl['S_std'] = self.nfl['S']
        self.nfl.loc[self.nfl['Season'] == 2017, 'S'] \
            = (self.nfl['S'][self.nfl['Season'] == 2017] - 2.4355) / 1.2930 * 1.4551 + 2.7570

    def _flip_left_plays(self):
        self.nfl['ToLeft'] = self.nfl.PlayDirection == "left"
        self.nfl['YardLine_std'] = 100 - self.nfl.YardLine
        self.nfl.loc[self.nfl.FieldPosition.fillna('') == self.nfl.PossessionTeam, 'YardLine_std'] \
            = self.nfl.loc[self.nfl.FieldPosition.fillna('') == self.nfl.PossessionTeam, 'YardLine']

        self.nfl['X_std'] = self.nfl.X
        self.nfl.loc[self.nfl.ToLeft, 'X_std'] = 120 - self.nfl.loc[self.nfl.ToLeft, 'X']
        self.nfl['Y_std'] = self.nfl.Y - 160 / 6
        self.nfl.loc[self.nfl.ToLeft, 'Y_std'] = 160 / 6 - self.nfl.loc[self.nfl.ToLeft, 'Y']

        self.nfl['Dir_rad'] = np.mod(90 - self.nfl.Dir, 360) * math.pi / 180.0
        self.nfl['Dir_std'] = self.nfl.Dir_rad
        self.nfl.loc[self.nfl.ToLeft, 'Dir_std'] = np.mod(np.pi + self.nfl.loc[self.nfl.ToLeft, 'Dir_rad'], 2 * np.pi)

    def _add_possession_columns(self):
        self.nfl['IsRusher'] = self.nfl.NflId == self.nfl.NflIdRusher
        self.nfl['TeamOnOffense'] = "home"
        self.nfl.loc[self.nfl.PossessionTeam != self.nfl.HomeTeamAbbr, 'TeamOnOffense'] = "away"
        self.nfl['IsOnOffense'] = self.nfl.Team == self.nfl.TeamOnOffense

    def _distance_to_centers(self):
        self.nfl['DisFromPlayStart'] = np.sqrt(
            (self.nfl.X_std - self.nfl.YardLine_std - 10) ** 2 + (self.nfl.Y_std ** 2))
        ranks = self.nfl.groupby(['PlayId', 'IsOnOffense'])['DisFromPlayStart'] \
            .rank(ascending=True, method='first')
        ranks.name = 'RankDisFromPlayStart'
        self.nfl = pd.concat([self.nfl, ranks], axis=1)
        self.nfl['IsDefenderInBox'] = False
        self.nfl.loc[(~self.nfl.IsOnOffense) &
                     (self.nfl.DefendersInTheBox >= self.nfl.RankDisFromPlayStart), ['IsDefenderInBox']] = True


def stringtomins(x):
    h, m, s = map(int, x.split(':'))
    return (h * 60) + m + (s / 60)


In [66]:
class Model:
    def __init__(self, input_shape, type='nn', batch_size=32, epochs=13, feature_columns=None):
        self.type = type
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.epochs = epochs
        self.feature_importance_df = pd.DataFrame()
        self.feature_columns = feature_columns
        self.fold = 0

    def next(self, x_train, y_train, x_test, y_test):
        if self.type == 'nn':
            model = self._create_nn(x_train, y_train, x_test, y_test)
        elif self.type == 'lgbm':
            model = self._create_lgbm(x_train, y_train, x_test, y_test)
        self.fold += 1
        return model

    def _create_nn(self, x_train, y_train, x_test, y_test):
        model = Sequential()
        model.add(Dense(256, input_shape=[self.input_shape], activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(199, activation='sigmoid'))

        model.compile(optimizer='adam', loss=['mse'])
        
        earlystop_callback = EarlyStopping(monitor='mse', min_delta=0.0001, patience=3)
        model.fit(x_train,
                  y_train,
#                       callbacks=[earlystop_callback],
                  epochs=self.epochs,
                  verbose=1,
                  validation_data=(x_test, y_test))
        return model

    def _create_lgbm(self, x_train, y_train, x_test, y_test):
        best_params_lgb = {'lambda_l1': 0.13413394854686794,
                               'lambda_l2': 0.0009122197743451751,
                               'num_leaves': 44,
                               'feature_fraction': 0.4271070738920401,
                               'bagging_fraction': 0.9999128827046064,
                               'bagging_freq': 3,
                               'learning_rate': 0.005,
                               'min_child_samples': 43,
                               'objective': 'regression',
                               'metric': 'mae',
                               'verbosity': -1,
                               'boosting_type': 'gbdt',
                               "boost_from_average": False,
                               'random_state': 42}
        model = lgb.LGBMRegressor(**best_params_lgb, 
                                  n_estimators = 300, 
                                  n_jobs = -1)
        model.fit(x_train,
                  y_train,
                  eval_set=[(x_test, y_test)],
                  early_stopping_rounds=self.epochs,
                  eval_metric='mae',
                  verbose=False)
        self._lgbm_feature_importance(model)
        return model
    
    def _lgbm_feature_importance(self, model):
        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = self.feature_columns
        fold_importance_df["importance"] = model.feature_importances_[:len(self.feature_columns)]
        fold_importance_df["fold"] = self.fold
        self.feature_importance_df = pd.concat([self.feature_importance_df, fold_importance_df], axis=0)

    def eval_feature_importance(self):
        print("Features importance...")
        cols_imp = (self.feature_importance_df[["Feature", "importance"]]
                    .groupby("Feature")
                    .mean()
                    .sort_values(by="importance", ascending=False)[:50].index)
        best_features = self.feature_importance_df.loc[self.feature_importance_df.Feature.isin(cols_imp)]

        plt.figure(figsize=(14, 26))
        sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LightGBM Features (averaged over folds)')
        plt.tight_layout()
        plt.show()


In [67]:
def train(model_generator, fe, k_fold_splits=5):
    print("Training model for {} folds".format(5))
    models = []

    games_df = fe.features.reset_index()['GameId']
    games = np.unique(games_df.values)

    x = fe.get_final_features()
    y = fe.results
    kf = KFold(n_splits=k_fold_splits)
    kf.get_n_splits(games)
    for game_train_index, game_test_index in kf.split(games):
        train_index = games_df.loc[games_df.isin(games[game_train_index])].index.tolist()
        test_index = games_df.loc[games_df.isin(games[game_test_index])].index.tolist()
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = model_generator.next(x_train, y_train, x_test, y_test)
        models.append(model)
    return models


In [68]:
def predict(test_features, models, yardline):
    pred = np.zeros((1, 199))
    for model in models:
        _pred = model.predict(test_features)
        pred += _pred

    pred /= len(models)
    prev = 0
    for i in range(len(pred[0])):
        if pred[0][i]<prev:
            pred[0][i-1]=pred[0][i]
            pred[0][i]=prev
        if pred[0][i] > 1.0:
            pred[0][i] = 1.0
        prev=pred[0][i]
    # Clip predictions past the yardline, as a player cannot go further than endzone
#     pred[0][0:99 - yardline] = 0.0
#     pred[0][100 + yardline:-1] = 1.0
    return pred

In [69]:
class App:
    def __init__(self, model_type='nn'):
        self.models = []
        self.model_type = model_type

    def run(self, make_submission=False):
        train_df = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv', low_memory=False)
        train_df = train_df.loc[train_df.Season != '2017']

        pf = PersistentFeatures(train_df)
        self._timer(pf.build_features, "Persistent features")
        
        fe = FeatureExtractor(train_df, pf, model_type=self.model_type)
        self._timer(fe.run, "Training features")

        pf.attach_columns(fe.features.columns)

        features = fe.get_final_features()
        m = Model(features.shape[1], epochs=10, type=self.model_type, feature_columns=fe.features.columns)
        self.models = train(m, fe)
        if self.model_type == 'lgbm':
            m.eval_feature_importance()
        if make_submission:
            self.predict_on_env(pf)
    
    def _timer(self, function, description):
        t0 = timeit.default_timer()
        function()
        t1 = timeit.default_timer()
        print('Finished on {}. It took {}s'.format(description, t1 - t0))

    def predict_on_env(self, pf):
        env = nflrush.make_env()
        i = 0
        preds = None
        t0 = timeit.default_timer()
        for (test_df, sample) in env.iter_test():
            fe = FeatureExtractor(test_df, pf, is_test=True)
            fe.run()
            pred = predict(fe.get_final_features(), self.models, fe.features['YardLine_std'].values[0])
            pred_df = pd.DataFrame(data=pred, columns=sample.columns)
            env.predict(pred_df)
#             if preds is not None:
#                 preds = pd.concat([preds, pred_df], sort=False)
#             else:
#                 preds = pred_df
                
            i += 1
            if i % 100 == 0:
                t1 = timeit.default_timer()
                print("Processed {} plays. {} to go. Current elapsed time is {} seconds".format(i, 3438-i, t1-t0))
        env.write_submission_file()
#         return preds


In [None]:
app = App(model_type='nn')
app.run()
app.run(make_submission=True)

  result = method(y)


Finished on Persistent features. It took 0.01178058200457599s
Finished on Training features. It took 1.96995665000577s
Training model for 5 folds
Train on 18368 samples, validate on 4803 samples
Train on 18407 samples, validate on 4764 samples
Train on 18602 samples, validate on 4569 samples
Train on 18694 samples, validate on 4477 samples
Train on 18613 samples, validate on 4558 samples


  result = method(y)


Finished on Persistent features. It took 0.015076288997079246s
Finished on Training features. It took 1.4535197960067308s
Training model for 5 folds
Train on 18368 samples, validate on 4803 samples
Train on 18407 samples, validate on 4764 samples
Train on 18602 samples, validate on 4569 samples
Train on 18694 samples, validate on 4477 samples
Train on 18613 samples, validate on 4558 samples
Processed 100 plays. 3338 to go. Current elapsed time is 42.56859615200665 seconds
Processed 200 plays. 3238 to go. Current elapsed time is 83.24097229300241 seconds
