# Big Data Bowl Data EDA

This notebook was created during a live coding session. [Check out the stream here.](https://www.twitch.tv/medallionstallion_)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

# Show up to 100 columns when displaying wide dataframes. The fully qualified
# option name is used because the bare 'max_columns' pattern is ambiguous in
# newer pandas (it also matches 'styler.render.max_columns') and raises an
# OptionError there.
pd.set_option('display.max_columns', 100)
plt.style.use('ggplot')
# Colors of the active style's property cycle, reused for individual plots below.
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']

## Data Overview
In this competition we are provided with player, game and play data for special teams plays in the 2018-2020 NFL seasons. We are also provided with tracking data giving each player's position during the plays.

In [None]:
# List the competition input files with human-readable sizes.
!ls -lh ../input/nfl-big-data-bowl-2022/

In [None]:
# Read the game, player and play tables from the competition input directory.
games = pd.read_csv('../input/nfl-big-data-bowl-2022/games.csv')
players = pd.read_csv('../input/nfl-big-data-bowl-2022/players.csv')
plays = pd.read_csv('../input/nfl-big-data-bowl-2022/plays.csv')

# Attach the game-level columns to every play. validate='m:1' makes pandas
# raise if gameId is not unique in `games`.
plays = pd.merge(plays, games, on='gameId', how='left', validate='m:1')

# Player Counts
- What are the most common positions found in the player file?

In [None]:
# Horizontal bar chart of the number of players per position, sorted ascending by count.
(
    players['Position']
    .value_counts()
    .sort_values(ascending=True)
    .plot(kind='barh', figsize=(10, 15), title='Count of Players by Position')
)
plt.show()

# Play Info
- Almost 20,000 plays.
- 7,800 kickoffs, 6,000 Punts, 3,400 Extra points, and 2,600 Field Goals.

In [None]:
# Number of plays for each special teams play type, shown as a one-column frame.
plays['specialTeamsPlayType'].value_counts().to_frame()

# What Do we know about kickoffs?
- 60% Touchback / 37% Returned

In [None]:
# Share of kickoff plays that ended in each special teams result.
# The denominator is the number of kickoff rows, so rows with a missing
# result still count towards the total.
plays.query('specialTeamsPlayType == "Kickoff"') \
    .pipe(lambda kickoffs: kickoffs['specialTeamsResult'].value_counts() / len(kickoffs)) \
    .to_frame()

# What do we know about Punts?
- 38% Returned, 27% Fair Catch, 13.8% Out of Bounds
- 3,926 Punts resulted in a fair catch or return

In [None]:
# Share of punt plays that ended in each special teams result.
# The denominator is the number of punt rows, so rows with a missing
# result still count towards the total.
plays.query('specialTeamsPlayType == "Punt"') \
    .pipe(lambda punts: punts['specialTeamsResult'].value_counts() / len(punts)) \
    .to_frame()

# Processing Play Data
To assist in reviewing the play data we will apply some processing to add features about each play.

In [None]:
def add_seconds_into_game(df):
    """
    Add the elapsed game time in seconds to a play-level dataframe.

    Reads the "gameClock" column (a colon-separated string whose first two
    fields are parsed as minutes and seconds) and the "quarter" column.
    Writes the "secondsOfGameTime" column onto ``df`` in place and returns
    the same dataframe.

    Every quarter is treated as 15 minutes long.
    NOTE(review): a period shorter than 15 minutes (e.g. overtime) would be
    offset by the difference -- confirm whether that matters for the analysis.
    """
    clock_parts = df['gameClock'].str.split(':', expand=True)
    seconds_left_in_quarter = (
        clock_parts[0].astype('int') * 60 + clock_parts[1].astype('int')
    )
    seconds_at_quarter_end = df['quarter'] * 15 * 60
    df['secondsOfGameTime'] = seconds_at_quarter_end - seconds_left_in_quarter
    return df

def process_play_data(plays, players):
    """
    Add derived per-play features to the merged plays/games dataframe.

    The new columns are written onto ``plays`` in place and the same
    dataframe is returned.

    Parameters
    ----------
    plays : pd.DataFrame
        Play rows already merged with the game table. Reads possessionTeam,
        homeTeamAbbr, visitorTeamAbbr, yardlineNumber, yardlineSide, kickerId,
        returnerId, gameClock, quarter, preSnapHomeScore and
        preSnapVisitorScore.
    players : pd.DataFrame
        Player table with nflId and Position columns.

    Returns
    -------
    pd.DataFrame
        ``plays`` with returnTeam, yardlineNumberAbs, kickerPos, returnerId1,
        returnerId2, secondsOfGameTime, preSnapPossessionTeamScore,
        preSnapReturnTeamScore and preSnapScoreDifferential added.
    """
    # True where the home team is the possession team. Rows with a missing
    # possessionTeam compare False, matching the row-wise fallback to the
    # "visitor has possession" branch.
    home_has_ball = plays['possessionTeam'] == plays['homeTeamAbbr']

    # The return team is whichever of home/visitor is not the possession team.
    plays['returnTeam'] = plays['visitorTeamAbbr'].where(home_has_ball, plays['homeTeamAbbr'])

    # Calculate the absolute yardline relative to the possession team:
    # a spot on the return team's side of the field is mirrored to 100 - yardline.
    on_return_side = plays['yardlineSide'] == plays['returnTeam']
    plays['yardlineNumberAbs'] = plays['yardlineNumber'].where(
        ~on_return_side, 100 - plays['yardlineNumber'])

    # Mapping Players to positions
    player_pos_map = players.set_index('nflId')['Position'].to_dict()
    plays['kickerPos'] = plays['kickerId'].map(player_pos_map)

    # Split the ';'-separated returnerId into its first two ids. Splitting once
    # and reindexing guarantees both columns exist even when no play in the
    # frame lists a second returner (indexing column 1 directly would raise).
    # Assumes returnerId holds strings -- TODO confirm for all inputs.
    returner_ids = plays['returnerId'].str.split(';', expand=True).reindex(columns=[0, 1])
    plays['returnerId1'] = returner_ids[0]
    plays['returnerId2'] = returner_ids[1]

    # Seconds within game
    plays = add_seconds_into_game(plays)

    # Team scores and score differential from the possession team's point of view.
    plays['preSnapPossessionTeamScore'] = plays['preSnapHomeScore'].where(
        home_has_ball, plays['preSnapVisitorScore'])
    plays['preSnapReturnTeamScore'] = plays['preSnapVisitorScore'].where(
        home_has_ball, plays['preSnapHomeScore'])
    plays['preSnapScoreDifferential'] = \
        plays['preSnapPossessionTeamScore'] - plays['preSnapReturnTeamScore']

    return plays

# Add the derived play features (return team, absolute yardline, game time, scores) to `plays`.
plays = process_play_data(plays, players)

In [None]:
# Keep only punts that ended in a return or a fair catch.
plays_punt_fc_r = (
    plays
    .query('specialTeamsPlayType == "Punt" and specialTeamsResult in ("Return", "Fair Catch")')
    .copy()
    .reset_index(drop=True)
)

# Overlay one yardline histogram per result on a shared axis.
fig, ax = plt.subplots(figsize=(15, 8))
plays_punt_fc_r.groupby(['specialTeamsResult'])['yardlineNumberAbs'].plot(
    kind='hist', bins=30, alpha=0.5, ax=ax)
ax.set_title('Punt Yardline by Result', fontsize=20)
plt.legend()
plt.show()

In [None]:
# Stacked bar chart of fair catches vs. returns for quarters 1-4
# (rows with quarter > 4 are excluded by the query).
(
    plays_punt_fc_r
    .query('quarter <= 4')
    .groupby(['quarter', 'specialTeamsResult'])
    .size()
    .unstack()
    .plot(kind='bar',
          stacked=True,
          figsize=(15, 6),
          title='Fair Catch vs. Returned by Quarter')
)
plt.show()

# Return Yardage on Returned Punts
- 2286 Plays

In [None]:
# Returned punts that have a recorded return yardage.
returned_punts = (
    plays
    .query('specialTeamsPlayType == "Punt" and specialTeamsResult == "Return"')
    .dropna(subset=['kickReturnYardage'])
    .copy()
    .reset_index(drop=True)
)

# Histogram of return yards with the median marked by a dashed black line.
ax = returned_punts['kickReturnYardage'].plot(
    kind='hist',
    bins=60,
    figsize=(15, 5),
    title='Distribution of Punt Return Yards',
    color=color_pal[2],
)
ax.axvline(returned_punts['kickReturnYardage'].median(), color='black', ls='--')
plt.show()

# Let's model to predict return yards
We only want to model punt plays that do not involve a penalty (1881 plays)

Potential Features:
- kickerId
- possessionTeam
- returnTeam
- yardsToGo
- yardlineNumberAbs
- returnerId
- returner1Pos
- secondsOfGameTime
- preSnapReturnTeamScore
- preSnapPossessionTeamScore
- preSnapScoreDifferential

In [None]:
# nflId -> Position lookup built from the player table.
player_pos_map = players.set_index('nflId')['Position'].to_dict()

# Keep only returned punts with no penalty codes recorded.
returned_punt_nopenalty = (
    returned_punts[returned_punts['penaltyCodes'].isna()]
    .reset_index(drop=True)
)

# Position of the first listed returner; returnerId1 is cast to int so it
# can be looked up in the nflId-keyed map.
returned_punt_nopenalty['returner1Pos'] = (
    returned_punt_nopenalty['returnerId1'].astype('int').map(player_pos_map)
)
returned_punt_nopenalty['returner1Pos'].value_counts()

# Simple Linear Model

In this section we use a simple regression model (ElasticNet) to predict the outcome of punt plays based on the features we've created above.

In [None]:
# All candidate model inputs, in the column order used to build X.
FEATURES = [
    'kickerId',
    'possessionTeam',
    'returnTeam',
    'yardsToGo',
    'yardlineNumberAbs',
    'returnerId',
    'returner1Pos',
    'secondsOfGameTime',
    'preSnapPossessionTeamScore',
    'preSnapReturnTeamScore',
    'preSnapScoreDifferential',
]

# Kept as a one-element list so it can be concatenated with other column lists.
TARGET = ['kickReturnYardage']

# Subset of FEATURES fed to the linear model as a plain numeric matrix.
numeric_features = [
    'yardsToGo',
    'yardlineNumberAbs',
    'secondsOfGameTime',
    'preSnapPossessionTeamScore',
    'preSnapReturnTeamScore',
    'preSnapScoreDifferential',
]

df = returned_punt_nopenalty.copy()
X = df[FEATURES].copy()
# Frame that collects the per-play predictions alongside the play keys and target.
oof = df[['gameId', 'playId', *TARGET]].copy()
X_num = X[numeric_features].to_numpy()
# Selecting with the TARGET list yields a 2-D array of shape (n_rows, 1).
y = df[TARGET].to_numpy()

In [None]:
from sklearn.linear_model import ElasticNetCV

# Fit ElasticNetCV with its default settings on the raw numeric features;
# the target is flattened to 1-D for the fit.
reg = ElasticNetCV().fit(X_num, y.reshape(-1))

# One fitted coefficient per numeric feature, indexed by feature name.
en_coef = pd.DataFrame({'coef': reg.coef_}, index=numeric_features)

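Reviewing our model coefficients, we can see that `yardlineNumberAbs` and `preSnapScoreDifferential` are the most strongly associated with the play's outcome. (The features are not standardized before fitting, so coefficient magnitudes also reflect each feature's scale.)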

In [None]:
# Coefficients ordered from the most negative to the most positive.
en_coef.sort_values('coef')

## Plot the predictions vs. Actual of the linear model

In [None]:
# Store the linear model's predictions on the rows it was fitted on, then
# plot them against the actual return yardage as a dot plot.
returned_punt_nopenalty['en_pred'] = reg.predict(X_num)
returned_punt_nopenalty.plot(
    x=TARGET[0],
    y='en_pred',
    style='.',
    color=color_pal[5],
    figsize=(10, 10),
)

## Machine Learning Model
Next we will use a machine learning model.
- 5-fold cross validation.
- Fit a final model on the average best iteration across folds.

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error


LEARNING_RATE = 0.001
RANDOM_STATE = 529
CAT_COLS = ['kickerId','possessionTeam','returnTeam','returnerId','returner1Pos']

# Cast the non-numeric features to pandas 'category' dtype before handing X to LightGBM.
for c in CAT_COLS:
    X[c] = X[c].astype('category')

# Seed the shuffle so fold membership, and therefore the reported scores and
# best iterations, are reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

best_iters = []
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    # KFold yields positional indices. df and oof are indexed with .loc, which
    # relies on their default 0..n-1 index; X is sliced positionally.
    df.loc[val_idx, 'fold'] = fold
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    # Flatten to 1-D so the estimator and metrics receive a plain target vector.
    y_tr, y_val = y[tr_idx].ravel(), y[val_idx].ravel()

    reg = lgb.LGBMRegressor(n_estimators=1000,
                            learning_rate=LEARNING_RATE,
                            random_state=RANDOM_STATE)
    # Early stopping is configured through a callback: the `verbose` and
    # `early_stopping_rounds` keyword arguments of fit() were removed in
    # LightGBM 4.
    reg.fit(X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
    preds = reg.predict(X_val)
    oof.loc[val_idx, 'pred'] = preds

    # Scoring
    mae_score = mean_absolute_error(y_val, preds)
    mse_score = mean_squared_error(y_val, preds)
    best_iter = reg.best_iteration_
    best_iters.append(best_iter)
    print(f'Fold {fold}: MAE {mae_score:0.4f} - MSE {mse_score:0.4f} - Best Iteration {best_iter}')

# Average the per-fold best iteration; the out-of-fold scores use each play's
# prediction from the fold in which it was held out.
best_avg_iteration = np.mean(best_iters)
mae_oof = mean_absolute_error(oof['kickReturnYardage'], oof['pred'])
mse_oof = mean_squared_error(oof['kickReturnYardage'], oof['pred'])

print(f'The average best iteraction across folds is {best_avg_iteration}')
print(f'OOF Score MAE {mae_oof:0.2f} - MSE {mse_oof:0.2f}')

Our model's predictions are off from the actual return yardage by 6.77 yards on average. Finally, we fit a single model on all of the training data using the average best iteration across folds.

In [None]:
# Fit one model on every row, using the rounded average best iteration from
# cross validation as the tree count. int() guarantees a plain Python integer
# for n_estimators, and random_state matches the value used for the CV models.
reg = lgb.LGBMRegressor(n_estimators=int(round(best_avg_iteration)),
                        learning_rate=LEARNING_RATE,
                        random_state=529)
# Flatten the target so the estimator receives a 1-D vector.
reg.fit(X, y.ravel())

# These are in-sample predictions: the model has seen every row it predicts here.
preds = reg.predict(X)
oof['fullfit_pred'] = preds
oof.plot(x='fullfit_pred', y=TARGET[0], kind='scatter',
         figsize=(10, 10), title='Predictions vs Target for LGBMRegressor Model')


## Model Feature Importances

In [None]:
# Horizontal bar chart of the fitted model's feature importances, sorted
# ascending; the auto-generated legend is removed afterwards.
(
    pd.DataFrame({'importance': reg.feature_importances_}, index=reg.feature_name_)
    .sort_values('importance')
    .plot(kind='barh', title='LGBM Feature Importance', figsize=(12, 8))
)
plt.legend().remove()

# PFF Data Exploration

In [None]:
pff = pd.read_csv('../input/nfl-big-data-bowl-2022/PFFScoutingData.csv')

# Attach the PFF scouting columns to the modeled punts. The merge uses the
# columns shared by both frames, and validate='1:1' raises if the keys repeat
# on either side.
punt_pff = oof[['gameId', 'playId']].merge(pff, validate='1:1').copy()

# Flag whether the punt went in the direction the punter intended.
# Rows where either direction is missing compare as False.
punt_pff['puntedWhereIntended'] = \
    punt_pff['kickDirectionIntended'].eq(punt_pff['kickDirectionActual'])
punt_pff['puntedWhereIntended'].value_counts()

In [None]:
# Share of the modeled punts that did / did not go where the punter intended.
punt_pff['puntedWhereIntended'].value_counts().div(len(punt_pff))

## Snap time vs. Operation Time

In [None]:
# Joint distribution of snap time and operation time for the modeled punts,
# colored by kick type.
sns.jointplot(
    data=punt_pff,
    x='snapTime',
    y='operationTime',
    hue='kickType',
    height=10,
    alpha=0.5,
)
plt.show()

# How to improve this model?
In the next section we will explore the tracking data, and features that may help improve the model score.
- Up next I'll explore the tracking data.
- More to come.

In [None]:
# Load the tracking2018 file and pull out every tracking row for one play.
tracking2018 = pd.read_csv('../input/nfl-big-data-bowl-2022/tracking2018.csv')
my_play = (
    tracking2018
    .query('gameId == 2018090600 and playId == 2599')
    .reset_index(drop=True)
    .copy()
)