In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import time 
import pandas as pd
import numpy as np
import os
import math
import datetime

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import Counter
from sklearn.model_selection import train_test_split 
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for found_path in (os.path.join(folder, file_name)
                   for folder, _subdirs, file_names in os.walk('/kaggle/input')
                   for file_name in file_names):
    print(found_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Position codes this notebook classifies as offense.
offense_pos = ['QB','WR','RB','TE','FB','HB','P','LS','K']

# Week-1 coverage labels from the bonus dataset (first CSV row is the header).
coverage_pd = pd.read_csv(
    "/kaggle/input/nfl-big-data-bowl-2021-bonus/coverages_week1.csv",
    header=0,
)
display(coverage_pd)

In [None]:
# Load week-1 tracking data plus the play- and game-level tables.
wk1_pd = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/week1.csv', header=0)
play_data = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/plays.csv', header=0)
game_data = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/games.csv', header=0)

# Encode every position value seen in week 1 by its order of first appearance.
positions = wk1_pd['position'].unique()
label_encode_dict = {pos: code for code, pos in enumerate(positions)}

# Hand-assigned codes; these overwrite any code already given to the same key.
# NOTE(review): presumably these positions are absent from week 1 and 19-22 do
# not collide with the enumerated codes - confirm against len(positions).
label_encode_dict.update({'P': 19, 'LS': 20, 'K': 21, 'DT': 22})

In [None]:
# Parse the timestamp strings into datetime64 values, in place on the week-1 frame.
wk1_pd['time'] = wk1_pd['time'].astype('datetime64[ns]')

# Play length distribution, measured as the highest frameId recorded for each play.
x = wk1_pd.groupby(['gameId','playId'])['frameId'].max().values
shortest_play = min(x)
average_play = np.mean(x)

plt.hist(x)
ax = plt.gca()
ax.axvline(shortest_play, linestyle='--', color='gray',
           label='Min Frame ({})'.format(shortest_play))
ax.axvline(average_play, linestyle='--', color='green',
           label='Avg Frame ({})'.format(int(average_play)))
plt.legend(frameon=False)
plt.show()

In [None]:
# for i in range(1,7):
#     cntr = 0
#     for j in dif_secs:
#         if j>=i:
#             cntr += 1
#     print('At least',i,'seconds: {:.2f}% of plays'.format(100*cntr/len(dif_secs)))

# Play-Level Man/Zone Predictions

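Using a tree-based classifier (a decision tree in the code below; the GBM alternative is left commented out) trained on defender positions at the snap and at 1, 2 and 3 seconds after it, together with the provided coverage labels, to determine whether a play is a Man or Zone play.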

### Step 1: Preprocessing 1 Week

In [None]:
# NEW PLAYER POSITION DATASET
# Builds, for week 1, one row per play holding list-valued defender features
# (position code and x/y at the snap and 1-3 seconds later), joined with
# play-level context and the coverage label.
plyr_pd = wk1_pd.copy()

# Convert times from strings to datetimes (harmless if the column is already datetime64)
plyr_pd['time'] = plyr_pd.time.astype('datetime64[ns]')

# Rows stamped with the snap event
snap_rows = plyr_pd[plyr_pd.event == 'ball_snap']

# Pos at Ball Snap
d_schema_pd = snap_rows[['gameId','playId','x','y','displayName','position']].copy()
d_schema_pd.columns = ['gameId','playId','x0','y0','displayName','position']

# Time at Ball Snap
time_data = snap_rows[['gameId','playId','time']].drop_duplicates()
time_data.columns = ['gameId','playId','snap_time']

# time file: every tracking row annotated with its play's snap time
plyr_time = plyr_pd.merge(time_data, on=['gameId','playId'])


## Remove plays that run longer than 3 seconds
# plyr_time = plyr_time.merge(plyr_time.groupby(['gameId','playId'])['time'].max(), on=['gameId','playId']) \
#                      .merge(plyr_time.groupby(['gameId','playId'])['time'].min(), on=['gameId','playId'])
# plyr_time['diff'] = (plyr_time.time_y - plyr_time.time)/ np.timedelta64(1, 's')
# plyr_time = plyr_time[plyr_time['diff'] > 3]
# plyr_time = plyr_pd.merge(time_data, on=['gameId','playId'])

# Pos at 1-3 Seconds: for each play take the last frame recorded no later than
# snap + i seconds and attach every player's x/y in that frame as x{i}/y{i}.
for i in range(1,4):
    cutoff = plyr_time.snap_time + datetime.timedelta(seconds=i)
    sec_loc = plyr_time[plyr_time.time <= cutoff]
    sec_loc = sec_loc.groupby(['gameId','playId'])['frameId'].max()

    # The merge yields frameId_x (row's own frame) and frameId_y (target frame)
    sec_loc = plyr_time.merge(sec_loc, on=['gameId','playId'])
    sec_loc = sec_loc[sec_loc.frameId_x == sec_loc.frameId_y][['gameId','playId','x','y','displayName']]
    sec_loc.columns = ['gameId','playId','x{}'.format(i),'y{}'.format(i),'displayName']

    # Inner join: players without a row in the target frame are dropped
    d_schema_pd = d_schema_pd.merge(sec_loc, on=['gameId','playId','displayName'], how='inner')


# Add Offense Column
d_schema_pd['offense'] = d_schema_pd.position.isin(offense_pos)

# Get Football Position (x of the football in frame 1 of each play).
# A single combined mask replaces df[mask1][mask2], which makes pandas reindex
# the second mask and emit a UserWarning.
is_first_ball_frame = (wk1_pd.team == 'football') & (wk1_pd.frameId == 1)
football_pos = wk1_pd.loc[is_first_ball_frame, ['gameId','playId','x']].copy()
football_pos.columns = ['gameId','playId','fball_x']

d_schema_pd = d_schema_pd.merge(football_pos, on=['gameId','playId'])

# If I just want to use Defense (.copy() so later column writes hit a real frame, not a slice)
d_schema_pd = d_schema_pd[~d_schema_pd.offense].copy()

# Found out max players on a play
x = d_schema_pd.copy()
x['cntr'] = 1

play_group = x.groupby(['gameId','playId'])['cntr'].sum().values
print(max(play_group))

# Make all coordinates relative to Football axis
for i in range(4):
    col = 'x{}'.format(i)
    d_schema_pd[col] = d_schema_pd[col] - d_schema_pd['fball_x']
    # Offense is forced to the non-positive side, everyone else to the non-negative side
    d_schema_pd.loc[(d_schema_pd.offense) & (d_schema_pd[col] > 0), col] *= -1
    d_schema_pd.loc[(~d_schema_pd.offense) & (d_schema_pd[col] < 0), col] *= -1


d_schema_pd = d_schema_pd[d_schema_pd.displayName != 'Football'].copy()
# Raises KeyError for a position missing from label_encode_dict
d_schema_pd['position'] = [label_encode_dict[pos] for pos in d_schema_pd.position]

# One row per play; every remaining column becomes a list with one entry per player
d_schema_pd = d_schema_pd.drop(['fball_x','offense','displayName'],axis=1) \
                         .groupby(['gameId','playId']).agg(lambda x: list(x))

p = play_data[['gameId','playId','quarter','down','yardsToGo','yardlineNumber','defendersInTheBox','numberOfPassRushers']] \
               .set_index(['gameId','playId'])

# Missing play context becomes -1; plays without a coverage label are dropped
d_schema_pd2 = d_schema_pd.join(p).fillna(-1) \
                         .join(coverage_pd.set_index(['gameId','playId'])).dropna()

# d_schema_pd3 = d_schema_pd.join(p).dropna()

In [None]:
def row_maker3(row, n_slots=18, pad_value=99):
    """Flatten one play's per-player lists into fixed-width feature columns.

    Parameters
    ----------
    row : pandas.Series
        One play. The fields 'position', 'x0'..'x3' and 'y0'..'y3' each hold a
        sequence with one entry per player; 'coverage' holds the coverage name.
        Any other fields are passed through untouched.
    n_slots : int, default 18
        Number of player slots to emit. Players beyond this count are dropped.
    pad_value : default 99
        Value written to every column of a slot that has no player.

    Returns
    -------
    pandas.Series
        The pass-through fields, followed for each slot k by
        'x0_k', 'y0_k', 'x1_k', 'y1_k', 'x2_k', 'y2_k', 'x3_k', 'y3_k', 'pos_k',
        followed by the boolean 'coverage_label' (True when the substring 'Man'
        occurs in row['coverage']). The list-valued fields are removed.

    The input row is not modified. A play with empty lists yields all-padded slots.
    """
    list_fields = ['position', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3']
    # Players are ordered by this tuple, i.e. primarily by x0, ties broken by
    # position code, then y0, and so on.
    sort_order = ['x0', 'position', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3']
    field_at = {field: idx for idx, field in enumerate(sort_order)}
    players = sorted(zip(*(row[field] for field in sort_order)))

    # Collect everything in a plain dict and build the Series once, instead of
    # enlarging the Series one label at a time.
    flat = {}
    for slot in range(n_slots):
        player = players[slot] if slot < len(players) else None
        for sec in range(4):
            for axis in ('x', 'y'):
                field = '{}{}'.format(axis, sec)
                value = pad_value if player is None else player[field_at[field]]
                flat['{}_{}'.format(field, slot)] = value
        flat['pos_{}'.format(slot)] = (
            pad_value if player is None else player[field_at['position']]
        )

    flat['coverage_label'] = 'Man' in row['coverage']

    passthrough = row.drop(list_fields)
    # dtype=object keeps the Python ints/floats/bools exactly as stored
    return pd.concat([passthrough, pd.Series(flat, dtype=object)])
 
# Flatten every labelled play into one fixed-width feature row and report the wall-clock cost.
t = time.time()
bs_list_bin = d_schema_pd2.apply(row_maker3, axis=1)
elapsed_seconds = int(time.time() - t)
print(elapsed_seconds, 'seconds to run')

In [None]:
# Features are every flattened column except the raw coverage name and its boolean label.
feature_frame = bs_list_bin.drop(columns=['coverage', 'coverage_label'])
X = feature_frame.values
y = bs_list_bin['coverage_label'].values

# Class balance of the label
Counter(y)

### Step 2: Building Model & Producing Accuracy Metrics

In [None]:
# Feature matrix: every flattened column except the raw coverage name and its label
X = bs_list_bin.drop(['coverage','coverage_label'],axis=1).values
y = bs_list_bin.coverage_label.values

# Fixed seed so the 90/10 split, the fitted tree and every metric derived from
# them are identical on each Restart & Run All.
SEED = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

dtree_model = DecisionTreeClassifier(max_depth = 150, random_state=SEED).fit(X_train, y_train)
# dtree_model = GradientBoostingClassifier(max_depth = 150).fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_test)

# Row-normalised confusion matrix on the held-out plays.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# images use ConfusionMatrixDisplay.from_estimator (and update the import cell).
disp = plot_confusion_matrix(dtree_model, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize='true')

In [None]:
# Hold-out accuracy (share of matching predictions) and F1 of the fitted model.
acc = (dtree_predictions == y_test).mean()
f1 = f1_score(y_test, dtree_predictions)

# ROC curve with the chance diagonal for reference; both axes clipped to [0, 1].
plot_roc_curve(dtree_model, X_test, y_test)
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title('ACC: {}% || F1: {:.2f}'.format(int(acc * 100), f1))

### Step 3: Collecting Predictions for all 17 Weeks

In [None]:
# # Collect all results for all weeks
# week1 = pd.read_csv('../input/nfl-big-data-bowl-2021/week1.csv')
# week2 = pd.read_csv('../input/nfl-big-data-bowl-2021/week2.csv')
# week3 = pd.read_csv('../input/nfl-big-data-bowl-2021/week3.csv')
# week4 = pd.read_csv('../input/nfl-big-data-bowl-2021/week4.csv')
# week5 = pd.read_csv('../input/nfl-big-data-bowl-2021/week5.csv')
# week6 = pd.read_csv('../input/nfl-big-data-bowl-2021/week6.csv')
# week7 = pd.read_csv('../input/nfl-big-data-bowl-2021/week7.csv')
# week8 = pd.read_csv('../input/nfl-big-data-bowl-2021/week8.csv')
# week9 = pd.read_csv('../input/nfl-big-data-bowl-2021/week9.csv')
# week10 = pd.read_csv('../input/nfl-big-data-bowl-2021/week10.csv')
# week11 = pd.read_csv('../input/nfl-big-data-bowl-2021/week11.csv')
# week12 = pd.read_csv('../input/nfl-big-data-bowl-2021/week12.csv')
# week13 = pd.read_csv('../input/nfl-big-data-bowl-2021/week13.csv')
# week14 = pd.read_csv('../input/nfl-big-data-bowl-2021/week14.csv')
# week15 = pd.read_csv('../input/nfl-big-data-bowl-2021/week15.csv')
# week16 = pd.read_csv('../input/nfl-big-data-bowl-2021/week16.csv')
# week17 = pd.read_csv('../input/nfl-big-data-bowl-2021/week17.csv')

# week = pd.concat([week1,week2], ignore_index=True)

In [None]:
def row_maker4(row, n_slots=18, pad_value=99):
    """Flatten one unlabelled play's per-player lists into fixed-width feature columns.

    Parameters
    ----------
    row : pandas.Series
        One play. The fields 'position', 'x0'..'x3' and 'y0'..'y3' each hold a
        sequence with one entry per player. Any other fields are passed through.
    n_slots : int, default 18
        Number of player slots to emit. Players beyond this count are dropped.
    pad_value : default 99
        Value written to every column of a slot that has no player.

    Returns
    -------
    pandas.Series
        The pass-through fields, followed for each slot k by
        'x0_k', 'y0_k', 'x1_k', 'y1_k', 'x2_k', 'y2_k', 'x3_k', 'y3_k', 'pos_k'.
        The list-valued fields are removed. No coverage label is produced.

    The input row is not modified. A play with empty lists yields all-padded slots.
    """
    list_fields = ['position', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3']
    # Players are ordered by this tuple: x0 first, then position code, then y0, ...
    sort_order = ['x0', 'position', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3']
    column_of = dict(zip(sort_order, range(len(sort_order))))
    ranked = sorted(zip(*[row[name] for name in sort_order]))

    # Build all output cells first, then assemble the Series in one step rather
    # than growing it label by label.
    cells = {}
    for slot in range(n_slots):
        has_player = slot < len(ranked)
        for sec in range(4):
            for axis in ('x', 'y'):
                source = axis + str(sec)
                cells[source + '_' + str(slot)] = (
                    ranked[slot][column_of[source]] if has_player else pad_value
                )
        cells['pos_' + str(slot)] = (
            ranked[slot][column_of['position']] if has_player else pad_value
        )

    # dtype=object keeps the stored Python values unchanged
    return pd.concat([row.drop(list_fields), pd.Series(cells, dtype=object)])

In [None]:
# Score every play of all 17 weeks with the play-level model.
weekly_preds = []     # one (gameId, playId, isManPlay) frame per week
plays_scored = 0

# predict_proba orders its columns like dtree_model.classes_. The training label
# is True for Man, so look that column up instead of assuming a position
# (column 0 is the probability of the *False* class).
man_col = list(dtree_model.classes_).index(True)

# Play-level context, identical for every week, so built once outside the loop
p = play_data[['gameId','playId','quarter','down','yardsToGo','yardlineNumber','defendersInTheBox','numberOfPassRushers']] \
               .set_index(['gameId','playId']).fillna(-1)

for week_num in range(1,18):
    print('Starting wk', week_num)
    t = time.time()
    week = pd.read_csv('../input/nfl-big-data-bowl-2021/week{}.csv'.format(week_num))

    # Same preprocessing as the week-1 training cell
    # NEW PLAYER POSITION DATASET
    plyr_pd = week.copy()

    # Convert times from strings to datetimes
    plyr_pd['time'] = plyr_pd.time.astype('datetime64[ns]')

    snap_rows = plyr_pd[plyr_pd.event == 'ball_snap']

    # Pos at Ball Snap
    d_schema_pd = snap_rows[['gameId','playId','x','y','displayName','position']].copy()
    d_schema_pd.columns = ['gameId','playId','x0','y0','displayName','position']

    # Time at Ball Snap
    time_data = snap_rows[['gameId','playId','time']].drop_duplicates()
    time_data.columns = ['gameId','playId','snap_time']

    # time file
    plyr_time = plyr_pd.merge(time_data, on=['gameId','playId'])

    # Pos at 1-3 Seconds: last frame recorded no later than snap + i seconds
    for i in range(1,4):
        cutoff = plyr_time.snap_time + datetime.timedelta(seconds=i)
        sec_loc = plyr_time[plyr_time.time <= cutoff]
        sec_loc = sec_loc.groupby(['gameId','playId'])['frameId'].max()

        sec_loc = plyr_time.merge(sec_loc, on=['gameId','playId'])
        sec_loc = sec_loc[sec_loc.frameId_x == sec_loc.frameId_y][['gameId','playId','x','y','displayName']]
        sec_loc.columns = ['gameId','playId','x{}'.format(i),'y{}'.format(i),'displayName']

        d_schema_pd = d_schema_pd.merge(sec_loc, on=['gameId','playId','displayName'], how='inner')

    # Add Offense Column
    d_schema_pd['offense'] = d_schema_pd.position.isin(offense_pos)

    # Get Football Position (one combined mask; chaining two boolean filters
    # makes pandas reindex the second mask and warn)
    is_first_ball_frame = (week.team == 'football') & (week.frameId == 1)
    football_pos = week.loc[is_first_ball_frame, ['gameId','playId','x']].copy()
    football_pos.columns = ['gameId','playId','fball_x']

    d_schema_pd = d_schema_pd.merge(football_pos, on=['gameId','playId'])

    # If I just want to use Defense
    d_schema_pd = d_schema_pd[~d_schema_pd.offense].copy()

    # Make all coordinates relative to Football axis
    for i in range(4):
        col = 'x{}'.format(i)
        d_schema_pd[col] = d_schema_pd[col] - d_schema_pd['fball_x']
        d_schema_pd.loc[(d_schema_pd.offense) & (d_schema_pd[col] > 0), col] *= -1
        d_schema_pd.loc[(~d_schema_pd.offense) & (d_schema_pd[col] < 0), col] *= -1

    d_schema_pd = d_schema_pd[d_schema_pd.displayName != 'Football'].copy()
    # Raises KeyError for a position missing from label_encode_dict
    d_schema_pd['position'] = [label_encode_dict[pos] for pos in d_schema_pd.position]

    d_schema_pd = d_schema_pd.drop(['fball_x','offense','displayName'],axis=1) \
                             .groupby(['gameId','playId']).agg(lambda x: list(x))

    # fillna after the join (as in training) so a play that is missing from
    # plays.csv gets -1 context instead of NaN features
    d_schema_pd2 = d_schema_pd.join(p).fillna(-1)

    # Flatten to the fixed-width layout the model was trained on
    bs_list_bin = d_schema_pd2.apply(row_maker4, axis=1)
    print(int(time.time()-t),'seconds to process week',week_num,'...')

    # Predict: probability of the Man class for every play
    bs_list_bin['isManPlay'] = dtree_model.predict_proba(bs_list_bin.values)[:, man_col]
    week_preds = bs_list_bin[['isManPlay']].reset_index()

    weekly_preds.append(week_preds)
    plays_scored += len(week_preds)

    print('Number of Total Plays Score: {:,}'.format(plays_scored))

# DataFrame.append was removed in pandas 2.0; concatenate once at the end
play_pred_pd = pd.concat(weekly_preds, ignore_index=True)


In [None]:
# Display the accumulated play-level prediction frame as this cell's output.
play_pred_pd

In [None]:
# Plays whose isManPlay score is exactly 1.0, expressed as a percentage of 19227.
# NOTE(review): 19227 is hard-coded - presumably the total number of plays;
# confirm, or divide by len(play_pred_pd) instead.
n_certain = int((play_pred_pd['isManPlay'] == 1.0).sum())
100 * n_certain / 19227

In [None]:
# play_pred_pd.to_csv('play_pred.csv')

# Player-level Man-Zone Predictions

Using an unsupervised method (described here and implemented here), determine the likelihood that a player is in man coverage.

In [None]:
# Build dataset
# Build player-level labels
    # Based on coverage type
# unsupervised, check groupings 

In [None]:
# One row per (game, play, position, player) for every week-1 row whose position is not an offense code.
is_not_offense = ~wk1_pd['position'].isin(offense_pos)
roster_cols = ['gameId', 'playId', 'position', 'displayName']
x_pd = wk1_pd.loc[is_not_offense, roster_cols].drop_duplicates().copy()
x_pd

In [None]:
# Derive per-player man/zone labels from the play's coverage name and the
# number of safeties on the field. Players that no rule covers stay unlabelled.
pos_num_pd = x_pd.copy()

safety_pos = ['SS','FS','S']
db_pos = ['CB','DB']

# 0/1 indicator columns ('isSS' is created but not read anywhere in this cell)
pos_num_pd['isSafety'] = pos_num_pd.position.isin(safety_pos).astype('int64')
pos_num_pd['isSS'] = 0
pos_num_pd['isDB'] = pos_num_pd.position.isin(db_pos).astype('int64')

# Safeties and DBs per play
pos_num_pd = pos_num_pd.groupby(['gameId','playId'])[['isSafety','isDB']].sum().reset_index()
pos_num_pd.columns = ['gameId','playId','numS','numDB']

x_pd2 = x_pd.merge(coverage_pd, on=['gameId','playId'])
x_pd2 = x_pd2.merge(pos_num_pd, on=['gameId','playId'])

# 'playerCoverage' is created here but the labels below are written to 'playingMan'
x_pd2['playerCoverage'] = np.nan

is_safety = x_pd2.position.isin(safety_pos)
is_db = x_pd2.position.isin(db_pos)

# Whole-play rules: everyone is in man on Cover 0 Man, nobody on Prevent Zone
x_pd2.loc[x_pd2.coverage == 'Cover 0 Man', 'playingMan'] = True
x_pd2.loc[x_pd2.coverage == 'Prevent Zone', 'playingMan'] = False

# Cover i Man, i = 1..5: with at most i safeties the safeties are labelled
# not-man; with at least i safeties the DBs are labelled man.
for i in range(1, 6):
    cover_i_man = x_pd2.coverage == 'Cover {} Man'.format(i)
    x_pd2.loc[cover_i_man & (x_pd2.numS <= i) & is_safety, 'playingMan'] = False
    x_pd2.loc[cover_i_man & (x_pd2.numS >= i) & is_db, 'playingMan'] = True

# Any coverage whose name contains 'Zone': safeties and DBs are not in man
in_zone_play = x_pd2.coverage.str.contains('Zone')
x_pd2.loc[in_zone_play & (is_safety | is_db), 'playingMan'] = False

# (number of labelled rows, percentage of all rows that are labelled)
n_labelled = len(x_pd2[~x_pd2.playingMan.isna()])
n_labelled, 100*n_labelled/len(x_pd2)
# x_pd2

In [None]:
# Non-null count of every column for each distinct playingMan value
# (groupby drops rows whose playingMan is missing by default).
x_pd2.groupby('playingMan').count()

In [None]:
# Generate Dataset

def _label_play_phases(events, frame_ids):
    """Relabel raw per-row events into play phases, walking the rows in order.

    Returns a new object array (the inputs are not modified) in which:
    rows with frameId 1, and rows that follow them until a 'ball_snap' event,
    are labelled 'ball_snap'; rows after the snap are labelled 'between_snap'
    until a 'pass_forward' event; that row and the rows after it are labelled
    'after_thrown' until the next row with frameId 1.
    """
    phases = np.array(events, dtype=object)
    previous = 'ball_snap'
    for idx in range(len(phases)):
        event = phases[idx]
        frame_id = frame_ids[idx]
        if (previous == 'ball_snap' and event != 'ball_snap') or frame_id == 1:
            phases[idx] = 'ball_snap'
            previous = 'ball_snap'
        elif event == 'ball_snap':
            # The snap rows keep their own label; only the state advances
            previous = 'between_snap'
        elif previous == 'between_snap' and event != 'pass_forward':
            phases[idx] = 'between_snap'
            previous = 'between_snap'
        elif event == 'pass_forward':
            phases[idx] = 'after_thrown'
            previous = 'after_thrown'
        elif previous == 'after_thrown' and frame_id != 1:
            phases[idx] = 'after_thrown'
            previous = 'after_thrown'
    return phases


def _closest_player(player_id, player_team, px, py, frame_players, teammates):
    """Find the nearest teammate or opponent of one player within one frame.

    frame_players is a list of (nflId, team, x, y, dir) tuples. The football is
    never a candidate; with teammates=True the player itself is excluded too.
    Returns (distance, nflId, dir, x, y) of the first nearest candidate, or
    None when there is no candidate.
    """
    best = None
    for other_id, other_team, ox, oy, other_dir in frame_players:
        if other_team == 'football':
            continue
        if teammates:
            if other_team != player_team or other_id == player_id:
                continue
        elif other_team == player_team:
            continue
        dist = np.sqrt((px - ox) ** 2 + (py - oy) ** 2)
        # Strict '<' keeps the first of several equally close candidates
        if best is None or dist < best[0]:
            best = (dist, other_id, other_dir, ox, oy)
    return best


def process_player_data(week):
    """Add nearest-opponent / nearest-teammate features to one week of tracking data.

    Parameters
    ----------
    week : pandas.DataFrame
        Tracking rows with at least the columns gameId, playId, frameId, nflId,
        team, event, x, y, s and dir. Columns are looked up by name. The frame
        is NOT modified; the relabelled events live only in the returned copy.

    Returns
    -------
    pandas.DataFrame
        The input rows that have an nflId, with 'event' replaced by a play
        phase ('ball_snap', 'between_snap' or 'after_thrown') and these added
        columns:
        per frame - oppMinDist, closestOpp(nflId), oppDir, oppX, oppY,
        mateMinDist, closestMate(nflId), mateX, mateY, diffDir, oppMateDistRatio;
        per (game, play, phase, player) - varX, varY, varS, oppVar, oppMean,
        mateVar, mateMean, oppDirVar, oppDirMean, meanOppMateDistRatio,
        varOppMateDistRatio.
        Every remaining missing value is filled with 0.
    """
    group_keys = ['gameId', 'playId', 'event', 'nflId']
    frame_keys = ['gameId', 'frameId', 'playId', 'nflId']

    def per_player_stat(frame, column, stat, out_name):
        # One value of `stat` over `column` per (game, play, phase, player)
        return (frame.groupby(group_keys)[column].agg([stat])
                     .reset_index().rename(columns={stat: out_name}))

    # Work on a copy so the caller's frame keeps its raw events
    weekMod = week.copy()
    weekMod['event'] = _label_play_phases(week['event'].values, week['frameId'].values)

    varX = per_player_stat(weekMod, 'x', 'var', 'varX')
    varY = per_player_stat(weekMod, 'y', 'var', 'varY')
    varS = per_player_stat(weekMod, 's', 'var', 'varS')

    game_ids = weekMod['gameId'].values
    play_ids = weekMod['playId'].values
    frame_ids = weekMod['frameId'].values
    nfl_ids = weekMod['nflId'].values
    teams = weekMod['team'].values
    xs = weekMod['x'].values
    ys = weekMod['y'].values
    dirs = weekMod['dir'].values

    # Everyone (and the football) present in each frame, in row order
    playerXY = {}
    for g, p, f, nid, team, x, y, d in zip(game_ids, play_ids, frame_ids,
                                           nfl_ids, teams, xs, ys, dirs):
        playerXY.setdefault((g, p, f), []).append((nid, team, x, y, d))

    opp_rows = []
    mate_rows = []
    for g, p, f, nid, team, x, y in zip(game_ids, play_ids, frame_ids,
                                        nfl_ids, teams, xs, ys):
        if team == 'football':
            continue
        in_frame = playerXY[(g, p, f)]

        # A player without any opponent / teammate in the frame gets no row
        # here and therefore NaN (later 0) in the merged columns.
        opp = _closest_player(nid, team, x, y, in_frame, teammates=False)
        if opp is not None:
            dist, opp_id, opp_dir, opp_x, opp_y = opp
            opp_rows.append([g, p, f, nid, dist, opp_id, opp_dir, opp_x, opp_y])

        mate = _closest_player(nid, team, x, y, in_frame, teammates=True)
        if mate is not None:
            dist, mate_id, _mate_dir, mate_x, mate_y = mate
            # The teammate's own coordinates (not the last opponent examined)
            mate_rows.append([g, p, f, nid, dist, mate_id, mate_x, mate_y])

    minOppDist = pd.DataFrame(opp_rows, columns=['gameId', 'playId', 'frameId', 'nflId', 'oppMinDist', 'closestOpp(nflId)', 'oppDir', 'oppX', 'oppY'])
    weekMod = pd.merge(weekMod, minOppDist, how='left', on=frame_keys)
    oppVar = per_player_stat(weekMod, 'oppMinDist', 'var', 'oppVar')
    oppMean = per_player_stat(weekMod, 'oppMinDist', 'mean', 'oppMean')

    minMateDist = pd.DataFrame(mate_rows, columns=['gameId', 'playId', 'frameId', 'nflId', 'mateMinDist', 'closestMate(nflId)', 'mateX', 'mateY'])
    weekMod = pd.merge(weekMod, minMateDist, how='left', on=frame_keys)
    mateVar = per_player_stat(weekMod, 'mateMinDist', 'var', 'mateVar')
    mateMean = per_player_stat(weekMod, 'mateMinDist', 'mean', 'mateMean')

    # Absolute difference between the player's and the nearest opponent's direction
    weekMod['diffDir'] = np.absolute(weekMod['dir'] - weekMod['oppDir'])
    oppDirVar = per_player_stat(weekMod, 'diffDir', 'var', 'oppDirVar')
    oppDirMean = per_player_stat(weekMod, 'diffDir', 'mean', 'oppDirMean')

    # Distance to the nearest opponent relative to the distance between that
    # opponent and the nearest teammate. A zero denominator would give +/-inf,
    # which scikit-learn rejects, so treat it as missing.
    ratio = weekMod['oppMinDist'] / np.sqrt((weekMod['oppX'] - weekMod['mateX'])**2 + (weekMod['oppY'] - weekMod['mateY'])**2)
    weekMod['oppMateDistRatio'] = ratio.replace([np.inf, -np.inf], np.nan)
    oppMateDistRatioMean = per_player_stat(weekMod, 'oppMateDistRatio', 'mean', 'meanOppMateDistRatio')
    oppMateDistRatioVar = per_player_stat(weekMod, 'oppMateDistRatio', 'var', 'varOppMateDistRatio')

    features = [varX, varY, varS, oppVar, oppMean, mateVar, mateMean, oppDirVar, oppDirMean, oppMateDistRatioMean, oppMateDistRatioVar]
    for feature in features:
        weekMod = pd.merge(weekMod, feature, how='left', on=['gameId', 'event', 'playId', 'nflId'])

    # Drop rows without an nflId, then zero-fill what is left
    weekMod = weekMod.dropna(subset=['nflId']).fillna(0)

    return(weekMod)

# Build the player-level feature frame for week 1 and report the elapsed minutes.
start = time.time()
weekMod = process_player_data(wk1_pd)
elapsed_minutes = (time.time() - start) / 60
print(' {:.2f} minutes to process'.format(elapsed_minutes))

In [None]:
# Training

# Fixed seed so the 80/20 split and the boosted model are reproducible on every run.
SEED = 42

# Players that received a heuristic man/zone label (.copy() so the cast below
# writes to a real frame rather than a slice)
samples = x_pd2[~x_pd2.playingMan.isna()][['gameId','playId','displayName','playingMan']].copy()

samples['playingMan'] = samples['playingMan'].astype(int)

# Attach the per-frame tracking features to every labelled player
WeekMod = samples \
                .merge(weekMod, on=['gameId','playId','displayName']) \
                .drop(['time','event','jerseyNumber','route','team','playDirection'],axis=1)

WeekMod = WeekMod[~WeekMod.position.isin(offense_pos)].copy()

# Raises KeyError for a position missing from label_encode_dict
WeekMod['position'] = [label_encode_dict[pos] for pos in WeekMod.position]

# Split by labelled player-play (not by frame), so all frames of one player on
# one play land on the same side of the split
msk = np.random.RandomState(SEED).rand(len(samples)) < 0.8

train_samples = samples[msk].drop(['playingMan'],axis=1)
test_samples = samples[~msk].drop(['playingMan'],axis=1)

X_test = WeekMod.merge(test_samples, on=['gameId','playId','displayName']).dropna(subset=['nflId'])
y_test = X_test.playingMan.values
# Kept for the validation cell; copied so adding columns to it is safe
y_labels = X_test[['gameId','playId','displayName','playingMan']].copy()
X_test = X_test.drop(['playingMan','gameId','playId','displayName','nflId'],axis=1)

X_train = WeekMod.merge(train_samples, on=['gameId','playId','displayName']).dropna(subset=['nflId'])
y_train =  X_train.playingMan.values
X_train = X_train.drop(['playingMan','gameId','playId','displayName','nflId'],axis=1)

# NOTE: this rebinds dtree_model, which until here held the play-level model
dtree_model = GradientBoostingClassifier(max_depth = 150, random_state=SEED).fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_test)

# `start` was set when the feature frame was built, so this covers both steps
print(' {:.2f} minutes to process & train'.format((time.time()-start)/60))

In [None]:
# plt.figure(figsize=[16,10])
# plt.plot(threshold,acc_list, linestyle='--', label='Acc')
# plt.plot(threshold,f1_list,  linestyle='--', label='F1')
# plt.legend()
# plt.title('Best: {:.2f} ({:.2f})'.format(best_idx/100,best_val))
# plt.ylim([0,1])

In [None]:
# Validate

# Average the per-frame predictions of each player on each play, then look for
# the threshold on that average that maximises F1 against the heuristic label.
# NOTE(review): the threshold is tuned on the same held-out rows it is reported on.
player_keys = ['gameId','playId','displayName']

y_labels = y_labels.copy()
y_labels['isManPred'] = dtree_predictions

y = y_labels.groupby(player_keys)[['isManPred']].mean() \
            .merge(
                    y_labels[player_keys + ['playingMan']].drop_duplicates(),
                    on=player_keys
                  )
acc_list = []
f1_list = []
threshold = [i for i in range(1,100)]
best_idx = 1
best_val = 0
for i in threshold:
    pred = (y.isManPred >= (i/100)).astype(int)

    acc = sum(pred == y.playingMan)/len(pred)
    # scikit-learn metrics take (y_true, y_pred)
    f1 = f1_score(y.playingMan, pred)

    acc_list.append(acc)
    f1_list.append(f1)
    if f1 > best_val:
        print(i,f1)
        best_val = f1
        best_idx = i

# Printed rather than set as a title: the heatmap below sets its own title on
# the same axes, which would overwrite it.
print('Best: {:.2f} ({:.2f})'.format(best_idx/100,best_val))

best_pred = (y.isManPred >= (best_idx/100)).astype(int)

# Rows are the actual label, columns the predicted label
cm = confusion_matrix(y.playingMan, best_pred)

df_cm = pd.DataFrame(cm)
# plt.figure(figsize=(10,7))
sn.set(font_scale=1.4) # for label size
ax = sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

# np.trace sums the diagonal for any matrix size, including a single-class 1x1
plt.title('{:.0f}% Acc || {:.2f} F1'.format(100*np.trace(cm)/len(y), f1_score(y.playingMan, best_pred)))
plt.show()

In [None]:
# Thresholding is super low so there are very few False Positives
# If the model predicts Zone, it absolutely is a Zone player
# If the model predicts Man, check if the play-level expects man as well

In [None]:
# Predict
# Score every defender on every play of all 17 weeks with the player-level model
# and binarise the per-frame average with the threshold chosen during validation.
weekly_players = []   # one frame per week, indexed by (gameId, playId, displayName)
players_scored = 0
start = time.time()
for week_num in range(1,18):
    print('Starting wk', week_num)
    t = time.time()
    week = pd.read_csv('../input/nfl-big-data-bowl-2021/week{}.csv'.format(week_num))

    week = process_player_data(week).dropna(subset=['nflId'])

    week = week[~week.position.isin(offense_pos)]

    # Identifiers of every scored row (.copy() so the prediction column is
    # written to a real frame rather than a slice)
    context = week[['playId','gameId','displayName']].copy()

    # Leaves the same feature columns, in the same order, as the training matrix
    week = week.drop(['time','gameId','playId','event','jerseyNumber','route',
                      'nflId','displayName','team','playDirection'],axis=1)

    # Raises KeyError for a position missing from label_encode_dict
    week['position'] = [label_encode_dict[pos] for pos in week.position]

    dtree_predictions = dtree_model.predict(week)

    context['isManCoverage'] = dtree_predictions

    # Share of frames predicted as man, per player per play ...
    context = context.groupby(['gameId','playId','displayName'])[['isManCoverage']].mean()

    # ... turned into 0/1 with the validated threshold
    context['isManCoverage'] = (context.isManCoverage >= (best_idx/100)).astype(int)

    weekly_players.append(context)
    players_scored += len(context)

    print('Processing & Scoring took {:.2f} minutes ({:.2f}min, {:,} players total)'.format((time.time()-t)/60,
                                                                                            (time.time()-start)/60,
                                                                                             players_scored))

# DataFrame.append was removed in pandas 2.0; concatenate once at the end
player_pd = pd.concat(weekly_players)
# player_pd

In [None]:
# Write the player-level predictions to a CSV file in the current working directory.
player_pd.to_csv('player_level_man_pred.csv')

<a href="./player_level_man_pred.csv"> Download File </a>

# Find Targeted Receivers

In [None]:
# Map the "F.Lastname" shorthand used in play descriptions to full display
# names. A shorthand shared by two different players is stored as shorthand+'*'
# so it can be resolved per play later.
shortHandDict = dict()
# NOTE: rebinds offense_pos without the P / LS / K codes it held before
offense_pos = ['QB','WR','RB','TE','FB','HB']

for week_num in range(1,18):
    print('Starting wk', week_num)
    week = pd.read_csv('../input/nfl-big-data-bowl-2021/week{}.csv'.format(week_num))

    off_data = week[week.position.isin(offense_pos)]
    for name in off_data.displayName.unique():
        # First letter of the display name + '.' + its second whitespace-separated token
        shorthand = '{}.{}'.format(name[0], name.split()[1])

        known_name = shortHandDict.get(shorthand)
        if known_name is None:
            # First time this shorthand is seen
            shortHandDict[shorthand] = name
        elif known_name != name:
            # Name duplication: mark it for later cleaning
            shortHandDict[shorthand] = shorthand+'*'

# Shorthands flagged by hand as ambiguous
trouble_makers = ['Ty.Williams','Dam. Williams','D.Cruikshank','De.Thomas','M.Crosby',
                  'R.Kelly','J.Hardee','B.Sowell','S.Shields','D.Dawkins','J.Lewis',
                  'S.Pulley','A.Villanueva','T.Decker','Dar.Williams','J.Staley','G.Fant',
                  'S.Hubbard','K.Williams']

for name in trouble_makers:
    shortHandDict[name] = name+'*'

# Manual override for a multi-word surname
shortHandDict['E.St'] = 'Equanimeous St. Brown'
# shortHandDict['O.Beckham'] = 'Odell Beckham Jr.'

In [None]:
# Pull the targeted receiver's shorthand out of each play description and
# translate it to a full display name where the dictionary knows it.
new_plays = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/plays.csv', header=0)

# NOTE(review): inside [...] the '|' is a literal character, so [o|t][r|o]
# matches "or", "to", "oo", "tr" (and '|'); presumably "to"/"for" was meant - confirm.
receiver_pattern = r'[o|t][r|o]\s([A-Z][a-z]*\.\s?\S+[a-z])[\s\.]'
new_plays['target_receiver'] = new_plays['playDescription'].str.extract(receiver_pattern)

# NOTE(review): fillna runs on the whole frame, so missing values in every
# column (not only target_receiver) become the string 'None'.
new_plays = new_plays.replace({"target_receiver": shortHandDict})
new_plays = new_plays.fillna('None')

new_plays

In [None]:
from tqdm.notebook import tqdm
start = time.time()

# Plays whose receiver shorthand was flagged as ambiguous (contains '*').
# Raw string: '\*' in a normal string literal is an invalid escape sequence.
dup_plays = new_plays[new_plays.target_receiver.str.contains(r'\*')]
for week_num in range(1,18):
    week = pd.read_csv('../input/nfl-big-data-bowl-2021/week{}.csv'.format(week_num))
    print('Checking wk {}'.format(week_num))

    # Games in this week's file; plays from other games cannot match and are skipped early
    week_games = set(week.gameId.unique())

    for i in range(len(dup_plays)):
        bad_play = dup_plays.iloc[i]
        g = bad_play.gameId
        p = bad_play.playId
        if g not in week_games:
            continue

        # 'J.Lewis*' -> first_name 'J', last_name 'Lewis' (trailing '*' removed)
        abbrev_name = bad_play.target_receiver.split('.')
        first_name = abbrev_name[0]
        last_name = abbrev_name[1][:-1].strip()

        play = week[(week.gameId == g) & (week.playId == p)]

        if len(play) > 1:

            # This misses a few plays because of Defense playing Offense,
            # But lowers runtime + avoids labelling an offensive player as defense
            players = play[play.position.isin(offense_pos)].displayName.unique()

            this_play = (new_plays['gameId'] == g) & (new_plays['playId'] == p)

            name_options = 0
            for name in players:
                # Match on the first-name prefix and the last whitespace-separated token
                if name.startswith(first_name) and (name.split()[-1] == last_name):
                    new_plays.loc[this_play, 'target_receiver'] = name
                    name_options+=1
            if name_options > 1:
                # The last matching name wins; report the ambiguity
                print("Yikes, Play ({}, {}, Week {}) has {} name options".format(g,p,week_num,name_options))
            if name_options == 0:
                # The missing names that trigger this are not labelled in play_level data... so it is fine
                # to label a "None"
                print("Yikes, Could not find replacement for {} ({}, {}, Week {})".format(abbrev_name,g,p, week_num))
                new_plays.loc[this_play, 'target_receiver'] = "None"

In [None]:
# Validation: compare the extracted receiver names with the provided targeted-receiver file.
actual = pd.read_csv('../input/nfl-big-data-bowl-2021-bonus/targetedReceiver.csv',header=0)
conversion_pd = pd.read_csv('../input/nfl-big-data-bowl-2021/players.csv', header=0)

# Normalise spelling variants on the predicted side ...
new_plays.loc[new_plays.target_receiver.str.contains('DJ Moore'), 'target_receiver'] = 'D.J. Moore'

# ... and on the reference side (nflId -> display name)
actual_pd = conversion_pd[['nflId','displayName']].merge(actual, left_on='nflId',right_on='targetNflId')
for fragment, canonical in (('Odell Beckham', 'Odell Beckham'), ('DJ Moore', 'D.J. Moore')):
    actual_pd.loc[actual_pd.displayName.str.contains(fragment), 'displayName'] = canonical

testr_pd = actual_pd.merge(new_plays[['gameId','playId','target_receiver']], on=['gameId','playId'])

# Percentage of plays with an extracted receiver whose name equals the reference name
attempted = testr_pd[testr_pd.target_receiver != 'None']
right = attempted[attempted.displayName == attempted.target_receiver]
100*len(right)/len(attempted)

# Finding Defenders for Targeted Receiver

In [None]:
# Get the Coverage Player for every Receiver
#
# For every wide receiver on every pass play, the defender with the smallest summed
# distance to him between the snap and the throw is taken to be his coverage player.
start = time.time()

# Weekly pieces are collected in lists and concatenated once: DataFrame.append copies
# the whole accumulated frame on every call and no longer exists in pandas >= 2.0.
match_frames = []
press_frames = []
name_frames = []

# Positions eligible for the press/chuck flags:
#   press = within 2 yards of the line of scrimmage at the snap
#   chuck = within 5 yards of the line of scrimmage at the snap
coverage_pos = ['DB','CB','S','SS','FS','ILB','OLB']

for week_num in range(1,18):
    print('Starting wk', week_num)
    week = pd.read_csv('../input/nfl-big-data-bowl-2021/week{}.csv'.format(week_num))

    # Line of scrimmage = the football's x coordinate at the snap
    fball_pos = week[(week.displayName == 'Football')&(week.event=='ball_snap')][['gameId','playId','x']].drop_duplicates() \
                 .rename(columns={"x": "los"})

    week = week.merge(fball_pos, on=['gameId','playId'])

    at_snap = week.event == 'ball_snap'
    in_coverage_pos = week.position.isin(coverage_pos)
    depth = abs(week.x-week.los)

    week['press'] = at_snap & in_coverage_pos & (depth <= 2)
    week['chuck'] = at_snap & in_coverage_pos & (depth <= 5)

    press_frames.append(week[at_snap][['gameId','playId','nflId','press','chuck']].drop_duplicates())

    # Keep only the frames between the snap and the throw, players only.
    # NOTE(review): only 'pass_forward' ends a play here, whereas the targeted-receiver
    # cell further down also accepts 'pass_shovel' - confirm which is intended.
    start_frame_pd = week.query("event == 'ball_snap'") \
                            [['gameId','playId','frameId']].drop_duplicates() \
                            .rename(columns={"frameId": "startFrame"})

    end_frame_pd = week.query("event == 'pass_forward'") \
                            [['gameId','playId','frameId']].drop_duplicates() \
                            .rename(columns={"frameId": "endFrame"})

    week = week.merge(start_frame_pd, on=['gameId','playId']) \
                     .merge(end_frame_pd, on=['gameId','playId']) \
                     .query("frameId >= startFrame and frameId <= endFrame and team != 'football'")

    # Running nflId -> displayName lookup covering every week seen so far
    name_frames.append(week[['nflId','displayName']].drop_duplicates())
    id_to_name = pd.concat(name_frames).drop_duplicates()

    # Find coverage for every receiver
    wrs = week[week.position=='WR']['nflId'].unique()

    for wr in wrs:
        target_pd = week[week.nflId == wr][['gameId','playId','frameId','x','y']] \
               .rename(columns={"x": "target_x", 'y':'target_y'})

        wr_play = week.merge(target_pd, on=['gameId','playId','frameId'])

        wr_play['dist'] = np.sqrt((wr_play.x-wr_play.target_x)**2+(wr_play.y-wr_play.target_y)**2)

        # Offensive players get a large constant distance so they are never picked as the defender
        wr_play.loc[wr_play.position.isin(offense_pos), 'dist'] = 100

        tot_dists = wr_play.groupby(['gameId','playId','nflId'])[['dist']].sum()

        closest_pd = tot_dists.groupby(['gameId','playId']).min()

        # Player(s) whose summed distance equals the per-play minimum
        cov_pd = tot_dists.reset_index().merge(closest_pd, on=['gameId','playId','dist']) \
                         [['gameId','playId','nflId']] \
                         .rename(columns={'nflId':'cov_nflId'})

        cov_pd['nflId'] = wr

        # Swap the receiver's id for his name ('covering'); 'nflId' then holds the defender's id
        cov_pd = cov_pd.merge(id_to_name, on='nflId',how='left') \
                       .drop('nflId',axis=1) \
                       .rename(columns={'displayName':'covering','cov_nflId':'nflId'})

        match_frames.append(cov_pd)

    print(int(time.time()-start),'total seconds...')

match_pd = pd.concat(match_frames)
press = pd.concat(press_frames)

# One row per (play, defender, covered receiver) with play outcome and press/chuck flags
x = match_pd.merge(new_plays[['gameId','playId','passResult','playResult','epa','target_receiver']],
              on=['gameId','playId'],how='left') \
        .dropna() \
        .merge(press,on=['gameId','playId','nflId'],how='left') \
        .merge(id_to_name.drop_duplicates(),on='nflId') \
        .rename(columns={'displayName':'defender'})
x['targetted'] = x.target_receiver == x.covering
x = x.drop(['target_receiver','nflId'], axis=1)
x = x[['gameId','playId','defender','covering','press','chuck','targetted','passResult','playResult','epa']]
x

In [None]:
# Persist the defender/receiver pairing table to the working directory
# (the DataFrame index is written as the first, unnamed column).
x.to_csv('coverage_pairings.csv')

In [None]:
# For every play with a known targeted receiver, find the defender with the smallest
# summed distance to him between the snap and the throw, and attach both players'
# names and jersey numbers to a copy of the play table.
new_plays2 = new_plays.copy()

# NOTE(review): this rebinds the notebook-wide offense_pos to a shorter list (no P/LS/K),
# which also affects any cell executed after this one - confirm that is intended.
offense_pos = ['QB','WR','RB','TE','FB','HB']
merge_pd = new_plays[['gameId','playId','target_receiver']].copy()

# Weekly results are collected in a list and concatenated once after the loop.
weekly_targs = []
for week_num in range(1,18):
    print('Starting wk', week_num)
    week = pd.read_csv('../input/nfl-big-data-bowl-2021/week{}.csv'.format(week_num))

    print('Processing Week {}...'.format(week_num))

    week = week.merge(merge_pd, on=['gameId','playId'], how='left')
    week['is_target'] = week.displayName == week.target_receiver

    # Keep only the frames between the snap and the (forward or shovel) pass, players only
    start_frame_pd = week.query("event == 'ball_snap'") \
                        [['gameId','playId','frameId']].drop_duplicates() \
                        .rename(columns={"frameId": "startFrame"})

    end_frame_pd = week.query("event == 'pass_forward' or event == 'pass_shovel'") \
                            [['gameId','playId','frameId']].drop_duplicates() \
                            .rename(columns={"frameId": "endFrame"})

    week = week.merge(start_frame_pd, on=['gameId','playId']) \
               .merge(end_frame_pd, on=['gameId','playId']) \
               .query("frameId >= startFrame and frameId <= endFrame and team != 'football'")

    # Per-frame position of the targeted receiver, joined onto every player's row
    target_pd = week[week.is_target][['gameId','playId','frameId','x','y']] \
                       .rename(columns={"x": "target_x", 'y':'target_y'})

    week = week.merge(target_pd, on=['gameId','playId','frameId'])

    week['dist'] = np.sqrt((week.x-week.target_x)**2+(week.y-week.target_y)**2)

    # Offensive players get a large constant distance so they are never picked as the defender
    week.loc[week.position.isin(offense_pos), 'dist'] = 100

    # Summed distance per player and play; the per-play minimum marks the coverage player.
    # Comparing against a groupby transform keeps is_coverage a real boolean column and
    # avoids merging on a float key across index levels.
    tot_dists = week.groupby(['gameId','playId','nflId'])[['dist']].sum().reset_index()
    min_dist = tot_dists.groupby(['gameId','playId'])['dist'].transform('min')
    tot_dists['is_coverage'] = tot_dists['dist'] == min_dist

    week = tot_dists.drop('dist', axis=1) \
                 .merge(week.drop('dist', axis=1), on=['gameId','playId','nflId'])

    c_pd = week[week.is_coverage][['gameId','playId',
                                 'displayName','jerseyNumber']] \
      .drop_duplicates() \
      .rename(columns={"displayName": "coverage_player",
                       'jerseyNumber':'coverage_jersey'})

    t_pd = week[week.is_target][['gameId','playId',
                                       'displayName','jerseyNumber']] \
          .drop_duplicates() \
          .rename(columns={"displayName": "target_player",
                           'jerseyNumber':'target_jersey'})

    weekly_targs.append(c_pd.merge(t_pd, on=['gameId', 'playId']))


targ_plays = pd.concat(weekly_targs)

new_plays2 = new_plays2.merge(targ_plays, on=['gameId','playId'],how='left')

# Plays whose receiver could not be identified keep the explicit 'None' marker
new_plays2.loc[new_plays2.target_receiver == 'None', 'target_player'] = 'None'

new_plays2 = new_plays2.drop(['target_receiver'], axis=1)

new_plays2

In [None]:
# Persist the play table enriched with target/coverage player names and jersey numbers
# (the DataFrame index is written as the first, unnamed column).
new_plays2.to_csv('enriched_plays.csv')

In [None]:
# TODO

In [None]:
# Binarise isManPred: rows with a value of at least 1/100 become 1, all others 0.
context['isManPred2'] = context['isManPred'].ge(1/100).astype(int)
context

In [None]:
# Player-level confusion matrix: average the row-level predictions per
# (game, play, player) and compare them with that player's playingMan label.
y_labels['isManPred'] = dtree_predictions

y = y_labels.groupby(['gameId','playId','displayName'])[['isManPred']].mean() \
            .merge(
                    y_labels[['gameId','playId','displayName','playingMan']].drop_duplicates(),
                    on=['gameId','playId','displayName']
                  )

# sklearn's signature is confusion_matrix(y_true, y_pred): rows are the actual labels,
# columns the predictions. The arguments were previously swapped, transposing the matrix.
cm = confusion_matrix(y.playingMan, y.isManPred)

df_cm = pd.DataFrame(cm)
sn.set(font_scale=1.4) # for label size
# fmt='d' prints raw counts; the default format switches to scientific notation for large cells
ax = sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}) # font size
ax.set_xlabel('Predicted (isManPred)')
ax.set_ylabel('Actual (playingMan)')

# Accuracy = diagonal / total; np.trace also copes with a single-class (1x1) matrix
plt.title('{:.0f}% Acc'.format(100*np.trace(cm)/cm.sum()))
plt.show()

In [None]:
# Accuracy comes from the player-level frame `y`; F1 from the row-level test predictions.
f1 = f1_score(y_test,dtree_predictions)
matches = y.isManPred.values == y.playingMan.values
acc = sum(matches)/len(matches)

# ROC curve of the fitted model on the test split, with the chance diagonal for reference
plot_roc_curve(dtree_model, X_test, y_test)
plt.plot([0,1],[0,1],color='red',linestyle='--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.title('ACC: {}% || F1: {:.2f}'.format(int(acc*100),f1))

In [None]:
# Sanity check on the sizes of the split pieces.
# NOTE(review): `tmp` is not defined in the nearby cells - confirm it exists on a fresh run.
len(tmp), len(test_samples), len(X_test), len(X_train)

In [None]:
# Scratch arithmetic; presumably adds two row counts from the split check above - TODO confirm or delete.
298960+58949

In [None]:
# Inspect the model's predictions for the test split.
dtree_predictions

In [None]:
# Store the test-split predictions next to the label rows.
# NOTE(review): relies on y_labels and dtree_predictions from the training cell, which sits
# further down the notebook - this cell fails on a fresh top-to-bottom run.
y_labels['preds'] = dtree_predictions

In [None]:
# Player-level train/test split and model fit.
#
# Feature table lineage (kept from the original note):
# x = x.drop(['time','event','jerseyNumber','route','team','playDirection'],axis=1).dropna()

key_cols = ['gameId','playId','displayName']
SEED = 42  # fixed so the split and the model are reproducible across runs

# Cast every non-key column to float32
for col in x.columns:
    if col in key_cols:
        continue
    x[col] = x[col].astype(np.float32)

# Hold out 20% of the labelled (game, play, player) keys. De-duplicating first keeps the
# merge below from multiplying rows when a key occurs more than once in x_pd2.
test_samples = x_pd2[~x_pd2.playingMan.isna()][key_cols] \
                    .drop_duplicates() \
                    .sample(frac=0.2, random_state=SEED)

# One indicator merge partitions x into disjoint test ('both') and train ('left_only') rows.
# The previous outer merge returned every row of x, so the test rows were also trained on.
split_pd = x.merge(test_samples, on=key_cols, how='left', indicator=True)
test_rows = split_pd[split_pd['_merge'] == 'both'].drop('_merge', axis=1).dropna()
train_rows = split_pd[split_pd['_merge'] == 'left_only'].drop('_merge', axis=1).dropna()

X_test = test_rows
y_test = X_test.playingMan.values
# .copy() so later cells can add prediction columns without SettingWithCopyWarning
y_labels = X_test[key_cols + ['playingMan']].copy()
X_test = X_test.drop(['playingMan','displayName','gameId','playId'],axis=1)

X_train = train_rows
y_train = X_train.playingMan.values
X_train = X_train.drop(['playingMan','displayName','gameId','playId'],axis=1)

# dtree_model = DecisionTreeClassifier(max_depth = 10).fit(X_train, y_train)
dtree_model = GradientBoostingClassifier(max_depth = 150, random_state=SEED).fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_test)

disp = plot_confusion_matrix(dtree_model, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize='true')

In [None]:
# Compare the training-set row count with the full feature table; equal counts mean
# nothing was held out of training.
len(X_train), len(x)

In [None]:
# Planning notes (not yet implemented):
# 1. Add the player-level labels generated above.
x_pd2
# 2. Train an additional classifier, splitting train/test at the player level.

# 3. Get accuracy metrics:
    # sum up the predictions; that sum is treated as the probability of man coverage.
# 4. Loop through and score every player.

In [None]:
# Can I take the ML model and use 

In [None]:
# Pull the first digit found in each coverage label into num_cover (kept as text; NaN when absent).
first_digit = x_pd2['coverage'].str.extract('([0-9])', expand=False)
x_pd2['num_cover'] = first_digit
x_pd2

In [None]:
# List the distinct coverage labels present in x_pd2.
x_pd2.coverage.unique()

In [None]:
# Relabel every tracking row's `event` with a coarse play phase, walking the rows in order:
#   'ball_snap'    - frames up to (not including) the snap; frameId 1 starts a new sequence
#   'between_snap' - frames after the snap and before the throw
#   'after_thrown' - the 'pass_forward' frame and everything after it
# The row carrying the actual 'ball_snap' event keeps its label and flips the state.
#
# Columns are read by name: the former positional lookups (8 and 13) only matched the raw
# CSV layout and silently pointed at other columns once `week` had been merged or reordered.
events = week['event'].tolist()
frame_ids = week['frameId'].tolist()

phases = list(events)
previousEvent = 'ball_snap'
for i, (event, frameId) in enumerate(zip(events, frame_ids)):
    if (previousEvent == 'ball_snap' and event != 'ball_snap') or frameId == 1:
        phases[i] = 'ball_snap'
        previousEvent = 'ball_snap'
    elif (event == 'ball_snap'):
        previousEvent = 'between_snap'
    elif (previousEvent == 'between_snap' and event != 'pass_forward'):
        phases[i] = 'between_snap'
        previousEvent = 'between_snap'
    elif (event == 'pass_forward'):
        phases[i] = 'after_thrown'
        previousEvent = 'after_thrown'
    elif (previousEvent == 'after_thrown' and frameId != 1):
        phases[i] = 'after_thrown'
        previousEvent = 'after_thrown'

# Assigning a plain list writes by position, so a non-default index on `week` cannot misalign rows
week['event'] = phases
weekMod = week

In [None]:
# Collect
bs_list_bin.columns