# A Data-Driven Evolution to Punt Return Strategy
## Rahul Amara, Sohum Shah, and Rohit Tanikella

## Introduction: "What Causes Muffed Punts?"

The stage is set: it's the 2011 NFC Championship Game and the 49ers lead the Giants in the 4th quarter. Their defense pushes hard to preserve their lead and forces the Giants to punt. It seems like a dream come true: the opportunity to secure a ticket to Super Bowl XLVI. Unfortunately, this dream proved too good to be true when Kyle Williams muffed the punt return and gave the Giants the ball back, leading to a scoring drive and an overtime victory for them.

NFL fans are not strangers to critical games being determined by punt return mistakes. Though only occurring on 4.4% of fieldable punts, misfielded punt returns (muffed or bobbled) have caused drastic swings in the momentum and outcome of countless games.

Conventional football strategy primarily attributes misfielded punts to:
1. the motion of the returner prior to making the catch 
2. the maximum height of the ball during the punt
3. the proximity of the nearest gunner. 

We aim to let the data speak to the causes of misfielding punts and then utilize these data to guide both punt team and punt return team strategy.

## EDA: What attributes are common among misfielded punts?

In [None]:
#Loading Data and creating dataframes that we will go on to use
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
import warnings
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

#Raw competition files: one tracking table per season plus the game, play and PFF scouting tables
tracking2018 = pd.read_csv('../input/nfl-big-data-bowl-2022/tracking2018.csv')
tracking2019 = pd.read_csv('../input/nfl-big-data-bowl-2022/tracking2019.csv')
tracking2020 = pd.read_csv('../input/nfl-big-data-bowl-2022/tracking2020.csv')
#Tracking tables keyed by season so each play can be matched to its own year's frames
trackingYears = {
    2018: tracking2018,
    2019: tracking2019,
    2020: tracking2020,
}
games = pd.read_csv('../input/nfl-big-data-bowl-2022/games.csv')
plays = pd.read_csv('../input/nfl-big-data-bowl-2022/plays.csv')
pff = pd.read_csv('../input/nfl-big-data-bowl-2022/PFFScoutingData.csv')

#Keep only scouting rows that recorded both a hang time and an operation time
pff = pff[pff['hangTime'].notna() & pff['operationTime'].notna()]

#Punt plays joined with their game row and their scouting row
punts = (
    plays[plays['specialTeamsPlayType'] == 'Punt']
    .merge(games, on='gameId', how='inner')
    .merge(pff, on=['gameId', 'playId'], how='inner')
)

#Include only punts where a return was possible
returnPossible = ['Return', 'Fair Catch', 'Downed', 'Touchback', 'Muffed']
punts = punts[punts['specialTeamsResult'].isin(returnPossible)]

#Punts that were touched, excluding R style punts (low sample size), and only those
#the designated returner had a chance to catch in the air
fieldedPunts = punts[
    punts['specialTeamsResult'].isin(['Return', 'Fair Catch', 'Muffed'])
    & (punts['kickType'] != 'R')
    & punts['kickContactType'].isin(['CC', 'BC', 'MBDR'])
]

#These filters stay sequential: the string checks must only run on non-missing values
fieldedPunts = fieldedPunts[fieldedPunts['returnerId'].notna()]
fieldedPunts = fieldedPunts[~fieldedPunts['returnerId'].str.contains(';')]
fieldedPunts = fieldedPunts[fieldedPunts['gunners'].notna()]
fieldedPunts = fieldedPunts[fieldedPunts['gunners'].str.count(';') == 1]
fieldedPunts = fieldedPunts[fieldedPunts['vises'].notna()]
fieldedPunts = fieldedPunts.reset_index()

In [None]:
def distance(x1, y1, x2, y2):
    """Return the straight-line distance between the points (x1, y1) and (x2, y2)."""
    dx = x2 - x1
    dy = y2 - y1
    return np.sqrt(dx**2 + dy**2)

def viseTeam(gunnerTeam):
    """Return the side opposite the gunners: 'away' for 'home', otherwise 'home'."""
    return 'away' if gunnerTeam == 'home' else 'home'

#Maps an angle in degrees onto the interval (-180, 180]
def _wrapAngle(angle):
    angle = angle % 360
    if angle > 180:
        angle = angle - 360
    return angle


#Shifts a tracking row's direction and orientation by `shift` degrees and wraps both to (-180, 180]
def _relativeAngles(trackRow, shift):
    return _wrapAngle(trackRow.dir + shift), _wrapAngle(trackRow.o + shift)


#Time needed to cover `dist` starting at `speed` under constant acceleration `accel`
def _timeToContact(speed, accel, dist):
    if accel == 0:
        if speed == 0:
            #Undefined for a motionless player; NaN lets the caller's dropna discard the play
            return np.nan
        return dist / speed
    #Positive root of 0.5*accel*t**2 + speed*t - dist = 0
    return (-speed + np.sqrt(speed**2 + 2*accel*dist)) / accel


#Returns the tracking row of one player (side + jersey number) at a given frame, or None if absent
def _playerAtFrame(puntRec, team, jerseyNumber, frameId):
    rows = puntRec[(puntRec['team'] == team)
                   & (puntRec['jerseyNumber'] == jerseyNumber)
                   & (puntRec['frameId'] == frameId)]
    if len(rows) == 0:
        return None
    return rows.iloc[0]


#Generates the list of desired features for one punt play in our sample set
def featureGeneration(row, totalFeatures):
    """Build the feature list for a single punt play.

    row: one row of the fielded-punt table (play, game and scouting columns).
    totalFeatures: expected length of the result; used to build the all-None
        placeholder returned when the tracking data needed for the play is missing.

    Reads the module-level `trackingYears` and `games` tables and the helpers
    `distance` and `viseTeam`. Values are appended in the order of the
    `columnNames` list that the caller assigns the result to.
    """
    missing = [None] * totalFeatures
    output = [str(row.gameId), row.playId]

    #kick type: 1 if Normal and 0 otherwise
    output.append(1 if row.kickType == 'N' else 0)

    #hang time and operation time of punt (operation time is time from snap to punt)
    output.append(row.hangTime)
    output.append(row.operationTime)

    #Magnitude of Direction Match, one-hot encoded as [equalDirection, off1Direction, off2Direction]:
    #   equalDirection: the team returned the way it lined up
    #   off1Direction:  the actual return is one step away (C vs. L/R) from how it lined up
    #   off2Direction:  everything else (e.g. lined up Left and returned Right)
    sides = ('L', 'R')
    if row.returnDirectionIntended == row.returnDirectionActual:
        output.extend([1, 0, 0])
    elif row.returnDirectionIntended == 'C' and row.returnDirectionActual in sides:
        output.extend([0, 1, 0])
    elif row.returnDirectionIntended in sides and row.returnDirectionActual == 'C':
        output.extend([0, 1, 0])
    else:
        output.extend([0, 0, 1])

    tracking = trackingYears[row.season]
    puntRec = tracking[(tracking['gameId'] == row.gameId) & (tracking['playId'] == row.playId)]
    returnerTrack = puntRec[puntRec['nflId'] == float(row.returnerId)]
    received = returnerTrack[returnerTrack['event'].isin(['fair_catch', 'punt_muffed', 'punt_received'])]

    if len(received) == 0: #Missing information about when punt was fielded (applies to just 11 punts)
        return missing

    #Some received are longer than 1, but this only applies to 55 punts so we just take first one
    catch = received.sort_values(by = 'frameId').iloc[0]

    #Acceleration and Speed of returner at time of catch
    output += [catch.a, catch.s]

    #Stationary: set a threshold of 1.5 yards per second as the mark for whether a returner is stationary
    output.append(1 if catch.s <= 1.5 else 0)

    #Fair catch called by returner
    output.append(1 if 'fair_catch' in returnerTrack['event'].unique() else 0)

    #Direction and orientation of the returner, shifted by play direction so that (per the
    #original authors) zero means facing the desired endzone; the gunners use the opposite shift
    returnerShift = 90 if catch.playDirection == 'right' else -90
    gunnerShift = -returnerShift
    direction, orientation = _relativeAngles(catch, returnerShift)
    output.append(direction)
    output.append(orientation)
    output.append(abs(orientation))

    #Distance from the nearer sideline and from the nearer goal line
    output.append(min(53.3 - catch.y, catch.y))
    output.append(min(110 - catch.x, catch.x - 10))

    returner_x = catch.x
    returner_y = catch.y

    #The gunners string looks like "<team> <number>; <team> <number>"
    gunners = row.gunners.split('; ')
    gunner_team = gunners[0][0:gunners[0].index(" ")]
    gunner1_number = int(gunners[0][gunners[0].index(" ") + 1: ])
    gunner2_number = int(gunners[1][gunners[1].index(" ") + 1: ])

    homeTeam = games[games['gameId'] == row.gameId].iloc[0].homeTeamAbbr
    team_home_away = 'home' if homeTeam == gunner_team else 'away'

    #Per-gunner information at the catch frame: distance, speed, accel, direction, orientation.
    #Every gunner's orientation is wrapped using that gunner's own value; the original tested the
    #returner's orientation for gunner 1 on left-moving plays.
    gunnerStats = []
    for gunnerNumber in (gunner1_number, gunner2_number):
        gunnerAtContact = _playerAtFrame(puntRec, team_home_away, gunnerNumber, catch.frameId)
        if gunnerAtContact is None:
            return missing
        gunnerDir, gunnerOrientation = _relativeAngles(gunnerAtContact, gunnerShift)
        gunnerStats.append((
            distance(returner_x, returner_y, gunnerAtContact.x, gunnerAtContact.y),
            gunnerAtContact.s,
            gunnerAtContact.a,
            gunnerDir,
            gunnerOrientation,
        ))

    #Gunner 1 then Gunner 2: Distance, Speed, Accel, Direction, Orientation, Abs Orientation
    for gDistance, gSpeed, gAccel, gDir, gOrientation in gunnerStats:
        output.extend([gDistance, gSpeed, gAccel, gDir, gOrientation, abs(gOrientation)])

    #Nearest Gunner Information: Distance, speed, acceleration, time to contact, direction,
    #                            orientation, abs orientation (all taken from the same gunner)
    gunner1, gunner2 = gunnerStats
    nearest = gunner1 if gunner1[0] < gunner2[0] else gunner2
    nDistance, nSpeed, nAccel, nDir, nOrientation = nearest
    output.extend([
        nDistance,
        nSpeed,
        nAccel,
        _timeToContact(nSpeed, nAccel, nDistance),
        nDir,
        nOrientation,
        abs(nOrientation),
    ])

    #Average Gunner Distance:
    output.append((gunner1[0] + gunner2[0])/2)

    #Closest Vise Information: Distance, speed, acceleration, direction, and orientation
    viseSide = viseTeam(team_home_away)
    nearestViseDistance = np.sqrt(100000) #Sentinel larger than any distance on the field
    nearestViseSpeed = 0
    nearestViseAccel = 0
    nearestViseDirection = 0
    nearestViseOrientation = 0
    foundVise = False

    for vise in row.vises.split('; '):
        viseNumber = int(vise[vise.index(" ") + 1:])
        viseAtContact = _playerAtFrame(puntRec, viseSide, viseNumber, catch.frameId)
        if viseAtContact is None:
            continue
        foundVise = True
        viseDistance = distance(returner_x, returner_y, viseAtContact.x, viseAtContact.y)
        if viseDistance < nearestViseDistance:
            nearestViseDistance = viseDistance
            nearestViseSpeed = viseAtContact.s
            nearestViseAccel = viseAtContact.a
            #Vises use the same shift convention as the returner
            viseShift = 90 if viseAtContact.playDirection == 'right' else -90
            nearestViseDirection, nearestViseOrientation = _relativeAngles(viseAtContact, viseShift)
    if not foundVise:
        return missing
    output.append(nearestViseDistance)
    output.append(nearestViseSpeed)
    output.append(nearestViseAccel)
    output.append(nearestViseDirection)
    output.append(nearestViseOrientation)

    #Result: 0 for a clean catch ('CC' contact type), 1 for any other contact type
    output.append(0 if row.kickContactType == 'CC' else 1)

    return output

#Column labels, in the exact order featureGeneration appends its values
columnNames = [
    #play identifiers and punt description
    'gameId', 'playId', 'kickType', 'hangTime', 'opTime',
    'equalDirection', 'off1Direction', 'off2Direction',
    #returner at the moment of the catch
    'accel', 'speed', 'stationary', 'fair_catch',
    'returnerDir', 'returnerOr', 'returnerOrAbs',
    'distanceFromSideline', 'distanceFromEndzone',
    #each gunner
    'gunner1Distance', 'gunner1Speed', 'gunner1Accel',
    'gunner1Dir', 'gunner1Or', 'gunner1OrAbs',
    'gunner2Distance', 'gunner2Speed', 'gunner2Accel',
    'gunner2Dir', 'gunner2Or', 'gunner2OrAbs',
    #nearest gunner and gunner average
    'closestGunnerDistance', 'closestGunnerSpeed', 'closestGunnerAccel',
    'closestGunnerTimeToContact', 'closestGunnerDir',
    'closestGunnerOr', 'closestGunnerOrAbs',
    'avgGunnerDistance',
    #nearest vise
    'nearestViseDistance', 'nearestViseSpeed', 'nearestViseAccel',
    'nearestViseDirection', 'nearestViseOr',
    #label
    'result',
]

#One feature row per fielded punt; plays with missing tracking data come back as all-None rows
df = pd.DataFrame(columns=columnNames)
df[columnNames] = fieldedPunts.apply(
    lambda puntRow: featureGeneration(puntRow, len(columnNames)),
    axis=1,
    result_type="expand",
)
#Discard every row containing a missing value, then renumber the remaining rows
df.dropna(inplace=True)
df.reset_index(inplace=True)

As part of our EDA Process, we first sought to identify trends in various features between when a punt was clean-caught and when a punt was misfielded. Among others, some of the critical features that we examined were
1. The rate of fair catches depending on whether a punt was misfielded
2. The orientation of the returner at time of catch depending on whether a punt was misfielded
3. The speed of the returner at the time of catch depending on whether a punt was misfielded
4. The distance between the returner and the gunner at time of catch depending on whether a punt was misfielded

While we understood that this isn't presenting any evidence for causality, our findings supported that there were common characteristics among plays that were misfielded and that these characteristics could likely be used in creating a misfielded return risk model.

In [None]:
#Number of punts for every (fair_catch, result) combination
df.pivot_table(values="accel", index="fair_catch", columns="result", aggfunc="count")

In the pivot table above, we define a result of 1 to represent a misfielded return and a fair_catch value of 1 to represent a fair catch being called. As we see above, the vast majority of punts that were misfielded occurred when a fair catch was not called. Specifically, the probability of sampling a misfielded punt on a fair catch is

In [None]:
#Percentage of fair-catch punts that were misfielded
print(str(round(df.loc[df['fair_catch'] == 1, 'result'].mean() * 100, 2)) + "%")

and the probability of sampling a misfielded punt when no fair catch is called is 

In [None]:
#Percentage of punts without a fair catch that were misfielded
print(str(round(df.loc[df['fair_catch'] == 0, 'result'].mean() * 100, 1)) + "%")

In summary, nearly 10 times more misfielded punts occur on plays where a fair catch was not called.



In [None]:
#Nearest-gunner distance by punt outcome, restricted to plays where no fair catch was called
plt.title("Distance to Gunner on Clean vs. Misfielded Punts", fontsize=14)
ax = sns.violinplot(x='result', y='closestGunnerDistance', data=df[df['fair_catch']==0], inner='quartile')
#The x-axis groups punts by the 'result' column (clean vs. misfielded), not by fair-catch status
plt.xlabel('Punt Outcome')
plt.xticks([0, 1], ['Clean Caught', 'Misfielded'])
plt.ylabel("Distance from Returner to Nearest Gunner")
plt.ylim(-5, 35);

In the above graphic, fair catch (in the legend) refers to whether the returner elected to call a fair catch, with 0 referring to no fair catch called and 1 referring to a fair catch.

Because fair catches confound so heavily with the distance from the returner to the nearest gunner, it was important to separate them out in understanding the significance of this attribute. As such, what we see above is the distance to the nearest gunner exclusively on plays where a fair catch was not called.

As general football intuition would guide us, the gunner was significantly closer to the returner on punts that ended up being misfielded.

Specifically, we see that on plays where a fair catch was not called, the median distance from the returner to the gunner on clean caught punts was 

In [None]:
#Median nearest-gunner distance on clean catches where no fair catch was called
print(str(round(df.loc[(df['fair_catch'] == 0) & (df['result'] == 0), 'closestGunnerDistance'].median(), 1)) + " yards")

and the median distance from the returner to the gunner on muffed punts was 

In [None]:
#Median nearest-gunner distance on misfielded punts where no fair catch was called
print(str(round(df.loc[(df['fair_catch'] == 0) & (df['result'] == 1), 'closestGunnerDistance'].median(), 1)) + " yards.")

In [None]:
#Side-by-side violins of z-scored returner speed and absolute orientation by punt outcome
f, (ax1, ax2) = plt.subplots(1, 2, sharey=False)
f.set_size_inches(9, 5)
plt.subplots_adjust(left=0.1,
                    bottom=0.1, 
                    right=0.9, 
                    top=0.9, 
                    wspace=0.4, 
                    hspace=0.4)

#Draw both violins first: some seaborn releases write their own axis labels while plotting,
#which would replace a label that was set beforehand
sns.violinplot(x=df['result'], y=(df['speed'] - df['speed'].mean())/df['speed'].std(), inner='quartile', ax=ax1)
sns.violinplot(x=df['result'], y=(df['returnerOrAbs'] - df['returnerOrAbs'].mean())/df['returnerOrAbs'].std(), inner='quartile', ax=ax2)

ax1.set_title('Misfielded vs. Speed')
ax1.set_ylabel('Speed (Normalized)')
ax2.set_title('Misfielded vs. Normalized Orientation')
ax2.set_ylabel('Orientation (Normalized)');


As we see above, on misfielded punts, the median speed of a returner as well as the median angular displacement from center of a returner's orientation is roughly 0.75 standard deviations higher than when a punt is clean-caught.

We have included a correlation matrix below of all the features we identified as potentially causal. Note that the limitation of a matrix such as this one is that it does not demonstrate the correlation between features and the outcome (labeled "result" here) in conjunction with other features.

In [None]:
#Features (plus the outcome column) shown in the correlation heatmap
featuresToShow = [
    'kickType',
    'equalDirection', 'off1Direction', 'off2Direction',
    'speed', 'fair_catch',
    'returnerDir', 'returnerOr', 'returnerOrAbs',
    'distanceFromSideline',
    'closestGunnerDistance',
    'result',
]

#Pairwise correlations between the selected columns
corrGraph = df[featuresToShow].copy()
corr = corrGraph.corr()

plt.figure(figsize=[20, 20])
ax = plt.axes()
plt.rcParams.update({'font.size': 12})
ax.set_title('Punt Data Correlation Matrix', fontsize=40)
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    cmap='RdBu',
    annot=True,
    fmt='.2f',
    vmin=-1.0,
)
plt.show();

## Classification Model: Clean-Caught vs. Misfielded

Using the insights gained from the EDA above, we sought to produce a model that would predict if a punt was misfielded or not.
The first important characteristic of our data that we noticed was the imbalance in the dataset. This meant that our model could have high accuracy while not producing any useful insight by simply predicting every return as clean-caught. This led us to focus on recall as the primary metric for measuring our model's performance.

Recall = $\frac{TP}{TP+FN}$

We determined that we care about recall over precision specifically here because our goal was to predict returns that were likely to be misfielded even if that meant misclassifying a few borderline clean catches as misfielded.

We first used the data provided to produce a naive logistic regression model. Using that model as a baseline we then improved its performance by both reformulating/adding features as well as setting class weights to correct the inherent imbalance in the dataset. Our final model and its performance are displayed below.

The reason we chose to use a logistic regression model as our classifier is because we wanted to generate a model whose formula is extractable and utilizable to generate coaching insights on a feature-by-feature basis as opposed to using a black-box model. 

$Pr(Y_i=1|X_i) = {\frac{\exp(\beta_0 + \beta_1X_{i1} + \beta_2X_{i2} + \beta_3X_{i3} + \beta_4X_{i4} + \beta_5X_{i5} + \dots)}{1 + \exp(\beta_0 + \beta_1X_{i1} + \beta_2X_{i2} + \beta_3X_{i3} + \beta_4X_{i4} + \beta_5X_{i5} + \dots)}}$

In [None]:
def runClassifier(clf, params):
    """Grid-search `clf` over `params` and report how the best estimator does on the test split.

    Fits on the module-level X_train / Y_train with F1 as the selection score
    (parameter combinations whose fit fails are scored -1), prints accuracy,
    precision and recall on X_test / Y_test followed by the winning parameters,
    and returns the best estimator.
    """
    search = GridSearchCV(clf, params, error_score=-1, scoring='f1', verbose=0)
    search.fit(X_train, Y_train)
    bestModel = search.best_estimator_

    print("Model Accuracy: " + str(bestModel.score(X_test, Y_test)))

    predictions = bestModel.predict(X_test)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    #F1 is computed as in the original cell but deliberately not printed
    f1 = f1_score(Y_test, predictions)

    print("Model Precision: " + str(precision))
    print("Model Recall: " + str(recall))

    print(search.best_params_)
    return bestModel


#Model inputs: the subset of df's generated feature columns fed to the classifier
featureNames = ['kickType', 'hangTime', 'equalDirection', 'off1Direction', 'off2Direction', 'accel', 
                'speed','fair_catch', 'returnerDir', 'returnerOr', 'returnerOrAbs', 'gunner1Distance',  
                'gunner1Speed', 'gunner1Dir', 'gunner1Or', 'gunner2Distance', 
                'gunner2Speed', 'gunner2Accel', 'gunner2Dir', 'gunner2Or', 'gunner2OrAbs', 'closestGunnerDistance',
                'closestGunnerSpeed', 'closestGunnerAccel','closestGunnerTimeToContact', 'closestGunnerDir', 
                'closestGunnerOr', 
                'nearestViseDistance', 'nearestViseSpeed', 'nearestViseAccel', 'nearestViseDirection', 
                'nearestViseOr']
#X is a copy so the normalisation below does not modify df; Y is the 0/1 outcome column
X = df[featureNames].copy()
Y = df['result']

#Columns that get z-scored; the remaining features (the 0/1 indicators and gunner2OrAbs) are left as generated
toNormalize = ['hangTime', 'accel', 'speed' ,'returnerDir', 'returnerOr', 'returnerOrAbs', 'gunner1Distance',
              'gunner1Speed', 'gunner1Dir', 'gunner1Or', 'gunner2Distance', 'gunner2Speed',
              'gunner2Accel', 'gunner2Dir', 'gunner2Or', 'closestGunnerDistance', 'closestGunnerSpeed', 
              'closestGunnerAccel', 'closestGunnerTimeToContact', 'closestGunnerDir', 'closestGunnerOr', 
               'nearestViseDistance', 'nearestViseSpeed', 'nearestViseAccel', 
               'nearestViseDirection', 'nearestViseOr']
#Z-score each listed column using its mean and standard deviation over all rows of X
#NOTE(review): these statistics are computed before the train/test split below, so the test rows
#contribute to the scaling of the training data - confirm this is acceptable
for f in toNormalize:
    X[f] = (X[f] - X[f].mean()) / X[f].std()
    

selectedFeatures = featureNames #Try removing highly correlated features and look at correlation matrix
#Feature selection code
X = X[selectedFeatures]
#Hold out 20% of the plays for testing; the fixed random_state makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
#Hyper-parameter grid for the logistic regression.
#'newton-cg' is a solver name, not a penalty, so it is no longer listed under 'penalty': every
#candidate using it could only fail to fit and be scored with error_score. The set of candidates
#that can actually fit is unchanged. Add 'newton-cg' to 'solver' if that solver should be tried.
params = {'solver': ['liblinear', 'sag', 'saga', 'lbfgs'], 'C': [0.001,0.01,0.1,1,10,100,1000], 
          'class_weight': [{0: 1, 1: 6},{0: 1, 1: 6.5}, {0: 1, 1: 7}],
          'penalty': ['l1', 'l2', 'elasticnet', 'none']}
logReg = runClassifier(LogisticRegression(), params)

With our final model, we can take a look at which features of our data were deemed most important to predicting whether a kick would be misfielded. We see that calling a fair catch as well as how a player was lined up (among several other factors) were very important in determining the likelihood of a misfield. The most important features from our model are displayed below with their relative importance

In [None]:
#One row per model feature with its fitted coefficient and that coefficient's magnitude
weights = pd.DataFrame({
    "Feature": featureNames,
    "Coefficent": logReg.coef_[0],
    "Absolute Value of Coefficent": np.absolute(logReg.coef_[0]),
})

#Only plot features whose coefficient magnitude exceeds 0.3, largest magnitude first
weights = weights[weights["Absolute Value of Coefficent"] > 0.3]
sns.set(rc={'figure.figsize': (15, 8)})
ax = sns.barplot(
    x="Feature",
    y="Coefficent",
    data=weights,
    order=weights.sort_values('Absolute Value of Coefficent', ascending=False).Feature,
)
ax.set_title('Logistic Regression Feature Weights', fontsize=16);

As can be seen from the precision above, our model sometimes incorrectly classifies a clean-caught return as misfielded. This occurs when a play is still "high risk", but the returner manages to catch it without misfielding it. To demonstrate this, we have included two visualizations: 
1. A play where the model correctly called a misfield
2. A play where the model incorrectly called a misfield

In [None]:
from pathlib import Path

from IPython.display import Image

#read_bytes opens, reads and closes the file, so no file handle is left open
Image(Path('../input/gifsfornfl/muff.gif').read_bytes())

In [None]:
from pathlib import Path

#read_bytes opens, reads and closes the file, so no file handle is left open
Image(Path('../input/gifsfornfl/riskyButClean.gif').read_bytes())

As we see, the first visualization shows a clearly risky play that resulted in a misfield: the returner was running backwards while the gunner was very close to him.

The second visualization shows a clearly risky play that happened to not lead to a misfield despite the fact that the gunner was right next to the returner and ready to make contact.

## Transitioning to a risk assessment model

To make our model more granular, we converted the prediction of a clean catch vs. a misfield to a risk factor between 0 and 1 where a higher risk refers to a higher chance of misfielding. This was done by using the numerical output of our logistic regression model.

What is seen below is a distribution of 'risk factors' for all of the plays in our sample set. We have made plays that were misfielded red.

In [None]:
#Modeled risk of a play = predicted probability of the misfielded class (second column)
logistic_probabilities = logReg.predict_proba(X)
risks = [classProbabilities[1] for classProbabilities in logistic_probabilities]

#Feature matrix extended with play identifiers, modeled risk and the true outcome
withRisk = X.assign(
    gameId=df['gameId'].astype(int),
    playId=df['playId'].astype(int),
    risk=risks,
    result=Y,
)

plt.figure(figsize=[14, 6])
#All plays first, then the misfielded plays drawn on top in red.
#The red bars above a risk of 0.4 use bins twice as wide as the others.
sns.histplot(x='risk', data=withRisk, binrange=(0,1), binwidth=0.2/14)
sns.histplot(
    x='risk',
    data=withRisk[(withRisk['result']==1) & (withRisk['risk']<0.4)],
    binrange=(0,1),
    binwidth=0.2/14,
    color='red',
)
sns.histplot(
    x='risk',
    data=withRisk[(withRisk['result']==1) & (withRisk['risk']>0.4)],
    binrange=(0,1),
    binwidth=0.4/14,
    color='red',
)
plt.title('Disribution of Modeled Risk Values for Punt Plays')
plt.xlabel('Modeled Risk Value')
plt.ylabel('Count (Red Bars refer to misfielded punts)');

As you can see above, the clear majority of misfielded punts occurred on plays that we modeled to be high risk.

## Defining Situational Risk-Reward

In the context of misfielded punts, there is no way to calculate a meaningful expected value. This is because if a punt is successfully fielded, the reward is yards gained. On the other hand, if a punt is misfielded, the risk is loss of possession. As such, we have chosen to present two metrics for any given situation:
1. The percentage of punts with similar risk values that were misfielded
2. The expected yards to be gained if the punt is not misfielded

In [None]:
# Join the modeled risks with the play-level table on (gameId, playId) and
# drop the play columns listed below.
riskAndOutcome = withRisk.merge(plays, on = ['gameId', 'playId'], how = 'inner').drop(columns=['passResult', 
                                'preSnapVisitorScore', 'preSnapHomeScore', 'penaltyJerseyNumbers', 'penaltyCodes',
                                'gameClock', 'yardlineSide', 'kickBlockerId', 'specialTeamsPlayType', 'kickerId',
                                'yardsToGo', 'possessionTeam', 'down', 'quarter', 'penaltyYards'])

# Split risk into ten bins of width 0.1 and record, per bin, the bin centre,
# the mean return yardage and the misfield rate (mean of `result`).
x_vals = []
y1_vals = []
y2_vals = []
for i in np.arange(10):
    # Edges are computed directly as i/10 and (i+1)/10 so that each bin's upper
    # edge is exactly the next bin's lower edge; deriving them from the centre
    # with +/-0.05 is subject to floating-point rounding.
    lower = i/10
    upper = (i + 1)/10
    in_bin = riskAndOutcome[(riskAndOutcome['risk'] >= lower) & (riskAndOutcome['risk'] < upper)]
    x = i/10 + 0.05
    y1 = in_bin['kickReturnYardage'].mean()
    y2 = in_bin['result'].mean()
    x_vals.append(x)
    y1_vals.append(y1)
    y2_vals.append(y2)

# Two regressions against risk on a shared x-axis: return yardage on the left
# y-axis, misfield rate (red) on the right y-axis.
fig, ax = plt.subplots()
fig.set_size_inches(8, 6)
sns.regplot(x=x_vals, y=y1_vals, ax=ax)
ax2 = ax.twinx()
sns.regplot(x=x_vals, y=y2_vals, ax=ax2, color='r')
ax.set_ylabel('Mean Return for Similar Risk')
ax2.set_ylabel("Misfield Rate for Similar Risk")
ax.set_xlabel("Risk")
ax.set_title("Risk vs. Mean Return Yardage and Misfielded Return Rate");

We can see above that as a play becomes riskier, not only does the likelihood of a misfield increase, but the yards gained on the return also decrease. This further emphasizes that returners should call a fair catch or allow the ball to bounce on risky plays.

## Punt Return Strategy Cheat-Sheet

Using the model decomposition and analysis in this section, we then set out to define some implementable strategies that coaches and special teams players can use to mitigate the risk of misfielding.

### 1) If the returner is running to catch the punt, his first thought should be to call a fair catch

From the model decomposition above, it is clear that two of the largest risk factors for a misfield are not calling a fair catch and running while fielding the punt. To put this into numbers using our risk metric:

In [None]:
# Mean modeled risk of catches made while moving (speed > 2), split by whether
# a fair catch was called.
moving = riskAndOutcome['speed'] > 2
riskForNoFairCatch = riskAndOutcome[moving & (riskAndOutcome['fair_catch'] == 0)]['risk'].mean()
riskForFairCatch = riskAndOutcome[moving & (riskAndOutcome['fair_catch'] == 1)]['risk'].mean()

# "Similar risk" plays lie within +/-0.05 of the reference risk. The upper
# bound has to be a `<` comparison: with `>` on both bounds the filter selects
# every play riskier than reference + 0.05 instead of a band around it.
similarToNoFairCatch = riskAndOutcome[(riskAndOutcome['risk'] > riskForNoFairCatch - 0.05) & (riskAndOutcome['risk'] < riskForNoFairCatch + 0.05)]
similarToFairCatch = riskAndOutcome[(riskAndOutcome['risk'] > riskForFairCatch - 0.05) & (riskAndOutcome['risk'] < riskForFairCatch + 0.05)]

# Misfield rates are converted to percentages for printing.
noFairCatchMisfield = similarToNoFairCatch['result'].mean()*100
noFairCatchReturn = similarToNoFairCatch['kickReturnYardage'].mean()
fairCatchMisfield = similarToFairCatch['result'].mean()*100

print("Returns as risky as a moving catch with no fair catch called, " + str(round(noFairCatchMisfield, 1)) + "% were misfielded and the average return yardage was " + str(round(noFairCatchReturn, 2)) + " yards")
print("If the returner simply called for a fair catch, only " + str(round(fairCatchMisfield, 1)) + "% of similarly risky plays were misfielded, but the average return yardage was 0 yards")


In essence, the returner is nearly doubling his risk of misfielding for a very small expected number of yards gained on the return.

### 2) If the nearest gunner is within 5 yards of you, you should always call for a fair catch

From our model decomposition, and supported by our EDA, another large risk factor is electing to return punts when the gunner is nearby. Putting this into numbers using our risk metric, we see that:

In [None]:
# Mean modeled risk of plays with the closest gunner under 5 yards away, split
# by whether a fair catch was called.
gunnerNearby = riskAndOutcome['closestGunnerDistance'] < 5
nearGunnerNoFairCatch = riskAndOutcome[(riskAndOutcome['fair_catch'] == 0) & gunnerNearby]['risk'].mean()
nearGunnerFairCatch = riskAndOutcome[(riskAndOutcome['fair_catch'] == 1) & gunnerNearby]['risk'].mean()

# "Similar risk" plays lie within +/-0.05 of the reference risk. The upper
# bound has to be a `<` comparison: with `>` on both bounds the filter selects
# every play riskier than reference + 0.05 instead of a band around it.
similarToNoFairCatch = riskAndOutcome[(riskAndOutcome['risk'] > nearGunnerNoFairCatch - 0.05) & (riskAndOutcome['risk'] < nearGunnerNoFairCatch + 0.05)]
similarToFairCatch = riskAndOutcome[(riskAndOutcome['risk'] > nearGunnerFairCatch - 0.05) & (riskAndOutcome['risk'] < nearGunnerFairCatch + 0.05)]

# Misfield rates are converted to percentages for printing.
nearGunnerNoFairCatchMisfield = similarToNoFairCatch['result'].mean()*100
nearGunnerNoFairCatchReturn = similarToNoFairCatch['kickReturnYardage'].mean()

nearGunnerFairCatchMisfield = similarToFairCatch['result'].mean()*100
nearGunnerFairCatchReturn = similarToFairCatch['kickReturnYardage'].mean()

print("For returns as risky as fielded return with a nearby gunner " + str(round(nearGunnerNoFairCatchMisfield, 1)) + "% were misfielded and the average return yardage was " + str(round(nearGunnerNoFairCatchReturn, 2)) + " yards")
print("If the returner simply called for a fair catch, only " + str(round(nearGunnerFairCatchMisfield, 1)) + "% of similarly risky plays were misfielded, but the average return yardage was 0 yards")


Once again, the returner is more than doubling his risk of misfielding for a very small expected number of yards gained on the return.

### 3) It is ill advised to field a ball that you have run across the field to catch

From our model decomposition, another large risk factor is fielding punts that are 'off2Direction'. Putting this into numbers using our risk metric, we see that:

In [None]:
# Mean modeled risk of off2Direction catches made at speed > 1.5, versus all
# plays that are not off2Direction.
# NOTE(review): the comparison group has no speed filter -- confirm that this
# asymmetry is intended.
riskForCrossCatch = riskAndOutcome[(riskAndOutcome['off2Direction'] == 1) & (riskAndOutcome['speed'] > 1.5)]['risk'].mean()
riskForNotCrossCatch = riskAndOutcome[(riskAndOutcome['off2Direction'] == 0)]['risk'].mean()

# "Similar risk" plays lie within +/-0.05 of the reference risk. The upper
# bound has to be a `<` comparison: with `>` on both bounds the filter selects
# every play riskier than reference + 0.05 instead of a band around it.
similarToCrossCatch = riskAndOutcome[(riskAndOutcome['risk'] > riskForCrossCatch - 0.05) & (riskAndOutcome['risk'] < riskForCrossCatch + 0.05)]
similarToNotCrossCatch = riskAndOutcome[(riskAndOutcome['risk'] > riskForNotCrossCatch - 0.05) & (riskAndOutcome['risk'] < riskForNotCrossCatch + 0.05)]

# Misfield rates are converted to percentages for printing.
crossCatchMisfield = similarToCrossCatch['result'].mean()*100
crossCatchReturn = similarToCrossCatch['kickReturnYardage'].mean()

notCrossCatchMisfield = similarToNotCrossCatch['result'].mean()*100
notCrossCatchReturn = similarToNotCrossCatch['kickReturnYardage'].mean()

print("For returns as risky as those that were fielded in motion on the opposite side of the field " + str(round(crossCatchMisfield, 1)) + "% were misfielded and the average return yardage was " + str(round(crossCatchReturn, 2)) + " yards")
print("On the other hand, for returns as risky as returns that weren't fielded across the field " + str(round(notCrossCatchMisfield, 1)) + "% of similarly risky plays were misfielded, but the average return yardage was "+ str(round(notCrossCatchReturn, 2)) + " yards")

The returner is significantly increasing his risk for an even smaller return. As such, it is best that the returner avoids fielding the ball altogether.

While we have presented a few strategies here, please feel free to play around with the calculator that we have included at the bottom of the notebook to create and assess your own strategies.

## Next Steps
While our results above do lead to some valid and interesting strategies for returners, there are a couple of areas that our team would like to explore further. One idea is to incorporate max ball height into our model and risk calculator. Conventional wisdom says that a higher punt increases the odds of a misfield, so the addition of height to our model may increase accuracy and recall. Another idea is to create a more robust risk calculator (like the one below) so that coaches and players can create more detailed simulations to test out situations they may find themselves in. 


## Works Cited
https://towardsdatascience.com/logistic-regression-detailed-overview-46c4da4303bc

https://www.sciencedirect.com/topics/medicine-and-dentistry/logistic-regression-analysis

https://machinelearningmastery.com/what-is-imbalanced-classification/

https://www.analyticsvidhya.com/blog/2020/07/10-techniques-to-deal-with-class-imbalance-in-machine-learning/

https://towardsdatascience.com/decision-trees-in-machine-learning-641b9c4e8052

https://web.engr.oregonstate.edu/~tgd/publications/mcs-ensembles.pdf

https://builtin.com/data-science/precision-and-recall

## Risk-Reward Calculator

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual, FloatSlider
import ipywidgets as widgets

In [None]:
def status(not_fair_catch, equal_direction, off_one_direction, off_two_direction, gunner_distance_less_than_5_yards, returner_orientation_greater_than_15_degrees):
    """Print the misfield rate and mean return for plays as risky as the chosen situation.

    Every argument is a flag. A truthy flag narrows the reference plays taken
    from the module-level ``riskAndOutcome`` frame; a falsy flag leaves that
    condition unconstrained.

    Args:
        not_fair_catch: keep plays with ``fair_catch == 0``.
        equal_direction: keep plays whose ``equalDirection`` equals the flag.
        off_one_direction: keep plays whose ``off1Direction`` equals the flag.
        off_two_direction: keep plays whose ``off2Direction`` equals the flag.
        gunner_distance_less_than_5_yards: keep plays with
            ``closestGunnerDistance < 5``.
        returner_orientation_greater_than_15_degrees: keep plays with
            ``returnerOrAbs > 10``.

    Returns:
        None. The result is printed: the percentage of plays within +/-0.05 of
        the situation's mean risk that were misfielded, and their mean
        ``kickReturnYardage``. If no play matches the flags, a short notice is
        printed instead.
    """
    # Start from "every play" and narrow it once per active flag.
    selected = pd.Series(True, index=riskAndOutcome.index)

    if not_fair_catch:
        selected &= riskAndOutcome['fair_catch'] == 0
    if equal_direction:
        selected &= riskAndOutcome['equalDirection'] == equal_direction
    if off_one_direction:
        selected &= riskAndOutcome['off1Direction'] == off_one_direction
    if off_two_direction:
        selected &= riskAndOutcome['off2Direction'] == off_two_direction
    if gunner_distance_less_than_5_yards:
        selected &= riskAndOutcome['closestGunnerDistance'] < 5
    if returner_orientation_greater_than_15_degrees:
        # NOTE(review): the parameter name says 15 degrees but the threshold
        # here is 10 -- confirm which value is intended.
        selected &= riskAndOutcome['returnerOrAbs'] > 10

    situationalRisk = riskAndOutcome.loc[selected, 'risk'].mean()
    if pd.isna(situationalRisk):
        # No reference play matches; the band below would be empty and the
        # report would read "nan%".
        print("No plays in the data match the selected inputs")
        return

    # Plays whose modeled risk lies within +/-0.05 of the situation's mean risk.
    similar = riskAndOutcome[(riskAndOutcome['risk'] > situationalRisk - 0.05) & (riskAndOutcome['risk'] < situationalRisk + 0.05)]
    # The mean of `result` is a fraction; scale it so the printed "%" is right.
    situationalMisfield = similar['result'].mean() * 100
    situationalReturn = similar['kickReturnYardage'].mean()
    print("Based on the inputs, for similar risk plays, " + str(round(situationalMisfield, 1)) + "% were misfielded and the average return was " + str(round(situationalReturn,2)) + " yards")

# Each calculator input is passed to interact with an initial value of False.
calculator_flags = (
    'not_fair_catch',
    'equal_direction',
    'off_one_direction',
    'off_two_direction',
    'gunner_distance_less_than_5_yards',
    'returner_orientation_greater_than_15_degrees',
)
interact(status, **{flag: False for flag in calculator_flags})

