In [None]:
import re
import math
import pandas as pd
import numpy as np
import matplotlib.animation as animation
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from tqdm import tqdm

# Raise the pandas display limits so wide/long frames are shown without truncation.
for optionName in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(optionName, 1000)

# Matplotlib: render animations as JS/HTML, use a higher figure resolution,
# and turn interactive mode off so figures only appear on an explicit show().
plt.rcParams.update({
    "animation.html": "jshtml",
    'figure.dpi': 150,
})
plt.ioff()

Note that I load plays_with_both.csv, not plays.csv. This file is the same as the standard plays.csv except that it also has the full names of the covers and the targets.

In [None]:
# Load the augmented plays table and discard the 'Unnamed: 0' column
# (presumably a leftover index written by an earlier to_csv -- confirm).
playsCsvPath = "/kaggle/input/mybigdatabowl2021/plays_with_both.csv"
plays = pd.read_csv(playsCsvPath).drop(columns=['Unnamed: 0'])

# What are the most successful/unsuccessful target-cover pairs?
Given that we know who the target and the cover are for each play, we want to know whether some pairings are especially successful or unsuccessful. Success will be determined by whether the pass was complete or incomplete.

In [None]:
nan = np.nan

# Keep only plays that have a real target (not the '-' placeholder) and a
# primary cover, then renumber the rows from 0.
playsWithPair = (
    plays
    .query("target!='-'")
    .dropna(subset=['coverOne'])
    .reset_index(drop=True)
)

# Share of all plays that survived the filter, as a percentage.
retainedPct = round(100.0 * len(playsWithPair) / len(plays), 1)
print(str(retainedPct) + "%")

We lost about 20% of the plays. I think that's expected: these are all the plays that resulted in sacks, plus some other miscellaneous cases. Now, I need to tabulate the pairs and their results. To do that, I need a way to count coverTwo as an equal shareholder in the passResult. Let's create this DataFrame:

In [None]:
# Peek at the first filtered play to see which columns are available.
playsWithPair.iloc[:1]

In [None]:
# Plays that also list a second cover contribute one extra (target, cover) row,
# so the second cover gets the same credit/blame for the pass result.
coverTwoExists = playsWithPair.loc[playsWithPair["coverTwo"].notna()]

# Stack the primary-cover rows on top of the secondary-cover rows, column by column.
target = pd.concat([playsWithPair["target"], coverTwoExists["target"]])
cover = pd.concat([playsWithPair["coverOne"], coverTwoExists["coverTwo"]])
passResult = pd.concat([playsWithPair["passResult"], coverTwoExists["passResult"]])

# Assemble the long-format table and renumber its rows from 0.
pairResult = (
    pd.DataFrame()
    .assign(target=target, cover=cover, passResult=passResult)
    .reset_index(drop=True)
)
pairResult

In [None]:
# Number of completed passes ('C') for every target/cover pairing, most frequent first.
completedPasses = pairResult.loc[pairResult["passResult"] == 'C']
completedPasses.groupby(['target', 'cover']).count().sort_values(by="passResult", ascending=False)

Obviously, this does not mean that Xavier Rhodes is the worst cover in the league, especially given that Davante Adams is the league's best WR in general. What I need to look at is the percentage out of all plays.

In [None]:
pairKeys = ['target', 'cover']

# Completed passes per pairing.
completedPasses = pairResult.loc[pairResult["passResult"] == 'C']
pairCompletion = completedPasses.groupby(pairKeys).count().sort_values(by="passResult", ascending=False)

# All passes per pairing.
pairTotal = pairResult.groupby(pairKeys).count().sort_values(by="passResult", ascending=False)

# Completions / attempts; fill_value=0.0 makes a pairing with no completions
# come out as 0 instead of NaN.
pairCompletionRate = pairCompletion["passResult"].divide(pairTotal["passResult"], fill_value=0.0)
pairTotal = pairTotal.assign(completionRate=pairCompletionRate)

# Only show pairings that met more than 7 times.
frequentPairs = pairTotal.loc[pairTotal["passResult"] > 7]
frequentPairs.sort_values(by=['passResult', 'completionRate'], ascending=[False, True])

We can see that most pairs are above 50%, except for the Odell-James Bradberry pair. I think this is very impressive for James Bradberry.

However, in general, it looks like it's hard to be conclusive when grouping by pairs: the number of samples is too small. I think this makes sense, because if the teams identify a pairing as a great matchup (or a poor one, depending on which team you are rooting for), they would most likely switch things around to get a more favorable matchup.

# Who are the most successful covers?
Instead of looking at pairs, I think it might be more useful to look at which cover forced the lowest completionRate. Note that from here on out, we look at the INcompletion rate.

In [None]:
# Passes that were NOT completed, counted per cover.
incompletePasses = pairResult.loc[pairResult["passResult"] != 'C']
pairIncompletion = incompletePasses.groupby('cover').count().sort_values(by="passResult", ascending=False)

# All passes per cover.
pairTotal = pairResult.groupby('cover').count().sort_values(by="passResult", ascending=False)

# Incompletions / attempts; fill_value=0.0 gives 0 (not NaN) to a cover who
# never forced an incompletion.
coverIncompletionRate = pairIncompletion["passResult"].divide(pairTotal["passResult"], fill_value=0.0)

# Final per-cover table: rate first, then the number of times the player was in coverage.
pairTotal = (
    pairTotal
    .assign(incompletionRate=coverIncompletionRate)
    .drop(columns='target')
    .rename(columns={'passResult': 'coverCount'})
    .loc[:, ["incompletionRate", "coverCount"]]
    .sort_values(by=['coverCount', 'incompletionRate'], ascending=[False, True])
)
pairTotal.head(10)

Well, it looks like a 50% incompletion rate is insanely good when you look at this result. What is the baseline here? What is the average incompletionRate in the league?

In [None]:
# Unweighted means over covers (every player counts once, regardless of coverCount).
meanIncompletionRate = round(pairTotal["incompletionRate"].mean(), 3)
meanCoverCount = round(pairTotal["coverCount"].mean(), 1)

print("Mean Incompletion Rate:\t" + str(meanIncompletionRate))
print("Mean Cover Count:\t" + str(meanCoverCount))

Let's just plot this. It's difficult to see all the numbers:

In [None]:
plt.close('all')

# Scatter of every cover: how often they were in coverage vs. how often the
# pass ended up incomplete.
ax = pairTotal.plot.scatter(x='coverCount', y='incompletionRate', figsize=(10,5))

# Horizontal baseline: the unweighted mean incompletion rate over all covers.
ax.axhline(pairTotal.incompletionRate.mean(), c='C1')

# Finding the players on the edges
# Hand-tuned reference curve hugging the upper-right edge of the point cloud;
# the players closest to it are treated as the standouts.
tempX = np.arange(0,110,1)
tempY = np.exp(-0.03 * (tempX+10))+0.6
# ax.plot(tempX, tempY, c='C2')  # uncomment to draw the reference curve

# Distance from each player to the nearest sampled point of the curve.
# Broadcasting builds a (players x curve samples) grid in one shot; the x
# offset uses tempX itself rather than the sample's position in the array, so
# the result stays correct if the grid start or step is ever changed.
xOffsets = pairTotal.coverCount.to_numpy()[:, np.newaxis] - tempX[np.newaxis, :]
yOffsets = pairTotal.incompletionRate.to_numpy()[:, np.newaxis] - tempY[np.newaxis, :]
tempDistToLine = np.sqrt(xOffsets**2 + yOffsets**2).min(axis=1)

# Seven players nearest to the curve, highlighted and labelled by name.
temp = pairTotal.assign(dist=tempDistToLine)
edge_points = temp.sort_values('dist', ascending=True).head(7)
edge_points.plot.scatter(x='coverCount', y='incompletionRate',c='C3', ax=ax)
for name, p in edge_points.iterrows():
    # Nudge this one label upward (presumably to avoid overlapping a neighbour).
    labelY = p.incompletionRate+0.03 if 'Darius Slay' in name else p.incompletionRate
    ax.text(x=p.coverCount, y=labelY, s=name)
print(edge_points.index)

ax.set_xlim([0, 110])
ax.set_ylim([-.05, 1.05])
ax.set_title("Forced Incompletion Rate per Defensive Tries")
plt.show()

We can see that the above 7 players are exceptional when it comes to forcing incompletions.

In [None]:
# Show the highlighted players without the helper distance column, busiest cover first.
edge_points.drop(columns='dist').sort_values(by='coverCount', ascending=False)