# Introduction

The purpose of this analysis was to determine from player tracking data when a defence is playing man or zone coverage, and then to analyse the effectiveness of these defensive strategies against various offensive formations. A deeper dive was then done to break it down to the individual player level, to see which defenders play the tightest coverage, and how defenders stack up generally when they are operating in zone or man defence. 


# Man vs Zone Coverage

How can we tell from the data when a defender is in man or zone coverage? Whilst there are many possible defensive play calls which do not fit neatly into the category of strictly all man or all zone, this analysis approached the question by looking at each individual defensive player and measuring the distance between him and the receiver closest to him (presumed to be the opponent he is marking up against) at each frame from the snap up until the point when the pass arrives. If his marked opponent (the receiver) did not change throughout the whole play up until the pass arrived, it was assumed that this defensive player was in man coverage. 

The Free Safety position was left out of this however and assumed to be playing in some type of zone coverage at the back. Players categorised were therefore CBs, LBs and SSs. The code used to run this is included below (the code has been commented out to reduce runtime. Final results will be explored further on in the notebook).

This code returns a summary of all defensive players on the field at each time frame, listing the closest receiver to them and the distance between them at that point in time. 

In [None]:
"""import pandas as pd
import numpy as np

#function that returns the closest receiver to a defensive player at a given point in time, plus that distance
#  x, y      - coordinates of the defensive player
#  receivers - list-like of (displayName, x, y) records, one per receiver on the field in the same frame
#returns a two element list: [displayName of the nearest receiver, euclidean distance to him]
#when the frame holds no receivers at all the result is [nan, nan] instead of an exception
def coords(x, y, receivers):
    receivers = pd.DataFrame(receivers).rename(columns={0:'displayName', 1: 'x', 2: 'y'})
    if receivers.empty:
        return [np.nan, np.nan]
    #single point x,y coords for current defensive player
    defender = np.array([x, y]).astype(float)
    #matrix of receiver coordinates, converted explicitly so object typed input cannot reach the norm
    receiver_xy = receivers[['x', 'y']].to_numpy(dtype=float)
    distances = np.linalg.norm(receiver_xy - defender, axis=1)
    #argmin returns the first minimum, i.e. the earliest listed receiver wins a tie
    nearest = int(np.argmin(distances))
    return [receivers['displayName'].iloc[nearest], distances[nearest]]

#function that returns all receivers on the field plus their x, y coordinates at a given frameId during a play
#  db   - iterable of tracking rows, each an array that supports integer-list indexing (the caller passes np.array(data))
#  val  - the unique frame key to select
#  a, b - positions within a row of the frame key and of the simplified position label
#  cols - positions copied into every result; the default (10, 1, 2) is what coords() labels displayName, x, y
#returns a list with one array per row of that frame whose position label is 'receiver'
def listcompr(db, val, a, b, cols=(10, 1, 2)):
    wanted = list(cols)
    return [row[wanted] for row in db if row[a] == val and row[b] == 'receiver']
    
db = pd.read_csv('plays.csv')
positions = pd.read_csv('positions.csv')

for i in range(1, 18):
    
    data = pd.read_csv('week' + str(i) + '.csv')
    
    #posGen is a column added which just simplified all offensive players as a 'receiver' category and cleaned up defensive positions
    for pos in positions['desc'].unique():
        selection = data['position'].isin(np.array(positions[positions['desc'] == pos]['pos']))
        data.loc[selection, 'posGen'] = pos
        
    football = data['displayName'].isin(['Football'])
    data.loc[football, 'posGen'] = 'football'
    
    #create 'val' column which is a unique Id for each frame of each play
    data['val'] = data['gameId'].astype(str) + data['playId'].astype(str) + data['frameId'].astype(str) 
    valCol = data.columns.get_loc('val')
    posCol = data.columns.get_loc('posGen')
    n = np.array(data)
    allFrames = pd.DataFrame(data['val'].unique()).rename(columns={0:'val'})
    
    #return list of receivers on field with their respective coordinates at each unique frameId
    f = allFrames.apply(lambda row: listcompr(n, row['val'], valCol, posCol), axis=1)   
    allFrames['receivers'] = f
    
    data = pd.merge(data, allFrames, on=['val'])
    dbs = data['posGen'].isin(['db', 'lb'])
    distances = data.loc[dbs].apply(lambda row: coords(row['x'], row['y'], row['receivers']), axis=1)   
    
    data.loc[dbs, ['closestReceiver', 'distToReceiver']] = pd.DataFrame(distances).apply(lambda x: [x[0][0], x[0][1]], axis=1, result_type='expand').rename(columns={0:'closestReceiver', 1: 'distToReceiver'})

    defData = data.loc[data['posGen'].isin(['lb', 'db']), ['gameId', 'playId', 'frameId', 'nflId', 'displayName', 'position', 'closestReceiver', 'distToReceiver']].reset_index(drop=True)
    defData.to_csv('Defensive Data Week ' + str(i) + '.csv')
    print(i)"""

The code below determines for each play, for each defender, whether they were in man or zone coverage based on the definition established above. Based off this we establish various categories of defensive plays based on the combination of man and zone coverage implemented by the defenders playing in various positions. 

We then run a variety of calculations to determine how well a defender marks his opponent in a man coverage scheme by measuring the distance between the defender and the receiver at various points in the play up until the pass arrives. The information from these calculations will be used later to determine who are the best defenders in man or zone coverage. 

In [None]:
"""import pandas as pd
import numpy as np

#lookup tables: games, plays (tagged with the week of their game), the player list and the positions table
games = pd.read_csv('games.csv')
plays = pd.read_csv('plays.csv').merge(games[['gameId', 'week']], how='left', on='gameId')
players = pd.read_csv('players.csv')
positions = pd.read_csv('positions.csv')
#defender / closest receiver rows, tagged with the week of their game
#NOTE(review): the previous cell writes one 'Defensive Data Week <i>.csv' per week; 'defensiveData.csv' is presumably those files combined - confirm
allDistances = pd.read_csv('defensiveData.csv').merge(games[['gameId', 'week']], how='left', on='gameId')

#returns the frameId at which the pass arrived on the play (if that event occurred)
#when the play has no 'pass_arrived' row the highest frameId of the play is returned instead
def passArrived(db, gameId, playId):
    in_play = (db['gameId'] == gameId) & (db['playId'] == playId)
    try:
        arrival_frames = db.loc[in_play & (db['event'] == 'pass_arrived'), 'frameId']
        #first matching row wins; an empty selection raises and drops into the fallback
        return arrival_frames.iloc[0]
    except (KeyError, IndexError):
        return db.loc[in_play, 'frameId'].max()

#returns the number of defensive players at each position playing in man or zone coverage (according to definition of man vs zone coverage established previously)
#the result is ordered cbMan, cbZone, ssMan, ssZone, fsMan, fsZone, lbMan, lbZone
def man(db, gameId, playId):
    on_play = db[(db['gameId'] == gameId) & (db['playId'] == playId)]
    counts = []
    for position in ('CB', 'SS', 'FS', 'LB'):
        flags = on_play.loc[on_play['position'] == position, 'man']
        in_man = flags.sum()
        #everyone at the position who is not flagged as man counts as zone
        counts.extend([in_man, len(flags) - in_man])
    return pd.Series(counts)

#returns the defending team on the play
#i.e. whichever of the home / visitor abbreviations of the game is not the team in possession
def homeTeam(db, gameId, posTeam):
    game = db[db['gameId'] == gameId].reset_index(drop=True)
    home = game['homeTeamAbbr'][0]
    if home != posTeam:
        return home
    return game['visitorTeamAbbr'][0]

#three definitions for position groups.
#for each of cb, ss, fs and lb (in that order) the returned series holds
#  0 - more players of the position are in zone coverage than in man coverage
#  1 - more players of the position are in man coverage than in zone coverage
#  2 - neither count is larger than the other
def define(row):
    def majority(pos):
        n_zone = row[pos + 'Zone']
        n_man = row[pos + 'Man']
        if n_zone > n_man:
            return 0
        if n_man > n_zone:
            return 1
        return 2

    return pd.Series([majority(pos) for pos in ['cb', 'ss', 'fs', 'lb']])

#this returns, for one defender on one play, a series of four distances to his closest receiver:
#the distance at frameId 1, the mean over all his rows, the distance in his last row and the maximum
#(db is expected to hold only the rows up until the pass arrives); defenders not flagged as man get four NaN
def initDist(db, gameId, playId, player, man):
    if man != 1:
        return pd.Series([np.nan] * 4)
    track = db.loc[(db['gameId'] == gameId) & (db['playId'] == playId) & (db['nflId'] == player)]
    gaps = track['distToReceiver']
    init = gaps[track['frameId'] == 1].reset_index(drop=True)[0]
    avg = gaps.mean()
    #"final" is the last row in db order, not necessarily the highest frameId
    final = float(gaps.iloc[-1])
    return pd.Series([init, avg, final, gaps.max()])

#for any given play, this counts how often each event named in defMoves (eg tackle or interception) occurs on the play
#only the rows of the 'Football' are looked at; the result has one count per entry of defMoves, in the same order
def defenceMoves(db, gameId, playId, defMoves):
    ball_events = db.loc[(db['gameId'] == gameId) & (db['playId'] == playId) & (db['displayName'] == 'Football'), 'event']
    return pd.Series([(ball_events == move).sum() for move in defMoves])

#function to get the coordinates of the Football at any given relevant event, and calculate the closest offensive or defensive player to the ball
#  db             - tracking rows with a 'role' column
#  gameId, playId - the play to inspect
#  move           - event name; row[move] must be > 0 for anything to be computed
#  role           - value of the 'role' column the candidate players must have
#  row            - play summary row holding the event counts
#returns a series [nflId of the nearest player, his distance to the ball];
#both values are NaN when the event did not occur or when no player of that role is found
def closest(db, gameId, playId, move, role, row):
    if not row[move] > 0:
        return pd.Series([np.nan, np.nan])
    in_play = (db['gameId'] == gameId) & (db['playId'] == playId)
    ball = db.loc[in_play & (db['displayName'] == 'Football')]
    #determine at which frameId a specific event occurred
    frame = ball.loc[ball['event'] == move, 'frameId'].reset_index(drop=True)[0]
    #coordinates of the football at the time of the event; only the first ball row is used so that
    #a duplicated ball row cannot break or distort the subtraction below
    ball_xy = ball.loc[ball['frameId'] == frame, ['x', 'y']].to_numpy(dtype=float)[0]
    #players of the requested role in that frame, falling back to their rows tagged with the event itself
    candidates = db.loc[in_play & (db['role'] == role)]
    players = candidates.loc[candidates['frameId'] == frame]
    if players.empty:
        players = candidates.loc[candidates['event'] == move]
    if players.empty:
        #nobody to measure against: report "unknown" instead of failing on an empty minimum
        return pd.Series([np.nan, np.nan])
    players = players.reset_index(drop=True)[['nflId', 'displayName', 'x', 'y']]
    #distances between all players and the football to return the minimum
    distanceCalc = np.linalg.norm(players[['x', 'y']].to_numpy(dtype=float) - ball_xy, axis=1)
    nearest = int(np.argmin(distanceCalc))
    return pd.Series([players.loc[nearest, 'nflId'], distanceCalc[nearest]])
    
#function to return the closest defender to a given receiver
#  db   - defender distance rows carrying 'event', 'receiverId', 'nflId' and 'distToReceiver'
#  rec  - nflId of the receiver of interest
#  data - tracking rows, used only in the fallback branch
#  row  - play summary row; nothing is computed unless row['pass_arrived'] > 0
#returns a two element series, [NaN, NaN] when the pass never arrived
def closestDef(db, gameId, playId, rec, data, row):
    if row['pass_arrived'] > 0:
        try:
            #defenders whose closest receiver at the pass_arrived event is rec, sorted by ascending distance
            #NOTE(review): the statement below is cut off in the reviewed source (it ends in 'reset_inde$'), so its tail is unknown; presumably it takes the first row - confirm
            return pd.Series(np.array(db.loc[(db['gameId'] == gameId) & (db['playId'] == playId) & (db['event'] == 'pass_arrived') & (db['receiverId'] == rec)][['nflId', 'distToReceiver']].sort_values('distToReceiver', ascending=True).reset_inde$
        except IndexError:
            #fallback: measure directly from the tracking data at the pass_arrived event
            #coordinates of the receiver
            a = np.array(data.loc[(data['gameId'] == gameId) & (data['playId'] == playId) & (data['nflId'] == rec) & (data['event'] == 'pass_arrived')].reset_index(drop=True).iloc[0][['x', 'y']]).astype(float)
            #vector of coordinates for defenders
            defence = data.loc[(data['gameId'] == gameId) & (data['playId'] == playId) & (data['role'] == 'def') & (data['event'] == 'pass_arrived')].reset_index(drop=True)[['nflId', 'displayName', 'x', 'y']]
            b = np.array(defence[['x', 'y']]).astype(float)
            distanceCalc = np.linalg.norm(b-a, axis=1)
            #nflId of the first defender at the minimum distance, plus that distance
            return pd.Series([defence.loc[np.where(distanceCalc == np.min(distanceCalc))[0][0], 'nflId'], min(distanceCalc)])
    else:
        return pd.Series([np.nan, np.nan])
                                      
#function that checks whether the defender closest to the target receiver when the pass arrived (dMan)
#was already marking that same receiver (rec) in man coverage at frameId 1 of the play
#  db  - defender distance rows carrying 'frameId', 'receiverId', 'man' and 'nflId'
#  row - play summary row; the check only runs when row['pass_arrived'] > 0
#returns 1 if these conditions are satisfied, else 0
def isMan(db, gameId, playId, rec, dMan, row):
    if not row['pass_arrived'] > 0:
        return 0
    first_frame = (db['gameId'] == gameId) & (db['playId'] == playId) & (db['frameId'] == 1)
    markers = db.loc[first_frame & (db['receiverId'] == rec) & (db['man'] == 1), 'nflId']
    #compare against every man defender on this receiver, not just the first row found, so that
    #a double-covered receiver still credits the defender who ended up closest
    return int((markers == dMan).any())
                                      
#build the play level and defender level summaries one week file at a time and export them as csv
for i in range(1, 18):
    data = pd.read_csv('week' + str(i) + '.csv')
    #need to drop duplicates
    data = data.drop_duplicates().reset_index(drop=True)
    #distances is the dataframe that we produced in the previous code above of the information regarding the closest receiver to a defender and the distance between them at each frameId
    distances = allDistances[allDistances['week'] == i].reset_index(drop=True)    
    distances = distances.drop_duplicates().reset_index(drop=True)
    #attach the event label of each frame to the distance rows
    distances = distances.merge(data[['gameId', 'playId', 'frameId', 'event']].drop_duplicates().reset_index(drop=True), how='left', on=['gameId', 'playId', 'frameId'])
                                      
    # filter for pre pass_arrived data
    current = plays[plays['week'] == i].reset_index(drop=True)
    current['passArrivedFrame'] = pd.Series(current.apply(lambda x: passArrived(data, x['gameId'], x['playId']), axis=1))
    
    #the below dataframe filters out events after the pass arrives 
    dist = distances.merge(current[['gameId', 'playId', 'passArrivedFrame']], how='left', on=['gameId', 'playId'])
    dist = dist[dist['frameId'] <= dist['passArrivedFrame']].reset_index(drop=True)
    #count: number of rows in which a given receiver was the closest receiver of a given defender on the play
    df = dist.groupby(['gameId', 'playId', 'nflId', 'position',  'closestReceiver']).size().reset_index().rename(columns={0:'count'})
    #frames: highest remaining frameId of the play
    frames = dist.groupby(['gameId', 'playId'])['frameId'].max().reset_index().rename(columns={'frameId':'frames'})
    df = df.merge(frames, how='left', on=['gameId', 'playId'])
    
    #several methods to determine whether a player is in man or zone coverage
    #a defender is flagged as man when one receiver was his closest receiver in more than 97.5% of the frames
    df['manPercentage'] = df['count'] / df['frames']
    df['man'] = [1 if x > 0.975 else 0 for x in df['manPercentage']]
                                      
    # change all LB to be LB
    df['position'] = df['position'].replace(['MLB', 'OLB', 'ILB'], 'LB')
                                      
    #based on the fact that there are more FS than SS registered, we will changed S and DB to be SS for now
    df['position'] = df['position'].replace(['DB', 'S'], 'SS')
                                      
    #this creates a dataframe which lists each defensive player for each play and whether they were in a man or zone coverage                                  
    #(sorting man descending before dropping duplicates keeps the man row of a defender whenever he has one)
    defenders = df.sort_values(['gameId', 'playId', 'man'], ascending=[True, True, False]).drop_duplicates(subset=['gameId', 'playId', 'nflId', 'position'], keep='first').reset_index(drop=True)
    
    #return the number of players in each position playing in a certain type of coverage
    current[['cbMan', 'cbZone', 'ssMan', 'ssZone', 'fsMan', 'fsZone', 'lbMan', 'lbZone']] = current.apply(lambda x: man(defenders, x['gameId'], x['playId']), axis=1)
    
    #insert defending team
    current['defTeam'] = current.apply(lambda x: homeTeam(games, x['gameId'], x['possessionTeam']), axis=1)
    
    #count of players by position
    current[['cb', 'ss', 'fs', 'lb']] = current.apply(define, axis=1)
                                      
    #drop plays that don't have defender info
    current['defenderCount']  = current.loc[:, 'cbMan':'lbZone'].sum(1)
    current = current[current['defenderCount'] > 0].reset_index(drop=True)
                                      
    #get initial, average, final and max distance between defenders and they marked opponent during each play
    defenders[['init', 'avg', 'final', 'maxDist']] = defenders.apply(lambda x: initDist(dist, x['gameId'], x['playId'], x['nflId'], x['man']), axis=1)
    
    #next get list of events that we are interested in to see which defensive player was responsible
    defMoves = ['pass_outcome_incomplete', 'first_contact', 'qb_sack', 'pass_outcome_interception', 'tackle', 'pass_outcome_caught', 'pass_tipped', 'qb_strip_sack', 'fumble', 'out_of_bounds', 'pass_outcome_touchdown']
    current[defMoves] = current.apply(lambda x: defenceMoves(data, x['gameId'], x['playId'], defMoves), axis=1)
                                      
    #now we want to go through the columns, and if there is a 1 in the column, we will get the closest defender from the football at the timeframe in which the specific event happens
    data = data.merge(positions[['position', 'role']], on='position', how='left')
    for move in defMoves:
        current[move + '_id'] = np.nan
        current[move + '_dist'] = np.nan
        d = current[current[move] > 0].apply(lambda x: closest(data, x['gameId'], x['playId'], move, 'def', x), axis=1)
        loc = current[move] > 0
        current.loc[loc, [move + '_id', move + '_dist']] = d.rename(columns={0:move + '_id', 1:move + '_dist'})

    #for pass incomplete and pass caught, based on the closest defender we can look up their respective opponent in the distances data at the same event frameId
    #we then calculate the distance between the football and the offensive player. filter cases where the football is within 2 - 3 yards of the receiver (to take out errant throws)
    # we can then see how the probability of a complete play changes with distance between the defender and receiver. CNN model, random forest, k nearest neighbours all possibilities
    #for a player in man coverage, measure the percentage of which he allows receptions, then also measure the average YAC when he is the man marking his opponent
    feature = ['pass_arrived']
    current['pass_arrived'] = current.apply(lambda x: defenceMoves(data, x['gameId'], x['playId'], feature), axis=1)
                                      
    #get closest receiver at the time of the pass_arrived event
    current[['target_id', 'target_dist']] = current.apply(lambda x: closest(data, x['gameId'], x['playId'], 'pass_arrived', 'off', x), axis=1)
    current = current.merge(players[['nflId', 'displayName']].rename(columns={'nflId':'target_id'}), how='left', on='target_id').rename(columns={'displayName':'targetName'})
                                      
    #then lookup in distances the closest defender to this player. 
    #check maps the closestReceiver name of each play to his nflId (stored as receiverId)
    check = data[['gameId', 'playId', 'nflId', 'displayName']].drop_duplicates().reset_index(drop=True).rename(columns={'nflId':'receiverId', 'displayName':'closestReceiver'})
    distances = distances.merge(check, on=['gameId', 'playId', 'closestReceiver'], how='left')
    #NOTE(review): this merge reads defenders['displayName'], but the groupby that builds defenders keeps only gameId, playId, nflId, position and closestReceiver - confirm the column exists
    distances = distances.merge(defenders[['gameId', 'playId', 'displayName', 'man']], on=['gameId', 'playId', 'displayName'], how='left')
    current[['coverage_id', 'distToReceiver']] = current.apply(lambda x: closestDef(distances, x['gameId'], x['playId'], x['target_id'], data, x), axis=1)
    
    #we would also want to check who was marking this receiver in the first frame, and whether it is a man coverage or not. If man, it would be a negative outcome for the defender 
    current['man'] = current.apply(lambda x: isMan(distances, x['gameId'], x['playId'], x['target_id'], x['coverage_id'], x), axis=1)
    #if the receiver isn't found it is likely that the player isn't in a tight man coverage
    #note that the distance between receiver and defender will not be very effective for lbs covering the hb as they may be closer to a tight end but that may not be their man
    
    #define the play was successful is the epa of the play was greater than 0
    current['success'] = (current['epa'] > 0).astype(int)
    
    #now we have our relevant dataframes ready
    current.to_csv('playsSummaryWeek' + str(i) + '.csv')
    #NOTE(review): defPlayers is not assigned anywhere in the visible code, so this line would raise a NameError and the two exports below would never run - confirm which dataframe was meant
    defPlayers.to_csv('defPlayersWeek' + str(i) + '.csv')
    distances.to_csv('distancesWeek' + str(i) + '.csv')
    defenders.to_csv('defendersWeek' + str(i) + '.csv')
"""

# The Results: Do defenses perform better in zone or man coverage?



Below is a breakdown of defensive performance based on various classifications of defensive schemes. We break down whether the defence is in man or zone coverage by position. The different combinations are then sorted by the average offensive epa achieved against those defensive setups from lowest to highest. The success column is the average success rate of an offensive play against each defensive setup, where a play is successful if the epa > 0. The two most successful defensive schemes both occur with CBs in zone coverage, with the most successful having the SSs and LBs in man coverage. 

In [None]:
import pandas as pd
# Load the player list plus the pre-computed play-level (`current`) and
# defender-level (`defenders`) tables.
players = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/players.csv')
current = pd.read_csv('/kaggle/input/mydata/current.csv')
defenders = pd.read_csv('/kaggle/input/mydata/defenders.csv')

# Average offensive epa and success rate per (cb, ss, lb) scheme, lowest epa first.
# Plays where any of the three position groups is split evenly (code 2) are left out.
schemeCols = ['cb', 'ss', 'lb']
clearScheme = (current[schemeCols] != 2).all(axis=1)
coverage = (
    current[clearScheme]
    .groupby(schemeCols)[['epa', 'success']]
    .mean()
    .reset_index()
    .sort_values(['epa'], ascending=True)
    .reset_index(drop=True)
)

# Translate the 0/1 codes into labels in the scheme columns only, so that an
# epa or success value of exactly 0 or 1 is never rewritten as a label.
coverage[schemeCols] = coverage[schemeCols].replace({0: "zone", 1: "man"})

coverage.style \
  .format('{:.2f}', subset=['epa', 'success']) \
  .bar(align='mid', color=['#90EE90', '#FCC0CB'], subset=['epa', 'success']) \
  .set_caption('Defensive performance by coverage (Man/Zone)') \
  .set_properties(padding="15px", border='2px solid white', width='50px', subset=['cb', 'ss', 'lb']) \
  .set_properties(padding="15px", border='2px solid white', width='200px', subset=['epa', 'success'])

# CBs should play more zone coverage

To get a better indication of how the defense performs by man or zone coverage, we can isolate one position at a time. The table below shows the average offensive epa per play when each of the positions is playing in zone (0), man (1) or split (2) coverage. (Split would be where, for example, 1 CB plays in man and 1 in zone coverage.) From the results we can see that the defense performed better (i.e. forced a lower average offensive epa per play) when CBs, SSs, and FSs played in zone coverage, which is especially significant for the CB position. This might be useful for defensive playcallers as the CB position is the most likely to operate in a man coverage. The differences between the defensive performance for different LB defensive schemes are minimal, but there is a slight advantage when they play in man coverage.

In [None]:
# Average offensive epa per play for every position group, one row per coverage
# code found in the data. The rows stay keyed by the code itself, so a position
# that never shows one of the codes gets NaN in that row instead of having its
# remaining values shifted up under the wrong code.
summary = pd.DataFrame({
    pos: current.groupby(pos)['epa'].mean()
    for pos in ['cb', 'ss', 'fs', 'lb']
}).rename_axis(None)

summary.style \
  .format('{:.2f}') \
  .background_gradient(cmap='winter_r') \
  .set_properties(padding="15px", border='2px solid white', width='100px')

# Should defenses change their defensive play calls based on the offensive formation?

We see experienced QBs making audibles all the time based on what they see in the defensive set up in front of them, but how often does the defensive coordinator let their players change things up after seeing what kind of formation the offense sets up in? Obviously at times there would not be enough time to call defensive audibles just before the ball is snapped, but maybe there would be ways to implement some changes based on the offensive formation if the observed results could be seen to occur season in, season out - in this case one year of data may not be enough to justify any overhauls to the defensive playcalling system. But nevertheless! The results below show the average offensive epa per play by offensive formation and defensive scheme. This reinforces what we saw previously: in the most common pass formations (e.g. SHOTGUN), a zone coverage scheme fares considerably better. 

In [None]:
def _epaByFormation(pos):
    """Return mean epa per offensive formation for one position group.

    The result has the columns offenseFormation, <pos>Zone and <pos>Man. The
    zone (0) and man (1) columns are picked by their code rather than by
    position, so a code missing from the data yields a NaN column instead of
    silently mislabelling another one.
    """
    table = current.groupby(['offenseFormation', pos])['epa'].mean().unstack()
    table = table.reindex(columns=[0, 1])
    table.columns = [pos + 'Zone', pos + 'Man']
    return table.reset_index()


cb = _epaByFormation('cb')
lb = _epaByFormation('lb')
ss = _epaByFormation('ss')
allDef = cb.merge(lb, on='offenseFormation',  how='left').merge(ss, on='offenseFormation',  how='left').transpose().reset_index()

# After the transpose the first row holds the formation names: promote it to the header.
headings = allDef.iloc[0].reset_index(drop=True)
allDef.columns = headings
allDef = allDef.iloc[1:].reset_index(drop=True)

# Style whichever formations are actually present instead of a fixed list of names.
formations = [col for col in allDef.columns if col != 'offenseFormation']

allDef.style \
  .format('{:.2f}', subset=formations) \
  .background_gradient(cmap='winter_r', subset=formations)\
  .set_caption('Breakdown of defensive performance by offensive formation and man vs zone coverage') \
  .set_properties(padding="15px", border='2px solid white', width='20px', subset=['offenseFormation']) \
  .set_properties(padding="15px", border='2px solid white', width='150px', subset=formations)

# Who is the best defensive back in tight man coverage?

From the information we produced from the code run above, we can now start to play around at the individual level in order to determine who are the best individual defensive players using the tracking data that we have available to us. Below is a summary of the top 10 defensive backs by final distance between themselves and their marked opponent (when in a man coverage assignment) at the time the pass arrives at its intended target. NOTE that this covers all throws, not specifically when the ball is thrown to the man they are marking, so this may affect the numbers if for example a defender drifts off his man when the ball is in flight prior to arriving if they notice that it is not going to their marked opponent. We will account for this in the next section.

In [None]:
# Defensive backs on plays where they were flagged as man coverage.
manBacks = defenders[(defenders['man'] == 1) & (defenders['position'].isin(['FS', 'SS', 'CB']))]
byPlayer = manBacks.groupby(['displayName', 'position'])

# Mean of the per-play average and final distance for each player, plus the
# number of plays behind those means; smallest final distance first.
playerCoverage = byPlayer[['avg', 'final']].mean().reset_index().dropna()
playerCoverage['count'] = byPlayer['avg'].count().reset_index()['avg']
playerCoverage = playerCoverage.sort_values('final', ascending=True).reset_index(drop=True)

# Top 10 among players with more than 10 such plays.
tightest = playerCoverage[playerCoverage['count'] > 10].drop(['count'], axis=1).head(10).style.format({"avg": "{:.2f}", 
                          "final": "{:.2f}"})
# Styler.hide_index() no longer exists in pandas 2.0; Styler.hide(axis='index')
# replaces it. Fall back to the old method on versions that lack Styler.hide.
tightest = tightest.hide(axis='index') if hasattr(tightest, 'hide') else tightest.hide_index()
tightest.bar(align='mid', subset=["avg",], color='lightgreen')\
        .bar(align='mid', subset=["final"], color='lightblue') \
        .set_properties(padding="15px", border='2px solid white', width='150px', subset=['displayName', 'position', 'avg', 'final'])

# Positional breakdown: Top 10 man coverage defenders at each position

As mentioned above, the above rankings took into account routes run on all plays, regardless of whether the ball was actually thrown to the receiver that the defender was marking up against. In the summary tables below, we account for this, and return a measure that determines the ability of a player to break up a pass when playing in man coverage. The results for the top 10 man coverage defenders at each position are displayed below, with plenty of familiar names, but some that may not be as expected. 

The summary tables bring in the average distance between the defender and their opponent when they are playing in man coverage and their opponent is targeted, as well as the average epa per such play, the percentage of successful plays that occurred (where success = 1 when epa > 0), and the percentage of completed passes when the specific defender is in coverage. The tables are sorted by the completion percentage, whereby a lower completion rate is a better defensive outcome. 

The data is filtered for cases where the football is within 2.5 yards of the targeted receiver to avoid including cases where the QB throws an errant pass and we are incorrectly assigning that as a successful defensive play made by a specific defender who was within the area. 

In [None]:
# Collapse the linebacker variants into LB and the generic DB / S labels into SS.
players['position'] = players['position'].replace(['MLB', 'OLB', 'ILB'], 'LB')
players['position'] = players['position'].replace(['DB', 'S'], 'SS')

# Plays on which the defender in coverage was flagged as man-marking the target.
manPlays = current[current['man'] == 1]

# Mean distance, epa and success per defender, restricted to throws where the
# ball was within 2.5 yards of the targeted receiver.
manStats = manPlays[manPlays['target_dist'] < 2.5].groupby('coverage_id')[['distToReceiver', 'epa', 'success']].mean().reset_index()

# Pass results per defender.
# NOTE(review): unlike manStats, these counts are not restricted by target_dist - confirm this is intended.
manCompletion = manPlays.groupby(['coverage_id', 'passResult']).size().unstack(fill_value=0).reset_index()
# A result that never occurs in this subset has no column yet; add it as 0 so
# the arithmetic below cannot fail with a KeyError.
for result in ['C', 'I', 'IN']:
    if result not in manCompletion.columns:
        manCompletion[result] = 0
manCompletion['completion (%)'] = manCompletion['C'] / (manCompletion['C'] + manCompletion['I'] + manCompletion['IN'])
manCompletion['count'] = (manCompletion['C'] + manCompletion['I'] + manCompletion['IN'])
manStats = manStats.merge(manCompletion[['coverage_id', 'completion (%)', 'count']], on='coverage_id', how='left')

#coverage stats looks at the average dist a defender is from their opposite man when they were playing in man coverage, as well as the average offensive epa plus the percentage of successful offensive plays
manStats  = manStats.merge(players[['nflId', 'displayName']].rename(columns={'nflId':'coverage_id'}), how='left', on='coverage_id')
# Defenders with more than 10 counted passes, lowest completion rate first, with their position attached.
manRankings = manStats[manStats['count'] > 10].sort_values('completion (%)', ascending=True).reset_index(drop=True).rename(columns={'coverage_id':'nflId'}).merge(players[['nflId', 'position']], on='nflId', how='left')


def _manCoverageTable(position, caption):
    """Return the styled top-10 table of manRankings for one position."""
    shown = ['displayName', 'distToReceiver', 'epa', 'success', 'completion (%)']
    table = manRankings[manRankings['position'] == position][shown].head(10).style.format({"distToReceiver": "{:.2f}", 
                                   "epa": "{:.2f}", "success": "{:.2f}", "completion (%)": "{:.2f}"})
    # Styler.hide_index() no longer exists in pandas 2.0; Styler.hide(axis='index')
    # replaces it. Fall back to the old method on versions that lack Styler.hide.
    table = table.hide(axis='index') if hasattr(table, 'hide') else table.hide_index()
    return table.set_caption(caption) \
                .background_gradient(cmap='Purples', subset=['distToReceiver']) \
                .background_gradient(cmap='Reds', subset=['epa']) \
                .background_gradient(cmap='winter_r', subset=['completion (%)']) \
                .bar(align='mid', subset=["success"], color='orange') \
                .set_properties(padding="15px", border='2px solid white', width='200px', subset=shown)


a = _manCoverageTable('CB', 'Best CBs in man coverage')
b = _manCoverageTable('SS', 'Best SSs in man coverage')
c = _manCoverageTable('LB', 'Best LBs in man coverage')

display(a)
display(b)
display(c)

# Positional breakdown: Top 10 zone coverage defenders at each position

Similarly as above, we can determine from the data who are the best defenders in zone coverages. These defenders may be considered as better readers of the game, whereby they can react better to the offensive play unfolding to make a play. Definitions are the same as described above for the man coverage analysis. 

In [None]:
# Plays on which the defender in coverage was not flagged as man-marking the target.
zonePlays = current[current['man'] == 0]

# Mean distance, epa and success per defender, restricted to throws where the
# ball was within 2.5 yards of the targeted receiver.
zoneStats = zonePlays[zonePlays['target_dist'] < 2.5].groupby('coverage_id')[['distToReceiver', 'epa', 'success']].mean().reset_index()

# Pass results per defender.
# NOTE(review): unlike zoneStats, these counts are not restricted by target_dist - confirm this is intended.
zoneCompletion = zonePlays.groupby(['coverage_id', 'passResult']).size().unstack(fill_value=0).reset_index()
# A result that never occurs in this subset has no column yet; add it as 0 so
# the arithmetic below cannot fail with a KeyError.
for result in ['C', 'I', 'IN']:
    if result not in zoneCompletion.columns:
        zoneCompletion[result] = 0
zoneCompletion['completion (%)'] = zoneCompletion['C'] / (zoneCompletion['C'] + zoneCompletion['I'] + zoneCompletion['IN'])
zoneCompletion['count'] = (zoneCompletion['C'] + zoneCompletion['I'] + zoneCompletion['IN'])
zoneStats = zoneStats.merge(zoneCompletion[['coverage_id', 'completion (%)', 'count']], on='coverage_id', how='left')

#coverage stats looks at the average dist a defender is from the targeted receiver when they were playing in zone coverage, as well as the average offensive epa plus the percentage of successful offensive plays
zoneStats  = zoneStats.merge(players[['nflId', 'displayName']].rename(columns={'nflId':'coverage_id'}), how='left', on='coverage_id')
# Defenders with more than 10 counted passes, lowest completion rate first, with their position attached.
zoneRankings = zoneStats[zoneStats['count'] > 10].sort_values('completion (%)', ascending=True).reset_index(drop=True).rename(columns={'coverage_id':'nflId'}).merge(players[['nflId', 'position']], on='nflId', how='left')

a = zoneRankings[zoneRankings['position'] == 'CB'][['displayName', 'distToReceiver', 'epa', 'success', 'completion (%)']].head(10).style.format({"distToReceiver": "{:.2f}", 
                                   "epa": "{:.2f}", "success": "{:.2f}", "completion (%)": "{:.2f}"})
# Styler.hide_index() no longer exists in pandas 2.0; Styler.hide(axis='index')
# replaces it. Fall back to the old method on versions that lack Styler.hide.
a = a.hide(axis='index') if hasattr(a, 'hide') else a.hide_index()
a = a.set_caption('Best CBs in zone coverage') \
     .background_gradient(cmap='Purples', subset=['distToReceiver']) \
     .background_gradient(cmap='Reds', subset=['epa']) \
     .background_gradient(cmap='winter_r', subset=['completion (%)']) \
     .bar(align='mid', subset=["success"], color='orange') \
     .set_properties(padding="15px", border='2px solid white', width='200px', subset=['displayName','distToReceiver', 'epa', 'success', 'completion (%)'])


b = zoneRankings[zoneRankings['position'] == 'SS'][['displayName', 'distToReceiver', 'epa', 'success', 'completion (%)']].head(10).style.format({"distToReceiver": "{:.2f}", 
                                   "epa": "{:.2f}", "success": "{:.2f}", "completion (%)": "{:.2f}"})\
                 .hide_index()\
                 .set_caption('Best SSs in zone coverage') \
                 .background_gradient(cmap='Purples', subset=['distToReceiver']) \
                 .background_gradient(cmap='Reds', subset=['epa']) \
                 .background_gradient(cmap='winter_r', subset=['completion (%)']) \
                 .bar(align='mid', subset=["success"], color='orange') \
                 .set_properties(padding="15px", border='2px solid white', width='200px', subset=['displayName','distToReceiver', 'epa', 'success', 'completion (%)'])

c = zoneRankings[zoneRankings['position'] == 'FS'][['displayName', 'distToReceiver', 'epa', 'success', 'completion (%)']].head(10).style.format({"distToReceiver": "{:.2f}", 
                                   "epa": "{:.2f}", "success": "{:.2f}", "completion (%)": "{:.2f}"})\
                 .hide_index()\
                 .set_caption('Best FSs in zone coverage') \
                 .background_gradient(cmap='Purples', subset=['distToReceiver']) \
                 .background_gradient(cmap='Reds', subset=['epa']) \
                 .background_gradient(cmap='winter_r', subset=['completion (%)']) \
                 .bar(align='mid', subset=["success"], color='orange') \
                 .set_properties(padding="15px", border='2px solid white', width='200px', subset=['displayName','distToReceiver', 'epa', 'success', 'completion (%)'])

d = zoneRankings[zoneRankings['position'] == 'LB'][['displayName', 'distToReceiver', 'epa', 'success', 'completion (%)']].head(10).style.format({"distToReceiver": "{:.2f}", 
                                   "epa": "{:.2f}", "success": "{:.2f}", "completion (%)": "{:.2f}"})\
                 .hide_index()\
                 .set_caption('Best LBs in zone coverage') \
                 .background_gradient(cmap='Purples', subset=['distToReceiver']) \
                 .background_gradient(cmap='Reds', subset=['epa']) \
                 .background_gradient(cmap='winter_r', subset=['completion (%)']) \
                 .bar(align='mid', subset=["success"], color='orange') \
                 .set_properties(padding="15px", border='2px solid white', width='200px', subset=['displayName','distToReceiver', 'epa', 'success', 'completion (%)'])

display(a)
display(b)
display(c)
display(d)

# Who's the best in contested situations? Our top 10 (Man coverage)

We now filter the dataset again to see who is best at breaking up the pass when they are within contestable distance of the receiver attempting to make a catch. Prior to this, we were including the data even if the receiver made a wide-open catch without pressure and was then tackled (or not) by the closest defender. This gives us an idea of who is best in a contested situation. The summary below displays the top 10 defenders across all positions based on the above criteria.

In [None]:
# Contested man-coverage rows: man == 1 with both target_dist and distToReceiver below 2.5.
manContested = current[(current['man'] == 1) & (current['target_dist'] < 2.5) & (current['distToReceiver'] < 2.5)]

# Per-defender averages over the contested rows.
manStats = manContested.groupby('coverage_id')[['distToReceiver', 'epa', 'success']].mean().reset_index()

# Pass outcomes per defender, counted on the SAME contested rows. Previously these were
# counted over every man == 1 row, so the completion rate used as the sort key (and the
# count > 10 threshold) did not reflect contested situations at all.
manCompletion = manContested.groupby(['coverage_id', 'passResult']).size().unstack(fill_value=0).reset_index()
# Make sure the three outcome columns used below exist even if a code never occurs.
for _outcome in ('C', 'I', 'IN'):
    if _outcome not in manCompletion.columns:
        manCompletion[_outcome] = 0
# 'completion (%)' is stored as a fraction (C / (C + I + IN)); 'count' is that denominator.
manCompletion['completion (%)'] = manCompletion['C'] / (manCompletion['C'] + manCompletion['I'] + manCompletion['IN'])
manCompletion['count'] = (manCompletion['C'] + manCompletion['I'] + manCompletion['IN'])
manStats = manStats.merge(manCompletion[['coverage_id', 'completion (%)', 'count']], on='coverage_id', how='left')

# Attach display names, keep defenders with more than 10 counted contested passes,
# order by lowest completion fraction first, then attach positions.
manStats = manStats.merge(players[['nflId', 'displayName']].rename(columns={'nflId': 'coverage_id'}), how='left', on='coverage_id')
manRankings = (
    manStats[manStats['count'] > 10]
    .sort_values('completion (%)', ascending=True)
    .reset_index(drop=True)
    .rename(columns={'coverage_id': 'nflId'})
    .merge(players[['nflId', 'position']], on='nflId', how='left')
)

# 'position' is shown because this table mixes all positions (as the zone contested table does).
manTableColumns = ['displayName', 'position', 'distToReceiver', 'epa', 'success', 'completion (%)']
a = manRankings[manTableColumns].head(10).style.format({"distToReceiver": "{:.2f}",
                                   "epa": "{:.2f}", "success": "{:.2f}", "completion (%)": "{:.2f}"})
# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index')
# replaces it from pandas 1.4 on, so use whichever this version provides.
a = a.hide(axis='index') if hasattr(a, 'hide') else a.hide_index()
a = (
    a.set_caption('Best defenders in man coverage in the contest')
    .background_gradient(cmap='Purples', subset=['distToReceiver'])
    .background_gradient(cmap='Reds', subset=['epa'])
    .background_gradient(cmap='winter_r', subset=['completion (%)'])
    .bar(align='mid', subset=["success"], color='orange')
    .set_properties(padding="15px", border='2px solid white', width='200px', subset=manTableColumns)
)

display(a)

# Who's the best in contested situations? Our top 10 (Zone coverage)


In [None]:
# Contested zone-coverage rows: man == 0 with both target_dist and distToReceiver below 2.5.
zoneContested = current[(current['man'] == 0) & (current['target_dist'] < 2.5) & (current['distToReceiver'] < 2.5)]

# Per-defender averages over the contested rows.
zoneStats = zoneContested.groupby('coverage_id')[['distToReceiver', 'epa', 'success']].mean().reset_index()

# Pass outcomes per defender, counted on the SAME contested rows. Previously these were
# counted over every man == 0 row, so the completion rate used as the sort key (and the
# count > 10 threshold) did not reflect contested situations at all.
zoneCompletion = zoneContested.groupby(['coverage_id', 'passResult']).size().unstack(fill_value=0).reset_index()
# Make sure the three outcome columns used below exist even if a code never occurs.
for _outcome in ('C', 'I', 'IN'):
    if _outcome not in zoneCompletion.columns:
        zoneCompletion[_outcome] = 0
# 'completion (%)' is stored as a fraction (C / (C + I + IN)); 'count' is that denominator.
zoneCompletion['completion (%)'] = zoneCompletion['C'] / (zoneCompletion['C'] + zoneCompletion['I'] + zoneCompletion['IN'])
zoneCompletion['count'] = (zoneCompletion['C'] + zoneCompletion['I'] + zoneCompletion['IN'])
zoneStats = zoneStats.merge(zoneCompletion[['coverage_id', 'completion (%)', 'count']], on='coverage_id', how='left')

# Attach display names, keep defenders with more than 10 counted contested passes,
# order by lowest completion fraction first, then attach positions.
zoneStats = zoneStats.merge(players[['nflId', 'displayName']].rename(columns={'nflId': 'coverage_id'}), how='left', on='coverage_id')
zoneRankings = (
    zoneStats[zoneStats['count'] > 10]
    .sort_values('completion (%)', ascending=True)
    .reset_index(drop=True)
    .rename(columns={'coverage_id': 'nflId'})
    .merge(players[['nflId', 'position']], on='nflId', how='left')
)

zoneTableColumns = ['displayName', 'position', 'distToReceiver', 'epa', 'success', 'completion (%)']
a = zoneRankings[zoneTableColumns].head(10).style.format({"distToReceiver": "{:.2f}",
                                   "epa": "{:.2f}", "success": "{:.2f}", "completion (%)": "{:.2f}"})
# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index')
# replaces it from pandas 1.4 on, so use whichever this version provides.
a = a.hide(axis='index') if hasattr(a, 'hide') else a.hide_index()
a = (
    a.set_caption('Best defenders in zone coverage in the contest')
    .background_gradient(cmap='Purples', subset=['distToReceiver'])
    .background_gradient(cmap='Reds', subset=['epa'])
    .background_gradient(cmap='winter_r', subset=['completion (%)'])
    .bar(align='mid', subset=["success"], color='orange')
    .set_properties(padding="15px", border='2px solid white', width='200px', subset=zoneTableColumns)
)


display(a)


# How does distance between defender and receiver affect probability of success on a play?

We all know that the further away a defensive player is from a receiver attempting to catch a pass from their QB, the more likely the receiver is to successfully haul it in. Now, armed with the tracking data for all plays of the 2018 season, we are able to demonstrate this relationship between distance and the probability of a catch. The effect is much as expected! The graph is shown below. 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objs as go

def catchProb(x, data=None):
    """Return the share of completed passes among rows with distToReceiver below ``x``.

    Rows are kept when ``target_dist`` < 2.5 and ``distToReceiver`` < ``x``; the
    result is the fraction of their non-null ``passResult`` values equal to 'C'.

    Args:
        x: Exclusive upper bound applied to ``distToReceiver``.
        data: DataFrame to read from. Defaults to the notebook-level ``current``.

    Returns:
        A fraction between 0 and 1, or 0.0 when no row qualifies.
    """
    plays = current if data is None else data
    nearby = (plays['target_dist'] < 2.5) & (plays['distToReceiver'] < x)
    results = plays.loc[nearby, 'passResult'].dropna()
    if results.empty:
        return 0.0
    # Count matches directly instead of going through value_counts().reset_index():
    # the column names that call produces changed in pandas 2.0, and looking up a
    # missing 'C' row raised KeyError when no pass was completed.
    return (results == 'C').mean()

# Evaluate catchProb at 50 evenly spaced distToReceiver thresholds from 0.25 to 5.
x_func = np.linspace(0.25, 5, 50)
y_func = list(map(catchProb, x_func))

# Single thick line showing the league-wide curve.
averageTrace = go.Scatter(
    x=x_func,
    y=y_func,
    name='nfl Average',
    mode='lines',
    line=dict(width=4),
)
fig = go.Figure()
fig.add_trace(averageTrace)

fig.update_layout(
    title='Effect of defensive pressure on pass outcome',
    xaxis_title='Distance from receiver (yards)',
    yaxis_title='Prob of completion (%)',
    width=800,
    height=600,
)

fig.show()

# How do our best individual defenders match up against the NFL average?

The graphs below show how our best individual defenders stack up against the NFL average when it comes to breaking up passes at certain distances from the targeted receiver. Again as expected, for the most part our top defenders are below the NFL average line at all distances. 

In [None]:
# Attach the covering defender's display name to each row as 'coverageName'.
# Guarded so that re-running this cell does not merge a second time, which would
# replace the column with 'coverageName_x' / 'coverageName_y'.
if 'coverageName' not in current.columns:
    current = current.merge(
        players[['nflId', 'displayName']].rename(columns={'nflId': 'coverage_id', 'displayName': 'coverageName'}),
        on='coverage_id',
        how='left',
    )

def catchProb(x, cov, data=None):
    """Return the share of completed passes for one coverage type below distance ``x``.

    Rows are kept when ``target_dist`` < 2.5, ``distToReceiver`` < ``x`` and
    ``man`` == ``cov``; the result is the fraction of their non-null
    ``passResult`` values equal to 'C'.

    Args:
        x: Exclusive upper bound applied to ``distToReceiver``.
        cov: Value matched against the ``man`` column (1 is used for the man
            coverage chart, 0 for the zone coverage chart).
        data: DataFrame to read from. Defaults to the notebook-level ``current``.

    Returns:
        A fraction between 0 and 1, or 0.0 when no row qualifies.
    """
    plays = current if data is None else data
    selected = (plays['target_dist'] < 2.5) & (plays['distToReceiver'] < x) & (plays['man'] == cov)
    results = plays.loc[selected, 'passResult'].dropna()
    if results.empty:
        return 0.0
    # Count matches directly instead of going through value_counts().reset_index():
    # the column names that call produces changed in pandas 2.0, and looking up a
    # missing 'C' row raised KeyError when no pass was completed.
    return (results == 'C').mean()

def catchProbInd(x, i, cov, data=None, rankings=None):
    """Return one ranked defender's share of completed passes below distance ``x``.

    The defender is the ``nflId`` found at label ``i`` of the rankings table. Rows
    are kept when ``target_dist`` < 2.5, ``distToReceiver`` < ``x``,
    ``coverage_id`` equals that id and ``man`` == ``cov``.

    Args:
        x: Exclusive upper bound applied to ``distToReceiver``.
        i: Index label of the defender's row in the rankings table.
        cov: Value matched against the ``man`` column; when ``rankings`` is not
            given, 1 selects ``manRankings`` and any other value ``zoneRankings``.
        data: DataFrame to read from. Defaults to the notebook-level ``current``.
        rankings: Rankings table with an ``nflId`` column. Defaults as described
            under ``cov``.

    Returns:
        A fraction between 0 and 1, or 0 when no row qualifies.

    Raises:
        KeyError: If ``i`` is not a label of the rankings table.
    """
    plays = current if data is None else data
    if rankings is None:
        rankings = manRankings if cov == 1 else zoneRankings
    nflId = rankings.loc[i, 'nflId']

    selected = (
        (plays['target_dist'] < 2.5)
        & (plays['distToReceiver'] < x)
        & (plays['coverage_id'] == nflId)
        & (plays['man'] == cov)
    )
    results = plays.loc[selected, 'passResult'].dropna()
    if results.empty:
        return 0
    # Count matches directly rather than via value_counts().reset_index() guarded by
    # `except KeyError`: on pandas >= 2.0 the 'index' column no longer exists, so the
    # handler swallowed that error and silently returned 0 for every input.
    return (results == 'C').mean()


# Ten evenly spaced distToReceiver thresholds from 0.5 to 2.5.
x_func = np.linspace(0.5, 2.5, 10)
# Completion rate over all rows with man == 1 at each threshold (the reference line).
y_func = [catchProb(x, 1) for x in x_func]

fig = go.Figure()
# One line per defender for the first ten rows of manRankings. Slicing the index
# instead of using range(10) avoids a KeyError when fewer than ten defenders are ranked.
for i in manRankings.index[:10]:
    fig.add_trace(go.Scatter(x=x_func, y=[catchProbInd(x, i, 1) for x in x_func], name=manRankings.loc[i, 'displayName'],
                    mode='lines'))

fig.add_trace(go.Scatter(x=x_func, y=y_func, name='nfl Average',
                    mode='lines', line=dict(color='black', width=4, dash='dash')))

fig.update_layout(title='Effect of defensive pressure on pass outcome (man coverage)',
                   xaxis_title='Distance from receiver (yards)',
                   yaxis_title='Prob of completion (%)',
                   width=1200, height=700)

fig.show()


In [None]:
# Ten evenly spaced distToReceiver thresholds from 0.25 to 3.5.
x_func = np.linspace(0.25, 3.5, 10)
# Completion rate over all rows with man == 0 at each threshold (the reference line).
y_func = [catchProb(x, 0) for x in x_func]

fig = go.Figure()
# One line per defender for the first ten rows of zoneRankings. Slicing the index
# instead of using range(10) avoids a KeyError when fewer than ten defenders are ranked.
for i in zoneRankings.index[:10]:
    fig.add_trace(go.Scatter(x=x_func, y=[catchProbInd(x, i, 0) for x in x_func], name=zoneRankings.loc[i, 'displayName'],
                    mode='lines'))

fig.add_trace(go.Scatter(x=x_func, y=y_func, name='nfl Average',
                    mode='lines', line=dict(color='black', width=4, dash='dash')))

fig.update_layout(title='Effect of defensive pressure on pass outcome (zone coverage)',
                   xaxis_title='Distance from receiver (yards)',
                   yaxis_title='Prob of completion (%)',
                   width=1200, height=700)

fig.show()
