In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents
1. [Introduction to Ball Proximity Average](#Introduction-to-Ball-Proximity-Average-(BPA))
2. [BPA Metric Use Case](#BPA-Metric-Use-Case)
3. [Data Preparation and Processing](#Data-Preparation-and-Processing)
4. [BPA Analysis by Age](#BPA-Analysis-by-Age)
5. [BPA Analysis by Position](#BPA-Analysis-by-Position)
6. [BPA Analysis by College](#BPA-Analysis-by-College)
7. [Analytical Conclusions](#Analytical-Conclusions)
8. [Appendix](#Appendix)
9. [Visualizations](#Visualizations)



## Introduction to Ball Proximity Average (BPA)

This data analytics project aims to uncover insights with respect to a novel statistic derived in our analysis which we will call Ball Proximity Average or BPA. To summarize the use case of this metric, I will paraphrase a concept that we have all heard from coaches when giving praise to defensive players. “He has a nose for the football”, “She is always around the football disrupting the play”, or “They were ‘Johnny on the spot’” are often notable words of praise on players whose job it is to play defense. Their job is to make tackles, get fumble recoveries, assist teammates in the process or otherwise be disruptive by hunting down the football during the play. Ball Proximity Average (BPA) is defined as the average distance a player is away from the football during the relevant events of all “Live Kickoff” plays that only include touchdown, safety, tackle, fumble, and defensive fumble recovery. This statistic can be viewed objectively as a way to judge how close a kickoff coverage special teams player was to the ball on average when there was a “play to be made”. For each relevant event instance, players were awarded a distance value as measured by the differences in their coordinates and the football coordinates using the Euclidean Distance formula aka the Pythagorean Theorem. Over the course of each of the three sample seasons, BPA, tackles, assists, misses, and fumble recoveries were aggregated and compared using simple linear regression and comparative analysis. The resulting StoryBoard presentation was published in Tableau and made public for this competition. See link to the Tableau Dashboard here. [Our Tableau Dashboard](http://public.tableau.com/app/profile/taidje.tang/viz/NFLBigDataBowl2022BallProximityAnalysis/2018_Avg_BPAvs_Avg_TackleRatebyPosition)
    
## BPA Metric Use Case

Kickoff Coverage is an area of the game we believe we can improve and make more objectively measurable by using BPA when evaluating players. In this analysis, I will propose the idea of using BPA as a simple, yet articulate way to measure the performance of all non-kicking or quarterbacking players while defending against “Live” Kickoff Returns. This idea may also be useful for other phases of the game, however this analysis focuses on analyzing performance, in-game potential, and overall “hustle” while handling kickoff coverage duties.

First imagine you are watching a “live” kickoff play where there was no touchback or out-of-bounds kick with the ball being returned or fielded by the receiving team. No matter the result of the play, the distance of the player from the ball is relevant to his performance because it represents one’s ability to make a play on the ball. Just as one can’t make a tackle from 15 yards away from the ball, he is also limited on his ability to recover a fumble or help a teammate who may have missed a tackle. From this perspective, the BPA metric can also be objectively used to judge a player’s “hustle” and “nose for the football”. For example, in the event that a player is beaten on an open field block that led to a touchdown for the receiving team, that player would have their BPA increase accordingly. By the same token, the player that narrowly missed catching the returner trailing the eventual touchdown would have their BPA much less affected than the weak link player who ended the play many yards away. In American football, proximity to the football matters tremendously, especially when there is a play to be made. 

The potential use case and applications of the BPA metric are extensive. At the end of every game, season, or even career, BPA can be used to summarize players’ play making potential and hustle. While the metric doesn’t always translate into success on a play in the form of a tackle, assist, touch, or fumble recovery, it can certainly quantify the potential of a player to make the play. Much like Batting Average in baseball, this rolling average can be viewed against any number of factors such as field conditions, recency, or vs. certain opponents. I can imagine using this stat to analyze the impact of a player who may not show up in the traditional box score because it may be useful by giving credit to players who are “around the football”. In conclusion, the lower a player’s BPA, the closer they are to the football when a play needs to be made. 

## Data Preparation and Processing

Since BPA was strictly defined to be applied to only non-kicker and non-quarterback Kickoff Coverage players, the metric needed to be created only using relevant tracking data for relevant play event instances. The process of the cleaning and refining necessary was meticulous and tedious, but essential to the objectivity of the stat. The following conditions were used to determine “relevant” plays:

- specialTeamsPlayType was limited to ‘Kickoff’ only.

- Relevant events on kickoff plays only included ‘tackle’, ‘fumble’, ‘fumble_defense_recovered’, ‘touchdown’, and ‘safety’. These were determined to be the “plays to be made” on kickoff coverage.

- The set only included players on kickoff coverage, excluding players on the receiving team.

Once relevant plays and qualifying players were selected, many columns in the data set needed to be cleaned and transformed in order to accurately tally play success measurables such as tackle_rate, touch_rate, and fumble_recovery_rate. The details of the data processing are in the source code of this notebook in the [Appendix](#Appendix). 

## BPA Analysis by Age

When examining age as a factor, BPA tends to increase for players over 30 when compared to players in their 20s. Some outliers exist in the over-30 segment. This may be because the players who are still playing that position past 30 have specialized and excelled at it, which kept them on the roster instead of being cut for a younger player. 

## BPA Analysis by Position

When examining position as a factor, avg. BPA tends to increase for players who are on offense as opposed to defense. Tackle rates reflect poorer performance as a result. 

## BPA Analysis by College

When examining College as a factor, we found it difficult to draw insights from the college attended as a factor in BPA, other than that players coming from smaller programs may be "hustling" more on plays because they were always underdogs coming into the league. When sorted in ascending order, most players from schools with the lowest average BPA come from colleges with very few products in the NFL. 

## Analytical Conclusions

Unfortunately, due to our limited Kaggle experience and current difficulties presenting our findings in Python, we will have to base our analytical conclusions on our findings in Tableau. After examining BPA vs. performance stats such as tackle rate, touch rate, and fumble recovery rate against such factors as Age, Position, College, and BPA segment, we have evidence that BPA can be used as a measure for evaluating players in 2 ways: 

* We can use BPA to judge when a player has "lost a step" or is "getting older" as BPA clearly increases as a player enters their 30s. 

* We can use BPA to judge how close a player was to the ball during relevant events and measure their proximity as a metric for "play making potential" since there exists a broad inverse relationship between kickoff coverage performance and BPA. 

With our findings, we think there is a good use case argument for Ball Proximity Average as a statistic to follow throughout a player's game, season, and even career. 


## Appendix

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime, date

In [None]:
# Load the tracking table of each season (2018, 2019, 2020) in one pass.
tracking_2018, tracking_2019, tracking_2020 = map(
    pd.read_csv,
    (
        r'/kaggle/input/nfl-big-data-bowl-2022/tracking2018.csv',
        r'/kaggle/input/nfl-big-data-bowl-2022/tracking2019.csv',
        r'/kaggle/input/nfl-big-data-bowl-2022/tracking2020.csv',
    ),
)

In [None]:
# Load the supporting tables: plays, players, games and the PFF scouting data.
plays, players, games, pff = map(
    pd.read_csv,
    (
        r'/kaggle/input/nfl-big-data-bowl-2022/plays.csv',
        r'/kaggle/input/nfl-big-data-bowl-2022/players.csv',
        r'/kaggle/input/nfl-big-data-bowl-2022/games.csv',
        r'/kaggle/input/nfl-big-data-bowl-2022/PFFScoutingData.csv',
    ),
)

In [None]:
#examine the data using examine function

def examine(table):
    """Print a quick overview of ``table``.

    The report lists, one item per print call: the shape, the column index,
    the ``describe()`` summary and the ``head()`` rows, separated by rows of
    twenty asterisks.
    """
    divider = "*" * 20
    report = (
        'The dataset details:',
        table.shape,
        table.columns,
        divider,
        'The statistical breakdown of column ranges:',
        table.describe(),
        divider,
        'Checking the first 5 rows:',
        table.head(),
    )
    for item in report:
        print(item)

In [None]:
#examine(tracking_2018)
#examine(tracking_2019)
#examine(tracking_2020)
#examine(plays)
#examine(players)
#examine(games)
#examine(pff)

After inspecting and examining the data, proceed to preparing and processing the data

In [None]:
# Keep only the plays whose special-teams play type is a kickoff.
is_kickoff = plays['specialTeamsPlayType'] == 'Kickoff'
kickoff_plays = plays[is_kickoff]

In [None]:
# Kickoff plays whose description mentions a recovery ('RECOVERED').
mentions_recovery = kickoff_plays['playDescription'].str.contains('RECOVERED')
d_fumble_plays = kickoff_plays[mentions_recovery]

# Of those, the plays that also mention 'Replay'; all 7 were inspected by hand
# and were overturned on replay, so they are not actual fumbles.
mentions_replay = d_fumble_plays['playDescription'].str.contains('Replay')
nullified_fumble_plays = d_fumble_plays[mentions_replay]

# Left-merge with an indicator column ('_merge') so the nullified plays can be
# told apart from the remaining fumble plays in the next step.
refined_d_fumble_plays = d_fumble_plays.merge(
    nullified_fumble_plays,
    how='left',
    on=['gameId', 'playId'],
    indicator=True,
)

In [None]:
# Keep only the fumble plays that did NOT match a replay-nullified play:
# after the left merge with indicator=True those rows are marked 'left_only'.
refined_d_fumble_plays = refined_d_fumble_plays[refined_d_fumble_plays['_merge'] == 'left_only']

# Discard the duplicated right-hand columns (default merge suffix '_y').
# A new frame is built instead of dropping in place on a filtered slice,
# which would trigger pandas' SettingWithCopyWarning.
duplicate_columns = [col for col in refined_d_fumble_plays.columns if col.endswith('_y')]
refined_d_fumble_plays = refined_d_fumble_plays.drop(columns=duplicate_columns)

# Restore the original column names by stripping the '_x' merge suffix.
# Deriving the names (rather than listing them by hand) avoids typos such as
# the former 'penaltyJerseyNumbers_x' -> 'enaltyJerseyNumbers' mapping.
refined_d_fumble_plays = refined_d_fumble_plays.rename(
    columns=lambda col: col[:-len('_x')] if col.endswith('_x') else col
)

In [None]:
#declare a function to search for defensive recovery indication in playDescription column
def d_recovery_check(x):
    """Return the token two words after the first 'RECOVERED' in ``x``.

    ``x`` is split on whitespace; the first word that equals 'RECOVERED'
    exactly is located and the word two positions later is returned
    (e.g. the 'DAL-J.Smith' in '... RECOVERED by DAL-J.Smith at ...').

    Returns ``None`` when no word equals 'RECOVERED', or when fewer than two
    words follow it (previously the latter raised IndexError).
    """
    words = x.split()
    for position, word in enumerate(words):
        if word == 'RECOVERED':
            credit_position = position + 2
            if credit_position < len(words):
                return words[credit_position]
            # Description ends right after 'RECOVERED'; nothing to credit.
            return None
    return None

In [None]:
# Store, for every remaining fumble play, the token returned by
# d_recovery_check (the word two places after 'RECOVERED' in the description)
# in a new 'd_recovery_credit' column; it is parsed further in a later step.
recovery_tokens = refined_d_fumble_plays['playDescription'].apply(d_recovery_check)
refined_d_fumble_plays['d_recovery_credit'] = recovery_tokens

In [None]:
#make a function to parse the string 'd_recovery_credit' and parse out the last name of the player only

def d_recovery_name(x):
    """Extract the player's last name from a 'TEAM-F.Lastname' credit string.

    The text before the first '-' is discarded, and the part between the first
    and second '.' of the remainder is returned. Splitting on the first '-'
    only keeps hyphenated surnames intact ('PIT-J.Smith-Schuster' ->
    'Smith-Schuster').

    Returns ``None`` when ``x`` is not a string (e.g. ``None`` or NaN).
    Raises IndexError when the text after the first '-' contains no '.'.
    """
    if not isinstance(x, str):
        return None
    _team, _separator, player_name = x.partition('-')
    return player_name.split('.')[1]

In [None]:
# Reduce each recovery credit string to the last name of the credited player.
recovery_last_names = refined_d_fumble_plays['d_recovery_credit'].apply(d_recovery_name)
refined_d_fumble_plays['recovery_lastName'] = recovery_last_names

In [None]:
# Narrow the fumble plays down to the recovery name plus the two play keys.
recovery_columns = ['recovery_lastName', 'gameId', 'playId']
lastNames_d_fumble_plays = pd.DataFrame(refined_d_fumble_plays, columns=recovery_columns)

# Attach the recovery name to every kickoff play; plays without a recovery
# keep a missing value because this is a left merge.
kickoff_plays = kickoff_plays.merge(
    lastNames_d_fumble_plays,
    how='left',
    on=['gameId', 'playId'],
)

In [None]:
#start cleaning player table data to be merged onto the final refined aggregated dataset later
#create a function for height conversion

def height_conversion(x):
    """Convert a player height to whole inches.

    * 'feet-inches' strings such as '6-2' become ``feet * 12 + inches``.
    * Strings that already hold inches ('72') are converted to ``int``
      (previously they were returned unchanged as strings, leaving the column
      with mixed types).
    * Non-string values (numbers, NaN) are returned unchanged.

    Raises ValueError when a string part is not a valid integer.
    """
    if not isinstance(x, str):
        return x
    if '-' in x:
        feet, inches = x.split('-')[:2]
        return int(feet) * 12 + int(inches)
    return int(x)

In [None]:
# Run every height through height_conversion so the column is in inches.
heights_in_inches = players['height'].apply(height_conversion)
players['height'] = heights_in_inches

Start by cleaning the players table for later merge onto aggregate data table

In [None]:
# Birthdates missing from the players table, keyed by nflId.
# They were looked up by hand on www.espn.com.
missing_birth_dates = {
    52464: '1997-08-21',
    52592: '1997-10-18',
    53086: '1997-09-24',
    52606: '1997-10-28',
    52585: '1997-12-01',
    52566: '1997-11-05',
    52626: '1997-07-21',
    50975: '1994-11-26',
    52637: '1997-07-30',
    53020: '1996-06-30',
    52624: '1999-03-31',
    52662: '1997-07-11',
    52631: '1997-07-17',
    52539: '1998-08-27',
    41112: '1989-05-23',
    52587: '1998-01-17',
    52459: '1997-09-20',
}

# Write each looked-up date into the row(s) of the matching player.
for nfl_id, birth_date in missing_birth_dates.items():
    players.loc[players['nflId'] == nfl_id, 'birthDate'] = birth_date


In [None]:
# Parse the birthDate strings into pandas datetime values.
parsed_birth_dates = pd.to_datetime(players['birthDate'])
players['birthDate'] = parsed_birth_dates

In [None]:
# Age of each player as a time delta between the moment this cell runs and
# the player's birth date.
today = datetime.today()
time_since_birth = today - players['birthDate']
players['age'] = time_since_birth

In [None]:
#with the resulting date time deltas, extract the number of days and divide by 365 for years, leaving NA alone for now
def convert_age(x):
    """Convert a time delta to years (whole days / 365).

    The day count is read from the value's ``days`` attribute when it has one
    (``pandas.Timedelta``, ``datetime.timedelta``). Values without that
    attribute fall back to parsing the first whitespace-separated token of
    ``str(x)`` as an integer number of days.

    Values that cannot be converted (e.g. ``NaT``/NaN) are returned unchanged.
    """
    days = getattr(x, 'days', None)
    if days is None:
        try:
            days = int(str(x).split()[0])
        except (ValueError, IndexError):
            # Not a parseable day count: leave the value alone.
            return x
    elif days != days:
        # NaN day count (pandas reports NaT.days as NaN): leave NaT alone.
        return x
    return days / 365

In [None]:
# Convert each player's age from a time delta into fractional years.
players['age'] = players['age'].apply(convert_age)


# Derive each player's whole-year age for the 2018, 2019 and 2020 seasons.
# 'age' is measured as of `today`, so the number of years to subtract is taken
# from the current year instead of being hard-coded as 3/2/1, which is only
# correct when the notebook is run during 2021.
for season in (2018, 2019, 2020):
    years_since_season = today.year - season
    players[f'age_{season}'] = (players['age'] - years_since_season).map(int)

In [None]:
#filter the tracking events frames to only include the types of play instances of concern

In [None]:
# Split every season's tracking data into one frame per relevant event type.
# These five event types define the universe of relevant plays on kickoffs.

def _rows_with_event(tracking, event_name):
    """Return the rows of ``tracking`` whose 'event' equals ``event_name``."""
    return tracking[tracking['event'] == event_name]


tackle_event_tracking_2018 = _rows_with_event(tracking_2018, 'tackle')
tackle_event_tracking_2019 = _rows_with_event(tracking_2019, 'tackle')
tackle_event_tracking_2020 = _rows_with_event(tracking_2020, 'tackle')

fumble_event_tracking_2018 = _rows_with_event(tracking_2018, 'fumble')
fumble_event_tracking_2019 = _rows_with_event(tracking_2019, 'fumble')
fumble_event_tracking_2020 = _rows_with_event(tracking_2020, 'fumble')

d_recovery_event_tracking_2018 = _rows_with_event(tracking_2018, 'fumble_defense_recovered')
d_recovery_event_tracking_2019 = _rows_with_event(tracking_2019, 'fumble_defense_recovered')
d_recovery_event_tracking_2020 = _rows_with_event(tracking_2020, 'fumble_defense_recovered')

touchdown_event_tracking_2018 = _rows_with_event(tracking_2018, 'touchdown')
touchdown_event_tracking_2019 = _rows_with_event(tracking_2019, 'touchdown')
touchdown_event_tracking_2020 = _rows_with_event(tracking_2020, 'touchdown')

safety_event_tracking_2018 = _rows_with_event(tracking_2018, 'safety')
safety_event_tracking_2019 = _rows_with_event(tracking_2019, 'safety')
safety_event_tracking_2020 = _rows_with_event(tracking_2020, 'safety')

In [None]:
# Group the per-event frames of each season (tackle, fumble, defensive
# recovery, touchdown, safety - in that order).
concat_2018 = [
    tackle_event_tracking_2018,
    fumble_event_tracking_2018,
    d_recovery_event_tracking_2018,
    touchdown_event_tracking_2018,
    safety_event_tracking_2018,
]
concat_2019 = [
    tackle_event_tracking_2019,
    fumble_event_tracking_2019,
    d_recovery_event_tracking_2019,
    touchdown_event_tracking_2019,
    safety_event_tracking_2019,
]
concat_2020 = [
    tackle_event_tracking_2020,
    fumble_event_tracking_2020,
    d_recovery_event_tracking_2020,
    touchdown_event_tracking_2020,
    safety_event_tracking_2020,
]

# Stack each season's group into a single frame of relevant events.
relevant_events2018, relevant_events2019, relevant_events2020 = (
    pd.concat(season_frames)
    for season_frames in (concat_2018, concat_2019, concat_2020)
)

In [None]:
# Restrict the relevant event frames to kickoff plays by inner-joining each
# season with kickoff_plays on the game and play identifiers.
kickoff_play_frames2018, kickoff_play_frames2019, kickoff_play_frames2020 = (
    season_events.merge(kickoff_plays, how='inner', on=['gameId', 'playId'])
    for season_events in (relevant_events2018, relevant_events2019, relevant_events2020)
)

In [None]:
# Rows that track the ball itself carry 'football' in the displayName column.
football_tracking2018 = kickoff_play_frames2018[kickoff_play_frames2018['displayName'] == 'football']
football_tracking2019 = kickoff_play_frames2019[kickoff_play_frames2019['displayName'] == 'football']
football_tracking2020 = kickoff_play_frames2020[kickoff_play_frames2020['displayName'] == 'football']

# Keep just the ball position and the keys used to merge it back onto the
# tracking frames, renaming x/y to ball_x/ball_y so they do not clash with the
# other x/y coordinates in a row.
ball_columns = ['x', 'y', 'gameId', 'playId', 'frameId']
ball_names = {"x": "ball_x", "y": "ball_y"}

football_coordinates2018 = pd.DataFrame(football_tracking2018, columns=ball_columns).rename(columns=ball_names)
football_coordinates2019 = pd.DataFrame(football_tracking2019, columns=ball_columns).rename(columns=ball_names)
football_coordinates2020 = pd.DataFrame(football_tracking2020, columns=ball_columns).rename(columns=ball_names)

In [None]:
#define two functions that adjust the ball coordinates to keep them in the field of play
def adjust_x_inbounds(x):
    """Clamp an x coordinate to the range [0.0, 120.0]."""
    return min(max(x, 0.0), 120.0)

def adjust_y_inbounds(y):
    """Clamp a y coordinate to the range [0.0, 53.33]."""
    return min(max(y, 0.0), 53.33)

In [None]:
# To reduce error, distances should only be measured against an in-bounds
# ball position: ball_x values below 0 or above 120 and ball_y values below 0
# or above 53.33 are replaced with the nearest boundary value.
for ball_coordinates in (football_coordinates2018, football_coordinates2019, football_coordinates2020):
    ball_coordinates['ball_x'] = ball_coordinates['ball_x'].apply(adjust_x_inbounds)
    ball_coordinates['ball_y'] = ball_coordinates['ball_y'].apply(adjust_y_inbounds)

In [None]:
# Attach the ball position to every tracking row of the same game, play and
# frame (inner join on those three keys).
frame_keys = ['gameId', 'playId', 'frameId']
kickoff_play_frames2018 = kickoff_play_frames2018.merge(football_coordinates2018, how='inner', on=frame_keys)
kickoff_play_frames2019 = kickoff_play_frames2019.merge(football_coordinates2019, how='inner', on=frame_keys)
kickoff_play_frames2020 = kickoff_play_frames2020.merge(football_coordinates2020, how='inner', on=frame_keys)

In [None]:
# Add the game-level columns to every row sharing the same gameId.
kickoff_play_frames2018, kickoff_play_frames2019, kickoff_play_frames2020 = (
    season_frames.merge(games, how='inner', on=['gameId'])
    for season_frames in (kickoff_play_frames2018, kickoff_play_frames2019, kickoff_play_frames2020)
)

In [None]:
# Label every row with the side ('home' or 'away') of the kicking team:
# 'home' when possessionTeam equals homeTeamAbbr, otherwise 'away'.
for season_frames in (kickoff_play_frames2018, kickoff_play_frames2019, kickoff_play_frames2020):
    season_frames['kickTeam'] = np.where(
        season_frames['possessionTeam'] == season_frames['homeTeamAbbr'], 'home', 'away'
    )

# Keep only the rows of players on the kicking (coverage) team, i.e. rows
# whose 'team' matches 'kickTeam'. The explicit .copy() makes each result an
# independent frame, so the column assignments performed on it in later cells
# do not raise SettingWithCopyWarning or act on a temporary slice.
kickoff_coverage_2018 = kickoff_play_frames2018[
    kickoff_play_frames2018['team'] == kickoff_play_frames2018['kickTeam']
].copy()
kickoff_coverage_2019 = kickoff_play_frames2019[
    kickoff_play_frames2019['team'] == kickoff_play_frames2019['kickTeam']
].copy()
kickoff_coverage_2020 = kickoff_play_frames2020[
    kickoff_play_frames2020['team'] == kickoff_play_frames2020['kickTeam']
].copy()

In [None]:
# For every season: turn jerseyNumber from float into int, then build the
# 'teamNumber' label as '<possessionTeam> <jerseyNumber>'.
for coverage in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    coverage['jerseyNumber'] = coverage['jerseyNumber'].map(int)
    jersey_text = coverage['jerseyNumber'].map(str)
    coverage['teamNumber'] = coverage['possessionTeam'] + ' ' + jersey_text


In [None]:
# Remove the rows of plays whose specialTeamsResult is irrelevant here:
# 'Out of Bounds', 'Touchback', and 'Downed' (the latter were all plays
# negated by penalty).
excluded_results = ['Out of Bounds', 'Touchback', 'Downed']

kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020 = (
    coverage[~coverage['specialTeamsResult'].isin(excluded_results)]
    for coverage in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020)
)

In [None]:
# Select the PFF scouting columns needed for the tackle / assist / missed
# tackle rates, together with the game and play keys used for merging.
pff_columns = [
    'gameId',
    'playId',
    'tackler',
    'assistTackler',
    'missedTackler',
    'hangTime',
    'kickType',
    'kickContactType',
]
Pff_Scouting = pd.DataFrame(pff, columns=pff_columns)

In [None]:
# Inner-join the selected PFF scouting columns onto each season's coverage
# frame using the game and play identifiers.
kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020 = (
    coverage.merge(Pff_Scouting, how='inner', on=['gameId', 'playId'])
    for coverage in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020)
)

In [None]:
# First derived stat: 'ball_distance', the Euclidean distance between the
# player position (x, y) and the ball position (ball_x, ball_y) in each row.
for coverage in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    x_offset = coverage['x'] - coverage['ball_x']
    y_offset = coverage['y'] - coverage['ball_y']
    coverage['ball_distance'] = (x_offset**2 + y_offset**2)**(0.5)

In [None]:
# 'tackle_credit' is 1 when the row's teamNumber equals the play's tackler
# and 0 otherwise.
for coverage in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    made_tackle = coverage['teamNumber'] == coverage['tackler']
    coverage['tackle_credit'] = np.where(made_tackle, 1, 0)

In [None]:
# 'assist_credit' is 1 when the row's teamNumber equals the play's
# assistTackler and 0 otherwise.
for coverage in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    made_assist = coverage['teamNumber'] == coverage['assistTackler']
    coverage['assist_credit'] = np.where(made_assist, 1, 0)

In [None]:
# Missed tackles need more parsing than tackles and assists, so they are
# handled in separate steps. Begin with an empty-string placeholder column.
for coverage in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    coverage['missed_tackle_credit'] = ''

In [None]:
# Split multi-player 'missedTackler' entries of 2018 into lists and tally the
# kinds of entries so the crediting algorithm can be checked.
#
# An entry whose text is longer than 6 characters holds several players
# separated by ';' and is replaced by a list of names; entries of 4-6
# characters hold a single player and shorter ones (e.g. 'nan') are missing -
# both are left untouched. The column is rebuilt in one assignment because
# chained assignment (df[col][row] = value) is unreliable in pandas and is a
# silent no-op under Copy-on-Write.

def _split_missed_tacklers(value):
    """Return a list of names for a ';'-separated entry, else ``value`` itself."""
    text = str(value)
    if len(text) <= 6:
        return value
    # Names are separated by ';' plus a space; strip the surrounding blanks.
    return [name.strip() for name in text.split(';')]


missed_2018 = kickoff_coverage_2018['missedTackler']
entry_lengths_2018 = missed_2018.map(lambda value: len(str(value)))

count = len(missed_2018)
mult_miss_tackle_play_count = int((entry_lengths_2018 > 6).sum())
singular_missed_tackle_play_count = int(entry_lengths_2018.between(4, 6).sum())
NA_count = count - mult_miss_tackle_play_count - singular_missed_tackle_play_count

kickoff_coverage_2018['missedTackler'] = missed_2018.map(_split_missed_tacklers)
#print(count)
#print(mult_miss_tackle_play_count)
#print(singular_missed_tackle_play_count)
#print(NA_count)

In [None]:
# 2019 missed tackle check: split multi-player 'missedTackler' entries into
# lists and tally the kinds of entries.
#
# An entry whose text is longer than 6 characters holds several players
# separated by ';' and is replaced by a list of names; entries of 4-6
# characters hold a single player and shorter ones (e.g. 'nan') are missing -
# both are left untouched. The column is rebuilt in one assignment because
# chained assignment (df[col][row] = value) is unreliable in pandas and is a
# silent no-op under Copy-on-Write.

def _split_missed_tacklers(value):
    """Return a list of names for a ';'-separated entry, else ``value`` itself."""
    text = str(value)
    if len(text) <= 6:
        return value
    # Names are separated by ';' plus a space; strip the surrounding blanks.
    return [name.strip() for name in text.split(';')]


missed_2019 = kickoff_coverage_2019['missedTackler']
entry_lengths_2019 = missed_2019.map(lambda value: len(str(value)))

count = len(missed_2019)
mult_miss_tackle_play_count = int((entry_lengths_2019 > 6).sum())
singular_missed_tackle_play_count = int(entry_lengths_2019.between(4, 6).sum())
NA_count = count - mult_miss_tackle_play_count - singular_missed_tackle_play_count

kickoff_coverage_2019['missedTackler'] = missed_2019.map(_split_missed_tacklers)
#print(count)
#print(mult_miss_tackle_play_count)
#print(singular_missed_tackle_play_count)
#print(NA_count)

In [None]:
# 2020 missed tackle check: split multi-player 'missedTackler' entries into
# lists and tally the kinds of entries.
#
# An entry whose text is longer than 6 characters holds several players
# separated by ';' and is replaced by a list of names; entries of 4-6
# characters hold a single player and shorter ones (e.g. 'nan') are missing -
# both are left untouched. The column is rebuilt in one assignment because
# chained assignment (df[col][row] = value) is unreliable in pandas and is a
# silent no-op under Copy-on-Write.

def _split_missed_tacklers(value):
    """Return a list of names for a ';'-separated entry, else ``value`` itself."""
    text = str(value)
    if len(text) <= 6:
        return value
    # Names are separated by ';' plus a space; strip the surrounding blanks.
    return [name.strip() for name in text.split(';')]


missed_2020 = kickoff_coverage_2020['missedTackler']
entry_lengths_2020 = missed_2020.map(lambda value: len(str(value)))

count = len(missed_2020)
mult_miss_tackle_play_count = int((entry_lengths_2020 > 6).sum())
singular_missed_tackle_play_count = int(entry_lengths_2020.between(4, 6).sum())
NA_count = count - mult_miss_tackle_play_count - singular_missed_tackle_play_count

kickoff_coverage_2020['missedTackler'] = missed_2020.map(_split_missed_tacklers)
#print(count)
#print(mult_miss_tackle_play_count)
#print(singular_missed_tackle_play_count)
#print(NA_count)

In [None]:
#iterate through each df and check if there is a match between teamNumber and one of the missed tackle strings
# The rule is the same for 2018, 2019 and 2020, so it runs once per season frame.
for season_frame in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    for row in season_frame.index:
        missed = season_frame.at[row, 'missedTackler']
        team_number = season_frame.at[row, 'teamNumber']
        if len(str(missed)) > 6:
            # More than one miss on the play: credit the row if any listed
            # player equals this row's teamNumber.
            earned_credit = any(player == team_number for player in missed)
        else:
            # Single miss or no miss recorded: compare the value directly.
            earned_credit = missed == team_number
        if earned_credit:
            # .at writes into the frame itself; chained indexing
            # (df[col][row] = 1) may assign to a temporary copy instead.
            season_frame.at[row, 'missed_tackle_credit'] = 1

In [None]:
#input 0 for no missed tackle credit
# Rewrite the column as a strict flag: 1 where a credit equals 1, 0 everywhere else.
for season_frame in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    has_credit = season_frame['missed_tackle_credit'] == 1
    season_frame['missed_tackle_credit'] = np.where(has_credit, 1, 0)

In [None]:
#start to create the defensive fumble recovery stat

In [None]:
#create a defensive fumble recovery statistic to get d_recovery_rate
def split_lastName(x):
    """Return the second whitespace-separated token of a display name.

    For a "First Last" name this is the last name. Names with extra tokens
    still yield the second token, e.g. "Odell Beckham Jr." -> "Beckham".

    Args:
        x: display name string.

    Returns:
        The second token when there are at least two tokens, the only token
        for a one-word name, and '' for an empty or all-whitespace string.
    """
    parts = x.split()
    if len(parts) >= 2:
        return parts[1]
    # A one-word or empty name has no second token; fall back instead of
    # raising IndexError.
    return parts[0] if parts else ''

In [None]:
#get just lastName to match on recovery_lastName
# split_lastName is applied to every displayName value in each season frame.
for season_frame in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    season_frame['lastName'] = season_frame['displayName'].apply(split_lastName)

In [None]:
#give fumble recovery credit when kickoff player gets a defensive recovery, 1 or 0
# A row is credited when its recovery_lastName equals the row's own lastName.
for season_frame in (kickoff_coverage_2018, kickoff_coverage_2019, kickoff_coverage_2020):
    recovered_by_player = season_frame['recovery_lastName'] == season_frame['lastName']
    season_frame['d_recovery_credit'] = np.where(recovered_by_player, 1, 0)

In [None]:
#create a count mechanism for tackles and assisted tackles grouped by gameid, playid using a pivot table
#switched displayName and nflId

# One pivot row per (game, play, player); ball_distance and every credit
# column are summed over that player's rows for the play.
pivot_index = ['gameId', 'playId', 'nflId', 'displayName']
pivot_values = ['ball_distance', 'tackle_credit', 'assist_credit', 'missed_tackle_credit', 'd_recovery_credit']

# aggfunc is the string 'sum' rather than np.sum: recent pandas releases emit a
# FutureWarning for NumPy callables here and ask for the string name instead.
pivot_kickoff_2018 = pd.pivot_table(kickoff_coverage_2018, index=pivot_index, values=pivot_values, aggfunc='sum')
pivot_kickoff_2019 = pd.pivot_table(kickoff_coverage_2019, index=pivot_index, values=pivot_values, aggfunc='sum')
pivot_kickoff_2020 = pd.pivot_table(kickoff_coverage_2020, index=pivot_index, values=pivot_values, aggfunc='sum')

In [None]:
#inner merge player table onto the pivot table
# An inner join keeps only pivot rows whose nflId also appears in players.
pivot_kickoff_2018 = pivot_kickoff_2018.merge(players, on='nflId', how='inner')
pivot_kickoff_2019 = pivot_kickoff_2019.merge(players, on='nflId', how='inner')
pivot_kickoff_2020 = pivot_kickoff_2020.merge(players, on='nflId', how='inner')

In [None]:
#once the pivot table is made, be sure to adjust to make sure there is only one credit per column per player at most

# The pivot sums credits over every frame of a play, so a total above 1 means the
# same tackle, assist, missed tackle or fumble recovery was counted on several
# frames. Cap each credit column at 1 per player per play.
for play_totals in (pivot_kickoff_2018, pivot_kickoff_2019, pivot_kickoff_2020):
    for credit_column in ('tackle_credit', 'assist_credit', 'missed_tackle_credit', 'd_recovery_credit'):
        counted_more_than_once = play_totals[credit_column] > 1
        play_totals.loc[counted_more_than_once, credit_column] = 1

In [None]:
#create touch_credit column for each row in the pivot table to check if a player tackled, assisted or missed on the play
# touch_credit is the plain sum of the three per-play credit columns.
for play_totals in (pivot_kickoff_2018, pivot_kickoff_2019, pivot_kickoff_2020):
    play_totals['touch_credit'] = (
        play_totals['tackle_credit']
        + play_totals['assist_credit']
        + play_totals['missed_tackle_credit']
    )

In [None]:
#copy the nflId column into a new column count_id because it will be used to count the number of plays 
for play_totals in (pivot_kickoff_2018, pivot_kickoff_2019, pivot_kickoff_2020):
    play_totals['count_id'] = play_totals['nflId']

In [None]:
#aggregate the pivot table to get tackle and play totals, tackle_rate, touch_rate, d_recovery_rate, grouped by nflid
# Output column -> (source column, aggregation); shared by all three seasons.
rate_aggregations = {
    'num_tackles': ('tackle_credit', 'sum'),
    'num_plays': ('count_id', 'size'),
    'tackle_rate': ('tackle_credit', 'mean'),
    'touch_rate': ('touch_credit', 'mean'),
    'fumble_recovery_rate': ('d_recovery_credit', 'mean'),
}
kickoff_tackle_touch_rates_2018 = pivot_kickoff_2018.groupby('nflId').agg(**rate_aggregations)
kickoff_tackle_touch_rates_2019 = pivot_kickoff_2019.groupby('nflId').agg(**rate_aggregations)
kickoff_tackle_touch_rates_2020 = pivot_kickoff_2020.groupby('nflId').agg(**rate_aggregations)


In [None]:
#create summary dataframes for 'BPA' metric from kickoff coverage tables for each season grouping by nflId 
#and getting the mean of 'ball_distance' for each player and count the number of instances they had a 
#ball_distance observation

# event_count is the number of rows per player; ball_proximity_avg is the mean ball_distance.
bpa_aggregations = {
    'event_count': ('nflId', 'size'),
    'ball_proximity_avg': ('ball_distance', 'mean'),
}

Ball_Proximity_Avgs2018 = kickoff_coverage_2018.groupby('nflId').agg(**bpa_aggregations)

Ball_Proximity_Avgs2019 = kickoff_coverage_2019.groupby('nflId').agg(**bpa_aggregations)

Ball_Proximity_Avgs2020 = kickoff_coverage_2020.groupby('nflId').agg(**bpa_aggregations)

In [None]:
#inner merge ball proximity avg table on nflId
# Only players present in both the BPA table and the rate table are kept.
merged_BPA_stats_2018 = Ball_Proximity_Avgs2018.merge(kickoff_tackle_touch_rates_2018, on=['nflId'], how='inner')
merged_BPA_stats_2019 = Ball_Proximity_Avgs2019.merge(kickoff_tackle_touch_rates_2019, on=['nflId'], how='inner')
merged_BPA_stats_2020 = Ball_Proximity_Avgs2020.merge(kickoff_tackle_touch_rates_2020, on=['nflId'], how='inner')

In [None]:
#inner merge players table on nflId one last time to include player info on final refined tables
final_merged_BPA_stats_2018 = merged_BPA_stats_2018.merge(players, on=['nflId'], how='inner')
final_merged_BPA_stats_2019 = merged_BPA_stats_2019.merge(players, on=['nflId'], how='inner')
final_merged_BPA_stats_2020 = merged_BPA_stats_2020.merge(players, on=['nflId'], how='inner')

In [None]:
#adjust all rates to become percentages, round to 2 decimal places
# Each rate is a 0-1 mean; multiply by 100 and round to two decimals.
for season_stats in (final_merged_BPA_stats_2018, final_merged_BPA_stats_2019, final_merged_BPA_stats_2020):
    for rate_column in ('tackle_rate', 'touch_rate', 'fumble_recovery_rate'):
        season_stats[rate_column] = (season_stats[rate_column] * 100).round(2)

In [None]:
#export the final aggregated data tables to .csv file for analysis in R and/or Tableau

#final_merged_BPA_stats_2018.to_csv('merged_BPA_stats_2018.csv')
#final_merged_BPA_stats_2019.to_csv('merged_BPA_stats_2019.csv')
#final_merged_BPA_stats_2020.to_csv('merged_BPA_stats_2020.csv')

In [None]:
#from here, show work that leads to a final conclusion
import seaborn as sns

In [None]:
#all sets having min. of 16 plays
played_enough = final_merged_BPA_stats_2018['num_plays'] >= 16
Min_16plays = final_merged_BPA_stats_2018.loc[played_enough]

#sets excluding kickers, punters, and qbs
# Each step removes one position code from the previous subset.
non_kickers_2018 = Min_16plays.loc[Min_16plays['Position'] != 'K']
non_kickers_Punters_2018 = non_kickers_2018.loc[non_kickers_2018['Position'] != 'P']
non_kickers_Punters_Qbs_2018 = non_kickers_Punters_2018.loc[non_kickers_Punters_2018['Position'] != 'QB']

#sets filtered by age
player_ages_2018 = non_kickers_Punters_Qbs_2018['age_2018']
#under 25
AgeU25_Min_16plays_2018 = non_kickers_Punters_Qbs_2018.loc[player_ages_2018 < 25]
#25-29
# between() includes both endpoints, i.e. 25 <= age <= 29
Age2529_Min_16plays_2018 = non_kickers_Punters_Qbs_2018.loc[player_ages_2018.between(25, 29)]
#30 and over
Age30plus_Min_16plays_2018 = non_kickers_Punters_Qbs_2018.loc[player_ages_2018 >= 30]


## Visualizations

In [None]:
#BPA vs. Tackle Rates Segmented by age
sns.set_theme(color_codes=True)
ax = sns.regplot(x="tackle_rate", y="ball_proximity_avg", data=AgeU25_Min_16plays_2018)

# NOTE(review): plt is not imported in this cell -- presumably matplotlib.pyplot
# is imported as plt earlier in the notebook; confirm.
# One scatter plot per (age group, rate column), each on its own 8x5 figure.
scatter_panels = [
    (AgeU25_Min_16plays_2018, 'tackle_rate', 'BPA vs. Tackle Rate for Players under 25'),
    (AgeU25_Min_16plays_2018, 'touch_rate', 'BPA vs. Touch Rate for Players under 25'),
    (Age2529_Min_16plays_2018, 'tackle_rate', 'BPA vs. Tackle Rate for Players 25-29'),
    (Age2529_Min_16plays_2018, 'touch_rate', 'BPA vs. Touch Rate for Players 25-29'),
    (Age30plus_Min_16plays_2018, 'tackle_rate', 'BPA vs. Tackle Rate for Players 30 and over'),
    (Age30plus_Min_16plays_2018, 'touch_rate', 'BPA vs. Touch Rate for Players 30 and over'),
]
for panel_data, rate_column, panel_title in scatter_panels:
    plt.figure(figsize=(8,5))
    sns.scatterplot(data=panel_data, x=rate_column, y='ball_proximity_avg').set(title=panel_title)

In [None]:
# Mean BPA for each age_2018 value, drawn as a line plot.
mean_bpa_by_age_2018 = final_merged_BPA_stats_2018.groupby('age_2018')['ball_proximity_avg'].mean()
mean_bpa_by_age_2018.plot(kind='line')

In [None]:
# Mean BPA per Position.
position_BPA = final_merged_BPA_stats_2018.groupby('Position')['ball_proximity_avg'].mean()
# Build the indexed frame before it is used; sorting it first raised a
# NameError on a fresh run because the name did not exist yet.
indexed_position_BPA = position_BPA.reset_index()
# sort_values returns a new frame. As the cell's last expression it is what
# gets displayed; indexed_position_BPA itself is left unsorted by BPA.
indexed_position_BPA.sort_values('ball_proximity_avg')


In [None]:
#Average BPA by college sorted ascending
# Mean BPA per collegeName, kept as a Series indexed by college.
college_BPA = final_merged_BPA_stats_2018.groupby('collegeName')['ball_proximity_avg'].mean()
# The earlier bare reset_index() call is gone: it returned a new frame that was
# never stored, so it had no effect. sort_values() sorts ascending by default.
college_BPA_sorted = college_BPA.sort_values()
college_BPA_sorted.head(40)