In [None]:
# Inject a notebook-wide stylesheet so the HTML narrative cells below can use
# shared CSS classes (e.g. header1/header2/header3, para, flex-columns, flex-container).
from IPython.display import HTML
# Raw <style> element; the string is handed to IPython's HTML display object unchanged.
style = """
<style>
    .header1 { font-family:'Arial';font-size:30px; color:Black; font-weight:800;}
    .header2 { 
        font-family:'Arial';
        font-size:18px; 
        color:Black; 
        font-weight:600;
        border-bottom: 1px solid; 
        margin-bottom: 8px;
        margin-top: 8px;
        width: 100%;
        
    }
    .header3 { font-family:'Arial';font-size:16px; color:Black; font-weight:600;}
    .para { font-family:'Arial';font-size:14px; color:Black;}
    .flex-columns {
        display: flex;
        flex-direction: row;
        flex-wrap: wrap;
    }
    .flex-container {
         padding: 20px;
    }
    
    .flex-container-large {
         padding: 20px;
         max-width: 40%;
    }
    
    .flex-container-small {
         padding: 20px;
         max-width: 17.5%;
    }
    
    .list-items {
        margin: 10px;
    }
    
    .list-items li {
        color: #3692CC;
        font-weight: 500;
    }
</style>
"""
# Last expression of the cell: the HTML object becomes the cell output, which
# is how the stylesheet reaches the rendered notebook page.
HTML(style)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import numpy as np
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

<div class="header1"> NFL Punt Analytics Competition </div>
<div class="header2">Goal: Reduce probability of concussions during punt play</div>
<div class="header3">What is punt play?</div>
<div class="para">A punt is a kick where the ball must not hit the ground before the kick. It is typically performed on the final down if a team does not believe they can cover the necessary distance to touchdown or to score a field goal. Thus it is used to deny the opposition additional ground on turnover. A punter and a punt receiver are specialised roles on either team whose responsibility it is to kick / receive the ball and place it as close as possible to the opponent's endzone before the turnover.
</div>
<div class="header3">Before You Read</div>
<div class="para">All code cells have been hidden for the purposes of clarity of findings; you can easily reveal these code cells by clicking (...) throughout this notebook. This notebook details my entire exploration process. If you are judging this competition I highly recommend looking at the slide deck / presentation to get an overview of my findings before diving into the details below!
</div>
<div class="header2">Data Import / Cleaning</div>
<div class="para">
 Below we read in the game_data, video_review and play_information data sets in order to examine links between match data and concussions
</div>

In [None]:
# Game Data & Video Review Data
# Every video-review row is paired with the game-level attributes of its game.
game_data = pd.read_csv('../input/game_data.csv')
video_review = pd.read_csv('../input/video_review.csv')
concussions_and_game_data = video_review.merge(right=game_data, how='inner', on='GameKey')

# Play Information Data Cleaning / Engineering
play_information = pd.read_csv('../input/play_information.csv')
# Take an explicit copy: the columns added below must land on an independent
# frame rather than on a filtered view of `play_information` (assigning onto the
# bare slice is what triggers pandas' SettingWithCopyWarning).
play_information_punt_only = play_information[play_information['Play_Type'] == 'Punt'].copy()

# 'YardLine' is split on the space into two parts: a team label and a yard number.
play_information_punt_only[['TeamYardLine', 'YardLineDist']] = (
    play_information_punt_only['YardLine'].str.split(' ', expand=True)
)
# 'Home_Team_Visit_Team' is split on '-' into the two team labels.
play_information_punt_only[['HomeTeam', 'VisitTeam']] = (
    play_information_punt_only['Home_Team_Visit_Team'].str.split('-', expand=True)
)
# 'Score_Home_Visiting' is stripped, split on '-' and cast to integers.
play_information_punt_only[['ScoreHome', 'ScoreVisit']] = (
    play_information_punt_only['Score_Home_Visiting']
    .str.strip()
    .str.split('-', expand=True)
    .astype('int64')
)
play_information_punt_only['YardLineDist'] = play_information_punt_only['YardLineDist'].astype('int64')

# [Assumption] Score difference is more important than the actual scores
# This is logically what creates a pressured environment on the pitch
# [Hypothesis] There exists certain score differences where concussions are more likely
# Signed on purpose (home minus visitor) so a home/away skew stays visible.
play_information_punt_only['ScoreDifference'] = (
    play_information_punt_only['ScoreHome'] - play_information_punt_only['ScoreVisit']
)

# Combining play information with data on concussions
# Inner join: only punt plays that also appear in the video review survive.
play_concussion_data = play_information_punt_only.merge(
    right=concussions_and_game_data, how='inner', on=['GameKey', 'PlayID']
)

# Subsets for the two impact types examined in the exploration cells
helmet_to_body = play_concussion_data[play_concussion_data['Primary_Impact_Type'] == 'Helmet-to-body']
helmet_to_helmet = play_concussion_data[play_concussion_data['Primary_Impact_Type'] == 'Helmet-to-helmet']

# Combining play, game_data and player roles during each concussion
player_role = pd.read_csv('../input/play_player_role_data.csv')
play_concussion_data_role = play_concussion_data.merge(
    right=player_role, how='inner', on=['Season_Year', 'GameKey', 'PlayID', 'GSISID']
)

<div class="header2">Initial Data Exploration</div>
<div class="para">
 In this section we will explore some of the following key identified features from the dataset.
</div>

In [None]:
# Two side-by-side horizontal bar charts:
#   left  - number of video-review rows per primary impact type
#   right - concussed-player rows per role, stacked by quarter
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
ax0.set_title('Frequency of concussions by Type')
ax1.set_title('Frequency of concussions by Player Role')

# Left panel: non-null 'Season_Year' entries counted per impact type.
impact_type_counts = video_review.groupby('Primary_Impact_Type')['Season_Year'].count()
impact_type_counts.plot(kind='barh', ax=ax0)

# Right panel: Role x Quarter count table, ordered by the Q1..Q4 columns
# (descending) before being drawn as stacked bars.
role_by_quarter = play_concussion_data_role.pivot_table(
    index='Role',
    columns='Quarter',
    values='GameKey',
    aggfunc='count',
    fill_value=0,
)
role_by_quarter = role_by_quarter.sort_values(by=[1, 2, 3, 4], ascending=False)
role_by_quarter.plot(kind='barh', stacked=True, ax=ax1)

<div class="header2">Overview</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Types of concussion</div>
        <div class="para">
        The first thing to do is to examine how frequently each type of concussion occurs. As you can see above almost all concussions fall into either helmet to helmet or helmet to body categories. This makes sense as it is difficult to see how a concussion could occur if the first contact was not with the body / helmet! We will focus specifically on these two types during initial data exploration.
        </div>
    </div>
    <div class="flex-container">
        <div class="header3">Roles with Concussions</div>
        <div class="para">
        Above, the frequency of concussion is shown by player role (for all roles which received concussions). It is immediately evident that some roles such as punt receiver, left guard and right guard receive more concussions than any other role. This is clearly an important statistic, as it suggests some roles are potentially more vulnerable to concussions or have a play style which leads to these events with higher probability.
        </div>
    </div>
</div>
<div class="header2">Feature Suitability</div>
<div class="para">
        There is evidence to suggest a relationship between player role and concussions; this feature will likely be included in any model built.
</div>

In [None]:
# Distribution of the signed score difference (home - visitor) on concussion
# plays: all concussions on the left, split by impact type on the right.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
fig.suptitle('Frequency of concussions (and types thereof) over score difference')

# Each distribution carries its own label and legend() is called without
# arguments. Passing a bare list of labels binds them positionally to whichever
# artists the axes lists first, which depends on the matplotlib version and can
# attach both labels to bars of the same histogram.

# Score Difference Between Teams | Concussions
sns.distplot(play_concussion_data['ScoreDifference'], bins=6, ax=ax0, label='Concussions')
ax0.legend()
ax0.set_xlim((-40, 40))

# Score Difference Between Teams | Concussions [Type]
sns.distplot(helmet_to_body['ScoreDifference'], bins=6, ax=ax1, label='Helmet to Body')
sns.distplot(helmet_to_helmet['ScoreDifference'], bins=6, ax=ax1, label='Helmet to helmet')
ax1.legend()

<div class="header2">Description</div>
<div class="para">
        The above graphs show the relative frequency of concussions over score differences. Score differences are calculated as the Home Team Score - Visit Team Score. I do not take absolute values in the event that there is skew towards home / away team.
        </div>
<div class="header2">Interpretation</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Overall Concussions</div>
        <div class="para">
        Overall concussions seem to be concentrated around a zero score difference with slight negative skew. At a glance this would suggest that plays with narrower score differences are likely to have more concussions. However we must also consider the possibility that there are fewer games with wider score differences.
        </div>
    </div>
    <div class="flex-container">
        <div class="header3">Types of Concussions</div>
        <div class="para">
        Helmet to helmet has a non-zero mean, whereas helmet to body seems to have a zero mean and negative skew. This would suggest that the conditions under which the two major types of collisions occur may be different. Notice that there are two peaks for helmet to helmet concussions that appear to be closer to +/- 20 than to 0 score difference. This would suggest, unlike our previous conclusion of zero-score-centred concussions, that there may indeed be certain critical score differences which put additional mental stress on players and lead to increased probabilities of concussion.
        </div>
    </div>
</div>
<div class="header2">Feature Suitability</div>
<div class="para">
        There is evidence to suggest a relationship between score difference and concussions; this feature will likely be included in any model built.
</div>

In [None]:
# Distribution of the yard-line number at which concussion plays started:
# all concussions on the left, split by impact type on the right.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# Title typo fixed ("distace" -> "distance").
fig.suptitle('Frequency of concussions (and types thereof) over distance from endzone (in yards)')

# Labels are attached to the plotted distributions themselves so the legend
# cannot be mis-assigned by artist ordering (a bare label list is positional).

# Yard Distance From Endzone | Concussions
sns.distplot(play_concussion_data['YardLineDist'], ax=ax0, label='Concussions')
ax0.legend()
ax0.set_xlim((0, 50))

# Yard Distance From Endzone | Concussions [Type]
sns.distplot(helmet_to_body['YardLineDist'], ax=ax1, label='Helmet to Body')
sns.distplot(helmet_to_helmet['YardLineDist'], ax=ax1, label='Helmet to helmet')
ax1.legend()

<div class="header2">Description</div>
<div class="para">
        The above graphs show the relative frequency of concussions over yard line distances. Yard line distance has been extracted from the play_information dataset through the data cleaning process in the code above. It measures the distance from the team's respective endzone at which punt play began (i.e. the line of scrimmage). Unfortunately, since the data available for individual players does not indicate which team they play for, we are unable to find out which end zone the yard line distance refers to with respect to the concussed player.
        </div>
<div class="header2">Interpretation</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Overall Concussions</div>
        <div class="para">
        Overall concussions seem to be concentrated between 20 and 33 yards from the endzone, with smaller peaks at 10 and 50 yards. This makes sense as punts are often received between 20 - 40 yards and then the punt receiver either advances forwards towards the 50 yard line or is forced backwards by the offensive team towards his own endzone, explaining the peak at the 10 yard mark.
        </div>
    </div>
    <div class="flex-container">
        <div class="header3">Types of Concussions</div>
        <div class="para">
        There are several clear peaks in concussions at 10, 25 and 47 yards from the endzone. Similarly to score difference, we see a gap between the peaks for the two types of concussions, further reinforcing the hypothesis that these two types of concussions have different underlying mechanics driving their occurrences.
        </div>
    </div>
</div>
<div class="header2">Feature Suitability</div>
<div class="para">
        There is evidence to suggest a relationship between yard line distance and concussions. Clearly there are certain critical regions in which concussions are likely to occur, as they represent characteristic turning points in play.
</div>

In [None]:
# Quarter Distribution | Concussions [Type]
# Both impact types are overlaid on one explicitly created axes. The original
# relied on the implicit current figure and bound a second name to the same axes.
fig, ax1 = plt.subplots()
# Explicit labels keep each legend entry tied to its own distribution; a bare
# list passed to legend() is matched positionally to the first artists on the
# axes, which is not guaranteed to be one artist per distribution.
sns.distplot(helmet_to_body['Quarter'], ax=ax1, label='Helmet to Body')
sns.distplot(helmet_to_helmet['Quarter'], ax=ax1, label='Helmet to helmet')
ax1.legend()
ax1.set_title('Quarter distribution of concussion impacts')

<div class="header2">Description</div>
<div class="para">
        The above graphs show the relative frequency of concussions over different quarters of play, Q5 being overtime.
        </div>
<div class="header2">Interpretation</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Types of Concussions</div>
        <div class="para">
        Both types of concussions peak in the 3rd quarter, but helmet to helmet concussions have a secondary peak in the 2nd quarter. The distribution towards the extremes is quite similar for both types of concussion
        </div>
    </div>
</div>
<div class="header2">Feature Suitability</div>
<div class="para">
        This feature is not as useful as some of the others, as there is no obvious logical explanation for why more concussions should happen in the second or third quarter. Considering that all it tells us is that most concussions happen in 2 out of 4 quarters, it doesn't seem particularly helpful to add 4 extra one-hot encoded features to a model for limited additional explanatory power.
</div>

In [None]:
# Temperature distributions: all games vs. concussion rows on the left,
# concussions split by impact type on the right.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# Title typo fixed ("tempeature" -> "temperature").
fig.suptitle('Frequency of concussions (and types thereof) over temperature (F)')

# Missing temperatures are dropped from every series, not only the all-games
# one, so a single NaN cannot break the histogram / KDE of the smaller samples.
# Labels are set per distribution and legend() is called bare, so entries
# cannot be mis-assigned by artist ordering.

# Temperature | Concussions
sns.distplot(game_data['Temperature'].dropna(), color='b', bins=20, ax=ax0, label='All Games')
sns.distplot(concussions_and_game_data['Temperature'].dropna(), color='r', bins=10, ax=ax0,
             label='Concussions')
ax0.legend()
ax0.set_xlim((0, 100))

# Temperature | Concussions [Type]
sns.distplot(helmet_to_body['Temperature'].dropna(), color='orange', bins=10, ax=ax1,
             label='Helmet to body')
sns.distplot(helmet_to_helmet['Temperature'].dropna(), color='teal', bins=6, ax=ax1,
             label='Helmet to helmet')
ax1.legend()

<div class="header2">Description</div>
<div class="para">
        The above graphs show the relative frequency of concussions over different temperatures (Fahrenheit).
        </div>
<div class="header2">Interpretation</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Overall Concussions</div>
        <div class="para">
        Concussions tend to occur at a rate that exceeds the number of games being played at more extreme temperatures. This logically makes sense, as there might be additional stress on the players in colder temperatures.
        We can see this where the red line is above the blue line. The relative rate of concussions tends to fall relative to the number of games being played in normal temperatures (60 - 90F)
        </div>
    </div>
    <div class="flex-container">
        <div class="header3">Types of Concussions</div>
        <div class="para">
        Helmet to helmet concussions for some reason tend to occur at lower temperatures than helmet to body collisions. This is evident from the peak in the green line being at a lower temperature than the orange line. This is possibly a spurious correlation.
        </div>
    </div>
</div>
<div class="header2">Feature Suitability</div>
<div class="para">
        There is evidence to suggest a relationship between temperature and concussions, given the rate of concussions increases relative to games at temperatures less than 60F and greater than 90F. This could be used as a feature in the final model.
</div>

<div class="header2">Utilising Next Gen Stats</div>
<div class="para">
 It would be logical to assume that players who receive concussions are travelling at high velocity, giving them higher momentum and colliding with greater force.
</div>

In [None]:
# Key columns identifying one player within one play, and one play.
player_play_keys = ['Season_Year', 'GameKey', 'PlayID', 'GSISID']
play_keys = ['Season_Year', 'GameKey', 'PlayID']


def load_ngs_season(csv_paths):
    """Load Next Gen Stats tracking files and derive a per-sample velocity.

    Parameters
    ----------
    csv_paths : list of str
        Paths of the NGS CSV files to stack, in the order given.

    Returns
    -------
    pandas.DataFrame
        The stacked tracking rows without the 'Event' column and with three
        added columns: 'TimeDelta' (seconds since the same player's previous
        sample in the same play), 'dis_metres' ('dis' divided by 1.094;
        presumably a yards-to-metres conversion - TODO confirm the unit of
        'dis') and 'Velocity' ('dis_metres' / 'TimeDelta'). Only rows with a
        strictly positive 'TimeDelta' are returned.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tracking = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)
    tracking['Time'] = pd.to_datetime(tracking['Time'])
    tracking = tracking.drop(columns='Event')

    # The time step must be measured between consecutive samples of the SAME
    # player in the SAME play. A plain Series.diff() over file order subtracts
    # timestamps of unrelated rows, so sort and diff within each group instead.
    tracking = tracking.sort_values(player_play_keys + ['Time'])
    tracking['TimeDelta'] = (
        tracking.groupby(player_play_keys)['Time'].diff().dt.total_seconds()
    )
    tracking['dis_metres'] = tracking['dis'] / 1.094
    tracking['Velocity'] = tracking['dis_metres'] / tracking['TimeDelta']

    # Drops each group's first sample (no predecessor -> NaN) and any repeated
    # timestamps (zero step), which would otherwise give undefined velocities.
    return tracking[tracking['TimeDelta'] > 0.0]


# Label player role data by concussion and impact type
players_concussed = video_review.set_index(player_play_keys)
players_concussed['Concussed'] = 1
players_concussed = pd.DataFrame(players_concussed[['Concussed', 'Primary_Impact_Type']])

# Left join keeps every player/play row; rows absent from the video review get
# impact type 'None' and Concussed 0 through the two fills below.
player_data_NGS = player_role.set_index(player_play_keys)
player_data_NGS = player_data_NGS.merge(right=players_concussed, how='left', left_index=True, right_index=True)
player_data_NGS['Primary_Impact_Type'] = player_data_NGS['Primary_Impact_Type'].fillna('None')
player_data_NGS = player_data_NGS.fillna(0)

# Load NGS 2017 and calculate velocity
NGS_2017_reg = load_ngs_season([
    '../input/NGS-2017-reg-wk1-6.csv',
    '../input/NGS-2017-reg-wk7-12.csv',
    '../input/NGS-2017-reg-wk13-17.csv',
])

# Load NGS 2016 and calculate velocity
NGS_2016_reg = load_ngs_season([
    '../input/NGS-2016-reg-wk1-6.csv',
    '../input/NGS-2016-reg-wk7-12.csv',
    '../input/NGS-2016-reg-wk13-17.csv',
])

# Merge 2016 and 2017
NGS_reg = pd.concat([NGS_2016_reg, NGS_2017_reg], ignore_index=True)

NGS_group_by_play_player = NGS_reg.groupby(by=player_play_keys)

# Calculate every players velocity by play and update their Next Gen Stats
# Only the needed column is aggregated instead of averaging every column.
velocity_by_play = NGS_group_by_play_player['Velocity'].mean()
# Assignment aligns on the shared (Season_Year, GameKey, PlayID, GSISID) index.
player_data_NGS['Velocity'] = velocity_by_play
# Players without tracking data receive the overall mean velocity.
player_data_NGS['Velocity'] = player_data_NGS['Velocity'].fillna(player_data_NGS['Velocity'].mean())

# Calculate the duration of each play and pass that information to player NGS
# Duration = last minus first retained timestamp per player per play, in seconds.
duration_by_play = (
    NGS_group_by_play_player['Time'].max() - NGS_group_by_play_player['Time'].min()
).dt.total_seconds()
player_data_NGS['Play_Duration'] = duration_by_play

# Clean Player Data Values
# dropna removes rows still missing a value (e.g. no Play_Duration); the two
# velocity bounds keep only rows with 1e-4 < Velocity < 6.
player_data_NGS = player_data_NGS.dropna()
player_data_NGS = player_data_NGS[player_data_NGS['Velocity'] < 6]
player_data_NGS = player_data_NGS[player_data_NGS['Velocity'] > 10**-4]

# Transform play information data for NGS format
play_data_NGS = play_information_punt_only.set_index(play_keys)
# Drop any '_y' merge-suffix columns and strip '_x' from the remaining names.
play_data_NGS = play_data_NGS[[c for c in play_data_NGS.columns if c[-2:] != '_y']]
play_data_NGS.columns = play_data_NGS.columns.str.replace('_x', '')

# Merge NGS Play Data and NGS Player Data
play_player_data_NGS = play_data_NGS.merge(
    player_data_NGS.reset_index().set_index(play_keys),
    how='left', left_index=True, right_index=True)
play_player_data_NGS = play_player_data_NGS.reset_index().set_index(player_play_keys)
play_player_data_NGS = play_player_data_NGS.dropna()

# Format Game Data for NGS
game_data_NGS = game_data.set_index(['Season_Year', 'GameKey'])

# Merge Play, Player, Game Data NGS
mod_play_player_data_NGS = play_player_data_NGS.reset_index().set_index(['Season_Year', 'GameKey'])
play_player_game_data_NGS = mod_play_player_data_NGS.merge(right=game_data_NGS, how='left', left_index=True, right_index=True)
play_player_game_data_NGS = play_player_game_data_NGS.reset_index().set_index(player_play_keys)

In [None]:
# Left: play-duration distribution for all player/play groups vs. concussed players.
# Right: average-velocity distribution for all players vs. concussed players.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
ax0.set_title('Frequency of Play / Down Duration')
ax1.set_title('Frequency of Player Avg Velocity')

# Rows belonging to the concussed player on each concussion play.
concussed_rows = play_player_game_data_NGS[play_player_game_data_NGS['Concussed'] == 1]
duration_by_concussion_play = concussed_rows['Play_Duration']

# Every artist is labelled where it is created and legend() is called without
# arguments. The previous label list was matched positionally to the first
# artists on the axes, so the 'Mean ...' labels could never land on the dashed
# mean lines, which are added last.
sns.distplot(duration_by_play, ax=ax0, label='All Plays')
sns.distplot(duration_by_concussion_play, ax=ax0, label='Concussion Plays')
ax0.axvline(duration_by_play.mean(), color='blue', linestyle='dashed', linewidth=1.2,
            label='Mean Duration')
ax0.axvline(duration_by_concussion_play.mean(), color='orange', linestyle='dashed', linewidth=1.2,
            label='Mean Concussion\nDuration')
# Text annotations sit slightly to the right of each mean line.
ax0.annotate("%.2f s" % round(duration_by_play.mean(), 2), (duration_by_play.mean() + 5, 0.06))
ax0.annotate("%.2f s" % round(duration_by_concussion_play.mean(), 2),
             (duration_by_concussion_play.mean() + 5, 0.04))

ax0.legend()
ax0.set_xlim((0, 175))
ax0.set_xlabel("Time / s")

play_velocity = play_player_game_data_NGS['Velocity']
play_concussed_velocity = concussed_rows['Velocity']

sns.distplot(play_velocity, ax=ax1, label='All Plays')
sns.distplot(play_concussed_velocity, ax=ax1, label='Concussion Plays')
ax1.axvline(play_velocity.mean(), color='blue', linestyle='dashed', linewidth=1.2,
            label='Mean Velocity')
ax1.axvline(play_concussed_velocity.mean(), color='orange', linestyle='dashed', linewidth=1.2,
            label='Mean Concussion\nVelocity')
ax1.legend()
ax1.annotate("%.2f m/s" % round(play_velocity.mean(), 2), (play_velocity.mean() + 0.1, 2.0))
ax1.annotate("%.2f m/s" % round(play_concussed_velocity.mean(), 2),
             (play_concussed_velocity.mean() + 0.1, 2.5))
ax1.set_xlim((0, 6))
ax1.set_xlabel("Velocity / m/s")

<div class="header2">Description</div>
<div class="para">
        Velocity is calculated as the average velocity of a player over a single play. Similarly, duration is calculated over a single play as the max - min time on the clock.
        </div>
<div class="header2">Interpretation</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Duration and Concussions</div>
        <div class="para">
        Plays involving concussions have a mean play duration 13 seconds greater than that of the average play duration. This is interesting and something we should keep in mind, as perhaps it gives tackling players a chance to build up more speed before impact.
        </div>
    </div>
    <div class="flex-container">
        <div class="header3">Velocity and concussion</div>
        <div class="para">
        Interestingly, on average, players who were concussed were travelling at lower speeds than the average player on the pitch. This is especially surprising as you would expect the opposite. We will study this further when we look at the video replays; it is possible that their opponent was travelling at high speed.
        </div>
    </div>
</div>
<div class="header2">Feature Suitability</div>
<div class="para">
        There is evidence to suggest that both duration of the play and the velocity of the player are important factors to consider in our model
</div>

In [None]:
# Keep only the columns carried forward as model features / labels and write
# them to disk for the follow-up notebook.
model_feature_columns = [
    'YardLineDist',
    'ScoreDifference',
    'Role',
    'Temperature',
    'Quarter',
    'Velocity',
    'Concussed',
    'Primary_Impact_Type',
    'Play_Duration',
]
data_selected = play_player_game_data_NGS.loc[:, model_feature_columns]
data_selected.to_csv('data_selected.csv')

<div class="header2">End of part 1</div>
<div class="para">
        Now that we have done a thorough examination of the model features, we will move on to building
    a predictive model to validate our findings and determine a rule to reduce concussions in part 2.
        </div>