In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import matplotlib.patches as patches
# Apply seaborn's "whitegrid" plot style.
sns.set_style("whitegrid")

# Data:
There are three files provided in the dataset, as described below:

1. **Injury Record:** The injury record file in .csv format contains information on 105 lower-limb injuries that occurred during regular season games over the two seasons. Injuries can be linked to specific records in a player history using the PlayerKey, GameID, and PlayKey fields.

2. **Play List:** The play list file contains the details for the 267,005 player-plays that make up the dataset. Each play is indexed by PlayerKey, GameID, and PlayKey fields. Details about the game and play include the player’s assigned roster position, stadium type, field type, weather, play type, position for the play, and position group.

3. **Player Track Data:** player level data that describes the location, orientation, speed, and direction of each player during a play recorded at 10 Hz (i.e. 10 observations recorded per second).


In [None]:
# Read the input files.
# The dataset directory is named once so it can be repointed in a single place.
DATA_DIR = '../input/nfl-playing-surface-analytics'
playlist = pd.read_csv(DATA_DIR + '/PlayList.csv')
inj = pd.read_csv(DATA_DIR + '/InjuryRecord.csv')
trk = pd.read_csv(DATA_DIR + '/PlayerTrackData.csv')

## Injury Data
First, let's look at the injury data. It's a fairly small file with only 105 injury plays. I notice that many of the rows for the injury plays do not show `PlayerKey`, `GameID`, etc. I'm not sure if this is a bug or intentionally done.

- PlayerKey, GameID, PlayKey
- BodyPart
- Surface
- DM_M1, DM_M7, DM_M28, DM_M42 - One-hot encoding of the number of days missed due to the injury

In [None]:
# Number of recorded injuries on each field surface.
# `size()` counts rows, so an injury is still counted when one of its key
# columns is empty (`count()['PlayerKey']` would skip rows with no PlayerKey).
inj.groupby('Surface').size() \
    .sort_values() \
    .plot(kind='barh', figsize=(15, 5), title='Count of injuries by Field Surface', color='orange')
plt.show()

In [None]:
# NOTE: disabled experiment, kept for reference only (nothing in this cell runs).
# It mapped BodyPart and Surface to integer codes, computed a correlation matrix
# over those codes plus the DM_M* columns, and drew the matrix as a heatmap.
# injuries_encoded = inj[['BodyPart', 'Surface','DM_M1','DM_M7','DM_M28','DM_M42']]
# body_parts_map = {'Heel': 2, 'Foot': 3, 'Toes': 4, 'Ankle': 1, 'Knee': 0}
# surface_map = {'Synthetic': 0, 'Natural': 1}
# injuries_encoded['BodyPart'] = injuries_encoded['BodyPart'].map(body_parts_map)
# injuries_encoded['Surface'] = injuries_encoded['Surface'].map(surface_map)
# coefs = np.corrcoef(injuries_encoded, rowvar=False)

# from matplotlib import cm as cm
# fig = plt.figure()
# ax1 = fig.add_subplot(111)
# cmap = cm.get_cmap('jet', 30)
# cax = ax1.imshow(coefs, interpolation="nearest", cmap=cmap)
# fig.colorbar(cax, ticks=[.75,.8,.85,.90,.95,1])
# ax1.grid(True)

In [None]:
# Compare which body parts are injured on each surface: one panel per surface.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, surface in zip(axes, ['Natural', 'Synthetic']):
    # `size()` counts injury rows directly instead of non-null PlayerKey values.
    inj[inj['Surface'] == surface].groupby('BodyPart').size() \
        .sort_values() \
        .plot(kind='bar', title='Count of injuries by Body Part - ' + surface, ax=ax)
# Despite their names these are the two Axes (what `.plot(ax=...)` returns);
# the names are kept in case another cell refers to them.
fig_1, fig_2 = axes
plt.show()

Distribution of injured body parts against the surface type. We can see that ankle injuries are more highly represented on synthetic turf.

In [None]:
def encode_missed_days(row):
    """Collapse the ``DM_M*`` indicator columns of one injury row into one number.

    The thresholds are checked from largest to smallest, so the value returned
    is the largest threshold whose flag is set.

    Args:
        row: Mapping-like record (for example a DataFrame row) with the keys
            'DM_M1', 'DM_M7', 'DM_M28' and 'DM_M42'.

    Returns:
        42, 28, 7 or 1 for the largest flagged threshold, or None when no
        flag is set.
    """
    # Ordered largest-first so the first match is the longest absence.
    thresholds = (('DM_M42', 42), ('DM_M28', 28), ('DM_M7', 7), ('DM_M1', 1))
    for column, days in thresholds:
        # Compare against 1 rather than relying on truthiness: a missing value
        # (NaN) is truthy in Python and would otherwise be reported as flagged.
        if row[column] == 1:
            return days
    return None

In [None]:
# Work on an explicit copy so that adding a column does not write into a
# filtered slice of `inj` (which triggers pandas' SettingWithCopyWarning).
ankle_injuries = inj[inj['BodyPart'] == 'Ankle'].copy()
ankle_injuries['DaysMissed'] = ankle_injuries.apply(encode_missed_days, axis=1)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, surface in zip(axes, ['Natural', 'Synthetic']):
    # Show every days-missed threshold, in ascending order, on both panels
    # (0 where a surface has no such injuries) so the charts line up bar for bar.
    ankle_injuries[ankle_injuries['Surface'] == surface].groupby('DaysMissed').size() \
        .reindex([1, 7, 28, 42], fill_value=0) \
        .plot(kind='bar', title='Count of ankle injuries by days missed - ' + surface, ax=ax)
# Despite their names these are the two Axes (what `.plot(ax=...)` returns);
# the names are kept in case another cell refers to them.
fig_1, fig_2 = axes
plt.show()


Long absences (the top end of the days-missed scale) following an ankle injury are far more common on synthetic turf than on natural turf.

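This suggests that synthetic turf is associated with ankle injuries at a higher rate and that those ankle injuries are more severe (harder to recover from). Note that these are raw injury counts, not yet normalized by the number of plays on each surface. More exploration to follow...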