In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

**An Analysis of injury factors in NFL**


The main purpose of this analysis is to find out which factors cause injuries to NFL athletes, using the data provided. It looks not only at the effect of synthetic versus natural turf, but also at a range of other factors, in order to help prevent injuries in advance.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

# Read the three competition tables (injury records, play list, player tracking) into DataFrames.
injury_df, player_df, player_track_info_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nfl-playing-surface-analytics/InjuryRecord.csv',
        '/kaggle/input/nfl-playing-surface-analytics/PlayList.csv',
        '/kaggle/input/nfl-playing-surface-analytics/PlayerTrackData.csv',
    )
)
# Show the (rows, columns) shape of each table as the cell output.
tuple(table.shape for table in (injury_df, player_df, player_track_info_df))

In [None]:
# Lower-case the column names of all three tables so later cells can use a single spelling.
for table in (injury_df, player_df, player_track_info_df):
    table.columns = table.columns.str.lower()

**_EDA_**

In [None]:
# Count the missing values per column in each of the three tables (injury, play list, tracking).
tuple(table.isnull().sum() for table in (injury_df, player_df, player_track_info_df))

We have only 105 injury records. I think this is not enough data to design an effective predictive model. Still, we can go over the data to see which factors have a negative or positive effect on injuries, even if the effect is not significant. There are 28 missing values in playkey; these will be filled with values from the PlayList table. For the missing values in the other tables, I chose not to fill them in, because most of them are categorical and there is not enough evidence to estimate them.

In [None]:
# Preview the first five rows of the injury table.
injury_df.head(n=5)

In [None]:
# Fill in the missing `playkey` values of the injury table.
# An injury row without a play key receives the key found in the LAST play-list row of the
# same game (assumes PlayList rows are stored in play order within a game -- TODO confirm).
gameid_no_playkey = injury_df.loc[injury_df['playkey'].isnull(), 'gameid']

# Build a gameid -> last playkey lookup for just the affected games.
# The column is addressed by label instead of the positional index 2, the play list is
# filtered once instead of twice per game, and no loop variable named `id` is left behind
# to shadow the builtin for the rest of the notebook.
last_playkey_by_game = (
    player_df.loc[player_df['gameid'].isin(gameid_no_playkey), ['gameid', 'playkey']]
    .drop_duplicates('gameid', keep='last')
    .set_index('gameid')['playkey']
)

# The mapped Series keeps the injury-table index, so the assignment aligns row by row.
# A game that is absent from the play list stays NaN instead of raising an IndexError.
filled_playkeys = gameid_no_playkey.map(last_playkey_by_game)
playkey_ = filled_playkeys.tolist()
injury_df.loc[filled_playkeys.index, 'playkey'] = filled_playkeys

# Re-check the missing-value counts after the fill.
injury_df.isnull().sum()

Injury bodypart Frequency Check

In [None]:
# Bar chart of the number of injuries per body part, most frequent first,
# with the count written at the top of each bar.
bodypart_counts = injury_df.groupby('bodypart')['playerkey'].count().sort_values(ascending=False)
bodypart_counts.plot(kind='bar', figsize=(15,5), title="# of injuries by BodyPart")
for bar_position, injury_count in enumerate(bodypart_counts):
    plt.text(bar_position, injury_count, str(injury_count))

plt.show()

We can easily see that the knee and the ankle are injured most often. Let's look at which field type is associated with more injuries.

In [None]:
# Bar chart of the number of injuries per playing surface, most frequent first,
# with the count written at the top of each bar.
surface_counts = injury_df.groupby('surface')['playerkey'].count().sort_values(ascending=False)
surface_counts.plot(kind='bar', figsize=(6,6), title="# of injuries on different fieldtype")
for bar_position, injury_count in enumerate(surface_counts):
    plt.text(bar_position, injury_count, str(injury_count))
plt.show()

We can find out that injuries happen on Synthetic turf more times than natural turf. Let's see the percentage of body part injured on different field types. 

In [None]:
# Injured body parts per surface: raw counts and percentage of each surface's injuries.
pd.set_option('display.float_format', '{:.2f}'.format)

# Injury counts per (surface, body part) and per surface.
previous_r = injury_df.groupby(['surface','bodypart'])['playerkey'].count()
previous_s = injury_df.groupby('surface')['playerkey'].count()
# Share of each body part among the injuries of its surface, in percent.
previous_p = previous_r / previous_s * 100

# Draw the counts first and the percentages second, using the same layout for both.
for per_surface in (previous_r, previous_p):
    wide = per_surface.unstack('bodypart')
    wide = wide.T.sort_values('Natural').T   # order the body-part columns by their 'Natural' value
    wide = wide.sort_values('Ankle')         # order the surface rows by their 'Ankle' value
    wide.plot(kind='bar', figsize=(15,5))
previous_p


On both natural and synthetic turf, knee and ankle injuries accounted for about 85% of all injuries. However, foot injuries on natural turf and toe injuries on synthetic turf seem to be more of an environmental effect.


**Determine When to occur**

In [None]:
# Attach the play-list details to every injury (the merge joins on all shared columns).
inj_detailed = injury_df.merge(player_df)

# Number of injuries recorded for each value of `playergame`.
p_d_i = inj_detailed.groupby('playergame')['playerkey'].count()
p_day_inj = p_d_i.to_frame('inj_cnt').reset_index()

# Plot 1: injuries per `playergame` value.
count_ax = p_day_inj.plot(x='playergame', y='inj_cnt', figsize=(12,5))
count_ax.set(ylabel='Number of Injuries', xlabel='number of games', title='Injury Trend')
count_ax.legend(['Injury count'])

# Share of all injuries per `playergame` value, plus running totals of share and count.
p_day_inj['inj_p_gday'] = p_day_inj['inj_cnt'] / p_day_inj['inj_cnt'].sum() * 100
p_day_inj['inj_p_acu'] = p_day_inj['inj_p_gday'].cumsum()
p_day_inj['inj_c_acu'] = p_day_inj['inj_cnt'].cumsum()

# Plot 2: accumulated percentage of injuries.
share_ax = p_day_inj.plot(x='playergame', y='inj_p_acu', figsize=(12,5), color='orange')
share_ax.set(ylabel='Accumulated percentage of Injuries', xlabel='number of games', title='Injury Trend')
share_ax.legend(['Accumulated injury (%)'])

As shown in the results above, about 70 of the 105 injuries occurred within the first 10 games, accounting for about 67% of all injuries.

In [None]:
# Number of distinct players seen at each value of `playergame`.
p_p_d = player_df.groupby(['playergame'])['playerkey'].nunique()
p_p_day = p_p_d.to_frame('nofplayers').reset_index()

# Injuries as a percentage of those players, and the running total of that percentage.
inj_pct_by_time = p_day_inj.merge(p_p_day, on='playergame')
inj_pct_by_time['inj_pct'] = inj_pct_by_time['inj_cnt'] / inj_pct_by_time['nofplayers'] * 100
inj_pct_by_time['inj_pct_player_acu'] = inj_pct_by_time['inj_pct'].cumsum()

# Plot 1: injury percentage per `playergame` value.
pct_ax = inj_pct_by_time.plot('playergame', 'inj_pct', figsize=(12,5))
pct_ax.set(title='injury % in total players', xlabel='number of games', ylabel='injury percentage')
pct_ax.legend(['Percentage / players'])

# Plot 2: accumulated injury percentage.
acu_ax = inj_pct_by_time.plot('playergame', 'inj_pct_player_acu', figsize=(12,5), color='orange')
acu_ax.set(title='Accumulated injury % in total players', xlabel='number of games',
           ylabel='Accumulated injury percentage')
acu_ax.legend(['Accumulated percentage / players'])

The results above show the number of injuries relative to the number of players. In the accumulated graph, about 30 percent of the 250 players were injured within 10 games.


Small result from EDA of Injury data: 
   - Most injuries occurred in ankle and knee
   - More injuries on synthetic turf
   - 10 more ankle injuries on the synthetic turf
   - The percentage of injuries on each field type show similar features
     However, foot injuries on the natural turf and toe injuries on the synthetic turf
     are more concerned
   - about 67% of injuries occurred within 10 games
   - about 30 % of 250 players in 10 games were injured

Combine Injury data and Playerlist data to identify the bodypart of injury according to the type of play

In [None]:
# Attach the play-list details to every injury (the merge joins on all shared columns).
inj_detailed = injury_df.merge(player_df)

# Share of each injured body part within every (surface, play type) combination.
inj_body = inj_detailed.groupby(['surface','playtype','bodypart'])['playerkey'].count().unstack('bodypart')
inj_tot_body = inj_detailed.groupby(['surface','playtype'])['playerkey'].count()
inj_p_body = pd.DataFrame({part: inj_body[part] / inj_tot_body for part in inj_body.columns})

# One stacked bar chart per surface, with one bar per play type.
for turf in ('Natural', 'Synthetic'):
    inj_p_body.loc[turf].plot(kind='bar', figsize=(10,4), title=turf, stacked=True)

plt.show()


What I can see in this result is that, especially on synthetic turf, ankle injuries were more common in the kicking play types.

By the position, injury percentage on each different fieldtype

In [None]:
# For every roster position, the share of its injuries that happened on each surface.
position_surface_counts = (
    inj_detailed.groupby(['rosterposition','surface']).count().unstack('surface')['playerkey']
)
# Normalise within each position so that its surface shares add up to 1.
surface_share = position_surface_counts.T.apply(lambda per_position: per_position / per_position.sum())
# Order by surface, flip back to one row per position, then order the positions by their Natural share.
surface_share = surface_share.sort_values('surface').T.sort_values('Natural')
surface_share.plot(kind='barh', figsize=(15,5), title='injury percentage(%) on each position', stacked=True)
plt.show()

The result of separating the total number of injury occurrences for each position according to the field type is that in the case of cornerbacks, more injuries occurred on synthetic turf.

In [None]:
# For every surface, the share of its injuries that fell on each roster position.
inj_rosterposition = inj_detailed.groupby(['surface','rosterposition'])['playerkey'].count()
inj_surface = inj_detailed.groupby('surface')['playerkey'].count()
inj_ros_sur_p = inj_rosterposition / inj_surface

# Order the position columns by their Natural value and the surface rows by their Cornerback value.
(
    inj_ros_sur_p.unstack('rosterposition')
    .T.sort_values('Natural').T
    .sort_values('Cornerback')
    .plot(kind='bar', figsize=(15,5), stacked=False)
)

However, if you divide injuries by the environment and look at the percentage of injuries caused by the environment, you can see slightly different results. On the Natural turf, especially the Linebacker position has more injuries than on the synthetic turf. This accounts for more than 25% of all injuries on natural turf. On the Synthetic turf, you can see that the injury of a cornerback position occurs noticeably more than that of natural turf. Compared to the result of the natural turf, there is about a 15% difference. How do they play? Let's check their play type.

In [None]:
# Injury counts per play type for Cornerbacks and Linebackers, split by surface.
is_corner_or_linebacker = inj_detailed['rosterposition'].isin(['Cornerback','Linebacker'])
(
    inj_detailed[is_corner_or_linebacker]
    .groupby(['rosterposition','surface','playtype'])['playerkey']
    .count()
    .unstack('playtype')
    .plot(kind='bar', figsize=(10,4), title='Injuries by playtype', stacked=False)
)

The above results show that Cornerbacks were particularly prone to injuries by pass play on synthetic turf. The Linebackers were injured by various plays. But in the case of punts, more injuries were reported on the natural turf. So, where do they get injured? 

In [None]:
# Injured body part by roster position, as a percentage of all injuries on the same surface.
inj_bodypart_freq = inj_detailed.groupby(['surface','rosterposition','bodypart'])['bodypart'].count()


def _pct_of_surface_total(turf):
    """Return the (position, body part) counts of one surface as a percentage of that surface's total."""
    turf_counts = inj_bodypart_freq[turf]
    return turf_counts / turf_counts.sum() * 100


inj_bodypart_pct_nat = _pct_of_surface_total('Natural')
inj_bodypart_pct_syn = _pct_of_surface_total('Synthetic')

# One stacked bar chart per surface: one bar per position, one segment per body part.
for pct_table, chart_title in ((inj_bodypart_pct_nat, 'Natural Turf'), (inj_bodypart_pct_syn, 'Synthetic Turf')):
    pct_table.unstack('bodypart').plot(kind='bar', figsize=(10,4), title=chart_title, stacked=True)
plt.show()

The above results show that on the Synthetic turf, Cornerbacks suffered injuries to their ankle, knee, and foot. Linebackers suffered ankle and knee injuries in both environments. The results also show that athletes in various positions are injured in toes on synthetic turf, regardless of their position. This seems to be caused by environmental factors. Then, how serious are the injuries?

In [None]:
# Count injuries per surface for every combination of the four dm_m* flags and stack them in one bar.
severity_flags = ['dm_m1','dm_m7','dm_m28','dm_m42']
inj_sep = inj_detailed.groupby(['surface'] + severity_flags)['playerkey'].count()
inj_sep.unstack(severity_flags).plot(kind='bar', figsize=(10,4), stacked=True)
# NOTE(review): the labels are applied positionally to the flag combinations in their sorted
# order; this presumes exactly four combinations exist -- confirm against the data.
plt.legend(['1d','7d','28d','42d'])

This result shows that injuries on synthetic turf seem more serious. This is because more cases have been registered as injured for longer than 28 days on synthetic turf. 

In [None]:
# Derive two 0/1 severity flags for every injury:
#   light_inj = 1 when dm_m1 + dm_m7 is positive,
#   heavy_inj = 1 when dm_m28 + dm_m42 is positive.
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24, so the
# builtin is used directly; the resulting dtype is unchanged.
inj_detailed['light_inj'] = (inj_detailed['dm_m1'] + inj_detailed['dm_m7']).gt(0).astype(int)
inj_detailed['heavy_inj'] = (inj_detailed['dm_m28'] + inj_detailed['dm_m42']).gt(0).astype(int)

pd.set_option('display.float_format', '{:.2f}'.format)

# Percentage of each surface's injuries that are flagged heavy, rounded to one decimal place.
inj_heavy = inj_detailed.groupby('surface')['heavy_inj'].sum()
inj_surface = inj_detailed.groupby('surface')['heavy_inj'].count()
inj_heavy_surface_p = round(inj_heavy / inj_surface * 100, 1)

# Draw the bars first, then annotate them on the very same axes, so the labels never
# depend on which axes pyplot happens to consider current.
severity_ax = inj_heavy_surface_p.plot(kind='bar', stacked=True)
for bar_position, heavy_pct in enumerate(inj_heavy_surface_p):
    severity_ax.text(bar_position, heavy_pct, str(heavy_pct))
severity_ax.set_title('Percentage of Severe injury on each different field type')
plt.show()

In [None]:
# Number of heavy injuries (heavy_inj flag) per roster position, largest first.
heavy_by_position = inj_detailed.groupby('rosterposition')['heavy_inj'].sum().sort_values(ascending=False)
heavy_ax = heavy_by_position.plot(kind='bar', figsize=(10,4))
heavy_ax.set_title('Severe injuries by play position')

Severe injuries are more apparent in Linebackers and Wide Receivers.

**Data Preprocessing 2**

In [None]:
# Per-game totals: number of play-list rows and number of injury rows for every game.
total_player = player_df.groupby(['gameid'])['playkey'].count()
injured_player = injury_df.groupby(['gameid'])['playkey'].count()

test_frame = total_player.to_frame('tot_cnt_p_game').reset_index()
injury_player_frame = injured_player.to_frame('injury_cnt_p_game').reset_index()

# Keep every game from the play list; games without any injury row get a count of 0.
t_i_rate = test_frame.merge(injury_player_frame, on='gameid', how='left').fillna(0)

# Data clean-up: collapse the free-text category values into a few normalised labels.
# Stadium type: maps each raw `stadiumtype` string to one of three labels:
#   'outdoor_nc', 'indoor_wc' or 'indoor_nc'.
# The misspelled keys ('Oudoor', 'Ourdoor', 'Outddors', 'Outdor') are presumably the exact
# spellings found in the raw data -- do not "correct" them without checking the data.
# Values that are not listed here become NaN when the map is applied with Series.map.
st_map={'Outdoor':'outdoor_nc', 'Indoors':'indoor_wc', 'Oudoor':'outdoor_nc', 'Outdoors':'outdoor_nc', 'Open':'indoor_nc', 'Closed Dome':'indoor_wc',
       'Domed, closed':'indoor_wc', 'Dome':'indoor_wc', 'Indoor':'indoor_wc', 'Domed':'indoor_wc',
       'Retr. Roof-Closed':'indoor_wc', 'Outdoor Retr Roof-Open':'indoor_nc', 'Retractable Roof':'indoor_nc',
       'Ourdoor':'outdoor_nc', 'Indoor, Roof Closed':'indoor_wc', 'Retr. Roof - Closed':'indoor_wc', 'Bowl':'outdoor_nc',
       'Outddors':'outdoor_nc', 'Retr. Roof-Open':'indoor_nc', 'Dome, closed':'indoor_wc', 'Indoor, Open Roof':'indoor_nc',
       'Domed, Open':'indoor_nc', 'Domed, open':'indoor_nc', 'Heinz Field':'outdoor_nc', 'Cloudy':'outdoor_nc',
       'Retr. Roof - Open':'indoor_nc', 'Retr. Roof Closed':'indoor_wc', 'Outdor':'outdoor_nc', 'Outside':'outdoor_nc'}


# Weather: maps each raw `weather` string to one of the normalised labels
#   'clear', 'cloudy', 'sunny', 'rain', 'snow', 'fog' or 'cold'.
# Indoor / controlled-climate descriptions are mapped to 'clear', and "chance of rain"
# forecasts are mapped to 'cloudy' unless the text also reports rain.
# The misspelled keys ('Party Cloudy', 'Partly Clouidy', 'Mostly Coudy', 'Coudy') are presumably
# the exact spellings found in the raw data -- do not "correct" them without checking the data.
# NOTE(review): 'Sunny and clear' maps to 'clear' while 'Clear and Sunny' / 'Clear and sunny'
# map to 'sunny' -- confirm this asymmetry is intended.
# Values that are not listed here become NaN when the map is applied with Series.map.
weather_map={'Clear and warm':'clear', 'Mostly Cloudy':'cloudy', 'Sunny':'sunny', 'Clear':'clear', 'Cloudy':'cloudy',
       'Cloudy, fog started developing in 2nd quarter':'fog', 'Rain':'rain',
       'Partly Cloudy':'cloudy', 'Mostly cloudy':'cloudy', 'Cloudy and cold':'cloudy',
       'Cloudy and Cool':'cloudy', 'Rain Chance 40%':'cloudy', 'Controlled Climate':'clear',
       'Sunny and warm':'sunny', 'Partly cloudy':'cloudy', 'Clear and Cool':'clear',
       'Clear and cold':'clear', 'Sunny and cold':'sunny', 'Indoor':'clear', 'Partly Sunny':'sunny',
       'N/A (Indoors)':'clear', 'Mostly Sunny':'sunny', 'Indoors':'clear', 'Clear Skies':'clear',
       'Partly sunny':'sunny', 'Showers':'rain', 'N/A Indoor':'clear', 'Sunny and clear':'clear', 'Snow':'snow',
       'Scattered Showers':'rain', 'Party Cloudy':'cloudy', 'Clear skies':'clear',
       'Rain likely, temps in low 40s.':'rain', 'Hazy':'cloudy', 'Partly Clouidy':'cloudy',
       'Sunny Skies':'sunny', 'Overcast':'cloudy', 'Cloudy, 50% change of rain':'rain', 'Fair':'cloudy',
       'Light Rain':'rain', 'Partly clear':'clear', 'Mostly Coudy':'cloudy', '10% Chance of Rain':'cloudy',
       'Cloudy, chance of rain':'cloudy', 'Heat Index 95':'sunny',
       'Sunny, highs to upper 80s':'sunny', 'Sun & clouds':'cloudy',
       'Heavy lake effect snow':'snow', 'Mostly sunny':'sunny', 'Cloudy, Rain':'rain',
       'Sunny, Windy':'sunny', 'Mostly Sunny Skies':'sunny', 'Rainy':'rain',
       '30% Chance of Rain':'cloudy', 'Cloudy, light snow accumulating 1-3"':'snow',
       'cloudy':'cloudy', 'Clear and Sunny':'sunny', 'Coudy':'cloudy', 'Clear and sunny':'sunny',
       'Clear to Partly Cloudy':'cloudy',
       'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.':'rain',
       'Rain shower':'rain', 'Cold':'cold'}


# Work on a copy so the original play list stays untouched; replace the raw weather and
# stadium-type strings with their normalised labels (unmapped values become NaN).
player_df2 = player_df.assign(
    weather=player_df['weather'].map(weather_map),
    stadiumtype=player_df['stadiumtype'].map(st_map),
)

# Keep one play-list row per game, then replace the temperature placeholder -999 with a
# fixed value that depends on the stadium type.
player_df3 = player_df2.drop_duplicates('gameid')
player_df_nodup = player_df3.copy()

temperature_missing = player_df_nodup['temperature'] == -999

# Mean recorded temperature per stadium type, ignoring the placeholder rows.
# Only the temperature column is averaged: a frame-wide .mean() over the text columns raises
# a TypeError on pandas >= 2.0, and its result was never used by the original cell.
# NOTE(review): the fill values below are presumably rounded from these means -- confirm.
mean_temp_by_stadium = player_df_nodup.loc[~temperature_missing].groupby('stadiumtype')['temperature'].mean()

# Index labels of the rows with a missing temperature, per stadium type.
temp_indoornc_index = player_df_nodup[temperature_missing & (player_df_nodup['stadiumtype']=='indoor_nc')].index
temp_indoorwc_index = player_df_nodup[temperature_missing & (player_df_nodup['stadiumtype']=='indoor_wc')].index
temp_outdoornc_index = player_df_nodup[temperature_missing & (player_df_nodup['stadiumtype']=='outdoor_nc')].index

# Hard-coded fill values, unchanged from the original analysis.
player_df_nodup.loc[temp_indoornc_index, 'temperature'] = 72
player_df_nodup.loc[temp_indoorwc_index, 'temperature'] = 67
player_df_nodup.loc[temp_outdoornc_index, 'temperature'] = 59

# Bucket the temperature into ten-unit intervals (0, 10], (10, 20], ..., (90, 100];
# values outside (0, 100] get no category (NaN).
temp_bins = list(range(0, 101, 10))
player_df_nodup['tempe_cat'] = pd.cut(player_df_nodup['temperature'], temp_bins)

# Per-game table: one play-list row per game plus that game's play and injury counts.
play_inj_df = player_df_nodup.merge(t_i_rate, on='gameid', how='left')

# Add the injured body part and the heavy-injury flag of every injury recorded in the game;
# games without an injury keep NaN in those two columns.
inj_pick = inj_detailed[['gameid','bodypart','heavy_inj']]
tot_df = play_inj_df.merge(inj_pick, on='gameid', how='left')


Injury percentage per game on each different stadium

In [None]:
# Injury percentage per stadium type: summed per-game injury counts divided by the number
# of rows of `tot_df` with that stadium type.
rows_per_stadium = tot_df.groupby('stadiumtype')['playerkey'].count().to_frame('tot_game').reset_index()
injuries_per_stadium = tot_df.groupby('stadiumtype')['injury_cnt_p_game'].sum().to_frame('inj_cnt').reset_index()

stadium_inj_pct = rows_per_stadium.merge(injuries_per_stadium, on='stadiumtype')
stadium_inj_pct['pct'] = round(stadium_inj_pct['inj_cnt'] / stadium_inj_pct['tot_game'] * 100, 2)

stadium_inj_pct.plot('stadiumtype', 'pct', kind="bar", figsize=(10,4), title='Injury percentage')
plt.legend([])
# Write the percentage at the top of each bar.
for bar_position, pct_value in enumerate(stadium_inj_pct['pct']):
    plt.text(bar_position, pct_value, str(pct_value))

plt.show()

**Definition of stadium type :** 
     - indoor_nc  : Indoor stadium without weather control(open)
     - indoor_wc  : Indoor stadium with weather control(closed)
     - outdoor_nc : Outdoor stadium, no weather control

The above results show that the percentage of injuries occurring in indoor stadiums is higher than that of outdoor stadiums, even though there is about 1% difference. 

In [None]:
# Number of `tot_df` rows per stadium type, with the count written at the top of each bar.
stadium_row_counts = tot_df.groupby(['stadiumtype'])['playerkey'].count()
stadium_row_counts.plot(kind='bar', figsize=(10,5))
for bar_position, row_count in enumerate(stadium_row_counts):
    plt.text(bar_position, row_count, str(row_count))
plt.show()

As shown above, the number of games held in outdoor_nc stadiums was about 3 times the number held in indoor_wc stadiums, and about 10 times the number held in indoor_nc stadiums.

In [None]:
# Injury frequency and percentage by stadium type and body part.
# Injuries per (stadium type, body part); rows without a body part are not counted.
st_bd_cnt = tot_df.groupby(['stadiumtype','bodypart'])['playerkey'].count()
# Number of `tot_df` rows per stadium type (the denominator of the percentage).
st_cnt = tot_df.groupby(['stadiumtype'])['playerkey'].count()

st_bd_cnt_frame = st_bd_cnt.to_frame('bd_cnt').reset_index()
# Percentage = body-part count / row count of the same stadium type.
# The denominator is looked up row by row, so the result no longer depends on exactly three
# hard-coded stadium labels or on two separately built frames sharing the same row order.
st_bd_cnt_frame['pct'] = st_bd_cnt_frame['bd_cnt'] / st_bd_cnt_frame['stadiumtype'].map(st_cnt) * 100

# One row per body part, one column per stadium type.
a = st_bd_cnt_frame.pivot(index='bodypart', columns='stadiumtype', values='pct')
b = st_bd_cnt_frame.pivot(index='bodypart', columns='stadiumtype', values='bd_cnt')

count_ax = b.plot(kind='bar', figsize=(10,4))
count_ax.set_ylabel('Frequency/game')
pct_ax = a.plot(kind='bar', figsize=(10,4))
pct_ax.set_ylabel('%/game')

From the results above, the largest numbers of ankle and knee injuries occurred in outdoor_nc stadiums, but the percentage of injuries per game was highest in indoor_nc stadiums, at more than 1%. The injury rate in indoor stadiums is more than 5% and is higher than in outdoor stadiums.

In [None]:
# Injury frequency and percentage by field type and body part.
# Injuries per (field type, body part); rows without a body part are not counted.
player_play_fieldtype_freq = tot_df.groupby(['fieldtype','bodypart'])['playerday'].count()
pp_fieldtype_freq = player_play_fieldtype_freq.to_frame('play_cnt').reset_index()
# Number of `tot_df` rows per field type (the denominator of the percentage).
ft_cnt = tot_df.groupby(['fieldtype'])['playerkey'].count()

# Percentage = body-part count / row count of the same field type.
# The denominator is looked up row by row instead of being computed separately for the
# hard-coded 'Natural' and 'Synthetic' labels and re-attached by row position.
pp_fieldtype_freq['pct'] = pp_fieldtype_freq['play_cnt'] / pp_fieldtype_freq['fieldtype'].map(ft_cnt) * 100

# One row per body part, one column per field type.
a = pp_fieldtype_freq.pivot(index='bodypart', columns='fieldtype', values='pct')
b = pp_fieldtype_freq.pivot(index='bodypart', columns='fieldtype', values='play_cnt')

fig, axes = plt.subplots(3, figsize=(15,20))
b.plot(kind='bar', title='Number of Injury', ax=axes[0])
axes[0].set_ylabel('Frequency')
a.plot(kind='bar', title='Percentage of Injury', ax=axes[1])
axes[1].set_ylabel('Percentage(%)')

# Pre-computed table read from a separate Kaggle dataset; only its `fieldtype` and
# `inj_pct` columns are used here.
pp_fieldtype_inj_pct = pd.read_csv('/kaggle/input/pp-fieldtype-inj-freq/pp_fieldtype_inj_freq.csv')

pp_fieldtype_inj_pct.groupby('fieldtype')['inj_pct'].mean().plot(kind='bar', ax=axes[2])
# Title the third panel explicitly instead of relying on pyplot's "current axes".
axes[2].set_title('Injury percentage on each field type')
axes[2].set_ylabel('Percentage(%)')

Based on the results above, on synthetic turf the ankle injury rate is twice that of natural turf, and the toe injury rate is more than 8 times higher. Finally, the last graph shows the rate of injury per player: athletes have an injury rate of about 5 percent on synthetic turf and about 3 percent on natural turf.

**Weather/Temperature**

In [None]:
# Injury count and injury percentage for every (weather, field type) combination.
by_weather_field = tot_df.groupby(['weather','fieldtype'])

we_fi_freq = by_weather_field['injury_cnt_p_game'].sum().to_frame('inj_cnt').reset_index()
we_cnt = by_weather_field['playerkey'].count()
# Both aggregates come from the same grouping, so they share the same row order.
we_fi_freq['game_cnt'] = we_cnt.reset_index(drop=True)
we_fi_freq['pct'] = we_fi_freq['inj_cnt'] / we_fi_freq['game_cnt'] * 100

# One row per weather label, one column per field type.
a = we_fi_freq.pivot(index='weather', columns='fieldtype', values='pct')
b = we_fi_freq.pivot(index='weather', columns='fieldtype', values='inj_cnt')

fig, axes = plt.subplots(2, figsize=(10,13))
count_ax, pct_ax = axes
b.plot(kind='bar', ax=count_ax)
count_ax.set(ylabel='Frequency/game', title='Number of injury / game')

a.plot(kind='bar', ax=pct_ax)
pct_ax.set(ylabel='%/game', title='Percentage of injury /game')

The above results show the rate of injury per game for each weather condition. From the results, the rate of injuries on cold days is the highest at about 33 %, with injuries occurring in 3 out of 9 games held. It is important to note that all of this occurred on natural turf.

In [None]:
# Injury count and injury percentage for every (temperature bucket, field type) combination.
by_temp_field = tot_df.groupby(['tempe_cat','fieldtype'])

tp_fi_freq = by_temp_field['injury_cnt_p_game'].sum().to_frame('inj_cnt').reset_index()
tp_cnt = by_temp_field['playerkey'].count()
# Both aggregates come from the same grouping, so they share the same row order.
tp_fi_freq['game_cnt'] = tp_cnt.reset_index(drop=True)
tp_fi_freq['pct'] = tp_fi_freq['inj_cnt'] / tp_fi_freq['game_cnt'] * 100

# One row per temperature bucket, one column per field type.
a = tp_fi_freq.pivot(index='tempe_cat', columns='fieldtype', values='pct')
b = tp_fi_freq.pivot(index='tempe_cat', columns='fieldtype', values='inj_cnt')

fig, axes = plt.subplots(2, figsize=(10,13))
count_ax, pct_ax = axes
b.plot(kind='bar', ax=count_ax)
count_ax.set(ylabel='Frequency/game', title='Number of injury / game')

a.plot(kind='bar', ax=pct_ax)
pct_ax.set(ylabel='%/game', title='Percentage of injury /game')


Looking first at the environment of synthetic turf, the most injuries occurred at temperatures between 60 and 70 degrees, but more than 4% were injured at temperatures between 70 and 80 degrees. In particular, it is surprising that about 9% of injuries per game occurred at temperatures higher than 60 degrees. Natural turf showed the highest incidence at low temperatures of 30-40 degrees and at high temperatures of 80-90 degrees, and the lowest in the 40-50 degrees range.
Overall, 40-60 degrees showed low injury rates per game regardless of the field type.
The point to note here is between 60 and 80 degrees. In this temperature range, the incidence of injuries per game in synthetic turf was more than double that of natural turf.

Speed matter?



In [None]:
import pandas as pd
# Load two pre-computed extracts that are published as separate Kaggle datasets.
# NOTE(review): judging by their use below, both hold tracking rows with a speed column `s`;
# how they were produced is not shown in this notebook.
pt_inj_expt, inj_detailed_df2 = (
    pd.read_csv(extract_path)
    for extract_path in (
        "/kaggle/input/pt-inj-expt/pt_inj_expt.csv",
        '/kaggle/input/inj-detailed-df2/inj_detailed_df2.csv',
    )
)

In [None]:
# Mean speed `s` per player in each of the two extracts.
a = inj_detailed_df2.groupby(['playerkey'])['s'].mean()
a_frame = a.to_frame('inj_avg_s').reset_index()
b = pt_inj_expt.groupby('playerkey')['s'].mean()
b_frame = b.to_frame('norm_avg_s').reset_index()

# c keeps only the players present in the injury extract; d keeps every player of the other extract.
c = a_frame.merge(b_frame, on='playerkey', how='left')  # injured
d = b_frame.merge(a_frame, on='playerkey', how='left')  # total

In [None]:
# Notched box plots of the per-player mean speed: injury extract vs. the other extract.
# Top panel: players of the injury extract only (c); bottom panel: all players (d).
speed_columns = ['inj_avg_s','norm_avg_s']
fig, axes = plt.subplots(2, figsize=(15,12))
top_ax, bottom_ax = axes

c.boxplot(column=speed_columns, notch=True, vert=True, ax=top_ax)
top_ax.set_title('Average speed of athletes when Injured vs. Not injured ')
top_ax.set_ylabel('speed')

d.boxplot(column=speed_columns, notch=True, vert=True, patch_artist=True, ax=bottom_ax)
bottom_ax.set_title('Average speed Injured vs. Not injured')
bottom_ax.set_ylabel('speed')
plt.show()

The first plot above shows, for the injured players, their average speed in the game where they were injured and their average speed in normal games. As the results show, the average speed in the injury game was about 0.7 y/s slower. Their speed in games without an injury is about the same as that of the other players.

In [None]:
# Mean speed per field type: injury extract vs. the other extract.
# Distinct (playkey, fieldtype) pairs from the play list, used to tag the tracking rows.
ff = (
    player_df.groupby(['playkey','fieldtype'])['gameid']
    .count()
    .reset_index()
    .drop('gameid', axis=1)
)
pt_inj_expt_df2 = pt_inj_expt.merge(ff, on='playkey', how='left')

f = pt_inj_expt_df2.groupby('fieldtype')['s'].mean()
f_frame = f.to_frame('norm_avg_s').reset_index()
g = inj_detailed_df2.groupby(['fieldtype'])['s'].mean()
g_frame = g.to_frame('inj_avg_s').reset_index()

# Side-by-side bars per field type: injury extract first, the other extract second.
f_field = g_frame.merge(f_frame, on='fieldtype', how='left')
speed_ax = f_field.plot(x='fieldtype', kind='bar', title='Average speed by fieldtype', figsize=(15,7))
speed_ax.set_ylabel('speed')
speed_ax.legend(['Injury','Normal'])

The plot above shows the average speed of the injured and the normal athletes for each field type. As shown, the average speed of the injured players was about 0.25 to 0.5 y/s slower, regardless of field type. The average speed of players on synthetic turf is slightly higher, but the speed gap between injured and uninjured players is larger.

In [None]:
import scipy.stats as stats
# Compare the distribution of speed `s` on plays with a recorded injury against a random
# sample of tracking rows from all other plays.
inj_playkey_list = injury_df['playkey'].tolist()
fig = plt.figure(figsize=(7,9))

# Plays with a recorded injury: step histogram plus a Gaussian KDE evaluated at the bin edges.
a = player_track_info_df.query('playkey in @inj_playkey_list')['s']
sp_a = stats.gaussian_kde(a)
n, x, _ = plt.hist(a, bins=10, histtype=u'step', density=True, color='blue', label='Injured')
plt.title('distribution of player speed injured vs. not injured')
# Plot the KDE against the bin edges `x`; without an explicit x the curve is drawn against
# the array positions 0..10 and does not line up with the histogram.
plt.plot(x, sp_a(x), color='blue', linestyle='--', label='Injured (KDE)')

# All other plays: at most 1,000,000 sampled rows; the fixed seed makes the figure reproducible.
other_speed = player_track_info_df.query('playkey not in @inj_playkey_list')['s']
b = other_speed.sample(min(1000000, len(other_speed)), random_state=0)
sp_b = stats.gaussian_kde(b)
n, x1, _ = plt.hist(b, bins=10, histtype=u'step', density=True, color='orange', label='Not injured')
plt.plot(x1, sp_b(x1), color='orange', linestyle='--', label='Not injured (KDE)')

# Every artist carries its own label, so the legend cannot attach a name to the wrong curve.
plt.legend()
plt.show()


The results above compare Gaussian kernel density estimates of the speed of the injured and the uninjured athletes. The distributions differ in their peak value, but the overall shape of the speed distribution is not very different.

In [None]:
import scipy.stats as stats
#inj_playkey_list=injury_df['playkey'].tolist()
#fig, axes = plt.subplots(1,2)
# Compare the distribution of `dis` on plays with a recorded injury against a random
# sample of tracking rows from all other plays (uses `inj_playkey_list` from the speed cell).
fig = plt.figure(figsize=(7,9))

# Plays with a recorded injury: step histogram plus a Gaussian KDE evaluated at the bin edges.
a = player_track_info_df.query('playkey in @inj_playkey_list')['dis']
sp_a = stats.gaussian_kde(a)
n, x, _ = plt.hist(a, bins=10, histtype=u'step', density=True, color='blue', label='Injured')
plt.title('distribution of player distance injured vs. not injured')
# Plot the KDE against the bin edges `x`; without an explicit x the curve is drawn against
# the array positions 0..10 and does not line up with the histogram.
plt.plot(x, sp_a(x), color='blue', linestyle='--', label='Injured (KDE)')

# All other plays: at most 1,000,000 sampled rows; the fixed seed makes the figure reproducible.
other_dis = player_track_info_df.query('playkey not in @inj_playkey_list')['dis']
b = other_dis.sample(min(1000000, len(other_dis)), random_state=0)
sp_b = stats.gaussian_kde(b)
n, x1, _ = plt.hist(b, bins=10, histtype=u'step', density=True, color='orange', label='Not injured')
plt.plot(x1, sp_b(x1), color='orange', linestyle='--', label='Not injured (KDE)')

# Every artist carries its own label, so the legend cannot attach a name to the wrong curve.
plt.legend()
plt.show()

The above results are based on a Gaussian distribution of distance between injured and uninjured athletes. The distribution appears to be more diverse in the case of the injured. Diversity can increase the probability of injury. However, as an athlete, diversity does not seem to be a factor that needs to be removed to avoid injury. The injury sample size is also too small, so there may be other results when more data is available.


In [None]:
# Creating master table considering relationship
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

# Build an all-numeric table from the per-game frame and plot the correlations of a
# hand-picked set of its columns with the per-game injury count.
identifier_and_outcome_cols = ['playerkey', 'gameid', 'playkey', 'playergameplay',
                               'position', 'positiongroup', 'bodypart',
                               'heavy_inj']
tot_df_test = tot_df.drop(columns=identifier_and_outcome_cols)

tot_df_test['injury_cnt_p_game'] = tot_df_test['injury_cnt_p_game'].astype(np.int64)

# One-hot encode every remaining categorical column.
tot_df_test_num = pd.get_dummies(tot_df_test)

# Columns shown in the heat map (all must exist after the one-hot encoding).
test = [
    'fieldtype_Natural', 'fieldtype_Synthetic',
    'playtype_Kickoff', 'playtype_Punt', 'playtype_Pass', 'playtype_Rush',
    'rosterposition_Cornerback', 'rosterposition_Linebacker',
    'stadiumtype_indoor_wc', 'stadiumtype_indoor_nc', 'stadiumtype_outdoor_nc',
    'weather_clear', 'weather_cold',
    'tempe_cat_(30, 40]', 'tempe_cat_(60, 70]', 'tempe_cat_(70, 80]', 'tempe_cat_(80, 90]',
    'injury_cnt_p_game',
]
corr = tot_df_test_num[test].corr()

fig = plt.figure(figsize=(14,12))
heatmap_palette = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(corr, annot=True, cmap=heatmap_palette)
plt.title('Correlation Heatmap')
plt.show()


The correlation map above shows that the two field types have opposite relationships with injury. Natural turf seems to help prevent injury, and kicking-style plays seem to increase injury. Cold weather and indoor stadium types seem to have a bad effect on injury. As for temperature, the 70-80 range seems to have the most adverse effect on injuries.

**FEATURE SELECTION BY ML(Garbage In Garbage Out)**

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split
import statsmodels.api as sm




# Feature matrix: every column of the numeric table except the target; target: injuries per game.
X = tot_df_test_num[tot_df_test_num.columns.difference(['injury_cnt_p_game'])]
y = tot_df_test_num['injury_cnt_p_game']

# 1. Feature selection with an L1-penalised logistic regression.
# The solver is set explicitly: scikit-learn's default solver ('lbfgs' since version 0.22)
# does not support the L1 penalty and raises a ValueError, whereas 'liblinear' does support it.
logistic = LogisticRegression(C=0.2, penalty="l1", solver="liblinear", random_state=0).fit(X, y)
model = SelectFromModel(logistic, prefit=True)
X_new = model.transform(X)

# Map the reduced matrix back to the original column layout: columns that were dropped come
# back as all zeros, so the kept features are the columns with non-zero variance.
selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                 index=X.index, columns=X.columns)
selected_columns = selected_features.columns[selected_features.var() != 0]
selected_columns = selected_columns.insert(0, 'injury_cnt_p_game')
# 'playerday' is left out of the heat map even when it was selected.
selected_columns = selected_columns[selected_columns != 'playerday']

cor = tot_df_test_num[selected_columns].corr()  # correlation map

fig = plt.figure(figsize=(10,8))
sns.heatmap(cor, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))
plt.title('Correlation Heatmap by Logistic')
plt.show()

The correlation map above is built from the features that a logistic regression selected as having the greatest effect on injury. Let's look at the features that differ from those I found in our exploratory data analysis (EDA). Among the features selected by the logistic regression algorithm, playergame, the defensive/offensive lineman positions, and cloudy/sunny weather were newly identified as influential features.

In [None]:
# 2. Univariate selection: keep the 10 features scored highest by f_classif.
# X and y are the predictor frame and target built in the previous cell.
model = SelectKBest(f_classif, k=10)
X_new = model.fit_transform(X, y)

# Expand the reduced matrix back to the full column layout and keep the
# columns whose variance is non-zero.
selected_features = pd.DataFrame(
    model.inverse_transform(X_new),
    index=X.index,
    columns=X.columns,
)
has_variance = selected_features.var() != 0
selected_columns = selected_features.columns[has_variance].insert(0, 'injury_cnt_p_game')

# Remove the first 'playerday' entry, if there is one.
playerday_pos = next(
    (pos for pos, name in enumerate(selected_columns) if name == 'playerday'),
    None,
)
if playerday_pos is not None:
    selected_columns = selected_columns.delete(playerday_pos)

# Correlation map of the target and the selected features.
cor = tot_df_test_num.loc[:, selected_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
heatmap_palette = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(cor, annot=True, cmap=heatmap_palette, ax=ax)
ax.set_title('Correlation Heatmap by SelectKBest')
plt.show()

The result above is a correlation heatmap of the features affecting injury that were found by another algorithm (SelectKBest). The cornerback position and the weather features differ from those obtained by the logistic regression algorithm. However, the important features found by this algorithm are more similar to those identified in the EDA.

In [None]:
# linear regression
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import  train_test_split
import statsmodels.api as sm


# Predictors are every encoded column except the target; Index.difference
# returns them in sorted order. The target is the per-game injury count.
X = tot_df_test_num[tot_df_test_num.columns.difference(['injury_cnt_p_game'])]
y = tot_df_test_num['injury_cnt_p_game']

# 4. Recursive feature elimination driven by a linear regression.
cols = list(X.columns)
model = LinearRegression()

# Initializing RFE model.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 accepts only
# the estimator positionally, so RFE(model, 10) raises a TypeError there.
rfe = RFE(model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)
# Fit `model` on the reduced feature matrix.
model.fit(X_rfe, y)

# rfe.support_ is a boolean mask over the columns of X; index it by column
# name and keep the names flagged True.
temp = pd.Series(rfe.support_, index=cols)
selected_features_rfe = temp[temp].index
print(selected_features_rfe)

# Correlation map of the target (placed first) and the selected features.
selected_columns = selected_features_rfe.insert(0, 'injury_cnt_p_game')
cor = tot_df_test_num[selected_columns].corr()

fig = plt.figure(figsize=(10, 8))
sns.heatmap(cor, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))
plt.title('Correlation Heatmap by Linear Regression')
plt.show()



The result above shows the 10 important features chosen through linear regression. One difference from the other algorithms is the importance given to the categorized temperature features. In addition, compared to the other algorithms, more of the chosen factors appear to prevent injury than to cause it.

In [None]:
# Lasso
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV
from sklearn.model_selection import  train_test_split
import statsmodels.api as sm
import matplotlib


# Predictors are every encoded column except the target (in the sorted order
# returned by Index.difference); the target is the per-game injury count.
predictor_names = tot_df_test_num.columns.difference(['injury_cnt_p_game'])
X = tot_df_test_num[predictor_names]
y = tot_df_test_num['injury_cnt_p_game']

# Lasso regression with the default LassoCV settings.
reg = LassoCV()
reg.fit(X, y)

# Label each coefficient with its feature name and order them ascending so
# the bar chart runs from the most negative to the most positive.
coef = pd.Series(data=reg.coef_, index=X.columns)
imp_coef = coef.sort_values()

# Note: this changes the notebook-wide default figure size, not just this plot.
matplotlib.rcParams.update({'figure.figsize': (8.0, 10.0)})
imp_coef.plot.barh()
plt.title('feature importance using Lasso Model')


Lasso regression automates certain parts of model selection, such as feature selection or parameter elimination, and is a good algorithm for simplifying a model. According to the results above, the algorithm chose six important features for identifying a player's injury, including player plays per game, number of games, field type, rush plays, and temperature.
It's not surprising that a player's rush plays are a factor in the occurrence of injury. However, it is hard to understand why higher temperatures and clearer weather would be a cause of injury.
Perhaps, when the temperature is high or the weather is very clear, poor preparation or a lack of alertness to injury is a factor.
It is even more difficult to understand why the more games an athlete plays, or the more time they play, the lower the risk of injury. Perhaps these features are not causes of injury risk but results of it.
It is not surprising that natural grass helps prevent injury.