In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import itertools

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
        
# image utils
from PIL import Image

# import plotting
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

# import machine learning
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler, SMOTE
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score

## Load Data

Load the data as pandas dataframes:

In [None]:
# Read the three CSV tables of the playing-surface dataset into DataFrames.
# Paths are relative to the notebook's working directory.
play_df = pd.read_csv('../input/nfl-playing-surface-analytics/PlayList.csv')
player_df = pd.read_csv('../input/nfl-playing-surface-analytics/PlayerTrackData.csv')
injury_df = pd.read_csv('../input/nfl-playing-surface-analytics/InjuryRecord.csv')

In [None]:
# (rows, columns) of the PlayList table.
play_df.shape

In [None]:
# (rows, columns) of the InjuryRecord table.
injury_df.shape

In [None]:
# (rows, columns) of the PlayerTrackData table.
player_df.shape

In [None]:
# Preview the first rows of the PlayList table.
play_df.head()

In [None]:
# Preview the first rows of the InjuryRecord table.
injury_df.head()

In [None]:
# Preview the first rows of the PlayerTrackData table.
player_df.head()

In [None]:
# Number of tracking rows recorded for each PlayKey.
player_df.PlayKey.value_counts()

In [None]:
def relable(injury_df):
    """Map one injury record's DM_M* flags to an injury-duration label.

    Parameters
    ----------
    injury_df : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'DM_M1', 'DM_M7', 'DM_M28' and 'DM_M42'.

    Returns
    -------
    str or None
        The label whose flag pattern matches the record exactly, or None
        when the flags match none of the five known patterns.
    """
    # Read all four flags up front, in the same order the patterns use.
    flags = (
        injury_df['DM_M1'],
        injury_df['DM_M7'],
        injury_df['DM_M28'],
        injury_df['DM_M42'],
    )
    labels_by_pattern = (
        ((0, 0, 0, 0), 'No Injury'),
        ((1, 0, 0, 0), 'Near-term Injury'),
        ((1, 1, 0, 0), 'Short-term Injury'),
        ((1, 1, 1, 0), 'Medium-term Injury'),
        ((1, 1, 1, 1), 'Long-term Injury'),
    )
    for pattern, label in labels_by_pattern:
        if flags == pattern:
            return label
    return None


In [None]:
# Label every injury record row by row using relable().
injury_df['Injury Type'] = injury_df.apply(relable, axis=1)

In [None]:
# Check the newly added 'Injury Type' column.
injury_df.head()

In [None]:
def _smallest_safe_float(values, c_min, c_max, rtol=1e-6):
    """Return the narrowest float dtype that stores `values` without visible rounding.

    A candidate dtype is accepted only when the column's range fits and a
    round trip through it reproduces every value within relative tolerance
    `rtol` (NaNs compare equal). Falls back to float64 otherwise.
    """
    exact = values.to_numpy(dtype=np.float64)
    for candidate in (np.float16, np.float32):
        limits = np.finfo(candidate)
        if not (limits.min <= c_min and c_max <= limits.max):
            continue
        # Compare in float64 so the tolerance itself is not rounded.
        round_trip = exact.astype(candidate).astype(np.float64)
        if np.allclose(exact, round_trip, rtol=rtol, atol=0.0, equal_nan=True):
            return candidate
    return np.float64


def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` to the narrowest suitable dtype.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to the smallest float dtype that holds the range AND reproduces
    the values within a small relative tolerance, so float16 is only chosen
    when it does not round the data. Non-numeric columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column holding exactly -128..127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column has NaN min/max and matches nothing; leave it as is.
        else:
            df[col] = df[col].astype(_smallest_safe_float(df[col], c_min, c_max))
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink the three tables in the same order as before: injuries, plays, tracking.
injury_df, play_df, player_df = (
    reduce_mem_usage(frame) for frame in (injury_df, play_df, player_df)
)

In [None]:
# Bar chart of the number of injury records per 'Injury Type' label.
injury_df['Injury Type'].value_counts().plot.bar(rot=30)

In [None]:
# Same counts as a table.
injury_df['Injury Type'].value_counts()

In [None]:
# Bar chart of the number of injury records per body part.
injury_df['BodyPart'].value_counts().plot.bar(rot=0)

In [None]:
# Injury counts by body part (rows) and surface (columns), with 'All' totals.
pd.crosstab(injury_df['BodyPart'],injury_df['Surface'],margins=True)

In [None]:
# Share of each body part within a surface, rounded to 4 decimals and shown as a percentage.
pd.crosstab(injury_df['BodyPart'], injury_df['Surface'], normalize='columns').round(4).mul(100)

In [None]:
# Fraction of injury records on each surface.
injury_df['Surface'].value_counts(normalize=True)

In [None]:
# Body-part distribution within each surface (rows normalised), with margins.
pd.crosstab(injury_df['Surface'],injury_df['BodyPart'],normalize='index',margins=True)

In [None]:
# Grouped bar chart of the body-part distribution within each surface.
pd.crosstab(injury_df['Surface'],injury_df['BodyPart'],normalize='index').plot.bar(rot=0,figsize=(8,5))

In [None]:
# Injury counts by injury type (rows) and surface (columns), with 'All' totals.
pd.crosstab(injury_df['Injury Type'],injury_df['Surface'],margins=True)

In [None]:
# Bar chart of the fraction of records in each injury type.
injury_df['Injury Type'].value_counts(normalize=True).plot.bar(rot=20)

In [None]:
# Grouped bar chart of the injury-type distribution within each surface.
pd.crosstab(injury_df['Surface'],injury_df['Injury Type'],normalize='index').plot.bar(rot=0,figsize=(8,5))

In [None]:
# Injury counts by injury type (rows) and body part (columns), with 'All' totals.
pd.crosstab(injury_df['Injury Type'],injury_df['BodyPart'],margins=True)

In [None]:
# Grouped bar chart of the injury-type distribution within each body part.
pd.crosstab(injury_df['BodyPart'], injury_df['Injury Type'],normalize='index').plot.bar(rot=0,figsize=(8,5))

In [None]:
# Attach play attributes to each injury record; the left join keeps every injury row.
result = injury_df.merge(play_df, how='left', on=['PlayerKey', 'GameID', 'PlayKey'])
result.head()

In [None]:
# (rows, columns) of the merged injury/play table.
result.shape

In [None]:
# Column names of the merged table as a plain list.
result.columns.tolist()

In [None]:
# Grouped bar chart of injury counts per roster position, split by injury type.
pd.crosstab(result['RosterPosition'], result['Injury Type'],).plot.bar(rot=0,figsize=(15,8))

In [None]:
# Injury-type distribution within each roster position (rows normalised), with margins.
pd.crosstab(result['RosterPosition'], result['Injury Type'],margins=True,normalize='index')

In [None]:
# Grouped bar chart of the injury-type distribution within each roster position.
pd.crosstab(result['RosterPosition'], result['Injury Type'],normalize='index').plot.bar(rot=0,figsize=(15,8))

In [None]:
# Grouped bar chart of the injury-type distribution within each body part (merged table).
pd.crosstab(result['BodyPart'], result['Injury Type'],normalize='index').plot.bar(rot=0,figsize=(15,8))

In [None]:
 # Injury-type distribution per body part, restricted to injuries on Natural surfaces.
 (result.loc[result.Surface == 'Natural', ['BodyPart', 'Injury Type']]
        .pipe(lambda rows: pd.crosstab(rows['BodyPart'], rows['Injury Type'], normalize='index'))
        .plot.bar(rot=0, figsize=(15, 8)))

In [None]:
 # Injury-type distribution per body part, restricted to injuries on Synthetic surfaces.
 (result.loc[result.Surface == 'Synthetic', ['BodyPart', 'Injury Type']]
        .pipe(lambda rows: pd.crosstab(rows['BodyPart'], rows['Injury Type'], normalize='index'))
        .plot.bar(rot=0, figsize=(12, 6)))

In [None]:
# Inspect the raw weather descriptions before grouping them.
result.Weather.unique()

In [None]:
# Collapse the free-text weather descriptions into five coarse categories.
# Each key is the category; its list holds the raw spellings assigned to it.
weather_groups = {
    'Cloudy': ['Cloudy', 'Coudy',
               'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
               'Cloudy, 50% change of rain', 'Cloudy and Cool', 'Mostly cloudy', 'Cold'],
    'Partly Cloudy': ['Partly Cloudy', 'Sun & clouds'],
    'Sunny': ['Clear and warm', 'Sunny', 'Clear', 'Clear skies', 'Mostly sunny', 'Mostly Sunny', 'Fair'],
    'Rain': ['Rain', 'Rain shower'],
    'Indoor': ['Indoor', 'Controlled Climate'],
}

# Invert to a raw-spelling -> category lookup.
d = {raw: category for category, spellings in weather_groups.items() for raw in spellings}

# Series.map leaves NaN for any description that is not listed above.
result['Weather'] = result['Weather'].map(d)

In [None]:
 # Grouped bar chart of the injury-type distribution within each weather category.
 pd.crosstab(result['Weather'], result['Injury Type'],normalize='index',).plot.bar(rot=0,figsize=(12,6))

In [None]:
 # Injury-type distribution per weather category, restricted to Natural surfaces.
 (result.loc[result.Surface == 'Natural', ['Weather', 'Injury Type']]
        .pipe(lambda rows: pd.crosstab(rows['Weather'], rows['Injury Type'], normalize='index'))
        .plot.bar(rot=0, figsize=(12, 6)))

In [None]:
 # Injury-type distribution per weather category, restricted to Synthetic surfaces.
 (result.loc[result.Surface == 'Synthetic', ['Weather', 'Injury Type']]
        .pipe(lambda rows: pd.crosstab(rows['Weather'], rows['Injury Type'], normalize='index'))
        .plot.bar(rot=0, figsize=(12, 6)))

In [None]:
# Inspect the raw stadium-type spellings before grouping them.
result['StadiumType'].unique()

In [None]:
# Collapse the stadium-type spellings into two categories.
# Each key is the category; its list holds the raw spellings assigned to it.
stadium_groups = {
    'Indoors': ['Indoors', 'Indoor', 'Dome', 'Retr. Roof-Closed', 'Indoor, Roof Closed',
                'Closed Dome', 'Domed, closed', 'Retr. Roof - Closed'],
    'Outdoor': ['Open', 'Outdoor', 'Outddors', 'Outdoors', 'Oudoor', 'Retr. Roof - Open',
                'Retractable Roof'],
}

# Invert to a raw-spelling -> category lookup.
d = {raw: category for category, spellings in stadium_groups.items() for raw in spellings}

# Series.map leaves NaN for any spelling that is not listed above.
result['StadiumType'] = result['StadiumType'].map(d)

In [None]:
# Grouped bar chart of the injury-type distribution within each stadium category.
pd.crosstab(result['StadiumType'], result['Injury Type'],normalize='index').plot.bar(rot=0,figsize=(12,7))

In [None]:
# Grouped bar chart of the injury-type distribution within each Position value.
pd.crosstab(result['Position'], result['Injury Type'],normalize='index').plot.bar(rot=0,figsize=(15,8))

In [None]:
# Grouped bar chart of the injury-type distribution within each play type.
pd.crosstab(result['PlayType'], result['Injury Type'],normalize='index').plot.bar(rot=0,figsize=(15,8))

In [None]:
# Build the modelling table: remove the key columns, BodyPart, Surface and the raw
# DM_M* flags (the 'Injury Type' label was derived from those flags).
r1 = result.drop(
    ['PlayerKey', 'GameID', 'PlayKey', 'BodyPart', 'Surface', 'DM_M1', 'DM_M7', 'DM_M28', 'DM_M42'],
    axis=1,
)
r1.head()

In [None]:
# `cat_columns` was never assigned in this notebook, so this cell raised NameError
# on a fresh kernel. Define it here as the non-numeric columns of `r1`.
# NOTE(review): this reconstructs what the missing cell presumably computed; confirm
# whether the target column 'Injury Type' should be included.
cat_columns = r1.select_dtypes(exclude='number').columns.tolist()
cat_columns

In [None]:
# Separate the feature columns (X) from the target label (y).
X, y = r1.drop(columns=['Injury Type']), r1['Injury Type']

In [None]:
# Two stratified folds over the injury labels.
skf = StratifiedKFold(n_splits=2)

# Each iteration overwrites the previous split, so the variables below end up
# holding the LAST fold only; that fold is what the later cells train and score on.
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index, :], X.values[test_index, :]
    # skf.split yields positional indices, so select labels by position (.iloc).
    # Plain y[...] is label-based and only matched while the index was 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
# Gradient-boosted tree classifier for the multi-class 'Injury Type' label.
# The objective is the multi-class one: the target has more than two label values,
# so 'binary:logistic' did not describe the problem being solved.
model = xgb.XGBClassifier(max_depth=3,
                      learning_rate=0.1,
                      n_estimators=100,
                      objective='multi:softprob',
                      booster='gbtree',
                      tree_method='auto',
                      n_jobs=50,
                      gamma=0,
                      min_child_weight=1,
                      max_delta_step=0,
                      subsample=1,
                      colsample_bytree=1,
                      colsample_bylevel=1,
                      colsample_bynode=1,
                      reg_alpha=0,
                      reg_lambda=1,
                      scale_pos_weight=1,
                      base_score=0.5,
                      random_state=42)
# Fit on the training fold only. Fitting on the full X/y let the model see the
# held-out rows, so the scores computed on X_test were not a real evaluation.
# It also keeps training and prediction inputs in the same form (ndarray slices).
# NOTE(review): the features/labels appear to still contain text values at this
# point; confirm they are encoded before fitting.
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out fold and score the predictions.
y_pred = model.predict(X_test)

# Same three metrics, computed in the same order: accuracy, confusion matrix, kappa.
accuracy, conf_matrix, cohen_kappa = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, cohen_kappa_score)
)

for template, value in (
    ('Accuracy: {}', accuracy),
    ('Cohen kappa: {}', cohen_kappa),
    ('Confusion Matrix: \n {}', conf_matrix),
):
    print(template.format(value))