In [None]:
from IPython.display import HTML
# Notebook-wide CSS: defines the header / paragraph / flex-layout / list classes
# that the HTML markdown cells further down refer to by class name.
style = """
<style>
    .header1 { font-family:'Arial';font-size:30px; color:Black; font-weight:800;}
    .header2 { 
        font-family:'Arial';
        font-size:18px; 
        color:Black; 
        font-weight:600;
        border-bottom: 1px solid; 
        margin-bottom: 8px;
        margin-top: 8px;
        width: 100%;
        
    }
    .header3 { font-family:'Arial';font-size:16px; color:Black; font-weight:600;}
    .para { font-family:'Arial';font-size:14px; color:Black;}
    .flex-columns {
        display: flex;
        flex-direction: row;
        flex-wrap: wrap;
    }
    .flex-container {
         padding: 20px;
    }
    
    .flex-container-large {
         padding: 20px;
         max-width: 40%;
    }
    
    .flex-container-small {
         padding: 20px;
         max-width: 17.5%;
    }
    
    .list-items {
        margin: 10px;
    }
    
    .list-items li {
        color: #3692CC;
        font-weight: 500;
    }
</style>
"""
# Last expression of the cell: the HTML object is displayed, which injects the stylesheet.
HTML(style)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import numpy as np
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

<div class="header1"> Building The Model </div>
<div class="header2">Goal: Create a model which predicts concussions based on key features</div>
<div class="para">
    This section builds on Part 1, which can be found in my submission. This notebook has been split due to memory restrictions. Some code blocks have been hidden for readability; click the (...) to expand them.
 </div>

<div class="header3">Why build an ML Model?</div>
<div class="para">
    Building a model that predicts accurately from the features researched above would verify that the selected features are indeed good indicators of concussions. If we are able to successfully classify concussions based on a set of lead / lag indicators, then we can attempt to visualise the behaviour of the model and perform a sensitivity analysis to understand which values of each feature make concussions most likely to occur. We can then use these criteria to select video replays which represent a general / typical concussion event.
</div>

<div class="header3">Feature selection</div>
<div class="para">
In the above research we have gone through almost every useful feature provided in the dataset. Some additional research into data on turf type / weather conditions has been left out as these did not prove to be significant factors.

Based on the above feature suitability study the following features have been chosen:
<div class="list-items">
    <li>Yard Line Distance</li>
    <li>Score Difference</li>
    <li>Temperature</li>
    <li>Player Role</li>
    <li>Velocity</li>
    <li>Play Duration</li>
    <li><b>Label:</b><i> Concussed</i></li>
</div>

We will train the model to predict whether a particular player in a given play / down will be concussed.
</div>

In [None]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import GridSearchCV
import itertools
import datetime

def round_time(dt=None, round_to=60):
    """Round a datetime to the nearest multiple of ``round_to`` seconds.

    Parameters
    ----------
    dt : datetime.datetime, optional
        Timestamp to round. Defaults to ``datetime.datetime.now()``.
    round_to : int or float, default 60
        Rounding granularity in seconds (60 rounds to the nearest minute).

    Returns
    -------
    datetime.datetime
        ``dt`` moved to the nearest multiple of ``round_to`` seconds counted
        from midnight of the same day, with microseconds removed. Exact
        half-way values round up.
    """
    # Identity comparison is the correct test for the "argument omitted" sentinel.
    if dt is None:
        dt = datetime.datetime.now()
    # timedelta.seconds ignores whole days, so this is the number of seconds since midnight.
    seconds = (dt - dt.min).seconds
    rounding = (seconds + round_to / 2) // round_to * round_to
    return dt + datetime.timedelta(0, rounding - seconds, -dt.microsecond)

## Helper function from sklearn docs
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          ax=None):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows are true labels, columns are predictions).
    classes : sequence
        Tick labels, one per class, in matrix order.
    normalize : bool, default False
        When True each row is divided by its sum before printing / plotting.
    title : str
        Axes title.
    cmap : matplotlib colormap
        Colormap used for the image.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
        Previously the function drew on a notebook-global ``ax0`` that is not
        guaranteed to exist, and called ``plt.colorbar()`` without a mappable.
    """
    if ax is None:
        ax = plt.gca()

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    image = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    # Attach the colorbar explicitly to the image that was just drawn.
    ax.figure.colorbar(image, ax=ax)
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.figure.tight_layout()
    
def plot_2d_space(X, y, ax_):
    """Scatter the first two columns of ``X`` on ``ax_``, one series per class in ``y``.

    Only the first two distinct values of ``y`` (in sorted order) are drawn,
    since exactly two colour / marker pairs are defined. The legend text is
    fixed to 'No Concussion' / 'Concussion'.
    """
    palette = ['#1F77B4', '#FF7F0E']
    shapes = ['o', 's']
    class_values = np.unique(y)
    for class_value, colour, shape in zip(class_values, palette, shapes):
        ax_.scatter(
            X[y==class_value, 0],
            X[y==class_value, 1],
            c=colour, label=class_value, marker=shape)
    ax_.legend(loc='upper right', labels=['No Concussion', 'Concussion'])

In [None]:
# Load the pre-selected feature table and drop the identifier columns, which are not model inputs.
data_selected_raw = pd.read_csv('../input/nfl-selected-features/data_selected.csv') 
data_selected = data_selected_raw.drop(columns=['Season_Year','GameKey','PlayID','GSISID'])
features = data_selected[['YardLineDist','ScoreDifference','Role','Temperature','Velocity','Play_Duration','Concussed']]

# One hot encode categorical role (one indicator column per role, joined back on the row index)
role_enc = pd.get_dummies(features['Role'])
features_w_enc_role = features.merge(right=role_enc, how='inner', left_index=True, right_index=True)
features_w_enc_role = features_w_enc_role.drop(columns=['Role'])
# Rows with any missing value are discarded (SMOTE and the SVM cannot handle NaN)
features_w_enc_role = features_w_enc_role.dropna()

# SMOTE oversampling of the minority (concussed) class.
# - random_state is fixed so the synthetic samples, and everything trained on them, are reproducible.
# - fit_resample replaces fit_sample, which was deprecated and later removed from imbalanced-learn.
smote = SMOTE(sampling_strategy='minority', random_state=12)
X_sm, y_sm = smote.fit_resample(features_w_enc_role.drop(columns=['Concussed']),features_w_enc_role['Concussed'])

In [None]:
# Two side-by-side panels: the original class balance (left) and the SMOTE-balanced
# data (right), each projected onto its own first two principal components.
plt.figure(figsize=(12,5))
ax0, ax1 = (plt.subplot2grid((1,2), (0,col)) for col in range(2))
ax0.set_title('Imbalanced dataset projection (Fig 1)')
ax1.set_title('SMOTE rebalanced dataset projection (Fig 2)')

# Left panel: imbalanced original data set
y_pca = features_w_enc_role['Concussed']
X_pca = features_w_enc_role.drop(columns=['Concussed'])
pca = PCA(n_components=2)
X_pca_fit = pca.fit_transform(X_pca)
plot_2d_space(X_pca_fit, y_pca, ax0)

# Right panel: data set rebalanced with SMOTE (a fresh PCA is fitted on it)
X_pca = X_sm
y_pca = y_sm
pca = PCA(n_components=2)
X_sm_pca = pca.fit_transform(X_pca)
plot_2d_space(X_sm_pca, y_pca, ax1)

<div class="header2">Class Imbalance and Oversampling</div>
<div class="para">
        There are 6 features used in this model, so it's difficult to visualise the feature space! I've used a technique called principal component analysis, which allows us to view a projection of the data onto 2D space for illustration purposes. In figure 1 it is immediately obvious that there is an enormous imbalance between non concussions (blue dots) and concussions (orange dots). This is a problem, as any machine learning algorithm which tries to predict on this data set can achieve excellent accuracy by simply predicting every example to be a non concussion and be right almost 100% of the time. This is an extremely common issue in many applications, including medical scans and fraud detection; a generally accepted solution is to use an oversampling technique called SMOTE (Synthetic Minority Oversampling Technique). 
        </div>
<div class="header2">Fixing the imbalance</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">What is SMOTE</div>
        <div class="para">
        Synthetic Minority Oversampling Technique is a complicated way of saying: for each concussion example, select the 5 nearest other concussion examples, form a line in the feature space between them and randomly interpolate along the line to create new synthetic concussions. This works on the assumption that interpolating between similar features will yield new examples of the same class.
        </div>
    </div>
    <div class="flex-container">
        <div class="header3">Effect of SMOTE on Dataset</div>
        <div class="para">
        We can visualise what SMOTE has done to our data in figure 2: by synthetically generating more examples we have now balanced the dataset. This will help to produce better generalisation of the decision boundaries found by the machine learning algorithm.
        </div>
    </div>
</div>

<div class="header2">Linear Separability of the Data and Support Vector Machines</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Why Support Vector Machines</div>
        <div class="para">
                In figures 1 and 2 it becomes obvious that there is no linear decision boundary, i.e. no line we could draw in order to perfectly separate the two classes (Non concussions and Concussions). This rules out most linear classifiers except for support vector machines, which can use the kernel trick to project our finite dimensional feature vector into an infinite dimensional space in which the classes can be easily separated.
         </div>
    </div>
    <div class="flex-container">
        <div class="header3">What does this actually mean?</div>
        <div class="para"> In plain English this means the support vector machine will be able to linearly separate the 2 classes by engineering an infinite number of features through a clever mathematical trick called a kernel. These infinite features will be special combinations of our original 6 features and will make our data points linearly separable in this infinite space.
        </div>
    </div>
</div>

<div class="header2">Let's build the SVM model</div>

In [None]:
# Keep 40% of the SMOTE-balanced data for training; the other 60% (X_ct / y_ct) is held out
# and split further into validation and test sets in the next cell.
X_train, X_ct, y_train, y_ct = train_test_split(X_sm, y_sm, test_size=0.6, random_state=12)

In [None]:
# Cross validation set for model selection:
# 25% of the 60% hold-out (15% of all data) is used for validation,
# the remaining 75% (45% of all data) is kept as the final test set.
X_cv, X_test, y_cv, y_test = train_test_split(X_ct, y_ct, test_size=0.75, random_state=12)

In [None]:
# Validation results per candidate C. Each value is wrapped in a one-element list
# because the plotting cell indexes them as f1_scores[C][0] / confusion_matrices[C][0].
f1_scores = {}
confusion_matrices = {}

# This may take some time on your kernel be patient :)
for c_try in [0.001,0.01,0.1,1,10]:
    print("Using: %.4f" % c_try)
    # Only hard class predictions are scored here, so probability calibration
    # (probability=True, which adds an internal cross-validation to every fit)
    # is not requested; it does not influence predict().
    clf_opt = SVC(C=c_try)
    clf_opt.fit(X_train, y_train)
    
    y_predicted = clf_opt.predict(X_cv)

    model_f1_score = f1_score(y_cv, y_predicted)
    conf_matrix = confusion_matrix(y_cv, y_predicted)

    f1_scores[c_try] = [model_f1_score]
    confusion_matrices[c_try] = [conf_matrix]

In [None]:
# Headline validation scores for the two largest C values.
print("F1 Score C=1: %.4f" % f1_scores[1][0])
print("F1 Score C=10: %.4f" % f1_scores[10][0])

# One row of three panels: F1 curve, confusion matrix for C=1, confusion matrix for C=10.
plt.figure(figsize=(15,4))
ax1, ax2, ax3 = (plt.subplot2grid((1,3), (0,col)) for col in range(3))

# Panel 1: F1 score against log10(C)
c_performance = pd.DataFrame(f1_scores).transpose()
c_performance_log = c_performance.copy()
c_performance_log.index = np.log10(c_performance_log.index)

c_performance_log.plot(ax=ax1)
ax1.set_title('F1 score curve for varying C')
ax1.set_xlabel('C Value (Log Base 10)')
ax1.set_ylabel('F1 Score')
ax1.set_xticks([-3,-2,-1,0,1])

cf1 = np.array(confusion_matrices[1][0])
cf10 = np.array(confusion_matrices[10][0])

# Panel 2: row-normalised confusion matrix for C=1 (each row sums to 1)
norm_cm = cf1.astype(float) / cf1.sum(axis=1, keepdims=True)
sns.heatmap(norm_cm, ax=ax2, annot=True, fmt=".3f", cmap='Blues', xticklabels=['Normal', 'Concussion'], yticklabels=['Normal', 'Concussion'])
ax2.set_title('Confusion Matrix (C=1)')

# Panel 3: row-normalised confusion matrix for C=10
norm_cm = cf10 / cf10.sum(axis=1, keepdims=True)
sns.heatmap(norm_cm, ax=ax3, annot=True, fmt=".3f", cmap='Blues', xticklabels=['Normal', 'Concussion'], yticklabels=['Normal', 'Concussion'])
ax3.set_title('Confusion Matrix (C=10)')

<div class="header2">Model Selection Metrics</div>
<div class="para">
        Above we have trained 5 SVM models on a training set of 40% of our data and evaluated them on an additional 15% of our data held out as a validation set.
        We can see from the F1 score graph, which has a log scale for the C value (x = 1 corresponds to 10 ^ 1 = 10), that a C value of 10 leads to the best model score and that the curve is more or less flattening out, thus this is what we will utilise in the final model. We will talk more about the scoring mechanism and train / test set proportions below.
        </div>

In [None]:
# SVC training time grows faster than quadratically with the number of samples, so the final
# model is fitted on the 40% training split only rather than on the whole SMOTE-oversampled set.
# C=10 is used: it gave the best validation F1 score in the model-selection cell above.
# random_state is fixed so the internal cross-validation behind probability=True is reproducible
# and rebuilding this notebook yields the same probability estimates.

clf_init = SVC(probability=True, C=10, random_state=12)
clf_init.fit(X_train, y_train)
y_predicted = clf_init.predict(X_test)

# Final scores on the held-out test set (never used for training or model selection)
model_f1_score = f1_score(y_test, y_predicted)
conf_matrix = confusion_matrix(y_test, y_predicted)

In [None]:
# SVM decision-function value of every test / training sample, wrapped in
# single-column DataFrames for the projection histograms plotted further down.
X_test_distances = pd.DataFrame(clf_init.decision_function(X_test))
X_train_distances = pd.DataFrame(clf_init.decision_function(X_train))

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package (a scikit-learn dependency) and only fall back
# to the vendored copy on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted, probability-enabled SVM with maximum compression.
joblib.dump(clf_init, 'nfl_concussions_model_w_probs_v2.pkl', compress=9)

In [None]:
def _plot_projection_histogram(ax, distances, labels, title):
    """Overlay per-class histograms of SVM decision-function values on ``ax``.

    ``distances`` is a single-column DataFrame of decision-function values and
    ``labels`` the matching 0/1 class labels, in the same row order.
    """
    # Strip any pandas index from the labels so the boolean mask is applied by position.
    # The distances frame has a fresh RangeIndex, while labels coming out of
    # train_test_split may carry a shuffled index that would not align with it.
    labels = np.asarray(labels)
    sns.distplot(distances[labels == 1], color='r', kde=False, ax=ax, norm_hist=True)
    sns.distplot(distances[labels == 0], color='b', kde=False, ax=ax, norm_hist=True)

    ax.legend(['Concussion','No Concussion'])
    ax.set_xlim((-2,2))
    # Dashed lines at +/-1 mark the margins, the line at 0 the decision boundary.
    ax.axvline(x=1, linestyle='dashed', linewidth=0.8, c='black')
    ax.axvline(x=-1, linestyle='dashed', linewidth=0.8, c='black')
    ax.axvline(x=0, linestyle='dashed', linewidth=1, c='black')
    ax.set_title(title)
    ax.set_ylim((0,1))


print("F1 Score: %.4f" % model_f1_score)

plt.figure(figsize=(15,4))

ax1 = plt.subplot2grid((1,3), (0,0))
ax2 = plt.subplot2grid((1,3), (0,1))
ax3 = plt.subplot2grid((1,3), (0,2))

# Panels 1 and 2: class separation on the training and the test data
_plot_projection_histogram(
    ax1, X_train_distances, y_train,
    'Histogram of Projections \n Training Separation From Decision Boundary')
_plot_projection_histogram(
    ax2, X_test_distances, y_test,
    'Histogram of Projections \n Test Data Separation From Decision Boundary')

# Panel 3: row-normalised test-set confusion matrix
norm_cm = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
sns.heatmap(norm_cm, ax=ax3, annot=True, fmt=".3f", cmap='Blues', yticklabels=['Normal', 'Concussion'], xticklabels=['Normal', 'Concussion'])

ax3.set_title('Confusion Matrix')
ax3.set_xlabel('Predicted Label')
ax3.set_ylabel('Real Label')

<div class="header2">Performance Metrics</div>
<div class="para">
        We trained the SVM on 40% of our data and used 45% for testing and held the remaining 15% for cross validation in the model selection stage, using a C parameter of 10 (This was discovered to be the optimum in testing above). The SKlearn SVC model has Big O Performance greater than quadratic, meaning that using a dataset with 180,000 data points would be computationally expensive and thus we only used 40% of the data to train the model. 
        
        In order to evaluate the performance of our SVM we utilise histograms of projections. We interpret the centre vertical line as the decision boundary and the two vertical lines to the left and right as the margin of separation between the two classes (Concussed / Non Concussed), on which the support vectors lie. The confusion matrix quadrants show explicitly the proportions of the data correctly / incorrectly labelled. The F1 Score (Top Left) is a score on a scale of 0 to 1, where one represents both high recall (correctly classified concussions and few concussions classified as non concussions) and high precision (correctly classified concussions and few non concussions classified as concussions).
        </div>
<div class="header2">Details</div>
<div class="flex-columns">
    <div class="flex-container">
        <div class="header3">Histograms of Projections</div>
        <div class="para">
        From the histograms of projections we can see that in both training and testing the red data points (concussions) are all on the far side of the right hand margin; this is desirable as it indicates perfect classification of concussions. We can see, however, that there is some leakage across the decision boundary for non concussions. By looking at the shape of the distributions we can see that performance is consistent across test and training data, indicating that the model is generalising well to the remaining 60% of our dataset.
        </div>
    </div>
    <div class="flex-container">
        <div class="header3">Confusion Matrix & F1 Score</div>
        <div class="para">
        The F1 Score is already very close to one (0.9986); we are unlikely to get much better than this. We can see from the confusion matrix that no misclassifications of concussions occurred, whilst only 0.3% of non concussions were incorrectly classified.
        </div>
    </div>
</div>
<div class="header2">Summary</div>
<div class="para">
       The model appears to be working efficiently and effectively. Next step is a sensitivity analysis to our original features.
</div>

In [None]:
# Helper Function
def platt_generate_graphs(cols,cols_non_binary,resolution=500, show_key=False, disable_plot=False):
    """Sweep each feature over its range and record the model's class probabilities.

    For every column in ``cols`` a synthetic data set is built by repeating the
    reference row ``x_mean`` and overwriting only that column with a range of
    values; ``clf_init.predict_proba`` is then evaluated on it.

    Reads the notebook globals ``x_mean``, ``x_min``, ``x_max`` (single-row
    DataFrames) and ``clf_init`` (fitted classifier exposing ``predict_proba``).

    Parameters
    ----------
    cols : iterable of str
        Feature columns to sweep.
    cols_non_binary : container of str
        Columns treated as continuous: swept over ``resolution`` evenly spaced
        values in ``[x_min, x_max)``. Every other column is treated as a 0/1
        indicator and evaluated at 0 and 1 only.
    resolution : int, default 500
        Number of sweep points per continuous column.
    show_key : bool, default False
        When True, only columns listed in ``cols_non_binary`` are plotted.
    disable_plot : bool, default False
        Skip plotting and only return the probabilities.

    Returns
    -------
    dict
        Column name -> DataFrame indexed by the swept values, one column per
        class probability as returned by ``predict_proba``.
    """
    synthetic_probabilities = {}

    for col in cols:
        if col in cols_non_binary:
            # .iloc[0] reads the single row by position (its index label is 'min' / 'max').
            x_min_col = x_min[col].iloc[0]
            x_max_col = x_max[col].iloc[0]
            # linspace yields exactly `resolution` points; the previous float-step arange
            # hard-coded 500 steps and could be off by one element due to rounding,
            # which broke the column assignment below.
            synthetic_range = np.linspace(x_min_col, x_max_col, num=resolution, endpoint=False)
        else:
            # Binary indicator: evaluate "off" (0) and "on" (1) only.
            synthetic_range = np.arange(2)

        X_synthetic = pd.concat([x_mean] * len(synthetic_range), ignore_index=True)
        X_synthetic[col] = synthetic_range
        class_probabilities = clf_init.predict_proba(X_synthetic)

        synthetic_probabilities[col] = pd.DataFrame(index=synthetic_range,data=class_probabilities)

    if not disable_plot:
        plt.figure(figsize=(20,190))
        PLOT_COLUMNS = 4

        # Baseline probability at the reference row (was an undefined name `clf`).
        mean_probability = clf_init.predict_proba(x_mean)[0]

        for (i, key) in enumerate(synthetic_probabilities):
            if key in cols_non_binary or show_key is False:
                ax1 = plt.subplot2grid((len(cols), PLOT_COLUMNS), (i // PLOT_COLUMNS, i % PLOT_COLUMNS))
                synthetic_probabilities[key][1].plot(ax=ax1, color='black', linewidth=1.2)
                ax1.set_title(key)
                ax1.axhline(y=mean_probability[1], linestyle='dashed', linewidth=0.8, color='red')

    return synthetic_probabilities

# Initialisation
# Summary statistics (min / max / mean ...) of the SMOTE-balanced feature matrix,
# relabelled with the feature names, plus the real (non-synthetic) concussed rows.
X_stats = pd.DataFrame(X_sm).describe()
X_stats.columns = features_w_enc_role.drop(columns=['Concussed']).columns
X_concussed = features_w_enc_role[features_w_enc_role['Concussed'] == 1].drop(columns=['Concussed'])
cols = X_stats.columns

# Calculate sensitivity around each concussion example
concussed_synth_probs = []

for i in range(X_concussed.shape[0]):
    # x_mean / x_max / x_min are module-level names that platt_generate_graphs reads as
    # globals, so they must be (re)assigned before every call. Here "x_mean" is the
    # i-th real concussion row, not a mean.
    x_mean = pd.DataFrame(X_concussed.iloc[i,:]).transpose()
    x_max = pd.DataFrame(X_stats.loc['max',:]).transpose()
    x_min = pd.DataFrame(X_stats.loc['min',:]).transpose()

    # NOTE(review): 'Role' is listed as non-binary but was dropped after one-hot encoding,
    # so it never matches a column; it looks harmless here.
    synth_probs = platt_generate_graphs(
        X_stats.columns,['YardLineDist','ScoreDifference','Role','Temperature','Velocity','Play_Duration'], disable_plot=True)
    concussed_synth_probs.append(synth_probs)

# Compared to an average example (Maybe compare to avg noncussed be more meaningful)
# Baseline: predicted class probabilities at the per-feature mean of the balanced data.
x_mean = pd.DataFrame(X_stats.loc['mean',:]).transpose()
mean_probability = clf_init.predict_proba(x_mean)[0]
# NOTE(review): this list is never used; `axes` is rebound to a dict before plotting.
axes = []

# Sum over all sensitivity distributions
# NOTE(review): this aliases (does not copy) the first dict, so concussed_synth_probs[0]
# is overwritten with the running sums below.
summed_synthetic_probabilities = concussed_synth_probs[0] 

for i in range(1,len(concussed_synth_probs)):
    for key in summed_synthetic_probabilities:
        summed_synthetic_probabilities[key] = summed_synthetic_probabilities[key] + concussed_synth_probs[i][key]

# Calculate an average agreement of feature sensitivity
avg_synthetic_probabilities = {}

for key in summed_synthetic_probabilities:     
    avg_synthetic_probabilities[key] = summed_synthetic_probabilities[key] / len(concussed_synth_probs) 
    
# Recombine all binary role features into single categorical
binary_role_features = [col for col in avg_synthetic_probabilities if col not in ['YardLineDist','ScoreDifference','Role','Temperature','Velocity','Play_Duration']]
role_recombined = {}

# We calculate the relative increase / decrease in probability of concussions as this is more visual
# i.e. P(class 1 | role indicator = 1) - P(class 1 | role indicator = 0), averaged over the examples.
for key_nb in binary_role_features:
    role_recombined[key_nb] = [avg_synthetic_probabilities[key_nb][1][1] - avg_synthetic_probabilities[key_nb][1][0]]
    
role_probabilities = pd.DataFrame(role_recombined).transpose().sort_values(by=0)

# Maps each continuous feature name to the axes it is drawn on (reused by the annotation loop).
axes = {}

# Plot the average sensitivity
plt.figure(figsize=(20,20))
# plt.suptitle('SVM Model Sensitivity Analysis By Feature')
PLOT_COLUMNS = 5

for (i, key) in enumerate(avg_synthetic_probabilities):
    if key not in binary_role_features:
        # Continuous features are laid out in the three right-hand columns of a 3 x 5 grid;
        # the two left-hand columns are reserved for the role bar chart.
        # NOTE(review): this condition presumably always holds on first pass — confirm.
        if len(axes) < i + 1:
            ax1 = plt.subplot2grid((3, PLOT_COLUMNS), (int(i/(PLOT_COLUMNS-2)),2 + i%(PLOT_COLUMNS-2)))
            axes[key] = ax1
        else:
            ax1 = axes[key]

        avg_synthetic_probabilities[key][1].plot(ax=ax1, color='black', linewidth=1.2)
        ax1.set_title(key)
        # Red dashed line: baseline probability at the mean feature vector.
        ax1.axhline(y=mean_probability[1], linestyle='dashed', linewidth=0.8, color='red')
        ax1.minorticks_on()
    
axr = plt.subplot2grid((3, PLOT_COLUMNS), (0,0), colspan=2, rowspan=2)
axr.set_title('Increase in concussion probability by Role')
role_probabilities.plot(kind='barh', ax=axr)

# Per-feature probability threshold above which a feature value is shaded as "critical".
# NOTE(review): the thresholds look hand-tuned; their derivation is not shown in this notebook.
critical_ranges = {
    'YardLineDist': 0.2,
    'ScoreDifference': 0.15,
    'Temperature': 0.15,
    'Velocity': 0.55,
    'Play_Duration': 0.15,
}

# Annotations 
# Shade the region under each curve where the averaged probability exceeds its threshold.
for (i, key) in enumerate(avg_synthetic_probabilities):
    if key not in binary_role_features:
        ax_ = axes[key]
        critical_values = avg_synthetic_probabilities[key][1] > critical_ranges[key]
        x_values = avg_synthetic_probabilities[key][critical_values][1].index.tolist()
        y_values = avg_synthetic_probabilities[key][critical_values][1].tolist()
        ax_.fill_between(x=x_values,y1=y_values, color='r', alpha=0.3, interpolate=True)

<div class="header2">Sensitivity Analysis</div>
<div class="para">
The SVM model uses the kernel trick to make predictions over a non linear decision boundary. This means the feature space in which classification occurs is extremely high dimensional (actually infinite), thus we can't make a direct link back to the original features we used. Instead we train a logistic hypothesis function on top of the SVM which estimates the probability of concussion given the original feature set.<br><br>

The graphs above represent the probability of concussion for varying values of a given feature (all other features held at a constant value). These probabilities are evaluated locally around the data points for real concussions in order to find the critical regions for concussions in each feature (Highlighted in red).
        </div>
<div class="header2">Feature Sensitivity</div>
<div class="flex-columns">
    <div class="flex-container-large">
        <div class="header3">Role</div>
        <div class="para">
        Since roles are either played or not played by a given player, the role probabilities of concussion have been calculated as the difference between their baseline probability and the probability of concussion when a player takes on a particular role. 
        
        We learn from the model that in general players who play in the roles of punter, punt receiver, guard, full back and wide receiver have increased likelihoods of receiving a concussion. Playing any other role would actually reduce your likelihood of receiving a concussion.
        </div>
    </div>
    <div class="flex-container-large">
        <div class="header3">Yard Line Distance</div>
        <div class="para">
        Yard Line Distance measures the distance from an endzone at which a specific punt play began. According to the SVM model, a punt play which begins in the 19 - 35 yard range is more likely to result in a concussion compared to any other yard line distance.
        </div>
    </div>
</div>
<div class="flex-columns">
    <div class="flex-container-small">
            <div class="header3">Score Difference</div>
            <div class="para">
            Score differences between two teams in the range -14 to 15 points are more likely to involve a concussion compared to any other score range. 
        </div>
     </div>
    <div class="flex-container-small">
        <div class="header3">Pitch Temperature</div>
        <div class="para">
        When play temperatures range between 33F to 64F they are more likely to involve a concussion. 
        </div>
    </div>
    <div class="flex-container-small">
        <div class="header3">Velocity</div>
        <div class="para">
        Similarly to the earlier analysis, the SVM model suggests that players travelling at less than 1.6 m/s are more likely to receive a concussion during play.
        </div>
    </div>
    <div class="flex-container-small">
        <div class="header3">Play Duration</div>
        <div class="para">
        When a play lasts less than 41 seconds there is a greater likelihood of concussion; this does seem consistent with the fact that concussion plays tend to be longer in duration than regular plays.
        </div>
    </div>
</div>
<div class="header2">Summary</div>
<div class="para">
       Given the above critical regions identified we will now proceed to examine video footage which matches the following criteria:
       <div class="list-items">
            <li>Yard Line Distance: <b> 19 to 35 Yards</b></li>
            <li>Score Difference:  <b> -14 to 15 Points</b></li>
            <li>Temperature:  <b> 33 to 64 F</b></li>
            <li>Velocity:  <b> 0 to 2.1 m/s</b></li>
            <li>Play Duration:  <b> 0 to 41 secs</b></li>
        </div>
       Identifying these key concussion incidents will help us form a rule that aims to reduce the concussions that are highly likely according to the SVM model's critical feature regions.
</div>

In [None]:
# Apply critical region restrictions
# Keep only the real concussion rows that fall inside every critical range
# identified by the sensitivity analysis (all bounds inclusive).
X_analysis = data_selected_raw[data_selected_raw['Concussed'] == 1]
yard_line_restrict =(X_analysis['YardLineDist'] >= 19) & (X_analysis['YardLineDist'] <= 35)
score_difference_restrict = (X_analysis['ScoreDifference'] >= -14) & (X_analysis['ScoreDifference'] <= 15)
temperature_restrict = (X_analysis['Temperature'] <= 64) & (X_analysis['Temperature'] >= 33)
velocity_restrict = X_analysis['Velocity'] <= 2.1
duration_restrict = X_analysis['Play_Duration'] <= 41

restricted_concussions = X_analysis[yard_line_restrict & score_difference_restrict & temperature_restrict & velocity_restrict & duration_restrict]

# Video Replay Data
# NOTE: despite the variable names these are the injury and control footage tables, not seasons.
video_data_2016 = pd.read_csv('../input/NFL-Punt-Analytics-Competition/video_footage-injury.csv')
video_data_2017 = pd.read_csv('../input/NFL-Punt-Analytics-Competition/video_footage-control.csv')

# Align the column names so the two tables stack cleanly.
video_data_2016.columns = video_data_2017.columns
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
video_data = pd.concat([video_data_2016, video_data_2017])

# Player Number
player_punt_data = pd.read_csv('../input/NFL-Punt-Analytics-Competition/player_punt_data.csv')
# Some players have held more than one number / position: collapse them to one row per
# player by space-joining the values of every remaining column.
player_punt_data_dedupl = player_punt_data.groupby(by='GSISID').agg(' '.join)
restricted_concussions_number = restricted_concussions.merge(right=player_punt_data_dedupl, on='GSISID', how='left')

# Video - Selected Data
video_data_restricted = video_data.merge(right=restricted_concussions_number, how='inner', left_on=['season','gamekey','playid'], right_on=['Season_Year','GameKey','PlayID'])

#Video Review
video_review = pd.read_csv('../input/NFL-Punt-Analytics-Competition/video_review.csv')
video_review_restricted = video_review.merge(right=video_data_restricted, how='inner', on=['Season_Year','GameKey','PlayID','GSISID'])
# Drop the duplicate right-hand ('_y') columns created by the merge and strip the '_x' marker
# from the left-hand ones.
video_review_restricted = video_review_restricted[[c for c in video_review_restricted.columns if c[-2:] != '_y']]
video_review_restricted.columns = video_review_restricted.columns.str.replace('_x', '')
# NOTE(review): this cast fails if any remaining partner id is missing or non-numeric — confirm
# against the data.
video_review_restricted['Primary_Partner_GSISID'] = video_review_restricted['Primary_Partner_GSISID'].astype('int64')

# Merging Concussion Partner Data
video_review_restricted_partner = video_review_restricted.merge(right=player_punt_data_dedupl, left_on='Primary_Partner_GSISID', right_on='GSISID', how='left', suffixes=('','_partner'))
video_review_restricted_partner = video_review_restricted_partner.merge(right=data_selected_raw, left_on=['Season_Year','GameKey','PlayID','Primary_Partner_GSISID'], right_on=['Season_Year','GameKey','PlayID','GSISID'], how='left', suffixes=('','_partner'))
# Last expression of the cell: display the columns needed to pick replays for review.
video_review_restricted_partner[['Season_Year','GameKey','PlayID','GSISID','Number','Number_partner','Primary_Impact_Type','YardLineDist','ScoreDifference','Temperature','Velocity','Velocity_partner','Player_Activity_Derived','Primary_Partner_GSISID','Preview Link','Play_Duration']]

### Game Previews to Analyse
<a href='https://nfl-vod.cdn.anvato.net/league/5691/18/11/25/284954/284954_75F12432BA90408C92660A696C1A12C8_181125_284954_huber_punt_3200.mp4'>Preview 1: GKey: 231 Play: 1976 Number: 81 (Season 2016) (Red Team) </a><br>
<a href='http://a.video.nfl.com//films/vodzilla/153247/Punt_by_Tress_Way-QsI21aYF-20181119_160141260_5000k.mp4'>Preview 2: GKey: 280 Play: 2918 Number: 87 (Season 2016) </a><br>
<a href='http://a.video.nfl.com//films/vodzilla/153272/Haack_42_yard_punt-iP6aZSRU-20181119_165050694_5000k.mp4'>Preview 3: GKey: 448 Play: 2792 Number: 33 (Season 2017) </a><br>
<a href='http://a.video.nfl.com//films/vodzilla/153291/Palardy_53_yard_punt-XTESVMq9-20181119_170509550_5000k.mp4'>Preview 4: GKey: 567 Play: 1407 Number: 38 / 32 (Season 2017) </a><br>

In [None]:
# Display the full merged table (all columns) for the selected concussion plays.
video_review_restricted_partner

In [None]:
# Hand-entered values, one per selected play, in row order.
# NOTE(review): presumably read off the video replays; the lists assume exactly four rows.
video_review_restricted_partner['YardLineDist_Concussion'] = [21, 15, 41, 14]
video_review_restricted_partner['Play_Duration_Concussion'] = [13, 2, 12, 12]

# Override Play_Duration for rows 0-3 with a single .loc assignment.
# The previous df['Play_Duration'][i] = value form is chained assignment, which may
# write to a temporary copy and is silently ignored under pandas copy-on-write.
video_review_restricted_partner.loc[[0, 1, 2, 3], 'Play_Duration'] = [14, 10, 13, 12]

In [None]:
# One figure with a band of panels per selected play: a role bar chart on the left
# and one sensitivity curve per non-binary feature on the right, annotated with the
# values observed in that play.
# Relies on avg_synthetic_probabilities, binary_role_features and role_probabilities,
# which are defined earlier in the notebook.

# Cache of feature axes, keyed by '<play index>_<feature name>'.
axes = {}

# Plot the average sensitivity
plt.figure(figsize=(20,16))
# plt.suptitle('SVM Model Sensitivity Analysis By Feature')
# Grid width: 2 columns for the role chart plus 5 columns for feature curves.
PLOT_COLUMNS = 7

# Identifier columns of every play to draw; one band of panels per row.
rows = video_review_restricted_partner[['GSISID','PlayID','Season_Year','GameKey']]
VIDEOS_TO_REPLAY =  rows.shape[0]

for k in range(VIDEOS_TO_REPLAY):
    GSISID_v,PLAYID_v,SEASON_v,GAMEKEY_v = rows.iloc[k,:].tolist()
    
    # Re-select the play's full record by matching all four identifiers.
    GSISID = video_review_restricted_partner['GSISID'] == GSISID_v
    PLAYID = video_review_restricted_partner['PlayID'] == PLAYID_v
    SEASON = video_review_restricted_partner['Season_Year'] == SEASON_v
    GAMEKEY = video_review_restricted_partner['GameKey'] == GAMEKEY_v
    selected_play = video_review_restricted_partner[GSISID & PLAYID & SEASON & GAMEKEY]

    # Draw the grey curve for every feature that is not a binary role feature.
    # NOTE(review): i counts skipped binary features too, so the grid position
    # depends on the iteration order of avg_synthetic_probabilities - confirm.
    for (i, key) in enumerate(avg_synthetic_probabilities):
        if key not in binary_role_features:
            if '%d_%s' % (k,key) not in axes:
                # Row offset k*2 stacks the plays; columns 0-1 are left for the role chart.
                ax1 = plt.subplot2grid((VIDEOS_TO_REPLAY*3, PLOT_COLUMNS), (k*2 + int(i/(PLOT_COLUMNS-2)),2 + i%(PLOT_COLUMNS-2)), rowspan=2)
                axes['%d_%s' % (k,key)] = ax1
            else:
                ax1 = axes['%d_%s' % (k,key)]

            # Column 1 is presumably the concussion-class probability - TODO confirm.
            avg_synthetic_probabilities[key][1].plot(ax=ax1, color='grey', linewidth='1.2')
            ax1.set_title(key)
    #       ax1.axhline(y=mean_probability[1], linestyle='dashed', linewidth='0.8', color='red')
            ax1.minorticks_on()
    
    # Per-feature threshold on the curve value; points above it are shaded below.
    # Every non-binary feature must have an entry here, otherwise the lookup raises KeyError.
    critical_ranges = {
        'YardLineDist': 0.18,
        'ScoreDifference': 0.15,
        'Temperature': 0.15,
        'Velocity': 0.50,
        'Play_Duration': 0.122,
    }

    # Left-hand panel for this play: horizontal bar chart of role probabilities.
    axr = plt.subplot2grid((VIDEOS_TO_REPLAY*3, PLOT_COLUMNS), (k*2,0), colspan=2, rowspan=2)
    

    axr.set_title('SEASON: %d GAME KEY: %d PLAY ID: %d\n PLAYER ID: %d Role:' %(SEASON_v,GAMEKEY_v,PLAYID_v, GSISID_v))

    filtered_roles = role_probabilities[role_probabilities[0] >= 0]
    # All bars black ('k'), except the selected player's own role in cyan ('c').
    colors = np.array(['k']*filtered_roles.shape[0])
    colors[filtered_roles.index == selected_play['Role'].values[0]] = 'c'
    # NOTE(review): the colour string is passed via the `colors` keyword (not `color`);
    # verify that the installed pandas version still accepts it.
    filtered_roles.plot(kind='barh',colors=''.join(colors), ax=axr)

    # Annotations 
    for (i, key) in enumerate(avg_synthetic_probabilities):
        if key not in binary_role_features:
            ax_ = axes['%d_%s' % (k,key)]
            # Points of the curve that exceed this feature's threshold.
            critical_values = avg_synthetic_probabilities[key][1] > critical_ranges[key]
            x_values = avg_synthetic_probabilities[key][critical_values][1].index.tolist()
            y_values = avg_synthetic_probabilities[key][critical_values][1].tolist()
            
            # Shade the critical region in red and mark the play's own value with a black arrow.
            ax_.fill_between(x=x_values,y1=y_values, color='r', alpha=0.3, interpolate=True)
            arrow_value = selected_play[key]
            ax_.annotate('',xy=(arrow_value, 0), xytext=(arrow_value, np.max(y_values)+0.05), arrowprops=dict(headwidth=5.5,width=0.8,facecolor='black'))
            ax_.set_ylim((0,np.max(y_values)+0.05))

            # Red arrow: velocity of the primary partner in the collision.
            if key == 'Velocity':
                arrow_value_partner = selected_play["%s_partner" % key]
                
                # Deals with NANS
                # (a NaN compares False against -1, so no arrow is drawn for a missing value)
                if arrow_value_partner.values[0] > -1:
                    ax_.annotate('',xy=(arrow_value_partner, 0), xytext=(arrow_value_partner, np.max(y_values)+0.05), arrowprops=dict(headwidth=5.5,width=0.8,color='red'))
            
            # Blue arrow: value of the YardLineDist_Concussion column for this play.
            if key == 'YardLineDist':
                arrow_value_partner = selected_play["YardLineDist_Concussion"]
                
                # Deals with NANS
                if arrow_value_partner.values[0] > -1:
                    ax_.annotate('',xy=(arrow_value_partner, 0), xytext=(arrow_value_partner, np.max(y_values)+0.05), arrowprops=dict(headwidth=5.5,width=0.8,color='blue'))
            
            # Green arrow: value of the Play_Duration_Concussion column (drawn without a NaN guard).
            if key == 'Play_Duration':
                arrow_value_concussion = selected_play["%s_Concussion" % key]
                ax_.annotate('',xy=(arrow_value_concussion, 0), xytext=(arrow_value_concussion, np.max(y_values)+0.05), arrowprops=dict(headwidth=5.5,width=0.8,color='green'))
            
plt.tight_layout()

<div class="header2">General Observations</div>
<div class="para">
<div class="list-items">
<li>In all incidents the punt receiver had possession of the ball, and in 3 out of 4 cases was advancing towards the opponent's end zone</li>
<li>Punts were kicked at almost the same yard line distance and all concussions occurred within the punt receiving team's half of the field</li>
<li>Score differences tended to exceed ±10 points</li>
<li>All concussions involved one player with a high average velocity and the other at almost stationary average velocity</li>
<li>3 out of 4 concussions involved a helmet to helmet collision</li>
</div>
</div>
<div class="header2">Holes in NGS</div>
<div class="para">
I've attempted to obtain more in-depth information on the actual velocities of players at the point of impact and their precise yard distances, but there are large holes in the NGS data, so some data is missing or is approximated using average estimators. As a result, the data may not coincide perfectly with the evidence from the videos; it has been manually corrected where possible, and in general approximations have been made where appropriate.
</div>

<div class="header2">Section 2 Article 7  PLAYERS IN A DEFENCELESS POSTURE of NFL Rules</div>
<div class="para">
This section of the rulebook states: “It is a foul if a player initiates unnecessary contact against a player who is in a defenceless posture”.<br><br>
This definition currently includes:
<div class="list-items">
<li>Act of Throwing Pass</li>
<li>Running pass route</li>
<li>Attempting a catch</li>
<li>Intended receiver following interception</li>
<li>Tackled stopped runner</li>
<li>PR Attempting Field Kick</li>
<li>…</li>
</div>
My proposition is to extend this definition to cover the rule and condition stated below.
</div>

<div class="header2">Rule</div>
<div class="para">
<div class="list-items">
<li>Any player moving slowly relative to his opponent should be considered defenceless and must not be contacted above the shoulder.</li>
</div>
</div>
<div class="header2">Condition</div>
<div class="para">
<div class="list-items">
<li>This rule applies if and only if the punt play starts between the 20 - 30 yard lines and  the player is not capable of avoiding or
warding off the impending contact of an opponent. </li>
</div>
</div>
<div class="header2">Effect</div>
<div class="para">
<div class="list-items">
<li>Reduces the risk of high velocity helmet to helmet contacts with slow moving players who may not see / be able to avoid the oncoming tackle, thus reducing the risk of dangerous tackles which could lead to concussions with high probability.</li>
</div>
</div>
<div class="header2">Note on Critical Regions</div>
<div class="para">
<div class="list-items">
We can see the critical regions we have selected align well with the concussions that occurred in 2016 / 2017, as a significant proportion of the 37 concussions satisfy at least four of the critical regions, as in the graph below.
</div>
</div>


In [None]:
# Count, for every concussion example, how many of the five critical regions it satisfies.
# Cast each boolean mask straight to 0/1 integers rather than going through
# replace({True: 1, False: 0}), whose result dtype depends on the pandas version.
# assign() returns a new frame, so no columns are written into a filtered slice.
X_analysis = X_analysis.assign(
    YLD_Satisfy=yard_line_restrict.astype(int),
    Score_Satisfy=score_difference_restrict.astype(int),
    Temp_Satisfy=temperature_restrict.astype(int),
    Velocity_Satisfy=velocity_restrict.astype(int),
    Duration_Satisfy=duration_restrict.astype(int),
)
satisfy_columns = ['YLD_Satisfy', 'Score_Satisfy', 'Temp_Satisfy', 'Velocity_Satisfy', 'Duration_Satisfy']
X_analysis['Satisfy_Count'] = X_analysis[satisfy_columns].sum(axis=1)

# Horizontal bars: y axis = number of regions satisfied, x axis = number of examples.
ax = X_analysis.groupby(by=['Satisfy_Count'])['GSISID'].count().plot(kind='barh', title='Frequency of the number of critical regions satisfied by concussion examples')
ax.set_ylabel('Number of critical regions satisfied')
ax.set_xlabel('Concussion Examples')