In [None]:
import requests
import IPython.display as Disp
url = 'https://kaggle-examples-work-bucket.s3.us-east-2.amazonaws.com/Football.DecisionTree.v3.jpg'
# Download the summary infographic. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors instead of
# handing an error page to the image renderer.
response = requests.get(url, timeout=30)
response.raise_for_status()
Disp.Image(response.content,height=800,width=800)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Summary of Research

Focused on the results of 18,000+ plays as characterized by the following:
* quarter 
* down
* distance
* yard line 
* defenders in box
* number of pass rushers 
* score
* the quantity of players for each position on the field

## Steps 
First, I created the database and ran both a principal component analysis (PCA) and a correlation matrix. The results of these experiments are intuitively obvious for a dynamic game with a fixed number of players on the field, positions and outcomes. Next, I ran a decision tree on these variables to predict whether the offense had zero or negative yardage on a play. This led to a fairly transparent but still convoluted decision tree. I cleaned up the tree by hand, considering which decision points lead to better outcomes for defenses, and created an infographic to summarize the result.

## Future Research
Future research would focus on game-theoretic, mixed-equilibrium outcomes (given knowledge of play strategies, offenses would counter, so how do we choose what to do, and with what probability?). **I believe you could use the lower-level data to determine the speed and spatial coverage of the defending backs and wide receivers to make an improved decision tree.**

# Appendix: Research Walkthrough
The data has a few interesting details. First, it consists entirely of passing plays, so any analysis derived from it rests on the assumption that the passing play is known; thus the analysis should be read under the assumption that you are on the offensive side of the ball. Second, much of the provided data is low-level position data per play. This might be extraneous for deriving general principles or performance indicators, so we will focus more on play metadata. Last, we separate our data into control information (knowns) and possible dependent variables we want to predict or understand. The control information is going to be down, distance, score, teams and the types of players on the field.

## Create Main Data Table
For this analysis, we're going to focus on the plays and games data tables, which requires a merge on `gameId`.


In [None]:
#import libraries 
import pandas as pd 

In [None]:
# Read the play-level table from the Kaggle input directory and preview the
# first rows.
plays = pd.read_csv("../input/nfl-big-data-bowl-2021/plays.csv")
plays.head()

In [None]:
# Read the game-level table from the Kaggle input directory and preview the
# first rows.
games = pd.read_csv("../input/nfl-big-data-bowl-2021/games.csv")
games.head()

In [None]:
# Attach game-level columns to every play and save the result as df.
# A left join keeps all plays. validate="many_to_one" makes pandas raise if
# games ever contains a repeated gameId, which would otherwise silently
# duplicate play rows.
df = plays.merge(games, how="left", on="gameId", validate="many_to_one")
df.head()

In [None]:
# Check the size of the merged dataset as (rows, columns).
df.shape

In [None]:
# Check the dtype pandas inferred for each column of the merged dataset.
df.dtypes

## Create New Columns for Analysis

In [None]:
# Build a defense dataframe: split the comma-separated personnelD string so
# each "<count> <position>" token lands in its own column.
personnel_d = df["personnelD"]
defense = personnel_d.str.split(",", expand=True)
defense.head()

In [None]:
# Keep only the first three split fields and discard any further ones.
# Slicing by position (instead of dropping hard-coded column positions in
# place) works however many extra fields the split produced, and the cell can
# be re-run without raising.
defense = defense.iloc[:, :3].copy()
defense.head()

In [None]:
# Give the three positional columns their position-group names.
defense_names = {0: 'DL', 1: 'LB', 2: 'DB'}
defense = defense.rename(columns=defense_names)
defense.head()

In [None]:
# Trim the whitespace the comma split leaves around each "<count> <position>"
# token. Missing values stay NaN. The earlier fillna(0) had no lasting effect
# because .str.strip() returns NaN for non-string entries such as the integer 0.
for col in ['DL', 'LB', 'DB']:
    defense[col] = defense[col].str.strip()

In [None]:
# Keep only the numeric player count from each "<count> <position>" token.
# The regex captures every leading digit, so a two-digit count is not cut down
# to its first character, and tokens with no leading number become NaN.
# NOTE(review): counts are taken by column position; this assumes personnelD
# always lists DL, LB, DB first, in that order - TODO confirm against the data.
for col in ['DL', 'LB', 'DB']:
    defense[col + "_num"] = pd.to_numeric(
        defense[col].str.extract(r'^\s*(\d+)', expand=False)
    )
defense.head()

In [None]:
# Build an offense dataframe: split the comma-separated personnelO string so
# each "<count> <position>" token lands in its own column.
personnel_o = df["personnelO"]
offense = personnel_o.str.split(",", expand=True)
offense.head()

In [None]:
# Keep only the first three split fields and discard any further ones.
# Slicing by position (instead of dropping hard-coded column positions in
# place) works however many extra fields the split produced, and the cell can
# be re-run without raising.
offense = offense.iloc[:, :3].copy()
offense.head()

In [None]:
# Give the three positional columns their position-group names.
offense_names = {0: 'RB', 1: 'TE', 2: 'WR'}
offense = offense.rename(columns=offense_names)
offense.head()

In [None]:
# Trim the whitespace the comma split leaves around each "<count> <position>"
# token. Missing values stay NaN. The earlier fillna(0) had no lasting effect
# because .str.strip() returns NaN for non-string entries such as the integer 0.
for col in ['RB', 'TE', 'WR']:
    offense[col] = offense[col].str.strip()

In [None]:
# Derive the numeric RB / TE / WR counts by matching each position label in the
# raw personnelO string rather than by field position. Strings that start with
# another group (e.g. an "OL" or "QB" entry) shift the split fields, which made
# the positional columns report the wrong group's count.
# - a listed position yields its full (possibly multi-digit) count
# - a position absent from a non-missing string counts as 0
# - a missing personnelO string stays NaN
for pos in ['RB', 'TE', 'WR']:
    counts = pd.to_numeric(
        df['personnelO'].str.extract(r'(\d+)\s*' + pos + r'\b', expand=False)
    )
    offense[pos + '_num'] = counts.fillna(0).where(df['personnelO'].notna())

In [None]:
# Concatenate the defense and offense frames back onto df, column-wise.
# Any personnel columns already attached by a previous run of this cell are
# dropped first, so re-running it does not leave duplicate column names
# (which would make later df.loc[:, [...]] selections return repeated columns).
personnel_cols = defense.columns.union(offense.columns)
df = pd.concat(
    (df.drop(columns=personnel_cols, errors='ignore'), defense, offense),
    axis=1,
)
df.head()

In [None]:
# Investigate duplicate indices: show the rows whose index label repeats an
# earlier row's label (an empty frame means every label is unique).
df[df.index.duplicated()]

In [None]:
# Keep only the first row for each index label.
is_repeat = df.index.duplicated()
df = df.loc[~is_repeat]

## Investigate Relationships within the Numeric Columns 

In [None]:
import numpy as np
import matplotlib.pyplot as plt  
import seaborn as sns 

# Seaborn settings: set the default figure size to 15x15 inches for the plots
# that follow.
sns.set(rc={'figure.figsize':(15,15)})

In [None]:
# Select only the numeric columns used in the analysis: play situation, score,
# outcome measures, and the derived personnel counts.
numeric_cols = [
    'quarter', 'down', 'yardsToGo', 'yardlineNumber',
    'defendersInTheBox', 'numberOfPassRushers',
    'preSnapVisitorScore', 'preSnapHomeScore', 'absoluteYardlineNumber',
    'offensePlayResult', 'playResult', 'epa', 'week',
    'DL_num', 'LB_num', 'DB_num',
    'RB_num', 'TE_num', 'WR_num',
]
num_df = df.loc[:, numeric_cols]
num_df.head()

In [None]:
# Save the column names so they can be re-applied after scaling.
col_names = num_df.columns

In [None]:
# Count the missing values in each numeric column.
num_df.isnull().sum()

In [None]:
# Standardize the data for processing: each column is scaled to zero mean and
# unit variance. The scaled array is wrapped with num_df's own column labels
# and row index so x stays aligned with num_df rather than falling back to
# integer column names and a fresh RangeIndex.
from sklearn.preprocessing import StandardScaler
x = pd.DataFrame(
    StandardScaler().fit_transform(num_df),
    columns=num_df.columns,
    index=num_df.index,
)
x.head()

In [None]:
# Label the standardized columns with the names saved from num_df.
x.columns = col_names
x.head()

In [None]:
# Check the size of the standardized frame as (rows, columns).
x.shape

In [None]:
# Count the missing values in each standardized column.
x.isnull().sum()

In [None]:
# Drop every row with at least one missing value, then report the size that
# remains.
x = x.dropna()
x.shape

## PCA Analysis  
"PCA is dimension reduction technique which takes set of possibly correlated variables and tranforms into linearly uncorrelated principal components. It is used to emphasize variations and bring out strong patterns in a dataset.

In simple words, principal component analysis is a method of extracting important variables from a large set of variables available in a data set. It extracts low dimensional set of features from a high dimensional data set with a motive to capture as much information as possible." - https://ostwalprasad.github.io/machine-learning/PCA-using-python.html

In [None]:
from sklearn.decomposition import PCA
# Project the standardized features onto the first five principal components.
# random_state pins any randomized SVD solver scikit-learn may select, matching
# the fixed seeds used for KMeans and t-SNE later in this notebook.
pcamodel = PCA(n_components=5, random_state=42)
pca = pcamodel.fit_transform(x)
pca.shape

In [None]:
# Bar chart of the variance explained by each component, with the running
# total overlaid as a red line.
variances = pcamodel.explained_variance_
component_ids = range(1, len(variances) + 1)

plt.bar(component_ids, variances)
plt.ylabel('Explained variance')
plt.xlabel('Components')
plt.plot(component_ids, np.cumsum(variances), c='red', label="Cumulative Explained Variance")
plt.legend(loc='upper left')

In [None]:
# Cumulative share of variance explained by the first k components.
# The axis label promises a cumulative curve, so the ratios are accumulated
# with np.cumsum. The x-axis starts at 1 so the first point is component 1.
cumulative_ratio = np.cumsum(pcamodel.explained_variance_ratio_)
plt.plot(range(1, len(cumulative_ratio) + 1), cumulative_ratio)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
# Scree plot: variance explained by each individual component. These values
# are per-component, not cumulative, so the axes are labelled accordingly.
# The x-axis starts at 1 so the first point is component 1.
component_variance = pcamodel.explained_variance_
plt.plot(range(1, len(component_variance) + 1), component_variance)
plt.xlabel('principal component')
plt.ylabel('explained variance')
plt.show()

In [None]:
# Plays projected onto the first two principal components. Labels and a title
# let the figure stand alone; the trailing semicolon hides the artist repr.
plt.scatter(pca[:, 0], pca[:, 1])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Plays projected onto the first two principal components');

In [None]:
# Heatmap of the PCA loadings: one row per component, one column per feature.
component_labels = ["PCA{}".format(i) for i in range(1, pcamodel.n_components_ + 1)]
feature_labels = list(x.columns)
ax = sns.heatmap(
    pcamodel.components_,
    cmap='YlGnBu',
    yticklabels=component_labels,
    xticklabels=feature_labels,
    cbar_kws={"orientation": "horizontal"},
)
ax.set_aspect("equal")

In [None]:
def myplot(score,coeff,labels=None):
    """Draw a biplot: rescaled scores as points, loadings as labelled arrows.

    score  -- 2-D array; columns 0 and 1 are used as the point coordinates.
    coeff  -- 2-D array with one row per variable; columns 0 and 1 give the
              arrow tip for that variable.
    labels -- optional sequence of arrow labels; when None the arrows are
              labelled "Var1", "Var2", ... in green, otherwise in black.

    Draws on the current matplotlib figure and returns None.
    """
    first, second = score[:, 0], score[:, 1]

    # Divide each axis by its own range so the points and arrows share a scale.
    first_scale = 1.0 / (first.max() - first.min())
    second_scale = 1.0 / (second.max() - second.min())
    plt.scatter(first * first_scale, second * second_scale, s=5)

    for idx in range(coeff.shape[0]):
        tip_x, tip_y = coeff[idx, 0], coeff[idx, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        if labels is None:
            caption, colour = "Var" + str(idx + 1), 'green'
        else:
            caption, colour = labels[idx], 'black'
        # Place the caption slightly beyond the arrow tip.
        plt.text(tip_x * 1.15, tip_y * 1.15, caption, color=colour, ha='center', va='center')

    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()

# Biplot of the first two components: the PCA scores as points and the
# transposed loadings (one row per feature) as arrows labelled with the
# feature names.
myplot(pca[:,0:2],np.transpose(pcamodel.components_[0:2, :]),list(x.columns))
plt.show()

In [None]:
import seaborn as sn
# Pairwise correlation matrix of the standardized features, drawn as an
# annotated heatmap.
corrMatrix = x.corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
#https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas
def get_redundant_pairs(df):
    '''Return the diagonal and lower-triangular label pairs of df's correlation matrix.

    Each element is a (row_label, column_label) tuple where the row's column
    position in df is greater than or equal to the column's position.
    '''
    names = df.columns
    return {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    '''Return the n largest absolute pairwise correlations among df's columns.

    The result is a Series indexed by (column, column) pairs, sorted in
    descending order. Self-pairs and mirrored duplicates are removed using
    get_redundant_pairs.
    '''
    redundant = get_redundant_pairs(df)
    abs_pairs = df.corr().abs().unstack()
    ranked = abs_pairs.drop(labels=redundant).sort_values(ascending=False)
    return ranked[:n]

# Print the 20 strongest absolute correlations between distinct feature pairs.
print("Top Absolute Correlations")
print(get_top_abs_correlations(x, 20))

In [None]:
from sklearn import metrics
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

##
# This cell is slow: it fits one KMeans model for every k in K.
##

# Run k-means for many different k and record the distortion for each:
# the mean Euclidean distance from every point to its nearest cluster centre.
distortions = []
K = range(2, 50)
for k in K:
    # Fit once per k. The model was previously fitted a second time on the
    # same data with the same seed, which doubled the runtime for no change.
    k_means = KMeans(n_clusters=k, random_state=42).fit(x)
    nearest_centre = np.min(cdist(x, k_means.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(nearest_centre.sum() / x.shape[0])

In [None]:
# Estimate a suitable K with the elbow method.

# Endpoints of a straight reference line from the first to the last
# (k, distortion) point.
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]

# Plot the elbow: the distortion curve in blue, the reference line in red.
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Fit the final k-means model with the chosen number of clusters and keep the
# cluster label assigned to each row of x.
k = 15
kmeans = KMeans(n_clusters=k, random_state=42)
# NOTE: the name y_pred is rebound later by the decision-tree cell; run the
# notebook top to bottom so the t-SNE plot sees these cluster labels.
y_pred = kmeans.fit_predict(x)

## Dimensionality Reduction with t-SNE

In [None]:
from sklearn.manifold import TSNE

# Embed the standardized features with t-SNE using a fixed seed; verbose=1
# prints progress while fitting.
tsne = TSNE(verbose=1, perplexity=100, random_state=42)
X_embedded = tsne.fit_transform(x)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# A single colour from the "bright" palette. With no hue variable seaborn
# ignores `palette`, so the colour is passed explicitly instead.
palette = sns.color_palette("bright", 1)

# Plot the t-SNE embedding. x and y are passed by keyword because current
# seaborn releases no longer accept them positionally.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], color=palette[0])
plt.title('t-SNE with no Labels')
# NOTE(review): the output file name mentions covid19 although this notebook
# analyses NFL plays; left unchanged in case the file is referenced elsewhere.
plt.savefig("t-sne_covid19.png")
plt.show()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# One distinct colour per k-means cluster.
palette = sns.hls_palette(k, l=.4, s=.9)

# Plot the t-SNE embedding coloured by cluster. x and y are passed by keyword
# because current seaborn releases no longer accept them positionally.
# The hue reads the labels straight from the fitted model: the name y_pred is
# rebound by the decision-tree cell further down, so it is not safe to rely on
# when cells are re-run.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=kmeans.labels_, legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()

In [None]:
# Drop every row of the unscaled numeric frame that has a missing value, then
# report the size that remains.
num_df = num_df.dropna()
num_df.shape

## Decision Tree Modeling 

### Clean up the main data file
We want to select the columns relevant to our analysis, get rid of nulls, and add a binary label column marking the plays on which the offense gained no yardage.

In [None]:
# Preview the first rows of the numeric frame as a quick visual check for NAs.
num_df.head()

In [None]:
# Count the missing values in each column of the numeric frame.
num_df.isnull().sum()

In [None]:
# Drop every row with a missing value, then report the size that remains.
num_df = num_df.dropna()
num_df.shape

In [None]:
# Add the target column: label is 1 when the offense gained zero or negative
# yardage on the play (offensePlayResult <= 0) and 0 when it gained yardage.
num_df['label'] = np.where(num_df['offensePlayResult']<=0, 1, 0)
num_df.dtypes

### Begin Modeling with Decision Trees

In [None]:
#Load Libraries 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [None]:
# Model inputs: play situation, score and personnel counts. The outcome
# columns (offensePlayResult, playResult, epa) are deliberately left out.
feature_cols = [
    'quarter', 'down', 'yardsToGo', 'yardlineNumber',
    'defendersInTheBox', 'numberOfPassRushers',
    'preSnapVisitorScore', 'preSnapHomeScore', 'absoluteYardlineNumber', 'week',
    'DL_num', 'LB_num', 'DB_num',
    'RB_num', 'TE_num', 'WR_num',
]
X = num_df.loc[:, feature_cols]  # Features
y = num_df['label']  # Target variable

X.head()

In [None]:
# Split the dataset into a training set (70%) and a test set (30%); the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [None]:
# Create the decision tree classifier. max_depth=4 keeps the tree small enough
# to read. random_state fixes how ties between equally good splits are broken,
# so the fitted tree is the same on every run.
clf = DecisionTreeClassifier(max_depth=4, random_state=1)

# Train the decision tree classifier on the training split
clf = clf.fit(X_train,y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

In [None]:
# Model accuracy: the share of test plays whose label the classifier predicted
# correctly.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Draw the fitted tree on a large figure.
plt.figure(edgecolor='white',facecolor='black', linewidth=10,figsize=(20,20))

# class_names follow the sorted label values: 0 -> "Offense" (yardage gained),
# 1 -> "Defense" (zero or negative yardage). feature_names is passed as a
# plain list because some scikit-learn releases reject a pandas Index here.
plot_tree(clf,filled=True,rounded=True, fontsize=8, class_names=["Offense","Defense"],feature_names=list(X.columns));
