In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **NFL Big Data Bowl 2022**

#### The punt is one of the most important special-teams plays. Understanding what leads to a score after a punt is one of the key tasks for the coaching staff. This notebook is an attempt to build a model that predicts the outcome of a punt play.

<center>
<img src="https://i.gifer.com/8Zr9.gif" alt="drawing"/>
<img src="https://static.www.nfl.com/image/upload/v1554321393/league/nvfr7ogywskqrfaiu38m.svg" alt="drawing" width="320"/>
</center>

## Initial data processing. Choosing the target and features for the model

In [None]:
# Data loading
# -------------------------------
# Each table's path is defined right next to the read that consumes it.
games_path = '../input/nfl-big-data-bowl-2022/games.csv'
games_data = pd.read_csv(games_path)

plays_path = '../input/nfl-big-data-bowl-2022/plays.csv'
plays_data = pd.read_csv(plays_path)

players_path = '../input/nfl-big-data-bowl-2022/players.csv'
players_data = pd.read_csv(players_path)

PFFScoutingData_path = '../input/nfl-big-data-bowl-2022/PFFScoutingData.csv'
PFFScoutingData_data = pd.read_csv(PFFScoutingData_path)

# Tracking files: only the paths are defined; the reads stay disabled because
# no cell in this notebook uses the tracking tables.
tracking2018_path = '../input/nfl-big-data-bowl-2022/tracking2018.csv'
tracking2019_path = '../input/nfl-big-data-bowl-2022/tracking2019.csv'
tracking2020_path = '../input/nfl-big-data-bowl-2022/tracking2020.csv'
#tracking2018_data = pd.read_csv(tracking2018_path)
#tracking2019_data = pd.read_csv(tracking2019_path)
#tracking2020_data = pd.read_csv(tracking2020_path)

In [None]:
# Let's see what data the NFL provides.
# First, look at the ball-related data loaded from plays_path and PFFScoutingData_path.
#-------------------------------
def describe_data(data, n = 10, d = False):
    """Print a quick overview of a DataFrame and return its first rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to summarise.
    n : int, default 10
        Number of leading rows to return.
    d : bool, default False
        When truthy, also print ``data.describe()``.

    Returns
    -------
    pandas.DataFrame
        ``data.head(n)``, so the notebook renders it as the cell output.
    """
    print(data.shape)
    print(data.columns)
    print()
    # Plain truthiness test instead of comparing against the True literal.
    if d:
        print(data.describe())
    print()
    return data.head(n)

In [None]:
# Overview of the plays table: prints its shape and column names, shows the first 10 rows.
describe_data(plays_data)

In [None]:
# Overview of the PFF scouting table: prints its shape and column names, shows the first 10 rows.
describe_data(PFFScoutingData_data)

#### We will consider only the features that relate to the ball (its position, the distance it was kicked, etc.). All other features, including those from the other datasets, are not considered.  
> **specialTeamsPlayType**: Formation of play: Extra Point, Field Goal, Kickoff or Punt (text)  
> **specialTeamsResult**: Special Teams outcome of play dependent on play type: Blocked Kick Attempt, Blocked Punt, Downed, Fair Catch, Kick Attempt Good, Kick Attempt No Good, Kickoff Team Recovery, Muffed, Non-Special Teams Result, Out of Bounds, Return or Touchback (text)  
> **yardlineNumber**: Yard line at line-of-scrimmage (numeric)  
> **kickLength**: Kick length in air of kickoff, field goal or punt (numeric)  
> **kickReturnYardage**: Yards gained by return team if there was a return on a kickoff or punt (numeric)  
> **playResult**: Net yards gained by the kicking team, including penalty yardage (numeric)  
> **absoluteYardlineNumber**: Location of ball downfield in tracking data coordinates (numeric)  
> **snapDetail**: On Punts, whether the snap was on target and if not, provides detail (H: High, L: Low, <: Left, >: Right, OK: Accurate Snap, text)  
> **kickType**: Kickoff or Punt Type (text). Possible values for punt plays:   
> > N: Normal - standard punt style  
> > R: Rugby style punt  
> > A: Nose down or Aussie-style punts  

> **kickContactType**: Detail on how a punt was fielded, or what happened when it wasn't fielded (text). Possible values:  
> > BB: Bounced Backwards  
> > BC: Bobbled Catch from Air  
> > BF: Bounced Forwards  
> > BOG: Bobbled on Ground  
> > CC: Clean Catch from Air  
> > CFFG: Clean Field From Ground  
> > DEZ: Direct to Endzone  
> > ICC: Incidental Coverage Team Contact  
> > KTB: Kick Team Knocked Back  
> > KTC: Kick Team Catch  
> > KTF: Kick Team Knocked Forward  
> > MBC: Muffed by Contact with Non-Designated Returner  
> > MBDR: Muffed by Designated Returner  
> > OOB: Directly Out Of Bounds  

> **operationTime**: Timing from snap to kick on punt plays in seconds: (numeric)  
> **hangTime**: Hangtime of player's punt or kickoff attempt in seconds. Timing is taken from impact with foot to impact with the ground or a player. (numeric)



In [None]:
# Features chosen from the plays table
col_plays_use_cat = [
    'specialTeamsPlayType',
    'specialTeamsResult',
]
col_plays_use_num = [
    'yardlineNumber',
    'kickLength',
    'kickReturnYardage',
    'playResult',
    'absoluteYardlineNumber',
]

# Features chosen from the PFF scouting table
col_PFF_use_cat = [
    'snapDetail',
    'kickType',
    'kickContactType',
]
col_PFF_use_num = [
    'operationTime',
    'hangTime',
]

In [None]:
# Build the combined working DataFrame
#-------------------------------
def creater_df():
    """Join the selected plays and PFF scouting columns on (gameId, playId).

    Takes no arguments: it reads the module-level frames ``plays_data`` and
    ``PFFScoutingData_data`` and the four ``col_*`` column lists.

    Returns
    -------
    pandas.DataFrame
        Indexed by (gameId, playId); the plays columns come first, followed by
        the PFF columns. ``DataFrame.join`` keeps the rows of the left (plays)
        frame by default.
    """
    key_cols = ['gameId', 'playId']

    plays_part = plays_data[key_cols + col_plays_use_cat + col_plays_use_num].copy()
    plays_part = plays_part.set_index(key_cols)

    pff_part = PFFScoutingData_data[key_cols + col_PFF_use_cat + col_PFF_use_num].copy()
    pff_part = pff_part.set_index(key_cols)

    # The suffixes are only applied to column names present on both sides.
    return plays_part.join(pff_part, lsuffix='_CAN', rsuffix='_UK')

#### Let's define the target of the model. Looking at the results of special-teams plays, the model is intended to predict the outcome of a punt. A punt has eight possible outcomes, and each occurs a very different number of times across the games in the data. We can assume that a model will be able to tell them apart, which justifies choosing specialTeamsResult as the target.

In [None]:
# Count how often each special-teams result occurs for every play type
#-------------------------------
df = creater_df()
result_counts = df.groupby(['specialTeamsPlayType', 'specialTeamsResult'])['specialTeamsResult'].count()
df_temp = result_counts.to_frame()
df_temp


In [None]:
# Build the working DataFrame, restricted to Punt plays
#-------------------------------

df = creater_df()
is_punt = df['specialTeamsPlayType'] == 'Punt'
df_punt = df.loc[is_punt]

# Per-column check: name, non-null count, null count and distinct values
def unique_incol(data):
    """Print a short summary of every column of ``data``.

    For each column, in order, four things are printed: the column name, the
    number of non-null values, the number of null values, and the array of
    distinct values followed by a blank line. Nothing is returned.
    """
    for col in data.columns:
        series = data[col]
        print(series.name)
        print(series.count())
        print(series.isna().sum())
        print(series.unique(), "\n")

#unique_incol(df_punt)
#print("---------------------------------------")

# Drop the rows whose categorical features are missing: these cannot be filled in
punt_categoricals = ['kickType', 'kickContactType', 'snapDetail']
df_punt = df_punt.dropna(subset=punt_categoricals)
# kickReturnYardage has 60% NaN values, so the whole column is dropped at this stage
df_punt = df_punt.drop(columns=['kickReturnYardage'])
# Fill the remaining NaN values with the column mean, rounded to 2 decimals
for timing_col in ('operationTime', 'hangTime'):
    filled = df_punt[timing_col].fillna(df_punt[timing_col].mean())
    df_punt[timing_col] = filled.round(2)

unique_incol(df_punt)
# Drop index level 0 (gameId) so that only playId remains as the index.
# NOTE(review): playId on its own may not be unique across games - confirm
# before using this index as a key.
df_punt = df_punt.droplevel(0)
df_punt

## Data visualization


#### Let's look at the features for each punt outcome. For the numerical features we will build violin plots, and for the categorical features we will build compact tables counting how often each category occurs for each outcome.
#### This lets us evaluate how much each feature differs between outcomes and whether it makes sense to use it.

In [None]:
# Violin plots: distribution of every numerical feature per special-teams result
#-------------------------------
violin_features = [
    'yardlineNumber', 'kickLength',
    'playResult', 'absoluteYardlineNumber',
    'operationTime', 'hangTime',
]

fig = plt.figure(figsize=(18, 15))
gs = fig.add_gridspec(3, 2)

# Fill the 3x2 grid row by row: divmod(position, 2) gives the (row, column) cell.
for position, feature in enumerate(violin_features):
    ax = fig.add_subplot(gs[divmod(position, 2)])
    sns.violinplot(data=df_punt, x='specialTeamsResult', y=feature, ax=ax)

fig.tight_layout()


In [None]:
#View results of special teams and categorical features
#-------------------------------

def _outcome_counts(frame, feature):
    """Count rows per (specialTeamsResult, feature value) as a wide table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'specialTeamsResult' and ``feature``.
    feature : str
        Name of the categorical column to spread across the table columns.

    Returns
    -------
    pandas.DataFrame
        One row per specialTeamsResult, one column per distinct value of
        ``feature``; combinations that never occur are shown as 0.
    """
    counts = frame.groupby(['specialTeamsResult', feature])[feature].count()
    # fillna(0) is applied to every table so that absent combinations read as
    # zero counts rather than NaN.
    return counts.unstack().fillna(0).reset_index()

# 1. Special-teams result vs. how the punt was fielded (kickContactType)
df_temp = _outcome_counts(df_punt, 'kickContactType')
print(df_temp, 5*'\n')

# 2. Special-teams result vs. snap accuracy detail (snapDetail)
df2_temp = _outcome_counts(df_punt, 'snapDetail')
print(df2_temp, 5*'\n')

# 3. Special-teams result vs. punt type (kickType)
df3_temp = _outcome_counts(df_punt, 'kickType')
print(df3_temp)


#### Based on the violin plots, the operationTime feature is practically the same for all punt results, so it makes no sense to take it into account.  
#### The kickContactType feature can cause data leakage, since it effectively categorizes the punt's outcome, and the goal of the model is to predict that outcome. Looking ahead: when training the model, adding kickContactType immediately increased the accuracy by 15%, which confirmed the data leak.

## Model training

#### With the selected data set, this is a classification task: each punt is described by a set of features, and the model learns which combinations of them lead to each punt outcome.

In [None]:
#Preparing categorical data for models
#-------------------------------

from sklearn.preprocessing import LabelEncoder
data2 = df_punt.copy()
label_encoder = LabelEncoder()

columns_LE = {
    "1": 'kickContactType',
    "2": 'snapDetail',
    "3": 'kickType'}

# Only the column names are needed; the dict keys are just placeholders.
for column in columns_LE.values():
    # Raw categories before encoding
    print(data2[column].unique())
    # The encoder is re-fitted for every column, exactly once per column.
    data2[column] = label_encoder.fit_transform(data2[column])
    # Integer code -> original category, for the column just encoded
    print(dict(enumerate(label_encoder.classes_)))
    # Integer codes now stored in the column
    print(data2[column].unique())

#data2

In [None]:
#Selection of the most successful model based on all selected features
#-------------------------------
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# Imported in this cell so the target encoding below does not rely on an
# import made in another cell.
from sklearn.preprocessing import LabelEncoder

# Candidate classifiers, all with default parameters (the random forest is seeded).
models = {
    "XGBClassifier": XGBClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Stochastic Gradient Descent Classifier": SGDClassifier(),
    "Support Vector Classifier": SVC(),
    "Linear Support Vector Classifier": LinearSVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifer": RandomForestClassifier(random_state = 5)
}

# NOTE(review): playResult (net yards gained on the play) is only known once
# the punt is over, so it may leak the outcome in the same way kickContactType
# does - confirm that it should stay in the feature list.
cols_to_use2 = ['yardlineNumber', 'kickLength', 'playResult', 'hangTime', 'snapDetail', 'kickType']
X4 = data2[cols_to_use2]
# Recent XGBoost releases reject string class labels in XGBClassifier.fit and
# expect integer codes 0..n_classes-1, so the target is encoded once for all
# models. target_encoder.classes_ maps the codes back to the outcome names.
target_encoder = LabelEncoder()
y4 = pd.Series(
    target_encoder.fit_transform(data2.specialTeamsResult),
    index=data2.index,
    name='specialTeamsResult',
)
X4_train, X4_valid, y4_train, y4_valid = train_test_split(X4, y4, test_size=0.4, random_state = 11)

# Train every candidate on the same training split.
for name, model in models.items():
    model.fit(X4_train, y4_train)
    print(name + " trained")

print("-------------------------", '\n')

# Score every candidate on the same validation split.
for name, model in models.items():
    print(name)
    predictions4 = model.predict(X4_valid)
    print("Accuracy: %.2f%%" % (accuracy_score(y4_valid, predictions4, normalize=True) * 100.0))

#### The XGBClassifier and RandomForestClassifier models showed the highest accuracy. Let's try to tune their parameters to increase the accuracy further.

In [None]:
#Finding the best parameters for the XGBClassifier model
#-------------------------------

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

X5 = data2[cols_to_use2]
# Recent XGBoost releases reject string class labels in XGBClassifier.fit and
# expect integer codes 0..n_classes-1. target_encoder5.classes_ maps the codes
# back to the outcome names.
target_encoder5 = LabelEncoder()
y5 = pd.Series(
    target_encoder5.fit_transform(data2.specialTeamsResult),
    index=data2.index,
    name='specialTeamsResult',
)
X5_train, X5_valid, y5_train, y5_valid = train_test_split(X5, y5, test_size=0.4, random_state = 11)

# Baseline: default-parameter model on the same split, computed here so the
# printed reference value always matches the data and library versions in use.
baseline_model5 = XGBClassifier()
baseline_model5.fit(X5_train, y5_train)
baseline_accuracy5 = accuracy_score(y5_valid, baseline_model5.predict(X5_valid), normalize=True)

# Hand-tuned model
my_model5 = XGBClassifier(booster='gbtree', max_depth=7, eta=0.07, gamma=0.01, subsample=0.8, colsample_bytree = 1, min_child_weight=2)
my_model5.fit(X5_train, y5_train)

predictions5 = my_model5.predict(X5_valid)

accuracy = accuracy_score(y5_valid, predictions5, normalize=True)
print("Primary Accuracy: %.2f%% (with standard parameters)" % (baseline_accuracy5 * 100.0))
print("Accuracy: %.2f%%" % (accuracy * 100.0))

print('\n')
print(my_model5)

In [None]:
#Finding the best parameters for the RandomForestClassifier model
#-------------------------------

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X6 = data2[cols_to_use2]
y6 = data2.specialTeamsResult
X6_train, X6_valid, y6_train, y6_valid = train_test_split(X6, y6, test_size=0.4, random_state = 11)

# Baseline: default parameters with the same seed as the random forest in the
# model-comparison cell, computed here so the printed reference value always
# matches the data and library versions in use.
baseline_model6 = RandomForestClassifier(random_state = 5)
baseline_model6.fit(X6_train, y6_train)
baseline_accuracy6 = accuracy_score(y6_valid, baseline_model6.predict(X6_valid), normalize=True)

# max_features='auto' is no longer accepted by current scikit-learn releases;
# for classifiers it meant sqrt(n_features), which is what 'sqrt' selects.
my_model6 = RandomForestClassifier(max_depth=15, n_estimators=500, max_features = 'sqrt', random_state = 10)
my_model6.fit(X6_train, y6_train)

predictions6 = my_model6.predict(X6_valid)

accuracy = accuracy_score(y6_valid, predictions6, normalize=True)
print("Primary Accuracy: %.2f%% (with standard parameters)" % (baseline_accuracy6 * 100.0))
print("Accuracy: %.2f%%" % (accuracy * 100.0))

print('\n')
print(my_model6)

### The outcome

### The XGBClassifier model shows the best result and achieves an accuracy of 71.97%.  
### In the future, it is planned to supplement the data with player tracking. Combining data on the position of the ball and of the players on the field after the punter's kick should improve the quality of the model and make it applicable to real play on the field.