# Random Forest

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('../data/tackles.csv').groupby(['gameId', 'playId']).sum()
df = df.join(
    other = pd.read_csv('../data/plays.csv').set_index(['gameId', 'playId']),
    on = ['gameId', 'playId'],
    how = 'inner'
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nflId,tackle,assist,forcedFumble,pff_missedTackle,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,...,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2
gameId,playId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022090800,56,43294,1,0,0,0,42489,Stefon Diggs,(15:00) (Shotgun) J.Allen pass short right to ...,1,1,...,0.413347,0.586653,-3.1e-05,3.1e-05,1.298699,0.00442,,,,
2022090800,80,53532,1,0,0,0,46076,Josh Allen,"(14:29) (No Huddle, Shotgun) J.Allen scrambles...",1,2,...,0.413316,0.586684,-0.013497,0.013497,1.303119,0.823571,,,,
2022090800,101,42816,1,0,0,0,47857,Devin Singletary,(13:54) D.Singletary right end to BUF 45 for 7...,1,1,...,0.399819,0.600181,-0.01485,0.01485,2.12669,0.562363,,,,
2022090800,122,38577,1,0,0,0,47857,Devin Singletary,(13:15) (Shotgun) J.Allen pass short right to ...,1,2,...,0.384969,0.615031,-0.019032,0.019032,2.689053,0.399209,,,,
2022090800,146,81912,0,2,0,0,47857,Devin Singletary,(12:33) (Shotgun) D.Singletary left tackle to ...,1,1,...,0.365938,0.634062,0.021616,-0.021616,3.088262,-0.514056,,,,


In [3]:
df_cleaned = df[[
    'quarter',
    'down',
    'possessionTeam',
    'yardsToGo',
    'yardlineSide',
    'yardlineNumber',
    'offenseFormation',
    'defendersInTheBox',
    'pff_missedTackle'
]]
df_cleaned.isna().sum()

quarter                0
down                   0
possessionTeam         0
yardsToGo              0
yardlineSide         164
yardlineNumber         0
offenseFormation       4
defendersInTheBox      5
pff_missedTackle       0
dtype: int64

A small number of plays without `offenseFormation` or `defendersInTheBox` observations will be dropped.

In [4]:
df_cleaned.loc[:, 'yardlineSide'] = df_cleaned.apply(lambda x:
    # 0 = Midfield
    0 if x['yardlineSide'] == 'NA'
    # 1 = Own territory
    else 1 if x['yardlineSide'] == x['possessionTeam']
    # 2 = Opp territory
    else 2,
axis=1)
df_cleaned.loc[:, 'offenseFormation'] = df_cleaned['offenseFormation'].map({
    'EMPTY': 1,
    'I_FORM': 2,
    'JUMBO': 3,
    'PISTOL': 4,
    'SHOTGUN': 5,
    'SINGLEBACK': 6,
    'WILDCAT': 7
})
df_cleaned = df_cleaned.dropna()

In [5]:
# Select relevant features and target variable
# The features should be things we know at the start of the play, to predict whether a tackle could be missed during the play.
features = df_cleaned[['quarter', 'down', 'yardsToGo', 'yardlineNumber', 'yardlineSide', 'offenseFormation', 'defendersInTheBox']]
target = df_cleaned['pff_missedTackle'].clip(0, 1)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,quarter,down,yardsToGo,yardlineNumber,yardlineSide,offenseFormation,defendersInTheBox
gameId,playId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022090800,56,1,1,10,25,1,5.0,6.0
2022090800,80,1,2,4,31,1,1.0,6.0
2022090800,101,1,1,10,38,1,2.0,6.0
2022090800,122,1,2,3,45,1,5.0,6.0
2022090800,146,1,1,10,49,2,5.0,6.0
...,...,...,...,...,...,...,...,...
2022110700,3658,4,3,1,44,1,4.0,7.0
2022110700,3686,4,1,10,46,1,2.0,8.0
2022110700,3707,4,2,5,49,2,4.0,9.0
2022110700,3740,4,3,1,45,2,4.0,10.0


In [6]:
# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [7]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(x_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy: .2f}')
print('\nClassification Report:\n', classification_rep)

Accuracy:  0.83

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.96      0.90      2048
           1       0.20      0.06      0.09       356

    accuracy                           0.83      2404
   macro avg       0.53      0.51      0.50      2404
weighted avg       0.76      0.83      0.78      2404

