In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_percentage_error, explained_variance_score

In [3]:
# Import plays data
df = pd.read_csv("./data/plays.csv")

In [4]:
df.head()

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,specialTeamsPlayType,specialTeamsResult,kickerId,...,penaltyCodes,penaltyJerseyNumbers,penaltyYards,preSnapHomeScore,preSnapVisitorScore,passResult,kickLength,kickReturnYardage,playResult,absoluteYardlineNumber
0,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,,,,0,0,,66.0,,40,45
1,2018090600,366,"(9:20) C.Johnston punts 56 yards to ATL 36, Ce...",1,4,4,PHI,Punt,Return,45603.0,...,UNSd,PHI 18,-15.0,0,0,,56.0,5.0,36,18
2,2018090600,658,"(5:03) M.Bryant 21 yard field goal is GOOD, Ce...",1,4,3,ATL,Field Goal,Kick Attempt Good,27091.0,...,,,,0,0,,21.0,,0,13
3,2018090600,677,M.Bosher kicks 64 yards from ATL 35 to PHI 1. ...,1,0,0,ATL,Kickoff,Return,37267.0,...,,,,0,3,,64.0,30.0,34,75
4,2018090600,872,"(:33) C.Johnston punts 65 yards to end zone, C...",1,4,18,PHI,Punt,Touchback,45603.0,...,,,,0,3,,65.0,,45,45


In [19]:
df.passResult.value_counts()

C     23
I     17
S      3
IN     2
Name: passResult, dtype: int64

In [20]:
# Create binary dummy variables for multilabel categorical feature variables
playTypeDummies = pd.get_dummies(df.specialTeamsPlayType)
teamResultDummies = pd.get_dummies(df.specialTeamsResult)
passResultDummies = pd.get_dummies(df.passResult)

In [21]:
playTypeDummies.head()

Unnamed: 0,Extra Point,Field Goal,Kickoff,Punt
0,0,0,1,0
1,0,0,0,1
2,0,1,0,0
3,0,0,1,0
4,0,0,0,1


In [55]:
X = df.copy().drop(['gameId', 'playId', 'playDescription', 'possessionTeam', 'specialTeamsPlayType', 'specialTeamsResult', 'kickerId', 'returnerId', 'kickBlockerId', 'yardlineSide', 'gameClock', 'penaltyCodes', 'penaltyJerseyNumbers', 'playResult', 'passResult'], axis=1)

# Add dummy variable columns to observation data
X = X.join(playTypeDummies)
X = X.join(teamResultDummies)
X = X.join(passResultDummies)

# Replace yardage NaNs with 0
X.penaltyYards = X.penaltyYards.fillna(0)
X.kickReturnYardage = X.kickReturnYardage.fillna(0)

# Replace kick length NaN with 0
X.kickLength = X.kickLength.fillna(0)


y = pd.to_numeric(df.playResult).astype(float)

In [56]:
X.head()

Unnamed: 0,quarter,down,yardsToGo,yardlineNumber,penaltyYards,preSnapHomeScore,preSnapVisitorScore,kickLength,kickReturnYardage,absoluteYardlineNumber,...,Kickoff Team Recovery,Muffed,Non-Special Teams Result,Out of Bounds,Return,Touchback,C,I,IN,S
0,1,0,0,35,0.0,0,0,66.0,0.0,45,...,0,0,0,0,0,1,0,0,0,0
1,1,4,4,8,-15.0,0,0,56.0,5.0,18,...,0,0,0,0,1,0,0,0,0,0
2,1,4,3,3,0.0,0,0,21.0,0.0,13,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,35,0.0,0,3,64.0,30.0,75,...,0,0,0,0,1,0,0,0,0,0
4,1,4,18,35,0.0,0,3,65.0,0.0,45,...,0,0,0,0,0,1,0,0,0,0


In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# Random Forest Predictive Model

In [58]:
from sklearn.ensemble import RandomForestRegressor

In [59]:
# Build random forest model
rf = RandomForestRegressor(n_estimators=1000, oob_score=True)
rf.fit(X_train, y_train)

In [60]:
# Make predictions using test data
y_pred = rf.predict(X_test)

In [65]:
mean_absolute_percentage_error(y_test.values, y_pred)

32564699762363.58

In [67]:
explained_variance_score(y_test.values, y_pred)

0.991881357436049

In [69]:
rf.oob_score_

0.9908064688545024

In [72]:
forest_importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values()
print(forest_importances)

IN                          0.000000e+00
Kickoff Team Recovery       6.183419e-07
Out of Bounds               4.102358e-06
S                           9.869230e-06
Fair Catch                  1.593397e-05
I                           2.592218e-05
Muffed                      8.272068e-05
Downed                      8.306002e-05
Kickoff                     8.764479e-05
Return                      1.223444e-04
Punt                        1.923887e-04
Extra Point                 2.302052e-04
C                           2.756668e-04
Blocked Kick Attempt        3.437246e-04
quarter                     3.539525e-04
Field Goal                  6.128104e-04
preSnapHomeScore            8.575313e-04
down                        9.681534e-04
Non-Special Teams Result    1.000498e-03
absoluteYardlineNumber      1.052733e-03
yardsToGo                   1.165877e-03
preSnapVisitorScore         1.353272e-03
Blocked Punt                1.461907e-03
yardlineNumber              5.637125e-03
Touchback       