## Installations

In [1]:
!pip install gdown



## Imports

In [2]:
import numpy as np
import pandas as pd
import gdown

In [3]:
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

## week1.csv

### download week1.csv

In [4]:
!gdown https://drive.google.com/uc?id=1ipjCqLU-4XVIfhZhS1mxWD_dyatbPlPA&export=download

Downloading...
From: https://drive.google.com/uc?id=1ipjCqLU-4XVIfhZhS1mxWD_dyatbPlPA
To: /content/week1.csv
100% 128M/128M [00:01<00:00, 72.4MB/s]


### load week1_df

In [5]:
# Read week1.csv from the working directory and show the resulting frame
week1_df = pd.read_csv(filepath_or_buffer='week1.csv')
week1_df

Unnamed: 0,gameId,playId,nflId,frameId,time,jerseyNumber,team,playDirection,x,y,s,a,dis,o,dir,event
0,2021090900,97,25511.0,1,2021-09-10T00:26:31.100,12.0,TB,right,37.77,24.22,0.29,0.30,0.03,165.16,84.99,
1,2021090900,97,25511.0,2,2021-09-10T00:26:31.200,12.0,TB,right,37.78,24.22,0.23,0.11,0.02,164.33,92.87,
2,2021090900,97,25511.0,3,2021-09-10T00:26:31.300,12.0,TB,right,37.78,24.24,0.16,0.10,0.01,160.24,68.55,
3,2021090900,97,25511.0,4,2021-09-10T00:26:31.400,12.0,TB,right,37.73,24.25,0.15,0.24,0.06,152.13,296.85,
4,2021090900,97,25511.0,5,2021-09-10T00:26:31.500,12.0,TB,right,37.69,24.26,0.25,0.18,0.04,148.33,287.55,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118117,2021091300,4845,,30,2021-09-14T03:54:20.600,,football,left,52.78,25.23,3.58,1.95,0.37,,,pass_forward
1118118,2021091300,4845,,31,2021-09-14T03:54:20.700,,football,left,50.31,26.46,17.16,0.25,2.77,,,
1118119,2021091300,4845,,32,2021-09-14T03:54:20.800,,football,left,48.66,26.99,17.10,1.05,1.73,,,
1118120,2021091300,4845,,33,2021-09-14T03:54:20.900,,football,left,47.04,27.53,16.98,1.67,1.71,,,


## games.csv

### download games.csv

In [6]:
!gdown https://drive.google.com/uc?id=1ESKJkdAJBK7Wg5FFv-dn3EBL66rT05a-&export=download

Downloading...
From: https://drive.google.com/uc?id=1ESKJkdAJBK7Wg5FFv-dn3EBL66rT05a-
To: /content/games.csv
  0% 0.00/6.74k [00:00<?, ?B/s]100% 6.74k/6.74k [00:00<00:00, 8.51MB/s]


### load games_df

In [7]:
# Read games.csv from the working directory and show the resulting frame
games_df = pd.read_csv(filepath_or_buffer='games.csv')
games_df

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr
0,2021090900,2021,1,09/09/2021,20:20:00,TB,DAL
1,2021091200,2021,1,09/12/2021,13:00:00,ATL,PHI
2,2021091201,2021,1,09/12/2021,13:00:00,BUF,PIT
3,2021091202,2021,1,09/12/2021,13:00:00,CAR,NYJ
4,2021091203,2021,1,09/12/2021,13:00:00,CIN,MIN
...,...,...,...,...,...,...,...
117,2021103109,2021,8,10/31/2021,16:05:00,SEA,JAX
118,2021103110,2021,8,10/31/2021,16:25:00,DEN,WAS
119,2021103111,2021,8,10/31/2021,16:25:00,NO,TB
120,2021103112,2021,8,10/31/2021,20:20:00,MIN,DAL


## plays.csv

### download plays.csv

In [8]:
!gdown https://drive.google.com/uc?id=1G1T2NkiYkRQxvS4s3edPaAV1rYz_2bBA&export=download

Downloading...
From: https://drive.google.com/uc?id=1G1T2NkiYkRQxvS4s3edPaAV1rYz_2bBA
To: /content/plays.csv
  0% 0.00/2.35M [00:00<?, ?B/s] 67% 1.57M/2.35M [00:00<00:00, 13.9MB/s]100% 2.35M/2.35M [00:00<00:00, 18.7MB/s]


### load plays_df

In [9]:
# Read plays.csv from the working directory and show the resulting frame
plays_df = pd.read_csv(filepath_or_buffer='plays.csv')
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


## Data Cleaning/Preprocessing

### check rows where offenseFormation is NaN

In [10]:
# Display the plays whose offenseFormation value is missing
plays_df.loc[plays_df['offenseFormation'].isnull()]

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
916,2021091211,1517,"(:20) (No Huddle, Shotgun) Aa.Rodgers pass inc...",2,3,6,GB,NO,GB,43,...,,53.0,,"1 RB, 1 TE, 3 WR",,"3 DL, 2 LB, 6 DB",,0,2-Man,Man
1570,2021091904,3676,(5:25) T.Bridgewater pass short right to A.Okw...,4,1,10,DEN,JAX,JAX,39,...,,,,,,,,1,Cover-1,Man
1654,2021091906,639,(2:42) M.Jones pass incomplete deep middle to ...,1,2,5,NE,NYJ,NYJ,39,...,,49.0,,"2 RB, 1 TE, 2 WR",,"3 DL, 4 LB, 4 DB",,1,Cover-1,Man
4887,2021101004,3207,(11:03) (Shotgun) T.Lawrence pass deep left in...,4,3,7,JAX,TEN,TEN,9,...,,19.0,,"1 RB, 1 TE, 3 WR",,"2 DL, 3 LB, 6 DB",,0,Cover-1,Man
6874,2021102402,2491,"(5:45) (No Huddle, Shotgun) T.Tagovailoa scram...",3,2,6,MIA,ATL,ATL,6,...,,16.0,,"1 RB, 2 TE, 2 WR",,"2 DL, 4 LB, 5 DB",,0,Bracket,Other
6899,2021102402,3689,(5:37) (Shotgun) T.Tagovailoa pass left to J.W...,4,1,14,MIA,ATL,ATL,44,...,,66.0,,"1 RB, 2 TE, 2 WR",,"2 DL, 4 LB, 5 DB",,1,Cover-3,Zone
7912,2021103105,1905,(:11) (Shotgun) D.Mills pass short right inten...,2,1,10,HOU,LA,LA,47,...,,57.0,,"1 RB, 1 TE, 3 WR",,"2 DL, 3 LB, 6 DB",,0,Quarters,Zone


### filter NaN values
* offenseFormation
* personnelO
* personnelD

In [11]:
# Keep only the plays that have an offenseFormation value
has_formation = plays_df['offenseFormation'].notna()
plays_df = plays_df[has_formation]
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


In [12]:
# Keep only the plays that have a personnelO value
has_personnelO = plays_df['personnelO'].notna()
plays_df = plays_df[has_personnelO]
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


In [13]:
# List the distinct offensive personnel strings that remain
plays_df.loc[:, 'personnelO'].unique()

array(['1 RB, 1 TE, 3 WR', '1 RB, 2 TE, 2 WR', '0 RB, 2 TE, 3 WR',
       '1 RB, 0 TE, 4 WR', '2 RB, 1 TE, 2 WR', '2 RB, 0 TE, 3 WR',
       '2 RB, 2 TE, 1 WR', '1 RB, 3 TE, 1 WR', '2 RB, 3 TE, 0 WR',
       '0 RB, 0 TE, 5 WR', '0 RB, 1 TE, 4 WR', '6 OL, 2 RB, 2 TE, 0 WR',
       '2 QB, 2 RB, 0 TE, 2 WR', '2 QB, 1 RB, 1 TE, 2 WR',
       '6 OL, 1 RB, 1 TE, 2 WR', '2 QB, 1 RB, 2 TE, 1 WR',
       '6 OL, 1 RB, 2 TE, 1 WR', '2 QB, 1 RB, 0 TE, 3 WR',
       '6 OL, 2 RB, 1 TE, 1 WR', '3 RB, 0 TE, 2 WR',
       '2 QB, 6 OL, 1 RB, 1 TE, 1 WR', '0 RB, 3 TE, 2 WR',
       '6 OL, 1 RB, 3 TE, 0 WR', '6 OL, 2 RB, 0 TE, 2 WR',
       '6 OL, 1 RB, 0 TE, 3 WR', '1 RB, 1 TE, 2 WR,1 LB',
       '1 RB, 4 TE, 0 WR', '2 QB, 2 RB, 1 TE, 1 WR',
       '2 QB, 1 RB, 3 TE, 0 WR', '7 OL, 1 RB, 0 TE, 2 WR'], dtype=object)

In [14]:
# Keep only the plays that have a personnelD value
has_personnelD = plays_df['personnelD'].notna()
plays_df = plays_df[has_personnelD]
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


In [15]:
# List the distinct defensive personnel strings that remain
plays_df.loc[:, 'personnelD'].unique()

array(['4 DL, 2 LB, 5 DB', '4 DL, 4 LB, 3 DB', '3 DL, 3 LB, 5 DB',
       '4 DL, 3 LB, 4 DB', '3 DL, 4 LB, 4 DB', '2 DL, 4 LB, 5 DB',
       '2 DL, 2 LB, 7 DB', '1 DL, 5 LB, 5 DB', '2 DL, 3 LB, 6 DB',
       '4 DL, 1 LB, 6 DB', '3 DL, 2 LB, 6 DB', '5 DL, 2 LB, 4 DB',
       '6 DL, 1 LB, 4 DB', '3 DL, 1 LB, 7 DB', '1 DL, 4 LB, 6 DB',
       '4 DL, 6 LB, 1 DB', '0 DL, 3 LB, 8 DB', '1 DL, 3 LB, 7 DB',
       '5 DL, 1 LB, 5 DB', '5 DL, 3 LB, 3 DB', '0 DL, 5 LB, 6 DB',
       '2 DL, 5 LB, 4 DB', '6 DL, 3 LB, 2 DB', '3 DL, 5 LB, 3 DB',
       '5 DL, 5 LB, 1 DB', '1 DL, 2 LB, 8 DB', '6 DL, 4 LB, 1 DB',
       '4 DL, 5 LB, 2 DB', '6 DL, 2 LB, 3 DB'], dtype=object)

### parse personnelO, personnelD

In [16]:
def parse_personnelO(row):
    """Turn an offensive personnel string into per-position counts.

    `row` is a comma-separated string such as '1 RB, 1 TE, 3 WR', where each
    piece is '<count> <position>'. Positions missing from the string count as 0.

    Returns a pd.Series ordered as [RB, TE, WR, QB, OL]. Positions outside
    that set are parsed but not included in the result.
    """
    reported = ('RB', 'TE', 'WR', 'QB', 'OL')
    tally = dict.fromkeys(reported, 0)

    for chunk in row.split(','):
        # Each chunk must consist of exactly two whitespace-separated tokens
        qty, pos = chunk.split()
        tally[pos.strip()] = int(qty)

    return pd.Series([tally[pos] for pos in reported])

def parse_personnelD(row):
    """Turn a defensive personnel string into per-position counts.

    `row` is a comma-separated string such as '4 DL, 2 LB, 5 DB', where each
    piece is '<count> <position>'. Positions missing from the string count as 0.

    Returns a pd.Series ordered as [DL, LB, DB].
    """
    reported = ('DL', 'LB', 'DB')
    tally = {pos: 0 for pos in reported}

    for chunk in row.split(','):
        # Each chunk must consist of exactly two whitespace-separated tokens
        qty, pos = chunk.split()
        tally[pos.strip()] = int(qty)

    return pd.Series([tally[pos] for pos in reported])

# plays_df was produced by boolean filtering in earlier cells; take an explicit
# copy so adding columns below does not trigger pandas' SettingWithCopyWarning.
plays_df = plays_df.copy()

# Parse every personnel string into per-position counts and store them as new columns.
# NOTE(review): 'personnalO_RB' and 'personnalD_DL' look like typos of 'personnel...';
# the names are kept as-is because other cells may already reference them.
plays_df[['personnalO_RB', 'personnelO_TE', 'personnelO_WR', 'personnelO_QB', 'personnelO_OL']] = plays_df['personnelO'].apply(parse_personnelO)
plays_df[['personnalD_DL', 'personnelD_LB', 'personnelD_DB']] = plays_df['personnelD'].apply(parse_personnelD)

plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,pff_passCoverage,pff_passCoverageType,personnalO_RB,personnelO_TE,personnelO_WR,personnelO_QB,personnelO_OL,personnalD_DL,personnelD_LB,personnelD_DB
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,Cover-1,Man,1,1,3,0,0,4,2,5
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,Cover-3,Zone,1,2,2,0,0,4,4,3
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,Cover-3,Zone,0,2,3,0,0,3,3,5
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,Cover-3,Zone,1,2,2,0,0,4,3,4
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,Cover-3,Zone,1,1,3,0,0,3,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,Bracket,Other,1,1,3,0,0,1,3,7
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,Cover-2,Zone,1,1,3,0,0,4,1,6
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,Cover-2,Zone,1,1,3,0,0,4,1,6
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,Cover-2,Zone,1,1,3,0,0,4,1,6


### drop personnelO, personnelD

In [17]:
# The raw personnel strings are no longer needed once their counts are extracted
plays_df = plays_df.drop(columns=['personnelD', 'personnelO'])

In [18]:
# Show the column labels of plays_df after the personnel columns were replaced
plays_df.columns

Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber',
       'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'passResult',
       'penaltyYards', 'prePenaltyPlayResult', 'playResult', 'foulName1',
       'foulNFLId1', 'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
       'absoluteYardlineNumber', 'offenseFormation', 'defendersInBox',
       'dropBackType', 'pff_playAction', 'pff_passCoverage',
       'pff_passCoverageType', 'personnalO_RB', 'personnelO_TE',
       'personnelO_WR', 'personnelO_QB', 'personnelO_OL', 'personnalD_DL',
       'personnelD_LB', 'personnelD_DB'],
      dtype='object')

### one-hot encode possessionTeam

In [19]:
# One-hot encode possessionTeam and append the indicator columns to plays_df
possession_dummies = pd.get_dummies(plays_df['possessionTeam'], prefix='possessionTeam')
plays_df = plays_df.join(possession_dummies)
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,possessionTeam_NO,possessionTeam_NYG,possessionTeam_NYJ,possessionTeam_PHI,possessionTeam_PIT,possessionTeam_SEA,possessionTeam_SF,possessionTeam_TB,possessionTeam_TEN,possessionTeam_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,0,0,0,0,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,0,0,0,0,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,0,0,0,0,0,0,0,0,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,0,0,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,0,1,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,0,1,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,0,1,0,0,0,0,0,0,0,0


### one-hot encode defensiveTeam

In [20]:
# One-hot encode defensiveTeam and append the indicator columns to plays_df
defensive_dummies = pd.get_dummies(plays_df['defensiveTeam'], prefix='defensiveTeam')
plays_df = plays_df.join(defensive_dummies)
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,0,0,0,0,0,0,0,0,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,0,0,0,0,0,0,0,1,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,0,0,0,0,0,0,0,1,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,0,0,0,0,0,0,0,1,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,0,1,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,0,0,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,0,0,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,0,0,0,0,0,0,0,0,0,0


### drop possessionTeam and defensiveTeam columns

In [21]:
# The original team columns are redundant now that their one-hot columns exist
plays_df = plays_df.drop(columns=['possessionTeam', 'defensiveTeam'])

### one-hot encode yardlineSide

In [22]:
# One-hot encode yardlineSide and append the indicator columns to plays_df
yardline_side_dummies = pd.get_dummies(plays_df['yardlineSide'], prefix='yardlineSide')
plays_df = plays_df.join(yardline_side_dummies)
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,...,yardlineSide_NO,yardlineSide_NYG,yardlineSide_NYJ,yardlineSide_PHI,yardlineSide_PIT,yardlineSide_SEA,yardlineSide_SF,yardlineSide_TB,yardlineSide_TEN,yardlineSide_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,33,13:33,0,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,2,13:18,0,...,0,0,0,0,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,34,12:23,0,...,0,0,0,0,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,TB,39,09:56,0,...,0,0,0,0,0,0,0,1,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,TB,44,09:46,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,NYG,8,01:56,17,...,0,1,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,25,01:07,20,...,0,1,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,28,01:01,20,...,0,1,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,20,00:39,20,...,0,1,0,0,0,0,0,0,0,0


### drop yardlineSide

In [23]:
# The original yardlineSide column is redundant now that its one-hot columns exist
plays_df = plays_df.drop(columns=['yardlineSide'])
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,yardlineSide_NO,yardlineSide_NYG,yardlineSide_NYJ,yardlineSide_PHI,yardlineSide_PIT,yardlineSide_SEA,yardlineSide_SF,yardlineSide_TB,yardlineSide_TEN,yardlineSide_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,1,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,1,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,1,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,1,0,0,0,0,0,0,0,0


### one-hot encode pff_passCoverage

In [24]:
# One-hot encode pff_passCoverage and append the indicator columns to plays_df
coverage_dummies = pd.get_dummies(plays_df['pff_passCoverage'], prefix='pff_passCoverage')
plays_df = plays_df.join(coverage_dummies)
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,pff_passCoverage_Cover-0,pff_passCoverage_Cover-1,pff_passCoverage_Cover-2,pff_passCoverage_Cover-3,pff_passCoverage_Cover-6,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,0,0,0,1,0,0,0,0,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,0,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,0,1,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,0,1,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,0,1,0,0,0,0,0,0,0


### one-hot encode pff_passCoverageType

In [25]:
# One-hot encode pff_passCoverageType and append the indicator columns to plays_df
coverage_type_dummies = pd.get_dummies(plays_df['pff_passCoverageType'], prefix='pff_passCoverageType')
plays_df = plays_df.join(coverage_type_dummies)
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,pff_passCoverage_Cover-3,pff_passCoverage_Cover-6,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone,pff_passCoverageType_Man,pff_passCoverageType_Other,pff_passCoverageType_Zone
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,1,0,0,0,0,0,0,0,0,1
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,1,0,0,0,0,0,0,0,0,1
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,1,0,0,0,0,0,0,0,0,1
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,0,0,0,0,0,0,0,1,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,0,0,0,0,0,0,0,0,1
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,0,0,0,0,0,0,0,0,1
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,0,0,0,0,0,0,0,0,1


### drop pff_passCoverageType, pff_passCoverage

In [26]:
# The original coverage columns are redundant now that their one-hot columns exist
plays_df = plays_df.drop(columns=['pff_passCoverageType', 'pff_passCoverage'])

### split gameClock

In [27]:
# Split each gameClock string on ':' into integer minute and second columns
clock_parts = plays_df['gameClock'].str.split(':', expand=True).astype(int)
plays_df[['gameClock_minutes', 'gameClock_seconds']] = clock_parts
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone,pff_passCoverageType_Man,pff_passCoverageType_Other,pff_passCoverageType_Zone,gameClock_minutes,gameClock_seconds
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,0,0,0,0,1,0,0,13,33
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,0,0,0,0,0,0,0,1,13,18
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,0,0,0,0,0,0,0,1,12,23
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,0,0,0,0,0,0,0,1,9,56
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,0,0,0,0,0,0,0,1,9,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,0,0,0,0,0,1,0,1,56
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,0,0,0,0,0,0,1,1,7
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,0,0,0,0,0,0,1,1,1
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,0,0,0,0,0,0,1,0,39


### drop gameClock

In [28]:
# The raw gameClock string is redundant now that minutes and seconds are separate columns
plays_df = plays_df.drop(columns=['gameClock'])

In [29]:
# Show the column labels of plays_df after the preprocessing steps above
plays_df.columns

Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'yardlineNumber', 'preSnapHomeScore', 'preSnapVisitorScore',
       'passResult',
       ...
       'pff_passCoverage_Goal Line', 'pff_passCoverage_Miscellaneous',
       'pff_passCoverage_Prevent', 'pff_passCoverage_Quarters',
       'pff_passCoverage_Red Zone', 'pff_passCoverageType_Man',
       'pff_passCoverageType_Other', 'pff_passCoverageType_Zone',
       'gameClock_minutes', 'gameClock_seconds'],
      dtype='object', length=145)

## LGBM

### features and target

In [30]:
# Collect every column whose name contains the possessionTeam one-hot prefix
possTeam_cols = list(filter(lambda col: 'possessionTeam_' in col, plays_df.columns))
possTeam_cols

['possessionTeam_ARI',
 'possessionTeam_ATL',
 'possessionTeam_BAL',
 'possessionTeam_BUF',
 'possessionTeam_CAR',
 'possessionTeam_CHI',
 'possessionTeam_CIN',
 'possessionTeam_CLE',
 'possessionTeam_DAL',
 'possessionTeam_DEN',
 'possessionTeam_DET',
 'possessionTeam_GB',
 'possessionTeam_HOU',
 'possessionTeam_IND',
 'possessionTeam_JAX',
 'possessionTeam_KC',
 'possessionTeam_LA',
 'possessionTeam_LAC',
 'possessionTeam_LV',
 'possessionTeam_MIA',
 'possessionTeam_MIN',
 'possessionTeam_NE',
 'possessionTeam_NO',
 'possessionTeam_NYG',
 'possessionTeam_NYJ',
 'possessionTeam_PHI',
 'possessionTeam_PIT',
 'possessionTeam_SEA',
 'possessionTeam_SF',
 'possessionTeam_TB',
 'possessionTeam_TEN',
 'possessionTeam_WAS']

In [31]:
# Collect every column whose name contains the defensiveTeam one-hot prefix
defTeam_cols = list(filter(lambda col: 'defensiveTeam_' in col, plays_df.columns))
defTeam_cols

['defensiveTeam_ARI',
 'defensiveTeam_ATL',
 'defensiveTeam_BAL',
 'defensiveTeam_BUF',
 'defensiveTeam_CAR',
 'defensiveTeam_CHI',
 'defensiveTeam_CIN',
 'defensiveTeam_CLE',
 'defensiveTeam_DAL',
 'defensiveTeam_DEN',
 'defensiveTeam_DET',
 'defensiveTeam_GB',
 'defensiveTeam_HOU',
 'defensiveTeam_IND',
 'defensiveTeam_JAX',
 'defensiveTeam_KC',
 'defensiveTeam_LA',
 'defensiveTeam_LAC',
 'defensiveTeam_LV',
 'defensiveTeam_MIA',
 'defensiveTeam_MIN',
 'defensiveTeam_NE',
 'defensiveTeam_NO',
 'defensiveTeam_NYG',
 'defensiveTeam_NYJ',
 'defensiveTeam_PHI',
 'defensiveTeam_PIT',
 'defensiveTeam_SEA',
 'defensiveTeam_SF',
 'defensiveTeam_TB',
 'defensiveTeam_TEN',
 'defensiveTeam_WAS']

In [32]:
# Collect the columns derived from gameClock (names containing 'gameClock_')
gameClock_cols = list(filter(lambda col: 'gameClock_' in col, plays_df.columns))
gameClock_cols

['gameClock_minutes', 'gameClock_seconds']

In [33]:
# Collect every column whose name contains the yardlineSide one-hot prefix
yardlineSide_cols = list(filter(lambda col: 'yardlineSide_' in col, plays_df.columns))
yardlineSide_cols

['yardlineSide_ARI',
 'yardlineSide_ATL',
 'yardlineSide_BAL',
 'yardlineSide_BUF',
 'yardlineSide_CAR',
 'yardlineSide_CHI',
 'yardlineSide_CIN',
 'yardlineSide_CLE',
 'yardlineSide_DAL',
 'yardlineSide_DEN',
 'yardlineSide_DET',
 'yardlineSide_GB',
 'yardlineSide_HOU',
 'yardlineSide_IND',
 'yardlineSide_JAX',
 'yardlineSide_KC',
 'yardlineSide_LA',
 'yardlineSide_LAC',
 'yardlineSide_LV',
 'yardlineSide_MIA',
 'yardlineSide_MIN',
 'yardlineSide_NE',
 'yardlineSide_NO',
 'yardlineSide_NYG',
 'yardlineSide_NYJ',
 'yardlineSide_PHI',
 'yardlineSide_PIT',
 'yardlineSide_SEA',
 'yardlineSide_SF',
 'yardlineSide_TB',
 'yardlineSide_TEN',
 'yardlineSide_WAS']

In [34]:
# Collect the pff_passCoverage one-hot columns; the trailing underscore in the
# pattern keeps the 'pff_passCoverageType_...' columns out of this list
pff_passCoverage_cols = list(filter(lambda col: 'pff_passCoverage_' in col, plays_df.columns))
pff_passCoverage_cols

['pff_passCoverage_2-Man',
 'pff_passCoverage_Bracket',
 'pff_passCoverage_Cover-0',
 'pff_passCoverage_Cover-1',
 'pff_passCoverage_Cover-2',
 'pff_passCoverage_Cover-3',
 'pff_passCoverage_Cover-6',
 'pff_passCoverage_Goal Line',
 'pff_passCoverage_Miscellaneous',
 'pff_passCoverage_Prevent',
 'pff_passCoverage_Quarters',
 'pff_passCoverage_Red Zone']

In [35]:
# Collect every column whose name contains the pff_passCoverageType one-hot prefix
pff_passCoverageType_cols = list(filter(lambda col: 'pff_passCoverageType_' in col, plays_df.columns))
pff_passCoverageType_cols

['pff_passCoverageType_Man',
 'pff_passCoverageType_Other',
 'pff_passCoverageType_Zone']

In [36]:
# Base play-context columns first, then each group of derived / one-hot columns
feats_list = [
    'quarter', 'down', 'yardsToGo', 'yardlineNumber', 'absoluteYardlineNumber',
    'preSnapHomeScore', 'preSnapVisitorScore',
    *possTeam_cols,
    *defTeam_cols,
    *gameClock_cols,
    *yardlineSide_cols,
    *pff_passCoverage_cols,
    *pff_passCoverageType_cols,
]
# Column used as the prediction target (kept as a one-element list)
target_col = ['offenseFormation']

In [37]:
# Select the feature columns and the target column from plays_df
feats = plays_df.loc[:, feats_list]
target = plays_df.loc[:, target_col]

In [38]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    feats,
    target,
    random_state=42,
    test_size=0.3,
)

In [39]:
import lightgbm

### hyperparameters

In [40]:
# Hyperparameters passed to the LightGBM classifier.
lr = 0.1             # learning rate (shrinkage applied to each boosting step)
n_estimators = 75    # number of boosting iterations
random_state = 42    # seed for reproducible training
num_leaves = 32      # maximum number of leaves per tree

### lgbm instantiation

In [41]:
# Build the (still unfitted) gradient-boosting classifier from the
# hyperparameters defined above.
lgbm = lightgbm.LGBMClassifier(
    # tree shape
    num_leaves=num_leaves,
    # boosting schedule
    n_estimators=n_estimators,
    learning_rate=lr,
    # reproducibility
    random_state=random_state,
)

### fit lgbm

In [42]:
# `y_train` is a single-column DataFrame (the target was selected with a list
# of column names), but the estimator expects a 1-D label array. Flatten it
# here so fitting no longer emits the column-vector DataConversionWarning;
# `y_train` / `y_test` themselves are left unchanged for later cells.
lgbm.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 561
[LightGBM] [Info] Number of data points in the train set: 5985, number of used features: 117
[LightGBM] [Info] Start training from score -1.820747
[LightGBM] [Info] Start training from score -3.345153
[LightGBM] [Info] Start training from score -5.652489
[LightGBM] [Info] Start training from score -3.901221
[LightGBM] [Info] Start training from score -0.446130
[LightGBM] [Info] Start training from score -1.974382
[LightGBM] [Info] Start training from score -8.697012


### check feature importances

In [43]:
# Print how many features the fitted model saw, then their names,
# one item per line.
print(len(lgbm.feature_name_), lgbm.feature_name_, sep='\n')

120
['quarter', 'down', 'yardsToGo', 'yardlineNumber', 'absoluteYardlineNumber', 'preSnapHomeScore', 'preSnapVisitorScore', 'possessionTeam_ARI', 'possessionTeam_ATL', 'possessionTeam_BAL', 'possessionTeam_BUF', 'possessionTeam_CAR', 'possessionTeam_CHI', 'possessionTeam_CIN', 'possessionTeam_CLE', 'possessionTeam_DAL', 'possessionTeam_DEN', 'possessionTeam_DET', 'possessionTeam_GB', 'possessionTeam_HOU', 'possessionTeam_IND', 'possessionTeam_JAX', 'possessionTeam_KC', 'possessionTeam_LA', 'possessionTeam_LAC', 'possessionTeam_LV', 'possessionTeam_MIA', 'possessionTeam_MIN', 'possessionTeam_NE', 'possessionTeam_NO', 'possessionTeam_NYG', 'possessionTeam_NYJ', 'possessionTeam_PHI', 'possessionTeam_PIT', 'possessionTeam_SEA', 'possessionTeam_SF', 'possessionTeam_TB', 'possessionTeam_TEN', 'possessionTeam_WAS', 'defensiveTeam_ARI', 'defensiveTeam_ATL', 'defensiveTeam_BAL', 'defensiveTeam_BUF', 'defensiveTeam_CAR', 'defensiveTeam_CHI', 'defensiveTeam_CIN', 'defensiveTeam_CLE', 'defensiveTe

In [44]:
# Print the number of importance scores, then the scores themselves.
# The values are positionally aligned with `lgbm.feature_name_`.
print(len(lgbm.feature_importances_), lgbm.feature_importances_, sep='\n')

120
[ 612  637 1036 1542 1387  989 1279   69   27  140   46   41   27   98
   79   43   43   71   75   44   74   46   53   60   31   42   60  157
   49   44   42   62  103   63   35   73   43   53   53   58   60   62
   26   23   41   30   35   57   65   70   40   27   34   50   42   34
   20   37   46   47   37   40   48   43   78   47   58   95   58   38
    8 1218 1972   31   33   47   30   25   21   49   29   15   37   39
   20   39    9   25   54   11   21   24   46   39   22   21   21   38
   83   21   32   70   28   35   55   47    0   82  140   56  176   85
    0    0    0   91   72  118   76  111]


In [45]:
# Display the class labels the fitted classifier learned from the training target.
lgbm.classes_

array(['EMPTY', 'I_FORM', 'JUMBO', 'PISTOL', 'SHOTGUN', 'SINGLEBACK',
       'WILDCAT'], dtype=object)

### test lgbm

In [46]:
# Predict a formation label for every row of the held-out feature set.
preds = lgbm.predict(X_test)

In [47]:
# Number of rows in the held-out set.
print(X_test.shape[0])

2565


### compute metrics

In [48]:
# Distribution of predicted labels: (sorted unique labels, count of each).
np.unique(preds, return_counts=True)

(array(['EMPTY', 'I_FORM', 'JUMBO', 'PISTOL', 'SHOTGUN', 'SINGLEBACK',
        'WILDCAT'], dtype=object),
 array([ 199,   13,   13,   18, 2020,  291,   11]))

In [49]:
# Distribution of the true labels in the held-out set, in the same
# (labels, counts) form as the predicted-label distribution.
np.unique(y_test['offenseFormation'].to_numpy(), return_counts=True)

(array(['EMPTY', 'I_FORM', 'JUMBO', 'PISTOL', 'SHOTGUN', 'SINGLEBACK',
        'WILDCAT'], dtype=object),
 array([ 427,   87,    9,   33, 1650,  358,    1]))

In [50]:
# Distinct labels that appear among the predictions.
set(preds)

{'EMPTY', 'I_FORM', 'JUMBO', 'PISTOL', 'SHOTGUN', 'SINGLEBACK', 'WILDCAT'}

In [51]:
# Report, one value per line: accuracy, then F1 with weighted, macro and
# micro averaging.
print(
    accuracy_score(y_test, preds),
    f1_score(y_test, preds, average='weighted'),
    f1_score(y_test, preds, average='macro'),
    f1_score(y_test, preds, average='micro'),
    sep='\n',
)

0.6304093567251462
0.5897462373395747
0.2669269081113375
0.6304093567251462


### save model

In [57]:
import joblib

In [58]:
# Persist the fitted classifier to 'lgbm.pkl' in the current working directory.
joblib.dump(lgbm, 'lgbm.pkl')

['lgbm.pkl']

In [59]:
# Reload the classifier that was just saved.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
lgbm_pickle = joblib.load('lgbm.pkl')

In [60]:
# Sanity check: predict with the reloaded model on the same held-out features
# that were used to compute `preds`.
lgbm_pickle.predict(X_test)

array(['EMPTY', 'SHOTGUN', 'SHOTGUN', ..., 'SHOTGUN', 'EMPTY', 'SHOTGUN'],
      dtype=object)