## Installations

In [1]:
!pip install gdown
!pip install gower



## Imports

In [2]:
import numpy as np
import pandas as pd
import gdown

In [37]:
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer

In [4]:
import gower
from collections import Counter

## plays.csv

### download plays.csv

In [5]:
!gdown https://drive.google.com/uc?id=1G1T2NkiYkRQxvS4s3edPaAV1rYz_2bBA&export=download

Downloading...
From: https://drive.google.com/uc?id=1G1T2NkiYkRQxvS4s3edPaAV1rYz_2bBA
To: /content/plays.csv
  0% 0.00/2.35M [00:00<?, ?B/s]100% 2.35M/2.35M [00:00<00:00, 133MB/s]


### load plays.csv

In [6]:
plays_df = pd.read_csv('plays.csv')
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


## Data Cleaning/Preprocessing

### check offenseFormation isna

In [7]:
plays_df[plays_df['offenseFormation'].isna()]

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
916,2021091211,1517,"(:20) (No Huddle, Shotgun) Aa.Rodgers pass inc...",2,3,6,GB,NO,GB,43,...,,53.0,,"1 RB, 1 TE, 3 WR",,"3 DL, 2 LB, 6 DB",,0,2-Man,Man
1570,2021091904,3676,(5:25) T.Bridgewater pass short right to A.Okw...,4,1,10,DEN,JAX,JAX,39,...,,,,,,,,1,Cover-1,Man
1654,2021091906,639,(2:42) M.Jones pass incomplete deep middle to ...,1,2,5,NE,NYJ,NYJ,39,...,,49.0,,"2 RB, 1 TE, 2 WR",,"3 DL, 4 LB, 4 DB",,1,Cover-1,Man
4887,2021101004,3207,(11:03) (Shotgun) T.Lawrence pass deep left in...,4,3,7,JAX,TEN,TEN,9,...,,19.0,,"1 RB, 1 TE, 3 WR",,"2 DL, 3 LB, 6 DB",,0,Cover-1,Man
6874,2021102402,2491,"(5:45) (No Huddle, Shotgun) T.Tagovailoa scram...",3,2,6,MIA,ATL,ATL,6,...,,16.0,,"1 RB, 2 TE, 2 WR",,"2 DL, 4 LB, 5 DB",,0,Bracket,Other
6899,2021102402,3689,(5:37) (Shotgun) T.Tagovailoa pass left to J.W...,4,1,14,MIA,ATL,ATL,44,...,,66.0,,"1 RB, 2 TE, 2 WR",,"2 DL, 4 LB, 5 DB",,1,Cover-3,Zone
7912,2021103105,1905,(:11) (Shotgun) D.Mills pass short right inten...,2,1,10,HOU,LA,LA,47,...,,57.0,,"1 RB, 1 TE, 3 WR",,"2 DL, 3 LB, 6 DB",,0,Quarters,Zone


### filter NaN values
* offenseFormation
* personnelO
* personnelD

In [8]:
plays_df = plays_df[~plays_df['offenseFormation'].isna()]
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


In [9]:
plays_df = plays_df[~plays_df['personnelO'].isna()]
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


In [10]:
plays_df['personnelO'].unique()

array(['1 RB, 1 TE, 3 WR', '1 RB, 2 TE, 2 WR', '0 RB, 2 TE, 3 WR',
       '1 RB, 0 TE, 4 WR', '2 RB, 1 TE, 2 WR', '2 RB, 0 TE, 3 WR',
       '2 RB, 2 TE, 1 WR', '1 RB, 3 TE, 1 WR', '2 RB, 3 TE, 0 WR',
       '0 RB, 0 TE, 5 WR', '0 RB, 1 TE, 4 WR', '6 OL, 2 RB, 2 TE, 0 WR',
       '2 QB, 2 RB, 0 TE, 2 WR', '2 QB, 1 RB, 1 TE, 2 WR',
       '6 OL, 1 RB, 1 TE, 2 WR', '2 QB, 1 RB, 2 TE, 1 WR',
       '6 OL, 1 RB, 2 TE, 1 WR', '2 QB, 1 RB, 0 TE, 3 WR',
       '6 OL, 2 RB, 1 TE, 1 WR', '3 RB, 0 TE, 2 WR',
       '2 QB, 6 OL, 1 RB, 1 TE, 1 WR', '0 RB, 3 TE, 2 WR',
       '6 OL, 1 RB, 3 TE, 0 WR', '6 OL, 2 RB, 0 TE, 2 WR',
       '6 OL, 1 RB, 0 TE, 3 WR', '1 RB, 1 TE, 2 WR,1 LB',
       '1 RB, 4 TE, 0 WR', '2 QB, 2 RB, 1 TE, 1 WR',
       '2 QB, 1 RB, 3 TE, 0 WR', '7 OL, 1 RB, 0 TE, 2 WR'], dtype=object)

In [11]:
plays_df = plays_df[~plays_df['personnelD'].isna()]
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,,18.0,SHOTGUN,"1 RB, 1 TE, 3 WR",4.0,"1 DL, 3 LB, 7 DB",SCRAMBLE,0,Bracket,Other
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",SCRAMBLE,0,Cover-2,Zone
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,,38.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Cover-2,Zone


In [12]:
plays_df['personnelD'].unique()

array(['4 DL, 2 LB, 5 DB', '4 DL, 4 LB, 3 DB', '3 DL, 3 LB, 5 DB',
       '4 DL, 3 LB, 4 DB', '3 DL, 4 LB, 4 DB', '2 DL, 4 LB, 5 DB',
       '2 DL, 2 LB, 7 DB', '1 DL, 5 LB, 5 DB', '2 DL, 3 LB, 6 DB',
       '4 DL, 1 LB, 6 DB', '3 DL, 2 LB, 6 DB', '5 DL, 2 LB, 4 DB',
       '6 DL, 1 LB, 4 DB', '3 DL, 1 LB, 7 DB', '1 DL, 4 LB, 6 DB',
       '4 DL, 6 LB, 1 DB', '0 DL, 3 LB, 8 DB', '1 DL, 3 LB, 7 DB',
       '5 DL, 1 LB, 5 DB', '5 DL, 3 LB, 3 DB', '0 DL, 5 LB, 6 DB',
       '2 DL, 5 LB, 4 DB', '6 DL, 3 LB, 2 DB', '3 DL, 5 LB, 3 DB',
       '5 DL, 5 LB, 1 DB', '1 DL, 2 LB, 8 DB', '6 DL, 4 LB, 1 DB',
       '4 DL, 5 LB, 2 DB', '6 DL, 2 LB, 3 DB'], dtype=object)

### parse personnelO, personnelD

In [13]:
def parse_personnelO(row):
    parts = row.split(',')
    counts = {'RB': 0, 'TE': 0, 'WR': 0, 'QB': 0, 'OL': 0}

    for part in parts:
        number, position = part.split()
        counts[position.strip()] = int(number)

    return pd.Series([counts['RB'], counts['TE'], counts['WR'], counts['QB'], counts['OL']])

def parse_personnelD(row):
    parts = row.split(',')
    counts = {'DL': 0, 'LB': 0, 'DB': 0}

    for part in parts:
        number, position = part.split()
        counts[position.strip()] = int(number)

    return pd.Series([counts['DL'], counts['LB'], counts['DB']])

# Apply the function and assign results to new columns
plays_df[['personnalO_RB', 'personnelO_TE', 'personnelO_WR', 'personnelO_QB', 'personnelO_OL']] = plays_df['personnelO'].apply(parse_personnelO)
plays_df[['personnalD_DL', 'personnelD_LB', 'personnelD_DB']] = plays_df['personnelD'].apply(parse_personnelD)

plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,pff_passCoverage,pff_passCoverageType,personnalO_RB,personnelO_TE,personnelO_WR,personnelO_QB,personnelO_OL,personnalD_DL,personnelD_LB,personnelD_DB
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,Cover-1,Man,1,1,3,0,0,4,2,5
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,Cover-3,Zone,1,2,2,0,0,4,4,3
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,Cover-3,Zone,0,2,3,0,0,3,3,5
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,Cover-3,Zone,1,2,2,0,0,4,3,4
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,Cover-3,Zone,1,1,3,0,0,3,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,Bracket,Other,1,1,3,0,0,1,3,7
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,Cover-2,Zone,1,1,3,0,0,4,1,6
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,Cover-2,Zone,1,1,3,0,0,4,1,6
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,Cover-2,Zone,1,1,3,0,0,4,1,6


### drop personnelO, personnelD

In [14]:
plays_df = plays_df.drop(['personnelD', 'personnelO'], axis=1)

In [15]:
plays_df.columns

Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber',
       'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'passResult',
       'penaltyYards', 'prePenaltyPlayResult', 'playResult', 'foulName1',
       'foulNFLId1', 'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
       'absoluteYardlineNumber', 'offenseFormation', 'defendersInBox',
       'dropBackType', 'pff_playAction', 'pff_passCoverage',
       'pff_passCoverageType', 'personnalO_RB', 'personnelO_TE',
       'personnelO_WR', 'personnelO_QB', 'personnelO_OL', 'personnalD_DL',
       'personnelD_LB', 'personnelD_DB'],
      dtype='object')

### one-hot encode possessionTeam

In [16]:
plays_df = plays_df.join(pd.get_dummies(plays_df['possessionTeam'], prefix='possessionTeam'))
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,possessionTeam_NO,possessionTeam_NYG,possessionTeam_NYJ,possessionTeam_PHI,possessionTeam_PIT,possessionTeam_SEA,possessionTeam_SF,possessionTeam_TB,possessionTeam_TEN,possessionTeam_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,0,0,0,0,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,0,0,0,0,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,0,0,0,0,0,0,0,0,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,0,0,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,0,1,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,0,1,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,0,1,0,0,0,0,0,0,0,0


### one-hot encode defensiveTeam

In [17]:
plays_df = plays_df.join(pd.get_dummies(plays_df['defensiveTeam'], prefix='defensiveTeam'))
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,0,0,0,0,0,0,0,0,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,0,0,0,0,0,0,0,1,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,0,0,0,0,0,0,0,1,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,0,0,0,0,0,0,0,1,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,KC,NYG,NYG,8,...,0,1,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,KC,NYG,25,...,0,0,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,KC,NYG,28,...,0,0,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,KC,NYG,20,...,0,0,0,0,0,0,0,0,0,0


### drop offensiveTeam and defensiveTeam columns

In [18]:
plays_df = plays_df.drop(['possessionTeam', 'defensiveTeam'], axis=1)

### one-hot encode yardlineSide

In [19]:
plays_df = plays_df.join(pd.get_dummies(plays_df['yardlineSide'], prefix='yardlineSide'))
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,...,yardlineSide_NO,yardlineSide_NYG,yardlineSide_NYJ,yardlineSide_PHI,yardlineSide_PIT,yardlineSide_SEA,yardlineSide_SF,yardlineSide_TB,yardlineSide_TEN,yardlineSide_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,33,13:33,0,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,2,13:18,0,...,0,0,0,0,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,34,12:23,0,...,0,0,0,0,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,TB,39,09:56,0,...,0,0,0,0,0,0,0,1,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,TB,44,09:46,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,NYG,8,01:56,17,...,0,1,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,NYG,25,01:07,20,...,0,1,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,NYG,28,01:01,20,...,0,1,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,NYG,20,00:39,20,...,0,1,0,0,0,0,0,0,0,0


### drop yardlineSide

In [20]:
plays_df = plays_df.drop(['yardlineSide'], axis=1)
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,yardlineSide_NO,yardlineSide_NYG,yardlineSide_NYJ,yardlineSide_PHI,yardlineSide_PIT,yardlineSide_SEA,yardlineSide_SF,yardlineSide_TB,yardlineSide_TEN,yardlineSide_WAS
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,1,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,1,0,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,1,0,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,1,0,0,0,0,0,0,0,0


### one-hot encode pff_passCoverage

In [21]:
plays_df = plays_df.join(pd.get_dummies(plays_df['pff_passCoverage'], prefix='pff_passCoverage'))
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,pff_passCoverage_Cover-0,pff_passCoverage_Cover-1,pff_passCoverage_Cover-2,pff_passCoverage_Cover-3,pff_passCoverage_Cover-6,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,0,0,0,1,0,0,0,0,0,0
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,0,0,0,0,0,0,0,0,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,0,1,0,0,0,0,0,0,0
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,0,1,0,0,0,0,0,0,0
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,0,1,0,0,0,0,0,0,0


### one-hot encode pff_passCoverageType

In [22]:
plays_df = plays_df.join(pd.get_dummies(plays_df['pff_passCoverageType'], prefix='pff_passCoverageType'))
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,pff_passCoverage_Cover-3,pff_passCoverage_Cover-6,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone,pff_passCoverageType_Man,pff_passCoverageType_Other,pff_passCoverageType_Zone
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,1,0,0,0,0,0,0,0,0,1
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,1,0,0,0,0,0,0,0,0,1
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,1,0,0,0,0,0,0,0,0,1
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,0,0,0,0,0,0,0,1,0
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,0,0,0,0,0,0,0,0,1
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,0,0,0,0,0,0,0,0,1
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,0,0,0,0,0,0,0,0,1


### drop pff_passCoverageType, pff_passCoverage

In [23]:
plays_df = plays_df.drop(['pff_passCoverageType', 'pff_passCoverage'], axis=1)

### split gameClock

In [24]:
plays_df[['gameClock_minutes', 'gameClock_seconds']] = plays_df['gameClock'].str.split(':', expand=True).astype(int)
plays_df

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,...,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone,pff_passCoverageType_Man,pff_passCoverageType_Other,pff_passCoverageType_Zone,gameClock_minutes,gameClock_seconds
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,33,13:33,0,0,...,0,0,0,0,0,1,0,0,13,33
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,2,13:18,0,0,...,0,0,0,0,0,0,0,1,13,18
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,34,12:23,0,0,...,0,0,0,0,0,0,0,1,12,23
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,39,09:56,0,0,...,0,0,0,0,0,0,0,1,9,56
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,44,09:46,0,0,...,0,0,0,0,0,0,0,1,9,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,2021110100,4310,(1:56) (Shotgun) P.Mahomes sacked at NYG 16 fo...,4,3,8,8,01:56,17,17,...,0,0,0,0,0,0,1,0,1,56
8553,2021110100,4363,(1:07) (Shotgun) D.Jones pass short right to E...,4,1,10,25,01:07,20,17,...,0,0,0,0,0,0,0,1,1,7
8554,2021110100,4392,"(1:01) (No Huddle, Shotgun) D.Jones sacked at ...",4,2,7,28,01:01,20,17,...,0,0,0,0,0,0,0,1,1,1
8555,2021110100,4411,"(:39) (No Huddle, Shotgun) D.Jones pass incomp...",4,3,15,20,00:39,20,17,...,0,0,0,0,0,0,0,1,0,39


### drop gameClock

In [25]:
plays_df = plays_df.drop(['gameClock'], axis=1)

In [26]:
plays_df.columns

Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'yardlineNumber', 'preSnapHomeScore', 'preSnapVisitorScore',
       'passResult',
       ...
       'pff_passCoverage_Goal Line', 'pff_passCoverage_Miscellaneous',
       'pff_passCoverage_Prevent', 'pff_passCoverage_Quarters',
       'pff_passCoverage_Red Zone', 'pff_passCoverageType_Man',
       'pff_passCoverageType_Other', 'pff_passCoverageType_Zone',
       'gameClock_minutes', 'gameClock_seconds'],
      dtype='object', length=145)

## features

In [27]:
possTeam_cols = [x for x in plays_df.columns if 'possessionTeam_' in x]
possTeam_cols

['possessionTeam_ARI',
 'possessionTeam_ATL',
 'possessionTeam_BAL',
 'possessionTeam_BUF',
 'possessionTeam_CAR',
 'possessionTeam_CHI',
 'possessionTeam_CIN',
 'possessionTeam_CLE',
 'possessionTeam_DAL',
 'possessionTeam_DEN',
 'possessionTeam_DET',
 'possessionTeam_GB',
 'possessionTeam_HOU',
 'possessionTeam_IND',
 'possessionTeam_JAX',
 'possessionTeam_KC',
 'possessionTeam_LA',
 'possessionTeam_LAC',
 'possessionTeam_LV',
 'possessionTeam_MIA',
 'possessionTeam_MIN',
 'possessionTeam_NE',
 'possessionTeam_NO',
 'possessionTeam_NYG',
 'possessionTeam_NYJ',
 'possessionTeam_PHI',
 'possessionTeam_PIT',
 'possessionTeam_SEA',
 'possessionTeam_SF',
 'possessionTeam_TB',
 'possessionTeam_TEN',
 'possessionTeam_WAS']

In [28]:
defTeam_cols = [x for x in plays_df.columns if 'defensiveTeam_' in x]
defTeam_cols

['defensiveTeam_ARI',
 'defensiveTeam_ATL',
 'defensiveTeam_BAL',
 'defensiveTeam_BUF',
 'defensiveTeam_CAR',
 'defensiveTeam_CHI',
 'defensiveTeam_CIN',
 'defensiveTeam_CLE',
 'defensiveTeam_DAL',
 'defensiveTeam_DEN',
 'defensiveTeam_DET',
 'defensiveTeam_GB',
 'defensiveTeam_HOU',
 'defensiveTeam_IND',
 'defensiveTeam_JAX',
 'defensiveTeam_KC',
 'defensiveTeam_LA',
 'defensiveTeam_LAC',
 'defensiveTeam_LV',
 'defensiveTeam_MIA',
 'defensiveTeam_MIN',
 'defensiveTeam_NE',
 'defensiveTeam_NO',
 'defensiveTeam_NYG',
 'defensiveTeam_NYJ',
 'defensiveTeam_PHI',
 'defensiveTeam_PIT',
 'defensiveTeam_SEA',
 'defensiveTeam_SF',
 'defensiveTeam_TB',
 'defensiveTeam_TEN',
 'defensiveTeam_WAS']

In [29]:
gameClock_cols = [x for x in plays_df.columns if 'gameClock_' in x]
gameClock_cols

['gameClock_minutes', 'gameClock_seconds']

In [30]:
yardlineSide_cols = [x for x in plays_df.columns if 'yardlineSide_' in x]
yardlineSide_cols

['yardlineSide_ARI',
 'yardlineSide_ATL',
 'yardlineSide_BAL',
 'yardlineSide_BUF',
 'yardlineSide_CAR',
 'yardlineSide_CHI',
 'yardlineSide_CIN',
 'yardlineSide_CLE',
 'yardlineSide_DAL',
 'yardlineSide_DEN',
 'yardlineSide_DET',
 'yardlineSide_GB',
 'yardlineSide_HOU',
 'yardlineSide_IND',
 'yardlineSide_JAX',
 'yardlineSide_KC',
 'yardlineSide_LA',
 'yardlineSide_LAC',
 'yardlineSide_LV',
 'yardlineSide_MIA',
 'yardlineSide_MIN',
 'yardlineSide_NE',
 'yardlineSide_NO',
 'yardlineSide_NYG',
 'yardlineSide_NYJ',
 'yardlineSide_PHI',
 'yardlineSide_PIT',
 'yardlineSide_SEA',
 'yardlineSide_SF',
 'yardlineSide_TB',
 'yardlineSide_TEN',
 'yardlineSide_WAS']

In [31]:
pff_passCoverage_cols = [x for x in plays_df.columns if 'pff_passCoverage_' in x]
pff_passCoverage_cols

['pff_passCoverage_2-Man',
 'pff_passCoverage_Bracket',
 'pff_passCoverage_Cover-0',
 'pff_passCoverage_Cover-1',
 'pff_passCoverage_Cover-2',
 'pff_passCoverage_Cover-3',
 'pff_passCoverage_Cover-6',
 'pff_passCoverage_Goal Line',
 'pff_passCoverage_Miscellaneous',
 'pff_passCoverage_Prevent',
 'pff_passCoverage_Quarters',
 'pff_passCoverage_Red Zone']

In [32]:
pff_passCoverageType_cols = [x for x in plays_df.columns if 'pff_passCoverageType_' in x]
pff_passCoverageType_cols

['pff_passCoverageType_Man',
 'pff_passCoverageType_Other',
 'pff_passCoverageType_Zone']

In [33]:
feats_list = ['quarter', 'down', 'yardsToGo', 'yardlineNumber', 'absoluteYardlineNumber', 'preSnapHomeScore', 'preSnapVisitorScore']
feats_list.extend(possTeam_cols)
feats_list.extend(defTeam_cols)
feats_list.extend(gameClock_cols)
feats_list.extend(yardlineSide_cols)
feats_list.extend(pff_passCoverage_cols)
feats_list.extend(pff_passCoverageType_cols)
target_col = ['offenseFormation']
feats_list.extend(target_col)

In [43]:
knn_gower_df = plays_df[feats_list]

## map target_col string to int labels

In [44]:
knn_gower_df['offenseFormation'].unique()

array(['SHOTGUN', 'EMPTY', 'SINGLEBACK', 'I_FORM', 'JUMBO', 'PISTOL',
       'WILDCAT'], dtype=object)

In [45]:
offForm2codes = {'SHOTGUN':0, 'EMPTY':1, 'SINGLEBACK':2, 'I_FORM':3, 'JUMBO':4, 'PISTOL':5, 'WILDCAT':6}
codes2offForm = {0:'SHOTGUN', 1:'EMPTY', 2:'SINGLEBACK', 3:'I_FORM', 4:'JUMBO', 5:'PISTOL', 6:'WILDCAT'}

In [46]:
knn_gower_df['offenseFormation'] = knn_gower_df['offenseFormation'].map(offForm2codes)
knn_gower_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn_gower_df['offenseFormation'] = knn_gower_df['offenseFormation'].map(offForm2codes)


Unnamed: 0,quarter,down,yardsToGo,yardlineNumber,absoluteYardlineNumber,preSnapHomeScore,preSnapVisitorScore,possessionTeam_ARI,possessionTeam_ATL,possessionTeam_BAL,...,pff_passCoverage_Cover-6,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone,pff_passCoverageType_Man,pff_passCoverageType_Other,pff_passCoverageType_Zone,offenseFormation
0,1,3,2,33,43.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,1,10,2,108.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,1,2,6,34,76.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,1,10,39,49.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2
4,1,3,15,44,54.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,4,3,8,8,18.0,17,17,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8553,4,1,10,25,35.0,20,17,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8554,4,2,7,28,38.0,20,17,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8555,4,3,15,20,30.0,20,17,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## knn impute

In [47]:
imputer = KNNImputer(n_neighbors=2)
imputed_knn_gower_df = imputer.fit_transform(knn_gower_df)

In [48]:
imputed_knn_gower_df

array([[ 1.,  3.,  2., ...,  0.,  0.,  0.],
       [ 1.,  1., 10., ...,  0.,  1.,  1.],
       [ 1.,  2.,  6., ...,  0.,  1.,  0.],
       ...,
       [ 4.,  2.,  7., ...,  0.,  1.,  0.],
       [ 4.,  3., 15., ...,  0.,  1.,  0.],
       [ 4.,  4., 15., ...,  0.,  1.,  0.]])

## knn gower

In [50]:
# Gower distance matrix
distance_matrix = gower.gower_matrix(imputed_knn_gower_df)

In [51]:
distance_matrix

array([[0.        , 0.10266075, 0.09046169, ..., 0.1105816 , 0.1091437 ,
        0.11064952],
       [0.10266075, 0.        , 0.01363831, ..., 0.10192857, 0.10485512,
        0.10636093],
       [0.09046169, 0.01363831, 0.        , ..., 0.09213879, 0.09763464,
        0.09914046],
       ...,
       [0.1105816 , 0.10192857, 0.09213879, ..., 0.        , 0.01165918,
        0.02969392],
       [0.1091437 , 0.10485512, 0.09763464, ..., 0.01165918, 0.        ,
        0.01915534],
       [0.11064952, 0.10636093, 0.09914046, ..., 0.02969392, 0.01915534,
        0.        ]], dtype=float32)

In [52]:
imputed_knn_gower_df = pd.DataFrame(imputed_knn_gower_df, columns=knn_gower_df.columns)
imputed_knn_gower_df

Unnamed: 0,quarter,down,yardsToGo,yardlineNumber,absoluteYardlineNumber,preSnapHomeScore,preSnapVisitorScore,possessionTeam_ARI,possessionTeam_ATL,possessionTeam_BAL,...,pff_passCoverage_Cover-6,pff_passCoverage_Goal Line,pff_passCoverage_Miscellaneous,pff_passCoverage_Prevent,pff_passCoverage_Quarters,pff_passCoverage_Red Zone,pff_passCoverageType_Man,pff_passCoverageType_Other,pff_passCoverageType_Zone,offenseFormation
0,1.0,3.0,2.0,33.0,43.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,1.0,10.0,2.0,108.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,1.0,2.0,6.0,34.0,76.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,10.0,39.0,49.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
4,1.0,3.0,15.0,44.0,54.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8545,4.0,3.0,8.0,8.0,18.0,17.0,17.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8546,4.0,1.0,10.0,25.0,35.0,20.0,17.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8547,4.0,2.0,7.0,28.0,38.0,20.0,17.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8548,4.0,3.0,15.0,20.0,30.0,20.0,17.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [67]:
# Function for KNN using Gower distance
def gower_knn(df, distance_matrix, target, codes2offForm, k=3):
    """
    Performs KNN using Gower distance.

    :param distance_matrix: Precomputed Gower distance matrix
    :param target: Index of the target row in the distance matrix
    :param k: Number of neighbors
    :return: Most common class among k-nearest neighbors
    """
    # Get the distances for the target
    distances = distance_matrix[target]

    # Get the indices of the k smallest distances
    k_nearest_indices = np.argsort(distances)[:k]

    # Assuming last column is the label/class
    k_nearest_labels = df.iloc[k_nearest_indices, -1]

    # Return the most common class/label
    pred_class_label = Counter(k_nearest_labels).most_common(1)[0][0]
    return codes2offForm[int(pred_class_label)], int(pred_class_label)

In [68]:
target = 3

res = gower_knn(imputed_knn_gower_df, distance_matrix, target, codes2offForm, 3)
res

('SINGLEBACK', 2)