In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [3]:
plays = pd.read_csv('~/Desktop/data110/plays.csv')

print(plays.shape)
plays.head()

(19239, 27)


Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,playType,yardlineSide,yardlineNumber,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018090600,75,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,15,ATL,play_type_pass,ATL,20,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
1,2018090600,146,(13:10) M.Ryan pass incomplete short right to ...,1,1,10,ATL,play_type_pass,PHI,39,...,0.0,13:10:00,49.0,,,I,0,0,-0.37236,False
2,2018090600,168,(13:05) (Shotgun) M.Ryan pass incomplete short...,1,2,10,ATL,play_type_pass,PHI,39,...,0.0,13:05:00,49.0,,,I,0,0,-0.702779,False
3,2018090600,190,(13:01) (Shotgun) M.Ryan pass deep left to J.J...,1,3,10,ATL,play_type_pass,PHI,39,...,0.0,13:01:00,49.0,,,C,33,33,3.04753,False
4,2018090600,256,(10:59) (Shotgun) M.Ryan pass incomplete short...,1,3,1,ATL,play_type_pass,PHI,1,...,0.0,10:59:00,11.0,,,I,0,0,-0.842272,False


In [4]:
# encode down and quarter categorical variables into true-false columns
plays = pd.get_dummies(plays, columns=['down', 'quarter'], drop_first=True)

In [5]:
# define X (features), y (target variable)
X = plays.drop(columns=['epa'])   
y = plays['epa']                  

num_features = X.shape[1]
print(f"{num_features} Total Features: \n")

# non-null count and data type for each feature
X.info()

31 Total Features: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gameId                  19239 non-null  int64  
 1   playId                  19239 non-null  int64  
 2   playDescription         19239 non-null  object 
 3   yardsToGo               19239 non-null  int64  
 4   possessionTeam          19239 non-null  object 
 5   playType                19239 non-null  object 
 6   yardlineSide            18985 non-null  object 
 7   yardlineNumber          19239 non-null  int64  
 8   offenseFormation        19098 non-null  object 
 9   personnelO              19210 non-null  object 
 10  defendersInTheBox       19177 non-null  float64
 11  numberOfPassRushers     18606 non-null  float64
 12  personnelD              19210 non-null  object 
 13  typeDropback            18600 non-null  object 
 14  preSnapVisitorSco

In [6]:
# we are not interested in epa for plays where penalties occur (we cannot control penalties)
# remove plays where penalties occur, remove features related to penalties
no_penalties = (plays['penaltyCodes'].isnull())

X = X[no_penalties].drop(columns=['penaltyCodes', 
                                  'penaltyJerseyNumbers', 
                                  'isDefensivePI'])
y = y[no_penalties]

X.shape, y.shape

((18033, 28), (18033,))

In [7]:
X = X.drop(columns=['gameId',         # these are identifying features that play no role in predicting epa
                    'playId', 
                    'possessionTeam', # team is an irrelevant factor
                    'playType',       # we are looking solely at passing plays
                    'yardlineSide',   # we will use the absoluteYardlineNumber
                    'yardlineNumber', 
                    'playResult'])    # net yds gained by offense, including penalty yardage (which we already excluded, so this variable is the same as offensePlayResult)

num_features = X.shape[1]
print(f"{num_features} Total Features: \n")

X.info()

21 Total Features: 

<class 'pandas.core.frame.DataFrame'>
Index: 18033 entries, 0 to 18605
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   playDescription         18033 non-null  object 
 1   yardsToGo               18033 non-null  int64  
 2   offenseFormation        17940 non-null  object 
 3   personnelO              18033 non-null  object 
 4   defendersInTheBox       18017 non-null  float64
 5   numberOfPassRushers     18033 non-null  float64
 6   personnelD              18033 non-null  object 
 7   typeDropback            18027 non-null  object 
 8   preSnapVisitorScore     18027 non-null  float64
 9   preSnapHomeScore        18027 non-null  float64
 10  gameClock               18027 non-null  object 
 11  absoluteYardlineNumber  18027 non-null  float64
 12  passResult              18033 non-null  object 
 13  offensePlayResult       18033 non-null  int64  
 14  down_2                

In [8]:
X['passResult'] = X.passResult.replace({'I': 0, 'C': 1, 'S': 2, 'IN': 3})

In [9]:
X['down_2'] = X.down_2.astype(int)
X['down_3'] = X.down_3.astype(int)
X['down_4'] = X.down_4.astype(int)

X['quarter_2'] = X.quarter_2.astype(int)
X['quarter_3'] = X.quarter_3.astype(int)
X['quarter_4'] = X.quarter_4.astype(int)
X['quarter_5'] = X.quarter_5.astype(int)

In [20]:
mmss = X.gameClock.str.split(":")

seconds = []

for time in mmss:
    count = 0
    for i in range(len(mmss)):
        for j in range(len(time)):
            if j == 0:
                count += int(time[i][j]) * 60
            elif j == 1:
                count += int(time[i][j])
            seconds.append(count)

print(seconds)

IndexError: list index out of range