# Initial Model - Logistic Regression

In [62]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

#### Read data

In [2]:
%%time

# Load the play-by-play CSV into a DataFrame, parsing `game_date` as datetimes.
# low_memory=False asks pandas to process the file in one pass rather than in
# chunks, which avoids mixed-dtype inference on wide files (see read_csv docs).
play_data = pd.read_csv('data/play_by.csv', parse_dates=['game_date'], low_memory=False)

CPU times: user 7.77 s, sys: 1.7 s, total: 9.47 s
Wall time: 9.89 s


In [3]:
# Tally plays by `play_type` to see which categories are present and how common each is.
play_data['play_type'].value_counts()

pass           186677
run            132692
no_play         42431
kickoff         25552
punt            23914
extra_point     10934
field_goal       9777
qb_kneel         3830
qb_spike          690
Name: play_type, dtype: int64

In [4]:
# Keep only third-down plays whose play type is a pass or a run; every other
# play type is treated as out of scope for this model.
play_data = play_data[
    play_data['down'].eq(3) & play_data['play_type'].isin(['pass', 'run'])
]

In [5]:
# Add an intercept column (referenced as `const` by the statsmodels fits below).
# has_constant='add' adds the column even if a constant-valued column already exists.
play_data = sm.add_constant(play_data, prepend=True, has_constant='add')

In [6]:
# Discard plays whose target value is missing so the models only see labelled rows.
play_data = play_data.dropna(subset=['third_down_converted'])

In [7]:
# Report the date span of the remaining plays: earliest first, then latest.
print(play_data['game_date'].min(), play_data['game_date'].max(), sep='\n')

2009-09-10 00:00:00
2018-12-17 00:00:00


In [8]:
# Row and column counts of the filtered frame, shown as a (rows, columns) tuple.
len(play_data.index), len(play_data.columns)

(66720, 256)

#### Build Model With No Predictor

In [9]:
# Baseline logistic regression: the intercept (`const`) is the only regressor,
# so the fit reflects nothing but the overall conversion rate.
lr = sm.Logit(
    endog=play_data['third_down_converted'],
    exog=play_data[['const']],
).fit()
print(lr.summary())

Optimization terminated successfully.
         Current function value: 0.669527
         Iterations 4
                            Logit Regression Results                            
Dep. Variable:     third_down_converted   No. Observations:                66720
Model:                            Logit   Df Residuals:                    66719
Method:                             MLE   Df Model:                            0
Date:                  Sun, 27 Aug 2023   Pseudo R-squ.:               1.684e-12
Time:                          20:08:14   Log-Likelihood:                -44671.
converged:                         True   LL-Null:                       -44671.
Covariance Type:              nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.4399      0.008    -55.471      0.000      -0.455      -0.424


In [10]:
# Logit coefficients are on the log-odds scale; exponentiate to read them as odds.
lr.params.pipe(np.exp)

const    0.644079
dtype: float64

#### Only yards to go as predictor

In [11]:
# Logistic regression of conversion on an intercept plus yards to go.
lr = sm.Logit(
    endog=play_data['third_down_converted'],
    exog=play_data[['const', 'ydstogo']],
).fit()
print(lr.summary())

Optimization terminated successfully.
         Current function value: 0.615930
         Iterations 6
                            Logit Regression Results                            
Dep. Variable:     third_down_converted   No. Observations:                66720
Model:                            Logit   Df Residuals:                    66718
Method:                             MLE   Df Model:                            1
Date:                  Sun, 27 Aug 2023   Pseudo R-squ.:                 0.08005
Time:                          20:08:15   Log-Likelihood:                -41095.
converged:                         True   LL-Null:                       -44671.
Covariance Type:              nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6038      0.015     39.550      0.000       0.574       0.634
ydstogo      

In [12]:
# Exponentiate the log-odds coefficients: `const` becomes the baseline odds and
# `ydstogo` the multiplicative change in odds per additional yard to go.
lr.params.pipe(np.exp)

const      1.829001
ydstogo    0.854203
dtype: float64

#### `sklearn` model

In [14]:
# Binary feature: 1 when `posteam` equals `home_team`, otherwise 0
# (presumably `posteam` is the team in possession — TODO confirm against the data dictionary).
play_data['is_home_team'] = (play_data['home_team'] == play_data['posteam']).astype(int)

In [18]:
# Target column name and the feature columns fed to the scikit-learn model.
label, cols = 'third_down_converted', ['ydstogo', 'is_home_team']

In [19]:
# Pull the feature matrix and the target vector out of the frame as NumPy arrays.
X = play_data[cols].to_numpy()
y = play_data[label].to_numpy()

In [20]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# Instantiate scikit-learn's logistic regression with default settings.
# This rebinds `lr`, which previously held the statsmodels fit result.
# NOTE(review): sklearn's defaults apply L2 regularization, unlike the
# statsmodels fits above, so coefficients may differ — confirm this is intended.
lr = LogisticRegression()

In [22]:
# Fit on the training partition only, so X_test / y_test stay an untouched
# holdout for evaluating the model. Fitting on the full X, y (as before) made
# the train/test split above unused.
lr.fit(X_train, y_train)

In [58]:
# Example scenario to score: 1 yard to go, offense playing at home.
input_yds_to_go = 1
input_is_home_team = 1
# predict_proba expects a 2-D array (one row per sample), with the features in
# the same order as `cols`: ydstogo first, then is_home_team.
input_val = np.array([input_yds_to_go, input_is_home_team]).reshape(1, -1)

In [59]:
# Echo the single-row feature array to confirm its 2-D layout before predicting.
input_val

array([[1, 1]])

In [60]:
# Row 0 is the single input sample; column 1 is the probability of the second
# class in lr.classes_ (presumably "converted" == 1 — TODO confirm label encoding).
predicted_proba = lr.predict_proba(input_val)[0, 1]

In [61]:
# Report the prediction as a whole-number percentage.
# NOTE(review): the value comes from predict_proba, so it is a probability,
# not odds in the statistical sense — consider rewording the message.
print(f'With {input_val[0, 0]} yard(s) to go, the odds of converting are {round(predicted_proba * 100)}%.')

With 1 yard(s) to go, the odds of converting are 62%.


In [64]:
# Save the fitted model to disk.
# The `model/` directory must already exist; open() does not create it.
filename = 'model/log_reg.pkl'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() used before was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)