# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

pd.set_option("display.max_columns", None)

# Games Data

In [2]:
data_path = os.path.join("..", "data", "games.csv")
df = pd.read_csv(data_path, parse_dates=["Date", "Open"], date_format="%Y-%m-%d", index_col=0)

df = df[df["N"] == 0].drop(columns="N")
df = df[df["Season"] > 20]

display(df.head(), df.shape)

Unnamed: 0,Season,Date,HID,AID,POFF,Open,OddsH,OddsA,H,A,HSC,ASC,HFGM,AFGM,HFGA,AFGA,HFG3M,AFG3M,HFG3A,AFG3A,HFTM,AFTM,HFTA,AFTA,HORB,AORB,HDRB,ADRB,HRB,ARB,HAST,AAST,HSTL,ASTL,HBLK,ABLK,HTOV,ATOV,HPF,APF
23739,21,1995-11-07,0,11,0,1995-11-06,1.274083,3.794318,0,1,66,108,25.0,39.0,65.0,79.0,3.0,7.0,17.0,13.0,13.0,23.0,22.0,32.0,4.0,13.0,25.0,36.0,29.0,49.0,13.0,22.0,6.0,10.0,3.0,5.0,23.0,16.0,24.0,21.0
23740,21,1995-11-07,17,43,0,1995-11-06,1.597972,2.286949,1,0,114,106,46.0,40.0,83.0,77.0,6.0,13.0,12.0,30.0,16.0,13.0,24.0,17.0,12.0,4.0,31.0,25.0,43.0,29.0,30.0,29.0,11.0,7.0,1.0,5.0,21.0,21.0,19.0,25.0
23741,21,1995-11-08,41,39,0,1995-11-07,1.471072,2.640288,0,1,87,91,30.0,36.0,75.0,84.0,3.0,1.0,16.0,11.0,24.0,18.0,39.0,33.0,10.0,10.0,42.0,36.0,52.0,46.0,18.0,17.0,5.0,6.0,6.0,4.0,19.0,15.0,31.0,31.0
23742,21,1995-11-08,15,22,0,1995-11-07,1.257454,3.967424,0,1,97,105,31.0,43.0,72.0,93.0,8.0,2.0,21.0,14.0,27.0,17.0,34.0,25.0,13.0,11.0,35.0,27.0,48.0,38.0,19.0,26.0,3.0,12.0,9.0,0.0,22.0,8.0,23.0,25.0
23743,21,1995-11-08,13,19,0,1995-11-07,1.302199,3.542703,1,0,88,75,31.0,26.0,74.0,78.0,3.0,4.0,9.0,17.0,23.0,19.0,25.0,22.0,15.0,17.0,32.0,25.0,47.0,42.0,19.0,11.0,10.0,9.0,6.0,4.0,17.0,18.0,22.0,24.0


(5251, 40)

In [3]:
meta_columns = ["Season", "Date", "HID", "AID", "POFF", "Open", "H", "A", "OddsH", "OddsA"]
featuresH = ["HSC", "HFGM", "HFGA", "HFG3M", "HFG3A", "HFTM", "HFTA", "HORB", "HDRB", "HRB", "HAST", "HSTL", "HBLK", "HTOV", "HPF"]
featuresA = ["ASC", "AFGM", "AFGA", "AFG3M", "AFG3A", "AFTM", "AFTA", "AORB", "ADRB", "ARB", "AAST", "ASTL", "ABLK", "ATOV", "APF"]

In [4]:
rename_columnsH = {
    "HID": "TID", "AID": "OID", "H": "W", "OddsH": "OddsT", "OddsA": "OddsO", "POFF": "TPOFF",
    "HSC": "TSC", "HFGM": "TFGM", "HFGA": "TFGA", "HFG3M": "TFG3M", "HFG3A": "TFG3A", "HFTM": "TFTM", "HFTA": "TFTA", "HORB": "TORB", "HDRB": "TDRB", "HRB": "TRB", "HAST": "TAST", "HSTL": "TSTL", "HBLK": "TBLK", "HTOV": "TTOV", "HPF": "TPF",
    "ASC": "OSC", "AFGM": "OFGM", "AFGA": "OFGA", "AFG3M": "OFG3M", "AFG3A": "OFG3A", "AFTM": "OFTM", "AFTA": "OFTA", "AORB": "OORB", "ADRB": "ODRB", "ARB": "ORB", "AAST": "OAST", "ASTL": "OSTL", "ABLK": "OBLK", "ATOV": "OTOV", "APF": "OPF",
}


dfH = df.copy().drop(columns=["A", "Open"]).rename(columns=rename_columnsH)

display(dfH.head().set_index(["Season", "Date", "TID", "OID"]))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TPOFF,OddsT,OddsO,W,TSC,OSC,TFGM,OFGM,TFGA,OFGA,TFG3M,OFG3M,TFG3A,OFG3A,TFTM,OFTM,TFTA,OFTA,TORB,OORB,TDRB,ODRB,TRB,ORB,TAST,OAST,TSTL,OSTL,TBLK,OBLK,TTOV,OTOV,TPF,OPF
Season,Date,TID,OID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
21,1995-11-07,0,11,0,1.274083,3.794318,0,66,108,25.0,39.0,65.0,79.0,3.0,7.0,17.0,13.0,13.0,23.0,22.0,32.0,4.0,13.0,25.0,36.0,29.0,49.0,13.0,22.0,6.0,10.0,3.0,5.0,23.0,16.0,24.0,21.0
21,1995-11-07,17,43,0,1.597972,2.286949,1,114,106,46.0,40.0,83.0,77.0,6.0,13.0,12.0,30.0,16.0,13.0,24.0,17.0,12.0,4.0,31.0,25.0,43.0,29.0,30.0,29.0,11.0,7.0,1.0,5.0,21.0,21.0,19.0,25.0
21,1995-11-08,41,39,0,1.471072,2.640288,0,87,91,30.0,36.0,75.0,84.0,3.0,1.0,16.0,11.0,24.0,18.0,39.0,33.0,10.0,10.0,42.0,36.0,52.0,46.0,18.0,17.0,5.0,6.0,6.0,4.0,19.0,15.0,31.0,31.0
21,1995-11-08,15,22,0,1.257454,3.967424,0,97,105,31.0,43.0,72.0,93.0,8.0,2.0,21.0,14.0,27.0,17.0,34.0,25.0,13.0,11.0,35.0,27.0,48.0,38.0,19.0,26.0,3.0,12.0,9.0,0.0,22.0,8.0,23.0,25.0
21,1995-11-08,13,19,0,1.302199,3.542703,1,88,75,31.0,26.0,74.0,78.0,3.0,4.0,9.0,17.0,23.0,19.0,25.0,22.0,15.0,17.0,32.0,25.0,47.0,42.0,19.0,11.0,10.0,9.0,6.0,4.0,17.0,18.0,22.0,24.0


In [5]:
rename_columnsA = {
    "HID": "OID", "AID": "TID", "A": "W", "OddsH": "OddsO", "OddsA": "OddsT", "POFF": "TPOFF",
    "HSC": "OSC", "HFGM": "OFGM", "HFGA": "OFGA", "HFG3M": "OFG3M", "HFG3A": "OFG3A", "HFTM": "OFTM", "HFTA": "OFTA", "HORB": "OORB", "HDRB": "ODRB", "HRB": "ORB", "HAST": "OAST", "HSTL": "OSTL", "HBLK": "OBLK", "HTOV": "OTOV", "HPF": "OPF",
    "ASC": "TSC", "AFGM": "TFGM", "AFGA": "TFGA", "AFG3M": "TFG3M", "AFG3A": "TFG3A", "AFTM": "TFTM", "AFTA": "TFTA", "AORB": "TORB", "ADRB": "TDRB", "ARB": "TRB", "AAST": "TAST", "ASTL": "TSTL", "ABLK": "TBLK", "ATOV": "TTOV", "APF": "TPF",
}


dfA = df.copy().drop(columns=["H", "Open"]).rename(columns=rename_columnsA)

display(dfA.head().set_index(["Season", "Date", "TID", "OID"]))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TPOFF,OddsO,OddsT,W,OSC,TSC,OFGM,TFGM,OFGA,TFGA,OFG3M,TFG3M,OFG3A,TFG3A,OFTM,TFTM,OFTA,TFTA,OORB,TORB,ODRB,TDRB,ORB,TRB,OAST,TAST,OSTL,TSTL,OBLK,TBLK,OTOV,TTOV,OPF,TPF
Season,Date,TID,OID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
21,1995-11-07,11,0,0,1.274083,3.794318,1,66,108,25.0,39.0,65.0,79.0,3.0,7.0,17.0,13.0,13.0,23.0,22.0,32.0,4.0,13.0,25.0,36.0,29.0,49.0,13.0,22.0,6.0,10.0,3.0,5.0,23.0,16.0,24.0,21.0
21,1995-11-07,43,17,0,1.597972,2.286949,0,114,106,46.0,40.0,83.0,77.0,6.0,13.0,12.0,30.0,16.0,13.0,24.0,17.0,12.0,4.0,31.0,25.0,43.0,29.0,30.0,29.0,11.0,7.0,1.0,5.0,21.0,21.0,19.0,25.0
21,1995-11-08,39,41,0,1.471072,2.640288,1,87,91,30.0,36.0,75.0,84.0,3.0,1.0,16.0,11.0,24.0,18.0,39.0,33.0,10.0,10.0,42.0,36.0,52.0,46.0,18.0,17.0,5.0,6.0,6.0,4.0,19.0,15.0,31.0,31.0
21,1995-11-08,22,15,0,1.257454,3.967424,1,97,105,31.0,43.0,72.0,93.0,8.0,2.0,21.0,14.0,27.0,17.0,34.0,25.0,13.0,11.0,35.0,27.0,48.0,38.0,19.0,26.0,3.0,12.0,9.0,0.0,22.0,8.0,23.0,25.0
21,1995-11-08,19,13,0,1.302199,3.542703,0,88,75,31.0,26.0,74.0,78.0,3.0,4.0,9.0,17.0,23.0,19.0,25.0,22.0,15.0,17.0,32.0,25.0,47.0,42.0,19.0,11.0,10.0,9.0,6.0,4.0,17.0,18.0,22.0,24.0


In [6]:
featuresT = [
    "TPOFF", "OddsT", "OddsO", "W", "TSC", "OSC", "TFGM", "OFGM", "TFGA", "OFGA", "TFG3M", "OFG3M",
    "TFG3A", "OFG3A", "TFTM", "OFTM", "TFTA", "OFTA", "TORB", "OORB", "TDRB", "ODRB", "TRB", "ORB",
    "TAST", "OAST", "TSTL", "OSTL", "TBLK", "OBLK", "TTOV", "OTOV", "TPF", "OPF"
]

In [7]:
pd.concat([dfH, dfA]).head()

Unnamed: 0,Season,Date,TID,OID,TPOFF,OddsT,OddsO,W,TSC,OSC,TFGM,OFGM,TFGA,OFGA,TFG3M,OFG3M,TFG3A,OFG3A,TFTM,OFTM,TFTA,OFTA,TORB,OORB,TDRB,ODRB,TRB,ORB,TAST,OAST,TSTL,OSTL,TBLK,OBLK,TTOV,OTOV,TPF,OPF
23739,21,1995-11-07,0,11,0,1.274083,3.794318,0,66,108,25.0,39.0,65.0,79.0,3.0,7.0,17.0,13.0,13.0,23.0,22.0,32.0,4.0,13.0,25.0,36.0,29.0,49.0,13.0,22.0,6.0,10.0,3.0,5.0,23.0,16.0,24.0,21.0
23740,21,1995-11-07,17,43,0,1.597972,2.286949,1,114,106,46.0,40.0,83.0,77.0,6.0,13.0,12.0,30.0,16.0,13.0,24.0,17.0,12.0,4.0,31.0,25.0,43.0,29.0,30.0,29.0,11.0,7.0,1.0,5.0,21.0,21.0,19.0,25.0
23741,21,1995-11-08,41,39,0,1.471072,2.640288,0,87,91,30.0,36.0,75.0,84.0,3.0,1.0,16.0,11.0,24.0,18.0,39.0,33.0,10.0,10.0,42.0,36.0,52.0,46.0,18.0,17.0,5.0,6.0,6.0,4.0,19.0,15.0,31.0,31.0
23742,21,1995-11-08,15,22,0,1.257454,3.967424,0,97,105,31.0,43.0,72.0,93.0,8.0,2.0,21.0,14.0,27.0,17.0,34.0,25.0,13.0,11.0,35.0,27.0,48.0,38.0,19.0,26.0,3.0,12.0,9.0,0.0,22.0,8.0,23.0,25.0
23743,21,1995-11-08,13,19,0,1.302199,3.542703,1,88,75,31.0,26.0,74.0,78.0,3.0,4.0,9.0,17.0,23.0,19.0,25.0,22.0,15.0,17.0,32.0,25.0,47.0,42.0,19.0,11.0,10.0,9.0,6.0,4.0,17.0,18.0,22.0,24.0


In [8]:
agg = {f: "mean" for f in featuresT}
agg["Timestamp"] = "max"

dfT = pd.concat([dfH, dfA])
dfT["Timestamp"] = dfT["Date"].astype(int)

dfT = dfT \
    .groupby(["Season", "TID"]) \
    .rolling(1_000_000, min_periods=1, closed="left") \
    .agg(agg) \
    .reset_index() \
    .drop(columns="level_2") \
    .dropna() \
    .sort_values(by="Timestamp")

dfT["Timestamp"] = dfT["Timestamp"].astype(int)

display(dfT.head(), dfT.shape)

Unnamed: 0,Season,TID,TPOFF,OddsT,OddsO,W,TSC,OSC,TFGM,OFGM,TFGA,OFGA,TFG3M,OFG3M,TFG3A,OFG3A,TFTM,OFTM,TFTA,OFTA,TORB,OORB,TDRB,ODRB,TRB,ORB,TAST,OAST,TSTL,OSTL,TBLK,OBLK,TTOV,OTOV,TPF,OPF,Timestamp
1,21,0,0.0,1.274083,3.794318,0.0,66.0,108.0,25.0,39.0,65.0,79.0,3.0,7.0,17.0,13.0,13.0,23.0,22.0,32.0,4.0,13.0,25.0,36.0,29.0,49.0,13.0,22.0,6.0,10.0,3.0,5.0,23.0,16.0,24.0,21.0,815702400000000000
1076,21,17,0.0,1.597972,2.286949,1.0,114.0,106.0,46.0,40.0,83.0,77.0,6.0,13.0,12.0,30.0,16.0,13.0,24.0,17.0,12.0,4.0,31.0,25.0,43.0,29.0,30.0,29.0,11.0,7.0,1.0,5.0,21.0,21.0,19.0,25.0,815702400000000000
890,21,15,0.0,1.257454,3.967424,0.0,97.0,105.0,31.0,43.0,72.0,93.0,8.0,2.0,21.0,14.0,27.0,17.0,34.0,25.0,13.0,11.0,35.0,27.0,48.0,38.0,19.0,26.0,3.0,12.0,9.0,0.0,22.0,8.0,23.0,25.0,815788800000000000
709,21,12,0.0,1.441949,2.748731,1.0,107.0,97.0,39.0,32.0,74.0,74.0,3.0,10.0,10.0,22.0,26.0,23.0,31.0,36.0,11.0,10.0,36.0,26.0,47.0,36.0,19.0,19.0,6.0,7.0,3.0,3.0,16.0,13.0,25.0,22.0,815788800000000000
524,21,8,0.0,1.861227,1.877742,0.0,98.0,110.0,34.0,39.0,82.0,86.0,3.0,5.0,19.0,17.0,27.0,27.0,42.0,41.0,12.0,19.0,34.0,36.0,46.0,55.0,18.0,29.0,6.0,6.0,4.0,4.0,10.0,16.0,31.0,30.0,815788800000000000


(10382, 37)

In [9]:
df_bare = df[meta_columns].drop(columns=["Open"])
df_bare["Timestamp"] = df_bare["Date"].astype(int)

display(df_bare.head(), df_bare.shape)

Unnamed: 0,Season,Date,HID,AID,POFF,H,A,OddsH,OddsA,Timestamp
23739,21,1995-11-07,0,11,0,0,1,1.274083,3.794318,815702400000000000
23740,21,1995-11-07,17,43,0,1,0,1.597972,2.286949,815702400000000000
23741,21,1995-11-08,41,39,0,0,1,1.471072,2.640288,815788800000000000
23742,21,1995-11-08,15,22,0,0,1,1.257454,3.967424,815788800000000000
23743,21,1995-11-08,13,19,0,1,0,1.302199,3.542703,815788800000000000


(5251, 10)

In [10]:
df_features = pd.merge_asof(df_bare, dfT, on="Timestamp", left_by=["Season", "HID"], right_by=["Season", "TID"], allow_exact_matches=False).dropna()
df_features = pd.merge_asof(df_features, dfT, on="Timestamp", left_by=["Season", "AID"], right_by=["Season", "TID"], suffixes=("_H", "_A"), allow_exact_matches=False).dropna()

display(df_features.head(), df_features.shape)

Unnamed: 0,Season,Date,HID,AID,POFF,H,A,OddsH,OddsA,Timestamp,TID_H,TPOFF_H,OddsT_H,OddsO_H,W_H,TSC_H,OSC_H,TFGM_H,OFGM_H,TFGA_H,OFGA_H,TFG3M_H,OFG3M_H,TFG3A_H,OFG3A_H,TFTM_H,OFTM_H,TFTA_H,OFTA_H,TORB_H,OORB_H,TDRB_H,ODRB_H,TRB_H,ORB_H,TAST_H,OAST_H,TSTL_H,OSTL_H,TBLK_H,OBLK_H,TTOV_H,OTOV_H,TPF_H,OPF_H,TID_A,TPOFF_A,OddsT_A,OddsO_A,W_A,TSC_A,OSC_A,TFGM_A,OFGM_A,TFGA_A,OFGA_A,TFG3M_A,OFG3M_A,TFG3A_A,OFG3A_A,TFTM_A,OFTM_A,TFTA_A,OFTA_A,TORB_A,OORB_A,TDRB_A,ODRB_A,TRB_A,ORB_A,TAST_A,OAST_A,TSTL_A,OSTL_A,TBLK_A,OBLK_A,TTOV_A,OTOV_A,TPF_A,OPF_A
0,21,1995-11-10,43,12,0,0,1,1.290665,3.640357,815961600000000000,43.0,0.0,1.470806,2.641219,1.0,112.0,104.0,41.0,38.0,77.0,89.0,4.0,4.0,15.0,10.0,26.0,24.0,30.0,29.0,8.0,17.0,31.0,27.0,39.0,44.0,21.0,12.0,5.0,5.0,6.0,3.0,16.0,14.0,25.0,26.0,12.0,0.0,1.441949,2.748731,1.0,107.0,97.0,39.0,32.0,74.0,74.0,3.0,10.0,10.0,22.0,26.0,23.0,31.0,36.0,11.0,10.0,36.0,26.0,47.0,36.0,19.0,19.0,6.0,7.0,3.0,3.0,16.0,13.0,25.0,22.0
1,21,1995-11-10,30,13,0,0,1,1.371997,3.075118,815961600000000000,30.0,0.0,1.714175,2.070291,1.0,109.0,94.0,35.0,37.0,59.0,82.0,6.0,6.0,8.0,20.0,33.0,14.0,42.0,24.0,6.0,11.0,30.0,16.0,36.0,27.0,21.0,21.0,7.0,7.0,7.0,0.0,23.0,15.0,23.0,31.0,13.0,0.0,1.302199,3.542703,1.0,88.0,75.0,31.0,26.0,74.0,78.0,3.0,4.0,9.0,17.0,23.0,19.0,25.0,22.0,15.0,17.0,32.0,25.0,47.0,42.0,19.0,11.0,10.0,9.0,6.0,4.0,17.0,18.0,22.0,24.0
2,21,1995-11-10,3,44,0,1,0,1.219087,4.460257,815961600000000000,3.0,0.0,1.168486,5.425578,0.0,117.0,118.0,42.0,40.0,105.0,95.0,6.0,10.0,21.0,22.0,27.0,28.0,45.0,36.0,23.0,18.0,35.0,38.0,58.0,56.0,14.0,22.0,6.0,6.0,8.0,7.0,14.0,25.0,32.0,42.0,44.0,0.0,1.778309,1.977631,0.0,99.0,106.0,37.0,35.0,78.0,86.0,4.0,6.0,10.0,16.0,21.0,30.0,32.0,41.0,11.0,18.0,35.0,32.0,46.0,50.0,23.0,26.0,8.0,14.0,7.0,5.0,22.0,17.0,29.0,26.0
3,21,1995-11-10,17,2,0,1,0,1.278218,3.7543,815961600000000000,17.0,0.0,1.597972,2.286949,1.0,114.0,106.0,46.0,40.0,83.0,77.0,6.0,13.0,12.0,30.0,16.0,13.0,24.0,17.0,12.0,4.0,31.0,25.0,43.0,29.0,30.0,29.0,11.0,7.0,1.0,5.0,21.0,21.0,19.0,25.0,2.0,0.0,1.183093,5.096083,0.0,106.0,110.0,41.0,40.0,90.0,73.0,6.0,2.0,19.0,10.0,18.0,28.0,20.0,35.0,15.0,10.0,19.0,30.0,34.0,40.0,23.0,13.0,9.0,6.0,4.0,6.0,13.0,16.0,25.0,21.0
4,21,1995-11-10,0,35,0,1,0,1.406762,2.899507,815961600000000000,0.0,0.0,1.274083,3.794318,0.0,66.0,108.0,25.0,39.0,65.0,79.0,3.0,7.0,17.0,13.0,13.0,23.0,22.0,32.0,4.0,13.0,25.0,36.0,29.0,49.0,13.0,22.0,6.0,10.0,3.0,5.0,23.0,16.0,24.0,21.0,35.0,0.0,1.231271,4.287158,1.0,102.0,92.0,40.0,37.0,78.0,94.0,3.0,4.0,18.0,20.0,19.0,14.0,24.0,20.0,9.0,7.0,43.0,31.0,52.0,38.0,28.0,21.0,2.0,8.0,5.0,5.0,24.0,15.0,26.0,27.0


(5121, 80)

In [11]:
formula_bare1 = "H ~ W_H + W_A"

print(formula_bare1)

model_bare1 = smf.logit(formula=formula_bare1, data=df_features)
result_bare1 = model_bare1.fit()

result_bare1.summary()

H ~ W_H + W_A
Optimization terminated successfully.
         Current function value: 0.626210
         Iterations 5


0,1,2,3
Dep. Variable:,H,No. Observations:,5121.0
Model:,Logit,Df Residuals:,5118.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 07 Nov 2024",Pseudo R-squ.:,0.0676
Time:,00:03:03,Log-Likelihood:,-3206.8
converged:,True,LL-Null:,-3439.3
Covariance Type:,nonrobust,LLR p-value:,1.071e-101

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1912,0.124,1.544,0.123,-0.052,0.434
W_H,2.4465,0.147,16.639,0.000,2.158,2.735
W_A,-2.0174,0.152,-13.257,0.000,-2.316,-1.719


In [12]:
formula_bare2 = "H ~ OddsT_H + OddsO_H + OddsT_A + OddsO_A"

print(formula_bare2)

model_bare2 = smf.logit(formula=formula_bare2, data=df_features)
result_bare2 = model_bare2.fit()

result_bare2.summary()

H ~ OddsT_H + OddsO_H + OddsT_A + OddsO_A
Optimization terminated successfully.
         Current function value: 0.604473
         Iterations 6


0,1,2,3
Dep. Variable:,H,No. Observations:,5121.0
Model:,Logit,Df Residuals:,5116.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 07 Nov 2024",Pseudo R-squ.:,0.09996
Time:,00:03:03,Log-Likelihood:,-3095.5
converged:,True,LL-Null:,-3439.3
Covariance Type:,nonrobust,LLR p-value:,1.669e-147

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1587,0.430,0.369,0.712,-0.684,1.001
OddsT_H,-0.7506,0.129,-5.821,0.000,-1.003,-0.498
OddsO_H,0.2787,0.043,6.521,0.000,0.195,0.363
OddsT_A,0.7723,0.132,5.832,0.000,0.513,1.032
OddsO_A,-0.1928,0.039,-4.957,0.000,-0.269,-0.117


In [13]:
formula_bare3 = "H ~ TSC_H + OSC_H + TSC_A + OSC_A"

print(formula_bare3)

model_bare3 = smf.logit(formula=formula_bare3, data=df_features)
result_bare3 = model_bare3.fit()

result_bare3.summary()

H ~ TSC_H + OSC_H + TSC_A + OSC_A
Optimization terminated successfully.
         Current function value: 0.615238
         Iterations 5


0,1,2,3
Dep. Variable:,H,No. Observations:,5121.0
Model:,Logit,Df Residuals:,5116.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 07 Nov 2024",Pseudo R-squ.:,0.08394
Time:,00:03:04,Log-Likelihood:,-3150.6
converged:,True,LL-Null:,-3439.3
Covariance Type:,nonrobust,LLR p-value:,1.2280000000000002e-123

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.7282,0.878,-0.830,0.407,-2.449,0.992
TSC_H,0.0961,0.006,15.823,0.000,0.084,0.108
OSC_H,-0.1010,0.007,-15.490,0.000,-0.114,-0.088
TSC_A,-0.0748,0.006,-12.120,0.000,-0.087,-0.063
OSC_A,0.0912,0.006,14.097,0.000,0.079,0.104


In [14]:
formula_bare4 = "H ~ TFGM_H + TFG3M_H + TFTM_H + TFGM_A + TFG3M_A + TFTM_A + OFGM_H + OFG3M_H + OFTM_H + OFGM_A + OFG3M_A + OFTM_A"

print(formula_bare4)

model_bare4 = smf.logit(formula=formula_bare4, data=df_features)
result_bare4 = model_bare4.fit()

result_bare4.summary()

H ~ TFGM_H + TFG3M_H + TFTM_H + TFGM_A + TFG3M_A + TFTM_A + OFGM_H + OFG3M_H + OFTM_H + OFGM_A + OFG3M_A + OFTM_A
Optimization terminated successfully.
         Current function value: 0.611469
         Iterations 5


0,1,2,3
Dep. Variable:,H,No. Observations:,5121.0
Model:,Logit,Df Residuals:,5108.0
Method:,MLE,Df Model:,12.0
Date:,"Thu, 07 Nov 2024",Pseudo R-squ.:,0.08955
Time:,00:03:04,Log-Likelihood:,-3131.3
converged:,True,LL-Null:,-3439.3
Covariance Type:,nonrobust,LLR p-value:,4.119e-124

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.8418,0.917,-0.918,0.359,-2.639,0.955
TFGM_H,0.1715,0.016,10.539,0.000,0.140,0.203
TFG3M_H,0.1489,0.020,7.325,0.000,0.109,0.189
TFTM_H,0.0954,0.013,7.514,0.000,0.071,0.120
TFGM_A,-0.1149,0.016,-6.996,0.000,-0.147,-0.083
TFG3M_A,-0.1636,0.020,-7.992,0.000,-0.204,-0.123
TFTM_A,-0.0517,0.013,-3.968,0.000,-0.077,-0.026
OFGM_H,-0.2178,0.016,-13.574,0.000,-0.249,-0.186
OFG3M_H,-0.0527,0.027,-1.924,0.054,-0.106,0.001


In [15]:
formula1 = "H ~ W_H + W_A + TSC_H + OSC_H + TSC_A + OSC_A"

print(formula1)

model1 = smf.logit(formula=formula1, data=df_features)
result1 = model1.fit()

result1.summary()

H ~ W_H + W_A + TSC_H + OSC_H + TSC_A + OSC_A
Optimization terminated successfully.
         Current function value: 0.615078
         Iterations 5


0,1,2,3
Dep. Variable:,H,No. Observations:,5121.0
Model:,Logit,Df Residuals:,5114.0
Method:,MLE,Df Model:,6.0
Date:,"Thu, 07 Nov 2024",Pseudo R-squ.:,0.08417
Time:,00:03:04,Log-Likelihood:,-3149.8
converged:,True,LL-Null:,-3439.3
Covariance Type:,nonrobust,LLR p-value:,7.898000000000001e-122

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1121,0.928,-1.198,0.231,-2.931,0.707
W_H,0.3233,0.331,0.978,0.328,-0.325,0.971
W_A,0.2886,0.346,0.833,0.405,-0.390,0.967
TSC_H,0.0857,0.012,7.007,0.000,0.062,0.110
OSC_H,-0.0902,0.013,-6.979,0.000,-0.115,-0.065
TSC_A,-0.0840,0.013,-6.639,0.000,-0.109,-0.059
OSC_A,0.1008,0.013,7.706,0.000,0.075,0.126


In [16]:
formula2 = "H ~ W_H + W_A + TFGM_H + TFG3M_H + TFTM_H + TFGM_A + TFG3M_A + TFTM_A + OFGM_H + OFG3M_H + OFTM_H + OFGM_A + OFG3M_A + OFTM_A"

print(formula2)

model2 = smf.ols(formula=formula2, data=df_features)
result2 = model2.fit()

result2.summary()

H ~ W_H + W_A + TFGM_H + TFG3M_H + TFTM_H + TFGM_A + TFG3M_A + TFTM_A + OFGM_H + OFG3M_H + OFTM_H + OFGM_A + OFG3M_A + OFTM_A


0,1,2,3
Dep. Variable:,H,R-squared:,0.113
Model:,OLS,Adj. R-squared:,0.111
Method:,Least Squares,F-statistic:,46.48
Date:,"Thu, 07 Nov 2024",Prob (F-statistic):,8.17e-122
Time:,00:03:04,Log-Likelihood:,-3297.7
No. Observations:,5121,AIC:,6625.0
Df Residuals:,5106,BIC:,6724.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1502,0.201,0.746,0.456,-0.245,0.545
W_H,0.1103,0.069,1.599,0.110,-0.025,0.246
W_A,0.0590,0.072,0.816,0.415,-0.083,0.201
TFGM_H,0.0290,0.005,5.463,0.000,0.019,0.039
TFG3M_H,0.0284,0.005,5.949,0.000,0.019,0.038
TFTM_H,0.0163,0.004,4.549,0.000,0.009,0.023
TFGM_A,-0.0271,0.006,-4.833,0.000,-0.038,-0.016
TFG3M_A,-0.0365,0.005,-7.468,0.000,-0.046,-0.027
TFTM_A,-0.0127,0.004,-3.461,0.001,-0.020,-0.005

0,1,2,3
Omnibus:,43243.313,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,540.014
Skew:,-0.356,Prob(JB):,5.46e-118
Kurtosis:,1.577,Cond. No.,2640.0


In [17]:
formula3 = "H ~ OddsT_H + OddsO_H + OddsT_A + OddsO_A + W_H + W_A + TFGM_H + TFG3M_H + TFTM_H + TFGM_A + TFG3M_A + TFTM_A + OFGM_H + OFG3M_H + OFTM_H + OFGM_A + OFG3M_A + OFTM_A"

print(formula3)

model3 = smf.ols(formula=formula3, data=df_features)
result3 = model3.fit()

result3.summary()

H ~ OddsT_H + OddsO_H + OddsT_A + OddsO_A + W_H + W_A + TFGM_H + TFG3M_H + TFTM_H + TFGM_A + TFG3M_A + TFTM_A + OFGM_H + OFG3M_H + OFTM_H + OFGM_A + OFG3M_A + OFTM_A


0,1,2,3
Dep. Variable:,H,R-squared:,0.142
Model:,OLS,Adj. R-squared:,0.139
Method:,Least Squares,F-statistic:,46.79
Date:,"Thu, 07 Nov 2024",Prob (F-statistic):,4.12e-154
Time:,00:03:04,Log-Likelihood:,-3213.7
No. Observations:,5121,AIC:,6465.0
Df Residuals:,5102,BIC:,6590.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2070,0.214,0.968,0.333,-0.212,0.626
OddsT_H,-0.1191,0.028,-4.196,0.000,-0.175,-0.063
OddsO_H,0.0388,0.008,4.697,0.000,0.023,0.055
OddsT_A,0.0782,0.027,2.945,0.003,0.026,0.130
OddsO_A,-0.0361,0.008,-4.293,0.000,-0.053,-0.020
W_H,0.0068,0.069,0.099,0.921,-0.128,0.142
W_A,0.1187,0.072,1.654,0.098,-0.022,0.259
TFGM_H,0.0179,0.005,3.344,0.001,0.007,0.028
TFG3M_H,0.0137,0.005,2.784,0.005,0.004,0.023

0,1,2,3
Omnibus:,227191.699,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,479.078
Skew:,-0.332,Prob(JB):,9.32e-105
Kurtosis:,1.656,Cond. No.,2850.0


In [18]:
formula_full = "H ~ POFF + " + " + ".join(map(lambda x: x + "_H", featuresT)) + " + " + " + ".join(map(lambda x: x + "_A", featuresT))

print(formula_full)

model_full = smf.logit(formula=formula_full, data=df_features)
result_full = model_full.fit()

result_full.summary()

H ~ POFF + TPOFF_H + OddsT_H + OddsO_H + W_H + TSC_H + OSC_H + TFGM_H + OFGM_H + TFGA_H + OFGA_H + TFG3M_H + OFG3M_H + TFG3A_H + OFG3A_H + TFTM_H + OFTM_H + TFTA_H + OFTA_H + TORB_H + OORB_H + TDRB_H + ODRB_H + TRB_H + ORB_H + TAST_H + OAST_H + TSTL_H + OSTL_H + TBLK_H + OBLK_H + TTOV_H + OTOV_H + TPF_H + OPF_H + TPOFF_A + OddsT_A + OddsO_A + W_A + TSC_A + OSC_A + TFGM_A + OFGM_A + TFGA_A + OFGA_A + TFG3M_A + OFG3M_A + TFG3A_A + OFG3A_A + TFTM_A + OFTM_A + TFTA_A + OFTA_A + TORB_A + OORB_A + TDRB_A + ODRB_A + TRB_A + ORB_A + TAST_A + OAST_A + TSTL_A + OSTL_A + TBLK_A + OBLK_A + TTOV_A + OTOV_A + TPF_A + OPF_A
         Current function value: 0.586428
         Iterations: 35




0,1,2,3
Dep. Variable:,H,No. Observations:,5121.0
Model:,Logit,Df Residuals:,5058.0
Method:,MLE,Df Model:,62.0
Date:,"Thu, 07 Nov 2024",Pseudo R-squ.:,0.1268
Time:,00:03:04,Log-Likelihood:,-3003.1
converged:,False,LL-Null:,-3439.3
Covariance Type:,nonrobust,LLR p-value:,2.251e-143

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.4709,1.732,-1.427,0.154,-5.865,0.924
POFF,0.5339,0.227,2.353,0.019,0.089,0.979
TPOFF_H,9.2778,4.976,1.865,0.062,-0.474,19.030
OddsT_H,-0.3855,0.157,-2.453,0.014,-0.693,-0.078
OddsO_H,0.2006,0.051,3.950,0.000,0.101,0.300
W_H,-0.5249,0.364,-1.440,0.150,-1.239,0.189
TSC_H,0.0368,,,,,
OSC_H,0.0172,1.18e+04,1.46e-06,1.000,-2.31e+04,2.31e+04
TFGM_H,0.0634,,,,,


In [19]:
# featuresT = [
#     "TPOFF", "OddsT", "OddsO", "W", "TSC", "OSC", "TFGM", "OFGM", "TFGA", "OFGA", "TFG3M", "OFG3M",
#     "TFG3A", "OFG3A", "TFTM", "OFTM", "TFTA", "OFTA", "TORB", "OORB", "TDRB", "ODRB", "TRB", "ORB",
#     "TAST", "OAST", "TSTL", "OSTL", "TBLK", "OBLK", "TTOV", "OTOV", "TPF", "OPF"
# ]

featuresT_remove_corr = [
    "TPOFF", "OddsT", "OddsO", "W", "TFGM", "OFGM", "TFGA", "OFGA", "TFG3M", "OFG3M",
    "TFG3A", "OFG3A", "TFTM", "OFTM", "TFTA", "OFTA", "TORB", "OORB", "TDRB", "ODRB",
    "TAST", "OAST", "TSTL", "OSTL", "TBLK", "OBLK", "TTOV", "OTOV", "TPF", "OPF"
]

In [20]:
formula_no_corr = "H ~ POFF + " + " + ".join(map(lambda x: x + "_H", featuresT_remove_corr)) + " + " + " + ".join(map(lambda x: x + "_A", featuresT_remove_corr))

print(formula_no_corr)

model_no_corr = smf.logit(formula=formula_no_corr, data=df_features)
result_no_corr = model_no_corr.fit()

result_no_corr.summary()

H ~ POFF + TPOFF_H + OddsT_H + OddsO_H + W_H + TFGM_H + OFGM_H + TFGA_H + OFGA_H + TFG3M_H + OFG3M_H + TFG3A_H + OFG3A_H + TFTM_H + OFTM_H + TFTA_H + OFTA_H + TORB_H + OORB_H + TDRB_H + ODRB_H + TAST_H + OAST_H + TSTL_H + OSTL_H + TBLK_H + OBLK_H + TTOV_H + OTOV_H + TPF_H + OPF_H + TPOFF_A + OddsT_A + OddsO_A + W_A + TFGM_A + OFGM_A + TFGA_A + OFGA_A + TFG3M_A + OFG3M_A + TFG3A_A + OFG3A_A + TFTM_A + OFTM_A + TFTA_A + OFTA_A + TORB_A + OORB_A + TDRB_A + ODRB_A + TAST_A + OAST_A + TSTL_A + OSTL_A + TBLK_A + OBLK_A + TTOV_A + OTOV_A + TPF_A + OPF_A


Optimization terminated successfully.
         Current function value: 0.586428
         Iterations 6


0,1,2,3
Dep. Variable:,H,No. Observations:,5121.0
Model:,Logit,Df Residuals:,5059.0
Method:,MLE,Df Model:,61.0
Date:,"Thu, 07 Nov 2024",Pseudo R-squ.:,0.1268
Time:,00:03:04,Log-Likelihood:,-3003.1
converged:,True,LL-Null:,-3439.3
Covariance Type:,nonrobust,LLR p-value:,5.92e-144

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.4709,1.732,-1.427,0.154,-5.865,0.924
POFF,0.5339,0.227,2.353,0.019,0.089,0.979
TPOFF_H,9.2778,4.976,1.865,0.062,-0.474,19.030
OddsT_H,-0.3855,0.157,-2.453,0.014,-0.693,-0.078
OddsO_H,0.2006,0.051,3.950,0.000,0.101,0.300
W_H,-0.5249,0.364,-1.440,0.150,-1.239,0.189
TFGM_H,0.1369,0.055,2.500,0.012,0.030,0.244
OFGM_H,-0.0179,0.054,-0.330,0.741,-0.124,0.088
TFGA_H,-0.0646,0.050,-1.287,0.198,-0.163,0.034
