En el siguiente ntoebook se hará un nuevo preprocesado con tal objetivo de extraer un dataset que nos permita mejorar la precisión del modelo. Las decisiones que se tomen en este dataset derán en base a las últimas observaciones.

Las estadísticas proporcionadas por cada equipo son:
* FGM: Tiros de campo anotados
* FGA: TIros de campo intentados
* FGM3: Triples Anotados
* FGA3: Triples intentados
* FTM: Tiros libres anotados
* FTA: Tiros libres intentados
* OR: Rebites ofensivos
* DR: Rebotes defensivos
* Ast: Asistencias
* TO: Perdidas de balón
* Stl: Robos de balón
* Blk: Bloqueos
* PF: Faltas personales

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('../data/MRegularSeasonDetailedResults.csv')
df.head(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
5,2003,11,1458,81,1186,55,H,0,26,57,...,11,12,17,6,22,8,19,4,3,25
6,2003,12,1161,80,1236,62,H,0,23,55,...,15,20,28,9,21,11,30,10,4,28
7,2003,12,1186,75,1457,61,N,0,28,62,...,17,17,23,8,25,10,15,14,8,18
8,2003,12,1194,71,1156,66,N,0,28,58,...,18,12,27,13,26,13,25,8,2,18
9,2003,12,1458,84,1296,56,H,0,32,67,...,14,7,12,9,23,10,18,1,3,18


# Creación de nuevas variables

### Asist-To-Turnover Ratio

In [2]:
df['WA2TR'] = df['WAst'] / df['WTO']
df['LA2TR'] = df['LAst'] / df['LTO']
df['WA2TR-LA2TR'] = df['WA2TR'] - df['LA2TR']


## Convert Rebounds to a Ratio value
Convertimos los rebotes ofensivos al ratio de rebotes globales.

In [3]:
df['WOffR'] = df['WOR'] / (df['WOR'] + df['LDR'])
df['LOffR'] = df['LOR'] / (df['LOR'] + df['WDR'])
df['WOffR-LOffR'] = df['WOffR'] - df['LOffR']

###
df['WAllR'] = (df['WOR']+df['WDR']) / (df['WOR']+df['WDR']+df['LOR']+df['LDR'])
df['LAllR'] = 1 - df['WAllR']
df['WAllR-LAllR'] = df['WAllR'] - df['LAllR']



## Field Goal Percentage

In [4]:
df['WFGP'] = (df['WFGM']+df['WFGM3']) / (df['WFGA']+df['WFGA3'])
df['LFGP'] = (df['LFGM']+df['LFGM3']) / (df['LFGA']+df['LFGA3'])

## Ratio de Faltas Personales

In [5]:
df['WPFR'] = df['WPF'] / (df['WPF']+df['LPF'])
df['LPFR'] = df['LPF'] / (df['WPF']+df['LPF'])

## Removing already used columns

In [6]:
used_columns = ['LFGA', 'LFGA3', 'WFGA', 'WFGA3', 'WFGM', 'WFGM3', 'LFGM3',
                'LFGM', 'WOR','WDR', 'LOR', 'LDR', 'WAst', 'LAst', 'WTO', 'LTO']

uninformative_columns = ['WFTM', 'WFTA', 'LFTM', 'LFTA', 'WStl', 'LStl', 'WBlk', 'LBlk', 'NumOT',
                         'WAllR-LAllR', 'WA2TR-LA2TR', 'WA2TR-LA2TR', 'WA2TR-LA2TR', 'WAllR-LAllR',
                         'WOffR-LOffR', 'LScore', 'WScore']

drop_columns = used_columns + uninformative_columns

In [7]:
df.drop(columns=drop_columns, inplace=True)


In [8]:
df.drop(columns=['WLoc']).median()

Season     2014.000000
DayNum       74.000000
WTeamID    1287.000000
LTeamID    1281.000000
WPF          17.000000
LPF          19.000000
WA2TR         1.166667
LA2TR         0.818182
WOffR         0.321429
LOffR         0.285714
WAllR         0.527778
LAllR         0.472222
WFGP          0.450000
LFGP          0.377358
WPFR          0.469388
LPFR          0.530612
dtype: float64

## Convert dataset into something trainable

In [9]:
df["Home"] = np.where(df["WLoc"] == "H", df["WTeamID"], df["LTeamID"])
df["Away"] = np.where(df["WLoc"] != "H", df["WTeamID"], df["LTeamID"])
df["Result"] = np.where(df["WLoc"] == "H", 1, 0) # 1 Si gana el de casa, 0 si gana el de fuera

df['HPF'] = np.where(df["WLoc"] == "H", df['WPF'], df['LPF'])
df['APF'] = np.where(df["WLoc"] != "H", df['WPF'], df['LPF'])

df['HA2TR'] = np.where(df["WLoc"] == "H", df['WA2TR'], df['LA2TR'])
df['AA2TR'] = np.where(df["WLoc"] != "H", df['WA2TR'], df['LA2TR'])

df['HOffR'] = np.where(df["WLoc"] == "H", df['WOffR'], df['LOffR'])
df['AOffR'] = np.where(df["WLoc"] != "H", df['WOffR'], df['LOffR'])

df['HAllR'] = np.where(df["WLoc"] == "H", df['WAllR'], df['LAllR'])
df['AAllR'] = np.where(df["WLoc"] != "H", df['WAllR'], df['LAllR'])

df['HFGP'] = np.where(df["WLoc"] == "H", df['WFGP'], df['LFGP'])
df['AFGP'] = np.where(df["WLoc"] != "H", df['WFGP'], df['LFGP'])

df['HPFR'] = np.where(df["WLoc"] == "H", df['WPFR'], df['LPFR'])
df['APFR'] = np.where(df["WLoc"] != "H", df['WPFR'], df['LPFR'])


df.drop(columns=['WTeamID', 'LTeamID', 'WPF', 'LPF', 'WA2TR', 'LA2TR', 'WOffR', 'LOffR', 
                 'LAllR', 'WAllR', 'WFGP', 'LFGP', 'WPFR', 'LPFR', 'WLoc'], inplace=True)


In [None]:
df

In [None]:
list(df.median())

In [10]:
STREAK = {}
STREAK_FUT = {}
def compute_streak(hteam, ateam, result):
    STREAK = STREAK_FUT.copy()
    hteam_streak = STREAK_FUT.get(hteam, 0)
    ateam_streak = STREAK_FUT.get(ateam, 0)
    if result == 1:
        hteam_streak = 1 if hteam_streak < 0 else hteam_streak + 1
        ateam_streak = -1 if ateam_streak > 0 else ateam_streak - 1
    else:
        ateam_streak = 1 if ateam_streak < 0 else ateam_streak + 1
        hteam_streak = -1 if hteam_streak > 0 else hteam_streak - 1
        
    STREAK_FUT[hteam] = hteam_streak
    STREAK_FUT[ateam] = ateam_streak
    return [STREAK.get(hteam, 0), STREAK.get(ateam, 0)]
    

In [11]:

home_columns = ['HPF', 'HA2TR', 'HOffR', 'HAllR', 'HFGP', 'HPFR']
away_columns = ['APF', 'AA2TR', 'AOffR', 'AAllR', 'AFGP', 'APFR']
new_column_names = ['PF', 'A2TR', 'OffR', 'AllR', 'FGP', 'PFR']

dict_away = {x:v for x,v in zip(away_columns, new_column_names)}
dict_home = {x:v for x,v in zip(home_columns, new_column_names)}



final_df_columns = ['Season', 'DayNum', 'Home', 'Away', 'Result'] + home_columns + away_columns + ['HStreak', 'AStreak']
final_df = pd.DataFrame(columns=final_df_columns)

for idx, row in df.iterrows():
    home_home_last = df[(df['Season'] == row['Season']) & (df['DayNum']<row['DayNum']) & (df['Home'] == row['Home'])]
    home_away_last = df[(df['Season'] == row['Season']) & (df['DayNum']<row['DayNum']) & (df['Away'] == row['Home'])]
    
    away_home_last = df[(df['Season'] == row['Season']) & (df['DayNum']<row['DayNum']) & (df['Home'] == row['Away'])]
    away_away_last = df[(df['Season'] == row['Season']) & (df['DayNum']<row['DayNum']) & (df['Away'] == row['Away'])]
    
    home_home_last = home_home_last[home_columns]
    home_away_last = home_away_last[away_columns]

    away_home_last = away_home_last[home_columns]
    away_away_last = away_away_last[away_columns]

    streak = compute_streak(hteam=row['Home'], ateam=row['Away'], result=row['Result'])

    home_home_last = home_home_last.rename(columns=dict_home)
    home_away_last = home_home_last.rename(columns=dict_away)
    away_home_last = away_home_last.rename(columns=dict_home)
    away_away_last = away_away_last.rename(columns=dict_away)
    
    away_last = pd.concat([away_away_last, away_home_last])
    home_last = pd.concat([home_away_last, home_home_last])

    home = list(home_last.median())
    away = list(away_last.median())
    
    final_df.loc[len(final_df)] = list(row[['Season', 'DayNum', 'Home', 'Away', 'Result']].values) + home + away + streak

    print(f"{idx}/{len(df)}")


0/112504
1/112504
2/112504
3/112504
4/112504
5/112504
6/112504
7/112504
8/112504
9/112504
10/112504
11/112504
12/112504
13/112504
14/112504
15/112504
16/112504
17/112504
18/112504
19/112504
20/112504
21/112504
22/112504
23/112504
24/112504
25/112504
26/112504
27/112504
28/112504
29/112504
30/112504
31/112504
32/112504
33/112504
34/112504
35/112504
36/112504
37/112504
38/112504
39/112504
40/112504
41/112504
42/112504
43/112504
44/112504
45/112504
46/112504
47/112504
48/112504
49/112504
50/112504
51/112504
52/112504
53/112504
54/112504
55/112504
56/112504
57/112504
58/112504
59/112504
60/112504
61/112504
62/112504
63/112504
64/112504
65/112504
66/112504
67/112504
68/112504


69/112504
70/112504
71/112504
72/112504
73/112504
74/112504
75/112504
76/112504
77/112504
78/112504
79/112504
80/112504
81/112504
82/112504
83/112504
84/112504
85/112504
86/112504
87/112504
88/112504
89/112504
90/112504
91/112504
92/112504
93/112504
94/112504
95/112504
96/112504
97/112504
98/112504
99/112504
100/112504
101/112504
102/112504
103/112504
104/112504
105/112504
106/112504
107/112504
108/112504
109/112504
110/112504
111/112504
112/112504
113/112504
114/112504
115/112504
116/112504
117/112504
118/112504
119/112504
120/112504
121/112504
122/112504
123/112504
124/112504
125/112504
126/112504
127/112504
128/112504
129/112504
130/112504
131/112504
132/112504
133/112504
134/112504
135/112504
136/112504
137/112504
138/112504
139/112504
140/112504
141/112504
142/112504
143/112504
144/112504
145/112504
146/112504
147/112504
148/112504
149/112504
150/112504
151/112504
152/112504
153/112504
154/112504
155/112504
156/112504
157/112504
158/112504
159/112504
160/112504
161/112504
162/1125

In [12]:
final_df.to_csv('../data/MMedianSeasonAggStreak_2.csv', index=False)

## Model Training

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

MedianSeasonAgg = pd.read_csv("../data/MMedianSeasonAggStreak_2.csv")
MedianSeasonAgg.dropna(inplace=True)
MedianSeasonAgg

Unnamed: 0,Season,DayNum,Home,Away,Result,HPF,HA2TR,HOffR,HAllR,HFGP,HPFR,APF,AA2TR,AOffR,AAllR,AFGP,APFR,HStreak,AStreak
7,2003.0,12.0,1457.0,1186.0,0.0,23.0,0.473684,0.472222,0.596774,0.338028,0.560976,25.0,0.421053,0.200000,0.437500,0.403509,0.581395,-1.0,-1.0
9,2003.0,12.0,1458.0,1296.0,1.0,18.0,1.333333,0.352941,0.562500,0.463768,0.418605,18.0,0.916667,0.230769,0.403226,0.446809,0.439024,1.0,1.0
14,2003.0,14.0,1135.0,1125.0,0.0,13.0,0.944444,0.378378,0.443038,0.400000,0.433333,17.0,0.705882,0.285714,0.481928,0.295455,0.566667,-1.0,-1.0
16,2003.0,14.0,1161.0,1194.0,1.0,25.0,0.823529,0.382353,0.508197,0.396825,0.471698,23.0,0.529412,0.257143,0.442857,0.478261,0.560976,1.0,1.0
17,2003.0,14.0,1166.0,1202.0,1.0,16.0,1.909091,0.500000,0.656716,0.595745,0.516129,5.0,1.200000,0.272727,0.482143,0.562500,0.294118,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112499,2024.0,114.0,1237.0,1454.0,0.0,16.5,0.784615,0.288018,0.471795,0.408043,0.540659,17.0,1.000000,0.322581,0.500000,0.397959,0.500000,-9.0,1.0
112500,2024.0,114.0,1412.0,1455.0,0.0,16.0,1.181818,0.360000,0.536585,0.410959,0.428571,15.0,0.777778,0.285714,0.509091,0.410256,0.500000,1.0,-1.0
112501,2024.0,114.0,1459.0,1359.0,1.0,18.5,1.395833,0.256579,0.538596,0.428964,0.517880,19.0,1.444444,0.281250,0.492063,0.475610,0.514286,-3.0,2.0
112502,2024.0,114.0,1462.0,1177.0,1.0,17.0,1.888889,0.322581,0.517241,0.426829,0.475000,15.0,1.062500,0.166667,0.442308,0.405405,0.475000,-4.0,-15.0


In [6]:
MedianSeasonAgg.tail(1)

Unnamed: 0,Season,DayNum,Home,Away,Result,HPF,HA2TR,HOffR,HAllR,HFGP,HPFR,APF,AA2TR,AOffR,AAllR,AFGP,APFR,HStreak,AStreak
112503,2024.0,114.0,1464.0,1178.0,1.0,17.0,2.25,0.3125,0.52381,0.384615,0.454545,17.0,0.769231,0.25,0.464286,0.38961,0.5,2.0,-2.0


In [14]:
MedianSeasonAgg.corr()

Unnamed: 0,Season,DayNum,Home,Away,Result,HPF,HA2TR,HOffR,HAllR,HFGP,HPFR,APF,AA2TR,AOffR,AAllR,AFGP,APFR,HStreak,AStreak
Season,1.0,-0.054636,0.013586,0.010131,-0.02449,-0.176235,0.15655,-0.374095,0.014828,-0.033801,0.026635,-0.202808,0.204225,-0.432467,-0.003896,-0.032038,0.009098,-0.002638,-0.0083
DayNum,-0.054636,1.0,-0.007945,0.006271,-0.080135,-0.115344,-0.044699,-0.072915,-0.090337,-0.005601,0.040617,-0.190736,0.074369,0.021188,0.071721,0.111737,-0.069945,-0.026973,0.095079
Home,0.013586,-0.007945,1.0,0.002751,0.019502,-0.020811,0.03175,0.01993,0.047171,0.007886,-0.040836,-0.008416,0.034177,0.012625,0.02319,0.025669,-0.023911,0.03383,0.012664
Away,0.010131,0.006271,0.002751,1.0,-0.026317,-0.00957,0.022028,0.009053,0.017899,0.021223,-0.018843,-0.02894,0.052133,0.031839,0.068499,0.037298,-0.059013,0.003118,0.045001
Result,-0.02449,-0.080135,0.019502,-0.026317,1.0,-0.039792,0.126053,0.100285,0.14077,0.131026,-0.096451,0.084016,-0.184036,-0.095775,-0.186262,-0.178614,0.127578,0.166836,-0.210893
HPF,-0.176235,-0.115344,-0.020811,-0.00957,-0.039792,1.0,-0.211327,0.061801,-0.145916,-0.124597,0.452613,0.405302,-0.098814,0.066817,-0.067244,-0.049229,0.064226,-0.132403,-0.042372
HA2TR,0.15655,-0.044699,0.03175,0.022028,0.126053,-0.211327,1.0,-0.010511,0.162459,0.399168,-0.13714,-0.052494,0.128177,-0.029849,0.07034,0.063191,-0.057257,0.264969,0.035446
HOffR,-0.374095,-0.072915,0.01993,0.009053,0.100285,0.061801,-0.010511,1.0,0.61439,0.097789,-0.144033,0.095369,-0.043148,0.227974,0.038034,0.030662,-0.036427,0.163729,0.019915
HAllR,0.014828,-0.090337,0.047171,0.017899,0.14077,-0.145916,0.162459,0.61439,1.0,0.320422,-0.262302,-0.026222,0.085057,0.048273,0.075713,0.064687,-0.06998,0.29001,0.044796
HFGP,-0.033801,-0.005601,0.007886,0.021223,0.131026,-0.124597,0.399168,0.097789,0.320422,1.0,-0.185552,-0.015366,0.072843,0.046163,0.065978,0.081036,-0.05692,0.286722,0.041298


In [15]:

# Separate X, Y for training
X = MedianSeasonAgg.drop(columns=['Result'])
Y = MedianSeasonAgg['Result']

X = MinMaxScaler().fit_transform(X)
# train test spllt
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

# random forest

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
print(model.feature_importances_)
print(model.score(x_train, y_train))    
print(model.score(x_test, y_test))

model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
print(model.feature_importances_)
print(model.score(x_test, y_test))
print(model.score(x_train, y_train))

[0.03556801 0.08724598 0.05213571 0.05086746 0.03746055 0.0656563
 0.05380619 0.06584533 0.06177587 0.05443391 0.03532339 0.06638272
 0.05213508 0.0692796  0.06233514 0.05163246 0.04922779 0.04888852]
1.0
0.6879054452526047
[0.00125693 0.2446057  0.00841716 0.00393913 0.00100518 0.08391055
 0.00775827 0.07265969 0.03242627 0.0204094  0.0002733  0.09224336
 0.00317638 0.10193924 0.03940092 0.00681489 0.1489265  0.13083711]
0.6916896009435817
0.7028324295988074


In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Load the data
agg_MRegularSeasonDetailedResults = pd.read_csv("../data/MMedianSeasonAggStreak_2.csv")
agg_MRegularSeasonDetailedResults.dropna(inplace=True)

seasons = agg_MRegularSeasonDetailedResults['Season'].unique()

for season in seasons:
    df = agg_MRegularSeasonDetailedResults[agg_MRegularSeasonDetailedResults['Season'] == season]
    X = df.drop(columns=['Season', 'DayNum', 'Home', 'Away', 'Result'])
    Y = df['Result']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    model.fit(x_train, y_train)
    print(f"Season: {season}, train {model.score(x_train, y_train)}")
    print(f"Season: {season}, test {model.score(x_test, y_test)}")

Season: 2003.0, train 0.7543962485345839
Season: 2003.0, test 0.6776084407971864
Season: 2004.0, train 0.7523017523017523
Season: 2004.0, test 0.7054631828978623
Season: 2005.0, train 0.7710144927536232
Season: 2005.0, test 0.6732329084588644
Season: 2006.0, train 0.7628278221208666
Season: 2006.0, test 0.643507972665148
Season: 2007.0, train 0.7499332799572992
Season: 2007.0, test 0.656350053361793
Season: 2008.0, train 0.7425819885476315
Season: 2008.0, test 0.6805411030176899
Season: 2009.0, train 0.7600615226864906
Season: 2009.0, test 0.6793032786885246
Season: 2010.0, train 0.7492323439099283
Season: 2010.0, test 0.6581371545547595
Season: 2011.0, train 0.7473711208002052
Season: 2011.0, test 0.6943589743589743
Season: 2012.0, train 0.7662904053360697
Season: 2012.0, test 0.6646153846153846
Season: 2013.0, train 0.7560606060606061
Season: 2013.0, test 0.6494949494949495
Season: 2014.0, train 0.7389558232931727
Season: 2014.0, test 0.6877510040160643
Season: 2015.0, train 0.741093

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def busqueda_hiperparametros_gb(X_train, y_train, X_test, y_test):
  """
  Función que realiza una búsqueda de hiperparámetros para GradientBoostClassifier.

  Argumentos:
    X_train: Matriz de características del conjunto de entrenamiento.
    y_train: Vector de etiquetas del conjunto de entrenamiento.
    X_test: Matriz de características del conjunto de prueba.
    y_test: Vector de etiquetas del conjunto de prueba.

  Devuelve:
    Diccionario con los mejores hiperparámetros y la precisión del modelo.
  """

  # Parámetros a evaluar
  param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
  }

  # Búsqueda en cuadrícula
  grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=4)
  grid_search.fit(X_train, y_train)

  # Mejores hiperparámetros
  mejores_parametros = grid_search.best_params_

  # Precisión del modelo con los mejores hiperparámetros
  modelo = GradientBoostingClassifier(**mejores_parametros)
  modelo.fit(X_train, y_train)
  predicciones = modelo.predict(X_test)
  precision = accuracy_score(y_test, predicciones)

  return {
    'mejores_parametros': mejores_parametros,
    'precision': precision
  }

# Ejemplo de uso
resultados = busqueda_hiperparametros_gb(x_train, y_train, x_test, y_test)

# Imprimir resultados
print(f"Mejores parámetros: {resultados['mejores_parametros']}")
print(f"Precisión: {resultados['precision']}")


In [None]:
def get_last_game(teamid):
    """
    Params:
        Looks for last line of information corresponding to a team
    Recives:
        array: containing all columns of dataset for the team
    
    """
    pass

def compute_game(team_a, team_b, model):
    game_preview = np.concat(get_last_game(team_a), get_last_game(team_b))
    y_pred = model.predict(x=game_preview)
    return team_a if y_pred == 1 else team_b

def compute_play_off(seeds):
    pass