In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve

In [3]:
# Load the games table and drop the 'Unnamed: 0' and 'index' columns
# (presumably leftover row indices from earlier exports -- confirm).
# The cell used to start with a bare `pd.options.display.max_columns`
# expression; that only reads the option and changes nothing, so it is gone.
df = pd.read_csv('df_all_rs.csv').drop(columns=['Unnamed: 0', 'index'])
df

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2004-04-14,20301188,1610612746,1610612760,2003,1610612746,87.0,0.423,0.727,0.214,17.0,37.0,1610612760,118.0,0.542,1.000,0.375,32.0,34.0,0
1,2004-04-14,20301184,1610612759,1610612743,2003,1610612759,93.0,0.424,0.679,0.100,15.0,58.0,1610612743,67.0,0.325,0.611,0.222,11.0,47.0,1
2,2004-04-14,20301181,1610612754,1610612741,2003,1610612754,101.0,0.420,0.794,0.316,24.0,58.0,1610612741,96.0,0.420,0.667,0.357,20.0,41.0,1
3,2004-04-14,20301177,1610612764,1610612740,2003,1610612764,78.0,0.375,0.714,0.211,13.0,39.0,1610612740,94.0,0.451,0.600,0.364,24.0,48.0,0
4,2004-04-14,20301179,1610612752,1610612739,2003,1610612752,90.0,0.481,0.714,0.400,13.0,42.0,1610612739,100.0,0.488,0.900,0.364,22.0,40.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19393,2018-10-17,21800011,1610612758,1610612762,2018,1610612758,117.0,0.516,0.667,0.368,17.0,37.0,1610612762,123.0,0.519,0.737,0.481,21.0,44.0,0
19394,2018-10-17,21800012,1610612746,1610612743,2018,1610612746,98.0,0.398,0.833,0.286,21.0,47.0,1610612743,107.0,0.379,0.786,0.333,20.0,56.0,0
19395,2018-10-17,21800013,1610612756,1610612742,2018,1610612756,121.0,0.543,0.875,0.559,35.0,44.0,1610612742,100.0,0.432,0.700,0.303,28.0,38.0,1
19396,2018-10-16,21800001,1610612738,1610612755,2018,1610612738,105.0,0.433,0.714,0.297,21.0,55.0,1610612755,87.0,0.391,0.609,0.192,18.0,47.0,1


In [4]:
# Prediction target: the 'HOME_TEAM_WINS' column.
target = df['HOME_TEAM_WINS']

# Features: field-goal and three-point percentages of both teams.
data = df[[
    'FG_PCT_home',
    'FG_PCT_away',
    'FG3_PCT_home',
    'FG3_PCT_away',
]]

# Standardize each feature column on a copy: subtract the column mean, then
# divide by the column standard deviation.
data_copy = data.copy()
normalized_data = (
    data_copy
    .sub(data_copy.mean(), axis='columns')
    .div(data_copy.std(), axis='columns')
)

In [5]:
def evaluar_metricas(estimator, data, target, name, test_size=None, random_state=None):
    """Fit ``estimator`` on a train split and score it on the held-out split.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y)`` and ``predict(X)``.
    data : array-like
        Feature matrix handed to ``train_test_split``.
    target : array-like
        Values to predict, split alongside ``data``.
    name : str
        Label stored as the first element of the returned row.
    test_size : float, int or None, optional
        Forwarded to ``train_test_split``. ``None`` (default) keeps that
        function's own default split size, i.e. the previous behaviour.
    random_state : int, RandomState or None, optional
        Forwarded to ``train_test_split``. ``None`` (default) keeps the
        previous behaviour of drawing a different split on every call; pass
        an integer to make the split, and therefore the scores, repeatable.

    Returns
    -------
    list
        ``[name, mse, rmse, mae]`` computed from ``y_test - y_hat``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    # Work on flat float arrays: the subtraction is then purely positional
    # (no pandas index alignment) and cannot wrap around for unsigned labels.
    errors = np.asarray(y_test, dtype=float).ravel() - np.asarray(y_hat, dtype=float).ravel()
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    return [name, mse, rmse, mae]

In [6]:
# Empty results table with one column per reported value.
metrics_results = pd.DataFrame(
    columns=['NAME', 'MSE', 'RMSE', 'MAE'],
)
metrics_results

Unnamed: 0,NAME,MSE,RMSE,MAE


In [None]:
# k-nearest neighbours: evaluate every neighbourhood size from 1 to 50 on the
# standardized features and append one row per model to metrics_results.
for n_neighbors in range(1, 51):
    next_row = len(metrics_results) + 1
    model_label = 'kn_normalized_neighbors_' + str(n_neighbors)
    kn = KNeighborsRegressor(n_neighbors=n_neighbors)
    metrics_results.loc[next_row] = evaluar_metricas(kn, normalized_data, target, model_label)

In [7]:
# k-nearest neighbours, repeated: run the 1..50 neighbour sweep ten times.
# The first repeat keeps its NAME column; every later repeat contributes only
# its MSE/RMSE/MAE columns, placed side by side.
kn_repeats = []

for n in range(1, 11):
    metrics_results_kn_2 = pd.DataFrame(columns=['NAME', 'MSE', 'RMSE', 'MAE'])
    for n_neighbors in range(1, 51):
        kn = KNeighborsRegressor(n_neighbors=n_neighbors)
        next_row = len(metrics_results_kn_2) + 1
        metrics_results_kn_2.loc[next_row] = evaluar_metricas(
            kn,
            normalized_data,
            target,
            'kn_normalized_neighbors_' + str(n_neighbors),
        )

    if n == 1:
        kn_repeats.append(metrics_results_kn_2)
    else:
        kn_repeats.append(metrics_results_kn_2[['MSE', 'RMSE', 'MAE']])

# Single column-wise concatenation of all repeats.
metrics_results_kn = pd.concat(kn_repeats, axis=1)
metrics_results_kn

Unnamed: 0,NAME,MSE,RMSE,MAE,MSE.1,RMSE.1,MAE.1,MSE.2,RMSE.2,MAE.2,...,MAE.3,MSE.3,RMSE.3,MAE.4,MSE.4,RMSE.4,MAE.5,MSE.5,RMSE.5,MAE.6
1,kn_normalized_neighbors_1,0.253196,0.503186,0.253196,0.258557,0.508485,0.258557,0.261237,0.511114,0.261237,...,0.258557,0.256907,0.50686,0.256907,0.259381,0.509295,0.259381,0.260206,0.510104,0.260206
2,kn_normalized_neighbors_2,0.197887,0.444844,0.265464,0.201959,0.449398,0.268454,0.201856,0.449284,0.265773,...,0.269175,0.19799,0.44496,0.264021,0.195155,0.441763,0.261649,0.203351,0.450944,0.269588
3,kn_normalized_neighbors_3,0.174387,0.417597,0.26433,0.169622,0.411852,0.259381,0.170493,0.412907,0.258419,...,0.266873,0.169599,0.411824,0.258213,0.175189,0.418556,0.263436,0.168591,0.410598,0.2589
4,kn_normalized_neighbors_4,0.172925,0.415843,0.271804,0.164046,0.405026,0.262165,0.155528,0.394371,0.257062,...,0.26866,0.165876,0.407279,0.263711,0.165245,0.406503,0.265928,0.160206,0.400258,0.258454
5,kn_normalized_neighbors_5,0.156693,0.395844,0.264619,0.15972,0.399649,0.267216,0.157781,0.397217,0.265608,...,0.260289,0.154194,0.392675,0.257649,0.161468,0.401831,0.266144,0.161839,0.402292,0.269814
6,kn_normalized_neighbors_6,0.15079,0.388317,0.263505,0.153677,0.392017,0.263299,0.149696,0.386906,0.260034,...,0.259759,0.14819,0.384955,0.259931,0.150956,0.388531,0.262165,0.15205,0.389936,0.260069
7,kn_normalized_neighbors_7,0.150734,0.388245,0.264566,0.148475,0.385324,0.262415,0.149901,0.387171,0.264683,...,0.264242,0.14287,0.377981,0.259529,0.147023,0.383436,0.261443,0.151071,0.388678,0.266274
8,kn_normalized_neighbors_8,0.147655,0.384259,0.265052,0.145834,0.381883,0.265851,0.148563,0.385439,0.266134,...,0.266418,0.143908,0.379352,0.260541,0.150161,0.387506,0.266289,0.149388,0.386507,0.263144
9,kn_normalized_neighbors_9,0.141446,0.376093,0.262978,0.14478,0.3805,0.26394,0.147761,0.384397,0.265888,...,0.264307,0.147275,0.383764,0.264719,0.145613,0.381592,0.265109,0.144312,0.379884,0.263345
10,kn_normalized_neighbors_10,0.146181,0.382337,0.268247,0.143913,0.379359,0.264701,0.142621,0.377651,0.265546,...,0.260619,0.142515,0.377512,0.263588,0.136109,0.36893,0.257258,0.141462,0.376114,0.263691


In [8]:
# Persist the repeated k-NN metrics, index included, to a CSV file.
metrics_results_kn.to_csv(path_or_buf='metrics_results_kn.csv', index=True)

In [4]:
# Reload the saved k-NN metrics and discard the 'Unnamed: 0' column.
metrics_results_kn = (
    pd.read_csv('metrics_results_kn.csv')
    .drop(columns='Unnamed: 0')
)

In [5]:
# Keep only numeric/datetime columns, then highlight the smallest value of
# each row (axis=1) in light green.
(
    metrics_results_kn
    .select_dtypes(include=[np.number, np.datetime64])
    .style
    .highlight_min(color='lightgreen', axis=1)
)

Unnamed: 0,MSE,RMSE,MAE,MSE.1,RMSE.1,MAE.1,MSE.2,RMSE.2,MAE.2,MSE.3,RMSE.3,MAE.3,MSE.4,RMSE.4,MAE.4,MSE.5,RMSE.5,MAE.5,MSE.6,RMSE.6,MAE.6,MSE.7,RMSE.7,MAE.7,MSE.8,RMSE.8,MAE.8,MSE.9,RMSE.9,MAE.9
0,0.253196,0.503186,0.253196,0.258557,0.508485,0.258557,0.261237,0.511114,0.261237,0.266804,0.516531,0.266804,0.255876,0.505842,0.255876,0.260619,0.510508,0.260619,0.258557,0.508485,0.258557,0.256907,0.50686,0.256907,0.259381,0.509295,0.259381,0.260206,0.510104,0.260206
1,0.197887,0.444844,0.265464,0.201959,0.449398,0.268454,0.201856,0.449284,0.265773,0.199072,0.446175,0.264536,0.194485,0.441004,0.260309,0.194897,0.441471,0.259485,0.20232,0.449799,0.269175,0.19799,0.44496,0.264021,0.195155,0.441763,0.261649,0.203351,0.450944,0.269588
2,0.174387,0.417597,0.26433,0.169622,0.411852,0.259381,0.170493,0.412907,0.258419,0.174502,0.417734,0.26055,0.174708,0.417981,0.261993,0.169164,0.411295,0.25677,0.177022,0.42074,0.266873,0.169599,0.411824,0.258213,0.175189,0.418556,0.263436,0.168591,0.410598,0.2589
3,0.172925,0.415843,0.271804,0.164046,0.405026,0.262165,0.155528,0.394371,0.257062,0.16393,0.404883,0.264381,0.162216,0.402761,0.26,0.165644,0.406994,0.264021,0.16768,0.409488,0.26866,0.165876,0.407279,0.263711,0.165245,0.406503,0.265928,0.160206,0.400258,0.258454
4,0.156693,0.395844,0.264619,0.15972,0.399649,0.267216,0.157781,0.397217,0.265608,0.156536,0.395646,0.265072,0.160891,0.401112,0.267959,0.153237,0.391455,0.258639,0.155744,0.394645,0.260289,0.154194,0.392675,0.257649,0.161468,0.401831,0.266144,0.161839,0.402292,0.269814
5,0.15079,0.388317,0.263505,0.153677,0.392017,0.263299,0.149696,0.386906,0.260034,0.144158,0.379682,0.256151,0.155074,0.393795,0.269072,0.154582,0.393169,0.264124,0.148975,0.385973,0.259759,0.14819,0.384955,0.259931,0.150956,0.388531,0.262165,0.15205,0.389936,0.260069
6,0.150734,0.388245,0.264566,0.148475,0.385324,0.262415,0.149901,0.387171,0.264683,0.153709,0.392058,0.270957,0.145904,0.381973,0.260442,0.143547,0.378876,0.25891,0.151025,0.388619,0.264242,0.14287,0.377981,0.259529,0.147023,0.383436,0.261443,0.151071,0.388678,0.266274
7,0.147655,0.384259,0.265052,0.145834,0.381883,0.265851,0.148563,0.385439,0.266134,0.146653,0.382953,0.263634,0.146852,0.383213,0.265644,0.147816,0.384468,0.266289,0.150042,0.387352,0.266418,0.143908,0.379352,0.260541,0.150161,0.387506,0.266289,0.149388,0.386507,0.263144
8,0.141446,0.376093,0.262978,0.14478,0.3805,0.26394,0.147761,0.384397,0.265888,0.139267,0.373185,0.259588,0.146662,0.382964,0.266438,0.139611,0.373645,0.257915,0.143167,0.378374,0.264307,0.147275,0.383764,0.264719,0.145613,0.381592,0.265109,0.144312,0.379884,0.263345
9,0.146181,0.382337,0.268247,0.143913,0.379359,0.264701,0.142621,0.377651,0.265546,0.144874,0.380623,0.265237,0.14126,0.375845,0.263691,0.142223,0.377124,0.265278,0.139085,0.37294,0.260619,0.142515,0.377512,0.263588,0.136109,0.36893,0.257258,0.141462,0.376114,0.263691


In [None]:
# Decision tree: start from a fresh results table and evaluate every
# (max_depth, min_samples_leaf) pair with both values ranging over 1..10.
metrics_results = pd.DataFrame(columns=['NAME', 'MSE', 'RMSE', 'MAE'])

dt_grid = [(depth, leaf) for depth in range(1, 11) for leaf in range(1, 11)]
for depth, leaf in dt_grid:
    dt = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=leaf)
    model_label = 'dt_normalized__maxdepth_' + str(depth) + '__minsamplesleaf_' + str(leaf)
    next_row = len(metrics_results) + 1
    metrics_results.loc[next_row] = evaluar_metricas(dt, normalized_data, target, model_label)

metrics_results

In [None]:
# Decision tree, repeated: run the 10 x 10 (max_depth, min_samples_leaf) grid
# ten times. The first repeat keeps its NAME column; later repeats contribute
# only their MSE/RMSE/MAE columns, placed side by side.
dt_repeats = []

for n in range(1, 11):
    metrics_results_dt_2 = pd.DataFrame(columns=['NAME', 'MSE', 'RMSE', 'MAE'])
    for depth in range(1, 11):
        for leaf in range(1, 11):
            dt = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=leaf)
            next_row = len(metrics_results_dt_2) + 1
            metrics_results_dt_2.loc[next_row] = evaluar_metricas(
                dt,
                normalized_data,
                target,
                'dt_normalized__maxdepth_' + str(depth) + '__minsamplesleaf_' + str(leaf),
            )

    if n == 1:
        dt_repeats.append(metrics_results_dt_2)
    else:
        dt_repeats.append(metrics_results_dt_2[['MSE', 'RMSE', 'MAE']])

# Single column-wise concatenation of all repeats.
metrics_results_dt = pd.concat(dt_repeats, axis=1)
metrics_results_dt

In [None]:
# Random forest: evaluate every combination of n_estimators, max_depth and
# min_samples_leaf (each ranging over 1..9) and append one row per model to
# metrics_results.
rf_grid = [
    (n_trees, depth, leaf)
    for n_trees in range(1, 10)
    for depth in range(1, 10)
    for leaf in range(1, 10)
]
for n_trees, depth, leaf in rf_grid:
    rf = RandomForestRegressor(n_estimators=n_trees, max_depth=depth, min_samples_leaf=leaf)
    model_label = (
        'rf_normalized__estimators_' + str(n_trees)
        + '__maxdepth_' + str(depth)
        + '__minsamplesleaf_' + str(leaf)
    )
    next_row = len(metrics_results) + 1
    metrics_results.loc[next_row] = evaluar_metricas(rf, normalized_data, target, model_label)

In [None]:
# Random forest, repeated: run the hyper-parameter grid ten times. The first
# repeat keeps its NAME column; later repeats contribute only their
# MSE/RMSE/MAE columns, placed side by side.
metrics_results_rf = pd.DataFrame(columns=['NAME', 'MSE','RMSE', 'MAE'])

for n in range(1, 11):
    metrics_results_rf_2 = pd.DataFrame(columns=['NAME','MSE','RMSE', 'MAE'])
    # Step n_estimators by 25 (1, 26, 51, 76). The cell used to do
    # `k = k + 25` at the end of the loop body, but a `for` loop rebinds its
    # variable on every iteration, so the increment was thrown away and all
    # 100 values of k were fitted. With the step in range() each repeat
    # evaluates 4 x 5 x 5 = 100 configurations.
    for k in range(1, 101, 25):
        for i in range(1, 6):
            for j in range(1,6):
                rf = RandomForestRegressor(n_estimators=k, max_depth=i, min_samples_leaf=j)
                metrics_results_rf_2.loc[len(metrics_results_rf_2)+1] = \
                    evaluar_metricas(rf, 
                                     normalized_data,target,
                                     'rf_normalized__estimators_'+str(k)+
                                     '__maxdepth_'+str(i)+
                                     '__minsamplesleaf_'+str(j))
    
    if n==1:
        metrics_results_rf = metrics_results_rf_2
    else: 
        metrics_results_rf = pd.concat([metrics_results_rf, metrics_results_rf_2[['MSE','RMSE', 'MAE']]], axis=1)
metrics_results_rf

In [None]:
# Lift the display limits on columns, column width and rows.
# Full 'display.*' option names are used on purpose: the short forms
# ('max_columns', 'max_rows') are treated as patterns and become ambiguous on
# pandas versions that also define 'styler.render.max_columns' / 'max_rows',
# where set_option raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# Show the first 100 rows of the repeated random-forest metrics.
metrics_results_rf.head(n=100)

In [None]:
# MLP regressor: evaluate every combination of activation, solver and
# learning-rate schedule on the standardized features.
metrics_results_mlpr = pd.DataFrame(columns=['NAME', 'MSE','RMSE', 'MAE'])
activations = ['relu', 'logistic', 'tanh']
solvers = ['adam', 'sgd']
learning_rates = ['constant', 'invscaling', 'adaptive']
# Mini-batch size used for training. Rows were always labelled
# "__batchsize_100", but the model used to be built without batch_size and so
# trained with the library default. The value is now passed to the model and
# the label is derived from it, so the two cannot drift apart.
mlpr_batch_size = 100

for act in activations:
    for sol in solvers:
        for rate in learning_rates:
            mlpr = MLPRegressor(activation=act, solver=sol,
                                batch_size=mlpr_batch_size, learning_rate=rate)
            metrics_results_mlpr.loc[len(metrics_results_mlpr)+1] = \
                evaluar_metricas(mlpr, 
                                 normalized_data,
                                 target, 
                                 'mlpr_normalized_activation_'+act+
                                 '__solver_'+sol+
                                 '__batchsize_'+str(mlpr_batch_size)+
                                 '__learningrate_'+rate)
    
metrics_results_mlpr

In [None]:
# MLP regressor, repeated: run the activation x solver x learning-rate x
# batch-size grid ten times. The first repeat keeps its NAME column; later
# repeats contribute only their MSE/RMSE/MAE columns, placed side by side.
activations = ['relu', 'logistic', 'tanh']
solvers = ['adam', 'sgd']
learning_rates = ['constant', 'invscaling', 'adaptive']
batch_sizes = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]

mlpr_repeats = []

for n in range(1, 11):
    metrics_results_mlpr_2 = pd.DataFrame(columns=['NAME', 'MSE', 'RMSE', 'MAE'])
    mlpr_grid = (
        (act, sol, rate, size)
        for act in activations
        for sol in solvers
        for rate in learning_rates
        for size in batch_sizes
    )
    for act, sol, rate, size in mlpr_grid:
        mlpr = MLPRegressor(activation=act, solver=sol, batch_size=size, learning_rate=rate)
        model_label = (
            'mlpr_normalized__activation_' + act
            + '__solver_' + sol
            + '__batchsize_' + str(size)
            + '__learningrate_' + rate
        )
        next_row = len(metrics_results_mlpr_2) + 1
        metrics_results_mlpr_2.loc[next_row] = evaluar_metricas(
            mlpr, normalized_data, target, model_label
        )

    if n == 1:
        mlpr_repeats.append(metrics_results_mlpr_2)
    else:
        mlpr_repeats.append(metrics_results_mlpr_2[['MSE', 'RMSE', 'MAE']])

# Single column-wise concatenation of all repeats.
metrics_results_mlpr = pd.concat(mlpr_repeats, axis=1)
metrics_results_mlpr

In [None]:
# Stack the per-model metric tables on top of each other and save the
# combined table to CSV.
df_metrics_list = [
    metrics_results_kn,
    metrics_results_dt,
    metrics_results_rf,
    metrics_results_mlpr,
]
df_all_metrics = pd.concat(objs=df_metrics_list)
df_all_metrics.to_csv(path_or_buf='df_all_metrics.csv')

In [None]:
# Reload the combined metrics table from disk and display it.
df_all_metrics = pd.read_csv(filepath_or_buffer='df_all_metrics.csv')
df_all_metrics

In [None]:
# Discard the 'Unnamed: 0' column from the combined table and display it.
df_all_metrics = df_all_metrics.drop(columns='Unnamed: 0')
df_all_metrics

In [None]:
# Highlight the smallest value of each row in light green, comparing numeric
# columns only. The string 'NAME' column cannot be ordered against floats, so
# it is excluded from the comparison through `subset` while remaining in the
# rendered table.
numeric_metric_columns = df_all_metrics.select_dtypes(include=[np.number]).columns
df_all_metrics.style.highlight_min(color='lightgreen', axis=1, subset=numeric_metric_columns)

In [None]:
# Logistic regression: evaluate the same model 50 times, storing one row per
# run labelled with the run number.
metrics_results_logR = pd.DataFrame(columns=['NAME', 'MSE', 'RMSE', 'MAE'])
logR = LogisticRegression()

for run in range(1, 51):
    next_row = len(metrics_results_logR) + 1
    run_label = 'logR_norm_' + str(run)
    metrics_results_logR.loc[next_row] = evaluar_metricas(logR, normalized_data, target, run_label)

metrics_results_logR

In [None]:
# Logistic regression, repeated: run the 50 evaluations ten times. The first
# repeat keeps its NAME column; later repeats contribute only their
# MSE/RMSE/MAE columns, placed side by side.
metrics_results_logR = pd.DataFrame(columns=['NAME', 'MSE','RMSE', 'MAE'])

for n in range(1, 11):
    metrics_results_logR_2 = pd.DataFrame(columns=['NAME','MSE','RMSE', 'MAE'])
    for i in range(1,51):
        logR = LogisticRegression()
        # Rows go into the per-repeat frame (`_2`). The cell used to write to
        # metrics_results_logR here, which left the per-repeat frame empty
        # and made the combine step below discard or mis-shape the results.
        metrics_results_logR_2.loc[len(metrics_results_logR_2)+1] = \
            evaluar_metricas(logR, 
                             normalized_data, 
                             target, 
                             'logR_norm_'+str(i))
    
    
    if n==1:
        metrics_results_logR = metrics_results_logR_2
    else: 
        metrics_results_logR = pd.concat([metrics_results_logR, metrics_results_logR_2[['MSE','RMSE', 'MAE']]], axis=1)
metrics_results_logR