#### Etapa de Entrenamiento y Evaluación del Modelo de Aprendizaje Supervisado de Regresión

In [25]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

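Dado que algunos algoritmos, como los lineales con regularización (Lasso, Ridge) y los basados en distancias (KNeighbors), son sensibles a la escala de las variables, se procede a estandarizar los datos.
Los modelos basados en árboles (Decision tree, Random forest, Boosting) no requieren este paso, pero se evalúan con los mismos datos para facilitar la comparación.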

In [44]:
# Load the cleaned dataset and preview its first rows.
# NOTE(review): this is an absolute, environment-specific path — consider
# resolving it relative to a configurable data directory.
DATA_PATH = '/workspaces/ProyectoDS/data/processed/data_clean.csv'
dataframe = pd.read_csv(DATA_PATH)
dataframe.head()

Unnamed: 0,PLAYER_ID,PTS,MIN_PROM,FGM_PROM,FGA_PROM,FG3M_PROM,FG3A_PROM,FTM_PROM,FTA_PROM,OREB_PROM,...,STL_PROM,BLK_PROM,BLKA_PROM,PF_PROM,PFD_PROM,PTS_PROM,PLUS_MINUS_PROM,PTSRTeam_PROM,PTSTeam_PROM,W_PCTTeam_PROM
0,2544,22,28.35,7.0,17.0,3.0,8.0,5.0,7.0,2.0,...,0.0,0.0,1.0,2.0,6.0,22.0,2.0,116.0,109.0,0.0
1,2544,22,28.35,7.0,17.0,3.0,8.0,5.0,7.0,2.0,...,0.0,0.0,1.0,2.0,6.0,22.0,2.0,116.0,109.0,0.0
2,2544,18,29.875,7.5,17.5,3.0,8.0,4.0,5.5,1.5,...,0.5,0.0,1.0,1.0,5.0,22.0,9.0,115.5,123.5,0.25
3,2544,29,28.65,7.0,16.0,2.333333,7.0,4.333333,6.0,1.0,...,0.666667,0.0,0.666667,1.0,5.333333,20.666667,11.333333,107.333333,124.666667,0.389
4,2544,26,30.466667,8.0,17.0,2.5,7.0,4.25,5.75,1.25,...,1.0,0.25,1.0,1.25,4.75,22.75,8.5,109.25,120.25,0.41675


In [45]:
# Separate the target (PTS) from the predictors, then split both into
# training and test sets (80 % / 20 %, fixed seed for reproducibility).
# NOTE(review): PLAYER_ID stays in X and is later scaled as a numeric
# feature — confirm that an identifier is meant to be a predictor.

y = dataframe['PTS']
X = dataframe.drop(columns=['PTS'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Standardize the predictors

# Names of the predictor columns, reused to label the scaled frames
n_columns = X.columns.tolist()

# Create the scaler and fit it on the training split only; the same fitted
# scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)

# Wrap the scaled values back into DataFrames that keep the original row
# index and column names of each split
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=n_columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=n_columns,
)

In [47]:
# Preview the first rows of the scaled test predictors
X_test_scaled.head()

Unnamed: 0,PLAYER_ID,MIN_PROM,FGM_PROM,FGA_PROM,FG3M_PROM,FG3A_PROM,FTM_PROM,FTA_PROM,OREB_PROM,DREB_PROM,...,STL_PROM,BLK_PROM,BLKA_PROM,PF_PROM,PFD_PROM,PTS_PROM,PLUS_MINUS_PROM,PTSRTeam_PROM,PTSTeam_PROM,W_PCTTeam_PROM
32182,0.658931,0.587941,0.740498,0.979923,0.960608,0.763749,-0.139991,0.137703,1.114954,2.207345,...,-0.561365,1.842957,0.355121,0.375555,0.721569,0.643615,-0.870403,-0.014473,0.443834,-1.251793
53479,-1.494717,0.878471,0.04769,-0.2855,-1.091694,-1.196499,1.71036,1.664684,2.832174,1.634077,...,-0.196265,1.842957,-0.109524,1.457926,1.60153,0.282614,-1.809369,0.545053,-1.009776,-0.233392
60887,0.663165,0.401242,0.124668,0.251346,-0.345402,-0.02035,-0.718226,-0.81666,-0.602267,0.29645,...,0.168836,-0.091837,-0.57417,-0.273868,-1.038352,-0.133925,-0.000991,0.125408,0.899869,1.156863
55277,-1.493954,0.851699,1.433306,1.094962,0.40089,0.528519,2.635535,2.332738,2.402869,1.442987,...,0.168836,1.069039,-0.109524,0.808503,2.230073,1.726616,-0.661744,0.964697,-0.952772,-0.046266
46641,0.665523,0.054723,-1.106991,-1.062009,-1.091694,-1.176896,-0.94952,-1.007533,0.578322,-0.921745,...,0.077561,0.585341,0.122799,0.429673,-0.849789,-1.189157,1.346599,-0.357183,-1.601196,-2.63785


Se evaluarán varios modelos en su forma estándar y a partir de allí se decidirá cuál procederá a la fase de optimización.

In [48]:
# Seed passed to every estimator below that is given a `random_state`, so that
# repeated runs of the comparison yield the same scores. Without it the tree
# and forest scores change from one fit to the next.
# Same value as the one used for the train/test split.
RANDOM_STATE = 42

# Baseline candidates with default hyperparameters; the keys are the display
# names used when reporting the results.
modelos = {"Lineal" : LinearRegression(),
           "Lasso" : Lasso(),
           "Ridge" : Ridge(),
           "Decision tree" : DecisionTreeRegressor(random_state=RANDOM_STATE),
           "Random forest" : RandomForestRegressor(random_state=RANDOM_STATE),
           "Boosting" :  XGBRegressor(random_state=RANDOM_STATE),
           "KNeighbors": KNeighborsRegressor()}

In [49]:
def prueba_modelos(modelos, X_train, X_test, y_train, y_test):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    modelos : dict
        Mapping of model name -> estimator exposing ``fit`` and ``predict``.
        Every estimator is fitted in place.
    X_train, y_train
        Predictors and target used to fit each estimator.
    X_test, y_test
        Predictors and target used to compute the metrics.

    Returns
    -------
    dict
        Mapping of model name -> ``(r2, rmse)`` tuple computed on the test
        data, in the same order as ``modelos``.
    """

    def _evaluar(model):
        # Fit, predict on the held-out data, and compute both metrics.
        model.fit(X_train, y_train)
        predicciones = model.predict(X_test)
        r2 = r2_score(y_test, predicciones)
        rmse = np.sqrt(mean_squared_error(y_test, predicciones))
        return (r2, rmse)

    return {nombre: _evaluar(model) for nombre, model in modelos.items()}

In [50]:
# Baseline evaluation using the scaled predictors
resultados_base = prueba_modelos(
    modelos=modelos,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

# Report the (R^2, RMSE) pair stored for each model
for key in resultados_base:
    values = resultados_base[key]
    print(f"{key}:")
    print(f"  Coeficiente de determinación (R^2): {values[0]}")
    print(f"  RMSE: {values[1]}\n")

Lineal:
  Coeficiente de determinación (R^2): 0.5240885063403183
  RMSE: 6.04255269074052

Lasso:
  Coeficiente de determinación (R^2): 0.5047482997046095
  RMSE: 6.1641093866107175

Ridge:
  Coeficiente de determinación (R^2): 0.5240885686841794
  RMSE: 6.042552294956763

Decision tree:
  Coeficiente de determinación (R^2): 0.024047866631771386
  RMSE: 8.653097719700513

Random forest:
  Coeficiente de determinación (R^2): 0.5135203116649232
  RMSE: 6.10927543270873

Boosting:
  Coeficiente de determinación (R^2): 0.5066651679327379
  RMSE: 6.152168750324893

KNeighbors:
  Coeficiente de determinación (R^2): 0.4687418806952999
  RMSE: 6.384253740736669



In [51]:
# Baseline evaluation using the original (unscaled) predictors
# NOTE(review): this rebinds `resultados_base`, replacing the results obtained
# with the scaled predictors — consider a separate name if both are needed.
resultados_base = prueba_modelos(
    modelos=modelos,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Report the (R^2, RMSE) pair stored for each model
for key in resultados_base:
    values = resultados_base[key]
    print(f"{key}:")
    print(f"  Coeficiente de determinación (R^2): {values[0]}")
    print(f"  RMSE: {values[1]}\n")

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Lineal:
  Coeficiente de determinación (R^2): 0.524088506340336
  RMSE: 6.042552690740407

Lasso:
  Coeficiente de determinación (R^2): 0.5135945040011729
  RMSE: 6.108809556423554

Ridge:
  Coeficiente de determinación (R^2): 0.5240885140682765
  RMSE: 6.042552641680352

Decision tree:
  Coeficiente de determinación (R^2): 0.026485592228685073
  RMSE: 8.642284143097125

Random forest:
  Coeficiente de determinación (R^2): 0.5119766601525093
  RMSE: 6.118960444505023

Boosting:
  Coeficiente de determinación (R^2): 0.5066651679327379
  RMSE: 6.152168750324893

KNeighbors:
  Coeficiente de determinación (R^2): 0.46803324559664605
  RMSE: 6.388510239099307

