In [1]:
import sklearn as sk
import pandas as pd
from sklearn.model_selection import KFold,RepeatedKFold,RandomizedSearchCV,GridSearchCV
from sklearn.neural_network import MLPRegressor
from scipy.stats import uniform,loguniform
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from sklearn.linear_model import Ridge
import numpy as np

import warnings
warnings.filterwarnings('ignore')

## Preprocesamiento

In [2]:
equipos = pd.read_csv('../Data/equipos_data.csv')
equipos = equipos[['equipo','posicion','valor_mercado','aforo']]
historico = pd.read_csv('../Data/historico_data.csv')
historico = historico[['equipo1','equipo2','mes','fin_de_semana','publico']]

##### Ver si se llenan los estadios en algunos partidos 

In [32]:
se_llena = pd.merge(historico,equipos, left_on='equipo1', right_on='equipo')
se_llena[se_llena['publico'] == se_llena['aforo']]

Se llena el estadio en 157 partidos

##### Media de publico

In [13]:
historico['publico'].mean()

  and should_run_async(code)


29283.747368421053

### Obtener variables dummies para entrenar los modelos

In [3]:
historico = pd.get_dummies(historico, prefix=['equipo1','equipo2'])

## Modelos Predictivos

In [4]:
X = historico.drop('publico',axis=1)
y = historico['publico']
X = X.to_numpy()
y = y.to_numpy()

### Regresion lineal

In [8]:
from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import cross_val_score
# define model
model = LinearRegression()
# define evaluation
kf = KFold(n_splits=5, shuffle=True, random_state=3)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = X[train],X[test], y[train], y[test]
    model.fit(X_train, y_train)

    # predecimos con los mejores parámetros (por defecto)
    y_pred = model.predict(X_test) 

    # calculamos MSE 
    print('MAE:', mean_squared_error(y_test, y_pred))

MAPE: 0.15307607133539253
MAPE: 0.1463019944091577
MAPE: 0.1533558248248459
MAPE: 0.1441364201243734
MAPE: 0.1550377439637367


### Random Forest

In [27]:
from sklearn.ensemble import RandomForestRegressor
# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = RandomForestRegressor() 
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
space = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# define search
search = RandomizedSearchCV(model, space, n_iter=100, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv, random_state=1)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

KeyboardInterrupt: 

### Perceptrón multicapa

In [38]:
# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

model = MLPRegressor(max_iter=200, activation='relu', solver='lbfgs') 
space = dict(hidden_layer_sizes=[10,150,300,1000],
                            learning_rate=['constant', 'adaptive'])
# define search
search = GridSearchCV(model, space, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X, y)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: -3200.2255991798893
Best Hyperparameters: {'hidden_layer_sizes': 1000, 'learning_rate': 'constant'}


### KNeighbors

In [37]:
from sklearn.neighbors import KNeighborsRegressor

# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = KNeighborsRegressor() 
space = dict(n_neighbors=[3,5,10,15,20,25,30,35,40,45,50],
                    weights=['uniform', 'distance'],
                    leaf_size=[2,5,10,15,20,25,30,35,40,45,50])

# define search
search = GridSearchCV(model, space, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: -6381.709757141513
Best Hyperparameters: {'leaf_size': 2, 'n_neighbors': 5, 'weights': 'distance'}


### Maquina de Vectores de Soporte

In [36]:
from sklearn.svm import LinearSVR

# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

model = LinearSVR(max_iter=1000) 
space = dict(epsilon=[0,0.1,0.001,0.2,0.03,0.4],
                     C=[1,10,100,1000,10000,100000])
# epsilon=[2^-15,2^-13,2^-11,2^-9,2^-7,2^-5,2^-3,2^-1,2,2^3],
# C=[2^-5,2^-3,2^-1,2,2^3,2^5,2^7,2^9,2^9,2^11,2^13,2^15]

# define search
search = GridSearchCV(model, space, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: -3442.4075477688516
Best Hyperparameters: {'C': 10000, 'epsilon': 0.001}


### RIDGE

In [16]:
# define model
model = Ridge()
# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()
space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
space['alpha'] = loguniform(1e-5, 100)
space['fit_intercept'] = [True, False]
space['normalize'] = [True, False]
# define search
search = RandomizedSearchCV(model, space, n_iter=500, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv, random_state=1)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: -3508.8692411906236
Best Hyperparameters: {'alpha': 0.00020578878844176485, 'fit_intercept': True, 'normalize': False, 'solver': 'sag'}


### Validación Cruzada modelo final

In [12]:
kf = KFold(n_splits=5, shuffle=True, random_state=3)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = X[train],X[test], y[train], y[test]

    # estimamos los mejores parámetros en base al score f1 (según documentacion mas adecuado para clases binarias)
    # como es nuestro caso, esteriotipado o no esteriotipado
    clf = MLPRegressor(hidden_layer_sizes= 1000, learning_rate= 'constant',max_iter=200, activation='relu', solver='lbfgs')
    clf.fit(X_train, y_train)

    # predecimos con los mejores parámetros (por defecto)
    y_pred = clf.predict(X_test) 

    # calculamos MSE 
    print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.12775619429162188
MAPE: 0.12491119715986906


KeyboardInterrupt: 

### Guardar mejor modelo entrenado con todos los datos, en este caso MLP

In [5]:
model = MLPRegressor(max_iter=200, activation='relu', solver='lbfgs', hidden_layer_sizes= 1000, learning_rate= 'constant')

result = model.fit(X,y)

from joblib import dump, load
dump(result, 'trainedMLP.joblib')

['trainedMLP.joblib']

In [16]:
# Later you can load back the pickled model (possibly in another Python process) with:
clf = load('trainedMLP.joblib')

clf.predict(X)

array([68449.09411159, 56404.71247165, 69026.85836228, ...,
        7074.07568818,  4641.32382101,  7074.07568818])