In [281]:
import importlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ensamble_model as em
importlib.reload(em)

# Train Data
# `fd` is a scratch path variable; it is reassigned below for the test set.
fd = '../../data/main_data/train/train.csv'
train = pd.read_csv(fd)

# Test Data
# After this cell `fd` points at the test CSV, not the train CSV.
fd = '../../data/main_data/test/test.csv'
test = pd.read_csv(fd)

## How the ensemble model (`ensamble_model`) works and what it does under the hood

The `general` object below is a `general_Regression` object that can be switched to any supported model: linear regression, Lasso, Ridge, decision tree, random forest, or gradient boosting regression.  Parameters for the chosen model can be passed through the `**kwargs` argument and are forwarded to it by name, for example `max_depth=6` for a decision tree.


In [282]:
# Baseline model, what we want to beat!
# type='LR' selects plain linear regression. scale='log' presumably log-transforms
# the target before fitting -- confirm in ensamble_model.py.
# perform_CV() prints the mean and std of MSE, RMSE, R^2, MAE and MAPE for train and test.

general = em.general_Regression(train,type='LR', scale='log')
general.perform_CV()

MSE for train: mean: 0.9718164149663101 std: 0.0063425237322426746
MSE for test:  mean: 0.9921637184791161  std: 0.056137283121666284

RMSE for train: mean: 0.9858022401021957 std: 0.0032184430712350687
RMSE for test: mean: 0.9956784978399821 std: 0.028072182284834084

R^2 for train: mean: 0.4981374205877124 std: 0.0034748163489597577
R^2 for test: mean: 0.4863840926024687 std: 0.03369637922276197

MAE for train: mean: 0.7713606371136257 std: 0.0022404362480962325
MAE for test: mean: 0.7777574257083658 std: 0.0193528801077698

MAPE for train: mean: 0.05553508215855024 std: 0.00018568276535186495
MAPE for test: mean: 0.055975024830683597 std: 0.0016292739650968886



### Position models
The classes `G_Pos`, `D_Pos`, `M_Pos`, and `F_Pos` inherit from the `general_Regression` class above, but are specifically designed to use only the features in X that correspond to their position.   If you look at the code in `ensamble_model.py`, you can see which features each position uses for its model.  Since each class inherits from `general_Regression`, it has all the functionality of that class.

#### Model for Goalkeepers

In [283]:
# Baseline model for the G (goalkeeper) position.
# No `type` is passed, so the class default is used (presumably linear regression -- confirm).
g_model = em.G_Pos(train, scale = 'log')
g_model.perform_CV()


MSE for train: mean: 1.22591402345411 std: 0.006828025727353497
MSE for test:  mean: 1.2314295300362061  std: 0.06205934508636801

RMSE for train: mean: 1.1072057193888885 std: 0.003085191508442716
RMSE for test: mean: 1.1093485822244284 std: 0.027843368202442785

R^2 for train: mean: 0.36692526251810664 std: 0.002074083260167302
R^2 for test: mean: 0.3633141191758053 std: 0.019922672686285874

MAE for train: mean: 0.8809492466319812 std: 0.0021147368907914045
MAE for test: mean: 0.8826425924614612 std: 0.018849214847860975

MAPE for train: mean: 0.06322154612302026 std: 0.00016149021925135283
MAPE for test: mean: 0.06333693700665695 std: 0.0014310499490877196



In [284]:
# Hyperparameter search (n_iter=100, cv=3) to try to improve on the G baseline.
# NOTE(review): this calls the *general* tuner on the full `train` frame, and the exact same
# call is reused for the D, M and F cells further down -- confirm whether a
# goalkeeper-specific search was intended here.
g_hp = em.hyperparameter_tuning_general(train,n_iter=100,cv=3,scale='log')
print(g_hp.best_score)   # best score found by the search
print(g_hp.best_params)  # dict holding the chosen model name and its parameters

0.5440535397186607
{'model': 'GBR', 'param': {'max_depth': 10, 'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 8, 'bootstrap': True}}


In [285]:
# Check the best model from hyperparameter tuning with a full cross-validation.
# Compare the train and test metrics it prints to spot overfitting.
g_hp.best_model.perform_CV()

MSE for train: mean: 0.0001265490405826083 std: 3.858145519361166e-05
MSE for test:  mean: 1.1236305785108807  std: 0.0636715481585731

RMSE for train: mean: 0.011119527144029145 std: 0.0017044520162818775
RMSE for test: mean: 1.059607755694533 std: 0.029359539895506347

R^2 for train: mean: 0.9999350009314828 std: 1.9755369084949526e-05
R^2 for test: mean: 0.4184853441239145 std: 0.05431898051410733

MAE for train: mean: 0.00690284480853718 std: 0.0010574523013200964
MAE for test: mean: 0.8273108407811494 std: 0.018498598615182306

MAPE for train: mean: 0.0005062996284798591 std: 7.748879245767376e-05
MAPE for test: mean: 0.05928246734583968 std: 0.001140168843599241



In [None]:
# Note that this model is overfitting (train R^2 is ~1.0 while test R^2 is ~0.42), so we need to change it.

In [None]:
# Baseline model for the D-position.
# NOTE(review): the recorded output of this cell is identical, digit for digit, to the
# recorded output of the F-position baseline cell -- verify that D_Pos and F_Pos
# really use different feature sets.
d_model = em.D_Pos(train,scale='log')
d_model.perform_CV()

MSE for train: mean: 0.987961258589283 std: 0.006683446288828631
MSE for test:  mean: 1.0063665170562737  std: 0.05897083160872418

RMSE for train: mean: 0.9939567128459608 std: 0.0033632689061734913
RMSE for test: mean: 1.0027500638763334 std: 0.029305740944106325

R^2 for train: mean: 0.48980112750348326 std: 0.0034804301073545238
R^2 for test: mean: 0.47913538082648904 std: 0.03334695611751439

MAE for train: mean: 0.7761123498824781 std: 0.002423170625815895
MAE for test: mean: 0.78161709972238 std: 0.02159430280116342

MAPE for train: mean: 0.05585964255073576 std: 0.00019635286940126447
MAPE for test: mean: 0.056234200798305836 std: 0.0017530010961268998



In [None]:
# Let's look at the D-position and try to make it better.
# From hypertuning, it looks like a random forest regressor works well.
# NOTE(review): this is the same general tuner call used for the G, M and F cells --
# confirm whether a D-specific search was intended.

d_hp = em.hyperparameter_tuning_general(train,n_iter=100,cv=3,scale='log')
# Report the score first and then the parameters, matching the G, M and F tuning cells
# (previously best_params was printed twice and best_score was never shown).
print(d_hp.best_score)
print(d_hp.best_params)

MSE for train: mean: 0.04673264513570574 std: 0.0008085874088795808
MSE for test:  mean: 0.6450817785114202  std: 0.03528497002399829

RMSE for train: mean: 0.21616915008907012 std: 0.0018824679213261428
RMSE for test: mean: 0.8028726433634181 std: 0.02184712910334101

R^2 for train: mean: 0.9758672692336485 std: 0.000374529633579481
R^2 for test: mean: 0.6661520461746238 std: 0.01926588172029876

MAE for train: mean: 0.1522410707387417 std: 0.0010449607478465065
MAE for test: mean: 0.6235159357771702 std: 0.014977371459057946

MAPE for train: mean: 0.010976332419750842 std: 7.7191341816692e-05
MAPE for test: mean: 0.044735220579156895 std: 0.0011421604953018671



In [None]:
# Cross-validate the best model found by the D-position hyperparameter search.
d_hp.best_model.perform_CV()

In [None]:
# Baseline model for the M-position.
# No `type` is passed, so the class default is used (presumably linear regression -- confirm).
m_model = em.M_Pos(train,scale='log')
m_model.perform_CV()

MSE for train: mean: 0.9843446354184389 std: 0.006599502477681914
MSE for test:  mean: 1.0026610274759913  std: 0.058225505881492265

RMSE for train: mean: 0.9921358620155323 std: 0.00332666817336718
RMSE for test: mean: 1.000909066751958 std: 0.029018400536829497

R^2 for train: mean: 0.4916687454630805 std: 0.003446101566907871
R^2 for test: mean: 0.48105081799215926 std: 0.03304449514312531

MAE for train: mean: 0.7752565200774655 std: 0.002377056734925013
MAE for test: mean: 0.7809257187143217 std: 0.021203109052552748

MAPE for train: mean: 0.05580615727370001 std: 0.00019403533453101512
MAPE for test: mean: 0.05619175596846023 std: 0.0017327890978573774



In [None]:
# Let's improve the M-model with a hyperparameter search (n_iter=100, cv=3).
# NOTE(review): same general tuner call as the G, D and F cells -- confirm whether an
# M-specific search was intended.
m_hp = em.hyperparameter_tuning_general(train,n_iter=100,cv=3,scale='log')
print(m_hp.best_score)   # best score found by the search
print(m_hp.best_params)  # chosen model name and its parameters

In [None]:
# Cross-validate the best model found by the M-position hyperparameter search.
m_hp.best_model.perform_CV()

In [None]:
# Baseline model for the F (forward) position.
# No `type` is passed, so the class default is used (presumably linear regression -- confirm).
f_model = em.F_Pos(train,scale='log')
f_model.perform_CV()

MSE for train: mean: 0.987961258589283 std: 0.006683446288828631
MSE for test:  mean: 1.0063665170562737  std: 0.05897083160872418

RMSE for train: mean: 0.9939567128459608 std: 0.0033632689061734913
RMSE for test: mean: 1.0027500638763334 std: 0.029305740944106325

R^2 for train: mean: 0.48980112750348326 std: 0.0034804301073545238
R^2 for test: mean: 0.47913538082648904 std: 0.03334695611751439

MAE for train: mean: 0.7761123498824781 std: 0.002423170625815895
MAE for test: mean: 0.78161709972238 std: 0.02159430280116342

MAPE for train: mean: 0.05585964255073576 std: 0.00019635286940126447
MAPE for test: mean: 0.056234200798305836 std: 0.0017530010961268998



In [None]:
# Let's improve the F-model with a hyperparameter search (n_iter=100, cv=3).
# NOTE(review): same general tuner call as the G, D and M cells -- confirm whether an
# F-specific search was intended.
f_hp = em.hyperparameter_tuning_general(train,n_iter=100,cv=3,scale='log')
print(f_hp.best_score)   # best score found by the search
print(f_hp.best_params)  # chosen model name and its parameters

In [None]:
# Cross-validate the best model found by the F-position hyperparameter search.
f_hp.best_model.perform_CV()

## The ensamble model
This is a new class that builds a model for each position and puts them together.  The next cell shows how to set it up.

In [None]:
# Example: configure the ensamble model, fit it, and inspect its predictions.

# Create the (not yet fitted) ensamble; per-position settings are registered next.
ex = em.ensamble_model(scale='log')

# Per-position model choices. Any position that is not configured (M here)
# is left as plain linear regression.
ex.G_parameters(type='LR')                 # G model: linear regression
ex.D_parameters(type='RIDGE', alpha=5)     # D model: linear regression with ridge regularization
ex.F_parameters(type='RFR', max_depth=10)  # F (forwards) model: random forest with max depth of 10

# Fit the configured models on the training data.
ex.fit(train)

# The model is now ready to predict. Work on a copy of `train` and put the
# target column through the model's own scaling before comparing.
result = train.copy()
target_col = ex.target
result[target_col] = ex.scale_target(result[target_col])

# Predictions for the training rows and their absolute residuals.
result['prediction'] = ex.predict(train)
result['residual'] = (result[target_col] - result['prediction']).abs()

display(result)


Unnamed: 0,name,dob,pos,height,foot,date,market_value,adjusted_market_value,team,league,...,accuratePass,accurateLongBalls,accurateCross,accurateKeeperSweeper,expectedAssists,expectedGoals,xGChain,xGBuildup,prediction,residual
0,noah mbamba,2005-01-04,D,187.0,right,2024-02-03,3000000.0,14.914123,Bayer 04 Leverkusen,Bundesliga,...,13.523810,0.619048,0.047619,0.000000,0.025627,0.001195,0.000000,0.000000,13.625808,1.288315
1,zachary duncan,2000-05-29,M,183.0,right,2021-11-07,300000.0,12.815613,AGF,Superligaen,...,9.000000,0.458333,0.041667,0.000000,0.000000,0.000000,0.000000,0.000000,13.514280,0.698667
2,manuel neuer,1986-03-26,G,193.0,right,2024-10-19,4000000.0,15.201805,FC Bayern München,Bundesliga,...,27.122881,5.411017,0.000000,0.495763,0.001084,0.000969,0.271096,0.270107,15.496563,0.294758
3,mickel miller,1995-12-01,M,173.0,left,2024-04-12,250000.0,12.429220,Plymouth Argyle,Championship,...,13.837209,0.697674,0.418605,0.000000,0.067950,0.015244,0.000000,0.000000,13.544847,1.115626
4,gaetano monachello,1994-03-02,F,185.0,left,2016-05-15,900000.0,13.991030,Atalanta,Serie A,...,5.700000,0.200000,0.100000,0.000000,0.000000,0.111925,0.161218,0.061587,14.019406,0.028376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9362,michael svoboda,1998-10-14,D,195.0,right,2024-10-20,1500000.0,14.220976,Venezia,Serie A,...,29.730769,2.653846,0.038462,0.000000,0.038870,0.042371,0.117319,0.117319,14.464836,0.243859
9363,kalifa coulibaly,1991-08-20,F,197.0,right,2022-05-21,1500000.0,14.370794,Nantes,Ligue 1,...,7.405941,0.099010,0.069307,0.000000,0.050634,0.219190,0.265225,0.053416,14.604443,0.233650
9364,nordin jackers,1997-09-04,G,185.0,right,2024-04-28,900000.0,13.710151,Club Brugge KV,"First Division A, Championship Round",...,20.742857,10.514286,0.000000,0.200000,0.000000,0.000000,0.000000,0.000000,13.666569,0.043582
9365,mads kikkenborg,1999-10-06,G,197.0,right,2023-12-03,800000.0,13.647174,Lyngby,Superligaen,...,16.488372,7.023256,0.000000,0.488372,0.000000,0.000000,0.000000,0.000000,13.922657,0.275484


# Now let's say you want to figure out which model is best through cross-validation.
You can set the model up as above, but then call `perform_CV(train)` to run a cross-validation with the parameters you put in.
For example:

In [None]:
# Make the ensamble model object
ensamble = em.ensamble_model(scale='log')

ensamble.G_parameters(type = 'DT') # G model: 'DT' (decision tree) with no extra parameters -- not linear regression
ensamble.D_parameters(type = 'RIDGE', alpha=5)  # D model: linear regression with ridge regularization (alpha=5)
ensamble.F_parameters(type='RFR', max_depth=10)   # F (forwards) model: random forest with max depth of 10

# M is not configured in this cell; presumably it keeps the default model -- confirm.
# Cross-validate the whole ensamble on the training data.
ensamble.perform_CV(train)

MSE for train: mean: 0.7454830095134034 std: 0.00572234405339352
MSE for test:  mean: 0.996919386291642  std: 0.06584344309304907

RMSE for train: mean: 0.8634072224588488 std: 0.0033132641457367345
RMSE for test: mean: 0.9979164769063393 std: 0.032895157857675425

R^2 for train: mean: 0.6150215229061187 std: 0.002876980265179195
R^2 for test: mean: 0.48434522954280174 std: 0.03141278041223129

MAE for train: mean: 0.6423923594436958 std: 0.0022225905334812292
MAE for test: mean: 0.7770604159011327 std: 0.020860191668480275

MAPE for train: mean: 0.04610834073999311 std: 0.00016847813935349205
MAPE for test: mean: 0.05583895019415822 std: 0.0015787696932722717



In [None]:
# Ensamble with explicit settings for all four positions, evaluated by cross-validation.
# Note: this rebinds `ex`, replacing the ensamble fitted in the earlier example cell.
ex = em.ensamble_model(scale='log')

# G model: 'DT' (decision tree), kept shallow.
ex.G_parameters(
    type='DT',
    max_depth=3,
    max_features=0.75,
    min_samples_split=5,
    min_samples_leaf=4,
)

# D model: random forest without bootstrapping.
ex.D_parameters(
    type='RFR',
    max_depth=10,
    n_estimators=100,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=False,
)

# M model: random forest with unlimited depth.
ex.M_parameters(
    type='RFR',
    max_depth=None,
    n_estimators=200,
    max_features=0.25,
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True,
)

# F model: small random forest (10 trees).
ex.F_parameters(
    type='RFR',
    max_depth=10,
    n_estimators=10,
    max_features=0.5,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=True,
)

ex.perform_CV(train)

MSE for train: mean: 0.2895754092964745 std: 0.0049122880909785605
MSE for test:  mean: 0.7544538816507846  std: 0.04213843955709652

RMSE for train: mean: 0.5381028615146602 std: 0.004551892596368877
RMSE for test: mean: 0.868253455109634 std: 0.02428619651092614

R^2 for train: mean: 0.8504566883179487 std: 0.0026511860900081453
R^2 for test: mean: 0.609610613484545 std: 0.021962336366659434

MAE for train: mean: 0.3937592153202706 std: 0.0021538266569940453
MAE for test: mean: 0.6783822132082473 std: 0.01645014332380803

MAPE for train: mean: 0.02838953346982533 std: 0.0001657208639421521
MAPE for test: mean: 0.048613288823953514 std: 0.001302318254336439



## Hyperparameter tuning

This takes the set of possible parameters defined in `ensamble_model.py` and randomly samples from them on each of the `n_iter` iterations (`n_iter=1000` in the cell below).

In [None]:
# Let's try some hyperparameter tuning of the full ensamble (n_iter=1000, cv=4).
# NOTE(review): unlike the other cells, no `scale` argument is passed here -- confirm
# that the default matches the scale='log' used elsewhere in this notebook.

hp = em.hyperparameter_tuning(train,n_iter=1000,cv=4)

# Once the search is done, it keeps the best model, the parameters for that model, and the score
# (the score is RMSE for now; it can be changed later to whatever score you want to use).

print(hp.best_params)  # best parameters, as a dictionary keyed by position
print(hp.best_score)   # the best RMSE
print(hp.best_model)   # the ensamble_model() object that has the best parameters and score above

# hp.best_model can now be used for predictions, and saved later once we have the one we want.

# For example:
best_model_prediction = hp.best_model.predict(train)


{'G': {'model': 'KNN', 'param': {'n_neighbors': 6}}, 'D': {'model': 'RFR', 'param': {'max_depth': None, 'n_estimators': 50, 'max_features': 0.5, 'min_samples_split': 10, 'min_samples_leaf': 1, 'bootstrap': False}}, 'M': {'model': 'RFR', 'param': {'max_depth': 10, 'n_estimators': 20, 'max_features': 0.75, 'min_samples_split': 5, 'min_samples_leaf': 2, 'bootstrap': True}}, 'F': {'model': 'GBR', 'param': {'max_depth': 5, 'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 7, 'bootstrap': False}}}
0.9097402676223301
<ensamble_model.ensamble_model object at 0x76d3c16d2ff0>


In [None]:
hp.best_model.perform_CV(train)

MSE for train: mean: 0.18186342164730637 std: 0.0018113113554112945
MSE for test:  mean: 0.8201073298117496  std: 0.04687169637008292

RMSE for train: mean: 0.42644918521680386 std: 0.002124635317064425
RMSE for test: mean: 0.9052201114094249 std: 0.026151093890267766

R^2 for train: mean: 0.9060828984349527 std: 0.0009341586927974755
R^2 for test: mean: 0.5754906654398151 std: 0.02648926449754054

MAE for train: mean: 0.25899913440105615 std: 0.001670153125031759
MAE for test: mean: 0.6985663339637462 std: 0.02059325913710562

MAPE for train: mean: 0.01865477591382816 std: 0.00012833621135423424
MAPE for test: mean: 0.050136511865538205 std: 0.0014348098122654064

