In [73]:
import importlib
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import ensamble_model as em
importlib.reload(em)

# Training split, read straight from its CSV.
train = pd.read_csv('../../data/main_data/train/train.csv')

# Test split. `fd` is kept as a cell-level name holding the path of the
# most recently read file.
fd = '../../data/main_data/test/test.csv'
test = pd.read_csv(fd)

## How the ensemble model works and what it is doing underneath all of the code

The `general` object below is a `general_Regression` object that can be switched to any supported model: linear regression, Lasso, Ridge, decision tree, random forest, or gradient boosting regression.  You can also pass that model's parameters through the `**kwargs` argument and they are forwarded by name, for example `max_depth=6` for a decision tree.


In [74]:
# Build the general-purpose regressor on the training frame; type='LR' selects
# plain linear regression.
general = em.general_Regression(train,type='LR')
# Cross-validate it; this prints mean/std of MSE, RMSE, R^2, MAE and MAPE for
# the train and test folds.
general.perform_CV()

MSE for train: mean: 49961930100825.71 std: 1766190834328.662
MSE for test:  mean: 52623128336151.76  std: 16157787424203.045

RMSE for train: mean: 7067261.791999064 std: 125462.60070582716
RMSE for test: mean: 7167938.0340295015 std: 1115254.5343844378

R^2 for train: mean: 0.4080075808979379 std: 0.006344560557093116
R^2 for test: mean: 0.3709262457835678 std: 0.07693190732989633

MAE for train: mean: 3607039.4571535764 std: 52779.976526211336
MAE for test: mean: 3660546.333153973 std: 185378.77754387652

MAPE for train: mean: 4.104013257779082 std: 0.05894752588706118
MAPE for test: mean: 4.1451680329698934 std: 0.5716244798784486



### Position models
The classes `G_Pos`, `D_Pos`, `M_Pos`, and `F_Pos` inherit from the `general_Regression` class above, but are specifically designed to take in only the X features that correspond to their position.   If you look at the code in ensamble_model.py, you can see which features each position uses for its model.  Since each class inherits from `general_Regression`, it has all the features of that class.

In [75]:
# Position-specific model for G, built with its default settings
# (no model type or parameters are passed here).
g_model = em.G_Pos(train)
# Cross-validate it; prints the same train/test metric summary as the general model.
g_model.perform_CV()


MSE for train: mean: 61683219833298.81 std: 2219945018031.943
MSE for test:  mean: 62205878727569.86  std: 20011922777712.785

RMSE for train: mean: 7852586.7440355215 std: 141778.91485141055
RMSE for test: mean: 7783601.32705945 std: 1273353.4894004185

R^2 for train: mean: 0.2691621147664784 std: 0.0052409632861336675
R^2 for test: mean: 0.2639280823504907 std: 0.05308575164317294

MAE for train: mean: 3838273.851915125 std: 54560.33622877767
MAE for test: mean: 3851011.914504575 std: 252278.20128958815

MAPE for train: mean: 4.717831190395815 std: 0.07551720201912203
MAPE for test: mean: 4.728688995878598 std: 0.5542047421151353



In [76]:
# Position-specific model for D, built with its default settings
# (no model type or parameters are passed here).
d_model = em.D_Pos(train)
# Cross-validate it; prints the train/test metric summary.
d_model.perform_CV()

MSE for train: mean: 50544228181063.92 std: 1802962172237.9333
MSE for test:  mean: 53121591192881.555  std: 16483703522198.58

RMSE for train: mean: 7108305.294912612 std: 127373.52700318863
RMSE for test: mean: 7200370.978870483 std: 1129711.892263365

R^2 for train: mean: 0.40111617463962174 std: 0.006332628828309911
R^2 for test: mean: 0.3656441369508611 std: 0.07528484299740598

MAE for train: mean: 3610349.5081242 std: 51834.37294181964
MAE for test: mean: 3659425.1013683877 std: 192222.99592860552

MAPE for train: mean: 4.061381454039268 std: 0.056254552993016876
MAPE for test: mean: 4.094471598801382 std: 0.5632261402184444



In [77]:
# Position-specific model for M, built with its default settings
# (no model type or parameters are passed here).
m_model = em.M_Pos(train)
# Cross-validate it; prints the train/test metric summary.
m_model.perform_CV()

MSE for train: mean: 50294405634312.64 std: 1784796950948.8962
MSE for test:  mean: 52895234591291.83  std: 16316294433226.521

RMSE for train: mean: 7090728.837536447 std: 126373.20460311424
RMSE for test: mean: 7185767.778406923 std: 1122486.5371578664

R^2 for train: mean: 0.40407152347115005 std: 0.006349294850707546
R^2 for test: mean: 0.36796804976146796 std: 0.07650070079951457

MAE for train: mean: 3603893.397093645 std: 51871.309000422945
MAE for test: mean: 3654779.804799214 std: 186489.74129049366

MAPE for train: mean: 4.0574758598876555 std: 0.05661994647567225
MAPE for test: mean: 4.094595715678373 std: 0.5810236718762408



In [78]:
# Position-specific model for F, built with its default settings
# (no model type or parameters are passed here).
# NOTE(review): the metrics recorded under this cell are identical, digit for
# digit, to the ones recorded for em.D_Pos(train) -- confirm in
# ensamble_model.py that F_Pos and D_Pos do not share a feature set by mistake.
f_model = em.F_Pos(train)
# Cross-validate it; prints the train/test metric summary.
f_model.perform_CV()

MSE for train: mean: 50544228181063.92 std: 1802962172237.9333
MSE for test:  mean: 53121591192881.555  std: 16483703522198.58

RMSE for train: mean: 7108305.294912612 std: 127373.52700318863
RMSE for test: mean: 7200370.978870483 std: 1129711.892263365

R^2 for train: mean: 0.40111617463962174 std: 0.006332628828309911
R^2 for test: mean: 0.3656441369508611 std: 0.07528484299740598

MAE for train: mean: 3610349.5081242 std: 51834.37294181964
MAE for test: mean: 3659425.1013683877 std: 192222.99592860552

MAPE for train: mean: 4.061381454039268 std: 0.056254552993016876
MAPE for test: mean: 4.094471598801382 std: 0.5632261402184444



## The ensemble model (`em.ensamble_model`)
This is a new class that builds a model for each position and puts them together.  The next cell shows how to set it up.

In [79]:
# To set up the ensemble model with your specifications, do the following:

ex = em.ensamble_model()    # creates the ensemble object, ready to take the per-position parameters and then the data for fitting

ex.G_parameters(type = 'LR') # set the G model to plain linear regression
ex.D_parameters(type = 'RIDGE', alpha=5)  # set the D model to linear regression with ridge regularization (alpha=5)
ex.F_parameters(type='RFR', max_depth=10)   # set the F (forwards) model to a random forest with max depth of 10

# Note that any position whose parameters are left out (M here) stays as plain linear regression

# Once the per-position models are configured, we can fit the ensemble to the data
ex.fit(train)

# It is now fitted to the data and ready to predict.
# NOTE(review): this predicts on the same frame the model was fitted on, so the values shown are in-sample.
prediction = ex.predict(train)

display(prediction)


Unnamed: 0,prediction
0,2.825627e+06
1,1.690582e+06
2,1.481337e+07
3,-1.193986e+06
4,1.942152e+06
...,...
9362,4.948148e+06
9363,3.138596e+06
9364,2.539480e+06
9365,3.885528e+06


### Now let's say you want to figure out which model is the best through cross-validation.
Set the model up as above, but call `perform_CV(train)` instead of `fit(train)` to cross-validate with the parameters you put in.
For example:

In [80]:
# Create the ensemble model object
ensamble = em.ensamble_model()

ensamble.G_parameters(type = 'LR') # set the G model to plain linear regression
ensamble.D_parameters(type = 'RIDGE', alpha=5)  # set the D model to linear regression with ridge regularization (alpha=5)
ensamble.F_parameters(type='RFR', max_depth=10)   # set the F (forwards) model to a random forest with max depth of 10

# Cross-validate the configured ensemble on the training frame; prints mean/std
# of MSE, RMSE, R^2, MAE and MAPE for the train and test folds.
ensamble.perform_CV(train)

MSE for train: mean: 47067774220465.875 std: 1608129583158.1545
MSE for test:  mean: 53069589118872.61  std: 15211355486132.438

RMSE for train: mean: 6859583.605203572 std: 117843.04683879399
RMSE for test: mean: 7212619.981221761 std: 1023573.2144567011

R^2 for train: mean: 0.4422764274646175 std: 0.00610570403278057
R^2 for test: mean: 0.3578213076742 std: 0.09857481553917803

MAE for train: mean: 3581415.0837480677 std: 49290.10083694372
MAE for test: mean: 3711861.345793127 std: 190021.43319106396

MAPE for train: mean: 4.004039934289492 std: 0.05615761669443768
MAPE for test: mean: 4.080809144668858 std: 0.5267677823546241



## Hyperparameter tuning

This takes the sets of candidate parameters defined in ensamble_model.py and randomly chooses among them on each of the `n_iter` iterations (`n_iter=100` in the next cell).

In [81]:
# Let's try some hyperparameter tuning.

# The search samples its candidates at random, so seed the global RNGs right
# before it runs; otherwise best_params / best_score change on every re-run.
# NOTE(review): this assumes ensamble_model draws from Python's / NumPy's global
# generators and leaves any scikit-learn random_state unset -- confirm in
# ensamble_model.py.
random.seed(42)
np.random.seed(42)

hp = em.hyperparameter_tuning(train,n_iter=100,cv=2)

# After the tuning is done, it keeps the best model, the parameters for that model, and the score
# (which is just RMSE for now; it can be changed later to whatever score you want to use)

print(hp.best_params)  # Best parameters in a dictionary object
print(hp.best_score)   # the best RMSE
print(hp.best_model)   # the ensamble_model() object that has the best parameters and score above

# hp.best_model can now be used for predictions, and saved later once we have the one we want.
# NOTE(review): the prediction below is made on the training frame, i.e. in-sample.
best_model_prediction = hp.best_model.predict(train)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'G': {'model': 'KNN', 'param': {'n_neighbors': 10}}, 'D': {'model': 'RIDGE', 'param': {'alpha': np.float64(9.224489795918368)}}, 'M': {'model': 'RFR', 'param': {'max_depth': 10, 'n_estimators': 50, 'max_features': 0.5, 'min_samples_split': 10, 'min_samples_leaf': 2, 'bootstrap': True}}, 'F': {'model': 'DT', 'param': {'max_depth': None, 'max_features': 0.75, 'min_samples_split': 2, 'min_samples_leaf': 8}}}
7040139.241624018
<ensamble_model.ensamble_model object at 0x7fd50fab15b0>


In [71]:
# Same random search with a larger budget (n_iter=200).
# Seed the global RNGs first so the reported best_params / best_score can be
# reproduced. NOTE(review): assumes ensamble_model draws from Python's / NumPy's
# global generators -- confirm in ensamble_model.py.
random.seed(42)
np.random.seed(42)

# NOTE(review): this rebinds `hp`, replacing whatever search result that name held before.
hp = em.hyperparameter_tuning(train,n_iter=200,cv=2)
display(hp.best_params, hp.best_score)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'G': {'model': 'ELASTICR', 'param': {'alpha': np.float64(5.73469387755102)}},
 'D': {'model': 'RFR',
  'param': {'max_depth': None,
   'n_estimators': 50,
   'max_features': 0.75,
   'min_samples_split': 10,
   'min_samples_leaf': 1,
   'bootstrap': False}},
 'M': {'model': 'RIDGE', 'param': {'alpha': np.float64(8.448979591836736)}},
 'F': {'model': 'RFR',
  'param': {'max_depth': 10,
   'n_estimators': 30,
   'max_features': 0.25,
   'min_samples_split': 5,
   'min_samples_leaf': 1,
   'bootstrap': False}}}

np.float64(6905655.627982973)

In [72]:
# Same random search with a larger budget (n_iter=300).
# Seed the global RNGs first so the reported best_params / best_score can be
# reproduced. NOTE(review): assumes ensamble_model draws from Python's / NumPy's
# global generators -- confirm in ensamble_model.py.
random.seed(42)
np.random.seed(42)

# NOTE(review): this rebinds `hp`, replacing whatever search result that name held before.
hp = em.hyperparameter_tuning(train,n_iter=300,cv=2)
display(hp.best_params, hp.best_score)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'G': {'model': 'ELASTICR', 'param': {'alpha': np.float64(9.806122448979592)}},
 'D': {'model': 'RFR',
  'param': {'max_depth': 5,
   'n_estimators': 20,
   'max_features': 0.5,
   'min_samples_split': 2,
   'min_samples_leaf': 4,
   'bootstrap': True}},
 'M': {'model': 'RFR',
  'param': {'max_depth': 10,
   'n_estimators': 200,
   'max_features': 0.5,
   'min_samples_split': 2,
   'min_samples_leaf': 1,
   'bootstrap': False}},
 'F': {'model': 'RIDGE', 'param': {'alpha': np.float64(7.673469387755102)}}}

np.float64(6903694.268728865)