In [22]:
import importlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ensamble_model as em
# Re-import ensamble_model so edits to the .py file are picked up without restarting the kernel.
importlib.reload(em)

# Train data: the frame handed to every model, CV and tuning call in this notebook section.
fd = '../../data/main_data/train/train.csv'
train = pd.read_csv(fd)

# Test data: loaded here; none of the cells visible in this section reference `test`.
# `fd` is reused, so after this cell it holds the test path, not the train path.
fd = '../../data/main_data/test/test.csv'
test = pd.read_csv(fd)

## How the ensemble model works and what it does under the hood

The object below is a general_Regression object whose underlying model can be changed to any supported model, such as linear regression, Lasso, Ridge, decision tree, random forest, or gradient boosting regression. You can also pass the parameters for the chosen model through the `**kwargs` argument; they are matched by name, for example `max_depth=6` for a decision tree.


In [23]:
# Baseline general_Regression model on the training frame; type='LR' selects plain linear regression.
general = em.general_Regression(train,type='LR')
# Runs the cross-validation and prints mean/std of MSE, RMSE, R^2, MAE and MAPE for train and test.
general.perform_CV()

MSE for train: mean: 0.9718164149663101 std: 0.006342523732242686
MSE for test:  mean: 0.9921637184791535  std: 0.05613728312166448

RMSE for train: mean: 0.9858022401021957 std: 0.0032184430712350804
RMSE for test: mean: 0.9956784978400008 std: 0.028072182284833106

R^2 for train: mean: 0.4981374205877124 std: 0.0034748163489597855
R^2 for test: mean: 0.4863840926024496 std: 0.03369637922275653

MAE for train: mean: 0.7713606371136246 std: 0.0022404362480961284
MAE for test: mean: 0.777757425708384 std: 0.01935288010777617

MAPE for train: mean: 0.055535082158550034 std: 0.00018568276535180768
MAPE for test: mean: 0.05597502483068467 std: 0.001629273965097209



### Position models
The classes G_Pos, D_Pos, M_Pos, and F_Pos inherit from the general_Regression class above, but each is specifically designed to take in only the features (X) that correspond to its position. If you look at the code in ensamble_model.py, you can see which features each position uses for its model. Since these classes inherit the code from general_Regression, they have all the functionality of that class.

In [24]:
# G-position model built with default settings (no type/kwargs passed).
# Presumably defaults to linear regression -- confirm in ensamble_model.py.
g_model = em.G_Pos(train)
# Prints the same cross-validation metric summary as general_Regression.perform_CV().
g_model.perform_CV()


MSE for train: mean: 1.22591402345411 std: 0.006828025727353497
MSE for test:  mean: 1.2314295300362061  std: 0.06205934508636801

RMSE for train: mean: 1.1072057193888885 std: 0.003085191508442716
RMSE for test: mean: 1.1093485822244284 std: 0.027843368202442785

R^2 for train: mean: 0.36692526251810664 std: 0.002074083260167302
R^2 for test: mean: 0.3633141191758053 std: 0.019922672686285874

MAE for train: mean: 0.8809492466319812 std: 0.0021147368907914045
MAE for test: mean: 0.8826425924614612 std: 0.018849214847860975

MAPE for train: mean: 0.06322154612302026 std: 0.00016149021925135283
MAPE for test: mean: 0.06333693700665695 std: 0.0014310499490877196



In [25]:
# D-position model built with default settings (no type/kwargs passed).
d_model = em.D_Pos(train)
# Prints the cross-validation metric summary for the D-position feature set.
d_model.perform_CV()

MSE for train: mean: 0.987961258589283 std: 0.006683446288828631
MSE for test:  mean: 1.0063665170562737  std: 0.05897083160872418

RMSE for train: mean: 0.9939567128459608 std: 0.0033632689061734913
RMSE for test: mean: 1.0027500638763334 std: 0.029305740944106325

R^2 for train: mean: 0.48980112750348326 std: 0.0034804301073545238
R^2 for test: mean: 0.47913538082648904 std: 0.03334695611751439

MAE for train: mean: 0.7761123498824781 std: 0.002423170625815895
MAE for test: mean: 0.78161709972238 std: 0.02159430280116342

MAPE for train: mean: 0.05585964255073576 std: 0.00019635286940126447
MAPE for test: mean: 0.056234200798305836 std: 0.0017530010961268998



In [26]:
# M-position model built with default settings (no type/kwargs passed).
m_model = em.M_Pos(train)
# Prints the cross-validation metric summary for the M-position feature set.
m_model.perform_CV()

MSE for train: mean: 0.9843446354184389 std: 0.006599502477681914
MSE for test:  mean: 1.0026610274759913  std: 0.058225505881492265

RMSE for train: mean: 0.9921358620155323 std: 0.00332666817336718
RMSE for test: mean: 1.000909066751958 std: 0.029018400536829497

R^2 for train: mean: 0.4916687454630805 std: 0.003446101566907871
R^2 for test: mean: 0.48105081799215926 std: 0.03304449514312531

MAE for train: mean: 0.7752565200774655 std: 0.002377056734925013
MAE for test: mean: 0.7809257187143217 std: 0.021203109052552748

MAPE for train: mean: 0.05580615727370001 std: 0.00019403533453101512
MAPE for test: mean: 0.05619175596846023 std: 0.0017327890978573774



In [27]:
# F-position (forwards) model built with default settings (no type/kwargs passed).
f_model = em.F_Pos(train)
# NOTE(review): the recorded output of this cell is digit-for-digit identical to the
# D_Pos cell's output. Verify that F_Pos uses its own feature list rather than D_Pos's.
f_model.perform_CV()

MSE for train: mean: 0.987961258589283 std: 0.006683446288828631
MSE for test:  mean: 1.0063665170562737  std: 0.05897083160872418

RMSE for train: mean: 0.9939567128459608 std: 0.0033632689061734913
RMSE for test: mean: 1.0027500638763334 std: 0.029305740944106325

R^2 for train: mean: 0.48980112750348326 std: 0.0034804301073545238
R^2 for test: mean: 0.47913538082648904 std: 0.03334695611751439

MAE for train: mean: 0.7761123498824781 std: 0.002423170625815895
MAE for test: mean: 0.78161709972238 std: 0.02159430280116342

MAPE for train: mean: 0.05585964255073576 std: 0.00019635286940126447
MAPE for test: mean: 0.056234200798305836 std: 0.0017530010961268998



## The ensemble model
This is a new class that builds a model for each position and puts them together. The next cell shows how to set it up.

In [28]:
# To set up the ensemble model with your own specifications, do the following:

ex = em.ensamble_model()    # creates the object, ready to take in the per-position parameters and the data for fitting

ex.G_parameters(type = 'LR') # G model: plain linear regression
ex.D_parameters(type = 'RIDGE', alpha=5)  # D model: linear regression with ridge regularization (alpha=5)
ex.F_parameters(type='RFR', max_depth=10)   # F (forwards) model: random forest with max depth of 10

# Note that any position whose parameters are left out (M here) stays as plain linear regression.

# Once the per-position models are configured, fit the ensemble to the data.
ex.fit(train)

# The model is now fitted and ready to predict.
# Here it predicts on the same frame it was fitted on (in-sample), purely to demonstrate the API.

prediction = ex.predict(train)

display(prediction)


Unnamed: 0,prediction
0,2.677650
1,2.671885
2,2.794739
3,2.676423
4,2.710603
...,...
9362,2.735036
9363,2.739177
9364,2.682696
9365,2.698147


# Now let's say you want to figure out which model is the best through cross-validation.
You can set the model up as above, but now call perform_CV(train) to run a cross-validation with the parameters you configured.
For example:

In [29]:
# Create the ensamble_model object and configure the per-position models.
ensamble = em.ensamble_model()

ensamble.G_parameters(type = 'LR') # G model: plain linear regression
ensamble.D_parameters(type = 'RIDGE', alpha=5)  # D model: linear regression with ridge regularization (alpha=5)
ensamble.F_parameters(type='RFR', max_depth=10)   # F (forwards) model: random forest with max depth of 10

# Cross-validate the configured ensemble on the training frame and print the metric summary.
# NOTE(review): the recorded output shows R^2 of about -84 and MAPE of about 0.91, far worse than
# any single-position model above, along with a SettingWithCopyWarning raised at
# `data[self.target] = np.log1p(data[self.target])`. This looks like predictions and targets
# being scored on different scales (log1p vs. raw) -- confirm in ensamble_model.py before
# trusting these numbers.
ensamble.perform_CV(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[self.target] = np.log1p(data[self.target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[self.target] = np.log1p(data[self.target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[self.target] = np.log1p(data[self.target])
A value is trying to be set on a copy of a slice from a DataF

MSE for train: mean: 164.09485232667075 std: 0.13790574634211408
MSE for test:  mean: 164.0940503929265  std: 1.2480116608094936

RMSE for train: mean: 12.809950170326687 std: 0.005381488460096055
RMSE for test: mean: 12.809826973938536 std: 0.048818958235104445

R^2 for train: mean: -83.7417625791968 std: 0.3307173776138773
R^2 for test: mean: -83.95973849043654 std: 3.0525403854499404

MAE for train: mean: 12.73548528708964 std: 0.005261674081109263
MAE for test: mean: 12.735436049753107 std: 0.04761951136045132

MAPE for train: mean: 0.9059221617023644 std: 2.9210581255990124e-05
MAPE for test: mean: 0.9059185400125166 std: 0.000281486705931203



In [30]:
# Per-position configuration. 'type' picks the regressor ('DT' is presumably a decision
# tree, 'RFR' a random forest); the remaining keys are passed through as keyword arguments.
g_config = dict(type='DT', max_depth=3, max_features=0.75,
                min_samples_split=5, min_samples_leaf=4)
d_config = dict(type='RFR', max_depth=10, n_estimators=100, max_features='sqrt',
                min_samples_split=10, min_samples_leaf=2, bootstrap=False)
m_config = dict(type='RFR', max_depth=None, n_estimators=200, max_features=0.25,
                min_samples_split=5, min_samples_leaf=1, bootstrap=True)
f_config = dict(type='RFR', max_depth=10, n_estimators=10, max_features=0.5,
                min_samples_split=2, min_samples_leaf=4, bootstrap=True)

# Fresh ensemble with all four positions configured explicitly.
ex = em.ensamble_model()
ex.G_parameters(**g_config)
ex.D_parameters(**d_config)
ex.M_parameters(**m_config)
ex.F_parameters(**f_config)

# NOTE(review): the recorded output of this cell matches the previous perform_CV cell
# digit-for-digit even though the configurations differ. Check whether perform_CV
# actually uses the parameters set above.
ex.perform_CV(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[self.target] = np.log1p(data[self.target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[self.target] = np.log1p(data[self.target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[self.target] = np.log1p(data[self.target])
A value is trying to be set on a copy of a slice from a DataF

MSE for train: mean: 164.09485232667075 std: 0.13790574634211408
MSE for test:  mean: 164.0940503929265  std: 1.2480116608094936

RMSE for train: mean: 12.809950170326687 std: 0.005381488460096055
RMSE for test: mean: 12.809826973938536 std: 0.048818958235104445

R^2 for train: mean: -83.7417625791968 std: 0.3307173776138773
R^2 for test: mean: -83.95973849043654 std: 3.0525403854499404

MAE for train: mean: 12.73548528708964 std: 0.005261674081109263
MAE for test: mean: 12.735436049753107 std: 0.04761951136045132

MAPE for train: mean: 0.9059221617023644 std: 2.9210581255990124e-05
MAPE for test: mean: 0.9059185400125166 std: 0.000281486705931203



## Hyperparameter tuning

This takes the set of possible models and parameters defined in ensamble_model.py and randomly chooses among them on each of the `n_iter` iterations (`n_iter=100` in the first cell below).

In [None]:
# Let's try some hyperparameter tuning: 100 random configurations, evaluated with cv=2.
# No seed is passed here; whether the module seeds its sampling internally is not visible
# from this notebook, so reruns may select different parameters.

hp = em.hyperparameter_tuning(train,n_iter=100,cv=2)

# Once the tuning finishes, the object holds the best model, the parameters for that model, and its score
# (the score is meant to be RMSE for now; it can be changed later to whatever metric is preferred).
# NOTE(review): the recorded best_score is about 7.0e6, several orders of magnitude larger than
# the RMSE values printed by perform_CV elsewhere in this notebook -- verify what it measures.

print(hp.best_params)  # best parameters, as a dictionary keyed by position
print(hp.best_score)   # the best score (intended to be RMSE)
print(hp.best_model)   # the ensamble_model() object configured with the best parameters above

# hp.best_model can now be used for predictions, and saved later once the final model is chosen.

# For example (in-sample prediction on the training frame):
best_model_prediction = hp.best_model.predict(train)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'G': {'model': 'KNN', 'param': {'n_neighbors': 10}}, 'D': {'model': 'RIDGE', 'param': {'alpha': np.float64(9.224489795918368)}}, 'M': {'model': 'RFR', 'param': {'max_depth': 10, 'n_estimators': 50, 'max_features': 0.5, 'min_samples_split': 10, 'min_samples_leaf': 2, 'bootstrap': True}}, 'F': {'model': 'DT', 'param': {'max_depth': None, 'max_features': 0.75, 'min_samples_split': 2, 'min_samples_leaf': 8}}}
7040139.241624018
<ensamble_model.ensamble_model object at 0x7fd50fab15b0>


In [None]:
# Repeat the random search with a larger budget (n_iter=200, cv=2).
# This rebinds `hp`, so the result of the previous tuning run is no longer accessible.
hp = em.hyperparameter_tuning(train,n_iter=200,cv=2)
# Show the best parameter dictionary and its score.
display(hp.best_params, hp.best_score)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'G': {'model': 'ELASTICR', 'param': {'alpha': np.float64(5.73469387755102)}},
 'D': {'model': 'RFR',
  'param': {'max_depth': None,
   'n_estimators': 50,
   'max_features': 0.75,
   'min_samples_split': 10,
   'min_samples_leaf': 1,
   'bootstrap': False}},
 'M': {'model': 'RIDGE', 'param': {'alpha': np.float64(8.448979591836736)}},
 'F': {'model': 'RFR',
  'param': {'max_depth': 10,
   'n_estimators': 30,
   'max_features': 0.25,
   'min_samples_split': 5,
   'min_samples_leaf': 1,
   'bootstrap': False}}}

np.float64(6905655.627982973)

In [None]:
# Repeat the random search once more with n_iter=300 (cv=2).
# This rebinds `hp` again; only this run's best model remains available afterwards.
hp = em.hyperparameter_tuning(train,n_iter=300,cv=2)
# Show the best parameter dictionary and its score.
display(hp.best_params, hp.best_score)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'G': {'model': 'ELASTICR', 'param': {'alpha': np.float64(9.806122448979592)}},
 'D': {'model': 'RFR',
  'param': {'max_depth': 5,
   'n_estimators': 20,
   'max_features': 0.5,
   'min_samples_split': 2,
   'min_samples_leaf': 4,
   'bootstrap': True}},
 'M': {'model': 'RFR',
  'param': {'max_depth': 10,
   'n_estimators': 200,
   'max_features': 0.5,
   'min_samples_split': 2,
   'min_samples_leaf': 1,
   'bootstrap': False}},
 'F': {'model': 'RIDGE', 'param': {'alpha': np.float64(7.673469387755102)}}}

np.float64(6903694.268728865)