### Importing libraries

In [1]:
#import some necessary librairies
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
from sklearn.model_selection import RandomizedSearchCV

In [2]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLars,RidgeCV
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [3]:
# import zipfile
# import pandas as pd
# zip = zipfile.ZipFile('preprocessesdData.zip')
# zip.extractall()
train =pd.read_csv('X_train.csv')
test = pd.read_csv('X_test.csv')
ytrain=pd.read_csv('y_train.csv')

### Metric function

In [4]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

## Stacking

![Image](https://camo.githubusercontent.com/fa34150cb31d02f68886584d549f300f8c290ba3/68747470733a2f2f6769746875622e636f6d2f766563786f7a2f766563737461636b2f7261772f6d61737465722f7069632f616e696d6174696f6e322e676966)

#### layer 1

Tree based models do not need data to be scaled !
so I haven't use scaling when predicting with boosting models

In [5]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9,
                                                random_state=7))
#########################################################################
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

#########################################################################

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)


In [6]:
# library used for stacking 
# !pip install vecstack



In [7]:
from vecstack import stacking

estimators = [KRR,GBoost,ENet]
X_train=train
y_train=ytrain
X_test=test
k=5

L_train_1, L_test_1=stacking(estimators,X_train,
         y_train, X_test,regression=True, 
         n_folds=k,mode='oof_pred',random_state=7, 
         verbose=2)

  y = column_or_1d(y, warn=True)


task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred]
n_models:     [3]

model  0:     [KernelRidge]
    fold  0:  [0.07751935]
    fold  1:  [0.08266582]
    fold  2:  [0.08303608]
    fold  3:  [0.07790344]
    fold  4:  [0.07958313]
    ----
    MEAN:     [0.08014157] + [0.00232152]
    FULL:     [0.08014348]

    Fitting on full train set...

model  1:     [GradientBoostingRegressor]
    fold  0:  [0.07667160]
    fold  1:  [0.08286217]
    fold  2:  [0.08281941]
    fold  3:  [0.07613338]
    fold  4:  [0.08030994]
    ----
    MEAN:     [0.07975930] + [0.00289746]
    FULL:     [0.07976141]

    Fitting on full train set...

model  2:     [Pipeline]
    fold  0:  [0.07594472]
    fold  1:  [0.07917898]
    fold  2:  [0.07852674]
    fold  3:  [0.07524053]
    fold  4:  [0.07742451]
    ----
    MEAN:     [0.07726310] + [0.00149146]
    FULL:     [0.07726437]

    Fitting on full train set...



#### layer 2

In [8]:
ENet2 = make_pipeline(RobustScaler(), ElasticNet(alpha=0.00055, l1_ratio=.45,
                                                random_state=7))
#########################################################################
KRR2 = KernelRidge(alpha=0.4, kernel='polynomial', degree=2, coef0=2.5)
#########################################################################
GBoost2 = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,
                                   max_depth=3, max_features='sqrt',
                                   min_samples_leaf=7, min_samples_split=10, 
                                   loss='huber', random_state =7)

In [9]:
#layer 2
estimatorsL2=[ENet2,KRR2,GBoost2]

L_train_2, L_test_2=stacking(estimatorsL2,L_train_1,
         y_train, L_test_1,regression=True, 
         n_folds=k,mode='oof_pred',random_state=7, 
         verbose=2)


  y = column_or_1d(y, warn=True)


task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred]
n_models:     [3]

model  0:     [Pipeline]
    fold  0:  [0.07383766]
    fold  1:  [0.07569147]
    fold  2:  [0.07536123]
    fold  3:  [0.07190660]
    fold  4:  [0.07401263]
    ----
    MEAN:     [0.07416192] + [0.00134103]
    FULL:     [0.07416357]

    Fitting on full train set...

model  1:     [KernelRidge]
    fold  0:  [0.07397262]
    fold  1:  [0.07572903]
    fold  2:  [0.07534267]
    fold  3:  [0.07191053]
    fold  4:  [0.07399440]
    ----
    MEAN:     [0.07418985] + [0.00134006]
    FULL:     [0.07419155]

    Fitting on full train set...

model  2:     [GradientBoostingRegressor]
    fold  0:  [0.07590069]
    fold  1:  [0.07828955]
    fold  2:  [0.07719493]
    fold  3:  [0.07499336]
    fold  4:  [0.07385180]
    ----
    MEAN:     [0.07604607] + [0.00156826]
    FULL:     [0.07604829]

    Fitting on full train set...



#### layer 3


In [10]:
#our estimator (hyper params have been found by randomized search)
ENet3=make_pipeline(RobustScaler(), ElasticNet(alpha=0.006, l1_ratio=0.0008,
                                                random_state=7))

In [11]:
#layer 3
L_train_3, L_test_3=stacking([ENet3],L_train_2,
         y_train, L_test_2,regression=True, 
         n_folds=k,mode='oof_pred',random_state=7, 
         verbose=1)

print(rmsle(y_train,L_train_3))

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred]
n_models:     [1]

model  0:     [Pipeline]
    ----
    MEAN:     [0.07425581] + [0.00124405]
    FULL:     [0.07425787]

    Fitting on full train set...

0.1097057540215649


  y = column_or_1d(y, warn=True)


In [12]:
stack_pred=np.expm1(L_test_3).reshape(len(L_test_3),)

#traing predictions are in logged form 
#because the y_train is still in this form too
stack_train=L_train_3.reshape(len(L_train_3),)

## Weighted average ensemble



In [13]:

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
#########################################################################
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)



**XGBoost:**

In [14]:
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))

0.07879894799249869


**LightGBM:**

In [15]:
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))

0.07307464036005416


### training error

In [16]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train,stack_train*0.7 +xgb_train_pred*0.12 + lgb_train_pred*0.18  ))

RMSLE score on train data:
0.09732256628046794


### Ensemble prediction

In [17]:
stack_pred=stack_pred.reshape(1459,)
ensemble =stack_pred*0.6 +xgb_pred*0 + lgb_pred*0.4  

In [7]:
ensemble.shape

NameError: name 'ensemble' is not defined

## Submission

In [19]:
sub = pd.DataFrame()
sub['Id'] = range(1461,1461+1459)
sub['SalePrice'] = ensemble
sub.to_csv('submission.csv',index=False)
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,120631.34209
1,1462,158404.409305
2,1463,183752.333993
3,1464,194412.748061
4,1465,195149.086338
