# Initialization

In [2]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import pandas as pd
import numpy as np

Preparing the **dataset** for training and evaluating

In [3]:
data = pd.read_csv('./cleaned_train.csv', index_col = 0)
data.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
count,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,...,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0
mean,7.146978,57.296016,10448.784341,0.995879,1.946429,0.0625,6.088599,5.576236,1971.18544,1984.819368,...,0.082418,0.00206,0.868819,0.068681,0.002747,0.008242,0.013736,0.822115,0.084478,12.02195
std,3.65605,33.877802,9860.763449,0.064084,1.408326,0.276593,1.369669,1.113966,30.20159,20.652143,...,0.275094,0.045361,0.337715,0.252998,0.05236,0.09044,0.116434,0.382547,0.278199,0.396077
min,0.0,0.0,1300.0,0.0,0.0,0.0,1.0,1.0,1872.0,1950.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.460271
25%,5.0,42.0,7538.75,1.0,0.0,0.0,5.0,5.0,1954.0,1966.75,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,11.774528
50%,6.0,63.0,9468.5,1.0,3.0,0.0,6.0,5.0,1972.0,1993.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,12.001512
75%,10.0,79.0,11588.0,1.0,3.0,0.0,7.0,6.0,2000.0,2004.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,12.273736
max,15.0,313.0,215245.0,1.0,3.0,2.0,10.0,9.0,2010.0,2010.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.345509


In [4]:
test = pd.read_csv('./cleaned_test.csv', index_col = 0)
test.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,7.113777,57.910212,9819.161069,0.995888,1.952707,0.045236,6.078821,5.553804,1971.357779,1983.662783,...,0.080192,0.002742,0.86292,0.061001,0.005483,0.008225,0.01782,0.825223,0.082248,2190.0
std,3.743775,32.266054,4955.517327,0.064018,1.410751,0.217566,1.436812,1.11374,30.390071,21.130467,...,0.271683,0.052306,0.34405,0.239414,0.073871,0.090348,0.132344,0.379907,0.274837,421.321334
min,0.0,0.0,1470.0,0.0,0.0,0.0,1.0,1.0,1879.0,1950.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1461.0
25%,5.0,44.0,7391.0,1.0,0.0,0.0,5.0,5.0,1953.0,1963.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1825.5
50%,5.0,63.0,9399.0,1.0,3.0,0.0,6.0,5.0,1973.0,1992.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2190.0
75%,10.0,78.0,11517.5,1.0,3.0,0.0,7.0,6.0,2001.0,2004.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2554.5
max,15.0,200.0,56600.0,1.0,3.0,2.0,10.0,9.0,2010.0,2010.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2919.0


In [5]:
x_train = data.drop(labels=['SalePrice'],axis=1,inplace=False)
y_train = data['SalePrice'].values
x_train.reset_index(inplace=True, drop=True)

In [6]:
test_id = test['Id']
x_test = test.drop(labels=['Id'], axis=1)
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Columns: 220 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(14), int64(206)
memory usage: 2.5 MB


Using **Cross validation** strategy to evaluate my models

In [7]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=17).get_n_splits(x_train.values)
    mse= np.sqrt(-cross_val_score(model, x_train.values, y_train, scoring="neg_mean_squared_log_error", cv = kf))
    return(mse)

# Models

**LASSO Regression** :
This model may be very sensitive to outliers. So we need to made it more robust on them. For that we use the sklearn's **Robustscaler()** method on pipeline

In [8]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=17))

**Elastic Net Regression** :
again made robust to outliers

In [9]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

**Kernel Ridge Regression**:

In [10]:
KRR = KernelRidge(alpha=1, kernel='polynomial', degree=2, coef0=2.5)

**Gradient Boosting Regression**:
With huber loss that makes it robust to outliers

In [11]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

**XGBoost**:

In [12]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

**LightGBM**:

In [13]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

## Scores

In [14]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.0088 (0.0005)



In [15]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.0088 (0.0005)



In [16]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.0625 (0.0223)



  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,


In [17]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.0091 (0.0007)



In [18]:
score = rmsle_cv(model_xgb)
print("\nXGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

In [19]:
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

LGBM score: 0.0089 (0.0005)



# Stacking models

In [20]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.base_models_ = [clone(x) for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=172)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)


In [21]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

# Predicting House Prices

In [22]:
averaged_models = AveragingModels(models = (ENet, GBoost, model_lgb, lasso, model_xgb))
averaged_models.fit(x_train, y_train)
pred = np.expm1(averaged_models.predict(x_test))

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [23]:
submission = pd.DataFrame()
submission['Id'] = test_id
submission['SalePrice'] = pred
submission.to_csv('submission.csv', index=False)

In [24]:
submission

Unnamed: 0,Id,SalePrice
0,1461,123002.605752
1,1462,155211.690324
2,1463,183554.292334
3,1464,194860.635418
4,1465,190806.229269
...,...,...
1454,2915,83816.940571
1455,2916,83515.769847
1456,2917,157926.672372
1457,2918,119664.729212
