# Initialization

In [106]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import pandas as pd
import numpy as np

Preparing the **dataset** for training and evaluating

In [107]:
data = pd.read_csv('./cleaned_train.csv', index_col = 0)
data.describe()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,ExterQual,ExterCond,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,HeatingQC,2ndFlrSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
count,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,...,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0
mean,6.094715,1971.194235,1984.818806,3.785175,4.611531,4.2671580000000004e-18,-5.132781e-16,-4.740355e-16,1.834592,-1.680955e-16,...,0.003432,0.003432,0.082361,0.002059,0.868909,0.002745,0.008236,0.013727,0.821551,0.08442
std,1.376542,30.190353,20.640669,1.597508,1.054206,0.8250042,0.9589801,0.98756,2.164451,0.6567486,...,0.0585,0.0585,0.275008,0.045345,0.337616,0.052342,0.09041,0.116395,0.383022,0.278112
min,1.0,1872.0,1950.0,0.0,0.0,-2.508631,-2.726576,-4.987976,0.0,-3.524366,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,1954.0,1967.0,2.0,5.0,-0.2889312,-0.6385582,-0.6470418,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,6.0,1972.0,1994.0,5.0,5.0,0.0,0.0,-0.04024282,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,7.0,2000.0,2004.0,5.0,5.0,0.293268,0.6285,0.6684488,5.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,10.0,2010.0,2010.0,5.0,5.0,3.086749,2.585456,3.702579,5.0,3.69511,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [146]:
test = pd.read_csv('./cleaned_test.csv', index_col = 0)
test.describe()

Unnamed: 0,Id,OverallQual,YearBuilt,YearRemodAdd,ExterQual,ExterCond,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,HeatingQC,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,6.078821,1971.357779,1983.662783,3.744345,4.546265,-6.445229000000001e-17,1.801925e-16,3.743388e-16,1.824537,...,0.002742,0.002056,0.080192,0.002742,0.86292,0.005483,0.008225,0.01782,0.825223,0.082248
std,421.321334,1.436812,30.390071,21.130467,1.620691,1.149828,0.8265153,0.9568897,0.9858394,2.171502,...,0.052306,0.045314,0.271683,0.052306,0.34405,0.073871,0.090348,0.132344,0.379907,0.274837
min,1461.0,1.0,1879.0,1950.0,0.0,0.0,-2.306754,-2.645587,-3.656306,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1825.5,5.0,1953.0,1963.0,2.0,5.0,-0.2967805,-0.6503426,-0.5775882,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,2190.0,6.0,1973.0,1992.0,5.0,5.0,0.0,0.0,-0.01475348,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,2554.5,7.0,2001.0,2004.0,5.0,5.0,0.4122764,0.6387899,0.6537297,5.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,2919.0,10.0,2010.0,2010.0,5.0,5.0,4.705638,2.417964,5.504799,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [147]:
x_train = data.drop(labels=['SalePrice'],axis=1,inplace=False)
y_train = data['SalePrice'].values
x_train.reset_index(inplace=True, drop=True)

In [148]:
test_id = test['Id']
x_test = test.drop(labels=['Id'], axis=1)
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Data columns (total 76 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   OverallQual            1459 non-null   int64  
 1   YearBuilt              1459 non-null   int64  
 2   YearRemodAdd           1459 non-null   int64  
 3   ExterQual              1459 non-null   int64  
 4   ExterCond              1459 non-null   int64  
 5   BsmtFinSF1             1459 non-null   float64
 6   BsmtUnfSF              1459 non-null   float64
 7   TotalBsmtSF            1459 non-null   float64
 8   HeatingQC              1459 non-null   int64  
 9   2ndFlrSF               1459 non-null   float64
 10  GrLivArea              1459 non-null   float64
 11  FullBath               1459 non-null   int64  
 12  KitchenQual            1459 non-null   int64  
 13  Fireplaces             1459 non-null   int64  
 14  GarageCars             1459 non-null   float64
 15  MSZo

Using **Cross validation** strategy to evaluate my models

In [110]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=17).get_n_splits(x_train.values)
    mse= np.sqrt(-cross_val_score(model, x_train.values, y_train, scoring="neg_mean_squared_log_error", cv = kf))
    return(mse)

# Models

**LASSO Regression** :
This model may be very sensitive to outliers. So we need to made it more robust on them. For that we use the sklearn's **Robustscaler()** method on pipeline

In [111]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=17))

**Elastic Net Regression** :
again made robust to outliers

In [112]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

**Kernel Ridge Regression**:

In [113]:
KRR = KernelRidge(alpha=1, kernel='polynomial', degree=2, coef0=2.5)

**Gradient Boosting Regression**:
With huber loss that makes it robust to outliers

In [114]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

**XGBoost**:

In [115]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

**LightGBM**:

In [116]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

## Scores

In [117]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.0104 (0.0004)



In [118]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.0104 (0.0004)



In [119]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.0104 (0.0006)



In [120]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.0103 (0.0005)



In [121]:
score = rmsle_cv(model_xgb)
print("\nXGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

In [122]:
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

LGBM score: 0.0103 (0.0004)



# Stacking models

In [123]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)  

In [124]:
averaged_models = AveragingModels(models = (ENet, GBoost, model_lgb, lasso, model_xgb))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

# Predicting House Prices

In [152]:
averaged_models = AveragingModels(models = (ENet, GBoost, model_lgb, lasso, model_xgb))
averaged_models.fit(x_train, y_train)
pred = np.expm1(averaged_models.predict(x_test))

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [153]:
submission = pd.DataFrame()
submission['Id'] = test_id
submission['SalePrice'] = pred
submission.to_csv('submission.csv', index=False)

In [154]:
submission

Unnamed: 0,Id,SalePrice
0,1461,110013.504991
1,1462,160692.757190
2,1463,178933.424879
3,1464,190065.925833
4,1465,194509.051358
...,...,...
1454,2915,84112.113297
1455,2916,88013.221829
1456,2917,151263.975183
1457,2918,116277.817549
