Average Model 1: Here I will be exploring 2 Base Models; Lasso and Elastic Net
Part of the code is referenced from Serigne's Stacked Regression Kernel: Top 4% on Leader Board
By: Serigne
@https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from scipy.stats.stats import pearsonr


%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
TrainSet = pd.read_csv('train.csv')
TestSet = pd.read_csv('test.csv')

In [3]:
TrainSet = TrainSet.drop(TrainSet[(TrainSet['GrLivArea']>4000) & (TrainSet['SalePrice']<300000)].index)

In [4]:
all_data = pd.concat((TrainSet.loc[:,'MSSubClass':'SaleCondition'],
                      TestSet.loc[:,'MSSubClass':'SaleCondition']))

In [5]:
# I drop features that have more than half of missing information or do not correlate to SalePrice
all_data.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
               'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
              axis=1, inplace=True)

In [6]:
#log transforming the target
TrainSet["SalePrice"] = np.log1p(TrainSet["SalePrice"])

#log transformation of skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = TrainSet[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [7]:
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [8]:
# Adding total sqfootage feature and removing Basement, 1st and 2nd floor features
#features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
all_data.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)

In [9]:
#creating matrices for sklearn:
train = all_data[:TrainSet.shape[0]]
test = all_data[TrainSet.shape[0]:]
y_train = TrainSet.SalePrice

In [10]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [11]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [12]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

In [13]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f}".format(score.mean()))


Lasso score: 0.1134


In [14]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [15]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f}".format(score.mean()))

ElasticNet score: 0.1135


In [16]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)  

In [17]:
#Average Base Model's Score
averaged_models = AveragingModels(models = (ENet, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f}".format(score.mean()))

 Averaged base models score: 0.1134


In [18]:
averaged_models.fit(train.values, y_train)
averaged_train_pred = averaged_models.predict(train.values)
averaged_pred = np.expm1(averaged_models.predict(test.values))

In [19]:
solution = pd.DataFrame({"id":TestSet.Id, "SalePrice": averaged_pred})
solution.to_csv("Two_Avg.csv", index = False)