# Motivation Behind this Notebook:

### Hi, I am new to Kaggle and to Data Science in general, and for some weeks now I have been trying to learn 'Model Stacking' and to start working on Kaggle datasets. Then I stumbled upon [this notebook (Stacked Regressions : Top 4% on LeaderBoard)](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard) by [Serigne](https://www.kaggle.com/serigne).

### In his notebook he demonstrated the concept of Model Stacking very effectively (by the way, thank you Serigne, you're awesome), and after going through every word he wrote, I am grateful and also a little bit smarter.


### Like me, some other users were wondering about transforming the data fields after combining the train and test sets, as this can lead to data leakage.

### So, I am trying to reproduce his pipeline while trying to remove that chance of data leakage.

## Similarities and Changes
1. Transformation of test and train data separately
2. Model Parameters will be same for comparison's sake

# Importing Data and Dependencies

In [None]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
from scipy import stats
from scipy.stats import norm, skew

# Data: read the competition CSV files from the Kaggle input directory.
# `train` and `test` are the frames every later cell mutates; `smp` holds the
# sample submission file and is not referenced again in the code shown here.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
smp = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

#Getting Info on the Data
#with open('/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt') as fhandle:
    #for line in fhandle.readlines():
        #print(line)

# Data Cleaning

In [None]:
# Scatter GrLivArea against SalePrice so unusual points stand out visually
plt.figure(figsize=(15, 10))
ax = plt.gca()
ax.scatter(train.GrLivArea, train.SalePrice, c='orange', s=90, alpha=0.4)
ax.set_ylabel('Sales Price', fontsize=15)
ax.set_xlabel('GrLivArea', fontsize=15)
ax.set_title('Checking For Outliers', fontsize=15)
ax.grid(alpha=0.5, color='lightslategrey')

# Hide the top and right borders of the plot
sp = ax.spines
for side in ('top', 'right'):
    sp[side].set_visible(False)

In [None]:
# Removing outliers: rows with GrLivArea above 4000 and SalePrice below 200000
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 200000)
train.drop(index=train[outlier_mask].index, inplace=True)

# Same scatter as before, redrawn on the cleaned training data
plt.figure(figsize=(15, 10))
ax = plt.gca()
ax.scatter(train.GrLivArea, train.SalePrice, c='orange', s=90, alpha=0.4)
ax.set_ylabel('Sales Price', fontsize=15)
ax.set_xlabel('GrLivArea', fontsize=15)
ax.set_title('Checking For Outliers', fontsize=15)
ax.grid(alpha=0.5, color='lightslategrey')

# Hide the top and right borders of the plot
sp = ax.spines
for side in ('top', 'right'):
    sp[side].set_visible(False)

### SalePrice is Skewed Positively, Therefore Normalizing the Distribution

In [None]:
# Making canvas: one row of plots per stage (before / after the log transform)
canv, axs = plt.subplots(2, 2)
canv.set_size_inches(18, 13)
canv.tight_layout(pad=7.0)

# Plotting and transforming.
# NOTE: this cell replaces train['SalePrice'] with log1p(SalePrice) in place,
# so running it a second time would apply the transform twice.
for rw, title in enumerate(('Before', 'After')):
    # Left panel: distribution of SalePrice with a fitted normal curve
    plt.sca(axs[rw][0])
    sns.distplot(train['SalePrice'], fit=norm, ax=plt.gca())

    mu, sigma = norm.fit(train['SalePrice'])  # Getting fitting parameters
    # Raw string: '\m' and '\s' are not valid Python escape sequences, so the
    # mathtext backslashes must not go through normal string escaping.
    plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
               loc='best', frameon=False)

    sp = plt.gca().spines
    sp['top'].set_visible(False)
    sp['right'].set_visible(False)
    plt.grid(alpha=0.5, color='lightslategrey')

    plt.ylabel('Frequency')
    plt.title('SalePrice Distribution {} Tranformation'.format(title))

    # Right panel: probability plot of SalePrice
    plt.sca(axs[rw][1])

    stats.probplot(train['SalePrice'], plot=plt)
    plt.title('Probability Plot {} Tranformation'.format(title))
    sp = plt.gca().spines
    sp['top'].set_visible(False)
    sp['right'].set_visible(False)

    # Transform only after the "Before" row has been drawn
    if rw == 0:
        train["SalePrice"] = np.log1p(train["SalePrice"])

# Handling Missing Data

In [None]:
# Train data: percentage of missing values per column, largest first
train_missing_pct = train.isnull().sum().div(len(train)).mul(100)
train_na = train_missing_pct[train_missing_pct != 0].sort_values(ascending=False)

# One-column table for display
train_na_df = train_na.to_frame('Missing Ratio')
train_na_df

In [None]:
# Test data: percentage of missing values per column, largest first
test_missing_pct = test.isnull().sum().div(len(test)).mul(100)
test_na = test_missing_pct[test_missing_pct != 0].sort_values(ascending=False)

# One-column table for display
test_na_df = test_na.to_frame('Missing Ratio')
test_na_df

In [None]:
# Bar chart of the missing-value percentage per feature, one figure per data set
for title, data in (('Train', train_na), ('Test', test_na)):
    plt.figure(figsize=(18, 10))
    sns.barplot(x=data.index, y=data)
    # Numeric angle: the rotation must be a number (or 'vertical' /
    # 'horizontal'); the string '90' is not accepted by recent matplotlib.
    plt.xticks(rotation=90)
    plt.xlabel('Features', fontsize=15)
    plt.ylabel('Percent of missing values', fontsize=15)
    plt.title('Percent missing data by feature in {} Data'.format(title), fontsize=15)

    # Hide the top and right borders of the plot
    sp = plt.gca().spines
    sp['top'].set_visible(False)
    sp['right'].set_visible(False)

## Going Through Columns

- **PoolQC :** data description says NA means 'No Pool'

In [None]:
# Missing PoolQC becomes the explicit category 'None' in both data sets
for frame in (train, test):
    frame['PoolQC'] = frame['PoolQC'].fillna('None')

- **MiscFeature :** data description says NA means 'No Misc Feature'

In [None]:
# Missing MiscFeature becomes the explicit category 'None' in both data sets
for frame in (train, test):
    frame["MiscFeature"] = frame["MiscFeature"].fillna("None")

- **Alley :** data description says NA means 'No Alley Access'

In [None]:
# Missing Alley becomes the explicit category 'None' in both data sets
for frame in (train, test):
    frame["Alley"] = frame["Alley"].fillna("None")

- **Fence :** data description says NA means 'No Fence'

In [None]:
# Missing Fence becomes the explicit category 'None' in both data sets
for frame in (train, test):
    frame["Fence"] = frame["Fence"].fillna("None")

- **FireplaceQu :** data description says NA means 'No Fireplace'

In [None]:
# Missing FireplaceQu becomes the explicit category 'None' in both data sets
for frame in (train, test):
    frame["FireplaceQu"] = frame["FireplaceQu"].fillna("None")

- **LotFrontage :** we will fill in missing values by the median LotFrontage of the neighborhood.

In [None]:
# Median LotFrontage per neighborhood, computed from the training data only.
# The column is selected before aggregating so the median is never attempted
# on the non-numeric columns of `train`.
mapper = train.groupby("Neighborhood")["LotFrontage"].median().to_dict()

# Fill missing LotFrontage in both sets with the train-derived neighborhood median
for frame in (train, test):
    missing = frame["LotFrontage"].isnull()
    for neighborhood, median_frontage in mapper.items():
        rows = missing & (frame["Neighborhood"] == neighborhood)
        frame.loc[rows, "LotFrontage"] = median_frontage

- **GarageType, GarageFinish, GarageQual and GarageCond :** Replacing missing data with 'None'

In [None]:
# Categorical garage columns: a missing value becomes the category 'None'
garage_cat_cols = ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond')
for frame in (train, test):
    for col in garage_cat_cols:
        frame[col] = frame[col].fillna('None')

- **GarageYrBlt, GarageArea and GarageCars :** Replacing missing data with 0 

In [None]:
# Numeric garage columns: a missing value becomes 0
garage_num_cols = ('GarageYrBlt', 'GarageArea', 'GarageCars')
for frame in (train, test):
    for col in garage_num_cols:
        frame[col] = frame[col].fillna(0)

- **BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :** missing values to '0'

In [None]:
# Numeric basement columns: a missing value becomes 0
bsmt_num_cols = ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath')
for frame in (train, test):
    for col in bsmt_num_cols:
        frame[col] = frame[col].fillna(0)

- **BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2 :** NaN to 'None'

In [None]:
# Categorical basement columns: a missing value becomes the category 'None'
bsmt_cat_cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')
for frame in (train, test):
    for col in bsmt_cat_cols:
        frame[col] = frame[col].fillna('None')

- **MasVnrArea and MasVnrType :** We can fill 0 for the area and None for the type

In [None]:
# Masonry veneer: missing type -> "None", missing area -> 0
for frame in (train, test):
    frame["MasVnrType"] = frame["MasVnrType"].fillna("None")
    frame["MasVnrArea"] = frame["MasVnrArea"].fillna(0)

- **MSZoning :** So we can fill in missing values with the most common value as the % of missing value is really low

In [None]:
# Most frequent MSZoning in the training data. `.mode()[0]` takes the first
# mode, so a tie between categories does not raise the way `.item()` would.
mszoning_mode = train['MSZoning'].mode()[0]

# The fill value is learned from train only and reused for test, the same way
# the neighborhood LotFrontage medians are.
train['MSZoning'] = train['MSZoning'].fillna(mszoning_mode)
test['MSZoning'] = test['MSZoning'].fillna(mszoning_mode)

- **Utilities :** For this categorical feature all records are "AllPub", except for one "NoSeWa" and 2 NA . Since the house with 'NoSewa' is in the training set, this feature won't help in predictive modelling. We can then safely remove it.

In [None]:
# Remove the Utilities column from both data sets, in place
for frame in (train, test):
    frame.drop(columns='Utilities', inplace=True)

- **Functional :** data description says NA means typical

In [None]:
# Missing Functional is filled with "Typ" in both data sets
for frame in (train, test):
    frame["Functional"] = frame["Functional"].fillna("Typ")

- **Electrical :** Since this feature has mostly 'SBrkr', we can set that for the missing value.

In [None]:
# Most frequent Electrical value in the training data; the same train-derived
# value fills both sets so the test set is not imputed from its own statistics.
electrical_mode = train['Electrical'].mode()[0]
train['Electrical'] = train['Electrical'].fillna(electrical_mode)
test['Electrical'] = test['Electrical'].fillna(electrical_mode)

- **KitchenQual :** we set 'TA' the most frequent value for the missing value

In [None]:
# Most frequent KitchenQual value in the training data; the same train-derived
# value fills both sets so the test set is not imputed from its own statistics.
kitchenqual_mode = train['KitchenQual'].mode()[0]
train['KitchenQual'] = train['KitchenQual'].fillna(kitchenqual_mode)
test['KitchenQual'] = test['KitchenQual'].fillna(kitchenqual_mode)

- **Exterior1st and Exterior2nd :** Using the most common value again

In [None]:
# For each exterior column, fill missing values in both sets with the most
# frequent value found in the training data (test is not imputed from its own
# statistics).
for col in ['Exterior1st', 'Exterior2nd']:
    most_common = train[col].mode()[0]
    train[col] = train[col].fillna(most_common)
    test[col] = test[col].fillna(most_common)

- **SaleType :** Filling with most frequent "WD"

In [None]:
# Most frequent SaleType value in the training data; the same train-derived
# value fills both sets so the test set is not imputed from its own statistics.
saletype_mode = train['SaleType'].mode()[0]
train['SaleType'] = train['SaleType'].fillna(saletype_mode)
test['SaleType'] = test['SaleType'].fillna(saletype_mode)

- **MSSubClass :** Replacing missing values with None

In [None]:
# Missing MSSubClass is filled with the string "None" in both data sets
for frame in (train, test):
    frame['MSSubClass'] = frame['MSSubClass'].fillna("None")

### Checking for missing values again

In [None]:
# Recompute the train missing-value percentages after the fills above
train_missing_pct = train.isnull().sum().div(len(train)).mul(100)
train_na = train_missing_pct[train_missing_pct != 0].sort_values(ascending=False)

# One-column table for display
train_na_df = train_na.to_frame('Missing Ratio')
train_na_df

In [None]:
# Recompute the test missing-value percentages after the fills above
test_missing_pct = test.isnull().sum().div(len(test)).mul(100)
test_na = test_missing_pct[test_missing_pct != 0].sort_values(ascending=False)

# One-column table for display
test_na_df = test_na.to_frame('Missing Ratio')
test_na_df

# Feature Engineering

### Transforming some numerical variables that are really categorical

In [None]:
# Turn numeric codes that are really categories into strings, in both data sets
for frame in (train, test):
    # MSSubClass = the building class
    frame['MSSubClass'] = frame['MSSubClass'].apply(str)

    # OverallCond, year sold and month sold become categorical features
    for col in ('OverallCond', 'YrSold', 'MoSold'):
        frame[col] = frame[col].astype(str)

### Label Encoding some categorical variables that may contain information in their ordering set

In [None]:
from sklearn.preprocessing import LabelEncoder

cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')

# Encode each listed column as integer labels. The encoder is fitted on the
# values seen in either data set so the same label maps to the same integer
# in train and test.
for c in cols:
    seen_values = set(train[c].unique().tolist()) | set(test[c].unique().tolist())
    lbl = LabelEncoder()
    lbl.fit(list(seen_values))
    train[c] = lbl.transform(train[c].values.tolist())
    test[c] = lbl.transform(test[c].values.tolist())

### Adding feature

In [None]:
# New feature: total square footage = basement + first floor + second floor
for frame in (train, test):
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']

# Skewed Feature

In [None]:
# Pull the target and the Id columns out of the feature frames
y_train = train.pop('SalePrice')
train.drop(columns='Id', inplace=True)

# Test ids are kept for the submission file
test_Ids = test.pop('Id')

In [None]:
# Columns of train whose dtype is not "object"
train_dtypes = train.dtypes
numeric_f = train_dtypes[train_dtypes != "object"].index

# Skew of every such column (missing values ignored), most skewed first
skewed_f = (
    train[numeric_f]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)
skewness = pd.DataFrame({'Skew in train data': skewed_f})
skewness.head(10)

In [None]:
#Transforming train Data
# Applies boxcox1p with a fixed lambda of 0.15 to every column named in `skewness.index`.

# NOTE(review): `skewness` is a one-column DataFrame here (built in the previous
# cell), so masking it with a boolean DataFrame presumably blanks non-matching
# values to NaN instead of dropping rows. If so, the count printed below and the
# loop cover every numeric column, not only those with |skew| > 0.75 -- confirm
# before changing it, because the test cell relies on the same construct.
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features in train data to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    train[feat] = boxcox1p(train[feat], lam)

In [None]:
# Columns of test whose dtype is not "object"
test_dtypes = test.dtypes
numeric_f = test_dtypes[test_dtypes != "object"].index

# Skew of every such column (missing values ignored), most skewed first
skewed_f = (
    test[numeric_f]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)
skewness = pd.DataFrame({'Skew in test data': skewed_f})
skewness.head(10)

In [None]:
#Transforming test Data
# Applies boxcox1p with a fixed lambda of 0.15 to every column named in `skewness.index`,
# where `skewness` was recomputed from the test set in the previous cell.

# NOTE(review): as in the train cell, this DataFrame mask presumably keeps every
# row (non-matching values become NaN), so all numeric test columns get
# transformed. If the mask were changed to really filter, train and test could
# end up transformed on different column sets, since each set's skew is measured
# separately -- confirm the intended behaviour before touching either cell.
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features in test data to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    test[feat] = boxcox1p(test[feat], lam)

### Getting dummy variables

In [None]:
# Getting dummy variables.
# The indicator dtype is pinned so the encoded frames stay purely numeric
# whatever the installed pandas uses as its default dummy dtype.
train = pd.get_dummies(train, dtype=np.uint8)
test = pd.get_dummies(test, dtype=np.uint8)

# Balancing data sets: both frames end up with the same columns in the same
# order -- train's columns first, then the columns only test has. The order is
# built from lists rather than by iterating a set, so it is identical on every
# run. Columns absent from a frame are added filled with 0.
train_columns = set(train.columns)
test_only_columns = [c for c in test.columns if c not in train_columns]
all_columns = train.columns.tolist() + test_only_columns

train = train.reindex(columns=all_columns, fill_value=0)
test = test.reindex(columns=all_columns, fill_value=0)

# Checking shapes
train.shape, test.shape

# Model Building

In [None]:
#Importing Libraries

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Define a cross validation strategy

n_folds = 5


def rmsle_cv(model):
    """Return the per-fold RMSE of `model` on the training data.

    The target `y_train` was log-transformed earlier in the notebook, so the
    RMSE computed here is on the log scale.

    The KFold splitter itself is handed to `cross_val_score`. Passing
    `KFold(...).get_n_splits(...)` would hand over only the integer number of
    folds and drop `shuffle=True` and `random_state=42`.

    Parameters
    ----------
    model : estimator
        Any object accepted by `cross_val_score`.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (`n_folds` values).
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    # The scorer returns negated MSE, hence the sign flip before the root
    return np.sqrt(-neg_mse)

## Base Models

- Lasso Regression

In [None]:
# Pipeline: RobustScaler followed by a Lasso regressor
lasso = make_pipeline(
    RobustScaler(),
    Lasso(random_state=1, alpha=0.0005),
)

- Elastic Net Regression 

In [None]:
# Pipeline: RobustScaler followed by an ElasticNet regressor
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(random_state=3, l1_ratio=.9, alpha=0.0005),
)

- Kernel Ridge Regression :

In [None]:
# Kernel ridge regression with a degree-2 polynomial kernel
KRR = KernelRidge(
    kernel='polynomial',
    degree=2,
    coef0=2.5,
    alpha=0.6,
)

- Gradient Boosting Regression :

In [None]:
# Gradient boosting regressor with the huber loss
GBoost = GradientBoostingRegressor(
    loss='huber',
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=5,
)

- XGBoost :

In [None]:
# XGBoost regressor; hyper-parameters are kept exactly as given
model_xgb = xgb.XGBRegressor(
    n_estimators=2200,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    gamma=0.0468,
    subsample=0.5213,
    colsample_bytree=0.4603,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    random_state=7,
    nthread=-1,
    verbosity=0,
)

- LightGBM :

In [None]:
# LightGBM regressor; hyper-parameters are kept exactly as given
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=720,
    learning_rate=0.05,
    num_leaves=5,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    bagging_seed=9,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

## Base Model Scoring

In [None]:
# Cross-validated error of the Lasso pipeline: mean and spread over the folds
score = rmsle_cv(lasso)
fold_mean, fold_std = score.mean(), score.std()
print("\nLasso score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

In [None]:
# Cross-validated error of the ElasticNet pipeline: mean and spread over the folds
score = rmsle_cv(ENet)
fold_mean, fold_std = score.mean(), score.std()
print("ElasticNet score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

In [None]:
# Cross-validated error of kernel ridge: mean and spread over the folds
score = rmsle_cv(KRR)
fold_mean, fold_std = score.mean(), score.std()
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

In [None]:
# Cross-validated error of gradient boosting: mean and spread over the folds
score = rmsle_cv(GBoost)
fold_mean, fold_std = score.mean(), score.std()
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

In [None]:
# Cross-validated error of XGBoost: mean and spread over the folds
score = rmsle_cv(model_xgb)
fold_mean, fold_std = score.mean(), score.std()
print("Xgboost score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

In [None]:
# Cross-validated error of LightGBM: mean and spread over the folds
score = rmsle_cv(model_lgb)
fold_mean, fold_std = score.mean(), score.std()
print("LGBM score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

# Model Stacking

## Average Based Model Class

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Regressor whose prediction is the plain mean of several base models.

    Parameters
    ----------
    models : iterable of estimators
        Base models. They are cloned in `fit`, so the objects passed in are
        never fitted themselves.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on (X, y) and return self."""
        fitted_clones = []
        for base in self.models:
            duplicate = clone(base)
            duplicate.fit(X, y)
            fitted_clones.append(duplicate)
        self.models_ = fitted_clones
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for X."""
        per_model = [fitted.predict(X) for fitted in self.models_]
        stacked = np.column_stack(per_model)
        return np.mean(stacked, axis=1)

## Averaging Class Score


In [None]:
# Average of four base models (the name `averaged_score` holds the estimator)
averaged_score = AveragingModels(models=(ENet, GBoost, KRR, lasso))

# Cross-validated error of the averaged ensemble
score = rmsle_cv(averaged_score)
fold_mean, fold_std = score.mean(), score.std()
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

# Adding a Meta Class 
### If you don't know what this is, you can read more about it in this [Notebook](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard/notebook)

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base models feed out-of-fold predictions to a meta-model.

    Parameters
    ----------
    base_models : iterable of estimators
        Models whose out-of-fold predictions become the meta-features.
    meta_model : estimator
        Model trained on the meta-features to produce the final prediction.
    n_folds : int, default=5
        Number of folds used to build the out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit clones of the base models fold by fold, then fit the meta-model.

        X and y are converted to NumPy arrays first, so positional fold indices
        work the same whether the caller passes arrays, DataFrames or Series.
        Returns self.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # One list of fitted clones (one clone per fold) for each base model
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Train cloned base models, then collect out-of-fold predictions:
        # column i holds base model i's predictions for rows it was not
        # trained on in that fold.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                out_of_fold_predictions[holdout_index, i] = instance.predict(X[holdout_index])

        # Train the cloned meta-model on the out-of-fold predictions
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on fold-averaged base-model predictions.

        For each base model, the predictions of its per-fold clones are
        averaged. Those averages are the meta-features given to the meta-model.
        """
        # Same array conversion as in fit, so both see the same input type
        X = np.asarray(X)
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)

### Scoring

In [None]:
# Stack ENet, GBoost and KRR, with the lasso pipeline as meta-model
stacked_averaged_models = StackingAveragedModels(
    base_models=(ENet, GBoost, KRR),
    meta_model=lasso,
)

# Cross-validated error of the stacked ensemble
score = rmsle_cv(stacked_averaged_models)
fold_mean, fold_std = score.mean(), score.std()
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(fold_mean, fold_std))

# Final Prediction

In [None]:
# Error metric used for the final train-set checks
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between y and y_pred."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

### StackedRegressor

In [None]:
# Fit the stacked model on the full training matrix
stacked_averaged_models.fit(train.values, y_train)

# Train predictions stay on the log scale; test predictions are mapped back with expm1
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_test_log = stacked_averaged_models.predict(test.values)
stacked_pred = np.expm1(stacked_test_log)

print(rmsle(y_train, stacked_train_pred))

### XGBoost

In [None]:
# Fit XGBoost on the full training frame
model_xgb.fit(train, y_train)

# Train predictions stay on the log scale; test predictions are mapped back with expm1
xgb_train_pred = model_xgb.predict(train)
xgb_test_log = model_xgb.predict(test)
xgb_pred = np.expm1(xgb_test_log)

print(rmsle(y_train, xgb_train_pred))

### LightGBM

In [None]:
# Fit LightGBM on the full training frame
model_lgb.fit(train, y_train)

# Predict from DataFrames throughout: the model was fitted on a DataFrame, and
# `test` was reindexed to train's columns earlier, so it is passed in the same
# form rather than as a bare `.values` array.
lgb_train_pred = model_lgb.predict(train)
# Test predictions are mapped back from the log scale with expm1
lgb_pred = np.expm1(model_lgb.predict(test))

print(rmsle(y_train, lgb_train_pred))

### Final Train Score

In [None]:
print('RMSLE score on train data:')
# Weighted blend of the three models' train predictions (weights 0.70 / 0.15 / 0.15)
blended_train_pred = (
    stacked_train_pred * 0.70
    + xgb_train_pred * 0.15
    + lgb_train_pred * 0.15
)
print(rmsle(y_train, blended_train_pred))

### Test Prediction

In [None]:
# Weighted blend of the three models' test predictions (weights 0.70 / 0.15 / 0.15)
Predictions = (
    stacked_pred * 0.70
    + xgb_pred * 0.15
    + lgb_pred * 0.15
)

In [None]:
# Build the submission table (test ids + blended predictions) and write it to disk
subm = pd.DataFrame({'Id': test_Ids, 'SalePrice': Predictions})
subm.to_csv('submission.csv', index=False)