In [None]:
# Loading necessary packages:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

#

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, QuantileTransformer, PowerTransformer, OrdinalEncoder, OneHotEncoder
# experiment class
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

#

import warnings
pd.options.display.max_columns = 250
pd.options.display.max_rows = 250
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

# Meeting the data
We're going to start by loading the data and taking a first look at it, as usual. For the column names we have a great dictionary file in our dataset location, so we can get familiar with them in no time.

In [None]:
# Read the competition's train and test splits from the Kaggle input folder.

train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Show the raw training frame as this cell's output.
train

In [None]:
# Summary statistics of the training frame (describe() with default arguments).
train.describe()

* The Id column looks useless, so we can safely drop it from both datasets. I'm going to save our target (SalePrice) in a separate variable so we can use it later.

In [None]:
# 'Id' is only a row identifier, so remove it from both splits (in place).

for frame in (train, test):
    frame.drop(columns='Id', inplace=True)

In [None]:
# Keep the target (SalePrice) aside and build the feature matrix without it.

y = train['SalePrice'].reset_index(drop=True)
X = train.drop(columns='SalePrice')

# Analysis Time!
Ok, the short inspection at the beginning gives us some hints about how we should move on from here. I'm going to work on the data while analysing it at the same time. This way I hope we can get the data into better shape while digging deeper into it.

We're going to start with a basic correlation table here. I dropped the top part since it's just a mirror of the part below. With this table we can understand some linear relations between different features.

### Observations:
* There's strong relation between overall quality of the houses and their sale prices.
* Again above grade living area seems strong indicator for sale price.
* Garage features, number of baths and rooms, how old the building is etc. also having effect on the price on various levels too.
* There are some obvious relations we gonna pass like total square feet affecting how many rooms there are or how many cars can fit into a garage vs. garage area etc.
* Overall condition of the house seems less important on the pricing, it's interesting and worth digging.

In [None]:
# Display numerical correlations (pearson) between features on heatmap.

sns.set(font_scale=1.1)

# Correlate the numeric columns only: text columns cannot be correlated and
# make DataFrame.corr() raise on recent pandas versions.
correlation_train = train.select_dtypes(include='number').corr()

# Boolean mask that hides the diagonal and the upper triangle, which only
# mirror the lower triangle. Building it from ones (instead of from the
# correlation values themselves) guarantees every upper cell is hidden.
mask = np.triu(np.ones_like(correlation_train, dtype=bool))

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_train,
            annot=True,
            fmt='.1f',
            cmap='coolwarm',
            square=True,
            mask=mask,
            linewidths=1,
            cbar=False)

plt.show()

# Delete the temporaries that are no longer needed.
del correlation_train, mask


* **I'm going to merge the datasets here before we start editing them so we don't have to do these operations twice. Let's call the result features, since it contains features only. So our data has 2919 observations and 79 features to begin with...**

In [None]:
# Stack train and test features into one frame for joint feature engineering.

features = pd.concat([X, test], ignore_index=True)
print(features.shape)

# Missing Data
Alright, first of all we need to detect the missing values, then we need to get rid of them before the next steps of our work. So let's list our missing values and visualize them:

In [None]:
def missing_percentage(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with the number of nulls
    ('Total') and their share of the rows in percent ('Percent'), ordered by
    descending count. Columns without any null are left out.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = null_counts / len(df) * 100

    report = pd.concat([null_counts, null_share],
                       axis=1,
                       keys=['Total', 'Percent'])

    # Keep only the columns that actually contain missing values.
    return report[null_counts != 0]

* **That's quite a lot! No need to panic though, we've got this. If you look at the data description given to us, you can see that most of these missing values are not actually missing; they just mean the house doesn't have that specific feature. We can fix that easily...**

In [None]:
# Visualise the share of missing values per column.

missing = missing_percentage(features)

fig, ax = plt.subplots(figsize=(20, 7))
sns.barplot(data=missing,
            x=missing.index,
            y='Percent',
            palette='Reds_r')
plt.xticks(rotation=90)

# Same numbers as a one-row-per-metric table, shaded along each row.
missing_table = missing.T.style.background_gradient(cmap='Reds', axis=1)
display(missing_table)

del missing

# Ok, this is how we're going to fix most of the missing data:
1. First we fill the NaNs in the columns where they mean 'None', so we replace them with that.
2. Then we fill the numerical columns where a missing value indicates that there is no parent feature to measure, so we replace them with 0.
3. Even after these there is some actual missing data; by checking the general trends of these features we can fill them with the most frequent value (the mode).
4. The MSZoning part is a little bit tricky; I chose to fill it with the most common type of the related MSSubClass type. It's not perfect, but at least we decrease the randomness a little bit.
5. Again, we fill LotFrontage with a similar approach.

In [None]:
# Columns where NaN means "the house does not have this feature".

none_cols = [
    'Alley',
    'PoolQC',
    'MiscFeature',
    'Fence',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'MasVnrType',
]

# Numeric columns where NaN means 0.

zero_cols = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageArea',
    'GarageCars',
    'MasVnrArea',
]

# Columns whose NaNs are genuinely missing and get replaced by the mode.

most_cols = [
    'Electrical',
    'Exterior1st',
    'Exterior2nd',
    'Functional',
    'KitchenQual',
    'SaleType',
    'Utilities',
    'MSZoning',
]

# Columns handed to the multivariate (iterative) imputer.
# Reference: https://scikit-learn.org/stable/modules/impute.html#univariate-feature-imputation

regg_cols = ['LotFrontage']


In [None]:
# impute missing value with Column transformer
#
# The transformed output is ordered like the `transformers` list below
# (none_cols, zero_cols, most_cols, regg_cols), followed by the untouched
# 'passthrough' columns in their original order. Any code that re-labels the
# output must follow exactly that order.

missing_value_preprocessor = ColumnTransformer(
    transformers=[
        # imputation
        # NaN -> the literal string 'none'
        ('none_imputer', SimpleImputer(fill_value= 'none', strategy='constant'), none_cols),
        # NaN -> 0
        ('zero_imputer', SimpleImputer(fill_value= 0, strategy='constant'), zero_cols),
        # NaN -> most frequent value of the column
        ('most_imputer', SimpleImputer(strategy='most_frequent'), most_cols),
        # experimental class imputation => Multivariate feature imputation
        # NOTE(review): regg_cols holds a single column, so this imputer sees no
        # other feature to model it from -- confirm this is the intended setup.
        ('regg_features', IterativeImputer(max_iter=10, random_state=0), regg_cols),
    ],
    # keep every column that is not listed above unchanged
    remainder = 'passthrough',
)

In [None]:
# Column labels for the transformed matrix. ColumnTransformer emits the
# columns of each transformer in declaration order (none, zero, most, regg)
# and appends the passthrough remainder afterwards. `regg_cols` must therefore
# be part of the leading block: leaving it out labels the imputed LotFrontage
# values with the name of the first passthrough column (and vice versa).
col_tr_col = none_cols + zero_cols + most_cols + regg_cols
col_tr_col = col_tr_col + [i for i in features.columns if i not in col_tr_col]

features = pd.DataFrame(
    missing_value_preprocessor.fit_transform(features),
    columns = col_tr_col)

# The transformer returns an object array; let pandas infer proper dtypes again.
features = features.convert_dtypes()

# Feature Engineering
Ok, this is the part where we dig deeper into our completed dataset. There are no missing values, so we're good to go! I'm going to start by grouping some values: these values are really rare and I don't think they add much, so if they appear fewer than 10 times in our observations they go into the 'Other' group.

In [None]:
# Transforming rare values(less than 10) into one group.

others = [
    'Condition1', 'Condition2', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'Heating', 'Electrical', 'Functional', 'SaleType'
]

for col in others:
    # Count each level once, then pick the levels seen fewer than 10 times.
    counts = features[col].value_counts()
    mask = features[col].isin(counts[counts < 10].index)
    # Single .loc assignment: the chained form `features[col][mask] = ...`
    # may write to a temporary copy and leave `features` unchanged.
    features.loc[mask, col] = 'Other'

In [None]:
def show_box(y, df):
    '''Draw one box plot of ``y`` for every categorical column of ``df``.

    Parameters
    ----------
    y : str
        Name of the numeric column shown on the y axis.
    df : pandas.DataFrame
        Frame holding ``y`` and the object-dtype columns to plot.

    Within each plot the categories are ordered by descending median of ``y``.
    The grid has three plots per row and as many rows as needed, so no
    column is silently left out; unused cells of the grid are hidden.
    '''
    cat_cols = list(df.select_dtypes(include=['object']).columns)
    if not cat_cols:
        # Nothing to plot (plt.subplots cannot build a grid with zero rows).
        return

    n_cols = 3
    n_rows = int(np.ceil(len(cat_cols) / n_cols))
    # Keep the per-row height of the original 14-row, 80-inch figure.
    fig, axes = plt.subplots(n_rows,
                             n_cols,
                             figsize=(25, 80 / 14 * n_rows),
                             squeeze=False)
    axes = axes.flatten()

    for col, ax in zip(cat_cols, axes):
        order = df.groupby(col)[y].median().sort_values(ascending=False).index
        sns.boxplot(x=col,
                    y=y,
                    data=df,
                    palette='plasma',
                    order=order,
                    ax=ax)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=18))

    # Hide the cells of the last row that did not receive a plot.
    for ax in axes[len(cat_cols):]:
        ax.set_visible(False)

    # Lay the figure out once, after every axis has been drawn.
    plt.tight_layout()



# Categorical Data
We already checked some of the numerical features with the correlation heatmap, but what about the categorical values? We want to see the relations between categorical data and sale price. Boxplots seem a decent way to inspect this type of relation. We're also going to sort them by the median value of each group so we can see their importance in descending order.


In [None]:
# Displaying sale prices vs. categorical values:
# uses the original `train` frame, which still contains the SalePrice column.

show_box('SalePrice', train)

# Encoding categorical features

In [None]:
# Converting some of the categorical values to numeric ones.
ordi_encode_cols = ['Neighborhood', 'ExterQual', 'ExterCond','BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageCond', 'GarageQual']

# Explicit category order, worst -> best. Without it OrdinalEncoder sorts the
# labels alphabetically ('Ex' < 'Fa' < 'Gd' < ...), so the codes would not
# grow with quality and the quality sums/products built later would be
# meaningless. 'none' is the fill value used for "feature not present".
quality_scale = ['none', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_finish_scale = ['none', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

ordinal_scales = {
    # Neighborhood has no natural order: keep the alphabetical default.
    'Neighborhood': sorted(features['Neighborhood'].dropna().unique()),
    'BsmtFinType1': bsmt_finish_scale,
    'BsmtFinType2': bsmt_finish_scale,
}
# Every remaining column is a Po..Ex quality rating.
ordinal_categories = [ordinal_scales.get(col, quality_scale)
                      for col in ordi_encode_cols]

categorical_value_preprocessor = ColumnTransformer(
    transformers=[('ordinal_encoder',
                   OrdinalEncoder(categories=ordinal_categories),
                   ordi_encode_cols)],
    remainder = 'passthrough',
)

# Output order: encoded columns first, then the passthrough remainder.
features = pd.DataFrame(categorical_value_preprocessor.fit_transform(features),
             columns = (ordi_encode_cols + [i for i in features.columns if i not in ordi_encode_cols]))
features = features.convert_dtypes()


# Numeric Data
There are many numeric features to inspect; one of the best ways to see how they affect sale prices is scatter plots. We're also plotting polynomial regression lines to see the general trend. This way we can understand the numerical values and their importance for the sale price; it's also really helpful for spotting outliers.

### Observations:
* OverallQual; It's clearly visible that sale price of the house increases with overall quality. This confirms the correlation in first table we did at the beginning. (Pearson corr was 0.8)

* OverallCondition; Looks like overall condition is left skewed where most of the houses are around 5/10 condition. But it doesn't effect the price like quality indicator...

* YearBuilt; Again new buildings are generally expensive than the old ones.

* Basement; General table shows bigger basements are increasing the price but I see some outliers there...

* GrLivArea; This feature is pretty linear, but we can spot two outliers affecting this trend. There are some houses with a huge area and pretty cheap prices; there might be some reason behind it, but we'd better drop them.

* SaleDates; They seem pretty unimportant on sale prices, we can drop them...

In [None]:
# Plotting numerical features with polynomial order to detect outliers by eye.

def show_reg(y, df):
    '''Scatter every numeric column of ``df`` against ``y`` with a cubic fit.

    Parameters
    ----------
    y : str
        Name of the target column shown on the y axis.
    df : pandas.DataFrame
        Frame holding ``y`` and the numeric columns to plot.

    The target itself is not plotted against itself. The grid has three
    plots per row and as many rows as needed; unused cells are hidden.
    '''
    num_cols = [
        col for col in df.select_dtypes(include=['number']).columns
        if col != y
    ]
    if not num_cols:
        # Nothing to plot (plt.subplots cannot build a grid with zero rows).
        return

    n_cols = 3
    n_rows = int(np.ceil(len(num_cols) / n_cols))
    # Keep the per-row height of the original 12-row, 80-inch figure.
    fig, axes = plt.subplots(n_rows,
                             n_cols,
                             figsize=(25, 80 / 12 * n_rows),
                             squeeze=False)
    axes = axes.flatten()

    for col, ax in zip(num_cols, axes):
        sns.regplot(x=col,
                    y=y,
                    data=df,
                    ax=ax,
                    order=3,
                    ci=None,
                    color='#e74c3c',
                    line_kws={'color': 'black'},
                    scatter_kws={'alpha':0.4})
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=10))

    # Hide the cells of the last row that did not receive a plot.
    for ax in axes[len(num_cols):]:
        ax.set_visible(False)

    # Lay the figure out once, after every axis has been drawn.
    plt.tight_layout()

# Numeric columns of the original `train` frame vs. SalePrice.
show_reg('SalePrice', train)

# Outliers
Ok, here we're going to drop some of the outliers we detected just above. This part is kind of subjective; you can try different approaches or implement automatic outlier detection methods like isolation forests.

In [None]:
# Dropping outliers after detecting them by eye.

features = features.join(y)
price = features['SalePrice']

# Test rows have no SalePrice (NaN after the join), so every comparison on
# `price` is False for them: only training rows can be flagged.
outliers = (
    ((features['GrLivArea'] > 4000) & (price < 200000))
    | ((features['GarageArea'] > 1200) & (price < 250000))
    | ((features['TotalBsmtSF'] > 3000) & (price < 320000))
    | ((features['1stFlrSF'] < 3000) & (price > 600000))
    | ((features['1stFlrSF'] > 3000) & (price < 200000))
)
outliers = outliers.fillna(False).astype(bool)

# Re-number the rows after dropping. Otherwise the index keeps gaps, and any
# frame later rebuilt with a fresh 0..n-1 index no longer lines up with `y`
# when the two are joined by label.
features = features.loc[~outliers].reset_index(drop=True)

# Training rows come first, so the non-null prices are exactly rows 0..len(y)-1.
y = features['SalePrice'].dropna().reset_index(drop=True)
features = features.drop(columns='SalePrice')


# Creating New Features
Ok, in this part we're going to create some features that can improve our modelling. I went with a basic approach: merging some important indicators to make them stronger.

In [None]:
# New aggregate features built from the observations above. Some of them are
# probably highly correlated with each other; drop them if you want to.

# --- Size and age aggregates ------------------------------------------------
features['TotalSF'] = (
    features['BsmtFinSF1']
    + features['BsmtFinSF2']
    + features['1stFlrSF']
    + features['2ndFlrSF']
)
features['TotalBathrooms'] = (
    features['FullBath']
    + (0.5 * features['HalfBath'])
    + features['BsmtFullBath']
    + (0.5 * features['BsmtHalfBath'])
)
features['TotalPorchSF'] = (
    features['OpenPorchSF']
    + features['3SsnPorch']
    + features['EnclosedPorch']
    + features['ScreenPorch']
    + features['WoodDeckSF']
)
features['YearBlRm'] = features['YearBuilt'] + features['YearRemodAdd']

# --- Merged quality / condition scores --------------------------------------
features['TotalExtQual'] = features['ExterQual'] + features['ExterCond']
features['TotalBsmQual'] = (
    features['BsmtQual']
    + features['BsmtCond']
    + features['BsmtFinType1']
    + features['BsmtFinType2']
)
features['TotalGrgQual'] = features['GarageQual'] + features['GarageCond']
features['TotalQual'] = (
    features['OverallQual']
    + features['TotalExtQual']
    + features['TotalBsmQual']
    + features['TotalGrgQual']
    + features['KitchenQual']
    + features['HeatingQC']
)

# --- Quality-weighted areas --------------------------------------------------
features['QualGr'] = features['TotalQual'] * features['GrLivArea']
features['QualBsm'] = features['TotalBsmQual'] * (
    features['BsmtFinSF1'] + features['BsmtFinSF2']
)
features['QualPorch'] = features['TotalExtQual'] * features['TotalPorchSF']
features['QualExt'] = features['TotalExtQual'] * features['MasVnrArea']
features['QualGrg'] = features['TotalGrgQual'] * features['GarageArea']
features['QlLivArea'] = (
    features['GrLivArea'] - features['LowQualFinSF']
) * features['TotalQual']
features['QualSFNg'] = features['QualGr'] * features['Neighborhood']


In [None]:
# Observing the effects of newly created features on sale price.

def srt_reg():
    '''Scatter each engineered feature against SalePrice with a cubic fit.

    Reads the notebook-level ``features`` frame and ``y`` target, joins them
    on their index and draws one regression plot per engineered feature on a
    5 x 3 grid.
    '''
    fig, axes = plt.subplots(5, 3, figsize=(25, 40))
    axes = axes.flatten()

    new_features = [
        'TotalSF', 'TotalBathrooms', 'TotalPorchSF', 'YearBlRm',
        'TotalExtQual', 'TotalBsmQual', 'TotalGrgQual', 'TotalQual', 'QualGr',
        'QualBsm', 'QualPorch', 'QualExt', 'QualGrg', 'QlLivArea', 'QualSFNg'
    ]
    merged = features.join(y)
    # Cast to float so the plots receive plain numeric data.
    merged = merged[new_features+['SalePrice']].astype('float')

    for i, j in zip(new_features, axes):

        sns.regplot(x=i,
                    y='SalePrice',
                    data=merged,
                    ax=j,
                    order=3,
                    ci=None,
                    color='#e74c3c',
                    line_kws={'color': 'black'},
                    scatter_kws={'alpha':0.4})
        j.tick_params(labelrotation=45)
        j.yaxis.set_major_locator(MaxNLocator(nbins=10))

    # Lay the figure out once, after every axis has been drawn, instead of
    # recomputing the layout on each loop iteration.
    plt.tight_layout()


# Checking New Features
Well... They look decent enough; I hope they can help us build strong models. I also wanted to add some more basic features indicating whether a house has a specific feature or not. This approach is widely accepted by the community, so I see no harm in adding them.

In [None]:
# Plot the engineered features against SalePrice.
srt_reg()

In [None]:
# Creating some simple features.

# Each flag is 1 when the matching size/count column is positive. The flags
# are taken from the raw area columns rather than from the quality-weighted
# products (QualGrg, QualBsm, QualPorch): those products are also 0 whenever
# the quality code is 0, which would wrongly report "no garage / basement /
# porch" for houses that have one.
indicator_sources = {
    'HasPool': 'PoolArea',
    'Has2ndFloor': '2ndFlrSF',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
    'HasPorch': 'TotalPorchSF',
}

for flag, source in indicator_sources.items():
    features[flag] = (features[source] > 0).astype(int)

**Here we're dropping some unnecessary features that either served their purpose in feature engineering or are not needed at all. Obviously it's subjective, but I feel they don't add much to the model. Then we one-hot encode the remaining categorical data so everything is prepared for modelling.**

In [None]:
# Columns that have served their purpose or are not needed for modelling.

to_drop = [
    'Utilities', 'PoolQC', 'YrSold', 'MoSold', 'ExterQual',
    'BsmtQual', 'GarageQual', 'KitchenQual', 'HeatingQC',
]

# Remove them from the feature frame.

features = features.drop(columns=to_drop)

# OneHotEncoding

In [None]:
# One-hot encode every remaining text ('string' dtype) column.
string_cols = features.select_dtypes('string').columns
features = pd.get_dummies(data=features, columns=string_cols)

# Transforming the Data
Some of the continuous values are not distributed evenly and do not fit a normal distribution; we can fix them by using a couple of transformation approaches. We're going to use a power transformer here. Again, it's widely used by the community and I want to thank them all for their great work.

In [None]:
# Numeric size/area columns that look highly skewed; these are the ones
# handed to the power transformer.

skewed = [
    'LotFrontage',
    'LotArea',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'LowQualFinSF',
    'MiscVal',
]

In [None]:

# Power-transform the skewed columns; every other column passes through.
power_column_preprocessor = ColumnTransformer(
    transformers=[('power_tranform', PowerTransformer(standardize=False), skewed)],
    remainder = 'passthrough',
)

# Output order: transformed `skewed` columns first, then the passthrough rest.
passthrough_cols = [i for i in features.columns if i not in skewed]

# Keep the original row index. The transformer returns a bare array, and
# rebuilding the frame with a fresh 0..n-1 index would misalign it with `y`
# in later label-based joins if rows were dropped before.
features = pd.DataFrame(power_column_preprocessor.fit_transform(features),
                        columns=skewed + passthrough_cols,
                        index=features.index)

In [None]:
# The training rows are the first len(y) rows of `features`. Attach the target
# by position: a label-based join can pair rows with the wrong price when the
# two indexes no longer match.
plot_data = features.iloc[:len(y)].assign(SalePrice=y.to_numpy()).astype('float')
sns.regplot(x='QualGr', y='SalePrice', data=plot_data)

# Double Check
Before we move on to modelling I want to take one last look at the data we processed. Everything seems in order: no missing data, values are numerical, etc. Our feature-engineered data is present...

Just want to check how transformed data correlates with sale prices before we move on and it looks decent.

Again I wanted to check our target value distribution and it seems little skewed. We can fix this by applying log transformation so our models can perform better.

In [None]:
# Split the combined frame back: the first len(y) rows are the training set,
# everything after them is the test set.

n_train_rows = len(y)
train = features.iloc[:n_train_rows, :]
test = features.iloc[len(train):, :]

In [None]:
# Attach the target by position (train holds exactly len(y) rows). A
# label-based join can pair rows with the wrong price when the two indexes
# no longer match.
a = train.assign(SalePrice=y.to_numpy()).astype('float')

# Correlation of every feature with the target; the last entry is SalePrice
# against itself and is dropped.
correlations = a.corrwith(a['SalePrice']).iloc[:-1].to_frame()
correlations['Abs Corr'] = correlations[0].abs()
correlations.sort_values('Abs Corr',ascending=False).head()


# Modelling
Well then, it's time to do some modelling! First of all I want to thank the Kaggle community for the loads of examples that inspired me. Especially Alex Lekov's great script and Serigne's stacked regressions approach were great guides for me!

Let's start by loading the packages we need and then set up our regressors. The regressors I'm going to use here are:

* Ridge, Lasso, Elasticnet and Support Vector Regression
  * I'm going to apply a robust scaler to these before we run them because they are strongly affected by outliers.
* Gradient Boosting Regressor, LightGBM Regressor and XGBoost Regressor
  * These don't need scaling in my opinion, so we just use them as they are.
* Hist Gradient Boosting Regressor
  * This is just for experimenting; it's still experimental in sklearn anyway.
* Tweedie Regressor
  * This regressor was added in the latest version of sklearn and I wanted to try it. It's a generalized linear model with a Tweedie distribution. We're going to use a power of 0 because we expect a normal target distribution, but you can try this or other generalized models like the Poisson regressor or gamma regressor.

I tried to tune the models with the Optuna package; that part is not included here.

In [None]:
# Loading neccesary packages for modelling.

from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor # This is for stacking part, works well with sklearn and others...

In [None]:
from sklearn.compose import TransformedTargetRegressor

# SVR trained on the log of the target and mapped back with exp.
# NOTE(review): `tt` is not referenced by any cell visible here -- confirm
# whether it is still needed.
tt = TransformedTargetRegressor(regressor=SVR(),
                                func=np.log, inverse_func=np.exp)


# 10-fold CV shared by all models. `random_state` only takes effect together
# with shuffle=True; current scikit-learn raises a ValueError when it is set
# without shuffling.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Some parameters for ridge, lasso and elasticnet.

alphas_alt = [0.01, 13, 15.5, 15.6, 15.7, 15.8, 15.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [
    0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.001
]
e_alphas = [
    0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007
]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# `max_iter` must be an integer: the float 1e7 is rejected by scikit-learn's
# parameter validation.
MAX_ITER = int(1e7)

# ridge_cv

ridge = make_pipeline(RobustScaler(), RidgeCV(
    alphas=alphas_alt,
    cv=kf,
))

# lasso_cv:

lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=MAX_ITER, alphas=alphas2, random_state=42, cv=kf))

# elasticnet_cv:

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=MAX_ITER,
                 alphas=e_alphas,
                 cv=kf,
                 random_state=42,
                 l1_ratio=e_l1ratio))

# svr

svr = make_pipeline(
    RobustScaler(),
    SVR(
        C=20,
        gamma=0.00017,
))

# gradient boosting:

gbr = GradientBoostingRegressor(n_estimators=2900,
                                learning_rate=0.0161,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=17,
                                loss='huber',
                                random_state=42)

# lightgbm:

lightgbm = LGBMRegressor(objective='regression',
                         learning_rate=0.00721,
                         )
# Disabled hyper-parameters, kept for reference:
#   n_estimators=3500,
#   num_leaves=5,
#   max_bin=163,
#   bagging_fraction=0.35711,
#   n_jobs=-1,
#   bagging_seed=42,
#   feature_fraction_seed=42,
#   bagging_freq=7,
#   feature_fraction=0.1294,
#   min_data_in_leaf=8

# xgboost:

xgboost = XGBRegressor(
    learning_rate=0.0139,
    )
# Disabled hyper-parameters, kept for reference:
#   n_estimators=4500,
#   max_depth=4,
#   min_child_weight=0,
#   subsample=0.7968,
#   colsample_bytree=0.4064,
#   nthread=-1,
#   scale_pos_weight=2,
#   seed=42,

# stacking regressor:

stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr,
                                            xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

# Quick sanity check: cross-validate the lasso pipeline on the log1p target
# and show the mean of every returned score/time column.
scores = cross_validate(
    lasso,
    train,
    np.log1p(y),
    cv=kf,
    scoring=['r2','neg_root_mean_squared_error'],
    return_train_score=True,
    n_jobs=-1)

pd.DataFrame(scores).mean()

In [None]:
def model_check(X, y, estimators, cv, labels=None):
    '''Cross-validate several estimators and tabulate their RMSE.

    Parameters
    ----------
    X, y :
        Features and target passed to ``cross_validate``.
    estimators : sequence
        Estimators to evaluate.
    cv :
        Cross-validation splitter (or anything ``cross_validate`` accepts).
    labels : sequence of str, optional
        Display name for each estimator. When omitted, the notebook-level
        ``labels`` list is used if it exists (the behaviour earlier callers
        relied on); otherwise the estimator class names are used.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with the columns 'Model Name', 'Train RMSE',
        'Test RMSE', 'Test Std' and 'Time', sorted by ascending 'Test RMSE'.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of estimators.
    '''
    estimators = list(estimators)

    if labels is None:
        # Backward compatibility with callers that define `labels` globally.
        labels = globals().get('labels')
    if labels is None:
        labels = [type(est).__name__ for est in estimators]
    labels = list(labels)

    if len(labels) != len(estimators):
        # zip() would silently truncate and mislabel or drop models.
        raise ValueError(
            'Expected one label per estimator, got %d labels for %d estimators.'
            % (len(labels), len(estimators)))

    columns = ['Model Name', 'Train RMSE', 'Test RMSE', 'Test Std', 'Time']
    rows = []

    for est, label in zip(estimators, labels):
        cv_results = cross_validate(est,
                                    X,
                                    y,
                                    cv=cv,
                                    scoring='neg_root_mean_squared_error',
                                    return_train_score=True,
                                    n_jobs=-1)

        rows.append({
            'Model Name': label,
            # The scorer returns negative RMSE; flip the sign for reporting.
            'Train RMSE': -cv_results['train_score'].mean(),
            'Test RMSE': -cv_results['test_score'].mean(),
            'Test Std': cv_results['test_score'].std(),
            'Time': cv_results['fit_time'].mean(),
        })

    # Build the table once instead of growing it cell by cell.
    model_table = pd.DataFrame(rows, columns=columns)

    return model_table.sort_values(by=['Test RMSE'], ascending=True)

In [None]:
# Estimators to compare, each paired with the label shown in the results.

named_estimators = [
    ('Ridge', ridge),
    ('Lasso', lasso),
    ('Elasticnet', elasticnet),
    ('GradientBoostingRegressor', gbr),
    ('SVR', svr),
    # ('XGBRegressor', xgboost),
    # ('LGBMRegressor', lightgbm),
]

estimators = [model for _, model in named_estimators]
labels = [name for name, _ in named_estimators]

# Model Results
All right, our results are here. It looks like our models performed pretty close to each other. There might be some overfitting models, and we could try to fix them by tuning, but that was computationally expensive for me, and since I'm going to stack and blend the models I think we can leave them as they are. We already added our models to the stacking regressor and set XGBoost as the meta regressor, so we can continue with stacking.

In [None]:
# Cross-validate every estimator on the log1p-transformed target.

log_target = np.log1p(y)
raw_models = model_check(train, log_target, estimators, kf)

# Show the comparison table with a colour gradient per column.
display(raw_models.style.background_gradient(cmap='summer_r'))