In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column and every row when a DataFrame is displayed.
# The fully qualified option names are used on purpose: the short forms
# 'max_columns' / 'max_rows' also match 'styler.render.max_columns' /
# 'styler.render.max_rows' on recent pandas releases, which makes
# pd.set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder
from scipy.special import boxcox1p

from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

* **Data Description :**
   Explore data and features and get to know it better
* **Missing Data :**
   Check if any missing value that needs to be taken care
* **Explore Target :**
    Check and explore what the values look like in target that we are going to predict and normalize if needed
* **Feature Engineering and Feature Exploration :**
    Exploring and taking closer look at all the Features
* **Multicollinearity Check :**
    Explore and check feature relation
* **Handling Outliers :**
    If there is any outlier that needs adjusting or if need to be removed
* **Handling Missing values :**
    Imputing missing records by evaluating the feature
* **Label and Categorical Encoding :**
    Converting categorical features into numeric codes (label encoding) and indicator columns (dummy encoding) so the models can use them
* **Creating new Features :**
    New features for better model accuracy
* **Handling Skewed Features :**
    Normalizing the features for better model performance.
    Visualizing skewed features and after normalizing the features
* **Model Building and Prediction :**
    Model building and tuning

In [None]:

# Read the competition's train and test CSV files into DataFrames.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

**Look and feel of train and test data set**

In [None]:
display(df_train.sample(5))
display(df_test.sample(5))

In [None]:
# Original training and test data shape

print("Training data shape : {}".format(df_train.shape))
print("Test data shape     : {}".format(df_test.shape))

**Generic descriptive statistics of training data set**

In [None]:
df_train.describe()

In [None]:
# Taking a glance at all the features present
df_train.columns

**A quick look at feature data types and data type grouped counts**

In [None]:
df_train.info(verbose = False);

**Missing Data Exploration**

In [None]:
# Per-column count of missing values, keeping only columns with at least one gap.
na_counts = df_train.isna().sum()
missing_data = na_counts[na_counts > 0]
missing_data_sorted = missing_data.sort_values(ascending=False)

# The same information as a percentage of rows (rounded to 4 decimals before scaling to 0-100).
na_percent = df_train.isna().mean().round(4) * 100
percent_missing = na_percent[na_percent > 0]
percent_missing_sorted = percent_missing.sort_values(ascending=False)

# Summary table: counts and percentages side by side, aligned on the column name.
df_missing = pd.DataFrame()
df_missing['Missing_Data'] = missing_data_sorted
df_missing['Missing_Percent'] = percent_missing_sorted
display(df_missing)

# Bar chart of the missing percentages, largest first.
f, ax = plt.subplots(figsize=(8, 7))
percent_missing_sorted.plot.bar(color="b", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Percent missing", xlabel="Misssing features", title="Missing percent Visual")
sns.despine(trim=True, left=True)

In [None]:
# Keep a copy of the Id columns (test_id is used to build the submission),
# then remove Id from both frames in place so it is not treated as a feature.
train_id, test_id = df_train['Id'], df_test['Id']

for frame in (df_train, df_test):
    frame.drop(['Id'], axis=1, inplace=True)

**Explore the Target - Sale Price and Visualize the data distribution**

In [None]:
# Summary statistics, skewness and kurtosis of the prediction target.
sale_price = df_train['SalePrice']

# '\033[1m' is the ANSI escape sequence that switches the terminal to bold text.
print('\033[1m' + "Description of Sale Price:")
display(sale_price.describe())

print('      ')

print('\033[1m' + "Skewness and Kurtosis of Sale Price:")
print("Skewness: %f" % sale_price.skew())
print("Kurtosis: %f" % sale_price.kurt())



In [None]:

# Histogram + KDE + rug of SalePrice with a fitted normal curve overlaid for comparison.
f, ax = plt.subplots(figsize=(15, 8))
sns.distplot(df_train['SalePrice'], rug=True, fit=norm, ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Frequency_SalePrice", xlabel="SalePrice", title="Distribution of SalePrice")
sns.despine(trim=True, left=True)
plt.show()

**Target - Sale Price is Right Skewed. Used log1p to Normalize**

In [None]:
# SalePrice is right-skewed, so replace it with log(1 + SalePrice).
# NOTE: this overwrites the column in place, so running the cell a second time
# without reloading the data applies the transform twice.
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

# Wrapped in display(): a bare expression in the middle of a cell is evaluated
# and thrown away, so the original describe() call never showed anything.
display(df_train['SalePrice'].describe())
print('      ')

# Skewness and kurtosis after the transform
# ('\033[1m' is the ANSI escape sequence for bold text).
print('\033[1m' + "Skewness and Kurtosis of Sale Price after normalization:")

print("Skewness: %f" % df_train['SalePrice'].skew())
print("Kurtosis: %f" % df_train['SalePrice'].kurt())

**Target variable (Sale Price) plot after Normalization**

In [None]:
# Same distribution plot as before, drawn from the current contents of the SalePrice column:
# histogram + KDE + rug with a fitted normal curve overlaid.
f, ax = plt.subplots(figsize=(15, 8))
sns.distplot(df_train['SalePrice'], rug=True, fit=norm, ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Frequency_SalePrice", xlabel="SalePrice", title="Distribution of SalePrice")
sns.despine(trim=True, left=True)
plt.show()

**Feature Exploration**

In [None]:
def CategoryFeaturePlot(columns):
    """Scatter-plot each given ``df_train`` column against SalePrice on one figure.

    Parameters
    ----------
    columns : iterable of str
        Names of columns of the module-level ``df_train`` to put on the x axis.

    Notes
    -----
    Panels are placed five per row. The grid has at least 16 rows (the layout
    this function always used) and grows when more than 80 columns are passed,
    instead of raising on the 81st subplot.
    """
    columns = list(columns)
    panels_per_row = 5
    # Ceiling division, but never fewer than the 16 rows of the original layout.
    n_rows = max(16, -(-len(columns) // panels_per_row))

    fig = plt.figure(figsize=(30, 55))

    for i, column in enumerate(columns):
        plt.subplot(n_rows, panels_per_row, i + 1)
        sns.scatterplot(x=column, y=df_train['SalePrice'], data=df_train, s=80)
        plt.xticks(rotation=90, fontsize=10)

    # Lay the figure out once, after all panels exist, rather than once per panel.
    if columns:
        plt.tight_layout()

    fig.show()

In [None]:
CategoryFeaturePlot(df_train.columns)

# Multicollinearity check

**Correlation matrix to see how features are correlated with each other and Sale Price**

In [None]:
# Pairwise correlations of the numeric columns only. Selecting them explicitly
# keeps this working on pandas versions where DataFrame.corr() no longer drops
# non-numeric (object) columns on its own and raises instead.
corrmat = df_train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corrmat, vmin=0, vmax=1, square=True, cmap='RdBu', annot=True, fmt='.1f',
            linecolor='black', center=0, annot_kws={"size": 7});


In [None]:
# Same heat map, showing only the cells whose correlation is at least 0.75
# (cells where the mask is True, i.e. correlation < 0.75, are hidden).

# Numeric columns are selected explicitly so the call also works on pandas
# versions where DataFrame.corr() raises on object columns.
corrmat = df_train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corrmat, vmin=0, vmax=1, square=True, cmap='RdBu', annot=True, mask=corrmat < 0.75, fmt='.1f',
            linecolor='black', center=0, annot_kws={"size": 7});

**These features have good relation:**

* OverallQual <-> SalePrice
* YearBuilt <-> GarageYrBlt
* 1stFlrSF <-> TotalBsmtSF
* GrLivArea <-> TotRmsAbvGrd
* GarageCars <-> GarageArea

**Although these features are closely related, the correlation is not strong enough to justify removing any of them. Let's take a closer look at these five pairs with pair plots**

In [None]:
Pair1 = ['OverallQual', 'SalePrice']
Pair2 = ['YearBuilt', 'GarageYrBlt']
Pair3 = ['1stFlrSF', 'TotalBsmtSF']
Pair4 = ['GrLivArea' , 'TotRmsAbvGrd']
Pair5 = ['GarageCars', 'GarageArea']

In [None]:
sns.pairplot(df_train, vars = Pair1, kind = 'reg',diag_kind = 'hist');

In [None]:
sns.pairplot(df_train, vars= Pair2, kind = 'reg',diag_kind = 'hist');

In [None]:
sns.pairplot(df_train, vars = Pair3, kind = 'reg',diag_kind = 'hist');

In [None]:
sns.pairplot(df_train, vars = Pair4, kind = 'reg',diag_kind = 'hist');

In [None]:
sns.pairplot(df_train, vars = Pair5, kind = 'reg',diag_kind = 'hist');

**Explore Outliers: a closer look at GrLivArea clearly shows some outliers**

In [None]:
CategoryFeaturePlot(['GrLivArea'])

In [None]:
# Removing records is costly, but these few outliers (very large living area sold
# at a low price) hurt the model, so drop them.
# SalePrice was already replaced by log1p(SalePrice) earlier in the notebook, so the
# 300000 price threshold has to be expressed on the same log scale; comparing the
# logged column with the raw 300000 is always true and makes the price condition a no-op.
outlier_rows = df_train[(df_train['GrLivArea'] > 4500) &
                        (df_train['SalePrice'] < np.log1p(300000))].index
df_train.drop(outlier_rows, inplace=True)
df_train.reset_index(drop=True, inplace=True)

**Feature Engineering and combining train and test data for better feature engineering**

In [None]:
# Split the target off the training frame; test_set is just another name for df_test.
train_labels = df_train['SalePrice'].reset_index(drop=True)
train_set = df_train.drop(columns=['SalePrice'])
test_set = df_test

# Stack the train and test predictors (train rows first, fresh 0..n-1 index) so the
# feature transformations that follow are applied to both at once.
all_data = pd.concat([train_set, test_set], ignore_index=True)
print("Shape of all data : {}".format(all_data.shape))

**Missing value Imputation**

In [None]:

# Columns whose missing entries are replaced by the literal string 'None':
# pool / misc feature / alley / fence / fireplace quality, the garage and basement
# descriptors, masonry veneer type, Utilities and MSSubClass.
fill_none_cols = (
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType', 'Utilities', 'MSSubClass',
)
for col in fill_none_cols:
    all_data[col] = all_data[col].fillna('None')

# Numeric garage / basement / masonry columns: a missing value is filled with 0
# (no garage -> no cars or area, no basement -> no basement area or baths).
fill_zero_cols = (
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
    'MasVnrArea',
)
for col in fill_zero_cols:
    all_data[col] = all_data[col].fillna(0)

# Columns filled with their own most frequent value (the first mode).
fill_mode_cols = ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType')
for col in fill_mode_cols:
    most_common = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_common)

# For Functional, no data is taken to mean typical.
all_data["Functional"] = all_data["Functional"].fillna("Typ")

# LotFrontage: fill each gap with the median LotFrontage of the same Neighborhood.
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda frontage: frontage.fillna(frontage.median()))

**Verify if still any missing record**

In [None]:
# Re-count missing values per column after imputation; only columns that still
# have gaps are kept, largest count first.
missing_data = all_data.isna().sum()
missing_data = missing_data[missing_data > 0]
missing_data_sorted = missing_data.sort_values(ascending=False)

# The original format string had no '{}' placeholder, so the Series was never
# printed and the check could not reveal anything.
print("Missing data :\n{}".format(missing_data_sorted))

**A few features look numerical but are actually categorical, so convert them to categorical (string) values**

In [None]:
# MSSubClass is converted element by element with str().
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)

# OverallCond, year sold and month sold are cast to strings so they are
# treated as categorical features from here on.
for as_category in ('OverallCond', 'YrSold', 'MoSold'):
    all_data[as_category] = all_data[as_category].astype(str)

**Categorical Features and Label Encoding**

In [None]:
cat_cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

In [None]:
# Replace each column listed in cat_cols by integer codes from a LabelEncoder
# fitted on that column's own values (a fresh encoder per column).
for col in cat_cols:
    column_values = list(all_data[col].values)
    lbl = LabelEncoder()
    lbl.fit(column_values)
    all_data[col] = lbl.transform(column_values)

# Encoding replaces values in place, so the frame's shape is printed as a check.
print('Shape all_data: {}'.format(all_data.shape))

**Create New Features**

In [None]:
feat = all_data  # short alias; every assignment below adds a column to all_data itself

# Basement + first floor + second floor area.
feat['TotalSF'] = feat['TotalBsmtSF'] + feat['1stFlrSF'] + feat['2ndFlrSF']

# Sum of the construction year and the remodel year.
feat['YrBltRemod'] = feat['YearBuilt'] + feat['YearRemodAdd']

# Bathrooms above ground and in the basement; a half bath counts as 0.5.
feat['TotalBathrooms'] = (feat['FullBath'] + (0.5 * feat['HalfBath']) +
                          feat['BsmtFullBath'] + (0.5 * feat['BsmtHalfBath']))

# All porch areas plus the wood deck.
feat['TotalPorchSf'] = (feat['OpenPorchSF'] + feat['3SsnPorch'] +
                        feat['EnclosedPorch'] + feat['ScreenPorch'] +
                        feat['WoodDeckSF'])

# Living area as a fraction of the lot area.
feat["LivLotRatio"] = feat['GrLivArea'] / feat['LotArea']

# Wood deck plus open, enclosed and screen porch (3SsnPorch is not included here).
feat["TotalOutsideSF"] = (feat['WoodDeckSF'] + feat['OpenPorchSF'] +
                          feat['EnclosedPorch'] + feat['ScreenPorch'])


# Handling Skewed Features

In [None]:
# Every column whose dtype is not 'object' counts as numeric here.
column_dtypes = all_data.dtypes
numeric_feats = column_dtypes[column_dtypes != "object"].index

# Sample skewness of each numeric column (NaNs ignored), most positive first.
skewed_feats = (
    all_data[numeric_feats]
    .apply(lambda values: skew(values.dropna()))
    .sort_values(ascending=False)
)
print("Skewed features :\n")

# One-column table of the skew values; the cell shows the ten most skewed.
skewness = pd.DataFrame()
skewness['Skew_value'] = skewed_feats
skewness.head(10)

In [None]:
skewed_features = ['MiscVal','PoolArea','LotArea','LowQualFinSF','3SsnPorch','LandSlope','KitchenAbvGr','BsmtFinSF2','EnclosedPorch','ScreenPorch']

**Visualize the top 10 skewed features distribution with a histogram and maximum likelihood gaussian distribution fit:**

In [None]:

def CategoryFeaturePlot(columns):
    """Plot the distribution of each given ``all_data`` column with a fitted normal curve.

    Note that this definition reuses the name of the scatter-plot helper defined
    earlier in the notebook and replaces it from this cell onwards.

    Parameters
    ----------
    columns : iterable of str
        Names of columns of the module-level ``all_data`` to plot.

    Notes
    -----
    Panels are placed five per row. The grid has at least 2 rows (the layout
    this function always used) and grows when more than 10 columns are passed,
    instead of raising on the 11th subplot.
    """
    columns = list(columns)
    panels_per_row = 5
    # Ceiling division, but never fewer than the 2 rows of the original layout.
    n_rows = max(2, -(-len(columns) // panels_per_row))

    fig = plt.figure(figsize=(23, 7))
    for i, col in enumerate(columns):
        plt.subplot(n_rows, panels_per_row, i + 1)
        sns.distplot(all_data[col], fit=norm, kde=False)

    # Lay the figure out once, after all panels exist, rather than once per panel.
    if columns:
        plt.tight_layout()
    fig.show()

In [None]:
CategoryFeaturePlot(skewed_features)

**Normalize skewed features with boxcox1p**

In [None]:
# Keep only the features whose absolute skew exceeds 0.70.
# The row filter must be a boolean Series built from the Skew_value column:
# indexing the frame with a boolean *frame* (skewness[abs(skewness) > 0.70])
# merely blanks the failing cells to NaN and keeps every row, so every numeric
# feature ended up being transformed, not just the skewed ones.
skewness = skewness[abs(skewness['Skew_value']) > 0.70]

skewed_features = skewness.index
lam = 0.15  # fixed Box-Cox lambda applied to every selected feature
for feat in skewed_features:
    # boxcox1p computes the Box-Cox transform of 1 + x.
    all_data[feat] = boxcox1p(all_data[feat], lam)
    

**Visualize the distributions after normalization**

In [None]:
normalized_features = ['MiscVal','PoolArea','LotArea','LowQualFinSF','3SsnPorch','LandSlope','KitchenAbvGr','BsmtFinSF2','EnclosedPorch','ScreenPorch']

In [None]:
CategoryFeaturePlot(normalized_features)

**Encoding Categorical features with dummy encoding**

In [None]:
all_data = pd.get_dummies(all_data)
print("Shape of all data : {}".format(all_data.shape))

In [None]:
all_data.head()

# Model Building and Tuning

In [None]:
# Remove any duplicate column names
all_data = all_data.loc[:,~all_data.columns.duplicated()]

**Training and Test Data split**

In [None]:
# all_data holds the training rows first, followed by the test rows, so the
# number of training labels marks where to cut it back into the two sets.
n_train = len(train_labels)
X = all_data.iloc[:n_train, :]
X_test = all_data.iloc[n_train:, :]
X.shape, train_labels.shape, X_test.shape

In [None]:
#((1458, 228), (1458,), (1459, 228))

In [None]:
#Validation function
kf = KFold(n_splits=12, random_state=42, shuffle=True)

def rmse_cv(model, X=X):
    """Return the per-fold RMSE of ``model`` from cross-validation on the training labels.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X : feature matrix, optional
        Defaults to the module-level ``X`` as it was when this function was defined.

    Returns
    -------
    Array with one RMSE per fold of the module-level ``kf`` splitter.
    """
    # The scorer returns negated MSE values, so flip the sign before the square root.
    neg_mse_per_fold = cross_val_score(model, X, train_labels,
                                       scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# Define error metrics
def rmsle(y, y_pred):
    """Return the root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is taken here; the notebook calls this with targets and
    predictions that are already on the log1p scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)


In [None]:
best_alpha = 0.00099
model_lasso = make_pipeline(RobustScaler(),Lasso(alpha=best_alpha, max_iter=50000))

In [None]:
model_lgb = lgb.LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       subsample=0.8 , 
                       subsample_freq=4,  
                       bagging_seed=8,
                       colsample_bytree=0.2, 
                       feature_fraction_seed=8,
                       min_child_weight=0.001, 
                       verbose=-1,
                       random_state=42)

In [None]:
# XGBoost Regressor

# XGBoost regressor with squared-error objective: 7200 boosting rounds at
# learning rate 0.05, trees up to depth 6, L1/L2 regularisation via
# reg_alpha / reg_lambda.
# NOTE(review): both `seed=27` and `random_state=42` are passed. They look like two
# settings for the same random seed; which one takes effect presumably depends on the
# installed xgboost version - confirm and keep only one.
# NOTE(review): `nthread` appears to be the older spelling of `n_jobs` - verify it is
# still accepted by the xgboost version in use.
model_xgb = xgb.XGBRegressor (learning_rate=0.05,
                       n_estimators=7200,
                       max_depth=6,
                       min_child_weight=1.5,
                       gamma=0.0,
                       subsample=0.2,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.9,
                       reg_lambda=0.6,
                       random_state=42)  


In [None]:
# Support Vector Regressor
model_svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

In [None]:
# Gradient Boosting Regressor
model_gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)  

In [None]:
# Random Forest Regressor
model_rf = RandomForestRegressor(n_estimators=1200,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

**Model Stacking**

In [None]:

# Base learners whose out-of-fold predictions feed the meta-regressor.
regressors = (model_lasso, model_lgb, model_xgb, model_svr, model_gbr, model_rf)

# Stacked ensemble: the lasso pipeline is reused as the meta-regressor and, with
# use_features_in_secondary=True, it sees the original features next to the base
# predictions. random_state pins the shuffled internal cross-validation so the
# stacked model is reproducible between runs, like the other seeded models here.
stacking_reg = StackingCVRegressor(regressors=regressors,
                                   meta_regressor=model_lasso,
                                   use_features_in_secondary=True,
                                   random_state=42)
                             

In [None]:
scores = {}

score = rmse_cv(model_lasso)
print("model_lasso: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['model_lasso'] = (score.mean(), score.std())

In [None]:
# Cross-validated RMSE of the LightGBM model, stored as (mean, std).
# `scores` is deliberately NOT re-created here: resetting it to {} in this cell
# threw away the lasso entry recorded by the previous cell.
score = rmse_cv(model_lgb)
print("model_lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['model_lgb'] = (score.mean(), score.std())

In [None]:
score = rmse_cv(model_xgb)
print("model_xgboost: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['model_xgb'] = (score.mean(), score.std())

In [None]:
score = rmse_cv(model_svr)
print("model_SVR: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['model_svr'] = (score.mean(), score.std())

In [None]:
score = rmse_cv(model_rf)
print("model_rf: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['model_rf'] = (score.mean(), score.std())

In [None]:
score = rmse_cv(model_gbr)
print("model_gbr: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['model_gbr'] = (score.mean(), score.std())

**Model fitting and Prediction**

In [None]:

# The stacked model is fitted on plain NumPy arrays, so it is also given NumPy
# arrays at prediction time. Mixing an ndarray fit with DataFrame predictions makes
# the base learners see inputs with feature names they were not trained with.
X_array = np.array(X)
X_test_array = np.array(X_test)

stacked_reg_model_fit = stacking_reg.fit(X_array, np.array(train_labels))
stacked_train_pred = stacked_reg_model_fit.predict(X_array)
# Test predictions are mapped back from the log1p scale and rounded down.
stacked_pred = np.floor(np.expm1(stacked_reg_model_fit.predict(X_test_array)))
print(rmsle(train_labels, stacked_train_pred))

In [None]:
lasso_model_fit = model_lasso.fit(X, train_labels)
lasso_train_pred = lasso_model_fit.predict(X)
lasso_pred = np.floor(np.expm1(lasso_model_fit.predict(X_test)))
print(rmsle(train_labels, lasso_train_pred))

In [None]:
lgb_model_fit = model_lgb.fit(X, train_labels)
lgb_train_pred = lgb_model_fit.predict(X)
lgb_pred = np.floor(np.expm1(lgb_model_fit.predict(X_test)))
print(rmsle(train_labels, lgb_train_pred))

In [None]:
xgb_model_fit = model_xgb.fit(X, train_labels)
xgb_train_pred = xgb_model_fit.predict(X)
xgb_pred = np.floor(np.expm1(xgb_model_fit.predict(X_test)))
print(rmsle(train_labels, xgb_train_pred))

In [None]:
svr_model_fit = model_svr.fit(X, train_labels)
svr_train_pred = svr_model_fit.predict(X)
svr_pred = np.floor(np.expm1(svr_model_fit.predict(X_test)))
print(rmsle(train_labels, svr_train_pred))

In [None]:

rf_model_fit = model_rf.fit(X, train_labels)
rf_train_pred = rf_model_fit.predict(X)
rf_pred = np.floor(np.expm1(rf_model_fit.predict(X_test)))
print(rmsle(train_labels, rf_train_pred))

In [None]:
gbr_model_fit = model_gbr.fit(X, train_labels)
gbr_train_pred = gbr_model_fit.predict(X)
gbr_pred = np.floor(np.expm1(gbr_model_fit.predict(X_test)))
print(rmsle(train_labels, gbr_train_pred))

**Blended rmsle**

In [None]:
# Training-set score of the blend. The per-model weights are the same ones used to
# build `ensemble` for the submission, so the number printed here describes the blend
# that is actually submitted (previously a different set of weights was scored).
# The score is computed on the log1p scale; the submission blends the
# back-transformed prices.
blended_train_pred = ((stacked_train_pred * 0.30) + (lgb_train_pred * 0.30) + (gbr_train_pred * 0.25) +
                      (rf_train_pred * .05) + (svr_train_pred * .04) + (lasso_train_pred * .03) +
                      (xgb_train_pred * .03))
print('Blended rmsle score : {}'.format(rmsle(train_labels, blended_train_pred)))

In [None]:
ensemble = ((stacked_pred * 0.30) + (lgb_pred * 0.30) + (gbr_pred * 0.25)  + (rf_pred * .05) + 
            (svr_pred * .04) + (lasso_pred * .03) + (xgb_pred * .03))


In [None]:
submission = pd.DataFrame({'Id': test_id, 'SalePrice': ensemble})

submission.to_csv('submission.csv',index=False)

print("Submitted successfully!")