In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Summary

If you like this kernel then please upvote. If you fork it then definitely upvote. You will spot a number of functions and a class for handling the features, as well as the use of pipelines to avoid accidentally introducing data leakage, especially during cross validation and grid/randomized search. 

Please leave a comment if you have suggestions or feedback. The model can definitely be improved by smart choices of feature engineering. 


In [None]:
# python version
import sys
assert sys.version_info > (3,5)

# sklearn version
import re
import sklearn
# Compare the numeric (major, minor) components: comparing the raw version
# strings is lexicographic, so e.g. '0.3' > '0.20' would wrongly pass.
# KNNImputer and StackingRegressor, both used further down, need scikit-learn >= 0.22.
_sk_version = tuple(int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
assert _sk_version >= (0, 22)

# common imports
import os
import pandas as pd
import numpy as np

#visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# display visuals in the notebook
%matplotlib inline

# handle internal library warnings
import warnings
warnings.filterwarnings(action='ignore',message='')

# consistent plot size
from pylab import rcParams
rcParams['figure.figsize'] = 12,5
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['axes.labelsize'] = 12


## Load the data

In [None]:
house_train_full =pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
house_test =pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

## Part 1: Data Exploration

In [None]:
# view all the columns of the dataframe
pd.options.display.max_columns = None

In [None]:
# inspect the first few rows
house_train_full.head(10)

In [None]:
#house_train_full.info()

*There are too many features in the dataset. Efficient approach to check for the missing values would be to define a function to return the percentage of null values in each predictor*

In [None]:
# function to return the fraction of null values in a list of features
def percent_na(feature, df=None):
    """Return the fraction (0.0-1.0) of missing values for the given column(s).

    Parameters
    ----------
    feature : str or list of str
        A single column label or a list of column labels of ``df``.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``house_train_full``,
        looked up when the function is called rather than when it is defined.

    Returns
    -------
    pandas.Series or float
        One fraction per column when ``feature`` is a list (an empty Series for
        an empty list), or a single number when ``feature`` is one label.
    """
    if df is None:
        df = house_train_full
    # The previous `for val in feature: return ...` loop always returned on its
    # first iteration, so it only added a silent `None` result for empty input.
    return df[feature].isna().sum() / len(df)

*Print the percentage null values of each of the features in the descending order. Below it prints the top 5 features with missing values*

In [None]:
percent_na(list(house_train_full.columns),house_train_full).sort_values(ascending=False).head()

*Four features have more than 80% missing values and FireplaceQu has close to 50% missing values. Generally, quality features appear to have the most missing values, especially for features which are not very common in all houses, e.g. fence, pool.*
* PoolQC - Pool Quality
* MiscFeature - Contains features not covered in other categories
* Alley - Type of alley access
* Fence -  Fence Quality
* FireplaceQu - Fireplace Quality

Lot Frontage has 17% missing values. However it could be a very important predictor for house sale price. 
Rest of the features have nominal (<5%) or no missing values.
 


*Check the distribution of the target value SalePrice in the training set*
*The distribution of the SalePrice is fairly normal with mean around 200,000. 500,000 and above seems to be outliers at first sight. We will check this using the box plot.*


In [None]:
plt.hist(house_train_full['SalePrice'],bins=30)
plt.title('Histogram of House Sale Price')
plt.xlabel('Sale Price');

In [None]:
# create boxplot of SalePrice
plt.boxplot(house_train_full['SalePrice'],vert=False)
plt.title('Boxplot of House Sale Price');

In [None]:
# log transform of the sale price and check the histogram and boxplot
plt.hist(np.log(house_train_full['SalePrice']),bins=30)
plt.title('Histogram of House Sale Price (Log transformed)')
plt.xlabel('Sale Price (log transformed)');

In [None]:
# test for normality of the log transformed sale price
from statsmodels.graphics.gofplots import qqplot

qqplot(np.log(house_train_full['SalePrice']),line='s')
plt.title('Quantile-Quantile Plot Log Transformed Sale Price');

From the Q-Q plot it is clear that the log transformed SalePrice follows normal distribution. Due to outliers, there is a deviation from the normal probability plot. 

In [None]:
# create boxplot of the log transformed SalePrice
plt.boxplot(np.log(house_train_full['SalePrice']),vert=False)
plt.title('Boxplot of House Sale Price (log transformed)');

In [None]:
# function to return the requested quantiles of a numerical feature
def quantile_num(df,num_feature,quant):
    """Compute several quantiles of one column.

    ``quant`` is an iterable of quantile levels in [0, 1]; the result is a list
    with ``np.quantile(df[num_feature], level)`` for each level, in order.
    """
    return [np.quantile(df[num_feature], level) for level in quant]

In [None]:
quantile_num(df=house_train_full,num_feature='SalePrice',quant = [0.5,0.75,0.9,0.95,0.99])

*99th quantile is 442567. 
 95th quantile is 326099. 
I will next check what are the houses above these percentiles.*

In [None]:
house_train_full[house_train_full['SalePrice']>np.quantile(house_train_full['SalePrice'],0.99)]

In [None]:
house_train_full.describe().transpose()

*Above 700,000 is mentioned as Abnormal Sale condition (trade,foreclosure or short sale) and is the only one with a pool and a fence. The zone of this property is Residential Low Density. 
More information to understand the various attributes is in the data description. So better to read it.*

In [None]:
# uncomment to read the file content

#with  open('/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt') as file:
 #   file_content = file.read()
  #  print (file_content)

Reading the data description and with a bit of experience, the following categorical features could be important attributes in determining the price of the house. While at this early stage of exploration, it could be all wrong :-)
- Neighborhood
- Proximity to various conditions (Condition 1) ... Condition 2 is also very similar. I will check it next
- MSZoning (zoning classification of the sale , agricultural, high rise etc. ) 
- OverallCond (overall condition of the house)
- MSSubClass (the type of dwelling involved) 

In [None]:
# check the abnormal sales neighborhoods
house_train_full[house_train_full['SaleCondition']=='Abnorml']['Neighborhood'].value_counts().sort_values(ascending=False)

In [None]:
# check the neighborhood of the houses in sale price above the 99 percentile
house_train_full[house_train_full['SalePrice']>np.quantile(house_train_full['SalePrice'],0.99)]['Neighborhood'].value_counts().sort_values(ascending=False)

A better understanding of the neighborhoods can be gained from the map of Ames city. Certain neighborhoods are deemed to be more expensive than others. 

It is very effort intensive to check the value count of each categorical feature one at a time. I will define a function to separate all the categorical features and numerical features and explore them when needed. 

In [None]:
# function to split the given features into categorical and numerical ones
def type_features(df,features):
    """Partition ``features`` by dtype of the matching column in ``df``.

    Columns whose dtype is object ('O') count as categorical; everything else
    counts as numerical. Returns ``(categorical, numerical)`` as two lists that
    keep the order in which the features were given.
    """
    cat_features, num_features = [], []
    for column in features:
        bucket = cat_features if df[column].dtype == 'O' else num_features
        bucket.append(column)
    return (cat_features,num_features)

In [None]:
categorical_features, numerical_features = type_features(house_train_full,house_train_full.columns)
len(categorical_features)

There are 43 categorical features in the dataset

**Check the correlation of the numerical features with Sale Price of the house**

In [None]:
# restrict to numeric columns explicitly: corr() on a frame that still holds
# object columns raises on recent pandas versions instead of skipping them
house_train_full.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending=False).head(15)

In [None]:
# restrict to numeric columns explicitly: corr() on a frame that still holds
# object columns raises on recent pandas versions instead of skipping them
house_train_full.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending=False).tail(15)

In [None]:
from pandas.plotting import scatter_matrix
attributes = ['SalePrice','OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF']
scatter_matrix(house_train_full[attributes],figsize=(15,8),grid=True);

*Check the value count of each categorical features*

In [None]:
for features in categorical_features:
    print (house_train_full[features].value_counts())

What is interesting is that many of the quality features, for instance BsmtQual (Basement quality) is a categorical feature while the overall quality is a numerical feature. Perhaps good to have all quality labels as categorical.

*Visualize the sale condition - Normal, Abnormal etc.*

In [None]:
# pass the column as x=...: newer seaborn releases no longer accept it positionally
sns.countplot(x='SaleCondition',hue='SaleType',data=house_train_full)
plt.legend(loc='upper right');

In [None]:
# pass the column as x=...: newer seaborn releases no longer accept it positionally.
# No legend call here: without a hue there are no labelled artists to list.
sns.countplot(x='SaleCondition',data=house_train_full);

This is what we have from the sale type
SaleType: Type of sale
		
       WD 	Warranty Deed - Conventional
       CWD	Warranty Deed - Cash
       VWD	Warranty Deed - VA Loan
       New	Home just constructed and sold
       COD	Court Officer Deed/Estate
       Con	Contract 15% Down payment regular terms
       ConLw	Contract Low Down payment and low interest
       ConLI	Contract Low Interest
       ConLD	Contract Low Down
       Oth	Other

There is more clarity when I look at the description of the sale condition
SaleCondition: Condition of sale

       Normal	Normal Sale
       Abnorml	Abnormal Sale -  trade, foreclosure, short sale
       AdjLand	Adjoining Land Purchase
       Alloca	Allocation - two linked properties with separate deeds, typically condo with a garage unit	
       Family	Sale between family members
       Partial	Home was not completed when last assessed (associated with New Homes)

In order for the model to generalize well, the sample selection for training the model should be the near representative of the data set. Sale condition could be one of the feature to bin the data. However, lets explore the overall quality distribution in the dataset. 

In [None]:
# pass the column as x=...: newer seaborn releases no longer accept it positionally
sns.countplot(x='OverallQual',data=house_train_full,palette='viridis')
plt.title('House Count per Overall Quality');


Wow, this looks more like a bell shaped with maximum overall quality between 4 to 8. Visualize the Sale Price vs the Overall Quality more closely than done in the scatter matrix

In [None]:
# x/y are given by keyword: newer seaborn releases no longer accept them positionally.
# No legend call here: the plot has no labelled artists, so a legend would be empty.
sns.swarmplot(x='OverallQual',y='SalePrice',data=house_train_full)
plt.title('Sale Price vs Overall Quality');

Things are becoming little clear now.

Most expensive houses, SalePrice above 99 percentile, all have overall quality rating 8 and above. Most expensive house also has the best quality rating. However, there are also a few with 10 rating but with median price range. The least expensive have ratings 1 to 4. There are more sales of houses with the quality rating between 5 to 9. Both 3 and 4 quality rating have houses with median price range. This could be associated with the neighborhood the houses are located.


In [None]:
# x/y are given by keyword: newer seaborn releases no longer accept them positionally
sns.swarmplot(x='Neighborhood',y='SalePrice',data=house_train_full[house_train_full['OverallQual']==10])
plt.title('Houses with Quality Rating 10, SalePrice vs Neighborhood');


So, Edwards locality has least expensive housing with highest quality. Would be interesting to check which sale condition these houses fall into ---> new , renovated etc. 

We will leave it at this point. What is clear and is expected to be true for other quality ratings as well that Neighborhood is an important predictor for Sale Price.

In [None]:
# Boxplot SalePrice vs Neighborhood
plt.figure(figsize=(20,10))
# x/y are given by keyword: newer seaborn releases no longer accept them positionally
sns.boxplot(x='Neighborhood',y='SalePrice',data=house_train_full,palette='viridis')
plt.title('SalePrice vs Neighborhood')
# tight_layout takes keyword-only options in recent matplotlib, so the positional
# `True` is dropped; it runs after the title so the title is included in the layout
plt.tight_layout();

Clearly, neighborhood is a strong predictor of the house price

In [None]:
# Visualize the boxplot sorted by median SalePrice and plotted in descending order

# create the sorted dataframe: one column of sale prices per neighborhood
# (group by the plain label, not a one-element list, so the keys stay strings
# rather than 1-tuples on recent pandas versions)
grouped = house_train_full.groupby('Neighborhood')
df = pd.DataFrame({col:vals['SalePrice'] for col,vals in grouped})

# order the columns from the highest to the lowest median price
meds = df.median().sort_values(ascending=False)
df = df[meds.index]

# generate boxplot
plt.figure(figsize=(20,10))

df.boxplot(grid=False)

plt.title('SalePrice vs Neighborhood (sorted based on descending order of median price)')
plt.xlabel('Ames Neighborhoods')
plt.ylabel('House Sale Prices')
# tight_layout takes keyword-only options in recent matplotlib, so the positional
# `True` is dropped; it runs last so title and axis labels are included in the layout
plt.tight_layout();

Now, we see the localities on the x-axis from most expensive to least expensive (left to right) based on the median house price

## Part 2: Data Preparation

In [None]:
#create a copy of the original train set
housing = house_train_full.copy()

In [None]:
# read the categorical and the numerical features
categorical_features, numerical_features = type_features(housing,housing.columns)
len(categorical_features)

In [None]:
percent_na(list(housing.columns),housing).sort_values(ascending=False).head(10)

In [None]:
# drop list of columns
drop_list_1 = ['Id','PoolQC','MiscFeature','Alley','Fence']

In [None]:
# function to drop the listed columns from a dataframe
def drop_feature(df,drop_list):
    """Remove every column named in ``drop_list`` from ``df`` in place.

    The columns are dropped one after another, so a missing label raises
    ``KeyError`` after the earlier labels have already been removed.
    Returns the same (mutated) ``df`` object.
    """
    for column in drop_list:
        df.drop(columns=column,inplace=True)
    return df

In [None]:
save_id = house_test.copy()

In [None]:
# drop the features from the training set and the test set
drop_feature(housing,drop_list_1)
drop_feature(house_test,drop_list_1)

In [None]:
housing.head()

In [None]:
#update the feature list
categorical_features, numerical_features = type_features(housing,housing.columns)
len(categorical_features)

OverallQual is a strong predictor of the housing price. It takes the integer values from 1 to 10. It is better to convert this feature into a categorical column.

In [None]:
num_to_cat_list = ['MSSubClass','OverallQual','OverallCond','YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold']

In [None]:
# turn the selected numerical features into string-valued (categorical) ones
def to_category(df,num_list):
    """Cast each column named in ``num_list`` to string, in place.

    Returns the same (mutated) ``df`` object.
    NOTE(review): ``astype('str')`` also turns missing values into the literal
    text 'nan', so they no longer count as missing afterwards.
    """
    for column in num_list:
        as_text = df[column].astype('str')
        df[column] = as_text
    return df

In [None]:
to_category(housing,num_to_cat_list)
to_category(house_test,num_to_cat_list)

In [None]:
# update the categorical and the numerical feature list
categorical_features,numerical_features = type_features(housing,housing.columns)
len(categorical_features)

In [None]:
housing['OverallQual'].value_counts()

In order that the model sees the houses of various quality, I would use Stratified split on the housing dataset into train and validation test set. 

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

strat_split = StratifiedShuffleSplit(n_splits=10,test_size=0.2,random_state=42)
# Every iteration overwrites the previous result, so only the last of the
# 10 generated splits is kept.
for train_index,valid_index in strat_split.split(housing,housing['OverallQual']):
    # split() yields positional row numbers, so select with .iloc; the
    # label-based .loc is only equivalent while the index is 0..n-1
    strat_house_train = housing.iloc[train_index]
    strat_house_valid = housing.iloc[valid_index]


In [None]:
len(strat_house_train)

In [None]:
strat_house_train.head()

In [None]:
# separate the SalePrice, the final label to be predicted
X_train = strat_house_train.drop('SalePrice',axis=1)
#evaluation is based on the log transformed value of the SalePrice
y_train = np.log(strat_house_train['SalePrice'])

X_valid = strat_house_valid.drop('SalePrice',axis=1)
y_valid = np.log(strat_house_valid['SalePrice'])

X_test = house_test.copy()


Preparation step sequence
- create a numerical feature dataset 
- impute the missing values 
- apply transformation on the selected features
- scale the features


In [None]:
# reapply the function to extract the categorical and the numeric features
categorical_features,numerical_features = type_features(X_train,X_train.columns)

In [None]:
X_train_num = X_train[numerical_features]
X_train_num.head()

In [None]:
X_train_cat = X_train[categorical_features]
X_train_cat.head()

lets deal with the numerical dataset first. I would define a pipeline and apply all the steps on the validation dataset

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [None]:
imputer = KNNImputer()
X_train_num_imp = imputer.fit_transform(X_train_num)

Create a custom transformer to log transform the entire numerical features


In [None]:
scalar = StandardScaler()
X_train_num_prep = scalar.fit_transform(X_train_num_imp)

In [None]:
X_train_num_prep

In [None]:
# Reconstruct the numerical features dataframe
X_train_num_prepared = pd.DataFrame(X_train_num_prep,
                                   columns=list(X_train_num.columns),index=X_train_num.index)
X_train_num_prepared.head()

Create a pipeline for the numerical features. The pipeline would be used to transform the validation & test set 

In [None]:
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([('num_imputer',KNNImputer()),
                         ('std_scalar',StandardScaler()),
                         ])

In [None]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

In [None]:
# `sparse` was renamed to `sparse_output` in newer scikit-learn releases and
# later removed; try the new name first and fall back for older versions.
try:
    hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    hot_encoder = OneHotEncoder(sparse=False)
ord_encoder = OrdinalEncoder()

before we can impute the missing values in the categorical feature set, the non missing values should be encoded

In [None]:
# define a function to ordinal-encode the non-missing values of one column
# (missing values are left in place so they can be imputed afterwards)
def encode(df):
    """Ordinal-encode the non-null entries of ``df`` in place and return it.

    ``df`` is used like a single column (a pandas Series): its null entries
    are dropped, the remaining values are encoded with the notebook-level
    ``ord_encoder``, and the codes are written back at the non-null positions.
    Null entries stay untouched. The same object that was passed in is returned.

    NOTE(review): ``ord_encoder.fit_transform`` refits the shared encoder on
    every call, so the value-to-code mapping depends only on the data passed
    in this call. Encoding train, validation and test columns separately can
    therefore give the same category different codes - confirm this is intended.
    """
    # keep only the non null values
    arr = np.array(df.dropna())
    # reshape to a single column, the 2-D layout the encoder expects
    arr_reshape = arr.reshape(-1,1)
    # fit the encoder on these values and encode them in one step
    arr_encoded = ord_encoder.fit_transform(arr_reshape)
    # write the codes back at the non-null positions (squeeze drops the column axis)
    df.loc[df.notnull()] = np.squeeze(arr_encoded)
    return df


In [None]:
for cat_feature in categorical_features:
    # store the returned column explicitly instead of relying on encode()
    # mutating a view of X_train_cat (which does not hold under pandas Copy-on-Write)
    X_train_cat[cat_feature] = encode(X_train_cat[cat_feature])

In [None]:
X_train_cat.head()

Now we can use the KNN imputer on the missing values in the categorical dataset

In [None]:
cat_imputer = KNNImputer()
X_train_cat_imp = cat_imputer.fit_transform(X_train_cat)

Construct the dataframe with the categorical features

In [None]:
# Reconstruct the categorical features dataframe
X_train_cat_prepared = pd.DataFrame(X_train_cat_imp,
                                   columns=list(X_train_cat.columns),index=X_train_cat.index)
X_train_cat_prepared.head()

In [None]:
#np.max(X_train_cat_prepared)

Pipeline to deal with the categorical 

In [None]:
cat_pipeline = Pipeline([('cat_imputer',cat_imputer),
                         ('cat_std_sclar',StandardScaler())])

Create a full pipeline for the numerical and categorical features combined. For this we will make use of the ColumnTransformer from the scikit-learn compose package, available in version 0.20 and above.

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = numerical_features
cat_attribs = categorical_features

full_pipeline = ColumnTransformer([
    ('num',num_pipeline,num_attribs),
    ('cat',cat_pipeline,cat_attribs),
])

Summary of the data preparation step applied on the training data 
- train dataset was split into two dataset containing the numerical and categorical features
- the numerical dataset was imputed first using KNN and then standard scalar was applied 
- the categorical dataset was first encoded using the custom encode function and then imputed using KNN

In [None]:
X_train_num_prepared.head()

In [None]:
X_train_cat_prepared.head()

In [None]:
X_train_prepared = pd.concat([X_train_num_prepared,X_train_cat_prepared],axis=1)

In [None]:
X_train_prepared.head()

In [None]:
X_train_num = X_train[numerical_features]
# explicit copy: the encoded columns are assigned into this frame below
X_train_cat = X_train[categorical_features].copy()

for cat_feature in categorical_features:
    # store the returned column explicitly instead of relying on encode()
    # mutating a view of X_train_cat (which does not hold under pandas Copy-on-Write)
    X_train_cat[cat_feature] = encode(X_train_cat[cat_feature])
    
warnings.filterwarnings(action='ignore',message='')    

# fit the numerical pipeline on the training data and rebuild a labelled frame
X_train_num_prep = num_pipeline.fit_transform(X_train_num)
X_train_num_prepared = pd.DataFrame(X_train_num_prep,
                                   columns=list(X_train_num.columns),index=X_train_num.index)

# fit the categorical pipeline on the encoded training data
X_train_cat_imp = cat_pipeline.fit_transform(X_train_cat)
X_train_cat_prepared = pd.DataFrame(X_train_cat_imp,
                                    columns=list(X_train_cat.columns),index=X_train_cat.index)

X_train_prepared = pd.concat([X_train_num_prepared,X_train_cat_prepared],axis=1)

### Prepare the validation dataset 

In [None]:
X_valid_num = X_valid[numerical_features]
# explicit copy: the encoded columns are assigned into this frame below
X_valid_cat = X_valid[categorical_features].copy()

for cat_feature in categorical_features:
    # Encode with the categories seen in the TRAINING data. Calling encode()
    # here would refit the ordinal encoder on the validation values, so the
    # same category could get a different code than it had during training.
    # Sorted order matches how the ordinal encoder numbers the training categories.
    train_categories = sorted(X_train[cat_feature].dropna().unique())
    category_codes = {category: float(code) for code, category in enumerate(train_categories)}
    # missing values and categories never seen in training become NaN and
    # are filled by the imputer inside cat_pipeline
    X_valid_cat[cat_feature] = X_valid_cat[cat_feature].map(category_codes)
    
# transform only: both pipelines were fitted on the training data
X_valid_num_prep = num_pipeline.transform(X_valid_num)
X_valid_num_prepared = pd.DataFrame(X_valid_num_prep,
                                   columns=list(X_valid_num.columns),index=X_valid_num.index)

X_valid_cat_imp = cat_pipeline.transform(X_valid_cat)
X_valid_cat_prepared = pd.DataFrame(X_valid_cat_imp,
                                    columns=list(X_valid_cat.columns),index=X_valid_cat.index)

X_valid_prepared = pd.concat([X_valid_num_prepared,X_valid_cat_prepared],axis=1)

In [None]:
X_valid_prepared.head()

In [None]:
### Prepare the test dataset
X_test_num = X_test[numerical_features]
# explicit copy: the encoded columns are assigned into this frame below
X_test_cat = X_test[categorical_features].copy()

for cat_feature in categorical_features:
    # Encode with the categories seen in the TRAINING data. Calling encode()
    # here would refit the ordinal encoder on the test values, so the same
    # category could get a different code than it had during training.
    # Sorted order matches how the ordinal encoder numbers the training categories.
    train_categories = sorted(X_train[cat_feature].dropna().unique())
    category_codes = {category: float(code) for code, category in enumerate(train_categories)}
    # missing values and categories never seen in training become NaN and
    # are filled by the imputer inside cat_pipeline
    X_test_cat[cat_feature] = X_test_cat[cat_feature].map(category_codes)
    
# transform only: both pipelines were fitted on the training data
X_test_num_prep = num_pipeline.transform(X_test_num)
X_test_num_prepared = pd.DataFrame(X_test_num_prep,
                                   columns=list(X_test_num.columns),index=X_test_num.index)

X_test_cat_imp = cat_pipeline.transform(X_test_cat)
X_test_cat_prepared = pd.DataFrame(X_test_cat_imp,
                                   columns=list(X_test_cat.columns),index=X_test_cat.index)

X_test_prepared = pd.concat([X_test_num_prepared,X_test_cat_prepared],axis=1)

In [None]:
X_test_prepared.head()

In [None]:
print(X_train_prepared.shape,X_valid_prepared.shape,X_test_prepared.shape)

In [None]:
pd.Series(X_train_prepared.columns)[5:30]

There are many columns which could negatively impact the model performance due to multi-collinearity. To deal with it feature engineering and subsequent removal of these feature is helpful. I created a custom class to add additional attributes. There is a lot of scope of improvement as far as feature engineering is concerned. Below is based on the hunch that I get by quickly looking into the feature set. This is exactly where domain experience plays a role. 

***This is a late realization and should have been part of the earlier pipeline. This can be done by addressing it earlier. I leave it as it is now for the moment.*** :-)

My model so far best performed with all the feature set and no removal of the observations from the dataset. However, getting rid of some of the outliers would definitely help. 

In [None]:
## Custom Attribute Class Transformer
from sklearn.base import BaseEstimator,TransformerMixin

BsmtFinSF1_idx, BsmtFinSF2_idx ,BsmtUnfSF_idx, TotalBsmtSF_idx = 3,4,5,6
FstFlrSF_idx, SndFlrSF_idx, GrLivArea_idx = 7,8,10
BsmtFullBath_idx, BsmtHalfBath_idx = 11,12
FullBath_idx, HalfBath_idx = 13,14
BedroomAbvGr_idx, KitchenAbvGr_idx, TotRmsAbvGrd_idx = 15,16,17
GarageCars_idx, GarageArea_idx = 19,20
OverallQual_idx, OverallCond_idx =41,42
BsmtQual_idx, BsmtCond_idx = 53,54 
GarageQual_idx, GarageCond_idx = 68,69



class CustomAttribsAdder(BaseEstimator,TransformerMixin):
    """Append hand-crafted combination features to a 2-D feature array.

    Columns are addressed through the module-level ``*_idx`` constants, so the
    input must use the column order those constants were written for.
    NOTE(review): the indices are hard-coded - verify them against the actual
    column order whenever the upstream feature list changes.

    Parameters
    ----------
    trans : bool, default True
        When False, ``transform`` returns its input unchanged.
    """
    def __init__(self,trans=True):
        self.trans = trans
    def fit(self,X,y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self
    def transform(self,X):
        """Return ``X`` with two re-weighted columns and nine extra columns appended.

        The GrLivArea and OverallQual columns are multiplied by 3, then the
        derived columns are appended in this order: overall_quality,
        garage_quality, bsmt_quality, rooms_above_ground, full_sqft, bsmt_fin,
        bsmt_unfin, total_bath_grd, total_bath_bsmt.
        """
        if self.trans:
            # Work on a copy: the re-weighting below assigns into the array, and
            # doing that on the caller's array would compound the x3 factor every
            # time transform is called on the same data.
            X = np.array(X)
            
            X[:,GrLivArea_idx] = X[:,GrLivArea_idx] * 3 
            X[:, OverallQual_idx] = X[:, OverallQual_idx] * 3
            
            # quality/condition interaction terms
            overall_quality = X[:, OverallQual_idx] * X[:, OverallCond_idx]
            garage_quality = X[:,GarageQual_idx] * X[:,GarageCond_idx]
            bsmt_quality = X[:,BsmtQual_idx] * X[:,BsmtCond_idx]
            
            #garage_area_per_car = X[:, GarageCars_idx] + X[:,GarageArea_idx]
            
            rooms_above_ground = X[:,TotRmsAbvGrd_idx] + X[:,KitchenAbvGr_idx] + X[:,BedroomAbvGr_idx]
            
            # uses the already re-weighted GrLivArea column
            full_sqft = X[:,GrLivArea_idx] + X[:,FstFlrSF_idx] + X[:,SndFlrSF_idx]
            
            # basement ratios relative to the TotalBsmtSF column
            bsmt_fin = (X[:,BsmtFinSF1_idx] + X[:,BsmtFinSF2_idx]) / X[:,TotalBsmtSF_idx]
            bsmt_unfin = X[:,BsmtUnfSF_idx] / X[:,TotalBsmtSF_idx]
            
            total_bath_grd = X[:,FullBath_idx] + X[:,HalfBath_idx]
            total_bath_bsmt = X[:,BsmtFullBath_idx] + X[:,BsmtHalfBath_idx]
            return np.c_[X,overall_quality,garage_quality,bsmt_quality,rooms_above_ground,full_sqft,
                        bsmt_fin,bsmt_unfin,total_bath_grd,total_bath_bsmt]
        else:
            return X
        
        

In [None]:
attr_adder =  CustomAttribsAdder(trans=True)
X_train_attribs = attr_adder.transform(X_train_prepared.values)


In [None]:
X_valid_attribs = attr_adder.transform(X_valid_prepared.values)
X_test_attribs = attr_adder.transform(X_test_prepared.values)

In [None]:
X_train_prepared = pd.DataFrame(X_train_attribs,
                                columns=list(X_train_prepared.columns)+['overall_quality','garage_quality','bsmt_quality','rooms_above_ground','full_sqft',\
                                                                        'bsmt_fin','bsmt_unfin','total_bath_grd','total_bath_bsmt'],index=X_train_prepared.index)

In [None]:
X_valid_prepared = pd.DataFrame(X_valid_attribs,
                                columns=list(X_valid_prepared.columns)+['overall_quality','garage_quality','bsmt_quality','rooms_above_ground','full_sqft',\
                                                                        'bsmt_fin','bsmt_unfin','total_bath_grd','total_bath_bsmt'],index=X_valid_prepared.index)

In [None]:
X_test_prepared = pd.DataFrame(X_test_attribs,
                                columns=list(X_test_prepared.columns)+['overall_quality','garage_quality','bsmt_quality','rooms_above_ground','full_sqft',\
                                                                        'bsmt_fin','bsmt_unfin','total_bath_grd','total_bath_bsmt'],index=X_test_prepared.index)

In [None]:
# remove the features - try out with various combinations 
drop_list_2 = ['OverallCond','GarageArea','GarageCond','BsmtCond',\
               'BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','1stFlrSF','2ndFlrSF',\
              'TotalBsmtSF','BsmtUnfSF','HalfBath','BsmtHalfBath',\
              'FullBath','BsmtFullBath']

#drop_list_2 = ['YearBuilt', 'OverallCond','GarageCars',\
#              'BedroomAbvGr','KitchenAbvGr','1stFlrSF','2ndFlrSF']

# call the function defined earlier to drop the columns from the dataset
drop_feature(X_train_prepared,drop_list_2)
drop_feature(X_valid_prepared,drop_list_2)
drop_feature(X_test_prepared,drop_list_2)

## Ensemble Model Stacking

The brew package has been deprecated. Still I want to try out stacking multiple models. 

In [None]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict,RepeatedKFold

In order to prevent data leakage during cross validation and later during the grid and randomized search, it is important that the validation fold during the cross validation is as good as an unseen data. Else, the estimate of the score would be too optimistic and not reliable. One simple solution is to pass the estimator in cross validation and grid/randomized search via a pipeline. 

In [None]:
rf_pipe = Pipeline([('RandomForest',RandomForestRegressor())])
gb_pipe = Pipeline([('GradientBoost',GradientBoostingRegressor())])
ada_pipe = Pipeline([('AdaBoost',AdaBoostRegressor())])
svr_pipe = Pipeline([('SupportVector',SVR())])

In [None]:
# collect the candidate models to evaluate, keyed by a short name
def get_models():
    """Return a dict mapping a short label to each pipeline to be compared."""
    return {
        'RF': rf_pipe,
        'GB': gb_pipe,
        'SVR': svr_pipe,
    }

In [None]:
# score one model with repeated k-fold cross validation
def evaluate_model(model,X,y):
    """Cross-validate ``model`` on ``X``/``y`` and return the per-fold scores.

    Uses 10 folds repeated 3 times (seeded with 42) and the
    'neg_mean_squared_error' scorer; any error inside a fold is re-raised.
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_squared_error',
        cv=RepeatedKFold(n_splits=10,n_repeats=3,random_state=42),
        n_jobs=-1,
        error_score='raise',
    )

In [None]:
# compare machine learning models for comparison 
models = get_models()
# evaluate the models and store the results
results,names = list(),list()

for name,model in models.items():
    scores = evaluate_model(model, X_train_prepared, y_train)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))


In [None]:
# get a stacking ensemble of models
def get_stacking():
    """Build an unfitted stacking regressor.

    Level 0 holds a random forest, a gradient boosting and an AdaBoost
    regressor; their 5-fold cross-validated predictions feed a linear
    regression meta learner. Returns the ``StackingRegressor`` instance.
    """
    # define the base models (all seeded, so repeated fits give the same ensemble)
    level0 =list()
    level0.append(('RF',RandomForestRegressor(random_state=42)))
    level0.append(('GB',GradientBoostingRegressor(random_state=42)))
    # seeded like the other base models; it was previously left unseeded
    level0.append(('AdaBoost',AdaBoostRegressor(random_state=42)))
    #define the meta learner model 
    level1=LinearRegression()
    #define the stacking ensemble
    model = StackingRegressor(estimators=level0,final_estimator=level1,cv=5)
    return model
    

In [None]:
stack_model = get_stacking()

In [None]:
stack_model.fit(X_train_prepared,y_train)


In [None]:
predictions_stack_model = stack_model.predict(X_valid_prepared)
print(f'RMSE = {np.sqrt(mean_squared_error(y_valid,predictions_stack_model))}')

The performance of the stacked model is not better than the individual models. Hence, we would stick with the base models and fine tune them.

## Train Regression Models

### Predict using the default hyper-parameters without any cross validation and fine tuning.

In [None]:
rf_reg = RandomForestRegressor(random_state=42) 
rf_reg.fit(X_train_prepared,y_train)
predictions_rf_reg = rf_reg.predict(X_valid_prepared)
print(f'RMSE = {np.sqrt(mean_squared_error(y_valid,predictions_rf_reg))}')

In [None]:
gb_reg = GradientBoostingRegressor(random_state=42)
gb_reg.fit(X_train_prepared,y_train)
predictions_gb_reg = gb_reg.predict(X_valid_prepared)
print(f'RMSE = {np.sqrt(mean_squared_error(y_valid,predictions_gb_reg))}')

In [None]:
ada_reg = AdaBoostRegressor(random_state=42) 
ada_reg.fit(X_train_prepared,y_train)
predictions_ada_reg = ada_reg.predict(X_valid_prepared)
print(f'RMSE = {np.sqrt(mean_squared_error(y_valid,predictions_ada_reg))}')

Lets try out with Support Vector Machine

In [None]:
# import the support vector regressor 
from sklearn.svm import SVR 
np.random.seed(42)
svr_reg = SVR()
svr_reg.fit(X_train_prepared,y_train)
valid_predict_svr_reg = svr_reg.predict(X_valid_prepared)
print(f'RMSE = {np.sqrt(mean_squared_error(y_valid,valid_predict_svr_reg))}')

### Model Fine Tuning

In order to prevent data leakage during cross validation and later during the grid and randomized search, it is important that the validation fold during the cross validation is as good as an unseen data. Else, the estimate of the score would be too optimistic and not reliable. One simple solution is to pass the estimator in cross validation and grid/randomized search via a pipeline. 

In [None]:
rf_pipe = Pipeline([('RandomForest',RandomForestRegressor())])
gb_pipe = Pipeline([('GradientBoost',GradientBoostingRegressor())])
ada_pipe = Pipeline([('AdaBoost',AdaBoostRegressor())])
svr_pipe = Pipeline([('SupportVector',SVR())])

In [None]:
# Cross Validation
from sklearn.model_selection import cross_val_score, cross_val_predict

forest_scores = cross_val_score(rf_pipe,X_train_prepared,y_train,
                               scoring='neg_mean_squared_error',cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
print('Cross Validation Random Forest RMSE = {}'.format(forest_rmse_scores))
print (f'Mean is {np.mean(forest_rmse_scores)} Std. Deviation = {np.std(forest_rmse_scores)}')

In [None]:
# 10-fold cross validation of the gradient boosting pipeline on the training set.
gb_scores = cross_val_score(
    estimator=gb_pipe,
    X=X_train_prepared,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

# The scorer reports negated MSE, so flip the sign before taking the root.
gb_rmse_scores = np.sqrt(np.negative(gb_scores))

print('Cross Validation Gradient Boosting RMSE = {}'.format(gb_rmse_scores))
print(f'Mean is {np.mean(gb_rmse_scores)} Std. Deviation = {np.std(gb_rmse_scores)}')

In [None]:
# 10-fold cross validation of the support vector regression pipeline on the training set.
svm_scores = cross_val_score(
    estimator=svr_pipe,
    X=X_train_prepared,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

# The scorer reports negated MSE; from here on svm_scores holds the per-fold RMSE.
svm_scores = np.sqrt(np.negative(svm_scores))

print('Cross Validation Support Vector Regression RMSE = {}'.format(svm_scores))
print(f'Mean is {np.mean(svm_scores)} Std. Deviation = {np.std(svm_scores)}')


In [None]:
# 10-fold cross validation of the AdaBoost pipeline on the training set.
ada_scores = cross_val_score(
    estimator=ada_pipe,
    X=X_train_prepared,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

# The scorer reports negated MSE; from here on ada_scores holds the per-fold RMSE.
ada_scores = np.sqrt(np.negative(ada_scores))

print('Cross Validation AdaBoost Regression RMSE = {}'.format(ada_scores))
print(f'Mean is {np.mean(ada_scores)} Std. Deviation = {np.std(ada_scores)}')

### Hyper parameter tuning

First we will try the Grid search for both Random Forest and Gradient Boosting

In [None]:
# Import Grid Search
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive grid search over 3 x 4 x 3 = 36 gradient boosting configurations,
# each scored with 5-fold cross validation on negated MSE.
# random_state is fixed (42, matching the random forest grid search) because
# max_features < n_features makes the split search random; without a seed the
# selected "best" configuration could change from one run to the next.
gb_reg = GradientBoostingRegressor(random_state=42)
gb_param_grid = [{'n_estimators':[100,200,300],'max_features':[8,16,32,64],
                 'max_depth':[3,5,7]}]

# return_train_score=True also records the training-fold scores in cv_results_.
grid_search_gb = GridSearchCV(gb_reg,gb_param_grid,cv=5,scoring='neg_mean_squared_error',
                              return_train_score=True)
grid_search_gb.fit(X_train_prepared,y_train)

In [None]:
# Take an unfitted copy of the best gradient boosting configuration found by the
# grid search and train it on the full training set.
gb_best_reg = sklearn.base.clone(grid_search_gb.best_estimator_).fit(
    X_train_prepared, y_train
)

# Evaluate the tuned model on the held-out validation split.
valid_pred_gb = gb_best_reg.predict(X_valid_prepared)
print('RMSE={}'.format(np.sqrt(mean_squared_error(y_true=y_valid, y_pred=valid_pred_gb))))

In [None]:
# Seeded random forest that the grid search clones for every candidate.
rf_reg = RandomForestRegressor(random_state=42)

rf_param_grid = [
    # first grid: 3 x 4 = 12 combinations with the default bootstrap setting
    dict(n_estimators=[100, 200, 300], max_features=[8, 16, 32, 64]),
    # second grid: 2 x 3 = 6 combinations with bootstrapping switched off
    dict(bootstrap=[False], n_estimators=[100, 200], max_features=[2, 3, 4]),
]

# 5-fold cross validation per candidate, scored on negated MSE; training-fold
# scores are recorded as well.
grid_search_rf = GridSearchCV(
    estimator=rf_reg,
    param_grid=rf_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search_rf.fit(X_train_prepared, y_train)

In [None]:
# Show the best random forest configuration found by the grid search
# (the bare expression is rendered as the cell's output).
grid_search_rf.best_estimator_

In [None]:
# List the cross-validated RMSE next to every hyperparameter combination tried.
cvres = grid_search_rf.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    # mean_test_score is a negated MSE, so negate it before taking the root
    print(np.sqrt(np.negative(mean_score)), params)

In [None]:
# Take an unfitted copy of the grid search's best random forest and train it
# on the full training set.
rf_reg_grid = sklearn.base.clone(grid_search_rf.best_estimator_).fit(
    X_train_prepared, y_train
)

# Validation RMSE of the tuned random forest.
pred_valid_rf_grid = rf_reg_grid.predict(X_valid_prepared)
print(f'RMSE = {np.sqrt(mean_squared_error(y_valid,pred_valid_rf_grid))}')

In [None]:
# Importance of every input feature according to the tuned random forest.
feature_importance = rf_reg_grid.feature_importances_

# Plot the importances with the feature names on the x-axis so that peaks can be
# read off directly; the figure also gets a title and axis labels so it stands alone.
fig, ax = plt.subplots()
ax.plot(feature_importance)
ax.set_xticks(range(len(feature_importance)))
# Small, rotated tick labels: there is one label per feature column.
ax.set_xticklabels(X_train_prepared.columns, rotation=90, fontsize=6)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Random forest feature importances')
fig.tight_layout();

In [None]:
# Show the names of the feature columns at positions 65-69.
# NOTE(review): presumably the region that stands out in the importance plot
# above - confirm, since the indices are hard-coded.
X_train_prepared.columns[65:70]

### Principal Component Analysis

Dimensionality can be a curse. I tried reducing it, but only to the extent that the retained components still explain at least 95% of the variance in the data. This way I need not keep experimenting with arbitrary choices of the `n_components` hyperparameter in PCA.

In [None]:
# import the pca package
from sklearn.decomposition import PCA
# Fit a PCA that keeps every component, only to inspect the variance spectrum.
pca = PCA().fit(X_train_prepared)

# Running total of the explained-variance ratios, component by component.
cumsum = pca.explained_variance_ratio_.cumsum()

# argmax returns the first index where the total reaches 95%;
# +1 turns that zero-based index into a component count.
d = 1 + np.argmax(cumsum >= 0.95)
print(f'{d}')

In [None]:
# Re-fit PCA with the component count chosen above. The projection is learned
# from the training data only and then applied unchanged to the other splits.
pca = PCA(n_components=d)
X_train_reduced = pca.fit_transform(X_train_prepared)
X_valid_reduced, X_test_reduced = (
    pca.transform(split) for split in (X_valid_prepared, X_test_prepared)
)

In [None]:
# Seeded default random forest trained on the PCA-reduced training features.
rf_pca = RandomForestRegressor(random_state=42).fit(X_train_reduced, y_train)

# Validation RMSE in the reduced feature space.
valid_pred = rf_pca.predict(X_valid_reduced)
print(f'RMSE on validation data = {np.sqrt(mean_squared_error(y_valid,valid_pred))}')

Reducing the dataset to its principal components does not seem to improve the model's performance on the validation dataset. The number of PCA components could also be tuned with grid search; however, I leave that option out here.

### Randomized Grid Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Integer sampling ranges for the randomized search. scipy's randint excludes
# the upper bound, so the sampled values are 100-299, 16-66 and 3-5.
param_distribs = dict(
    n_estimators=randint(100, 300),
    max_features=randint(16, 67),
    max_depth=randint(3, 6),
)

In [None]:
# Seeded random forest that the randomized search clones for every candidate.
rf_reg_rand = RandomForestRegressor(random_state=42)

# Draw 10 hyperparameter settings from param_distribs and score each with
# 10-fold cross validation on negated MSE; the sampling itself is seeded too.
rand_search = RandomizedSearchCV(
    estimator=rf_reg_rand,
    param_distributions=param_distribs,
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=42,
)



In [None]:
# Run the randomized search on the prepared training data.
rand_search.fit(X=X_train_prepared, y=y_train)

In [None]:
# Show the best random forest configuration found by the randomized search
# (the bare expression is rendered as the cell's output).
rand_search.best_estimator_

In [None]:
# Copy the best estimator from the randomized search; sklearn.base.clone yields
# a new, unfitted estimator with the same hyperparameters.
rf_reg_rand_best = sklearn.base.clone(rand_search.best_estimator_)

In [None]:
# Train the copied best estimator on the full training set, then predict the
# validation split (fit() returns the estimator, so the calls can be chained).
valid_pred = rf_reg_rand_best.fit(X_train_prepared, y_train).predict(X_valid_prepared)

In [None]:
# Validation RMSE for whatever the most recently run cell stored in valid_pred.
# NOTE(review): the PCA section assigns the same name earlier, so this reports the
# randomized-search model only when the cells are executed in order.
print(f'RMSE = {np.sqrt(mean_squared_error(y_valid,valid_pred))}')

## Submission

In [None]:
# Predict the test set with the random forest tuned by grid search.
final_pred = rf_reg_grid.predict(X_test_prepared)

# Map the predictions back to prices.
# NOTE(review): presumably the target was transformed with np.log earlier in the
# notebook - confirm it was not np.log1p, which would need np.expm1 here.
price_arr = np.exp(final_pred)

# Pair every prediction with its saved Id and write the submission file.
output = pd.DataFrame(
    {
        "Id": save_id['Id'],
        "SalePrice": price_arr,
    }
)
output.to_csv(path_or_buf='submission_rf_001.csv', index=False)