In [1]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn

# Feature Engineering and Feature Transformation modules
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler 

# Feature Transformation modules 
from scipy.stats import skew, boxcox_normmax
from scipy.special import boxcox1p

# Feature Selection and model selection modules 
from sklearn.feature_selection import RFE, VarianceThreshold, f_regression, SelectFromModel 
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.feature_selection import f_regression, SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, StratifiedKFold, cross_val_score


# Module of ML models 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression, LassoCV, RidgeCV, ElasticNetCV 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC, SVR 
from xgboost import XGBRFRegressor, XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor 


# Layout Design 
sns.set_context("paper", font_scale = 1, rc={"grid.linewidth": 3})
pd.set_option('display.max_rows', 100, 'display.max_columns', 400)

# Evaluation Metric modules
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score

# Pipeline modules 
# First Inherit the base classes
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer 
from sklearn.pipeline import make_union, FeatureUnion
import pickle
import joblib

In [2]:
# Seed every random number generator used in this notebook so that results are reproducible.
# A single seed value is shared by all generators below (and reused later as `random_state`).
seed_value= 12321 

# 1. Export `PYTHONHASHSEED` with the seed value.
# NOTE(review): Python normally reads this variable at interpreter start-up, so setting it
# here presumably only affects child processes, not the running kernel -- confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Seed Python's built-in pseudo-random generator
import random
random.seed(seed_value)

# 3. Seed NumPy's legacy global pseudo-random generator
import numpy as np
np.random.seed(seed_value)

# 4. Set `tensorflow` pseudo-random generator at a fixed value
# import tensorflow as tf
# tf.set_random_seed(seed_value)

# 5. For layers that introduce randomness like dropout, make sure to set seed values 
# model.add(Dropout(0.25, seed=seed_value))

# 6. Configure a new global `tensorflow` session
# from keras import backend as K
# session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
# K.set_session(sess)

# Elementary Statistical Analysis

In [3]:
# Reading the dataset
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
df_origin = pd.read_csv('../input/additional-dataset-for-training/AmesHousing.csv')
df_train.head()

In [4]:
# Identify if there exists an extra column in the original data which isn't present in provided train data
print(len(df_origin))
print()
extra_cols = [col for col in df_origin.columns if col not in df_train.columns]
print(extra_cols)

**Observations**:
1. The column names in the original data differ from those in the provided train data.
2. There is an extra feature named 'PID', the parcel identification number, which can be used with the city website for parcel review. Since it is only an identifier, we will drop it.

In [5]:
# Droppping the 'PID' feature 
df_origin.drop(['PID'], axis=1, inplace=True)

In [9]:
# Use the original Ames data as the new training data.
# Work on a copy: a plain assignment would only alias `df_origin`, so every later
# in-place operation on the training frame would silently mutate `df_origin` too.
df_combined_train = df_origin.copy()

# Strip the spaces from the column names (e.g. 'Lot Frontage' -> 'LotFrontage')
df_combined_train.columns = df_combined_train.columns.str.replace(' ', '')

# 'Order' appears to be a per-row identifier (it is dropped together with the test
# 'Id' column further down), so it is left out of the duplicate check; with it
# included no two rows could ever compare equal.
dedupe_cols = [col for col in df_combined_train.columns if col != 'Order']
df_combined_train = df_combined_train.drop_duplicates(subset=dedupe_cols)
print(df_combined_train.shape)

In [12]:
# Inspecting the train data
df_combined_train.head()

In [13]:
# Inspecting the test dataset
df_test.head()

In [14]:
# Drop the row-identifier columns: 'Order' from the training data and 'Id' from the test data.
# errors='ignore' keeps the cell re-runnable: a second run finds the columns already gone
# instead of raising KeyError.
df_combined_train.drop(['Order'], axis=1, inplace=True, errors='ignore')
df_test.drop(['Id'], axis=1, inplace=True, errors='ignore')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df_combined_train.info()

In [15]:
# Keep only the first row for every index label, in both the training and the test frame.
# NOTE(review): `index.duplicated()` inspects index labels only, not row contents, so this
# does not remove rows with duplicated values; no dtype conversion happens in this cell either.
df_combined_train = df_combined_train.loc[~df_combined_train.index.duplicated(), :]
df_test = df_test.loc[~df_test.index.duplicated(), :]

# Author's conclusion (from the info() output above): no column has a dubious or incorrect data type

In [16]:
print(df_combined_train.shape)
print(df_test.shape)

In [17]:
# Getting insights of the features and outliers
df_combined_train.describe([0.25,0.50,0.75,0.99])

In [18]:
# Split the training column names by dtype: `object` columns are treated as
# categorical, everything else as numerical.
numerical_cols = [name for name, kind in df_combined_train.dtypes.items() if kind != 'object']
categorical_cols = [name for name, kind in df_combined_train.dtypes.items() if kind == 'object']

In [19]:
# Checking percentage of null values present in training dataset 
missing_num = df_combined_train.isna().sum().sort_values(ascending=False)
missing_perc = (df_combined_train.isna().sum()/len(df_combined_train)*100).sort_values(ascending=False)
missing_perc

This EDA is done on the training set separately, for learning purposes only. Otherwise you should always combine the train and test sets first and only then perform EDA, for easier analysis. That is done later anyway.

In [20]:
# Calculating percentage of null values
def null_values(dataframe):
    """Summarise the missing values of every column of `dataframe`.

    Returns a pair of Series indexed by column name, each sorted in
    descending order: the NaN count per column, and that count expressed
    as a percentage of the number of rows.
    """
    null_counts = dataframe.isna().sum()
    missing_values = null_counts.sort_values(ascending=False)
    missing_perc = ((null_counts / len(dataframe)) * 100).sort_values(ascending=False)
    return missing_values, missing_perc

#Passing in the training and test datasets to calculate the percentage of missing values in training and test data
null_sum_train, null_perc_train = null_values(df_combined_train)
null_sum_test, null_perc_test = null_values(df_test)

In [21]:
null_sum_train

In [22]:
null_perc_train

In [23]:
miss_train_sum_perc = pd.concat([null_sum_train, null_perc_train], axis=1, keys=['Sum', 'Percentage'])
miss_test_sum_perc = pd.concat([null_sum_test, null_perc_test], axis=1, keys=['Sum', 'Percentage'])

In [24]:
miss_train_plot = miss_train_sum_perc[miss_train_sum_perc['Percentage']>0]
miss_train_plot

In [25]:
miss_test_plot = miss_test_sum_perc[miss_test_sum_perc['Percentage']>0]
miss_test_plot

**19 attributes have missing values and 5 features( PoolQC,MiscFeature,Alley,Fence,FireplaceQu) have missing percentage greater than 45%**

In [26]:
# Printing the numerical dataframe
df_numerical_train = df_combined_train.select_dtypes(include=['int64','float64'])
df_numerical_train.head()

In [27]:
# Printing the categorical dataframe
df_categorical_train = df_combined_train.select_dtypes(exclude=['int64','float64'])
df_categorical_train.head()

In [29]:
# Number of distinct values in each numerical column, printed as "<name><dashes><count>"
for col, n_unique in df_numerical_train.nunique().items():
    label = str(col)
    print("".join([label, "-" * len(label), str(n_unique)]))

In [30]:
# Number of distinct values in each categorical column, printed as "<name><dashes><count>"
for col, n_unique in df_categorical_train.nunique().items():
    label = str(col)
    print("".join([label, "-" * len(label), str(n_unique)]))

## Observations:
1. There are 80 columns in this dataset, so some feature selection will be needed later on.
2. Several columns (numerical and categorical) have only a few unique values compared to the overall count of training instances.
3. The test dataset is almost the same size as the training dataset. Do I have to append more training instances?

# Data Cleaning + Visualization 

In [31]:
# Important 
def showvalues(ax,m=None):
    """Write each bar's height (one decimal place) just above the bar on `ax`.

    `ax` must expose `patches` and `annotate`; every patch must provide
    `get_x`, `get_width` and `get_height`. `m` is accepted but not used.
    """
    label_style = dict(
        ha='center', va='center', fontsize=14, color='k', rotation=0,
        xytext=(0, 7), textcoords='offset points', fontweight='light', alpha=0.9,
    )
    for bar in ax.patches:
        bar_height = bar.get_height()
        # Anchor the label at the horizontal centre of the bar's top edge
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate("%.1f" % bar_height, (bar_centre, bar_height), **label_style)

In [32]:
# Two stacked bar charts of the percentage of missing values per column:
# training data on top, test data below. Only columns with at least one
# missing value are in miss_train_plot / miss_test_plot.
plt.figure(figsize=(20, 20))
plt.subplot(2, 1, 1)
ax1 = sns.barplot(x=miss_train_plot.index, y='Percentage', data=miss_train_plot)
# Print the percentage above every bar
showvalues(ax1)
# Slant the column names so that they do not overlap
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, horizontalalignment='right')

plt.subplot(2, 1, 2)
ax2 = sns.barplot(x=miss_test_plot.index, y='Percentage', data=miss_test_plot)
showvalues(ax2)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, horizontalalignment='right')

plt.show()

In [33]:
len(df_combined_train.select_dtypes(include=['int64','float64']).columns)

In [34]:
# Scatter every numerical predictor against the target variable; the plots also
# give a quick impression of each feature's own spread.
plot_features = list(df_numerical_train.columns)
n_plot_cols = 3
# Ceiling division: one axis per feature, so no feature is silently dropped
# when there are more than 36 numerical columns.
n_plot_rows = max(1, -(-len(plot_features) // n_plot_cols))
# Height scales with the row count (80 inches for the original 12 rows);
# squeeze=False guarantees a 2-D array of axes even for a single row.
fig, axs = plt.subplots(n_plot_rows, n_plot_cols, figsize=(20, 80 * n_plot_rows / 12), squeeze=False)

# hspace is the vertical gap between subplot rows
fig.subplots_adjust(hspace=0.6)

# Flatten the [rows, cols] grid of axes into a vector so it can be zipped with the features
flat_axes = axs.flatten()
for feature, ax in zip(plot_features, flat_axes):
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', data=df_numerical_train, ax=ax, palette='viridis_r')
    # Label this subplot explicitly: plt.xlabel / plt.ylabel act on the *current*
    # axes, which is not the axis being drawn on inside this loop.
    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel('SalePrice', fontsize=12)
    ax.set_title('SalePrice'+' - '+str(feature), fontweight='bold', size=20)

# Hide the unused axes in the last row, if any
for ax in flat_axes[len(plot_features):]:
    ax.set_visible(False)

**Observations**:
1. We can observe that the house prices rise steadily with increase in rating quality 
2. 

**Justifications for not using KNN**
1. The KNN algorithm works best for classification, where feature values and their corresponding labels form clearly separated clusters.
2. The number of unique target values is large and the number of training samples is comparatively small, which makes it too difficult for the algorithm to separate the target labels and interpolate them on the test data.

**Justification for not using SVR and Kernel SVR**
1. SVR is best utilized for estimating values where the  no. of dimensions >= no. of training samples. In short, it is favoured for smaller datasets
2. The decision boundary can't be estimated easily due to the lack of a particular progression pattern between the feature and the target variable

In [41]:
# Visualizing categorical predictors with target variable
def facetgrid_boxplot(x, y, **kwargs):
    """Draw a box plot of `y` grouped by `x` on the current axes, with vertical tick labels.

    Written to be passed to `FacetGrid.map`; any extra keyword arguments
    are accepted and ignored.
    """
    sns.boxplot(x=x, y=y)
    # Rotate the category labels by 90 degrees so that long names stay readable
    plt.xticks(rotation=90)

# pd.melt is a useful function. You have written its functionality in your notebook.
f = pd.melt(df_combined_train, id_vars=['SalePrice'], value_vars=sorted(df_categorical_train.columns))
g = sns.FacetGrid(f, col='variable', col_wrap=3, sharex=False, sharey=False)

# Mapping onto the function where it will plot the boxplot
g = g.map(facetgrid_boxplot, 'value', 'SalePrice')

In [42]:
# Visualize the distribution of dataframe 'f'
f.head(10)

***SalePrice isn't normally distributed. It has a right skewed distribution.***

In [43]:
# Distribution of Target variable (SalePrice)
plt.figure(figsize=(8,6))
sns.distplot(df_combined_train['SalePrice'],hist_kws={"edgecolor": (1,0,0,1)})

In [44]:
# Skew and kurtosis for SalePrice 
print("Skewness: %f" % df_combined_train['SalePrice'].skew())
print("Kurtosis: %f" % df_combined_train['SalePrice'].kurt())

# Hence, the skewness AND kurtosis have naturally declined by a bit on increasing the number of training instances

In [67]:
# Apply a log transformation to reduce the skewness of the target variable.
# np.log1p computes the natural log of (1 + x).
# The transformations are applied to copies of the training AND test datasets,
# leaving df_combined_train and df_test untouched.
df_train_copy = df_combined_train.copy()
# NOTE(review): `y` is taken from df_combined_train, so it holds the untransformed
# SalePrice, not the log-transformed column created below -- confirm this is intended.
y = df_combined_train['SalePrice']
df_test_copy = df_test.copy()

# Don't just copy the column names as the order is different 
# df_test_copy.columns = df_train_copy.columns

# Only the copy receives the log-transformed target
df_train_copy['SalePrice'] = np.log1p(df_combined_train['SalePrice'])

In [None]:
# Reset the index for test data
# df_test_copy.set_index('Id', inplace=True)
# df_test_copy.index.name = None

In [46]:
# Distribution of the target variable (SalePrice) again, to see whether the skewness has decreased
plt.figure(figsize=(8,6))

# The log-transformed values live in df_train_copy; df_combined_train still holds the
# raw prices, so plotting it would just reproduce the earlier, skewed figure.
# hist_kws sets the edge colour of the histogram bins.
sns.distplot(df_train_copy['SalePrice'], hist_kws={"edgecolor": (1,0,0,1)})

In [47]:
# Again calculate the skewness and kurtosis
print("Skewness: " + str(df_train_copy['SalePrice'].skew()))
print("Kurtosis: " + str(df_train_copy['SalePrice'].kurtosis()))

***As we can observe, the skewness and kurtosis values have been minimized by the requisite transformations, and the distribution is almost normal.***

In [48]:
# Pearson correlation heatmap of the numerical features.
# The masking recipe follows the seaborn API docs.
corr = df_numerical_train.corr()

# Boolean mask that hides the upper triangle (diagonal included): the matrix is
# symmetric, so those cells only repeat the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    # One figure only (no separate plt.figure call, which left an empty figure behind).
    # It is named fig_corr rather than `f` so that the melted DataFrame `f` used by the
    # categorical box plots is not overwritten.
    fig_corr, ax = plt.subplots(figsize=(20, 20))
    ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True, cmap='RdPu')
Hence we can see there is a high degree of multicollinearity in the dataset. We can either drop them by performing certain feature selection techniques or let some regularization ML techniques(such as Lasso and Ridge regression), do the needful.

**We will perform some advanced Feature Engineering techniques such as(if required):**
1. Outlier Detection and removal(for extremely skewed distributions of features)
2. Missing Value Imputation
3. Scaling and Normalization(if required)





In [49]:
# Visualization of categorical variables
# cat_cols = [col for col in df_train_copy.columns if df_train_copy[col].dtype=='object']
# df_cat_subset = df_train_copy[cat_cols]
df_categorical_train.head()

In [None]:
# Counting VALUE COUNTS for categorical columns
for col in df_categorical_train.columns:
    print(str(col) + "-"*len(str(col)) + str(df_categorical_train[col].value_counts()))
    print()

In [None]:
# Drop 'Utilities' and 'Pave' columns as it has extremely low cardinality, and possibly, the model cannot learn much from it 
# df_train_copy.drop(['Street', 'Utilities'], axis=1, inplace=True)
# We will decide upon the other features later 

In [None]:
# Numerical columns 
# For the time being, as the percentage of missing values is extremely large for four features, namely ['PoolQc', 'MiscFeature', 'Alley' AND 'Fence'], we would drop them from training and test datasets
# If appropriate information regarding these columns arrives, we would impute them accordingly 
# drop_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
# df_train_copy.drop(drop_features, axis=1, inplace=True)
# df_test_copy.drop(drop_features, axis=1, inplace=True)

In [68]:
# Let's separate out the target variable 
df_train_copy = df_train_copy.drop(['SalePrice'], axis = 1)

In [None]:
# quantile75 = df_train_copy['GrLivArea'].quantile(0.75)
# quantile25 = df_train_copy['GrLivArea'].quantile(0.25)
# IQR = quantile75 - quantile25
# IQR
# print(IQR)
# print()

# fig, (ax1, ax2) = plt.subplots(2, 1)
# plt.subplots_adjust(top=1.5, bottom=0.5, wspace=1)
# fig.figsize=[50, 15]

# Plot the BOXPLOT for this feature 
# df_train_copy['GrLivArea'].plot(kind='box', ax=ax1) 


# Plot the KDEplot for this feature
# df_train_copy['GrLivArea'].plot(kind='kde', ax=ax2)
# ax2.set_xlabel('GrLivArea')
# print()

# Print Skewness
# print(df_train_copy['GrLivArea'].skew())

In [None]:
# Count the number of values in GrLivArea > 3000
# print(df_train_copy[df_train_copy['GrLivArea']>3000].shape)

# Since there are only 18 of them, let's cap them altogether with 3000

In [None]:
# Here we would cap the outliers above , as they contribute to the data skewness
# extreme_upper = df_train_copy['GrLivArea'].quantile(0.75) + 3 * IQR
# df_train_copy['GrLivArea'] = np.where(df_train_copy['GrLivArea'] > extreme_upper, extreme_upper, df_train_copy['GrLivArea'])
# df_train_copy['GrLivArea'] = np.where(df_train_copy['GrLivArea']>3000, 3000, df_train_copy['GrLivArea'])

# Again plotting KDE plot 
# df_train_copy['GrLivArea'].plot(kind='kde')
# print(df_train_copy['GrLivArea'].skew())

***Hence we have created an approximate normal distribution***

In [None]:
# Let's replace the outliers in upper part of feature set with 75th quantile values
# df_train_copy.loc[df_train_copy['GrLivArea']>=4500, 'GrLivArea'] = quantile25
# Or you can also choose to drop these outlier values from dataframe


In [None]:
# df_train_copy['GrLivArea'].plot.hist()

In [None]:
# df_train_copy['age']=df_train_copy['YrSold']-df_train_copy['YearBuilt']

# See why its been done like this
# Some of the non-numeric predictors are stored as numbers; convert them into strings will convert those columns into dummy variables later.
# df_train_copy['MSSubClass'] = df_train_copy['MSSubClass'].astype(str) 
# df_train_copy['YrSold'] = df_train_copy['YrSold'].astype(str) #year
# df_train_copy['MoSold'] = df_train_copy['MoSold'].astype(str) #month

In [70]:
# LotFrontage is the linear feet of street connected to the property (per the data
# description), so 0 is not a sensible fill value; try the median first.
# Each frame is imputed with its own median: since the test data is available, the
# test column uses the test median rather than the training one.
for frame in (df_train_copy, df_test_copy):
    lot_frontage = frame['LotFrontage']
    # Series.median skips NaN by default, so no explicit dropna is needed
    frame['LotFrontage_median'] = lot_frontage.fillna(lot_frontage.median())

# Let's study the KDE distribution of this feature 
# df_train_copy['LotFrontage_median'].plot(kind='kde')

# Printing the skewness and krutosis of this distribution 
# print(df_train_copy['LotFrontage_median'].skew())
# print(df_train_copy['LotFrontage_median'].kurtosis())

In [71]:
def fill_with_random_sample(frame, source_col, target_col, seed):
    """Copy `source_col` into `target_col`, replacing every NaN with a randomly drawn observed value.

    The values are drawn with `Series.sample(..., random_state=seed)`, i.e. without
    replacement, so pandas raises ValueError when the column has more missing than
    observed entries. `frame` is modified in place and nothing is returned.
    """
    is_missing = frame[source_col].isnull()
    frame[target_col] = frame[source_col]
    draws = frame[source_col].dropna().sample(is_missing.sum(), random_state=seed)
    # Re-label the draws with the row labels of the gaps so that the assignment
    # below aligns each drawn value with exactly one missing row.
    draws.index = frame.index[is_missing.to_numpy()]
    frame.loc[is_missing, target_col] = draws


# Random-sample imputation of LotFrontage, applied the same way (and with the same seed)
# to the training and the test data
fill_with_random_sample(df_train_copy, 'LotFrontage', 'LotFrontage_fill', seed_value)
fill_with_random_sample(df_test_copy, 'LotFrontage', 'LotFrontage_fill', seed_value)

# Let's study the KDE plot for this variable 
df_train_copy['LotFrontage_fill'].round(0).plot(kind='kde')

# Again study kurtosis
print(df_train_copy['LotFrontage_fill'].round(0).kurtosis())

In [52]:
# Studying GarageYr. Belt
df_train_copy['GarageYrBlt'].plot(kind='kde')

# Missing Value Imputation + Encoding Categorical Variables 

In [72]:
# Recompute the numerical / categorical column lists for the current training copy:
# every non-`object` column is numerical, the remaining ones are categorical.
num_cols = df_train_copy.dtypes[df_train_copy.dtypes != 'object'].index.tolist()
cat_cols = [name for name in df_train_copy.columns if name not in num_cols]

In [73]:
# Missing-value imputation, applied identically to the training and the test copy.
# Every fill uses either a constant or a statistic of the frame that is being filled.

# Column -> constant that replaces NaN
constant_fills = {
    # Functional: home functionality (assume typical unless deductions are warranted)
    'Functional': 'Typ',
    # Filled with 'SBrkr', which the original author notes is the mode
    'Electrical': 'SBrkr',
    # The data description states that NA refers to "No Pool"
    'PoolQC': 'Missing',
    # No garage, hence no garage construction year
    'GarageYrBlt': 0,
    # 'TA' means average kitchen quality
    'KitchenQual': 'TA',
    # NOTE(review): the *_median / *_fill variants of LotFrontage are built in earlier
    # cells; the raw column itself is zero-filled here
    'LotFrontage': 0,
    'FireplaceQu': 'NA',
    'MasVnrType': 'None',
    'MasVnrArea': 0,
    'Utilities': 'Missing',
}
# NaN means there is no garage
constant_fills.update(dict.fromkeys(
    ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'], 'Missing'))
# NaN means there is no basement
constant_fills.update(dict.fromkeys(
    ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'], 'Missing'))
constant_fills.update(dict.fromkeys(
    ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'], 0))

# Columns filled with their own most frequent value.
# ('Exterior2nd' used to receive a second, 'Missing' fill afterwards; that could never
# apply once the mode fill had run, so it has been removed.)
mode_fills = ['Exterior1st', 'Exterior2nd', 'SaleType']

# Columns removed from both frames
dropped_cols = ['Alley', 'Fence', 'MiscFeature']


def fill_with_group_mode(values, fallback):
    """Fill the NaN entries of `values` with its mode, or with `fallback` when it has no mode.

    A group that consists only of NaN has an empty mode; indexing it with [0]
    would raise IndexError, hence the fallback.
    """
    modes = values.mode()
    return values.fillna(modes[0] if len(modes) else fallback)


for frame in (df_train_copy, df_test_copy):
    for col, fill_value in constant_fills.items():
        frame[col] = frame[col].fillna(fill_value)

    for col in mode_fills:
        frame[col] = frame[col].fillna(frame[col].mode()[0])

    # MSZoning: use the most frequent zoning among rows sharing the same MSSubClass,
    # falling back to the frame-wide mode for subclasses without any known zoning
    overall_zoning = frame['MSZoning'].mode()[0]
    frame['MSZoning'] = frame.groupby('MSSubClass')['MSZoning'].transform(
        lambda zoning: fill_with_group_mode(zoning, overall_zoning))

    frame.drop(dropped_cols, axis=1, inplace=True)

In [74]:
# Finally, impute the remaining missing values in these numeric columns with 0.
# Each column is filled from *itself*, in both frames: previously the training
# 'GarageArea' was overwritten with the 'GarageCars' values, and 'TotalBsmtSF' was
# filled twice on the training frame but never on the test frame in this cell.
zero_fill_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea']
for col in zero_fill_cols:
    df_train_copy[col] = df_train_copy[col].fillna(0)
    df_test_copy[col] = df_test_copy[col].fillna(0)

***Imputing Categorical Features***

In [75]:
# Binary indicator columns for zero-heavy numeric features. They can be added to the
# model directly or combined with other features for feature aggregation.
# NOTE: despite the '_impute' suffix no value is imputed here, and the polarity is
# not uniform: some flags mark "value is 0", others mark "value is greater than 0".
# (source column, True -> flag is 1 when the value == 0; False -> flag is 1 when the value > 0)
# The list order is the order in which the new columns are appended.
indicator_specs = [
    ('PoolArea', True),
    ('MiscVal', True),
    ('ScreenPorch', True),
    ('3SsnPorch', True),
    ('EnclosedPorch', True),
    ('WoodDeckSF', False),
    ('OpenPorchSF', False),
    ('HalfBath', True),
    # 'Fireplaces' used to be assigned twice; only the second ("> 0") assignment
    # ever took effect, so that is the one kept.
    ('Fireplaces', False),
    ('2ndFlrSF', True),
    ('LowQualFinSF', True),
]

# Follow the same procedure for the training and the test columns
for frame in (df_train_copy, df_test_copy):
    for col, flag_when_zero in indicator_specs:
        is_flagged = (frame[col] == 0) if flag_when_zero else (frame[col] > 0)
        # NaN compares False either way and therefore yields 0
        frame[col + '_impute'] = is_flagged.astype('int64')

***BE AWARE***
1. The columns have been encoded based on the count of specific values in the dataset. This is a very simple way of handling missing values: it gives importance to the 'Missing' value if it has a significant count, while the remaining values are given equal weightage and encoded as '1'.
2. Here the values are missing not at random (MNAR).


In [76]:
# Numerical feature names of the training copy: every non-`object` column except 'SalePrice'
num_cols_1 = [col for col in df_train_copy.columns if df_train_copy[col].dtype!='object' and col!='SalePrice']
# Categorical feature names: every remaining column except 'SalePrice'.
# This is computed while 'YearRemod/Add' is still in num_cols_1, so that column
# ends up in neither list.
cat_cols_1 = [col for col in df_train_copy.columns if col not in num_cols_1 and col!='SalePrice']
# list.remove raises ValueError if 'YearRemod/Add' is not among the numerical columns
num_cols_1.remove('YearRemod/Add')

In [None]:
# We have already removed skewness from target variable, we need to check out the skewness among various features too
# df_train_copy_num = df_train_copy.select_dtypes(['int64', 'float64'])
# skew_features = df_train_copy_num.apply(lambda x : x.skew()).sort_values(ascending=False)

# skew_high = skew_features[skew_features > 0.6] 
# This command returns a series

# high_indices = skew_high.index
# This returns a list of indices

In [None]:
# Refer this for boxplot(numerical value distribution)
# fig, ax = plt.subplots(figsize=(8, 7))
# ax.set_xscale("log")
# ax = sns.boxplot(data=df_train_copy_num , orient="h", palette="Set1")
# ax.xaxis.grid(False)
# ax.set(ylabel="FeatureNames")
# ax.set(xlabel="Numeric values")
# ax.set(title="Numeric Distribution of Features")
# sns.despine(trim=True, left=True)

In [None]:
# Normalize skewed features using a box-cox normal distribution, we can surely use other techniques but it works very well on this dataset
# Check out for other techniques used to normalize skewed features(sum of them being)
# People usually use box-cox and StandardScaler for removing skewed data
# for i in high_indices:
      # What's this 1.002 used for?
#     df_train_copy[i] = boxcox1p(df_train_copy[i], boxcox_normmax(df_train_copy[i] + 1.002))

In [None]:
# Creating more features by log transformation
# Refer this for boxplot(numerical value distribution)
# fig, ax = plt.subplots(figsize=(8, 7))
# ax.set_xscale("log")
# ax = sns.boxplot(data=df_train_copy_num[high_indices] , orient="h", palette="Set1")
# ax.xaxis.grid(False)
# ax.set(ylabel="Feature Names")
# ax.set(xlabel="Numeric values")
# ax.set(title="Numeric Distribution of Features")
# sns.despine(trim=True, left=True)

In [None]:
# from scipy import stats
# high_indices_list = list(high_indices)

In [None]:
# nrows = 11
# ncols = 2

# fig, axes = plt.subplots(nrows, ncols, figsize=(20, 15))

# #Initializing lazy counter
# count = 0

# for i in range(nrows):
#   for j in range(ncols):
#       ax = axes[i, j]

#       if count < len(high_indices.tolist()):
#         ax.plot(df_train_copy[high_indices_list[count]])
#         ax.set(xlabel="Feature Names")
#         ax.title("Skewness distribution of a feature variable")

In [None]:
# NOt useful columns in our predictions, more than 99% rows have same value.
# print(df_train_copy['Utilities'].value_counts())
# NOt useful columns in our predictions, more than 99% rows have same value.
# print(df_train_copy['Street'].value_counts())
# NOt useful columns in our predictions, more than 99% rows have same value.
# print(df_train_copy['PoolQC'].value_counts())

In [None]:
# As we can see above, those columns have very little other useful data as their values and are primarily composed of a single feature. It would be better 
# if we drop them as they are adding up as a redundant feature without giving much insights about the data.
# df_train_copy = df_train_copy.drop(['Utilities', 'Street', 'PoolQC'], axis=1)
# df_test_copy = df_test_copy.drop(['Utilities', 'Street', 'PoolQC'], axis=1)

In [None]:
#The main difference between apply and transform functions is that while apply passes the dataframe in the form of columns to the custom function, 
#whereas the transform method passes the dataframe as a series to the custom function.
#Let's check out the number of 0's in dataset
# for col in numerical_cols:
#   print(col, "\t", len(list(df_train_copy.loc[df_train_copy[col] == 0, col].index)))
# Hence we can observe that many columns contain 0 as a value.
# We need some way to handle these zeros, since the columns themselves may still be useful.

In [None]:
# df_train_copy['TotalBsmtSF'].value_counts()

In [None]:
# Understand this feature descriptor later on. Right now focus on features that are in use 
# df_train_copy['TotalBsmtSF'] = df_train_copy['TotalBsmtSF'].apply(lambda x: np.exp(6) if x <= 0.0 else x)
# df_train_copy['2ndFlrSF'] = df_train_copy['2ndFlrSF'].apply(lambda x: np.exp(6.5) if x <= 0.0 else x)
# df_train_copy['LotFrontage'] = df_train_copy['LotFrontage'].apply(lambda x: np.exp(4.2) if x <= 0.0 else x)
# df_train_copy['MasVnrArea'] = df_train_copy['MasVnrArea'].apply(lambda x: np.exp(4) if x <= 0.0 else x)
# df_train_copy['BsmtFinSF1'] = df_train_copy['BsmtFinSF1'].apply(lambda x: np.exp(6.5) if x <= 0.0 else x)

In [None]:
# Creating more features by log transformation
# def log_transform(result, features):
#   m = result.shape[1]

#   for feature in features:
    
#     # The Pandas assign function assigns a new column to the dataframe with a modified feature. Look up the docs for further information.
#     result = result.assign(newcol = pd.Series(np.log(1.01+result[feature])))
#     # columns.values returns a numpy array
#     result.columns.values[m] = feature + '_log' 
#     m += 1

#   return result

# log_features = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
#                  'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
#                  'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
#                  'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
#                  'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearRemodAdd']

# df_train_copy = log_transform(df_train_copy, log_features)

Now that every possible transformation of data has been taken care of, let's one-hot encode our categorical variables. That's as easy as it sounds.

In [None]:
# df_train_copy_num = df_train_copy.select_dtypes(include=['int64', 'float64'])
# df_train_copy_cat = df_train_copy.select_dtypes(exclude=['int64', 'float64'])

# df_train_copy_cat = pd.get_dummies(df_train_copy_cat, drop_first=True)

# df_train_copy_final = pd.concat([df_train_copy_num, df_train_copy_cat], axis=1)
# remove_cols = ['LotFrontage_median', 'PoolArea_impute', 'MiscVal_impute', 'ScreenPorch_impute', '3SsnPorch_impute', 'EnclosedPorch_impute', 'WoodDeckSF_impute', 'OpenPorchSF_impute', 'OpenPorchSF_impute', 
#            'HalfBath_impute', 'Fireplaces_impute', 'Fireplaces_impute', '2ndFlrSF_impute', 'LowQualFinSF_impute']

In [80]:
# Select the numeric feature columns listed in `num_cols_1` for both splits.
# `.copy()` gives each frame its own data, so the column drops performed later
# act on an independent object instead of a slice of df_train_copy / df_test_copy.
train_data = df_train_copy[num_cols_1].copy()
test_data = df_test_copy[num_cols_1].copy()

# Feature Selection

In [81]:
# Flag near-constant features with VarianceThreshold: only columns whose
# variance exceeds 0.2 are kept. Just the fitted mask is needed in this cell,
# so the selector is fitted without building the (unused) transformed array.
var_threshold = VarianceThreshold(threshold=0.2)
var_threshold.fit(train_data)

# Boolean mask: True for every column that passes the variance threshold
var_threshold.get_support()

In [82]:
# Columns rejected by the variance filter (the support mask is False for them)
less_variance_columns = list(train_data.columns[~var_threshold.get_support()])

# Remove them from both splits. Reassigning instead of using inplace=True
# avoids mutating a frame that may be a slice of another DataFrame.
train_data = train_data.drop(columns=less_variance_columns)
test_data = test_data.drop(columns=less_variance_columns)

In [83]:
# Pairwise correlation matrix of the remaining numeric features
# (DataFrame.corr uses the Pearson coefficient by default)
corr_matrix = train_data.corr()


def highlight_cells(val):
    """Return the CSS declaration used to style one correlation-matrix cell.

    Parameters
    ----------
    val : scalar
        The correlation value held by the cell.

    Returns
    -------
    str
        'background-color: yellow' when the value is at least 0.8 but not
        exactly 1 (which excludes the diagonal); otherwise an empty string,
        which leaves the cell unstyled instead of emitting a CSS property
        with a blank value.
    """
    if val >= 0.8 and val != 1:
        return 'background-color: yellow'
    return ''

# Render the matrix, calling highlight_cells on every cell to pick its style
corr_matrix.style.applymap(highlight_cells)

In [63]:
# Preview only the first rows instead of rendering the whole frame
train_data.head()

In [84]:
# Pair grid over a small subset of the numeric features
sub_features = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
    'OverallCond', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
]
sub_train_data = train_data.loc[:, sub_features]

# Draw a scatter plot in every cell of the grid
g = sns.PairGrid(data=sub_train_data)
g.map(sns.scatterplot)

**Observations**:
1. Since we will be using decision-tree-based estimators instead of linear models, computing a correlation matrix isn't very helpful for feature selection.
2. Also, under the hood, Pearson's and Spearman's correlation coefficients assume that the relationship between two features is linear or monotonic, respectively.
3. As we can see from the above scatterplots, there isn't any clear linear or monotonic relationship between the features. Hence, we won't be using correlation values for feature selection.
4. Since we are using tree-based models, let's use ExtraTreesRegressor for feature selection. It differs from Random Forest in that, by default, it trains every tree on the entire dataset, whereas Random Forest trains each tree on a bootstrap sample (sampling with replacement). Also, the former is faster because it draws its split thresholds at random, whereas Random Forest searches for the best split according to the splitting criterion.

In [100]:
# Use Extra Trees Regressor for feature selection.
# Squared error is the estimator's default split criterion, so it is not passed
# explicitly (the 'mse' spelling was removed in scikit-learn 1.2).
selector = SelectFromModel(estimator=ExtraTreesRegressor(random_state=seed_value)).fit(train_data, y)

# Names of the columns the fitted selector keeps
mask = selector.get_support()
feature_names = train_data.columns
new_features = list(feature_names[mask])

# Reuse the selector fitted above instead of refitting it, so the transformed
# matrices are guaranteed to match `mask` / `new_features`.
train_data_1 = selector.transform(train_data)
test_data_1 = selector.transform(test_data)

Future Checkboxes:
<p>
<input type="checkbox"> Try KNN Imputer for imputation of categorical columns 
</p>

<p> 
<input type="checkbox"> Try Gradient Boosting algorithms for a better performance
</p>

# **Predictions**

In [119]:
# Report the shapes of both feature matrices and of the target
for shaped in (train_data, test_data, y):
    print(shaped.shape)

# Wrap the selected-feature matrices in DataFrames labelled with the kept feature names
train_data_1 = pd.DataFrame(train_data_1)
test_data_1 = pd.DataFrame(test_data_1)
train_data_1.columns = new_features
test_data_1.columns = new_features

# Both splits below share the same hold-out fraction, seed and shuffling
split_kwargs = dict(test_size=0.2, random_state=seed_value, shuffle=True)

# Train/validation split on the features chosen by the ExtraTreesRegressor-based selector
X_train, X_valid, y_train, y_valid = train_test_split(train_data_1, y, **split_kwargs)

# Train/validation split on the full feature set (no ExtraTreesRegressor selection)
X_train_1, X_valid_1, y_train_1, y_valid_1 = train_test_split(train_data, y, **split_kwargs)

In [120]:
# Replace missing values with 0 in the selected-feature train/validation splits
X_train, X_valid = X_train.fillna(value=0), X_valid.fillna(value=0)

# Same imputation for the splits that keep every feature
X_train_1, X_valid_1 = X_train_1.fillna(value=0), X_valid_1.fillna(value=0)

In [121]:
# Root of the mean squared error between targets and predictions
def rmsle(y, preds):
    """Return the square root of mean_squared_error(y, preds).

    Despite the name, no logarithm is taken here: the result is a plain RMSE
    of whatever values are passed in.
    """
    squared_error = mean_squared_error(y, preds)
    return np.sqrt(squared_error)

# Calculate R-2 and Adjusted R2
def r2_value(dataframe, y, preds):
    """Return the R2 and adjusted R2 of `preds` against `y`.

    Parameters
    ----------
    dataframe : DataFrame
        Feature frame of the model being scored; only its number of columns
        (the predictor count p) is used.
    y : array-like
        True target values.
    preds : array-like
        Predicted target values, one per entry of `y`.

    Returns
    -------
    tuple
        (r2score, adj_r2score), where
        adj_r2score = 1 - (1 - R2) * (n - 1) / (n - p - 1).

    The sample count n is taken from `y`, i.e. from the observations R2 is
    actually computed on, so the adjustment stays correct even when
    `dataframe` has a different number of rows than the scored split.
    """
    no_of_instances = len(y)
    no_of_features = len(dataframe.columns)
    r2score = r2_score(y, preds)
    adj_r2score = 1 - (((1-r2score)*(no_of_instances-1))/(no_of_instances-no_of_features-1))
    return r2score, adj_r2score

***Decision tree***

In [115]:
# Initialize the regressor. Squared error is the default split criterion, so it
# is not passed explicitly (the 'mse' spelling was removed in scikit-learn 1.2).
dt_reg = DecisionTreeRegressor(random_state=seed_value)
dt_reg.fit(X_train, y_train)
preds_dt = dt_reg.predict(X_valid)

# Calculate rmse score 
rmse_dt = rmsle(y_valid, preds_dt)

# Calculate R2 and Adjusted R2 score; the frame passed in is the validation
# split the predictions were made on, so rows and feature count match them
r2_dt, r_adj_dt = r2_value(X_valid, y_valid, preds_dt)

# The messages name the model that was actually fitted in this cell
print(f"The RMSE value for Decision Tree Regressor is {rmse_dt}")
print(f"The R2 value for Decision Tree Regressor is {r2_dt}")
print(f"The Adjusted R2 value for Decision Tree Regressor is {r_adj_dt}")

***Random Forest(Hyperparameter Tuning with RandomizedSearch CV + K-Fold Cross Validation)***

In [118]:
# Initialize Random Forest Regressor on the selected features.
# Squared error is the default split criterion, so it is not passed explicitly
# (the 'mse' spelling was removed in scikit-learn 1.2).
random_reg = RandomForestRegressor(bootstrap=True, oob_score=True, random_state=seed_value, verbose=2)
random_reg.fit(X_train, y_train)
preds = random_reg.predict(X_valid)

# Calculate rmse score
rmse_rf = rmsle(y_valid, preds)

# Calculate R2 and Adjusted R2 scores; the frame passed in is the validation
# split the predictions were made on, so rows and feature count match them
r2_rf, adj_r2_rf = r2_value(X_valid, y_valid, preds)


print(f"The RMSE value for Random Forest Regressor is {rmse_rf}")
print(f"The R2 value for Random Forest Regressor is {r2_rf}")
print(f"The Adjusted R2 value for Random Forest Regressor is {adj_r2_rf}")

In [122]:
# Initialize Random Forest Regressor on the full feature set (no model-based selection).
# Squared error is the default split criterion, so it is not passed explicitly
# (the 'mse' spelling was removed in scikit-learn 1.2).
random_reg_1 = RandomForestRegressor(bootstrap=True, oob_score=True, random_state=seed_value, verbose=2)
random_reg_1.fit(X_train_1, y_train_1)
preds_1 = random_reg_1.predict(X_valid_1)

# Calculate rmse score
rmse_rf_1 = rmsle(y_valid_1, preds_1)

# Calculate R2 and Adjusted R2 scores. This model uses every column of
# X_valid_1, so that frame (not the selected-feature one) supplies the
# feature count for the adjustment.
r2_rf_1, adj_r2_rf_1 = r2_value(X_valid_1, y_valid_1, preds_1)


print(f"The RMSE value for Random Forest Regressor is {rmse_rf_1}")
print(f"The R2 value for Random Forest Regressor is {r2_rf_1}")
print(f"The Adjusted R2 value for Random Forest Regressor is {adj_r2_rf_1}")

In [123]:
# Printing the estimator for this model.
# Newer scikit-learn releases expose it as `estimator_` (`base_estimator_` was
# removed in 1.4); older releases only have `base_estimator_`.
fitted_base_estimator = getattr(random_reg_1, "estimator_", None)
if fitted_base_estimator is None:
    fitted_base_estimator = random_reg_1.base_estimator_
print(fitted_base_estimator)

# Print training set out-of-sample score
print(random_reg_1.oob_score_)

In [124]:
# Use GridSearchCV for best combination of hyperparameters and check if our model generalizes further
# Use the entire dataset for training here as it would enable the model to learn from a larger pool of training instances 

# Initialize K-Fold Cross-Validation 
cv = KFold(shuffle=True, random_state=seed_value)

# Search space for the exhaustive grid search.
# `None` means "use all features", which is what 'auto' meant for regressors;
# the 'auto' spelling was removed in scikit-learn 1.3.
params_random_reg = {
    'n_estimators': list(range(100, 500, 50)),
    'max_depth': list(range(10, 100, 5)),
    'min_samples_leaf': list(range(50, 100, 5)),
    'max_samples': list(range(100, 500, 50)),
    'max_features': ['sqrt', 'log2', None]
}

# Wider search space intended for the randomized search
param_random_reg_2 = {
                        'bootstrap': [True, False],
                        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                        'max_features': [None, 'sqrt'],
                        'min_samples_leaf': [1, 2, 4],
                        'min_samples_split': [2, 5, 10],
                        'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# random_model = GridSearchCV(random_reg, param_grid = params_random_reg, cv=cv)
# random_model.fit(train_data, y)

In [128]:
# Grid Search CV is taking too much time
# Reinitialize another random forest model 
# Let's use Randomized Search CV and use it to for estimating the range of values to be used in GridSearch CV 
# rf_reg_2 = RandomForestRegressor(random_state=seed_value)
# random_model_2 = RandomizedSearchCV(rf_reg_2, n_iter=20, param_distributions = param_random_reg_2, cv=cv)
# random_model_2.fit(train_data, y)
# Get the best parameter values 
# print(random_model_2.best_params_)
# print(random_model_2.best_score_)

In [129]:
# Let's estimate validation set on this range of hyperparameters.
# The seed is fixed like for the other models so the scores are reproducible.
final_random_model = RandomForestRegressor(n_estimators=300, min_samples_leaf=55, max_samples=400, max_features='log2', max_depth=45, random_state=seed_value)
final_random_model.fit(X_train, y_train)
preds_val_3 = final_random_model.predict(X_valid)
rmse_val_3 = rmsle(y_valid, preds_val_3)

# The model was fitted on the selected features, so the validation frame with
# those same columns supplies the feature count for the adjusted R2
r2_val_3, adj2_val_3 = r2_value(X_valid, y_valid, preds_val_3)

print(f"The RMSE value for tuned Random Forest Regressor is {rmse_val_3}")
print(f"The R2 value for tuned Random Forest Regressor is {r2_val_3}")
print(f"The Adjusted R2 value for Random Forest Regressor is {adj2_val_3}")

**Observations**:
1. After obtaining the best set of parameters through Randomized Search CV, the R2 and adjusted R2 values decreased. The test-set score on submission also got worse.
2. Maybe this is because Randomized Search CV only evaluates a random sample of the parameter combinations instead of fitting a model for every combination, so the best combination may never be tried. The 
   resulting model hence doesn't generalize well.

In [131]:
# Feature importances reported by the forest fitted on the full feature set,
# labelled with the feature names and listed from most to least important
importance = random_reg_1.feature_importances_
feat_importances = pd.Series(data=importance, index=train_data.columns)
feat_importances.sort_values(ascending=False)

**Observations:**
1. Except for some top 10-12 features, rest of them have a negligible impact on prediction of house prices. 
2. From some preliminary observations, the rating of the overall material and finish of the house has the largest bearing on final house prices
3. Rest of the features have a reduced impact

In [133]:
# Let's verify these results with an F-test.
# f_regression returns the F-statistics followed by their p-values.
test = f_regression(X_train, y_train)

# Label each statistic with the column of X_train it was computed for
fs_imp_verify = pd.DataFrame({
    'Feature': X_train.columns,
    'Critical Value': test[0],
    'P-Value': test[1],
})

# sort_values returns a new frame, so keep the result: the displayed table is
# then ordered from the most to the least significant feature
fs_imp_verify = fs_imp_verify.sort_values('P-Value')
fs_imp_verify

**Observations**:
1. Here we have used the F-Test to verify the feature importance values provided by the random forest regressor 
2. The test verifies that MSSubClass is indeed the most important feature for calculation of house prices

**Observations**
1. By adding the additional training data, the overall R2 and adjusted R2 values have decreased, probably due to increase in number of training instances
2. Performing data augmentation has enabled the model to generalize better, as evident from the improved test set score

**Defining Cross-Validation and all possible regression evaluation metrics in the notebook**

In [None]:
# Define Cross Validation and other relatable metrics here. The cell has been deleted
# ridge_regressor = Ridge() 
# params = {'alpha': [1,0.1,0.01,0.001,0.0001,0] , "fit_intercept": [True, False], "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
# scaler = RobustScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

In [None]:
# grid_ridge = GridSearchCV(ridge_regressor, param_grid=params, cv=kfold, n_jobs=-1, scoring='neg_mean_squared_error')
# grid_ridge.fit(X_train, y_train)
# alpha = grid_ridge.best_params_
# ridge_score = grid_ridge.best_score_
# print(alpha)
# print(ridge_score)

In [None]:
# regressor_ridge_best = Ridge(alpha=0.01, fit_intercept='True', solver='cholesky')
# regressor_ridge_best.fit(X_train, y_train)
# predictions = regressor_ridge_best.predict(X_val)

In [None]:
# eval = rmsle(y_val, predictions)
# r2_score(y_val, predictions) #defined as (1 - (SSres/SStot.))

In [None]:
# regressor_ridge_best.fit(X, y)
# predictions_final = regressor_ridge_best.predict(X_test)

In [135]:
# Impute 'Missing' with '0' as there are only a few of these values
# for index in test_data.index:
#     test_data['BsmtFullBath'] = np.where(test_data['BsmtFullBath']=='Missing', 0, test_data['BsmtFullBath'])
#     test_data['BsmtHalfBath'] = np.where(test_data['BsmtHalfBath']=='Missing', 0, test_data['BsmtHalfBath'])


# Now cast the 'object' columns as 'float' types
# convert_dtypes = {
#                     'BsmtFullBath': 'float64',
#                     'BsmtHalfBath': 'float64', 
#                     'BsmtFinSF1' : 'float64', 
#                     'BsmtFinSF2' : 'float64', 
#                     'BsmtUnfSF' : 'float64', 
#                     'TotalBsmtSF': 'float64', 
#                }
# test_data = test_data.astype(convert_dtypes)

***We have got a pretty appreciable R2 and adjusted R2 score on the training dataset. Let's predict for the test dataset***

In [1]:
# Predict on the test features with the forest trained on the full feature set.
# Missing values are replaced with 0 first, mirroring the imputation applied to
# the training split (X_train_1) this model was fitted on.
preds_final = random_reg_1.predict(test_data.fillna(0))

# Submission of final predictions: overwrite the second column of the sample
# submission with the predictions and write the result to disk
submission = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
submission.iloc[:, 1] = preds_final
submission.to_csv("./submission_prediction.csv", index=False)

# Retraining model with reduced number of features