In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the three competition CSVs from the Kaggle input directory.
# keep_default_na=True makes pandas parse its standard NA markers as NaN.
INPUT_DIR = '../input/house-prices-advanced-regression-techniques'

test_df = pd.read_csv(f'{INPUT_DIR}/test.csv', keep_default_na=True)
train_df = pd.read_csv(f'{INPUT_DIR}/train.csv', keep_default_na=True)
sample_sub_df = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
# Promote the "Id" column to the row index of both frames (in place),
# then preview the training frame.
train_df.set_index("Id", inplace=True)
test_df.set_index("Id", inplace=True)
train_df.head()

In [None]:
train_df.info()

In [None]:
# Remember the size of each split so the stacked frame can be cut apart again later.
ntrain, ntest = len(train_df), len(test_df)

# Target values of the training rows as a NumPy array.
y_train = train_df['SalePrice'].to_numpy()

# Stack train on top of test (row order preserved) and drop the target column,
# leaving only the features shared by both splits.
all_data = pd.concat([train_df, test_df], axis=0).drop(columns=['SalePrice'])

# Feature Engineering

In [None]:
numerical_features = all_data.select_dtypes([int, float]).columns

In [None]:
numerical_features

In [None]:
all_data[numerical_features].isna().any()

In [None]:
all_data[['LotFrontage','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
          'TotalBsmtSF','BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea']].isna().sum()

In [None]:
all_data[['LotFrontage','MasVnrArea','GarageYrBlt']].describe()

In [None]:
all_data[['LotFrontage','MasVnrArea','GarageYrBlt']].hist(bins=20,figsize=(20, 6))

Missing values in LotFrontage and MasVnrArea will be imputed with the mean of each column. For GarageYrBlt, in 79% of instances the garage was built the same year as the house, so missing values will be imputed with the year the house was built.

In [None]:
def impute_numeric(train_df):
  """Fill the missing values of the numerical columns, modifying ``train_df`` in place.

  Imputation rules:
    * ``LotFrontage`` and ``MasVnrArea``: mean of the column.
    * ``GarageYrBlt``: value of ``YearBuilt`` on the same row.
    * Garage and basement size/count columns: 0.

  Parameters
  ----------
  train_df : pandas.DataFrame
      Frame containing the columns listed above plus ``YearBuilt``.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in, with the gaps filled.
  """
  # Assign the filled column back instead of calling fillna(inplace=True) on
  # train_df[col]: the chained in-place form operates on a temporary object and
  # does not reliably update the frame in recent pandas versions.
  for col in ('LotFrontage', 'MasVnrArea'):
    train_df[col] = train_df[col].fillna(train_df[col].mean())
  train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(train_df['YearBuilt'])
  zero_filled = ('GarageArea', 'GarageCars',
                 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                 'BsmtFullBath', 'BsmtHalfBath')
  for col in zero_filled:
    train_df[col] = train_df[col].fillna(0)
  return train_df

In [None]:
all_data = impute_numeric(all_data)

Let's see now if our numerical values are complete:

In [None]:
all_data[numerical_features].isna().sum()

Now, let's deal with categorical features:

In [None]:
categorical_features = all_data.select_dtypes([object]).columns
categorical_features

In [None]:
all_data[categorical_features].isna().sum()

In [None]:
train_df[['MSZoning','Alley','Utilities','Exterior1st','Exterior2nd','MasVnrType','BsmtQual','BsmtCond',
          'BsmtExposure','BsmtFinType1','BsmtFinType2','Electrical','KitchenQual','Functional','FireplaceQu',
          'GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature','SaleType']].dtypes

In [None]:
train_df[['MSZoning','Alley','Utilities','Exterior1st','Exterior2nd','MasVnrType','BsmtQual','BsmtCond',
          'BsmtExposure','BsmtFinType1','BsmtFinType2','Electrical','KitchenQual','Functional','FireplaceQu',
          'GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature','SaleType']].isna().sum()

In [None]:
test_df[['MSZoning','Alley','Utilities','Exterior1st','Exterior2nd','MasVnrType','BsmtQual','BsmtCond',
          'BsmtExposure','BsmtFinType1','BsmtFinType2','Electrical','KitchenQual','Functional','FireplaceQu',
          'GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature','SaleType']].isna().sum()

In [None]:
def categorical_impute(all_data):
  """Fill missing categorical values, modifying ``all_data`` in place.

  Missing entries in the pool, misc-feature, alley, fence, fireplace, garage,
  basement and masonry-veneer columns are replaced with the literal string
  "None". Missing ``Electrical`` entries are replaced with "SBrkr".

  Parameters
  ----------
  all_data : pandas.DataFrame
      Frame containing every column named below.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in, with the gaps filled.
  """
  none_filled = ("PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
                 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 "MasVnrType")
  # Assign the result back rather than using fillna(inplace=True) on
  # all_data[col]: the chained in-place form acts on a temporary object and
  # does not reliably update the frame in recent pandas versions.
  for col in none_filled:
      all_data[col] = all_data[col].fillna("None")
  all_data["Electrical"] = all_data["Electrical"].fillna("SBrkr")
  return all_data

In [None]:
all_data=categorical_impute(all_data)

In [None]:
all_data[categorical_features].isna().sum()

In [None]:
# List the distinct values (NaN included) of the categorical columns that still
# have gaps, taken from the raw train and test frames.
# The stacked frame does not depend on the loop variable, so build it once.
raw_stacked = pd.concat([train_df, test_df])
for categories in ['MSZoning','Utilities','Exterior1st','Exterior2nd','KitchenQual','Functional','SaleType']:
    print(categories)
    print(raw_stacked[categories].sort_values().unique())
    print('\n')

In [None]:
def categorical_impute_rest(all_data):
  """Fill the remaining categorical gaps with each column's most frequent value.

  Applies to ``MSZoning``, ``Utilities``, ``Exterior1st``, ``Exterior2nd``,
  ``KitchenQual``, ``Functional`` and ``SaleType``; ``all_data`` is modified
  in place.

  Parameters
  ----------
  all_data : pandas.DataFrame
      Frame containing the seven columns above.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in, with the gaps filled.
  """
  mode_filled = ('MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
                 'KitchenQual', 'Functional', 'SaleType')
  # Assign the result back rather than using fillna(inplace=True) on
  # all_data[col]: the chained in-place form acts on a temporary object and
  # does not reliably update the frame in recent pandas versions.
  for col in mode_filled:
    # mode() returns the modes in sorted order; [0] picks the first one.
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
  return all_data

In [None]:
all_data=categorical_impute_rest(all_data)

In [None]:
all_data[categorical_features].isna().sum()

In [None]:
def features_type(all_data):
  """Print how many object-typed and how many int/float-typed columns ``all_data`` has.

  Three lines are printed: the object-column count, the int/float-column
  count, and the sum of the two. Nothing is returned.
  """
  n_categorical = all_data.select_dtypes([object]).shape[1]
  n_numerical = all_data.select_dtypes([int, float]).shape[1]
  print('Number of categorical nominal features: ',n_categorical)
  print('Number of numerical features: ',n_numerical)
  print('Number of total features: ',n_categorical+n_numerical)

In [None]:
features_type(all_data)

In [None]:
all_data.info()

In [None]:
all_data[categorical_features].isna().any()

Now we need to know which of these features are 'significantly skewed', in other words whether a single class accounts for almost all instances. To check this I will show, for each feature, the share of instances covered by its most frequent class and by its two most frequent classes. For example, for MSSubClass the top class corresponds to 37% of instances and the top 2 classes correspond to 56%.

In [None]:
# Share of rows taken by the most frequent class of each categorical feature.
# value_counts() sorts by descending frequency, so position 0 is the top class.
# .iloc is required because the result is indexed by class label, not by
# integer position; plain [0] relies on a deprecated positional fallback.
first_class = [
    all_data[feature].value_counts(normalize=True).iloc[0]
    for feature in categorical_features
]

In [None]:
# Combined share of rows taken by the two most frequent classes of each
# categorical feature. value_counts() is computed once per feature and sliced
# by position (.iloc), since its index holds class labels rather than integers.
# A feature with a single class simply contributes that class's share.
second_class = [
    all_data[feature].value_counts(normalize=True).iloc[:2].sum()
    for feature in categorical_features
]

In [None]:
# One row per categorical feature: its name, the share of its top class, and
# the combined share of its top two classes.
pd.DataFrame({
    'Feature': categorical_features,
    'Top 1': first_class,
    'Top 1 & 2': second_class,
})

A feature whose top class represents more than 98% of instances has almost the same value everywhere, so it adds very little information and will be dropped.

Because of this, the following features will be dropped:

- Street
- Utilities
- Condition2
- RoofMatl
- Heating
- PoolQC

In [None]:
# Remove the near-constant categorical columns identified above.
near_constant = ['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating', 'PoolQC']
all_data = all_data.drop(columns=near_constant)

In [None]:
all_data[numerical_features].isna().any()

In [None]:
features_type(all_data)

MSSubClass and MoSold are categorical nominal features that are misrepresented as numerical, so now I will change their type:

In [None]:
# Pass the column through the `x=` keyword: recent seaborn releases bind the
# first positional argument to `data`, so a bare Series is not read as x.
sns.countplot(x=all_data['MSSubClass'])

In [None]:
# Pass the column through the `x=` keyword: recent seaborn releases bind the
# first positional argument to `data`, so a bare Series is not read as x.
sns.countplot(x=all_data['MoSold'])

In [None]:
# Both columns hold category codes stored as numbers; cast them to strings so
# they are treated as categorical features from here on.
nominal_codes = ['MSSubClass', 'MoSold']
all_data[nominal_codes] = all_data[nominal_codes].astype(str)

In [None]:
features_type(all_data)

Let's create more features derived from the numerical in order to increase the information given to the model, below I have created 4 features and changed one:

In [None]:
# Bathrooms across basement and above-ground levels; half baths count as 0.5.
all_data['TotalBath'] = (
    all_data['BsmtFullBath'] + all_data['BsmtHalfBath'] * 0.5
    + all_data['FullBath'] + all_data['HalfBath'] * 0.5
)

# Basement plus first and second floor area. Computed BEFORE TotalBsmtSF is
# overwritten below, so it uses the original basement total.
all_data['TotalSF'] = (
    all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
)

# Overwrites TotalBsmtSF with itself plus both finished-basement areas.
# NOTE(review): if TotalBsmtSF already includes BsmtFinSF1/BsmtFinSF2 in the raw
# data, the finished area is counted twice here - confirm this is intended.
all_data['TotalBsmtSF'] = (
    all_data['TotalBsmtSF'] + all_data['BsmtFinSF1'] + all_data['BsmtFinSF2']
)

# Years between the last remodel and the sale.
# NOTE(review): this can be negative if a remodel year is later than the sale
# year - verify against the data.
all_data['Age'] = all_data['YrSold'] - all_data['YearRemodAdd']

# Combined area of every porch type and the wood deck.
all_data['Total_porch_sf'] = (
    all_data['OpenPorchSF'] + all_data['3SsnPorch'] + all_data['EnclosedPorch']
    + all_data['ScreenPorch'] + all_data['WoodDeckSF']
)

In [None]:
features_type(all_data)

Now, show the distribution and compute the skewness of each numerical feature and apply log transform to each one with value higher than 1.0:

In [None]:
categorical_features = all_data.select_dtypes([object]).columns
numerical_features = all_data.select_dtypes([int, float]).columns

In [None]:
# Draw one KDE panel per numerical feature on a grid that is `b` columns wide.
fig = plt.figure(figsize=(25,40)) #figure size
b = 3  # number of columns
# Derive the row count from the number of features (ceiling division) so the
# grid always has enough cells, however many numerical features exist.
a = max(1, -(-len(numerical_features) // b))  # number of rows

# Subplot positions are 1-based, hence start=1.
for c, feat in enumerate(numerical_features, start=1):
    plt.subplot(a, b, c)
    sns.kdeplot(x=all_data[feat])

plt.tight_layout()
plt.show()

In [None]:
# Collect (and report) every numerical feature whose absolute skewness exceeds 1.0.
skewed_feat = []
for k in numerical_features:
  skewness = all_data[k].skew()
  if np.abs(skewness) > 1.0:
    print('Skew of feature: ',k,' is: ',skewness)
    skewed_feat.append(k)

print('Number of skewed features: ',len(skewed_feat))

In [None]:
# Apply log(1 + x) to all skewed features at once.
# NOTE(review): the selection is based on |skew| > 1, so negatively skewed
# features are transformed too, and any value <= -1 would become NaN/-inf -
# confirm none of the selected columns can hold such values.
all_data[skewed_feat] = np.log1p(all_data[skewed_feat])

Encoding of categorical nominal features:

In [None]:
features_type(all_data)

In [None]:
# Number of columns one-hot encoding will create: each feature contributes
# (distinct values - 1) because the first level is dropped.
# dropna=False counts NaN as a value, as Series.unique() does.
dummy_cols = sum(
    all_data[feature].nunique(dropna=False) - 1
    for feature in categorical_features
)

dummy_cols

Therefore, after one-hot encoding of these features we will have a total of:

263 = 38 (numerical) + 225 (categorical_encoded)

In [None]:
# One-hot encode the categorical columns (first level of each dropped).
all_data_dummy = pd.get_dummies(all_data[categorical_features], drop_first=True)
# Swap the original categorical columns for their encoded counterparts,
# appending the dummy columns after the remaining ones.
all_data = pd.concat(
    [all_data.drop(columns=categorical_features), all_data_dummy],
    axis=1,
)
all_data.shape

In [None]:
all_data.describe().T

In [None]:
all_data

### Label normalization

Label normalization consists of applying a function that transforms the distribution of our label into an approximately Gaussian shape. Many models, linear models in particular, tend to fit better when the target (and therefore the residuals) is close to normally distributed, so transforming the label helps us obtain a lower prediction error. The function we will use is the logarithm; we therefore have to remember that after the prediction we must bring the label back to its original scale, which is achieved by applying the exponential to the predictions.

In [None]:
train_df['SalePrice'].hist(bins=50)

In [None]:
train_df['SalePrice'].skew()

We will use D'Agostino's normality test, which requires stating a null hypothesis and an alternative hypothesis.
The test outputs a p-value: the probability of observing a departure from normality at least as large as the one in our sample if the data really were normally distributed.
With a threshold of 0.05, a p-value below the threshold means we reject the null hypothesis that the distribution is normal; otherwise we fail to reject it.

- H0: The distribution is Normal.
- H1: The distribution is not Normal.

In [None]:
from scipy.stats.mstats import normaltest

In [None]:
normaltest(train_df['SalePrice'].values)

As p-value is much lower than 0.05 we reject H0, therefore we have to find a proper method to convert our label.

In [None]:
label=np.log(train_df['SalePrice'])

In [None]:
plt.hist(label)

In [None]:
pd.DataFrame(label).skew()

In [None]:
#ntrain = train_df.shape[0]
ntrain

In [None]:
#ntest = test_df.shape[0]
ntest

In [None]:
# Cut the stacked frame back into its two parts by position: the first
# `ntrain` rows came from the training file, the rest from the test file.
training = all_data.iloc[:ntrain]
testing = all_data.iloc[ntrain:]

# Modeling

The following models will be built and compared using their corresponding error measurements:

- RidgeCV
- LassoCV
- ElasticNetCV
- XGBoost Regressor

Before building the different models, let's declare some error metrics in order to compare the performance of each one:

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy.special import inv_boxcox

def error_metrics(y_pred,y_val):
  """Print the MSE, RMSE and R² of ``y_pred`` measured against ``y_val``.

  Parameters
  ----------
  y_pred : array-like
      Values predicted by the model.
  y_val : array-like
      True (validation) values.

  Returns
  -------
  None
      The three metrics are printed, not returned.
  """
  # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but R² is
  # not, so the true values must be passed first.
  mse = mean_squared_error(y_val, y_pred)
  print('MSE: ',mse)
  print('RMSE: ',np.sqrt(mse))
  print('Coefficient of determination: ',r2_score(y_val, y_pred))

As a typical step, we have to split our dataset into training and validation sets in order to see how the model behaves on instances it has not seen during training, and to use the resulting metrics to guide hyperparameter tuning.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_val, label_train, label_val = train_test_split(training, label, test_size=0.25, random_state=42)

In [None]:
X_train.shape, label_train.shape, X_val.shape, label_val.shape

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both the training and the validation split.
s = StandardScaler()
s.fit(X_train)

X_train_s = s.transform(X_train)
X_val_s = s.transform(X_val)

## RidgeCV

In [None]:
from sklearn.linear_model import RidgeCV

# Candidate regularization strengths; the best one is picked by 4-fold CV.
alphas = [1e-3, 5e-3, 0.02, 0.05]

ridgeCV = RidgeCV(alphas=alphas, cv=4)
ridgeCV.fit(X_train_s, label_train)

# Predictions on the held-out validation split (log scale, like the label).
ridgeCV_pre = ridgeCV.predict(X_val_s)

print('Alpha found: ',ridgeCV.alpha_)

In [None]:
error_metrics(ridgeCV_pre,label_val)

## LassoCV

In [None]:
from sklearn.linear_model import LassoCV

# Candidate regularization strengths; the best one is picked by 3-fold CV.
alphas2 = np.array([5e-8, 1e-7, 1e-6])

# max_iter must be an integer: recent scikit-learn versions validate parameter
# types and reject a float such as 5e4.
lassoCV = LassoCV(alphas=alphas2, max_iter=50000, cv=3).fit(X_train_s, label_train)

# Predictions on the held-out validation split (log scale, like the label).
lassoCV_pre = lassoCV.predict(X_val_s)

print('Alpha found: ',lassoCV.alpha_)

In [None]:
error_metrics(lassoCV_pre,label_val)

## ElasticNetCV

In [None]:
from sklearn.linear_model import ElasticNetCV

# Candidate L1/L2 mixing ratios, from 0.8 to 0.99 in ten steps.
l1_ratios = np.linspace(0.8, 0.99, 10)

# max_iter must be an integer: recent scikit-learn versions validate parameter
# types and reject a float such as 1e4.
elasticNetCV = ElasticNetCV(alphas=alphas2, l1_ratio=l1_ratios, max_iter=10000).fit(X_train_s, label_train)
# Predictions on the held-out validation split (log scale, like the label).
elasticNetCV_pre = elasticNetCV.predict(X_val_s)

print('Alpha found: ',elasticNetCV.alpha_)
print('l1_ratio: ', elasticNetCV.l1_ratio_)

In [None]:
error_metrics(elasticNetCV_pre,label_val)

## XGBoost Regressor

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install xgboost

In [None]:
from xgboost import XGBRegressor

The hyperparameters used in the following model were found with several grid searches, and some come from past projects. I recommend changing these values slightly to see whether the model can improve even more, but I strongly advise you to learn systematic ways of finding such values.

In [None]:
# Gradient-boosted trees: many shallow trees with a small learning rate and
# heavy row/column subsampling.
XGB = XGBRegressor(colsample_bytree=0.2,
                    gamma=0.0,
                    learning_rate=0.01,
                    max_depth=3,
                    min_child_weight=1.5,
                    n_estimators=9500,
                    reg_alpha=0.7,
                    reg_lambda=0.7,
                    subsample=0.2,
                    # random_state is the scikit-learn API name for the seed.
                    random_state=42,
                    # verbosity=0 silences training logs; it replaces the old
                    # `silent` flag, which current XGBoost no longer recognises.
                    verbosity=0).fit(X_train_s, label_train)

# Predictions on the held-out validation split (log scale, like the label).
XGB_pre = XGB.predict(X_val_s)

In [None]:
error_metrics(XGB_pre,label_val)

In [None]:
#np.sqrt(mean_squared_error(ridgeCV_pre,label_val))
#np.sqrt(mean_squared_error(lassoCV_pre,label_val))
#np.sqrt(mean_squared_error(elasticNetCV_pre,label_val))
np.sqrt(mean_squared_error(XGB_pre,label_val))

In [None]:
# Validation-set predictions of every model, keyed by the column label to show.
model_predictions = {
    'RidgeCV': ridgeCV_pre,
    'LassoCV': lassoCV_pre,
    'ElasticNetCV': elasticNetCV_pre,
    'XGBoost': XGB_pre,
}

data = {}
for model_name, predictions in model_predictions.items():
    # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but R² is
    # not, so the true labels must be passed first.
    mse = mean_squared_error(label_val, predictions)
    data[model_name] = [mse, np.sqrt(mse), r2_score(label_val, predictions)]

pd.DataFrame(data, index=['MSE','RMSE','R2 score'])

In the summary above we can see XGBoost slightly outperformed the other models, but such difference becomes significant in scoring, because of that I will continue with this model and predict the label of the instances contained in the test file.

As our model was trained with standardized features, we have to apply the same transformation to the testing set:

In [None]:
testing_s = s.transform(testing)
test_prediction=XGB.predict(testing_s)

In [None]:
test_prediction

Remember that we normalized our label using the logarithm. The predictions for the testing set are therefore on that log scale, so we have to apply the exponential to convert them back to currency values:

In [None]:
np.exp(test_prediction)

In [None]:
testing.index

In [None]:
# Undo the log transform of the label to get prices on the original scale.
test_XGB_pred = np.exp(XGB.predict(testing_s))

# Submission frame: one SalePrice per test house, indexed by its Id.
XGB_submission = pd.DataFrame(
    {"SalePrice": test_XGB_pred},
    index=testing.index.rename("Id"),
)

XGB_submission.to_csv("XGB_submission.csv")

In [None]:
XGB_submission.head()

I would like to know any feedback in order to increase the performance of the models or tell me if you found a different one even better!

If you liked this notebook, I would greatly appreciate your upvote, especially if you want to see more projects/tutorials like this one. I encourage you to take a look at my project portfolio; I am sure you will love it.

Thank you!