<b>Setup:</b>
<br>
<ol>
    <li>Import all the required packages.</li>
    <li>Initialize objects for encoders.</li>
</ol>

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
# GradientBoostingRegressor is imported from the public `sklearn.ensemble`
# package: the private `sklearn.ensemble.gradient_boosting` module path is
# deprecated and is not available in newer scikit-learn releases.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Encoder instances created once during setup.
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)

<b>Data acquisition:</b>
<br>
Read the train and test data. The CSV files are expected in the <code>Dataset</code> sub-directory located next to the Jupyter notebook.

In [2]:
# Both CSV files are read from the 'Dataset' folder, resolved relative to the
# current working directory.
train, test = (
    pd.read_csv(os.path.join('Dataset', file_name))
    for file_name in ('train.csv', 'test.csv')
)

Checking the shape of the train data.

In [3]:
train.shape

(1460, 81)

Checking the shape of the test data.

In [4]:
test.shape

(1459, 80)

Generate copies of the train and test data for processing.

In [5]:
# Work on copies so the frames loaded from disk are left as they were read.
train_processed, test_processed = train.copy(), test.copy()

Statistical description of the train data.

In [6]:
train_processed.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
# Name of the column holding the value to predict. It is used below to filter
# rows with a missing target, to extract the target values, and to drop the
# column from the feature frames.
target_col = 'SalePrice'

<b>Data cleaning and pre-processing:</b>

• Check for rows with a missing value in the target column and remove them if any exist.

In [8]:
# Remove training rows whose target value is missing. The shape is printed
# before and after so that any removal is visible in the output.
print("Initial shape:", train_processed.shape)
target_missing = train_processed[target_col].isnull()
if target_missing.any():
    train_processed = train_processed[~target_missing]
print("Final shape:", train_processed.shape)

Initial shape: (1460, 81)
Final shape: (1460, 81)


• Extracting the target column separately.

In [9]:
# Keep the target values on their own; the column itself is removed from the
# feature frame in a later cell.
y_train = train_processed[target_col]

In [10]:
y_train.shape[0]

1460

• Dropping the target column from the train data.
<br>
• Additionally, the 'Id' column is also dropped since it doesn't contribute to the dataset.

In [11]:
# Remove the target and the 'Id' column from both feature frames.
# errors='ignore' means a label that is not present is simply skipped.
train_processed = train_processed.drop(columns=[target_col, 'Id'], errors='ignore')
test_processed = test_processed.drop(columns=[target_col, 'Id'], errors='ignore')

<ul>
    <li>All the categorical columns have been identified using the description of the data provided in 'data_description.txt'.</li>
    <li>This is necessary since there are columns with numerical values that are in fact categorical in nature.</li>
    <li>Finally, we mark all other columns as numerical columns.</li>
</ul>

In [12]:
# Columns treated as categorical (includes numerically coded ones such as
# MSSubClass, MoSold, OverallQual and OverallCond).
categorical_data_attributes = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 
                               'Utilities','LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 
                               'Condition2', 'BldgType', 'HouseStyle','RoofStyle', 'RoofMatl', 
                               'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
                              'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                               'BsmtFinType2', 'Heating','HeatingQC', 'Electrical', 'KitchenQual',
                               'Functional', 'FireplaceQu', 'GarageType','GarageFinish', 'GarageQual',
                               'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MoSold','MiscFeature', 
                               'SaleType', 'SaleCondition', 'OverallQual', 'OverallCond', 'CentralAir']

# Year columns that receive their own transformation later in the notebook.
to_transform_data_attributes = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

cols = train_processed.columns

# Every remaining column is treated as numerical. The list is built by walking
# `cols` in order, so its ordering is the same on every run (set arithmetic
# would yield an arbitrary order).
non_numeric_cols = set(categorical_data_attributes) | set(to_transform_data_attributes)
num_cols = [col for col in cols if col not in non_numeric_cols]

• Here, the columns with missing values are enumerated in terms of percentage of missing values per column.
<br>
• A threshold, <i>50%</i>, is set for filtering out columns with percent of missing values greater than the specified value.
<br>
• This is done to ensure that the data transformation, to be applied, doesn't skew the dataset towards any particular bias.

In [14]:
# Fraction of missing entries per training column; columns whose fraction
# exceeds `significance_level` are collected for removal.
null_value_counts = train_processed.isnull().sum()
null_value_percent = null_value_counts / len(train_processed)
significance_level = 0.5
null_value_majority = null_value_percent[null_value_percent.gt(significance_level)]
null_value_majority_columns = null_value_majority.index.values
print("Columns to be dropped due to missing values:", list(null_value_majority_columns))

Columns to be dropped due to missing values: ['Alley', 'PoolQC', 'Fence', 'MiscFeature']


<b>Data Transformations:</b>

The following transformations are applied onto the test and training data.
<ol>
    <li>The columns identified with missing values over the specified threshold are removed from the datasets.</li>
    <li>All the categorical columns are transformed into one-hot encoded columns with the original columns dropped from the datasets. Any missing values are replaced with the mode values for the corresponding columns.</li>
    <li>The missing values in numerical columns are filled using the mode or median values for the corresponding columns.</li>
    <li>Few numerical columns are transformed as follows:
        <ol>
            <li>The age of the house at the time of sale is calculated and added as a new column.</li>
            <li>The years since the remodel is calculated and added as a new column.</li>
            <li>The age of the garage is also calculated similarly and added as a new column.</li>
        </ol>
    </li>
    <li>Additionally, the year of sale is converted to string values.</li>
</ol>

In [17]:
def preProcessor(data):
    """Drop sparse columns, impute missing values and one-hot encode categoricals.

    The function reads three notebook-level names defined in earlier cells:
    ``null_value_majority_columns`` (columns to remove),
    ``categorical_data_attributes`` (columns to one-hot encode) and
    ``num_cols`` (numerical columns to impute).

    Steps, in order:
      1. Remove every column listed in ``null_value_majority_columns``.
      2. Fill missing categorical values with the column mode, then replace
         each categorical column by its one-hot columns (appended at the end,
         named ``<column>_<value>``).
      3. Fill missing numerical values: mode for 'BsmtFullBath', 'GarageCars'
         and 'BsmtHalfBath', median for the others.
      4. Fill missing 'GarageArea' with the median of rows sharing the same
         'GarageCars' value.
      5. Fill missing 'GarageYrBlt' with the row's 'YearBuilt'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.
    """
    # Copy first so none of the assignments below leak into the caller's frame.
    sparse_cols = [col for col in null_value_majority_columns if col in data.columns]
    data = data.drop(columns=sparse_cols).copy()

    # Categorical columns that are actually present, in list order, without repeats.
    present_categoricals = []
    for col in categorical_data_attributes:
        if col in data.columns and col not in present_categoricals:
            present_categoricals.append(col)

    for col in present_categoricals:
        if data[col].isnull().any():
            modes = data[col].mode()
            # mode() is empty when the whole column is missing; nothing to fill with then.
            if not modes.empty:
                data[col] = data[col].fillna(modes[0])

    # One call encodes all columns at once instead of one concat per column.
    if present_categoricals:
        data = pd.get_dummies(data, columns=present_categoricals, dummy_na=False)

    mode_filled = ('BsmtFullBath', 'GarageCars', 'BsmtHalfBath')
    for col in num_cols:
        # 'GarageArea' is handled separately below, once 'GarageCars' is complete.
        if col not in data.columns or col == 'GarageArea':
            continue
        if not data[col].isnull().any():
            continue
        fill_value = data[col].mode()[0] if col in mode_filled else data[col].median()
        data[col] = data[col].fillna(fill_value)

    if 'GarageArea' in data.columns and data['GarageArea'].isnull().any():
        data['GarageArea'] = (
            data.groupby('GarageCars')['GarageArea']
            .transform(lambda areas: areas.fillna(areas.median()))
        )

    if 'GarageYrBlt' in data.columns and data['GarageYrBlt'].isnull().any():
        data['GarageYrBlt'] = data['GarageYrBlt'].fillna(data['YearBuilt'])

    return data

# Stack train on top of test so both go through the same preprocessing call
# and end up with the same columns; they are split again by position below.
total_data = pd.concat([train_processed, test_processed]).reset_index(drop=True)
total_data = preProcessor(total_data)

# Ages relative to the year of sale, computed with column arithmetic rather
# than a row-wise apply. The float cast keeps the dtype shown in the displayed
# tables (e.g. 5.0).
total_data['Age_of_house'] = (total_data['YrSold'] - total_data['YearBuilt']).astype(float)
total_data['Remodel_age'] = (total_data['YrSold'] - total_data['YearRemodAdd']).astype(float)
total_data['Garage_age'] = (total_data['YrSold'] - total_data['GarageYrBlt']).astype(float)
total_data = total_data.drop(columns=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'])

# NOTE(review): YrSold becomes a string column here but is not one-hot encoded
# (the encoding step has already run) - confirm this is the intended treatment.
total_data['YrSold'] = total_data['YrSold'].astype(str)

# The first len(y_train) rows are the training rows, the remainder the test rows.
train_processed_fin = total_data.iloc[:y_train.shape[0], :]
test_processed_fin = total_data.iloc[y_train.shape[0]:, :]
print("Transformed train data:")
display(train_processed_fin.head(10))
print("\nTransformed test data:")
display(test_processed_fin.head(10))

Transformed train data:


Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,OverallCond_5,OverallCond_6,OverallCond_7,OverallCond_8,OverallCond_9,CentralAir_N,CentralAir_Y,Age_of_house,Remodel_age,Garage_age
0,65.0,8450,196.0,706.0,0.0,150.0,856.0,856,854,0,...,1,0,0,0,0,0,1,5.0,5.0,5.0
1,80.0,9600,0.0,978.0,0.0,284.0,1262.0,1262,0,0,...,0,0,0,1,0,0,1,31.0,31.0,31.0
2,68.0,11250,162.0,486.0,0.0,434.0,920.0,920,866,0,...,1,0,0,0,0,0,1,7.0,6.0,7.0
3,60.0,9550,0.0,216.0,0.0,540.0,756.0,961,756,0,...,1,0,0,0,0,0,1,91.0,36.0,8.0
4,84.0,14260,350.0,655.0,0.0,490.0,1145.0,1145,1053,0,...,1,0,0,0,0,0,1,8.0,8.0,8.0
5,85.0,14115,0.0,732.0,0.0,64.0,796.0,796,566,0,...,1,0,0,0,0,0,1,16.0,14.0,16.0
6,75.0,10084,186.0,1369.0,0.0,317.0,1686.0,1694,0,0,...,1,0,0,0,0,0,1,3.0,2.0,3.0
7,68.0,10382,240.0,859.0,32.0,216.0,1107.0,1107,983,0,...,0,1,0,0,0,0,1,36.0,36.0,36.0
8,51.0,6120,0.0,0.0,0.0,952.0,952.0,1022,752,0,...,1,0,0,0,0,0,1,77.0,58.0,77.0
9,50.0,7420,0.0,851.0,0.0,140.0,991.0,1077,0,0,...,0,1,0,0,0,0,1,69.0,58.0,69.0



Transformed test data:


Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,OverallCond_5,OverallCond_6,OverallCond_7,OverallCond_8,OverallCond_9,CentralAir_N,CentralAir_Y,Age_of_house,Remodel_age,Garage_age
1460,80.0,11622,0.0,468.0,144.0,270.0,882.0,896,0,0,...,0,1,0,0,0,0,1,49.0,49.0,49.0
1461,81.0,14267,108.0,923.0,0.0,406.0,1329.0,1329,0,0,...,0,1,0,0,0,0,1,52.0,52.0,52.0
1462,74.0,13830,0.0,791.0,0.0,137.0,928.0,928,701,0,...,1,0,0,0,0,0,1,13.0,12.0,13.0
1463,78.0,9978,20.0,602.0,0.0,324.0,926.0,926,678,0,...,0,1,0,0,0,0,1,12.0,12.0,12.0
1464,43.0,5005,0.0,263.0,0.0,1017.0,1280.0,1280,0,0,...,1,0,0,0,0,0,1,18.0,18.0,18.0
1465,75.0,10000,0.0,0.0,0.0,763.0,763.0,763,892,0,...,1,0,0,0,0,0,1,17.0,16.0,17.0
1466,68.0,7980,0.0,935.0,0.0,233.0,1168.0,1187,0,0,...,0,0,1,0,0,0,1,18.0,3.0,18.0
1467,63.0,8402,0.0,0.0,0.0,789.0,789.0,789,676,0,...,1,0,0,0,0,0,1,12.0,12.0,12.0
1468,85.0,10176,0.0,637.0,0.0,663.0,1300.0,1341,0,0,...,1,0,0,0,0,0,1,20.0,20.0,20.0
1469,70.0,8400,0.0,804.0,78.0,0.0,882.0,882,0,0,...,1,0,0,0,0,0,1,40.0,40.0,40.0


Checking if the final transformed data has any null/missing values.

In [18]:
# Count the missing values left in each column of both final frames. The
# result gets its own name so the raw `test` DataFrame loaded at the top of
# the notebook is not overwritten.
remaining_nulls = pd.concat([train_processed_fin, test_processed_fin]).isnull().sum()
remaining_nulls[remaining_nulls > 0]

Series([], dtype: int64)

It can be observed that all missing values have been filled.

<b>Regressor model generation, evaluation and prediction</b>

<u>SVM with RBF kernel and gamma selected automatically</u>

In [31]:
from sklearn.svm import SVC
# NOTE(review): SVC is a classifier, so each distinct SalePrice value is
# handled as a separate class label, and the figure printed below is the share
# of training rows whose price is reproduced exactly. A regressor
# (sklearn.svm.SVR) may have been intended - confirm before relying on this score.
svregressor = SVC(kernel='rbf', gamma='auto').fit(train_processed_fin, y_train)
y_pred = svregressor.predict(train_processed_fin)
print(y_pred)
print("SVM with RBF kernel accuracy:", (y_train == y_pred).mean())

[208500 181500 223500 ... 266500 142125 147500]
SVM with RBF kernel accuracy: 1.0


In [32]:
#Copy of the test data
# Re-read the raw test file: its 'Id' column (dropped from the processed test
# frame) is needed to label the rows of each submission file.
test2 = pd.read_csv(os.path.join('Dataset', 'test.csv'))

In [35]:
# Predict with `svregressor`, the SVM fitted in the cell above. The name
# `svclassifier` is not defined anywhere in the notebook and would raise a
# NameError on a fresh kernel.
pred = svregressor.predict(test_processed_fin)

# Two-column submission: the original test Ids next to the predicted prices.
submission_prediction = pd.DataFrame({'Id': test2['Id'], 'SalePrice': pred})

submission_prediction.to_csv('submission.csv', index=False)

<u>Kaggle score:</u> 0.44840

<u>Random forest regressor</u>

In [29]:
from sklearn.ensemble import RandomForestRegressor
# fit() returns the fitted estimator, so construction and training are chained.
clf = RandomForestRegressor(n_estimators=1000, random_state=42).fit(train_processed_fin, y_train)
y_pred = clf.predict(train_processed_fin)
# The error is measured on the same rows the forest was trained on.
print("Random forest classifier MSE:", mean_squared_error(y_train, y_pred))

Random forest classifier MSE: 121194007.07529189


In [37]:
# Predict on the processed test rows and write them next to the original Ids.
y_pred = clf.predict(test_processed_fin)

submission_prediction = pd.DataFrame({'Id': test2['Id'], 'SalePrice': y_pred})

submission_prediction.to_csv('submission.csv', index=False)

<u>Kaggle score:</u> 0.15896

<u>Gradient boosting</u>

In [38]:
# Baseline gradient boosting model; the value displayed is the mean squared
# error on the training rows.
gb = GradientBoostingRegressor(
    n_estimators=50, learning_rate=1, max_depth=5, random_state=0
).fit(train_processed_fin, y_train)
y_pred = gb.predict(train_processed_fin)
mean_squared_error(y_train, y_pred)

7724582.008148395

In [39]:
# Predict on the processed test rows and write them next to the original Ids.
y_pred = gb.predict(test_processed_fin)

submission_prediction = pd.DataFrame({'Id': test2['Id'], 'SalePrice': y_pred})

submission_prediction.to_csv('submission.csv', index=False)

<u>Kaggle score:</u> 0.62641

<u>Applying GridSearchCV to tune the hyperparameters for Gradient Boosting</u>

In [40]:
from sklearn.model_selection import GridSearchCV

# Search grid: 5 learning rates x 6 depths x 1 ensemble size = 30 candidates,
# each evaluated with 10-fold cross-validation.
parameters = {
    "learning_rate": [0.05, 0.1, 0.2, 0.5, 1],
    "max_depth": [3, 5, 8, 10, 12, 15],
    "n_estimators": [50],
}

# n_jobs=-1 lets the search use all available cores; fit() returns the fitted search object.
clf = GridSearchCV(gb, parameters, cv=10, n_jobs=-1).fit(train_processed_fin, y_train)
y_pred = clf.predict(train_processed_fin)
print("Best parameter values after grid search:", clf.best_params_)

Best parameter values after grid search: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}


<u>Applying the identified parameter values to Gradient Boosting</u>

In [41]:
# Refit gradient boosting with the parameter values reported by the grid
# search; the value displayed is the mean squared error on the training rows.
gb = GradientBoostingRegressor(
    n_estimators=50, learning_rate=0.1, max_depth=5, random_state=0
).fit(train_processed_fin, y_train)
y_pred = gb.predict(train_processed_fin)
mean_squared_error(y_train, y_pred)

118855569.22669393

In [42]:
# Predict on the processed test rows and write them next to the original Ids.
y_pred = gb.predict(test_processed_fin)

submission_prediction = pd.DataFrame({'Id': test2['Id'], 'SalePrice': y_pred})

submission_prediction.to_csv('submission.csv', index=False)

<u>Kaggle score:</u> 0.15225

<u>Lasso Regression</u>

In [43]:
from sklearn.linear_model import Lasso
# max_iter is an iteration count, so it is passed as an integer
# (100000 == 1e5); scikit-learn releases that validate parameter types
# reject the float form.
# NOTE(review): the `normalize` argument is not accepted by recent
# scikit-learn releases - confirm the version this notebook targets.
ls = Lasso(alpha=1, normalize=True, max_iter=100000)
ls.fit(train_processed_fin, y_train)
y_pred = ls.predict(train_processed_fin)
# Mean squared error on the training rows.
mean_squared_error(y_train, y_pred)

397620494.3062208

In [44]:
# Predict on the processed test rows and write them next to the original Ids.
y_pred = ls.predict(test_processed_fin)

submission_prediction = pd.DataFrame({'Id': test2['Id'], 'SalePrice': y_pred})

submission_prediction.to_csv('submission.csv', index=False)

<u>Kaggle score:</u> 0.18872

<u>Basic Keras model</u>

In [45]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

# Stop training once the monitored quantity (left at the Keras default) has
# not improved for 3 consecutive epochs.
early_stopping_monitor = EarlyStopping(patience=3)

# Fully connected network: 1024 -> 512 -> 128 hidden units with ReLU, followed
# by a single linear output unit for the predicted price.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1024, input_dim=train_processed_fin.shape[1], activation=tf.nn.relu, kernel_initializer='normal'))
model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation='linear'))

# NOTE(review): 'accuracy' is kept so the evaluate() output keeps its four
# entries, but it reports 0.0 for this continuous target and carries no information.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse','mae','accuracy'])

# NOTE(review): only NumPy's random generator is seeded here; TensorFlow's own
# seed is presumably left unset, so runs may differ - confirm.
seed = 7
np.random.seed(seed)

# `.values` yields the underlying NumPy array. It replaces `.as_matrix()`,
# which has been removed from pandas.
train_features = train_processed_fin.values
train_targets = y_train.values

model.fit(train_features,
          train_targets,
          batch_size=32,  # small batches keep the run feasible on a local machine
          epochs=30,  # upper bound; early stopping may end training sooner
          validation_split=0.2,
          callbacks=[early_stopping_monitor])

# Returns [loss, mse, mae, accuracy] measured on the full training set.
model.evaluate(train_features, train_targets)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 1168 samples, validate on 292 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


[1819958021.6109588, 1819958000.0, 25618.37, 0.0]

<u>Note:</u> This model is not utilized for submission since the MSE is very high.

<b>Final results:</b>
<br>
The best performing model is the Gradient Boosting model (with hyperparameter tuning), with a Kaggle score of <b>0.15225</b>.