In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split of the House Prices data and preview its first rows.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
train.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the SalePrice column.
train['SalePrice'].describe()

In [None]:
# Load the test split of the House Prices data and preview its first rows.
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test.head()

In [None]:
# data_description.txt is free-form text, not a CSV. The multi-character
# separator is interpreted by pandas as a regular expression; it is not expected
# to occur in the file, so every line is read into a single column.
#  - engine='python' is stated explicitly because regex separators are only
#    supported by the python engine (otherwise pandas falls back with a
#    ParserWarning).
#  - header=None keeps the first line of the text as data instead of silently
#    turning it into the column name.
description = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt',
    sep = 'delimiter',
    header = None,
    names = ['description'],
    engine = 'python',
)
description.head(20)

We can see here that the MSSubClass numbers are in fact category codes rather than quantities. When cleaning up NAs, we can't use the mean or median for this particular column.

# Data Cleaning

We can see from a quick glance at our datasets that they contain a lot of null values. We need to clean these up to perform better analysis.

Let's take a look at how many null values each column contains.

In [None]:
# Number of missing values per column, shown for the first 40 columns of train.
train.isnull().sum().head(40)

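Some of the columns where NAs occur are not purely numeric, so they can't be filled using the median; these categorical columns will be labelled 'Unknown' instead. In some cases (such as GarageYrBlt, which is a year) the column is neither a plain measurement nor categorical. I will assign these NAs as zero.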

In [None]:
# Impute missing values in the training data.
#
# The fill values are collected in one mapping and applied with a single
# DataFrame.fillna call whose result is assigned back to `train`. Calling
# `train[col].fillna(..., inplace=True)` acts on a temporary column object:
# recent pandas versions warn about it, and with Copy-on-Write enabled it
# leaves `train` unchanged.
train_unknown_columns = [
    'PoolQC', 'Alley', 'FireplaceQu', 'MasVnrType', 'Electrical',
    'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure', 'BsmtQual', 'BsmtCond',
    'Fence', 'MiscFeature', 'GarageCond', 'GarageQual', 'GarageFinish',
    'GarageType',
]

train_fill_values = {
    # Numeric measurements: use the column median.
    'LotFrontage': train['LotFrontage'].median(),
    'MasVnrArea': train['MasVnrArea'].median(),
    # A year, not a measurement: missing values become 0.
    'GarageYrBlt': 0,
}
# Categorical columns: missing values get their own 'Unknown' label.
train_fill_values.update({column: 'Unknown' for column in train_unknown_columns})

train = train.fillna(value = train_fill_values)

In [None]:
# Re-count missing values per column after the imputation cell above.
train.isnull().sum()

Let's check if any NA values exist in the test dataset as well.

In [None]:
# Number of missing values per column, shown for the first 40 columns of test.
test.isnull().sum().head(40)

A lot of missing data occurs here as well. We will use the same cleaning parameters as the train dataset to maintain uniformity.

In [None]:
# Impute missing values in the test data, following the same rules as for train.
#
# The fill values are collected in one mapping and applied with a single
# DataFrame.fillna call whose result is assigned back to `test`. Calling
# `test[col].fillna(..., inplace=True)` acts on a temporary column object:
# recent pandas versions warn about it, and with Copy-on-Write enabled it
# leaves `test` unchanged.
test_median_columns = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
    'GarageCars', 'GarageArea',
]
test_zero_columns = ['GarageYrBlt', 'BsmtFullBath', 'BsmtHalfBath']
test_unknown_columns = [
    # MSZoning holds zone labels (strings); it used to be filled with the
    # integer 0, which mixed types in a categorical column.
    'MSZoning',
    'PoolQC', 'Alley', 'FireplaceQu', 'MasVnrType', 'Electrical',
    'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure', 'BsmtQual', 'BsmtCond',
    'Fence', 'MiscFeature', 'GarageCond', 'GarageQual', 'GarageFinish',
    'GarageType', 'SaleType', 'Utilities', 'Exterior1st', 'Exterior2nd',
    'KitchenQual', 'Functional',
]

test_fill_values = {column: test[column].median() for column in test_median_columns}
test_fill_values.update({column: 0 for column in test_zero_columns})
test_fill_values.update({column: 'Unknown' for column in test_unknown_columns})

test = test.fillna(value = test_fill_values)

In [None]:
# Missing-value counts for the last 40 columns of test, to check the imputation above.
test.isnull().sum().tail(40)

# Visualization

We need to visualize data to help us make inferences and understand the data better. Let's make a few.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar plot of SalePrice grouped by street type, labelled through the returned Axes.
street_ax = sns.barplot(data = train, x = 'Street', y = 'SalePrice')
street_ax.set_xlabel('Street Type')
street_ax.set_ylabel('Sale Price')
plt.show()

We can see that in streets where pavement was used in place of gravel, houses generally sell for more.

In [None]:
# Bar plot of SalePrice grouped by the month of sale, labelled through the returned Axes.
month_ax = sns.barplot(data = train, x = 'MoSold', y = 'SalePrice')
month_ax.set_xlabel('Month Sold')
month_ax.set_ylabel('Sale Price')
plt.show()

The averages here are pretty much the same, with quite a lot of outliers in almost every month. Let's check the correlation between the month sold and the sale price.

In [None]:
# Pairwise correlation matrix between month sold and sale price.
train[['MoSold', 'SalePrice']].corr()

There's a very weak correlation between month sold and sale price. Let's look for better indicators.

In [None]:
# Bar plot of SalePrice for each MSSubClass code.
sns.barplot(x = 'MSSubClass', y = 'SalePrice', data = train)
plt.show()

Using the key in the description text shown earlier, we can see that on average, two-storey houses that were built after 1945 sell for more than any other type of house.

Let's check if the type of zone has any effect as well.

In [None]:
# Bar plot of SalePrice for each MSZoning category.
sns.barplot(x = 'MSZoning', y = 'SalePrice', data = train)
plt.show()

FV Zones sell for the most, while C sells for the least.

In [None]:
# Pairwise correlation matrix between lot frontage and sale price.
train[['LotFrontage', 'SalePrice']].corr()

There is a moderate positive relationship between lot frontage and sale price. Let's check for better price indicators.

In [None]:
# Pairwise correlation matrix between garage construction year and sale price.
# NOTE(review): missing GarageYrBlt values were filled with 0 in the cleaning
# step, which may distort this correlation — confirm before relying on it.
train[['GarageYrBlt', 'SalePrice']].corr()

The year a garage was built seems to be an even better price indicator than the lot frontage.

Let's make a correlation table of the best indicators on selling price.

In [None]:
# Correlation matrix of the candidate predictor columns together with SalePrice.
train[['MasVnrArea','OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF'
       , 'GrLivArea', 'FullBath', 'SalePrice']].corr()

The best price indicator is the Overall Quality of the house, followed by the Ground Living Area. Let's visualize these.

In [None]:
# Scatter plot with fitted regression line: overall quality vs. sale price.
quality_ax = sns.regplot(data = train, x = 'OverallQual', y = 'SalePrice')
quality_ax.set_xlabel('Overall House Quality')
quality_ax.set_ylabel('Selling Price')
quality_ax.set_title('Linear Regression of House Quality')
plt.show()

A very strong positive correlation is seen.

In [None]:
# Scatter plot with fitted regression line: GrLivArea vs. sale price.
area_ax = sns.regplot(data = train, x = 'GrLivArea', y = 'SalePrice')
area_ax.set_xlabel('Ground Living Area')
area_ax.set_ylabel('Selling Price')
area_ax.set_title('Linear Regression of Living Area')
plt.show()

# Model Evaluation

Let's select the 8 features that correlated with selling price to build our model around.

In [None]:
# Feature matrix for modelling: the eight selected predictor columns of train.
x_data = train[['OverallQual', 'GrLivArea', 'MasVnrArea', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'FullBath']]
x_data.head()

In [None]:
# (rows, columns) of the training feature matrix.
x_data.shape

In [None]:
# Same eight columns, in the same order as x_data, taken from the test split.
test_data = test[['OverallQual', 'GrLivArea', 'MasVnrArea', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'FullBath']]

# Linear Regression

Let's use linear regression to analyse our training data.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Target column, then a reproducible 70/30 hold-out split of the training data.
y_data = train['SalePrice']
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size = 0.3,
    random_state = 42,
)

# Fit ordinary least squares on the 70% part; the cell displays R-squared on the hold-out part.
lre = LinearRegression()
lr_model = lre.fit(x_train, y_train)
lre.score(x_test, y_test)

We get an R-squared value of 0.8083, which is pretty good. Let's compute the error metrics on both the training and hold-out splits to make sure there was no underfitting/overfitting on predictions.

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Predictions of the fitted linear model on both parts of the split.
pred_train = lre.predict(x_train)
pred_test = lre.predict(x_test)

# Print RMSE then R-squared, first for the training part, then for the hold-out part.
for actual, predicted in ((y_train, pred_train), (y_test, pred_test)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

Our Linear Regression model shows R-squared values of 74.6% and 80.8% on the train and test data respectively, which is pretty good. Let's calculate the normalized RMSE to give us a better idea of what we're seeing.

In [None]:
# Hold-out RMSE divided by the observed range of SalePrice in train.
sale_price = train['SalePrice']
RMSE_test = np.sqrt(mean_squared_error(y_test, pred_test))
min_SalePrice = sale_price.min()
max_SalePrice = sale_price.max()
price_range = max_SalePrice - min_SalePrice
norm_RMSE_test = RMSE_test / price_range
print(norm_RMSE_test)

Our normalized RMSE value on the testing set is 0.05. The closer the normalized RMSE is to 0, the better the model fits the data.

# Distribution Plots

Let's visualize how our linear regression model works on the test data against the training data.

In [None]:
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the kernel density estimates of two samples on one figure.

    Parameters
    ----------
    RedFunction : array-like
        Values drawn as the red curve.
    BlueFunction : array-like
        Values drawn as the blue curve.
    RedName, BlueName : str
        Legend labels for the red and blue curves respectively.
    Title : str
        Figure title.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    # sns.distplot is deprecated; kdeplot draws the same density curve that
    # distplot(hist=False) used to draw.
    ax = sns.kdeplot(RedFunction, color="r", label=RedName)
    sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax)

    ax.set_title(Title)
    ax.set_xlabel('Price (in dollars)')
    # The y-axis of a KDE plot is a probability density, not a quality score.
    ax.set_ylabel('Density')
    # Build the legend from the labels passed by the caller instead of
    # overriding them with fixed text.
    ax.legend()

    plt.show()

In [None]:
# Linear-model predictions on the training and hold-out parts of the split.
yhat_train = lre.predict(x_train)
yhat_test = lre.predict(x_test)

In [None]:
# Compare the distribution of actual training prices with the model's predictions for the same rows.
Title = 'Distribution  Plot of  Predicted Value Using Training Data vs Training Data Distribution'
DistributionPlot(y_train, yhat_train, "Actual Values (Train)", "Predicted Values (Train)", Title)

In [None]:
# Predict on the competition test features and compare the distribution of
# those predictions with the actual prices of the training part (y_train).
yhat1 = lre.predict(test_data)
Title = 'Distribution  Plot of  Predicted Value Using Training Data vs Test Data Distribution'
DistributionPlot(y_train, yhat1, "Actual Values (Train)", "Predicted Values (Test)", Title)

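The distribution of our linear regression model's predicted sale prices on the test data closely follows the distribution of actual sale prices in the training data, which suggests the model did a pretty good job.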

In [None]:
# Keep the test-set Ids; they are written to the submission file later.
test_id = test['Id']

# Data Joining

We are joining the data for easier manipulation here.

In [None]:
# Stack the training features on top of the test features.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same combined frame with a fresh 0..N-1 index, so the separate
# reset_index / drop('index') steps are no longer needed.
df = pd.concat([x_data, test_data], ignore_index = True)
num_columns = df.select_dtypes(include=np.number).columns

We will drop some outlying values in our dataset.

In [None]:
# Training rows occupy positions 0 .. len(x_data) - 1 of the combined frame;
# only those rows may be dropped, because every test row needs a prediction.
n_train_rows = len(x_data)
drop_id = df.index[df['GrLivArea'] > 4000]
drop_id = drop_id[drop_id < n_train_rows]

# Replace extreme MasVnrArea values with the column mean. A single .loc
# assignment is used because chained indexing (df[col][rows] = ...) may write
# to a temporary copy instead of df.
df.loc[df['MasVnrArea'] > 1500, 'MasVnrArea'] = df['MasVnrArea'].mean()

df = df.drop(drop_id)
# reset_index() keeps the old labels in an 'index' column, which the
# train/test split cell later drops.
df = df.reset_index()

In [None]:
# (rows, columns) of the combined frame after dropping the outlying training rows.
df.shape

In [None]:
# Target values with the same outlying training rows removed as in the feature frame.
y_data1 = train['SalePrice'].drop(drop_id)

Remember that in our combined dataframe the training rows come first and the last 1459 rows are the test set. In order to avoid assigning testing values into our training data, we need to specify the positions.

In [None]:
from sklearn.preprocessing import StandardScaler

# The test rows are the last len(test_data) rows of the combined frame. Deriving
# the split position from the data avoids a hard-coded row count.
n_test_rows = len(test_data)
split_at = len(df) - n_test_rows

# Drop the helper 'index' column left by reset_index(), then take explicit
# copies so the scaled values are written to independent frames.
features = df.drop(['index'], axis=1)
X_train = features.iloc[:split_at].copy()
X_test = features.iloc[split_at:].copy()

# Fit the scaler on the training rows only and reuse its statistics for test.
scaler = StandardScaler()
X_train[num_columns] = scaler.fit_transform(X_train[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])

X_train.shape, X_test.shape

For the remaining regression models we will fit the logarithm of the target column rather than the raw prices, and convert the predictions back with the exponential at the end. So let's convert our Sale Price to logarithm.

In [None]:
# Natural logarithm of the outlier-filtered sale prices; used as the target for the later models.
y_data_log = np.log(y_data1)

In [None]:
# Length of the log-target.
y_data_log.shape

# XGBoost

We are using XGBoost to determine the most important features in our dataset.

In [None]:
from xgboost import XGBRegressor

# Fit a default XGBoost regressor on the scaled training features.
xgb = XGBRegressor()
xgb.fit(X_train, y_data1)

# One row per feature, ordered from most to least important.
imp_feature = (
    pd.DataFrame({'Importance': xgb.feature_importances_}, index = X_train.columns)
    .sort_values('Importance', ascending = False)
)

print(imp_feature)

We can see that Overall Quality is the most important feature in determining sale price of a house. Other variables have a negligible influence on the eventual price.

In [None]:
# (number of features, 1): one importance value per feature.
imp_feature.shape

In [None]:
from sklearn.kernel_ridge import KernelRidge

# For every prefix of the importance ranking (top 1 feature, top 2, ..., all
# features) fit a polynomial kernel ridge model on the log-price and record its
# RMSE. The upper bound is derived from imp_feature so that the full feature
# set is evaluated too; a fixed range(1, 8) stopped one short of 8 features.
# NOTE(review): the RMSE is measured on the same rows the model was fitted on,
# so it tends to favour larger feature sets — consider a validation split.
answer = {}
for i in range(1, len(imp_feature) + 1):
    imp_column = imp_feature.iloc[:i].index
    ridge = KernelRidge(alpha = 0.5, coef0 = 3, degree = 2, kernel ='polynomial')
    ridge = ridge.fit(X_train[imp_column], y_data_log)
    answer[i] = np.sqrt(mean_squared_error(y_data_log, ridge.predict(X_train[imp_column])))

In [None]:
minimum = answer[1]
ind_min = 1
for ind in range(1,len(answer.values())):
    if answer[ind] < minimum:
        minimum = answer[ind]
        ind_min = ind

In [None]:
imp_column = imp_feature.iloc[:ind_min+1].index

In [None]:
# Final polynomial kernel ridge model, fitted on the selected features and the log-price.
model = KernelRidge(kernel = 'polynomial', degree = 2, coef0 = 3.5, alpha = 0.5)
model.fit(X_train[imp_column], y_data_log)

# RMSE on the rows the model was fitted on.
train_rmse = np.sqrt(mean_squared_error(y_data_log, model.predict(X_train[imp_column])))
print("RMSE of the whole training set: {}".format(train_rmse))

# Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression on the same selected features and log-price target.
model_lasso = Lasso(alpha=0.5)
model_lasso.fit(X_train[imp_column], y_data_log)

# RMSE on the rows the model was fitted on.
pred_train_lasso = model_lasso.predict(X_train[imp_column])
lasso_rmse = np.sqrt(mean_squared_error(y_data_log, pred_train_lasso))
print(lasso_rmse)


We have an RMSE of 0.3959, which is worse than the Ridge Regression.

In [None]:
# The kernel ridge model was fitted on log-prices, so exponentiate its output to get prices.
prediction = np.exp(model.predict(X_test[imp_column]))

In [None]:
# Pair each test Id with its predicted price and write the result to submission.csv without the index column.
output = pd.DataFrame({'Id': test_id, 'SalePrice': prediction})
output.to_csv('submission.csv', index=False)

In [None]:
# Confirmation message; it is printed unconditionally and does not check the file.
print("Your submission was successfully saved!")