Data:
- Transaction date (purchase)
- House age
- Distance to the nearest MRT station (metric not defined)
- Number of convenience stores
- Location (latitude and longitude)
- House price of unit area 





In [None]:
conda install -c conda-forge hdbscan

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import numpy as np
from scipy.stats import shapiro     #normality test
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from scipy.spatial import distance
from sklearn.decomposition import PCA
from yellowbrick.regressor import ResidualsPlot 
from mpl_toolkits.mplot3d import Axes3D   #plot 3D
from hdbscan import HDBSCAN
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm
import statsmodels.stats.api as sms
from sklearn import metrics

# Pre-processing

- Reading dataset 

In [None]:
# reading dataset
data = pd.read_csv('../input/real-estate-price-prediction/Real estate.csv')

# DataFrame.info() prints its own report and returns None, so it is called on
# its own line; wrapping it in print() would emit a stray "None".
data.info()
print('\n___________duplicated()___________\n', data.duplicated().any(),
      '\n___________isnull()___________\n', data.isnull().sum()
      )
data.head(3)

Dropping non useful columns

In [None]:
# Remove the columns that are not used in the rest of the analysis.
columns_to_drop = ['X1 transaction date', 'No']
data = data.drop(columns=columns_to_drop)
data.head(3)

# Exploratory Data Analysis

Functions

In [None]:
#normality Shapiro-Wilk test function
def normality_test(data, alpha=0.05):
  """Run a Shapiro-Wilk normality test on `data` and print the verdict.

  Parameters
  ----------
  data : array-like
    Sample handed directly to scipy.stats.shapiro.
  alpha : float, default 0.05
    Significance level the p-value is compared against.

  Returns
  -------
  None. Prints 'Normality test: Gaussian' when p_value > alpha, otherwise
  'Normality test: Non Gaussian'.
  """
  _, p_value = shapiro(data)    #Shapiro-Wilk test (the statistic itself is not used)

  if p_value > alpha:
    print('Normality test: Gaussian')  #fail to reject H0 (null hypothesis H0: follows a normal distribution)
  else:
    print('Normality test: Non Gaussian') #reject H0 (alternative hypothesis H1: does not follow a normal distribution)


#data plot (2D and 3D)
def data_scatter(data_, pca, n_dim, ax, color):
  """Scatter the first two or three columns of `data_` with labelled axes.

  Parameters
  ----------
  data_ : 2-D array indexed as data_[:, k]
    Points to draw; columns 0-1 are used for 2D, columns 0-2 otherwise.
  pca : object exposing `explained_variance_ratio_`
    Its cumulative sum is what is printed in each axis label.
  n_dim : int
    2 draws through pyplot on the current axes; any other value draws on `ax`.
  ax : axes object
    Only used when n_dim != 2; must provide scatter/set_xlabel/set_ylabel/set_zlabel.
  color : colour or sequence of colours forwarded to scatter.
  """
  def axis_label(k):
    #percentage shown is the CUMULATIVE explained variance up to component k
    return 'Dimension %d (%.f %%)' % (k + 1, round(pca.explained_variance_ratio_.cumsum()[k], 2)*100)

  if n_dim == 2:
    plt.scatter(data_[:,0], data_[:,1], color=color)    #plot 2D
    plt.xlabel(axis_label(0)) #first principal component
    plt.ylabel(axis_label(1)) #second principal component
  else:
    ax.scatter(data_[:,0], data_[:,1], data_[:,2], color=color)  #plot 3D
    #label the axes that was passed in rather than whatever pyplot considers current
    ax.set_xlabel(axis_label(0)) #first principal component
    ax.set_ylabel(axis_label(1)) #second principal component
    ax.set_zlabel(axis_label(2)) #third principal component

Box-plot

In [None]:
plt.figure(figsize=(13,5))

# One boxplot per column, placed on a 2x3 grid (positions 1..6).
for position, feat in enumerate(data.columns[:6], start=1):
  plt.subplot(2, 3, position)
  sns.boxplot(y=data[feat], color='grey')
  plt.ylabel('Value')
  plt.title('Boxplot\n%s'%feat)
plt.tight_layout()

Dropping outliers
- Outlier defined by HDBSCAN clustering

In [None]:
#HDBSCAN clustering
hdb = HDBSCAN(min_cluster_size=2).fit(data)
hdb_pred = hdb.labels_

#defining outlier (index and color): rows labelled -1 are treated as outliers
is_outlier = hdb_pred == -1
index_outlier = np.flatnonzero(is_outlier).tolist()           #positions of data defined as outlier
col_cl = np.where(is_outlier, 'firebrick', 'grey').tolist()   #outliers in firebrick, everything else in grey


#data visualization
#PCA: dimension reduction
pca = PCA()
data_ = pca.fit_transform(data)

#2D data plot (data_scatter ignores its `ax` argument when n_dim == 2)
fig_2d = plt.figure(figsize=(4, 3))
data_scatter(data_, pca, 2, fig_2d, col_cl)

#3D data plot
fig = plt.figure(figsize=(4, 3))
#add_axes attaches the 3D axes to the figure and makes it current; a bare
#Axes3D(fig, ...) is not added to the figure on recent matplotlib versions
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=20, azim=134)
data_scatter(data_, pca, 3, ax, col_cl)

In [None]:
#visualizing data (with and without outliers)
plt.figure(figsize=(15, 3))

price_col = 'Y house price of unit area'
y_top = np.max(data[price_col])+10    #shared upper y-limit, taken from the unfiltered data

#left panel: original series with the outliers highlighted
plt.subplot(1, 3, 1)
plt.plot(data[price_col], color='grey')
plt.ylim(top=y_top)
plt.scatter(index_outlier, data.loc[index_outlier, price_col], color='firebrick')

#data with the outlier rows removed
new_data = data.drop(index=index_outlier)

#middle panel: filtered series on the same y-scale
plt.subplot(1, 3, 2)
plt.plot(new_data[price_col], color='grey')
plt.ylim(top=y_top)

#right panel: boxplot of the filtered house prices
plt.subplot(1, 3, 3)
sns.boxplot(y=new_data[price_col], color='grey')

plt.tight_layout()

In [None]:
# drop=True discards the old row labels; without it reset_index() inserts them
# as an extra 'index' column that would be treated as a feature downstream.
data = new_data.reset_index(drop=True)
data.describe()

Normality test

In [None]:
print('Assumption of normality')
# Test each column separately: handing the whole DataFrame to shapiro would
# test all columns flattened into a single sample.
for feat in data:
  print(feat, end=': ')
  normality_test(data[feat])

Correlation matrix

In [None]:
#correlation matrices; the mask hides the upper triangle (diagonal included)
mask = np.triu(np.ones_like(data.corr()))

plt.figure(figsize=(15, 7))

#left panel: Pearson, right panel: Spearman
#(both panels previously drew the same Spearman matrix)
for position, method in zip((121, 122), ('pearson', 'spearman')):
  plt.subplot(position)
  sns.heatmap(data.corr(method=method), annot=True, linewidths=.9, fmt= '.2f', cmap='Greys', mask=mask)
  plt.title('%s correlation' % method.capitalize())

plt.tight_layout()

Feature comparison (dependence)
- dependent and independent features

In [None]:
# set_palette changes seaborn's global palette and returns None, so the
# pairplot itself runs with its default palette argument.
sns.set_palette(['#696969'])
sns.pairplot(
    data,
    y_vars='Y house price of unit area',
    x_vars=['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude'],
);

# Linear Regression

**Prediction**
- Train-test split

In [None]:
#selecting features (columns 'X2 house age' through 'X6 longitude') and target
X = data.loc[:,'X2 house age' : 'X6 longitude']
y = data.loc[:,'Y house price of unit area']

#Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
reg = LinearRegression().fit(X_train, y_train)    #fit train set
price_predict = reg.predict(X_test)

#regression coefficients, labelled from the fitted columns so the labels
#cannot drift from the features (the hand-written list repeated X4 and omitted X5)
index_reg = ['Intercept', *X.columns]
pd.DataFrame(data=np.append(reg.intercept_, reg.coef_), index=index_reg, columns=['values']).transpose()

Predictions plot

In [None]:
#Predictions plot
def plot_regression(real, predicted, color, title):
  """Scatter `predicted` against `real` and overlay a dashed identity line.

  The identity line spans from real.min() to real.max() on both axes.
  Draws on the current pyplot axes.
  """
  lowest, highest = real.min(), real.max()
  plt.scatter(real, predicted, color=color)
  plt.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
  plt.xlabel('Real Price')
  plt.ylabel('Predicted')
  plt.title(title)


plt.figure(figsize=(10,4))

#Real price vs train-test predictions plot (without outliers)
plt.subplot(1, 3, 1)
plot_regression(y_test, price_predict, 'cornflowerblue', 'Linear Regression Predictions \nTrain-test split')
plt.show()

**Prices plot**
- Plot: real prices vs train-test split predictions
- Distance: prediction distance in general (real vs train-test split) 

In [None]:
#Predictions comparison plot: real and predicted prices drawn in test-set order
plt.figure(figsize=(15,4))

for series, line_color, line_label in (
    (np.array(y_test), 'grey', 'Real'),
    (price_predict, 'cornflowerblue', 'Train-test split'),
):
  plt.plot(series, color=line_color, label=line_label)

plt.xlabel('House')
plt.ylabel('Price')
plt.title('Predictions Comparison (same split)')
plt.legend(loc=4)
plt.show()


#Predictions distance (in general)
real_vs_split = distance.euclidean(y_test, price_predict)
print('Euclidean distance between prices')
print('- Real vs train-test: %.3f' % real_vs_split)

**Evaluation**

In [None]:
#R2 coefficient, MAE and MSE measures on the test split
r2_value = r2_score(y_test, price_predict)
mae_value = sum(abs(y_test - price_predict)) / len(y_test)    #mean of the absolute errors
mse_value = mean_squared_error(y_test, price_predict)

print('Test split evaluation \n',
      'Coefficient of determination R2: %.3f \n' % r2_value,
      'Mean Absolute Error: %.2f \n' % mae_value,
      'Mean Square Error: %.2f' % mse_value)


**Residuals analysis**

Normality

In [None]:
print('Assumption of normality in residuals')

# Residuals of the test split: observed price minus predicted price.
residuals = y_test - price_predict
normality_test(residuals)

Homoscedasticity

In [None]:
# Residuals-vs-predicted plot for a fresh LinearRegression: fitted on the train
# split, scored on the test split.
visualizer = ResidualsPlot(LinearRegression(), hist=False, train_color='darksalmon', test_color='darkseagreen')
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();   # show() replaces the deprecated poof()

Autocorrelation: Durbin-Watson test


In [None]:
# sm.OLS does not add an intercept on its own; add_constant supplies one so the
# residuals come from a model with an intercept, like the LinearRegression fit.
stat_ols = sm.OLS(y, sm.add_constant(X)).fit()

Durbin_Watson = durbin_watson(stat_ols.resid)
r = 1 - Durbin_Watson/2    #autocorrelation estimate implied by DW ~ 2*(1 - r)

# round(r) == 0 holds for |r| <= 0.5, the threshold this notebook uses.
if round(r) == 0:
  print('Without auto-correlation')
else:
  print('With auto-correlation')

print('\nDurbin_Watson:', Durbin_Watson)

Goldfeld-Quandt homoscedasticity test

In [None]:
X_constant = sm.add_constant(X)
stat_ols_const = sm.OLS(y,X_constant).fit()
resids = stat_ols_const.resid    #kept for inspection; not an input of the test below

# het_goldfeldquandt takes the endogenous variable and the design matrix and
# fits the sub-sample regressions itself, so y is passed rather than the
# residuals. Element [1] of the result is the p-value.
gq_test = sms.het_goldfeldquandt(y, X_constant)[1]

print('Goldfeld-Quandt test')
if gq_test < 0.05:
  print('Heteroscedasticity with p-value =', gq_test)
else:
  print('Homoscedasticity with p-value =', gq_test)