First, we load the data and take a look at it to get a sense of its contents. We then check whether every column is already in numerical form and whether there are any missing values, using pandas' `.dtypes` attribute and `isnull()`. We also store the transaction number (the `No` column) in a separate variable so it can be used for plotting later.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import pandas as pd
# Load the real-estate dataset from the Kaggle input bundle.
data=pd.read_csv("/kaggle/input/real-estate-price-prediction/Real estate.csv")

# Quick sanity checks: first rows, column dtypes and per-column missing-value counts.
for summary in (data.head(),data.dtypes,data.isnull().sum()):
    print(summary)

# Take the 'No' column out of the frame and keep it aside (it serves as the
# x-axis of later plots), then show the frame again without it.
no=data.pop('No')
print(data.head())

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

As we can see, there are no missing values or undesirable data formats in the data, so we can proceed to EDA by plotting the columns (features) to get a better understanding of the data. I divided the data points' prices into 3 categories: Low, Med and High, with Med being the data points whose price lies within ±1 standard deviation of the median. The threshold I chose here is fairly arbitrary, as other notebooks have shown that the data is non-Gaussian. I only use this categorization to make visualization easier.

In [None]:
import seaborn as sns #seaborn and matplotlib for visualization
import matplotlib.pyplot as plt
# Band every listing's unit price relative to the median: at least one standard
# deviation above is 'High', at least one below is 'Low', and anything strictly
# in between is 'Med'. The bands are only used to colour the plots.
price_col='Y house price of unit area'
unit_price=data[price_col]
median=unit_price.median()
std=unit_price.std()
low,high=median-std,median+std
print("\nStandard deviation:",std,"Median:",median,"Low:",low,"High:",high,"\n")

# Rows that match none of the three masks keep the 'nan' placeholder string.
data['price']='nan'
is_high=unit_price>=high
is_med=(unit_price<high)&(unit_price>low)
is_low=unit_price<=low
data.loc[is_high,'price']='High'
data.loc[is_med,'price']='Med'
data.loc[is_low,'price']='Low'
print(data.head())

# Density of the target on its own, then one density plot per feature split by band.
sns.displot(data,x=price_col,kind='kde')
plt.title('price')
feature_names=[col for col in data.columns if col not in (price_col,'price')]
for item in feature_names:
    sns.displot(data,x=item,kind='kde',hue='price')
    plt.title(item)

In [None]:
# Correlation heatmap over the numeric columns only. `data` also carries the
# string-valued 'price' band column, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 instead of silently skipping them.
sns.heatmap(data.select_dtypes(include='number').corr())

Now let's plot the properties by their latitude and longitude so we can get a sense of their layout and location.

In [None]:
# One scatter layer per price band so each band gets its own colour, marker and
# legend entry. marker=None leaves matplotlib's default marker for 'High'.
for band,colour,shape in (('High','g',None),('Med','b','^'),('Low','r','p')):
    in_band=data['price']==band
    plt.scatter(data.loc[in_band,'X6 longitude'],data.loc[in_band,'X5 latitude'],
                c=colour,label=band,marker=shape,alpha=0.5)
plt.title("Location")
plt.legend()
plt.show()

As we can see, the relationship between price and location (latitude and longitude) in this data is fairly linear, in the sense that the higher-priced properties lie at higher longitudes and latitudes; the correlation heatmap also shows a positive correlation. So we could probably still get a fairly accurate result from the raw latitude and longitude data without needing to add a cluster class. Or can we? Let's try that out.

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
# Density-based clustering of the listings on their raw coordinates.
coords=data[['X6 longitude','X5 latitude']]
db=DBSCAN(eps=0.008)
# Fit once and reuse the fitted labels for both the printout and the new column.
clust=db.fit(coords)
print(np.unique(clust.labels_))
data['cluster']=clust.labels_
sns.scatterplot(data=data,x='X6 longitude',y='X5 latitude',hue='cluster')
plt.show()

# DBSCAN labels noise points -1; move them to a regular id of their own.
# NOTE(review): 3 is only a free id if DBSCAN produced no cluster labelled 3 --
# check against the labels printed above.
# The result is assigned back to the column: calling replace(inplace=True) on
# data['cluster'] acts on an intermediate object and is not guaranteed to
# update `data` (it does not under pandas Copy-on-Write).
data['cluster']=data['cluster'].replace(-1,3)
print(data['cluster'].unique())
sns.scatterplot(data=data,x='X6 longitude',y='X5 latitude',hue='cluster')

Below I normalize the data using Z-score normalization (z = (x - u) / s, where u is the feature's mean and s its standard deviation), as Linear Regression is sensitive to the scale of its features. I also prepare 3 feature sets: the original one, the clustered one (dropping both longitude and latitude and keeping the cluster label instead) and a dimensionally reduced one (the original features reduced by 2 dimensions, down to 4 components, using PCA).

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
target=data['Y house price of unit area']
# Two candidate feature sets: raw coordinates without the cluster label, and the
# cluster label without the raw coordinates. The 'price' band helper column and
# the target itself are excluded from both.
dataxori=data.drop(columns=['cluster','price','Y house price of unit area'])
dataxclust=data.drop(columns=['X6 longitude','X5 latitude','price','Y house price of unit area'])

from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
# NOTE(review): the scaler and the PCA below are fitted on every row; if a
# hold-out split is made afterwards, the held-out rows have already influenced
# the scaling and the projection.
dataxoriss=ss.fit_transform(dataxori)
dataxclustss=ss.fit_transform(dataxclust)
# Third feature set: the scaled original features projected onto 4 principal components.
pca=PCA(n_components=4)
dataxPCA=pca.fit_transform(dataxoriss)


Next we split each feature set into train and test sets, using a test size of 0.3 and random state 5 for all of them so that the results can be compared.

In [None]:
# Identical split settings (30% test data, fixed seed) for all three feature sets.
split_opts={'test_size':0.3,'random_state':5}
xtrainori,xtestori,ytrainori,ytestori=train_test_split(dataxoriss,target,**split_opts)
xtrainclust,xtestclust,ytrainclust,ytestclust=train_test_split(dataxclustss,target,**split_opts)
xtrainpca,xtestpca,ytrainpca,ytestpca=train_test_split(dataxPCA,target,**split_opts)

All done, time to build the model. Here I tried 3 models (Linear Regression, Ridge Regression and Polynomial Regression) and 3 scoring functions (R^2, MSE and MAE). All applied on the 3 different datasets that I have prepared before.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
def report(ytrue,ypred,title):
    """Print R2, MSE and MAE for a set of predictions under a heading.

    ytrue: ground-truth target values.
    ypred: predicted values to score against ``ytrue``.
    title: heading printed before the metrics.

    Returns a single-space string, so wrapping the call in ``print()`` emits
    one extra near-empty line after the report.
    """
    print(title)
    for heading,metric in (("R2",r2_score),("MSE",mean_squared_error)):
        print(heading)
        print(metric(ytrue,ypred))
    # The last metric is followed by blank lines to separate consecutive reports.
    print("MAE")
    print(mean_absolute_error(ytrue,ypred),'\n\n')
    return ' '
# The three estimators under comparison: plain least squares, Ridge with its
# default settings, and degree-2 polynomial features feeding least squares.
pipe=make_pipeline(PolynomialFeatures(2),LinearRegression())
lin=LinearRegression()
lin1=Ridge()
lin2=pipe

# (label, train X, test X, train y, test y) for each prepared feature set.
splits=(
    ('Original',xtrainori,xtestori,ytrainori,ytestori),
    ('Clustered',xtrainclust,xtestclust,ytrainclust,ytestclust),
    ('PCA',xtrainpca,xtestpca,ytrainpca,ytestpca),
)

# Refit each estimator on every feature set in turn and report train/test scores.
# Each estimator is left fitted on the PCA split, the last one in the list.
for heading,model in (("Linear Regression",lin),("Ridge Regression",lin1),("Polynomial Regression",lin2)):
    print(heading)
    for label,xtr,xte,ytr,yte in splits:
        model.fit(xtr,ytr)
        print(report(ytr,model.predict(xtr),label+' Train'))
        print(report(yte,model.predict(xte),label+' Test'))


Next we fit the models on the training data and plot their predictions for the test set together with the actual value of each data point, so we can get a picture of how well the models fit values they have not seen.

In [None]:
import numpy as np
# Fit each model on the PCA training split and overlay its predictions on the
# held-out targets, one figure per model.
# lin is the plain linear model, lin1 the Ridge model and lin2 the polynomial
# pipeline, so each figure title is paired with the matching estimator.
positions=np.arange(len(xtestpca))
for title,model in (("PCA Linear",lin),("PCA Ridge",lin1),("PCA Poly",lin2)):
    model.fit(xtrainpca,ytrainpca)
    fig=plt.figure(figsize=(20,10))
    ax=fig.add_subplot()
    ax.plot(positions,ytestpca,label='True')
    ax.plot(positions,model.predict(xtestpca),c='g',linestyle='--',label='Predicted')
    ax.set_title(title)
    # Without a legend the two curve labels are never displayed.
    ax.legend()
    plt.show()


Next, we fit each model on the full data and plot its predictions and its error. The cells below go in the order Linear, Ridge, Polynomial (first for the predictions, then again for the errors) to give a sense of how well each model can fit itself to the data.

In [None]:
# Refit the plain linear model on the full data for each feature set in turn,
# keeping the in-sample predictions of each fit.
predori=lin.fit(dataxoriss,target).predict(dataxoriss)
predclust=lin.fit(dataxclustss,target).predict(dataxclustss)
predpca=lin.fit(dataxPCA,target).predict(dataxPCA)

# Overlay the three prediction curves on the true prices.
fig=plt.figure(figsize=(20,10))
ax=fig.add_subplot()
ax.plot(no,target,label='True')
for curve,colour,tag in ((predori,'g','ori'),(predclust,'r','clust'),(predpca,'y','PCA')):
    ax.plot(no,curve,c=colour,linestyle='--',label=tag)
plt.legend()
plt.title("Hasil")
plt.show()

In [None]:
# Refit the Ridge model on the full data for each feature set in turn,
# keeping the in-sample predictions of each fit.
predori1=lin1.fit(dataxoriss,target).predict(dataxoriss)
predclust1=lin1.fit(dataxclustss,target).predict(dataxclustss)
predpca1=lin1.fit(dataxPCA,target).predict(dataxPCA)

# Overlay the three prediction curves on the true prices.
fig=plt.figure(figsize=(20,10))
ax=fig.add_subplot()
ax.plot(no,target,label='True')
for curve,colour,tag in ((predori1,'g','ori'),(predclust1,'r','clust'),(predpca1,'y','PCA')):
    ax.plot(no,curve,c=colour,linestyle='--',label=tag)
plt.legend()
plt.title("Hasil")
plt.show()

In [None]:
# Refit the polynomial pipeline on the full data for each feature set in turn,
# keeping the in-sample predictions of each fit.
predori2=lin2.fit(dataxoriss,target).predict(dataxoriss)
predclust2=lin2.fit(dataxclustss,target).predict(dataxclustss)
predpca2=lin2.fit(dataxPCA,target).predict(dataxPCA)

# Overlay the three prediction curves on the true prices.
fig=plt.figure(figsize=(20,10))
ax=fig.add_subplot()
ax.plot(no,target,label='True')
for curve,colour,tag in ((predori2,'g','ori'),(predclust2,'r','clust'),(predpca2,'y','PCA')):
    ax.plot(no,curve,c=colour,linestyle='--',label=tag)
plt.legend()
plt.title("Hasil")
plt.show()

Plotting the difference between actual and predicted values, Linear:

In [None]:
# Residuals (true minus predicted) of the linear fits, one panel per feature set.
fig1=plt.figure(figsize=(20,10))
ax1=fig1.subplots(3)
residuals=(target-predori,target-predclust,target-predpca)
captions=("Error ori","Error clust","Error pca")
for panel,residual,caption in zip(ax1,residuals,captions):
    panel.plot(no,residual)
    panel.set_title(caption)
plt.show()

Ridge:

In [None]:
# Residuals (true minus predicted) of the Ridge fits, one panel per feature set.
fig1=plt.figure(figsize=(20,10))
ax1=fig1.subplots(3)
residuals=(target-predori1,target-predclust1,target-predpca1)
captions=("Error ori","Error clust","Error pca")
for panel,residual,caption in zip(ax1,residuals,captions):
    panel.plot(no,residual)
    panel.set_title(caption)
plt.show()

Polynomial:

In [None]:
# Residuals (true minus predicted) of the polynomial fits, one panel per feature set.
fig1=plt.figure(figsize=(20,10))
ax1=fig1.subplots(3)
residuals=(target-predori2,target-predclust2,target-predpca2)
captions=("Error ori","Error clust","Error pca")
for panel,residual,caption in zip(ax1,residuals,captions):
    panel.plot(no,residual)
    panel.set_title(caption)
plt.show()