# Real Estate Price Prediction Dataset Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw real-estate CSV and preview its first ten rows.
d2 = pd.read_csv(
    filepath_or_buffer='../input/real-estate-price-prediction/Real estate.csv'
)
d2.head(n=10)

## Basic EDA

In [None]:
# Dimensions of the loaded frame as (rows, columns)
d2.shape

In [None]:
# Column names, dtypes and non-null counts
d2.info()

In [None]:
# Drop the row identifier and the transaction date; neither is used as a model
# feature further down. `errors='ignore'` keeps this cell idempotent: because the
# result is written back to `d2`, re-running the cell would otherwise raise a
# KeyError once the columns are already gone.
tbd = ['X1 transaction date', 'No']
d2 = d2.drop(columns=tbd, errors='ignore')
d2.head(3)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
d2.describe()

In [None]:
# One histogram per numeric column, drawn on a 10x10 inch figure
d2.hist(figsize=(10,10))

In [None]:
# Pairwise Pearson correlation matrix of the remaining columns.
corr = d2.corr(method='pearson')
corr

In [None]:
# Annotated heatmap of the correlation matrix on a wide canvas.
fig, ax = plt.subplots(figsize=(22, 15))
sns.heatmap(data=corr, annot=True, ax=ax)

### Correlation Analysis
The 'X3 distance to the nearest MRT station' column shows the lowest (most negative) correlation with the house price.

In [None]:
# Count the null values in each column
d2.isnull().sum()

In [None]:
# Compare house price across convenience-store counts with a bar plot
# (one bar per distinct store count; the trailing ';' hides the Axes repr)
sns.barplot(x=d2['X4 number of convenience stores'], y=d2['Y house price of unit area']);

As the number of convenience stores in the locality increases, the house price goes up.<br>
This shows a <b>positive</b> relation between these attributes.

In [None]:
# House age against unit-area price, with a fitted regression line.
sns.regplot(data=d2, x='X2 house age', y='Y house price of unit area');

The trend line shows the house price decreasing as the house age increases.<br>
These attributes show a <b>negative</b> relation.

In [None]:
# Distance to the nearest MRT station against unit-area price, with a fitted regression line.
sns.regplot(
    data=d2,
    x='X3 distance to the nearest MRT station',
    y='Y house price of unit area',
);

This shows that houses closer to an MRT station have <b>higher</b> prices.

In [None]:
# How the distance to the nearest MRT station varies with house age.
sns.lineplot(
    data=d2,
    x='X2 house age',
    y='X3 distance to the nearest MRT station',
);

This shows that houses with an age of around <b>15 - 20 years</b> are the <b>farthest</b> from an MRT station, while houses aged <b>35+</b> years are <b>closer</b> to the stations.

### Conclusion from EDA and Graph plots:
1. The data is clean, with no null values<br>
2. The attributes are not highly correlated with one another<br>
3. Newer houses with more convenience stores in the area have higher prices<br>
4. Older houses tend to be closer to an MRT station and fall in the lower price range.

## Outlier Detection

In [None]:
# Draw one boxplot per column on a 2x3 grid (at most six panels).
plt.figure(figsize=(13, 5))

for panel, column in enumerate(d2.columns[:6], start=1):
    plt.subplot(2, 3, panel)
    sns.boxplot(y=d2[column], color='grey')
    plt.ylabel('Value')
    plt.title(f'Boxplot\n{column}')
plt.tight_layout()

The data has outliers: several points fall beyond the boxplot whiskers.

## Splitting the data

In [None]:
# Features: every column from house age through longitude (label slice, inclusive).
X2 = d2.loc[:, 'X2 house age':'X6 longitude']
# Target: house price per unit area.
y2 = d2['Y house price of unit area']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X2, y2, test_size=0.2, random_state=1
)
# Report (train, test) shapes, first for the features and then for the target.
for train_part, test_part in ((X_train_2, X_test_2), (y_train_2, y_test_2)):
    print(train_part.shape, test_part.shape)

## Statistical Analysis

In [None]:
from statsmodels.graphics.gofplots import qqplot

# qqplot() expects a single 1-D sample. Passing the whole feature frame mixes the
# columns together, so draw one Q-Q plot per feature instead, each against the
# standardized reference line (line='s').
n_features = X2.shape[1]
qq_fig, qq_axes = plt.subplots(ncols=n_features, figsize=(4 * n_features, 4))
for qq_ax, column in zip(np.atleast_1d(qq_axes), X2.columns):
    qqplot(X2[column], line='s', ax=qq_ax)
    qq_ax.set_title(column)
plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import skew

# Sample skewness of each feature (computed down the rows, one value per column).
print(skew(X2, axis=0))

The data does not appear to follow a Gaussian (normal) distribution; both positively and negatively skewed features are observed.

In [None]:
# Min-max scaling: learn the range on the training split only, then apply it to both splits

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms.fit(X_train_2)
X_train_norm_2 = mms.transform(X_train_2)
X_test_norm_2 = mms.transform(X_test_2)

# Standardisation: fitted on the training split only, then applied to both splits

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std_2 = scaler.fit_transform(X_train_2)
X_test_std_2 = scaler.transform(X_test_2)

# Overlay the first feature under both scalings (blue = min-max, red = standardised)

row_positions = np.arange(X_train_std_2.shape[0])
plt.scatter(row_positions, X_train_norm_2[:, 0], color='b')
plt.scatter(row_positions, X_train_std_2[:, 0], color='r')

for scaled_split in (X_train_std_2, X_test_std_2):
    print(scaled_split.shape)

Min-Max Scaler gives more uniform scaling than Standard Scaler

## OLS regression analysis

In [None]:
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept by itself. Without an explicit constant
# column the fit is forced through the origin and the summary reports an uncentered
# R^2. Rebuild the scaled features as a labelled frame (so the summary shows the
# column names) and prepend the constant term.
X_train_ols = sm.add_constant(
    pd.DataFrame(X_train_norm_2, columns=X_train_2.columns, index=y_train_2.index)
)
model_ols = sm.OLS(y_train_2, X_train_ols)
fitted = model_ols.fit()
print(fitted.summary())

In [None]:
from scipy.stats import shapiro

fig, ax = plt.subplots(figsize=(16, 4), ncols=2)

# Plot residuals against the FITTED values. Residuals are correlated with the observed
# target by construction (resid = y - y_hat), so plotting them against y shows a trend
# even for a well-specified model.
sns.scatterplot(x=fitted.fittedvalues, y=fitted.resid, ax=ax[0])
ax[0].axhline(0, color='grey', linestyle='--')
ax[0].set(xlabel='Fitted value', ylabel='Residual', title='Residuals vs fitted values')

sns.histplot(fitted.resid, ax=ax[1])
ax[1].set(xlabel='Residual', title='Residual distribution')

# Shapiro-Wilk test on the residuals; p > 0.05 means normality is not rejected.
statistic, p_value = shapiro(fitted.resid)
verdict = "normal" if p_value > 0.05 else "not normal"
print("Distribution is {0}. Statistic: {1:.3}, p-value: {2:.4}".format(verdict, statistic, p_value))

## KNN Regressor model construction

In [None]:
from sklearn.neighbors import KNeighborsRegressor as knn

# Baseline KNN regressor: 3 neighbours, p=1 (Manhattan) distance, brute-force search,
# trained on the min-max scaled training split.
model4 = knn(n_neighbors=3, p=1, algorithm='brute').fit(X_train_norm_2, y_train_2)
model4

In [None]:
# Predicted unit-area prices for the held-out (scaled) test rows.
ypred3 = model4.predict(X=X_test_norm_2)
ypred3

In [None]:
# Score of the baseline KNN regressor on the test split (R^2 for scikit-learn regressors)
model4.score(X_test_norm_2,y_test_2)

In [None]:
# Candidate neighbour counts: the odd numbers 1, 3, ..., 99.
k_values = np.arange(start=1, stop=100, step=2)

In [None]:
# Sweep the candidate k values and record the score (as a percentage) on both splits.
# NOTE(review): k is compared on the test split here, so the chosen k is tuned to the
# test data; cross-validation on the training split would give an unbiased choice.
train_score_arr = []
val_score_arr = []

for k in k_values:

    model2 = knn(n_neighbors=k, p=1)
    model2.fit(X_train_norm_2, y_train_2)

    # .score() on a regressor returns R^2, not a classification accuracy,
    # so the progress line labels it accordingly.
    train_score = model2.score(X_train_norm_2, y_train_2)
    val_score = model2.score(X_test_norm_2, y_test_2)

    train_score_arr.append(train_score * 100)
    val_score_arr.append(val_score * 100)

    print("k=%d, train_R2=%.2f%%, test_R2=%.2f%%" % (k, train_score * 100, val_score * 100))

In [None]:
# Train (green) vs test (red) score for each k; labelled so the figure stands on its own.
plt.plot(k_values, train_score_arr, 'g', label='train')
plt.plot(k_values, val_score_arr, 'r', label='test')
plt.xlabel('k (number of neighbours)')
plt.ylabel('R^2 (%)')
plt.title('KNN regressor score vs k')
plt.legend();

From the above values we can conclude that the best value for k lies between 9 and 13.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R^2 of the baseline model on the scaled training split.
cross_val_score_train = cross_val_score(
    estimator=model4,
    X=X_train_norm_2,
    y=y_train_2,
    scoring='r2',
    cv=10,
)
print(cross_val_score_train)

In [None]:
# Average R^2 across the cross-validation folds.
np.mean(cross_val_score_train)

In [None]:
from sklearn.metrics import r2_score

# Test-split R^2 computed from the stored baseline predictions.
print(r2_score(y_true=y_test_2, y_pred=ypred3))

In [None]:
# Wrap the predictions in a single-column frame for side-by-side comparison.
c = pd.DataFrame({'Estimated Price': ypred3})
c.head()

In [None]:
# Actual test prices, re-indexed from 0 so they line up row by row with the
# predictions. (The earlier `pd.DataFrame(y_test_2)` assignment was dead code:
# it was overwritten immediately by this line.)
d = y_test_2.reset_index(drop=True)
d.head()

In [None]:
# Predicted and actual prices side by side, joined column-wise on the shared 0..n-1 index.
ynew = pd.concat(objs=[c, d], axis='columns')
ynew