# Real Estate Price Prediction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Silence all warnings from this point on to keep cell output clean.
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings
# raised by the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# libraries for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Read and understand data

In [None]:
# Load the real-estate CSV from the Kaggle input directory and preview it
data_path = '/kaggle/input/real-estate-price-prediction/Real estate.csv'
real_df = pd.read_csv(data_path)
real_df.head()

In [None]:
# Remove the transaction-date column before analysis.
# `errors='ignore'` keeps this cell safe to re-run: `real_df` is overwritten
# here, so a second execution would otherwise raise KeyError because the
# column is already gone. (`axis=1` was redundant alongside `columns=`.)
real_df = real_df.drop(columns=['X1 transaction date'], errors='ignore')
real_df.head()

In [None]:
# (rows, columns) of the frame at this point
real_df.shape

In [None]:
# Number of missing values in each column
real_df.isnull().sum()

In [None]:
# Column dtypes and non-null counts
real_df.info()

In [None]:
# Count distinct values in the 'No' column (compare with the row count above)
real_df['No'].nunique()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
real_df.describe()

## 2. Visualize and treat data

In [None]:
# Re-display the first rows as a reminder of the column names used in the plots below
real_df.head()

In [None]:
# Outlier check: one box plot per feature, drawn side by side.
# Each entry is (column in real_df, panel title).
feature_panels = [
    ('X2 house age', 'House Age'),
    ('X3 distance to the nearest MRT station', 'MRT Distance'),
    ('X4 number of convenience stores', 'NoOf Stores'),
]

plt.figure(figsize=[13,5])
for position, (column, label) in enumerate(feature_panels, start=1):
    plt.subplot(1, len(feature_panels), position)
    # With `data=` given, matplotlib resolves the string `x` as data[x]
    plt.boxplot(x=column, data=real_df)
    plt.title(label)
plt.show()

In [None]:
# Outlier check for the two coordinate columns, one box plot each.
# Each entry is (column in real_df, panel title).
geo_panels = [
    ('X5 latitude', 'Latitude'),
    ('X6 longitude', 'Longitude'),
]

plt.figure(figsize=[10,5])
for position, (column, label) in enumerate(geo_panels, start=1):
    plt.subplot(1, len(geo_panels), position)
    # With `data=` given, matplotlib resolves the string `x` as data[x]
    plt.boxplot(x=column, data=real_df)
    plt.title(label)
plt.show()

In [None]:
# Pairwise correlation matrix of the columns (pandas default method)
real_df.corr()

In [None]:
# Annotated heatmap of the pairwise column correlations
corr_matrix = real_df.corr()

plt.figure(figsize=[10,6])
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Grid of pairwise plots across all columns of real_df
sns.pairplot(real_df)
plt.show()

## 3. Train Test Split

In [None]:
# Separate features from the target, then hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split

target_col = 'Y house price of unit area'

# The target is kept as a one-column DataFrame (double brackets);
# 'No' is excluded from the features (presumably a row identifier — verify).
y = real_df[[target_col]]
X = real_df.drop(columns=['No', target_col])

# Fixed random_state so the split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

### 3.1 Feature Scaling

In [None]:
# col = X_train.columns
# col

In [None]:
# # Using Standardization (StandardScaler) -- currently disabled
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train[col] = scaler.fit_transform(X_train[col])

In [None]:
# Summary statistics of the training features
X_train.describe()

## 4. Model Building

In [None]:
# Fit a linear regression model on the training split
from sklearn.linear_model import LinearRegression

lrm = LinearRegression()
lrm.fit(X=X_train, y=y_train)

In [None]:
# Fitted regression coefficients of the model
lrm.coef_

## 5. Prediction

In [None]:
# Predict prices for the training rows; `y_train_pred` is reused by the
# later evaluation cells.
y_train_pred = lrm.predict(X_train)
# Preview only the first few predictions instead of dumping the whole array
y_train_pred[:5]

## 6. Evaluation

In [None]:
# Predict prices for the held-out test set with the fitted model
y_test_pred = lrm.predict(X_test)

In [None]:
# Distribution of the test-set residuals (actual - predicted).
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is the documented replacement.
# Both operands are flattened to 1-D so seaborn receives a plain vector
# rather than a one-column DataFrame.
residuals = y_test.to_numpy().ravel() - np.asarray(y_test_pred).ravel()
ax = sns.histplot(residuals, kde=True, stat='density')
ax.set(xlabel='Residual (actual - predicted)', title='Test-set residuals')
plt.show()

- We can see that the residuals are centered around 0 and look roughly normally distributed.
- <b>So we can proceed with the model</b>

In [None]:
# Flatten the training predictions to 1-D so they can be used as a plot axis
y_train_pred2 = y_train_pred.ravel()

In [None]:
# Actual vs. predicted prices on the training set (this is not a residual
# plot): the closer the points lie to the diagonal, the better the fit.
ax = sns.scatterplot(x=y_train['Y house price of unit area'], y=y_train_pred2)
ax.set(
    xlabel='Actual house price of unit area',
    ylabel='Predicted house price of unit area',
    title='Training set: actual vs. predicted',
)
plt.show()

In [None]:
# Coefficient of determination (R^2) of the predictions on the test set
from sklearn.metrics import r2_score

r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
r2

In [None]:
# First row of the fitted coefficient array, converted to a plain Python list
coefs = list(lrm.coef_[0])
coefs

# Built a Linear Regression model with an r2_score of 67%