# -- Implementing Linear Regression On Real State Data To Predict House Prices --

## Importing Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as skl
import joblib as jb

## Loading And Understanding The Data

In [3]:
df = pd.read_csv("Real estate.csv")

In [4]:
df.head(5)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [5]:
df.shape

(414, 8)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   No                                      414 non-null    int64  
 1   X1 transaction date                     414 non-null    float64
 2   X2 house age                            414 non-null    float64
 3   X3 distance to the nearest MRT station  414 non-null    float64
 4   X4 number of convenience stores         414 non-null    int64  
 5   X5 latitude                             414 non-null    float64
 6   X6 longitude                            414 non-null    float64
 7   Y house price of unit area              414 non-null    float64
dtypes: float64(6), int64(2)
memory usage: 26.0 KB


In [7]:
df.describe()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,2013.148971,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,0.281967,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,2012.667,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,2012.917,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,2013.167,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,2013.417,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,2013.583,43.8,6488.021,10.0,25.01459,121.56627,117.5


## Data Cleaning And Pre-Processing

In [8]:
df.rename(columns = {"X1 transaction date":"Transaction Date","X2 house age":"House Age",
                    "X3 distance to the nearest MRT station":"Distance To MRT Station","X4 number of convenience stores":"No of Convenience Store",
                    "X5 latitude":"Latitude","X6 longitude":"Longitude","Y house price of unit area":"House Price of Unit Area"}, inplace = True)

In [9]:
df

Unnamed: 0,No,Transaction Date,House Age,Distance To MRT Station,No of Convenience Store,Latitude,Longitude,House Price of Unit Area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


In [10]:
df.isnull().sum()

No                          0
Transaction Date            0
House Age                   0
Distance To MRT Station     0
No of Convenience Store     0
Latitude                    0
Longitude                   0
House Price of Unit Area    0
dtype: int64

In [11]:
df.isna().sum()

No                          0
Transaction Date            0
House Age                   0
Distance To MRT Station     0
No of Convenience Store     0
Latitude                    0
Longitude                   0
House Price of Unit Area    0
dtype: int64

## Separating Input and Output Data

In [14]:
X = df.drop("House Price of Unit Area",axis = 1)
y= df["House Price of Unit Area"]

## Scaling and Normalization

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Separating Training Data and Test Data

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

## Implementing The Model On Training Data

In [20]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [22]:
model.fit(X_train,y_train)

## Prediction On Training Data

In [24]:
y_predict_train = model.predict(X_train)

In [25]:
from sklearn.metrics import r2_score,mean_absolute_error

In [26]:
r2_score(y_train,y_predict_train)

0.5888965057262948

In [27]:
mean_absolute_error(y_train,y_predict_train)

6.132553700285354

## Prediction On Test Data

In [28]:
y_predict_test = model.predict(X_test)

In [29]:
r2_score(y_test,y_predict_test)

0.5532450005316665

In [30]:
mean_absolute_error(y_test,y_predict_test)

6.266422833091925