# Multivariate Linear Regression

## Import Dataset

In [1]:
# from google.colab import files
# files.upload()

## Import Libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

## Dataset Visualization

In [3]:
# Import Dataset
dataset = pd.read_excel('real-estate.xlsx')

In [4]:
print(dataset)

      No  X1 transaction date  ...  X6 longitude  Y house price of unit area
0      1          2012.916667  ...     121.54024                        37.9
1      2          2012.916667  ...     121.53951                        42.2
2      3          2013.583333  ...     121.54391                        47.3
3      4          2013.500000  ...     121.54391                        54.8
4      5          2012.833333  ...     121.54245                        43.1
..   ...                  ...  ...           ...                         ...
409  410          2013.000000  ...     121.50381                        15.4
410  411          2012.666667  ...     121.54310                        50.0
411  412          2013.250000  ...     121.53986                        40.6
412  413          2013.000000  ...     121.54067                        52.5
413  414          2013.500000  ...     121.54310                        63.9

[414 rows x 8 columns]


In [5]:
dataset.columns

Index(['No', 'X1 transaction date', 'X2 house age',
       'X3 distance to the nearest MRT station',
       'X4 number of convenience stores', 'X5 latitude', 'X6 longitude',
       'Y house price of unit area'],
      dtype='object')

In [6]:
dataset.head(10)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1
5,6,2012.666667,7.1,2175.03,3,24.96305,121.51254,32.1
6,7,2012.666667,34.5,623.4731,7,24.97933,121.53642,40.3
7,8,2013.416667,20.3,287.6025,6,24.98042,121.54228,46.7
8,9,2013.5,31.7,5512.038,1,24.95095,121.48458,18.8
9,10,2013.416667,17.9,1783.18,3,24.96731,121.51486,22.1


In [7]:
dataset.tail(5)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
409,410,2013.0,13.7,4082.015,0,24.94155,121.50381,15.4
410,411,2012.666667,5.6,90.45606,9,24.97433,121.5431,50.0
411,412,2013.25,18.8,390.9696,7,24.97923,121.53986,40.6
412,413,2013.0,8.1,104.8101,5,24.96674,121.54067,52.5
413,414,2013.5,6.5,90.45606,9,24.97433,121.5431,63.9


In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   No                                      414 non-null    int64  
 1   X1 transaction date                     414 non-null    float64
 2   X2 house age                            414 non-null    float64
 3   X3 distance to the nearest MRT station  414 non-null    float64
 4   X4 number of convenience stores         414 non-null    int64  
 5   X5 latitude                             414 non-null    float64
 6   X6 longitude                            414 non-null    float64
 7   Y house price of unit area              414 non-null    float64
dtypes: float64(6), int64(2)
memory usage: 26.0 KB


## Splitting Dataset

In [9]:
# split data into train and test
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1:].values

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0) # 75% split
print(X_train.shape)
print(X_test.shape)

(310, 6)
(104, 6)


## Normalization of Dataset

In [10]:
# normalization of input training data
m = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
print(m.shape)
print(std.shape)

(6,)
(6,)


In [11]:
X_train = (X_train-m) / std 

In [12]:
# normalization of testing data
X_test = (X_test-m) / std 

## Input Feature Vector

In [13]:
# creating input feayure vector with first column as 1
X = np.ones(shape=(X_train.shape[0], 1+X_train.shape[1]))
X[:, 1:] = X_train 

In [14]:
X.shape

(310, 7)

## Calculating Bias and Weight Vector

In [15]:
# calculationg optimal weight and bias vector
inv = np.linalg.inv(np.dot(X.T, X))
theta = np.dot(inv, X.T)
theta = np.dot(theta, y_train).reshape(-1, 1)

In [16]:
print(theta.shape)

(7, 1)


## Prediction using Testing Dataset

In [17]:
# input feature vector for testing purpose
X_testing = np.ones(shape=(X_test.shape[0], 1+X_test.shape[1]))
X_testing[:, 1:] = X_test 

In [18]:
# predicting output using linear regression
predict = np.dot(X_testing, theta)

for i in range(predict.shape[0]):
  print("Predicted:", predict[i], "Correct Label:", y_test[i])

Predicted: [39.77707328] Correct Label: [45.3]
Predicted: [13.17717303] Correct Label: [14.4]
Predicted: [42.6047416] Correct Label: [46.]
Predicted: [12.98203955] Correct Label: [15.6]
Predicted: [42.73582117] Correct Label: [50.2]
Predicted: [40.45119868] Correct Label: [38.1]
Predicted: [41.25880602] Correct Label: [48.5]
Predicted: [33.92979773] Correct Label: [41.4]
Predicted: [50.03313158] Correct Label: [51.6]
Predicted: [46.38378205] Correct Label: [40.1]
Predicted: [46.46939477] Correct Label: [42.]
Predicted: [41.01325285] Correct Label: [42.4]
Predicted: [39.20124825] Correct Label: [28.5]
Predicted: [40.60115306] Correct Label: [39.1]
Predicted: [47.16745737] Correct Label: [53.7]
Predicted: [38.77540445] Correct Label: [36.8]
Predicted: [40.7010488] Correct Label: [40.6]
Predicted: [40.7393178] Correct Label: [55.2]
Predicted: [41.00594658] Correct Label: [39.3]
Predicted: [42.7391369] Correct Label: [44.3]
Predicted: [49.77669611] Correct Label: [53.5]
Predicted: [31.4818

## Error Calculation

In [19]:
error = y_test - predict
sq_err = np.square(error)
RMSE = np.sqrt(np.sum(sq_err))
print("RMSE:", RMSE)

RMSE: 85.38344738907517
