In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

The task is to build a linear regression model WITHOUT using machine learning libraries.

In [2]:
# Read the house-price table from the Kaggle input directory.
csv_path = '/kaggle/input/final-house/house.csv'
data = pd.read_csv(csv_path)
# Leave the frame as the last expression so the notebook renders it.
data

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age,price
0,1,26.184098,1286.68,204.003817,22,67,96004.804557
1,1,34.866901,1855.25,186.980360,8,30,92473.722568
2,1,36.980709,692.09,111.224999,24,24,98112.519942
3,1,17.445723,1399.49,237.998760,1,66,92118.326874
4,1,52.587646,84.65,100.996400,20,3,98976.653176
...,...,...,...,...,...,...,...
4303,9,175.166533,1299.71,217.739012,2,5,102707.356224
4304,6,215.352151,1346.36,77.724676,14,5,99695.064992
4305,5,299.563972,1585.37,235.537881,3,5,93803.715617
4306,1,29.193907,1896.23,147.223827,4,2,92914.729126


For our task, we keep the six numeric feature columns listed in the next cell as predictors of the overall price.

In [3]:
# Predictor columns, in the order their weights will appear in `w` (after the bias).
features = [
    'net_sqm',
    'bedroom_count',
    'center_distance',
    'metro_distance',
    'floor',
    'age',
]
# Feature matrix: one row per house, one column per entry in `features`.
X = data[features].to_numpy()
# Target as a column vector so it lines up with X_b.dot(w) later on.
y = data['price'].to_numpy()[:, np.newaxis]

Normalization

In [4]:
# Standardize the features column by column (zero mean, unit standard deviation).
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X_normalized = (X - X_mean) / X_std

# Standardize the target with scalar statistics; y_mean and y_std are kept
# so predictions can be mapped back to price units afterwards.
y_mean = y.mean()
y_std = y.std()
y_normalized = (y - y_mean) / y_std

In [5]:
# Prepend a column of ones so the first entry of the weight vector acts as the bias.
bias_column = np.ones((X_normalized.shape[0], 1))
X_b = np.hstack((bias_column, X_normalized))

Set parameters

In [6]:
# Optimizer settings for the gradient-descent loop.
n_iterations = 1000   # number of full-batch update steps
learning_rate = 0.01  # step size applied to every gradient
m = X_b.shape[0]      # number of rows in the design matrix

Weight initialization

In [7]:
# Draw the starting weights from a seeded generator so that Restart & Run All
# reproduces the same initialization, and therefore the same reported metrics.
rng = np.random.default_rng(42)
# One weight per column of X_b (bias column included), stored as a column vector.
w = rng.standard_normal((X_b.shape[1], 1))

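Gradient descent on the MSE loss, using the gradient $\nabla_w \mathrm{MSE} = \frac{2}{m} X_b^\top (X_b w - y_{\text{normalized}})$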

In [8]:
# Full-batch gradient descent on the mean squared error of the normalized target.
grad_scale = 2 / m  # constant factor of the MSE gradient, identical at every step
for iteration in range(n_iterations):
    # Prediction error of the current weights on every row.
    residuals = X_b.dot(w) - y_normalized
    gradients = grad_scale * X_b.T.dot(residuals)
    # Step against the gradient; rebinding (not in-place) leaves earlier arrays untouched.
    w = w - learning_rate * gradients

In [9]:
# Predictions of the fitted weights, still in the normalized target scale.
y_pred_normalized = np.dot(X_b, w)

In [10]:
# Undo the target standardization to get predictions in original price units.
y_pred = y_mean + y_std * y_pred_normalized

Metrics

In [11]:
# Errors of the model, in price units, on the same rows that were used for fitting.
residuals = y_pred - y
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)

# Baseline: always predict the mean price; its MSE is the variance of y.
baseline_residuals = y - y_mean
mse_baseline = np.mean(baseline_residuals ** 2)

# R^2 is the share of the baseline error that the model removes.
r2 = 1 - mse / mse_baseline

In [12]:
# Show the learned weights (bias first), then each metric on its own line.
print(w)
for label, value in (
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("MSE baseline:", mse_baseline),
    ("R^2:", r2),
):
    print(label, value)

[[ 1.53907175e-10]
 [ 6.13780748e-01]
 [ 1.94956177e-01]
 [-4.75997170e-01]
 [ 1.08327047e-01]
 [ 2.33300207e-01]
 [-1.84117521e-01]]
MSE: 4315714.313452905
RMSE: 2077.4297373083173
MSE baseline: 15361453.472478816
R^2: 0.7190556010090563
