In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
# Load the real-estate table from a CSV file given by a relative path.
df = pd.read_csv('real_estate_dataset.csv')

In [45]:
# Keep the column labels of the loaded frame so they can be exported below.
columns = df.columns

In [46]:
# Write the column names to a text file, formatted as strings ('%s').
np.savetxt('columns.txt', columns, fmt='%s', delimiter=',')

In [47]:
# Use Square_Feet, Garage_Size, Location_Score and Distance_to_Center as the
# features, and Price as the regression target.
feature_names = ['Square_Feet', 'Garage_Size', 'Location_Score', 'Distance_to_Center']
X = df[feature_names].values
y = df['Price'].values

# Sanity-check the design matrix before doing any linear algebra with it.
print(f'Shape of X: {X.shape}')
print(f'data type of X: {X.dtype}')

Shape of X: (500, 4)
data type of X: float64


In [48]:
# Unpack the row count (samples) and column count (features) of the design matrix.
n_samples, n_features = X.shape

In [49]:
# Start from all-ones coefficients: one weight per feature plus one extra slot,
# coefs[0], which the prediction cells use as the intercept.
coefs = np.ones(n_features + 1)
coefs.shape

(5,)

In [50]:
# Predict the price for each sample straight from the definition of a linear
# model: intercept + sum over features of (weight * feature value).
intercept, weights = coefs[0], coefs[1:]
predictions_bydefn = X @ weights + intercept

In [51]:
# Prepend a column of ones to X so the intercept coefs[0] is handled by the same
# matrix product as the feature weights.
# NOTE: X is rebuilt from itself, so re-running this cell stacks another column
# of ones onto X; re-run the cell that defines X before re-running this one.
ones_column = np.ones((n_samples, 1))
X = np.hstack((ones_column, X))

In [52]:
# Predict the price for each sample with a single matrix-vector product; this
# relies on X already carrying the leading column of ones that pairs with coefs[0].
predictions = X @ coefs

In [53]:
# Check that the matrix form reproduces the by-definition predictions
# (equality up to floating-point tolerance).
is_same = np.allclose(predictions, predictions_bydefn)

In [54]:
# Report whether the two ways of computing the predictions agree.
print(f'Are the predictions the same? {is_same}')

Are the predictions the same? True


In [55]:
# Residuals of the current model: predicted price minus observed price, per sample.
errors = predictions - y

In [56]:
# Summarise the residual vector: its shape and its L2 norm.
print(f'Size of errors : {errors.shape}')
print(f'L2 Norm of errors : {np.linalg.norm(errors)}')


Size of errors : (500,)
L2 Norm of errors : 13297007.321853261


In [57]:
# Relative residuals: each error divided by the corresponding observed price.
# NOTE(review): assumes no observed price is zero — confirm for this dataset.
rel_errors = errors / y
print(f'L2 Norm of relative errors : {np.linalg.norm(rel_errors)}')

L2 Norm of relative errors : 22.35214323542266


In [58]:
# Calculate the mean of the squared errors with an explicit Python loop, so it
# can be compared against the vectorised form.
squared_error_total = 0
for predicted, observed in zip(predictions, y):
    squared_error_total += (predicted - observed)**2

loss_loop = squared_error_total / n_samples

In [59]:
# Mean squared error as an inner product: (errors . errors) / n_samples.
# errors is one-dimensional, so its transpose is itself and can be omitted.
loss_matrix = errors @ errors / n_samples

In [60]:
# Compare the loop-based loss with the matrix-based loss.
# NOTE: despite its name, is_diff is True when the two losses AGREE
# (np.allclose reports closeness, not difference).
is_diff = np.allclose(loss_loop, loss_matrix)
print(f'Are the losses the same? {is_diff}')

Are the losses the same? True


In [61]:
# Objective function: f(coefs) = 1/n_samples * ||X @ coefs - y||^2
# Gradient of f(coefs) = 2/n_samples * X^T @ (X @ coefs - y)

# What is a solution?
# A solution is a set of coefficients that minimizes the objective function.

# How do we find a solution?
# By searching for the coefficients at which the gradient is zero,
# or by setting the gradient to zero and solving for the coefficients.

# Write the loss in terms of the data and coefs; the residual is computed once
# and reused on both sides of the inner product.
residuals = X @ coefs - y
loss_matrix = residuals @ residuals / n_samples

In [62]:
# Calculate the gradient of the loss with respect to the coefficients:
#   2/n_samples * X^T @ (X @ coefs - y)
# `/`, `*` and `@` share one precedence level and associate left to right, so the
# scalar 2/n_samples scales X.T before the matrix-vector product is taken.
gradient = 2/n_samples * X.T @ (X @ coefs - y)

In [63]:
# Set the gradient to zero and solve for coefs:
#   X^T @ X @ coefs = X^T @ y          (the normal equation)
# In closed form coefs = (X^T @ X)^-1 @ X^T @ y, but the inverse is not formed
# explicitly: solving the linear system directly is cheaper and more accurate.
# A singular X^T @ X still raises numpy.linalg.LinAlgError, as the inverse did.

coefs = np.linalg.solve(X.T @ X, X.T @ y)

In [64]:
# Save the fitted coefficients to a text file in fixed-point ('%f') format.
np.savetxt('coefs.txt', coefs, fmt='%f', delimiter=',')

In [65]:
# Recompute the predictions, now with the coefficients from the normal equation.
predictions = X @ coefs

In [66]:
# Residuals under the fitted coefficients (prediction minus observed price).
errors = predictions - y

In [67]:
# Print the L2 norm of the fitted model's errors, for comparison with the norm
# obtained from the initial all-ones coefficients.
print(f'L2 Norm of errors_model : {np.linalg.norm(errors)}')

L2 Norm of errors_model : 2240271.8037529774


In [68]:
# Relative errors of the fitted model: each residual divided by the observed price.
# NOTE(review): assumes no observed price is zero — confirm for this dataset.
relatve_errors = errors / y
print(f'L2 Norm of relative errors_model : {np.linalg.norm(relatve_errors)}')

L2 Norm of relative errors_model : 4.327097762677231


In [69]:
# Use all the features in the dataset (every column except the target, Price)
# to build a linear model.
feature_frame = df.drop('Price', axis=1)
X = feature_frame.values
y = df['Price'].values

# Refresh the problem dimensions for the wider design matrix.
n_samples, n_features = X.shape
print(f'number of samples and features : {n_samples, n_features}')

number of samples and features : (500, 11)


In [70]:
# Prepend the intercept column of ones to the design matrix.
# NOTE: X is rebuilt from itself, so re-running this cell stacks another column
# of ones onto X; re-run the cell that defines X before re-running this one.
X = np.hstack([np.ones((n_samples, 1)), X])

# Fit by least squares: solve the normal equation X^T @ X @ coefs = X^T @ y
# directly instead of forming (X^T @ X)^-1, which is slower and loses accuracy
# when X^T @ X is ill-conditioned.
coefs = np.linalg.solve(X.T @ X, X.T @ y)

In [71]:
# Save the all-feature coefficients to a file in fixed-point ('%f') format.
np.savetxt('coefs_all.csv', coefs, fmt='%f', delimiter=',')

In [72]:
# Calculate the rank of X^T @ X. The normal equation has a unique solution only
# when this matrix has full rank, i.e. rank equal to the number of columns of X.
rank = np.linalg.matrix_rank(X.T @ X)
print(f'Rank of X^T @ X : {rank}')

Rank of X^T @ X : 12


In [73]:
# Solve the least-squares problem using a matrix decomposition instead of the
# explicit inverse.
# QR decomposition: X = Q @ R
Q, R = np.linalg.qr(X)

# Inspect the shapes of the two factors.
print(f'Shape of Q : {Q.shape}')
print(f'Shape of R : {R.shape}')

Shape of Q : (500, 12)
Shape of R : (12, 12)


In [74]:
# Save the R factor to a CSV file so its structure can be inspected.
np.savetxt('R.csv', R, fmt='%f', delimiter=',')

In [75]:
# Orthogonality check: if the columns of Q are orthonormal, Q^T @ Q is the
# identity matrix. The product is written to a file for inspection.
sol = Q.T @ Q
np.savetxt('sol.csv', sol, fmt='%f', delimiter=',')

In [76]:
# Solve the least-squares problem through the QR factors.
#
# X = Q @ R
# X^T @ X = R^T @ Q^T @ Q @ R = R^T @ R
# X^T @ y = R^T @ Q^T @ y
# so the normal equation reduces to  R @ coefs = Q^T @ y =: b

b = Q.T @ y

print(f'Shape of b : {b.shape}')
print(f'Shape of R : {R.shape}')

# Reference solution: solve R @ coefs = b directly rather than forming R^-1,
# which is slower and less accurate.
coefs_qr = np.linalg.solve(R, b)

# Back substitution for R @ coefs = b: walk the rows from last to first. Only
# the entries R[i, i:] are read, so in row i every unknown to the right of the
# diagonal has already been solved for. The loop is sized from R itself rather
# than from a separately tracked feature count.
n_coefs = R.shape[0]
coefs_qr_loop = np.zeros(n_coefs)
for i in range(n_coefs - 1, -1, -1):
    # Contribution of the already-solved unknowns (an empty dot product, i.e.
    # 0.0, for the last row).
    solved_part = R[i, i + 1:] @ coefs_qr_loop[i + 1:]
    coefs_qr_loop[i] = (b[i] - solved_part) / R[i, i]

# Check the back-substitution result against the direct solve ...
is_same = np.allclose(coefs_qr, coefs_qr_loop)
print(f'Are the coefficients the same? {is_same}')
# ... and the QR result against the normal-equation coefficients.
is_same = np.allclose(coefs_qr, coefs)
print(f'Are the coefficients the same? {is_same}')

np.savetxt('coefs_qr.csv', coefs_qr_loop, fmt='%f', delimiter=',')

Shape of b : (12,)
Shape of R : (12, 12)
Are the coefficients the same? True
Are the coefficients the same? True


In [77]:
# Solving the least-squares problem using SVD
# X = U @ S @ V^T
# Pseudo-inverse of X: X^+ = V @ S^-1 @ U^T
# full_matrices=False asks for the reduced factorisation.
U, S, Vt = np.linalg.svd(X, full_matrices=False)


In [78]:
#Eigen decomposition of square matrix
#A = V @ D @ V^-1
#A^-1 = V @ D^-1 @ V^-1
#A = X^T @ X -> symmetric square matrix
#A = V @ D @ V^T , A^-1 = V @ D^-1 @ V^T
#

#X @ coefs = y
#Normal Equation : X^T @ X @ coefs = X^T @ y

#Find inverse of X in least squares sense
#Pseudo inverse of X
#Xdagger = (X^T @ X)^-1 @ X^T

In [79]:
# Calculate coefs_svd using the SVD decomposition
#
# X = U @ S @ V^T
# X^T @ X = V @ S^2 @ V^T
# X^T @ y = V @ S @ U^T @ y
# coefs_svd = V @ S^-1 @ U^T @ y

U, S, Vt = np.linalg.svd(X, full_matrices=False)

# S is returned as a 1-D array of singular values, so applying S^-1 is an
# elementwise division; no diagonal matrix has to be built or inverted.
# Evaluating right to left keeps every intermediate result a vector.
coefs_svd = Vt.T @ ((U.T @ y) / S)

np.savetxt('coefs_svd.csv', coefs_svd, fmt='%f', delimiter=',')

# Check that the SVD solution matches the normal-equation coefficients
is_same = np.allclose(coefs_svd, coefs)
print(f'Are the coefficients the same? {is_same}')

Are the coefficients the same? True


In [80]:
# Calculate the coefficients using the eigen decomposition of X^T @ X
# X^T @ X = V @ D @ V^T
# Normal Equation : X^T @ X @ coefs = X^T @ y
# coefs = V @ D^-1 @ V^T @ X^T @ y
#
# X^T @ X is symmetric, so the symmetric solver eigh is used: it returns real
# eigenvalues and orthonormal eigenvectors, which is what allows V^T to stand in
# for V^-1 above. The general-purpose eig gives neither guarantee.
d, v = np.linalg.eigh(X.T @ X)

# d is a 1-D array of eigenvalues, so D^-1 is an elementwise division; evaluating
# right to left keeps every intermediate result a vector.
coefs_eigen = v @ ((v.T @ (X.T @ y)) / d)

np.savetxt('coefs_eigen.csv', coefs_eigen, fmt='%f', delimiter=',')

# Check that the eigen-decomposition solution matches the normal-equation coefficients
is_same = np.allclose(coefs_eigen, coefs)
print(f'Are the coefficients the same? {is_same}')

Are the coefficients the same? True
