In [1]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler
from numpy.linalg import inv,pinv,LinAlgError

### Data Preparation 

In [2]:
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs against scikit-learn < 1.2.
idata, odata = datasets.load_boston(return_X_y = True)

# Number of leading rows used for training; every remaining row is used for testing.
N_TRAIN = 400


def add_bias_column(features):
    """Return a copy of `features` with a column of ones prepended.

    Parameters
    ----------
    features : 2-D array of shape (rows, cols)

    Returns
    -------
    2-D float array of shape (rows, cols + 1); column 0 is the dummy
    column of ones, columns 1.. hold `features` unchanged.
    """
    # Start from all ones so column 0 is already the dummy column,
    # then overwrite the remaining columns with the real attributes.
    with_bias = np.ones((features.shape[0], features.shape[1] + 1))
    with_bias[:, 1:] = features
    return with_bias


# Training data: first N_TRAIN rows, with the dummy column added
in_train = add_bias_column(idata[:N_TRAIN, :])
out_train = odata[:N_TRAIN]

# Testing data: all rows after the training rows, with the dummy column added
in_test = add_bias_column(idata[N_TRAIN:, :])
out_test = odata[N_TRAIN:]

print('Shape of Input Train Data : {}'.format(in_train.shape))
print('Shape of Input Test Data : {}'.format(in_test.shape))

Shape of Input Train Data : (400, 14)
Shape of Input Test Data : (106, 14)


In [3]:
# Standardise the attribute columns; column 0 is the dummy column and is
# deliberately left out of both the fit and the transform.
# The scaler's statistics are estimated from the training rows only.
scaler = StandardScaler().fit(in_train[:,1:])

# Overwrite the training attributes, in place, with their scaled values
in_train[:,1:] = scaler.transform(in_train[:,1:])
# Scale the testing attributes with the same (training-derived) scaler
in_test[:,1:] = scaler.transform(in_test[:,1:])

# Scaling must not change the array shapes
print('Shape of Input Train Data : {}'.format(in_train.shape))
print('Shape of Input Test Data : {}'.format(in_test.shape))

Shape of Input Train Data : (400, 14)
Shape of Input Test Data : (106, 14)


### Linear Regression using Gradient Descent 

In [4]:
# Hyperparameters
niterations = 1000  # number of gradient-descent steps
alpha = 0.01        # learning rate
SEED = 42           # fixes the random starting point so re-runs give the same thetas

m = in_train.shape[0] # no of rows
n = in_train.shape[1] # no of cols (dummy column + attributes)

# Initial thetas drawn uniformly from [0, 1). A dedicated seeded generator is
# used so the result is reproducible without touching numpy's global RNG state.
thetas = np.random.RandomState(SEED).uniform(0, 1, n)

for i in range(niterations):
    # output prediction for every training row using the current thetas
    ypred = np.dot(in_train, thetas)
    # error in prediction w.r.t. ground truth
    error = ypred - out_train
    # All n partial sums in one matrix-vector product:
    # update[j] = sum over rows of error * (column j of in_train)
    update = np.dot(in_train.T, error)
    thetas = thetas - (alpha/m)*update

print('Thetas:',thetas)

# testing the model with test data
predictions = np.dot(in_test,thetas)

mae = metrics.mean_absolute_error(y_true = out_test,y_pred = predictions)
mse = metrics.mean_squared_error(y_true = out_test,y_pred = predictions)

print('MAE: ',mae)
print('MSE: ',mse)

Thetas: [24.33348501 -1.02392755  0.92697006  0.030713    0.55318308 -1.20739345
  3.73784288 -0.04425876 -2.5525599   2.16114583 -1.18243271 -1.6576885
  0.04667597 -3.4584818 ]
MAE:  4.810013930209505
MSE:  33.36485954822415


### Multiple Linear Regression using Normal Equation Method 
$$\theta = (X^T X)^{-1} X^T Y$$

In [5]:
# Normal equation: theta satisfies (X^T . X) . theta = X^T . y,
# which is the same theta as inverse(X^T . X) . X^T . y.
XTX = np.dot(in_train.T, in_train)
XTy = np.dot(in_train.T, out_train)

try:
    # Solve the linear system directly instead of forming the inverse explicitly.
    theta = np.linalg.solve(XTX, XTy)
except LinAlgError:
    # X^T . X is singular: fall back to the pseudo-inverse.
    theta = np.dot(pinv(XTX), XTy)

print('Thetas: ',theta)

# testing the model with test data (one prediction per test row)
predictions = np.dot(in_test, theta)

mae = metrics.mean_absolute_error(y_true = out_test,y_pred = predictions)
mse = metrics.mean_squared_error(y_true = out_test,y_pred = predictions)

print('MAE: ',mae)
print('MSE: ',mse)

Thetas:  [24.3345     -1.14370921  1.12191092  0.35913222  0.48497247 -1.7061696
  3.58169796  0.07554815 -2.8156326   3.05189603 -1.97502535 -1.7937352
 -0.05252128 -3.50239563]
MAE:  5.142232214465328
MSE:  37.893778599602385
