In [2]:
######### VECTORIZED IMPLEMENTATION: LINEAR REGRESSION ########
# Colab-only step: upload the .xlsx workbooks that the pd.read_excel calls
# in this notebook read by bare filename.
from google.colab import files
uploaded = files.upload()

Saving test_feature_matrix.xlsx to test_feature_matrix.xlsx
Saving test_output.xlsx to test_output.xlsx
Saving training_feature_matrix.xlsx to training_feature_matrix.xlsx
Saving training_output.xlsx to training_output.xlsx


In [3]:
import pandas as pd
import numpy as np

In [14]:
# Load train/test features and targets, in that order; header = None means
# no row of the workbook is consumed as column names.
X_train, Y_train, X_test, Y_test = (
  pd.read_excel(workbook, header = None)
  for workbook in (
    'training_feature_matrix.xlsx',
    'training_output.xlsx',
    'test_feature_matrix.xlsx',
    'test_output.xlsx',
  )
)

In [15]:
# Show the (rows, columns) shape of each loaded frame, one per line.
print(*(frame.shape for frame in (X_train, Y_train, X_test, Y_test)), sep = "\n")

(245, 2)
(245, 1)
(104, 2)
(104, 1)


In [6]:
## data has 2 features, we need to add a column of ones because x0 = 1
# Label line followed by the first rows of the raw training features.
print("X_train", X_train.head(), sep = "\n")

X_train
     0        1
0  2.6  1360.00
1  2.0  1292.25
2  1.2  1185.00
3  2.0  1333.25
4  4.8  1580.75


In [16]:
# Per-column minimum and maximum of the TRAINING data, kept as plain lists
# so they can be indexed by column position inside normalize().
min_features_X, max_features_X = list(X_train.min(axis = 0)), list(X_train.max(axis = 0))
min_features_Y, max_features_Y = list(Y_train.min(axis = 0)), list(Y_train.max(axis = 0))

# Report the bounds that will drive the scaling.
print("min_features_X is: ", min_features_X)
print("max_features_X is: ", max_features_X)
print("min_features_Y is: ", min_features_Y)
print("max_features_Y is: ", max_features_Y)

min_features_X is:  [-200.0, 842.0]
max_features_X is:  [8.1, 2039.75]
min_features_Y is:  [10.349999904633]
max_features_Y is:  [29.275000095367]


In [17]:
# normalize data using minimum and maximum feature values of training data
def normalize(data, min_feature, max_feature):
  """Min-max scale every column of `data`, addressing columns by position.

  Column i of the result is
  (data.iloc[:, i] - min_feature[i]) / (max_feature[i] - min_feature[i]).

  Args:
    data: DataFrame whose columns are to be scaled.
    min_feature: per-column lower bounds, indexable by column position.
    max_feature: per-column upper bounds, indexable by column position.

  Returns:
    A new DataFrame with the same row index as `data` and columns labelled
    by position (0, 1, ...), whatever the labels in `data` were.
  """
  def _scaled(position):
    low = min_feature[position]
    high = max_feature[position]
    return (data.iloc[:, position] - low) / (high - low)

  return pd.DataFrame({position: _scaled(position) for position in range(data.shape[1])})

In [18]:
# Scale training features and targets with the training-set bounds.
X_train_norm = normalize(data = X_train, min_feature = min_features_X, max_feature = max_features_X)
Y_train_norm = normalize(data = Y_train, min_feature = min_features_Y, max_feature = max_features_Y)

In [19]:
## adding a column of ones to training data
# Only insert while the frame still has the raw feature count: after the
# relabelling below a column named '2' exists, so an unguarded re-run of this
# cell would raise "cannot insert 2, already exists".
if X_train_norm.shape[1] == X_train.shape[1]:
  X_train_norm.insert(0, '2', 1)
# Relabel columns '0', '1', ... by assignment; set_axis(..., inplace = True)
# is no longer accepted by pandas >= 2.0.
X_train_norm.columns = [str(i) for i in range(X_train_norm.shape[1])]
print("X_train_norm now is: ")
print(X_train_norm.head())

X_train_norm now is: 
   0         1         2
0  1  0.973570  0.432478
1  1  0.970687  0.375913
2  1  0.966843  0.286370
3  1  0.970687  0.410144
4  1  0.984142  0.616781


In [8]:
# matrix1 = (X.T * X)^-1
def matrix1(X):
  """Return the inverse of the Gram matrix X^T X.

  Args:
    X: 2-D design matrix exposing `.T` (ndarray or DataFrame).

  Returns:
    The inverse computed by np.linalg.inv; a singular X^T X makes
    np.linalg.inv raise LinAlgError.
  """
  gram = np.dot(X.T, X)
  return np.linalg.inv(gram)

In [9]:
# matrix2 = (X.T * Y)
def matrix2(X, Y):
  """Return the matrix product X^T Y.

  Args:
    X: 2-D design matrix exposing `.T`.
    Y: target matrix with as many rows as X.
  """
  X_transposed = X.T
  return np.dot(X_transposed, Y)

In [12]:
# w = ((X.T * X)^-1)*(X.T * Y)
def param(matrix1, matrix2):
  """Return the matrix product matrix1 . matrix2.

  Called in this notebook with an (optionally regularised) inverse Gram
  matrix on the left and an X^T Y style term on the right, which yields the
  weight vector. The parameter names shadow the functions of the same name
  only inside this body.
  """
  return np.dot(matrix1, matrix2)

In [38]:
# Ordinary least squares via the normal equations.
M1 = matrix1(X = X_train_norm)                       # (X.T * X)^-1
M2 = matrix2(X = X_train_norm, Y = Y_train_norm)     # X.T * Y
W = param(M1, M2)                                    # weights
# Show the weights and confirm the container type.
print(W, type(W), sep = "\n")

[[0.17536947]
 [0.06644076]
 [0.27309299]]
<class 'numpy.ndarray'>


In [25]:
# predict output values
def predict(test_data, W):
  """Return test_data . W wrapped in a new DataFrame.

  Args:
    test_data: 2-D feature matrix whose column count matches the rows of W.
    W: weight matrix.

  Returns:
    DataFrame with default integer row and column labels.
  """
  scores = np.dot(test_data, W)
  return pd.DataFrame(scores)

In [27]:
# Scale the test features with the TRAINING bounds so both sets share one scale.
X_test_norm = normalize(data = X_test, min_feature = min_features_X, max_feature = max_features_X)

In [28]:
## adding a column of ones to test data
# Only insert while the frame still has the raw feature count: after the
# relabelling below a column named '2' exists, so an unguarded re-run of this
# cell would raise "cannot insert 2, already exists".
if X_test_norm.shape[1] == X_test.shape[1]:
  X_test_norm.insert(0, '2', 1)
# Relabel columns '0', '1', ... by assignment; set_axis(..., inplace = True)
# is no longer accepted by pandas >= 2.0.
X_test_norm.columns = [str(i) for i in range(X_test_norm.shape[1])]
print("X_test_norm now is: ")
print(X_test_norm.head())

X_test_norm now is: 
   0         1         2
0  1  0.971648  0.467543
1  1  0.971648  0.445418
2  1  0.968765  0.359215
3  1  0.966843  0.296389
4  1  0.965882  0.245669


In [31]:
# Predictions of the least-squares model, still in normalized target units.
Y_pred = predict(test_data = X_test_norm, W = W)

In [32]:
# Training-target bounds as pandas Series (one entry per target column),
# used to map normalized predictions back to original units.
min_Y, max_Y = Y_train.min(), Y_train.max()
def denormalize(Y_predict, min_val, max_val):
  """Undo min-max scaling: Y_predict * (max_val - min_val) + min_val.

  The arithmetic applies to the whole container at once, so no per-row loop
  is needed and empty input yields an empty result instead of failing.

  Args:
    Y_predict: normalized predictions (scalar, ndarray, Series or DataFrame).
    min_val: lower bound used when the targets were normalized.
    max_val: upper bound used when the targets were normalized.

  Returns:
    Predictions in original units, in whatever container type the
    arithmetic on the inputs produces.
  """
  return Y_predict*(max_val-min_val) + min_val

In [33]:
# Map the least-squares predictions back to original target units.
Y_out = denormalize(Y_predict = Y_pred, min_val = min_Y, max_val = max_Y)

In [34]:
# mean square error
def mean_sqr_error(Y_pred, Y_test):
  """Return the mean squared error between predictions and targets.

  Both inputs are converted to flat float arrays and compared element by
  element in positional order, so pandas row/column labels play no part.

  Args:
    Y_pred: predicted values (array-like or DataFrame).
    Y_test: true values with the same number of elements.

  Returns:
    The mean of the squared differences as a Python float.

  Raises:
    ValueError: if the inputs do not hold the same number of values.
  """
  predicted = np.asarray(Y_pred, dtype = float).ravel()
  actual = np.asarray(Y_test, dtype = float).ravel()
  if predicted.size != actual.size:
    raise ValueError(
      "Y_pred and Y_test must hold the same number of values, got %d and %d"
      % (predicted.size, actual.size)
    )
  # Plain positional arithmetic: subtracting DataFrames would align on labels
  # (NaN wherever they differ), and an (n, 1) array against an (n,) array
  # would broadcast to an (n, n) matrix.
  return float(np.mean((actual - predicted)**2))

In [36]:
# Test-set error of the least-squares model, in original target units.
mean_sq_error = mean_sqr_error(Y_pred = Y_out, Y_test = Y_test)
print("mean square error is: ", mean_sq_error)

mean square error is:  40.17560879752857


In [40]:
##### VECTORIZED IMPLEMENTATION: RIDGE REGRESSION #####

# matrix3 = (X.T * X + lambda * I)^-1
def matrix3(X, l = 0.2):
  """Return the inverse of the ridge-regularised Gram matrix X^T X + l * I.

  Args:
    X: 2-D design matrix exposing `.T` and `.shape`.
    l: regularisation strength; it is added to every diagonal entry, so any
      constant (bias) column in X is penalised as well.
  """
  gram = np.dot(X.T, X)
  penalty = l * np.identity(X.shape[1], float)
  return np.linalg.inv(gram + penalty)

In [72]:
# Ridge weights: (X.T * X + l * I)^-1 * (X.T * Y) with l = 0.1.
M4 = matrix3(X = X_train_norm, l = 0.1)              # regularised inverse
M5 = matrix2(X = X_train_norm, Y = Y_train_norm)     # X.T * Y
W1 = param(M4, M5)                                   # product of the two matrices
print(W1)

[[0.17339954]
 [0.06952482]
 [0.27051344]]


In [73]:
# Predictions of the ridge model, still in normalized target units.
Y_pred1 = predict(test_data = X_test_norm, W = W1)

In [74]:
# Map the ridge predictions back to original target units.
Y_out1 = denormalize(Y_predict = Y_pred1, min_val = min_Y, max_val = max_Y)

In [75]:
# Test-set error of the ridge model, in original target units.
mean_sq_error1 = mean_sqr_error(Y_pred = Y_out1, Y_test = Y_test)
print("mean square error is: ", mean_sq_error1)

mean square error is:  40.18865086102616


In [78]:
####### VECTORIZED IMPLEMENTATION: LEAST ANGLE REGRESSION #####
# NOTE(review): the update used in the following cells,
# (X.T * X)^-1 * (X.T * Y - 0.5 * lambda * sgn(W)), looks like an L1-penalised
# (lasso-style) step rather than the LARS algorithm - confirm the intended name.

# Seeded random starting weights, one row per design-matrix column
# (bias + 2 features); matrix4 only reads their signs via np.sign.
np.random.seed(2)
W2 = np.random.randn(3,1)
W2

array([[-0.41675785],
       [-0.05626683],
       [-2.1361961 ]])

In [79]:
# matrix4 = (X.T * Y) - 0.5 * lambda * sgn(W2)
def matrix4(X, Y, W2, l1 = 0.2):
  """Return X^T Y shifted by the sign-based term 0.5 * l1 * sgn(W2).

  Args:
    X: 2-D design matrix exposing `.T`.
    Y: target matrix with as many rows as X.
    W2: current weights; only their element-wise signs are used.
    l1: strength of the sign-based penalty.
  """
  correlation = np.dot(X.T, Y)
  shrinkage = 0.5*l1*np.sign(W2)
  return correlation - shrinkage

In [80]:
# One closed-form update: W2 <- (X.T * X)^-1 * (X.T * Y - 0.5 * l1 * sgn(W2)).
# NOTE: W2 is both read and overwritten here, so re-running this cell feeds
# the previously fitted weights (not the random start) into sgn(W2).
M6 = matrix1(X = X_train_norm)                                        # (X.T * X)^-1
M7 = matrix4(X = X_train_norm, Y = Y_train_norm, W2 = W2, l1 = 0.2)   # penalised X.T * Y
W2 = param(M6, M7)                                                    # product of the two matrices
print(W2)

[[0.17468177]
 [0.06494171]
 [0.27933628]]


In [81]:
# Predictions of the L1-penalised model, still in normalized target units.
Y_pred2 = predict(test_data = X_test_norm, W = W2)

In [82]:
# Map the L1-penalised predictions back to original target units.
Y_out2 = denormalize(Y_predict = Y_pred2, min_val = min_Y, max_val = max_Y)

In [83]:
# Test-set error of the L1-penalised model, in original target units.
mean_sq_error2 = mean_sqr_error(Y_pred = Y_out2, Y_test = Y_test)
print("mean square error is: ", mean_sq_error2)

mean square error is:  40.12232755156738
