# This example belongs to @Yang Liu
https://towardsdatascience.com/vectorization-implementation-in-machine-learning-ca652920c55d

# Vectorization Implementation in Machine Learning

In [None]:
#libraries 
import numpy as np 
import matplotlib.pyplot as plt 

import pandas as pd 
import seaborn as sns 

%matplotlib inline 

In [2]:
from sklearn.datasets import load_boston 
boston_dataset = load_boston()

In [3]:
boston_dataset.DESCR

".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000

In [4]:
df = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
df['MEDV'] = boston_dataset.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
Xd = df.drop(columns=['MEDV'])
Xd.insert(0, 'XO', 1)
Xd.head()

Unnamed: 0,XO,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,1,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,1,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,1,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [6]:
#numpy array format 
X= Xd.values 
y= df.MEDV.values 


#scaling 
from sklearn.preprocessing import StandardScaler
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X[:,1:] = scaler_X.fit_transform(X[:,1:])

#sample size 
m = len(df.index)
print(m)

#number of features 
n = X.shape[1]
print(n)
print(X)

506
14
[[ 1.         -0.41978194  0.28482986 ... -1.45900038  0.44105193
  -1.0755623 ]
 [ 1.         -0.41733926 -0.48772236 ... -0.30309415  0.44105193
  -0.49243937]
 [ 1.         -0.41734159 -0.48772236 ... -0.30309415  0.39642699
  -1.2087274 ]
 ...
 [ 1.         -0.41344658 -0.48772236 ...  1.17646583  0.44105193
  -0.98304761]
 [ 1.         -0.40776407 -0.48772236 ...  1.17646583  0.4032249
  -0.86530163]
 [ 1.         -0.41500016 -0.48772236 ...  1.17646583  0.44105193
  -0.66905833]]


In [7]:
#initialize theta 
theta = np.ones(n)

print(theta)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [8]:
#Hypothesis Implementation: For Loop

#hypothesis for the first sample 
hypo = 0
for j in range(n):
    hypo += theta[j]*X[0,j]
hypo

-4.148767317690631

In [9]:
#hypothesis for all the samples 
all_hypo = [] 
for i in range(m):
    hypo_i = 0
    for j in range(n):
        hypo_i += theta[j]*X[i,j]
    all_hypo.append(hypo_i)

all_hypo

[-4.148767317690631,
 -2.602396239666201,
 -2.9078504895707216,
 -3.726441463707478,
 -2.851285511285309,
 -3.7646068095591465,
 -2.2054987773581436,
 0.2196460935336595,
 1.056380914054745,
 -0.4735342716713161,
 0.7795847998115915,
 -1.1780490967137422,
 -3.009813365075279,
 -1.2118687731040505,
 -0.21574616838841482,
 -1.648353853645355,
 -2.783158917179846,
 0.1456700443429742,
 -3.9256542800046788,
 -1.3181245023195984,
 0.7447382915280538,
 0.21352997942525948,
 1.3154038428802872,
 1.3064060334458971,
 0.867596865705546,
 -0.8248078535444139,
 0.2992689909564807,
 0.07690456473656715,
 1.159912652733557,
 0.887134668843031,
 1.0406355739461377,
 0.6018895385741836,
 0.17997576502099255,
 0.23496930572619357,
 -0.026268042793665458,
 -3.0528350215409974,
 -3.3830273196294747,
 -3.973434274420412,
 -4.0876829029792665,
 -2.1990452119393495,
 -2.1286527833694957,
 -5.263543232613433,
 -5.8729514945311205,
 -5.46451546030033,
 -4.239627854788237,
 -5.125884962758213,
 -4.44124467733

In [10]:
# Cost function implementation: For Loop 

cost = 0 
for i in range(m):
    hypo_i = 0
    for j in range(n):
        hypo_i += theta[j]*X[i,j]
    cost_i = (hypo_i - y[i])**2
    cost += cost_i
cost = (1/(2*m))*cost

cost

301.9074068612505

In [11]:
# Derivation implementation

dev_sum = 0
for i in range(m):
    hypo_i = 0
    for j in range(n):
        hypo_i += theta[j]*X[i,j]
    dev_i = (hypo_i - y[i])*X[i,0]
    dev_sum += dev_i
dev_sum = (1/m)*dev_sum

dev_sum

-21.532806324110673

In [12]:
#To calculate the derivation to all θ and output a list, we need another for loop iterate over all the columns:

dev_list = [] 
for k in range(n):
    dev_sum = 0
    for i in range(m):
        hypo_i = 0
        for j in range(n):
            hypo_i += theta[j]*X[i,j]
        dev_i = (hypo_i - y[i])*X[i,k]
        dev_sum += dev_i 
    dev_sum = (1/m)*dev_sum
    
    dev_list.append(dev_sum)
dev_list

[-21.532806324110673,
 6.461531582810539,
 -4.454176288708005,
 7.634852916906943,
 -0.6457575250654759,
 7.0247269742311556,
 -7.276998502608709,
 6.274078330651083,
 -4.597651382768297,
 7.189965870296318,
 8.079554222518627,
 6.810280223167379,
 -4.2452688681675586,
 9.493191476506835]

Gradient Descent: 
In order to achieve descent optimization results, we set the iteration times to be 100 thousands. We need nested four for loops in order to achieve the gradient descent algorithm. The learning rate is set to be 0.0005 and the thetas are initialized as all ones. The code is shown as below:

In [14]:
a = 0.0005
theta = np.ones(n)

cost_list = [] 

for itr in range(100000):
    
    dev_list = [] 
    
    for k in range(n):
        dev_sum=0
        
        for i in range(m):
            hypo_i = 0
            for j in range(n):
                hypo_i += theta[j]*X[i,j]
            dev_i = (hypo_i - y[i])*X[i,k] # after derivative 
            dev_sum += dev_i
        dev_sum = (1/m)*dev_sum
        dev_list.append(dev_sum)
    
    # update theta's value for better results
    theta = theta - a*np.array(dev_list)
    
    cost_val = [theta]
    
    cost_list.append(cost_val)

KeyboardInterrupt: 

In [None]:
#Hypothesis Implementation : Vectorization 

# matrix format 
hypo = X@theta 


In [None]:
# Cost function implementation: Vectorization 

cost_vec = (1/(2*m))*np.transpose((X@theta - y))@(X@theta - y)
cost_vec

In [None]:
# The derivation of cost function to all θ can be vectorized as:
dev = (1/m)*np.transpose(X)@(X@theta - y)
dev

Gradient Descent: Vectorization
The implementation of vectorized gradient descent is super clean and elegant.

In [17]:
a = 0.0005
theta = np.ones(n)
cost_list = []
for i in range(100000):
    
    theta = theta - a*(1/m)*np.transpose(X)@(X@theta - y)
           
    cost_val = [theta]
    cost_list.append(cost_val)

cost_list

[[array([1.0107664 , 0.99676923, 1.00222709, 0.99618257, 1.00032288,
         0.99648764, 1.0036385 , 0.99686296, 1.00229883, 0.99640502,
         0.99596022, 0.99659486, 1.00212263, 0.9952534 ])],
 [array([1.02152742, 0.99354751, 1.00444557, 0.99237717, 1.00064545,
         0.99298688, 1.00726941, 0.9937365 , 1.00458706, 0.99282133,
         0.99193243, 0.99319758, 1.00423817, 0.99051821])],
 [array([1.03228306, 0.99033479, 1.00665546, 0.98858374, 1.0009677 ,
         0.98949771, 1.01089274, 0.99062058, 1.00686475, 0.9892489 ,
         0.98791659, 0.98980813, 1.00634664, 0.98579439])],
 [array([1.04303332, 0.98713105, 1.0088568 , 0.98480226, 1.00128964,
         0.98602007, 1.01450853, 0.98751518, 1.00913191, 0.98568769,
         0.98391267, 0.98642649, 1.00844806, 0.9810819 ])],
 [array([1.05377821, 0.98393627, 1.01104961, 0.98103268, 1.00161127,
         0.98255393, 1.01811679, 0.98442025, 1.01138858, 0.98213767,
         0.97992062, 0.98305264, 1.01054244, 0.97638072])],
 [array([1

# Conclusion
Here is a plot showing the running time difference of the two approaches implementing the same algorithm and using exactly the same learning rate and initial θ values. The two approaches achieved the same accuracy. However, the vectorization approach cost 1.75 seconds while the for loop cost 4558 seconds. The vectorization approach is 2600 times faster than the for loop approach.
