# Distance Functions

In [None]:
import numpy as np
import time

### Compute L2 distance between 2 vectors

In [None]:
# Two column vectors, each with 3 rows and 1 column:
#   X = (1, 2, 3)^T      Y = (6, 5, 4)^T
X = np.array([1, 2, 3]).reshape(-1, 1)
Y = np.array([6, 5, 4]).reshape(-1, 1)
# Show both vectors so the column layout is visible in the output
print("X: \n{}".format(X))
print("Y: \n{}".format(Y))

In [None]:
# L2 (Euclidean) distance: square the element-wise differences, add them up
# down the rows (axis 0), then take the square root.
distance = np.sqrt(np.square(X - Y).sum(axis=0))
print("L2 distance: {}".format(distance))

### Compute distance between Y and each of many vectors in Xdata

In [None]:
# loop over points in Xdata
def distance1(Xdata,Y):
    """Compute the L2 distance from Y to each column of Xdata using a loop.

    Xdata is the dataset/feature matrix with d rows and nsample columns.
    Y is a single data point with d rows and 1 column.
    The output is a float array with 1 row and nsample columns.
    """
    nsample = Xdata.shape[1]
    output = np.zeros((1, nsample))
    for i in range(nsample):
        # Xdata[:, [i]] keeps the column 2-D (d rows, 1 column) so it lines up with Y.
        # Reduce over every entry so the result is a true scalar: summing with
        # axis=0 leaves a 1-element array, and storing such an array into a
        # single element is deprecated in NumPy >= 1.25.
        output[0, i] = np.sqrt(np.sum(np.square(Xdata[:, [i]] - Y)))
    return output

In [None]:
# use vectorized approach
def distance2(Xdata,Y):
    """Compute the L2 distance from Y to each column of Xdata without a loop.

    Xdata is the dataset/feature matrix with d rows and nsample columns.
    Y is a single data point with d rows and 1 column.
    The output has 1 row and nsample columns.
    """
    # Broadcasting subtracts the single column Y from every column of Xdata
    offsets = Xdata - Y
    # Collapse the rows (axis 0) but keep that axis so the result stays 2-D
    squared_norms = np.sum(np.square(offsets), axis=0, keepdims=True)
    return np.sqrt(squared_norms)

In [None]:
# Xdata holds 3 data points, one per column (3 rows x 3 columns):
#   point 0 = (1, 2, 3)   point 1 = (4, 5, 6)   point 2 = (7, 8, 9)
# Stack the points side by side as the columns of a 2d array
Xdata = np.column_stack(([1, 2, 3], [4, 5, 6], [7, 8, 9]))
print("Xdata: \n{}".format(Xdata))
print("Y: \n{}".format(Y))

In [None]:
# Distance from Y to every column of Xdata, computed point by point in a loop
dist_loop = distance1(Xdata, Y)
print("distance_loop: {}".format(dist_loop))
# Same distances from the vectorized implementation, for comparison
dist_vec = distance2(Xdata, Y)
print("distance_vec: {}".format(dist_vec))

### Timing comparison: looping versus vectorized approach

In [None]:
# Create a random dataset for the timing comparison: 5 dimensions, 1000 samples.
# The shape is named once so the comment and the generated data stay in sync.
n_dims, n_samples = 5, 1000
# One standard-normal data point per column (n_dims rows, n_samples columns)
Xdata = np.random.randn(n_dims, n_samples)
# Single data point to measure distances from (n_dims rows, 1 column)
Y = np.random.randn(n_dims, 1)

In [None]:
# Looping case.
# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is a wall clock that can be coarse and can jump if the system
# time is adjusted.
time_start_loop = time.perf_counter()
dist_loop = distance1(Xdata, Y)
time_end_loop = time.perf_counter()
# Elapsed time in seconds
print("Time loop: {}".format(time_end_loop - time_start_loop))

In [None]:
# Vectorized case.
# time.perf_counter() is used because this call can finish in less than one
# tick of the time.time() wall clock, which would report an elapsed time of 0.
time_start_vec = time.perf_counter()
dist_vec = distance2(Xdata, Y)
time_end_vec = time.perf_counter()
# Elapsed time in seconds
print("Time vectorized: {}".format(time_end_vec - time_start_vec))

In [None]:
# Largest absolute disagreement between the looped and vectorized results
print("diff: {}".format(np.abs(dist_loop - dist_vec).max()))