# Euclidean Distance

In [1]:
import numpy as np
import math

In [2]:
point1 = np.array((1,2,3))
point2 = np.array((1,1,1))

In [3]:
t = point1 - point2
t

array([0, 1, 2])

In [4]:
sum_squares = np.dot(t.T,t)
sum_squares

5

In [5]:
np.sqrt(sum_squares)

2.23606797749979

In [6]:
distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(point1, point2)]))
distance

2.23606797749979

In [7]:
(1-1)^2 + (1-1)^2 + (3-1)^2

4

In [8]:
np.sqrt(4)

2.0

# What is euclidean distance telling the user.

# Cosine Distance or Cosine Similarity

##### DOT Product A.B = |A||B|cos(theta)

In [10]:
from numpy.linalg import norm

# L1 and L2 Norms
### |-1|+|-2|+3+4+5 = 15  - --  -> L1 Norm
##### sqrt(1^2 + 2^2 + 3^2 + 4^2 + 5^2) = 7.416 -  -------> L2 Norm

In [17]:
norm(point1,1)

6.0

In [18]:
norm(point1,2)

3.7416573867739413

In [19]:
from numpy import dot

In [22]:
cos_sim = dot(point1, point2)/(norm(point1)*norm(point2))
cos_sim

0.9258200997725515

# What is cos(0)

In [23]:
math.cos(0)

1.0

In [29]:
math.cos(9)

-0.9111302618846769

# What is cosine similarity telling the user?

# Which of these two is better for finding word similarities?

In [30]:
O = [0.00, 0.00]
A = [1.45, 7.56]
B = [7.81, 12.41]
C = [8.83, 4.48]

In [33]:
distance_A = math.sqrt(sum([(a - b) ** 2 for a, b in zip(A, B)]))
distance_B = math.sqrt(sum([(a - b) ** 2 for a, b in zip(B, C)]))
distance_C = math.sqrt(sum([(a - b) ** 2 for a, b in zip(C, A)]))
distance_A, distance_B, distance_C

(7.998256059917061, 7.995329886877713, 7.996924408796171)

In [34]:
cos_sim_A = dot(A, B)/(norm(A)*norm(B))
cos_sim_B = dot(B, C)/(norm(B)*norm(C))
cos_sim_C = dot(A, C)/(norm(A)*norm(C))
cos_sim_A,cos_sim_B,cos_sim_C

(0.9315258342391336, 0.8579300679601176, 0.6123399158783746)

# Try this

In [38]:
A = [8.00, 2.00]
B = [12.00, 3.00]
C = [32.00, 8.00]

In [39]:
A

[8.0, 2.0]

In [40]:
distance_A = math.sqrt(sum([(a - b) ** 2 for a, b in zip(A, B)]))
distance_B = math.sqrt(sum([(a - b) ** 2 for a, b in zip(B, C)]))
distance_C = math.sqrt(sum([(a - b) ** 2 for a, b in zip(C, A)]))
distance_A, distance_B, distance_C

(4.123105625617661, 20.615528128088304, 24.73863375370596)

In [41]:
cos_sim_A = dot(A, B)/(norm(A)*norm(B))
cos_sim_B = dot(B, C)/(norm(B)*norm(C))
cos_sim_C = dot(A, C)/(norm(A)*norm(C))
cos_sim_A,cos_sim_B,cos_sim_C

(1.0, 1.0, 1.0)

# Comment on the two approaches and make judgement on when to use Euclidean or Cosine Distance metrics.

###### This is a visual representation of euclidean distance (d) and cosine similarity (θ). While cosine looks at the angle between vectors (thus not taking into regard their weight or magnitude), euclidean distance is similar to using a ruler to actually measure the distance. 

# Why Cosine Similarity Matter for Text data.

###### Cosine similarity is generally used as a metric for measuring distance when the magnitude of the vectors does not matter. This happens for example when working with text data represented by word counts

In [42]:
d1 = np.array((5,0,5,6,3,9,8,7,5,6))
d2 = np.array((3,0,2,4,6,9,8,5,2,1))

In [45]:
cos_sim_A = dot(d1, d2)/(norm(d1)*norm(d2))
cos_sim_A

0.9074362105351957

In [49]:
np.sum(d1*d2)

263

In [47]:
dot(d1,d2)

263

In [50]:
norm(d1)

18.708286933869708

In [59]:
A = np.array((1,0,1,1))
B = np.array((1,0,1,0))

In [60]:
t = A - B
t

array([0, 0, 0, 1])

In [61]:
sum_squares = np.dot(t.T,t)
sum_squares
e_dist = np.sqrt(sum_squares)
e_dist

1.0

In [62]:
cos_sim_A = dot(A, B)/(norm(A)*norm(B))
cos_sim_A

0.8164965809277259

In [5]:
import numpy as np
np.sqrt(2)*np.sqrt(3)

2.4494897427831783

In [6]:
2/2.44

0.819672131147541

https://cmry.github.io/notes/euclidean-v-cosine