In [6]:

import numpy as np

def calculate_euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like of real numbers
        Anything ``np.asarray`` accepts (NumPy arrays, lists, tuples).
        The two inputs must have the same shape or be broadcastable.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared element-wise differences.
    """
    # Work in float64: this lets plain Python sequences be subtracted and
    # prevents unsigned / narrow integer dtypes from wrapping around in the
    # subtraction or the squaring.
    first = np.asarray(vec1, dtype=np.float64)
    second = np.asarray(vec2, dtype=np.float64)
    difference = first - second
    return np.sqrt(np.sum(difference ** 2))

def sortkey(item):
    """Sort key: return the element at index 1 of ``item``.

    Intended for ``[row, distance]`` pairs so that sorting orders them by
    distance.
    """
    second_field = item[1]
    return second_field

def knearest(vec, data, k):
    """Return the row indices of the ``k`` rows of ``data`` closest to ``vec``.

    Every row ``data[i]`` is scored with ``calculate_euclidean_distance`` and
    the ``[index, distance]`` pairs are sorted by distance (``sorted`` is
    stable, so ties keep their original row order).

    Parameters
    ----------
    vec : the query vector.
    data : indexable collection of rows; must support ``len`` and ``data[i]``.
    k : number of neighbours requested.

    Returns
    -------
    list
        Row indices ordered from nearest to farthest. If ``k`` is not smaller
        than the number of rows, all row indices are returned.
    """
    scored = [
        [idx, calculate_euclidean_distance(vec, data[idx])]
        for idx in range(0, len(data))
    ]
    ranked = sorted(scored, key=sortkey)

    if k < len(data):
        # Keep only the k best-ranked rows.
        return [ranked[pos][0] for pos in range(0, k)]

    # k covers the whole dataset: hand back every index in ranked order.
    return [pair[0] for pair in ranked]


# Toy dataset: seven rows, five numeric features per row.
data = np.array([
    [10, 3, 3, 5, 10],
    [5, 4, 5, 3, 6],
    [10, 4, 6, 4, 9],
    [8, 6, 2, 6, 3],
    [10, 3, 3, 5, 8],
    [9, 2, 1, 2, 11],
    [9, 3, 1, 2, 11],
])

# Row 0 is the query point. It is itself part of `data`, so it comes back as
# its own closest neighbour (distance 0).
referenceVec = data[0]

# Number of neighbours to retrieve for the reference vector.
k = 4
knn = knearest(referenceVec, data, k)

print("Row IDs of", k, 'nearest neighbors:')
print(knn)


Row IDs of 4 nearest neighbors:
[0, 4, 2, 6]


In [None]:
# scikit-learn already provides a class for computing k-nearest neighbors more efficiently, using specialized data structures such as the ball tree. The example below shows how to use the NearestNeighbors class to find the nearest neighbors.

In [2]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Toy dataset: seven rows, five numeric features per row.
data = np.array([
    [10, 3, 3, 5, 10],
    [5, 4, 5, 3, 6],
    [10, 4, 6, 4, 9],
    [8, 6, 2, 6, 3],
    [10, 3, 3, 5, 8],
    [9, 2, 1, 2, 11],
    [9, 3, 1, 2, 11],
])

# Number of neighbours to retrieve.
k = 4

# Row 0 is the query point.
ReferenceVec = data[0]

# Build the index over all rows, then query it. `kneighbors` is given a list
# containing the single query vector and returns a (distances, indices) pair.
nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
nbrs.fit(data)
distances, indices = nbrs.kneighbors([ReferenceVec])

print("Row IDs of ", k, ' nearest neighbors:')
print(indices)

print("Distances of these ", k, ' nearest neighbors:')
print(distances)

Row IDs of  4  nearest neighbors:
[[0 4 2 6]]
Distances of these  4  nearest neighbors:
[[0.         2.         3.46410162 3.87298335]]


In [4]:
import numpy as np

def jaccard(vec1, vec2):
    """Return the weighted (min/max) Jaccard similarity of two vectors.

    The similarity is ``sum(min(a_i, b_i)) / sum(max(a_i, b_i))`` taken over
    corresponding elements of the two inputs.

    Parameters
    ----------
    vec1, vec2 : array-like of real numbers
        Anything ``np.asarray`` accepts. Both must have the same shape.

    Returns
    -------
    numpy.float64
        The similarity, or NaN when the sum of element-wise maxima is zero
        (e.g. two all-zero or two empty vectors), where the ratio is
        undefined.

    Raises
    ------
    ValueError
        If the inputs do not have the same shape.
    """
    first = np.asarray(vec1)
    second = np.asarray(vec2)

    # Refuse mismatched inputs instead of silently comparing only a prefix.
    if first.shape != second.shape:
        raise ValueError(
            "jaccard() requires inputs of equal shape, got "
            f"{first.shape} and {second.shape}"
        )

    overlap = np.minimum(first, second).sum()
    union = np.maximum(first, second).sum()

    # 0/0 is undefined; report NaN explicitly rather than raising or warning.
    if union == 0:
        return np.float64(np.nan)

    return overlap / union

def cosinesim(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)`` where ``||.||`` is the
    Euclidean norm.

    Parameters
    ----------
    vec1, vec2 : array-like of real numbers
        Anything ``np.asarray`` accepts (NumPy arrays, lists, tuples), of
        equal length.

    Returns
    -------
    numpy.float64
        The similarity, or NaN when either vector has zero norm (the angle
        is undefined in that case).
    """
    # float64 lets plain sequences be squared and keeps narrow integer dtypes
    # from overflowing in the dot product and the squares.
    first = np.asarray(vec1, dtype=np.float64)
    second = np.asarray(vec2, dtype=np.float64)

    numerator = np.dot(first, second)
    first_norm = np.sqrt(np.sum(first ** 2))
    second_norm = np.sqrt(np.sum(second ** 2))
    denominator = first_norm * second_norm

    # Division by a zero norm is undefined; return NaN without a warning.
    if denominator == 0:
        return np.float64(np.nan)

    return numerator / denominator

def tanimotosim(vec1, vec2):
    """Return the Tanimoto similarity of two vectors.

    Computed as ``dot(a, b) / (||a||^2 + ||b||^2 - dot(a, b))``.

    Parameters
    ----------
    vec1, vec2 : array-like of real numbers
        Anything ``np.asarray`` accepts (NumPy arrays, lists, tuples), of
        equal length.

    Returns
    -------
    numpy.float64
        The similarity, or NaN when the denominator is zero (e.g. both
        vectors are all zeros), where the ratio is undefined.
    """
    # float64 lets plain sequences be squared and keeps narrow integer dtypes
    # from overflowing in the dot product and the squares.
    first = np.asarray(vec1, dtype=np.float64)
    second = np.asarray(vec2, dtype=np.float64)

    numerator = np.dot(first, second)
    # These are *squared* Euclidean norms; no square root is taken.
    first_sq_norm = np.sum(first ** 2)
    second_sq_norm = np.sum(second ** 2)
    denominator = first_sq_norm + second_sq_norm - numerator

    # 0/0 is undefined; return NaN without a warning.
    if denominator == 0:
        return np.float64(np.nan)

    return numerator / denominator

# Toy dataset: six rows, four numeric features per row
# (rows 0 and 5 are identical).
rawdata = [
    [10, 3, 3, 5],
    [12, 13, 20, 7],
    [1, 1, 2, 7],
    [8, 1, 2, 7],
    [2, 1, 2, 7],
    [10, 3, 3, 5],
]

data = np.array(rawdata)

# Indices of the two rows to compare.
r1 = 0
r2 = 3

jacc = jaccard(data[r1], data[r2])
print("Jaccard similarity between rows ", r1, "and", r2, "is")
print(jacc)

cos = cosinesim(data[r1], data[r2])
print("Cosine similarity between rows ", r1, "and", r2, "is")
print(cos)

tanimoto = tanimotosim(data[r1], data[r2])
print("Tanimoto similarity between rows ", r1, "and", r2, "is")
print(tanimoto)

Jaccard similarity between rows  0 and 3 is
0.6956521739130435
Cosine similarity between rows  0 and 3 is
0.9545807293526762
Tanimoto similarity between rows  0 and 3 is
0.9051094890510949
