# KNN using Heap
* Top k Movies with Highest Rating
* Implement KNN using Heap O(nlogk)
* Implement KNN by Brute Force O(nklogk)

## Find Top k Movies with Highest Rating

In [None]:
'''
Question
Given the following movie class and a movie, find its related movies of top k highest rating. 
'''

In [None]:
# class Movie:
#     def __init__(self, movie_id, rating):
#         # set movie_id and rating
#     def getId():
#         return self.movie_id
#     def getRating():
#         return self.rating
#     def getRelatedMovies():
#         # return list of Movies connected to self

### Solution 1: Using Heap and Queue

In [277]:
import heapq

def find_queue(movie, k):
    """
        Return list of related movies of top k highest rating using heap and queue. 
        Time complexity : O(nlogk)
        Space complexity : O(n) for k-size heap of neighbor, n-size list of visited, n-size queue
    """
    neighbor = [] 

    queue = []
    
    visited = set()
    visited.add(movie.getId())
    
    for m in movie.getRelatedMovies():
        id_ = m.getId()
        visited.add(id_)
        queue.append(m)
    
    while queue:
        # dequeue first element
        m = queue.pop(0)
        print("m", m.getId())

        # update neighbor (heap) using first element in queue
        rating = m.getRating()
        if len(neighbor) < k:  #build k-neighbor
            heapq.heappush(neighbor, (rating, m))
        elif rating > neighbor[0][0]:  #if rating > min(rating) in k-neighbor, add to heap
            heapq.heappushpop(neighbor, (rating, m))
        
        # loop through all related movies => enqueue 
        for s in m.getRelatedMovies():
            if s.getId() not in visited:
                visited.add(s.getId())
                queue.append(s)
    
    return neighbor

In [278]:
# test case
class Movie():
    def __init__(self, rating, Id, relatedMovies):
        self.rating = rating
        self.id = Id
        self.relatedMovies = relatedMovies
        
    def getId(self):
        return self.id

    def getRating(self):
        return self.rating
    
    def getRelatedMovies(self):
        return self.relatedMovies
    
def printknn(knn):
    for rating, m in knn:
        print("movie", m.getId(), "rating", rating)

        
m0 = Movie(2,0,[m2, m1])
m1 = Movie(4,1,[m0, m3,m4])
m2 = Movie(3,2,[m0])
m3 = Movie(5,3,[m1])
m4 = Movie(2,4,[m1])

printknn(find_queue(m0,2))

m 2
m 1
m 3
m 4
movie 1 rating 4
movie 3 rating 5


### Solution 2: Using Heap and Recursion

In [287]:
import heapq

neighbor = []
visited = set()

def find_rec(movie, k):
    """
        Return list of related movies of top k highest rating using heap and recursion. 
        Time complexity : O(nlogk)
        Space complexity : O(n) for k-size heap of neighbor, n-size list of visited, recursion O(n)
    """
    global neighbor
    global visited
    visited.add(movie.getId())
    findHelper(movie, k)
    return neighbor


def findHelper(movie, k):    
    
    relatedMovies = movie.getRelatedMovies()
    
    # use recursion
    for m in relatedMovies:
       
        if m.getId() not in visited:
            visited.add(m.getId())
            findHelper(m, k)
        
        # use heap
        rating = m.getRating()
        if len(neighbor) < k:  #build k-neighbor
            heapq.heappush(neighbor, (rating, m))
        elif rating > neighbor[0][0]:  #if rating > min(rating) in k-neighbor, add to heap
            heapq.heappushpop(neighbor, (rating, m))
        

In [304]:
# test case
class Movie():
    def __init__(self, rating, Id, relatedMovies):
        self.rating = rating
        self.id = Id
        self.relatedMovies = relatedMovies
        
    def getId(self):
        return self.id

    def getRating(self):
        return self.rating
    
    def getRelatedMovies(self):
        return self.relatedMovies
    
def printknn(knn):
    for rating, m in knn:
        print("movie", m.getId(), "rating", rating)
        
    
m0 = Movie(2,0,[m2, m1])
m1 = Movie(4,1,[m3,m4])
m2 = Movie(3,2,[m0])
m3 = Movie(5,3,[m1])
m4 = Movie(2,4,[m1])

printknn(find_rec(m1,2))

neighbor = []
visited = set()
printknn(find_rec(m2,2))

movie 3 rating 5
movie 3 rating 5
movie 1 rating 4
movie 3 rating 5


In [303]:
m0.getRelatedMovies()
m0.getId()
m0.getRating()

2

## Implement KNN using Heap

In [None]:
'''
Question
Implement KNN using heap.
'''

In [23]:
import heapq
import math
from collections import Counter

def knnHeap(trainSet, test, k):
    """
    Using heap to store k neighbours.
    
    Time complexity : O(nlogk) 
                      since we compute distance for each traininh point O(n) and 
                      perform heapq.heappushpop for k-size heap O(logk)    
    Space complexity : O(k) for k-size heap
    """
    neighbor = []
    
    for data in trainSet:
        x = data[:-1]
        y = data[-1]        
        d = distance(x, test, len(test))
        info = (-d, y)
        
        if len(neighbor) < k:
            heapq.heappush(neighbor, info)  #build heap (size k) by adding first k training points 
        elif d <= -neighbor[0][0]:
            heapq.heappushpop(neighbor, info)  #if distance < = max(distance) in k-neighbor, add to heap
        
        print(neighbor)
        
        count = Counter(y for d, y in neighbor) #majority vote from heap
        
    return count.most_common()[0][0]
    
    
def distance(x1,x2,dim):
    """
    Compute the euclidean distance between two data points.
    """
    d = 0
    for i in range(dim):
        d += pow(x1[i]-x2[i],2)
    return math.sqrt(d)


# test case
trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b'], [6, 6, 6, 'b']]
knnHeap(trainSet, test=[5,5,5], k=2) 

[(-5.196152422706632, 'a')]
[(-5.196152422706632, 'a'), (-1.7320508075688772, 'b')]
[(-1.7320508075688772, 'b'), (-1.7320508075688772, 'b')]


'b'

## Implement KNN by Brute Force

In [38]:
import math
import operator

def knn(trainSet, test, k):
    """
    Using n-size list to store distance between each training point and test data point.
    Sort the list to find the top k smallest distance.
    
    Time complexity : O(n^2logn), n = no. of training examples
                      since O(n) for computing distance between each point of training set and test data,
                      (nlogn) for sorting n-size distance list 
    Space complecity: O(n) for n-size list to store distance
    
    If we just use k-size list to store top k smallest distance, the algorithm casn be improved by
    Time complexity : O(nklogk)    
    """
    d = []
    dim = len(test)-1
    for x in trainSet:
        dist = distance(test, x, dim)
        d.append((x, dist))
    
    d.sort(key=operator.itemgetter(1))    
    
    neighbor = []
    for i in range(k):
        neighbor.append(d[i][0])
    print(neighbor)
        
    count = Counter(y for d, y in neighbor) #majority vote from heap
    return count.most_common()[0][0]


def distance(x1,x2,dim):
    """
    Compute the euclidean distance between two data points.
    """
    d = 0
    for i in range(dim):
        d += pow(x1[i]-x2[i],2)
    return math.sqrt(d)


# test case
trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b'], [6, 6, 6, 'b']]
knnHeap(trainSet, test=[5,5,5], k=2) 

[(-5.196152422706632, 'a')]
[(-5.196152422706632, 'a'), (-1.7320508075688772, 'b')]
[(-1.7320508075688772, 'b'), (-1.7320508075688772, 'b')]


'b'

In [40]:
def majorityVote(neighbors):
    """
    Compute majority vote for a given neighbour.
    """
    classVotes = {}
    for i in range(len(neighbors)):
        label = neighbors[i][-1]
        if label in classVotes:
            classVotes[label] += 1
        else:
            classVotes[label] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]


# test case
neighbors = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
majorityVote(neighbors)

'a'