# KNN - K Nearest Neighbors

KNN is a supervised learning technique.

- Used to classify new data points based on "distance" to known data

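- Find the K nearest neighbors, based on your distance metric, then combine their known values (a majority vote for classification, or an average as done below) to predict the new point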

<img src="https://images.datacamp.com/image/upload/v1686762721/image2_a2876c62d1.png" alt="Drawing" style="width: 550px;"/>

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the ratings file, keeping only its first three tab-separated columns.
r_cols = ['user_id', 'movie_id', 'rating']
# Forward slashes resolve on Windows as well as on POSIX systems; the
# backslash-separated form of this relative path only works on Windows.
ratings = pd.read_csv('./MLCourse/ml-100k/u.data', sep="\t", names=r_cols, usecols=range(3))
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [5]:
# Per-movie rating count ('size') and average rating ('mean').
# The string aliases replace np.size / np.mean: pandas deprecated passing
# NumPy reducers to agg() and emits a FutureWarning for them. The resulting
# column labels ('rating', 'size') and ('rating', 'mean') are unchanged.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()

  movieProperties = ratings.groupby('movie_id').agg({'rating': [np.size, np.mean]})


Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


Min-max feature scaling brings all values into the range [0, 1]: $x' = \dfrac{x - \min(x)}{\max(x) - \min(x)}$.

In [7]:
# Pull the per-movie rating count out as a single-column DataFrame.
movieNumRatings = movieProperties['rating']['size'].to_frame()
# Min-max feature scaling, applied column by column: (x - min) / (max - min).
movieNormalizedNumRatings = movieNumRatings.apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)
movieNormalizedNumRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [33]:
# Build a lookup: movieID -> (name, genre flags, normalized rating count, mean rating).
movieDict = {}
# The MovieLens 100K u.item file is Latin-1 (ISO-8859-1) encoded; without an
# explicit encoding the read fails on accented titles wherever the platform
# default is UTF-8. Forward slashes keep the relative path portable.
with open('./MLCourse/ml-100k/u.item', encoding='ISO-8859-1') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in int('').
        if not line.strip():
            continue
        fields = line.rstrip('\n').split("|")
        movieID = int(fields[0])
        name = fields[1]
        # Fields from index 5 onward are parsed as integer genre flags.
        genres = [int(flag) for flag in fields[5:25]]
        movieDict[movieID] = (
            name,
            genres,
            movieNormalizedNumRatings.loc[movieID].get('size'),
            movieProperties.loc[movieID].rating.get("mean"),
        )

In [34]:
# Inspect the stored record for movie ID 1: (name, genre flags, normalized rating count, mean rating).
movieDict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.7735849056603774,
 3.8783185840707963)

In [37]:
from scipy import spatial

In [38]:
def ComputeDistance(a,b):
    """Return a combined genre + popularity distance between two movie records.

    Parameters
    ----------
    a, b : sequence
        Movie records in which index 1 holds the genre flag vector and
        index 2 holds the popularity score.

    Returns
    -------
    float
        Cosine distance between the two genre vectors plus the absolute
        difference of the two popularity scores.
    """
    genresA = a[1]
    genresB = b[1]
    if any(genresA) and any(genresB):
        genreDistance = spatial.distance.cosine(genresA, genresB)
    else:
        # Cosine distance is undefined for an all-zero vector (division by a
        # zero norm yields NaN, which would corrupt any sort on the result).
        # Treat a record with no genre flags as sharing no genre: distance 1.
        genreDistance = 1.0
    popularityDistance = abs(a[2] - b[2])

    return genreDistance + popularityDistance

In [40]:
# Example: combined genre + popularity distance between movies 2 and 4.
ComputeDistance(movieDict[2], movieDict[4])

0.8004574042309892

In [41]:
# Show the two records compared above, one per line.
print(movieDict[2], movieDict[4], sep="\n")

('GoldenEye (1995)', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.3567753001715266, 3.550239234449761)


In [42]:
import operator

In [43]:
def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to ``movieID``.

    Every other entry of ``movieDict`` is scored with ``ComputeDistance``
    against the target movie and the results are ordered by ascending
    distance (ties keep ``movieDict`` iteration order).

    Parameters
    ----------
    movieID : hashable
        Key of the target movie in ``movieDict``.
    K : int
        Maximum number of neighbors to return.

    Returns
    -------
    list
        Up to K movie IDs, nearest first. Fewer than K are returned when
        ``movieDict`` holds fewer than K other movies; a non-positive K
        yields an empty list.

    Raises
    ------
    KeyError
        If ``movieID`` is not present in ``movieDict``.
    """
    # Look the target up once instead of on every loop iteration.
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, movieDict[movie]))
        for movie in movieDict
        if movie != movieID
    ]
    distances.sort(key = operator.itemgetter(1))
    # Slicing (rather than indexing 0..K-1) avoids an IndexError when K exceeds
    # the number of candidates; max() keeps a negative K from slicing from the end.
    return [movie for movie, _ in distances[:max(K, 0)]]

In [45]:
# Movie 1 is the query passed to getNeighbors in the next cell; show its record for reference.
movieDict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.7735849056603774,
 3.8783185840707963)

In [47]:
# Estimate a rating for movie 1 from its K closest movies.
K = 5
neighbors = getNeighbors(1, K)

neighborRatings = []
for neighbor in neighbors:
    record = movieDict[neighbor]
    # record[0] is the movie name, record[3] its mean rating.
    print(record[0] + " " + str(record[3]))
    neighborRatings.append(record[3])

# Average over K, the number of neighbors requested.
avgRating = sum(neighborRatings) / float(K)

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127


In [48]:
# Show the average of the neighbors' mean ratings computed in the previous cell.
avgRating

3.7189656165466287