# KNN (K-Nearest Neighbors)

Predicting the rating of a movie based on the ratings of similar movies and their popularity. 

In [2]:
import pandas as pd
import numpy as np 

# Load only the first three tab-separated columns of the ratings file
# and label them with the names in r_cols.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    usecols=range(3),
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


Grouping the data by movie ID and computing the total number of ratings (popularity) and average rating for each movie.

In [3]:
# Per-movie aggregates of the 'rating' column: 'size' is the number of ratings
# (used as popularity) and 'mean' is the average rating.
# String aggregation names are used instead of np.size / np.mean: recent pandas
# versions emit a FutureWarning when NumPy callables are passed to agg, and the
# resulting column labels ('size', 'mean') are the same.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


Normalizing the number of ratings per movie (its popularity) to the range 0 - 1 using min-max scaling:

In [7]:
# Single-column frame holding the number of ratings per movie.
movieNumRatings = movieProperties['rating']['size'].to_frame()

# Min-max scaling: the least-rated movie maps to 0 and the most-rated to 1.
movieNormalizedRatings = (
    (movieNumRatings - movieNumRatings.min())
    / (movieNumRatings.max() - movieNumRatings.min())
)
movieNormalizedRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


Importing data from the u.item file, which contains genre information. Creating a dictionary where each entry contains the movie name, the list of genre values, the popularity score and the average rating.

In [8]:
# Build movieDict: movie ID -> (name, genre vector, normalized popularity, mean rating).
movieDict = {}
with open(r'ml-100k/u.item', encoding = 'ISO-8859-1') as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            # Skip blank lines (e.g. a stray trailing newline) instead of
            # failing on int('') below.
            continue
        fields = line.split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Fields 5..24 are parsed as integers and stored as the genre vector.
        genres = np.array([int(flag) for flag in fields[5:25]])
        # Look up the precomputed popularity score and average rating for this movie.
        popularity = movieNormalizedRatings.loc[movieID].get('size')
        meanRating = movieProperties.loc[movieID].rating.get('mean')
        movieDict[movieID] = (name, genres, popularity, meanRating)

In [9]:
# Sanity check: show the stored tuple (name, genre vector, popularity, mean rating) for movie ID 1.
print(movieDict[1])

('Toy Story (1995)', array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.7735849056603774, 3.8783185840707963)


Computing the 'distance' between 2 movies based on their similarities in genres and popularity.


In [10]:
from scipy import spatial

def computeDistance(a, b):
    """Return a dissimilarity score between two movie entries.

    ``a`` and ``b`` are indexable records whose element 1 is the genre vector
    and whose element 2 is the popularity score (the layout of the values
    stored in ``movieDict``).

    The score is the cosine distance between the two genre vectors plus the
    absolute difference between the two popularity scores, so a smaller value
    means the movies are more alike.
    """
    genreGap = spatial.distance.cosine(a[1], b[1])
    popularityGap = abs(a[2] - b[2])
    return genreGap + popularityGap

In [11]:
# Example: distance between the entries for movie IDs 3 and 4.
computeDistance(movieDict[3], movieDict[4])

1.2041166380789021

In [15]:
# The higher the distance, the less similar the movies are.
# Show the entries for movie IDs 3, 4 and 1 for comparison.
for sampleID in (3, 4, 1):
    print(movieDict[sampleID])

('Four Rooms (1995)', array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 0.15265866209262435, 3.033333333333333)
('Get Shorty (1995)', array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.3567753001715266, 3.550239234449761)
('Toy Story (1995)', array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.7735849056603774, 3.8783185840707963)


Computing the K-nearest neighbors for a given test case:

In [14]:
import operator

def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to ``movieID``.

    Every other entry in ``movieDict`` is scored against the target with
    ``computeDistance`` and the IDs are returned in order of increasing
    distance.

    Parameters:
        movieID: key of the target movie in ``movieDict``.
        K: maximum number of neighbor IDs to return.

    Returns:
        A list of at most K movie IDs, nearest first. If fewer than K other
        movies are available, all of them are returned rather than raising
        IndexError.

    Raises:
        KeyError: if ``movieID`` is not a key of ``movieDict``.
    """
    # Look the target up once instead of on every loop iteration.
    target = movieDict[movieID]
    distances = [
        (movie, computeDistance(target, entry))
        for movie, entry in movieDict.items()
        if movie != movieID
    ]
    # list.sort is stable, so movies at equal distance keep movieDict's iteration order.
    distances.sort(key = operator.itemgetter(1))
    # Slicing clamps to the available length; max() keeps a negative K
    # returning an empty list, as iterating range(K) would.
    return [movie for movie, dist in distances[:max(K, 0)]]
    
# Predict the rating of movie ID 1 as the mean of the average ratings of its
# K nearest neighbors, printing each neighbor's name and average rating.
K = 10
avgRating = 0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    neighborName = movieDict[neighbor][0]
    neighborRating = movieDict[neighbor][3]
    avgRating += neighborRating
    # Put a space between the title and the rating so the line is readable.
    print(neighborName + ' ' + str(neighborRating))

# Average over the neighbors that were actually summed.
avgRating /= len(neighbors)

Liar Liar (1997)3.156701030927835
Aladdin (1992)3.8127853881278537
Willy Wonka and the Chocolate Factory (1971)3.6319018404907975
Monty Python and the Holy Grail (1974)4.0664556962025316
Full Monty, The (1997)3.926984126984127
George of the Jungle (1997)2.685185185185185
Beavis and Butt-head Do America (1996)2.7884615384615383
Birdcage, The (1996)3.4436860068259385
Home Alone (1990)3.0875912408759123
Aladdin and the King of Thieves (1996)2.8461538461538463


In [17]:
# Predicted rating for movie ID 1: the mean of its nearest neighbors' average ratings.
# Compare it with the movie's actual average rating stored in movieDict[1][3].
avgRating

3.3445905900235564