In [1]:
import pandas as pd

# Load the tab-separated ratings file, keeping only its first three columns
# and labelling them with the names in r_cols.
r_cols = ['user_id', 'movie_id', 'rating']
ratings_df = pd.read_csv(
    "C:/Users/212412835/Documents/Notebooks/ml-100k/u.data",
    sep='\t',
    usecols=range(3),
    names=r_cols,
)
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [2]:
#group this by movie_id and aggregate by rating 

import numpy as np

# Per-movie number of ratings ('size') and average rating ('mean').
# The aggregations are given by name rather than as np.size / np.mean:
# newer pandas releases warn when NumPy callables are passed to agg(),
# and the resulting column labels ('size', 'mean') are the same.
movie_ratings_df = ratings_df.groupby('movie_id').agg({'rating': ['size', 'mean']})
print(movie_ratings_df.head())

         rating          
           size      mean
movie_id                 
1           452  3.878319
2           131  3.206107
3            90  3.033333
4           209  3.550239
5            86  3.302326


In [3]:
# Min-max normalise the number of ratings per movie:
# 0 is the least-rated movie in the data, 1 is the most-rated one.

# Single-column frame ('size') holding the rating count for each movie_id.
movie_ratings = pd.DataFrame(movie_ratings_df['rating']['size'])

# Vectorised (x - min) / (max - min), applied column by column.
movie_ratings_normalized_df = (
    (movie_ratings - movie_ratings.min())
    / (movie_ratings.max() - movie_ratings.min())
)
movie_ratings_normalized_df.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [4]:
# Build a lookup of everything we know about each movie, keyed by movie ID.
# There are 19 genre flags per movie: 1 means the movie is in that genre,
# 0 means it is not.
# Each value is a tuple of:
#   (movie name, list of genre flags,
#    normalized number of ratings (indication of popularity),
#    average rating of the movie)
# NOTE(review): u.data is read from ".../Notebooks/ml-100k/" earlier in this
# notebook, while this path goes through ".../Notebooks/Machine Learning/ml-100k/"
# -- confirm both locations are intended.
# NOTE(review): no encoding is passed to open(), so on Python 3 the platform
# default is used -- confirm it matches the file.

movieDict = {}
with open(r'C:/Users/212412835/Documents/Notebooks/Machine Learning/ml-100k/u.item') as f:
    for line in f:
        # Records are pipe-delimited; field 0 is the ID and field 1 the title.
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        movieName = fields[1]
        # Materialise the flags as a real list. On Python 3, map() returns a
        # one-shot iterator, which would be exhausted after the first distance
        # computation that reads it.
        genres = [int(flag) for flag in fields[5:25]]
        movieDict[movieID] = (
            movieName,
            genres,
            movie_ratings_normalized_df.loc[movieID].get('size'),
            movie_ratings_df.loc[movieID].rating.get('mean'),
        )
        
        

In [5]:
# Spot-check one entry: display the tuple stored for movie ID 1.
movieDict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.77358490566037741,
 3.8783185840707963)

In [6]:
#Now let's define a function that computes the "distance" between two movies based on how similar their genres are
#and how similar their popularity is. Just to make sure it works, we'll compute the distance between movie IDs 2 and 4:

from scipy import spatial

def ComputeDistance(a,b):
    """Return a dissimilarity score between two movie records.

    Each record is indexable; element 1 holds the genre flag vector and
    element 2 the normalized popularity. The score is the cosine distance
    between the two genre vectors plus the absolute difference between the
    two popularity values, so larger results mean less similar movies.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(b[2] - a[2])
    return genre_gap + popularity_gap

# Smoke test on two entries of movieDict; prints a single distance value.
print(ComputeDistance(movieDict[2],movieDict[4]))

0.800457404231


In [7]:
# The higher the distance, the less similar the movies.
# print() is called as a function so the cell matches the other print(...)
# calls in this notebook and runs on both Python 2 and Python 3.
print(movieDict[2])
print(movieDict[4])


('GoldenEye (1995)', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.35677530017152659, 3.5502392344497609)


In [14]:
#Now let's use this.
#For any given movie, we will find its K nearest neighbours (KNN), then compare the average rating of those
#neighbours with the rating of the movie itself. If the neighbours really are similar, the ratings should be close.
import operator

def findNeighbours(movieID, K, movies=None, distance=None):
    """Return the IDs of the K movies closest to ``movieID``, nearest first.

    Parameters
    ----------
    movieID : key of the reference movie in ``movies``.
    K : maximum number of neighbours to return. When fewer than K other
        movies exist, all of them are returned; K <= 0 gives an empty list.
    movies : optional mapping of movie ID -> movie record. Defaults to the
        notebook-level ``movieDict``.
    distance : optional callable taking two movie records and returning a
        number (smaller means more similar). Defaults to the notebook-level
        ``ComputeDistance``.

    Returns
    -------
    list of movie IDs, ordered by increasing distance.

    Raises
    ------
    KeyError if ``movieID`` is not a key of ``movies``.
    """
    if movies is None:
        movies = movieDict
    if distance is None:
        distance = ComputeDistance

    # Look the reference record up once rather than once per candidate.
    target = movies[movieID]
    distances = [
        (candidate, distance(movies[candidate], target))
        for candidate in movies
        if candidate != movieID
    ]

    # list.sort() is stable, so equally distant movies keep the iteration
    # order of ``movies``.
    distances.sort(key=operator.itemgetter(1))

    # Slicing instead of indexing range(K) avoids an IndexError when K exceeds
    # the number of candidates; max() stops a negative K from slicing from
    # the end of the list.
    return [pair[0] for pair in distances[:max(K, 0)]]
    

K=10
neighbours = findNeighbours(1,K)
print(neighbours)

# Add up the neighbours' mean ratings (element 3 of each movieDict entry)
# while listing each neighbour's name and rating.
avgRating = 0
for neighbour in neighbours:
    avgRating += movieDict[neighbour][3]
    # print() as a function: same output on Python 2, and valid on Python 3.
    print(movieDict[neighbour][0] + " " + str(movieDict[neighbour][3]))

# Average over the neighbours actually returned rather than over K, and
# leave avgRating at 0 instead of dividing by zero when there are none.
if neighbours:
    avgRating /= float(len(neighbours))

[294, 95, 151, 168, 269, 259, 240, 25, 94, 422]
Liar Liar (1997) 3.15670103093
Aladdin (1992) 3.81278538813
Willy Wonka and the Chocolate Factory (1971) 3.63190184049
Monty Python and the Holy Grail (1974) 4.0664556962
Full Monty, The (1997) 3.92698412698
George of the Jungle (1997) 2.68518518519
Beavis and Butt-head Do America (1996) 2.78846153846
Birdcage, The (1996) 3.44368600683
Home Alone (1990) 3.08759124088
Aladdin and the King of Thieves (1996) 2.84615384615


In [15]:
# Average of the neighbours' mean ratings, as computed in the previous cell.

avgRating

3.3445905900235564

In [16]:
#How does this compare to Toy Story's actual average rating?
# movieDict[1] is the Toy Story entry; the last element of the tuple is its mean rating.

movieDict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.77358490566037741,
 3.8783185840707963)