In [46]:
import pandas as pd
import numpy as np

from scipy.spatial.distance import cosine
import operator

import warnings
warnings.filterwarnings('ignore')

In [47]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [48]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [49]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [50]:
df = pd.merge(ratings, movies, on = 'movieId', how = 'left')

In [51]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [52]:
df.groupby('title')['rating'].mean().sort_values(ascending = False).head()

title
Karlson Returns (1970)                                                         5.0
Zeitgeist: Moving Forward (2011)                                               5.0
Dream of Light (a.k.a. Quince Tree Sun, The) (Sol del membrillo, El) (1992)    5.0
Dragons: Gift of the Night Fury (2011)                                         5.0
12 Angry Men (1997)                                                            5.0
Name: rating, dtype: float64

In [53]:
df.groupby('title')['userId'].count().sort_values(ascending = False).head()

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: userId, dtype: int64

In [54]:
n_ratings = len(df)
n_movies = len(df['movieId'].unique())
n_users = len(df['userId'].unique())

In [55]:
print("Number of Ratings: ", n_ratings)
print("Number of movies: ", n_movies)
print("Number of users: ", n_users)
print("Average ratings per user: ", round(n_ratings/n_users, 2))
print("Average ratings per movie: ", round(n_ratings/n_movies, 2))

Number of Ratings:  100836
Number of movies:  9724
Number of users:  610
Average ratings per user:  165.3
Average ratings per movie:  10.37


In [56]:
user_freq = df[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_movies']
user_freq.head()

Unnamed: 0,userId,n_movies
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [57]:
def genre_to_vector(genres, all_genres):
    vector = [0]*len(all_genres)
    genre_list = genres.split('|')
    for genre in genre_list:
        if genre in all_genres:
            index = all_genres.index(genre)
            vector[index] = 1
    return vector

unique_genres = set()
for genres in df['genres']:
    genre_list = genres.split('|')
    for genre in genre_list:
        unique_genres.add(genre)
        
all_genres = sorted(list(unique_genres))

In [58]:
movieDict = {}
for index, row in df.iterrows():
    movieId = row['movieId']
    title = row['title']
    genre_vector = genre_to_vector(row['genres'], all_genres)
    rating = row['rating']
    movieDict[movieId] = (title, genre_vector, rating)

In [59]:
movieDict[1]

('Toy Story (1995)',
 [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 5.0)

In [60]:
def compute_distance(movie_a, movie_b):
    genre_distance = cosine(movie_a[1], movie_b[1])
    rating_distance = abs(movie_a[2]-movie_b[2])
    total_distance = genre_distance + rating_distance
    return total_distance, genre_distance, rating_distance

In [61]:
movie_id = int(input("Enter the movie ID: "))

def get_neighbors(movie_id, k):
    distances = []
    for movie in movieDict:
        if movie != movie_id:
            dist = compute_distance(movieDict[movie_id], movieDict[movie])
            distances.append((movie, dist))
    distances.sort(key = operator.itemgetter(1))
    neighbors = []
    for i in range(min(k, len(distances))):
        neighbors.append(distances[i][0])
    
    return neighbors

K = 5
neighbors = get_neighbors(movie_id, K)
avg_rating = 0

print(f"{movieDict[movie_id][0]} {movieDict[movie_id][2]}")

for neighbor in neighbors:
    avg_rating += movieDict[neighbor][2]
    print(f"{movieDict[neighbor][0]} {movieDict[neighbor][2]}")

if neighbors:
    avg_rating /= len(neighbors)
    print(f"Average Rating of Nearest Neighbors: {avg_rating}")
else:
    print("No recommendation")

Jumanji (1995) 2.0
NeverEnding Story III, The (1994) 2.0
Santa Claus: The Movie (1985) 2.0
Gulliver's Travels (1996) 2.0
Lord of the Rings, The (1978) 2.0
Ponyo (Gake no ue no Ponyo) (2008) 2.0
Average Rating of Nearest Neighbors: 2.0
