In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Loading & preprocessing the data

In [2]:
movies_df = pd.read_csv('movies.csv')
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
rating_df = pd.read_csv('ratings.csv',usecols=['userId', 'movieId', 'rating'])
rating_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
df = pd.merge(movies_df, rating_df, on='movieId')
df

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5
...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5
100833,193585,Flint (2017),Drama,184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5


In [5]:
df.isnull().sum()

movieId    0
title      0
genres     0
userId     0
rating     0
dtype: int64

# For collaborative filtering, we will consider number of users who have rated the particular movie & also what rating they have given

In [14]:
#calculating rating count
combined_movie_rating = df.groupby('title')['rating'].count()
combined_movie_rating = pd.DataFrame(combined_movie_rating)
combined_movie_rating.rename(columns = {'rating': 'RatingCount'}, inplace = True)
combined_movie_rating.reset_index(inplace = True)
combined_movie_rating

Unnamed: 0,title,RatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [15]:
df1 = pd.merge(df, combined_movie_rating, on = 'title')
df1

Unnamed: 0,movieId,title,genres,userId,rating,RatingCount
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,215
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1
100833,193585,Flint (2017),Drama,184,3.5,1
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1


We will drop the columns where rating count is less than 50.

In [16]:
df1 = df1[df1['RatingCount'] > 50]
df1

Unnamed: 0,movieId,title,genres,userId,rating,RatingCount
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,215
...,...,...,...,...,...,...
98313,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,561,2.0,54
98314,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,586,4.0,54
98315,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,596,4.0,54
98316,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,599,3.5,54


In [17]:
#creating a pivot table
movies_pivot = df.pivot_table(index = 'title', columns = 'userId', values = 'rating').fillna(0)
movies_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5
¡Three Amigos! (1986),4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model building

In [23]:
#using the NearestNeighbors model 
#metric will be cosine distance
model_knn = NearestNeighbors(metric = 'cosine', algorith = 'brute')
model_knn.fit(movies_pivot)

In [40]:
#creating a function that can recommend movies based on cosine distance
def recommend_movies(movie_name, num_movies_to_be_recommended):
    dist, ind = model_knn.kneighbors(movies_pivot.loc[movie_name].values.reshape(1, -1), n_neighbors = num_movies_to_be_recommended, return_distance = True)
    rec_movies = []
    scores = []
    
    for i, d in zip(ind.flatten(), dist.flatten()):
        rec_movies.append(movies_pivot.index[i])
        scores.append(round(d, 2))
        
    df = pd.DataFrame({'recommended_movies': rec_movies, 'cosine_distance': scores})
    df['cosine_similarity'] = 1 - df['cosine_distance']
    df.sort_values(by = 'cosine_similarity', ascending = False, inplace = True)
    df.drop(['cosine_distance'], axis = 1, inplace = True)
    return df
    
        

In [41]:
recommend_movies('Toy Story (1995)', 5)

Unnamed: 0,recommended_movies,cosine_similarity
0,Toy Story (1995),1.0
1,Toy Story 2 (1999),0.57
2,Jurassic Park (1993),0.57
3,Independence Day (a.k.a. ID4) (1996),0.56
4,Star Wars: Episode IV - A New Hope (1977),0.56


In [42]:
recommend_movies('Departed, The (2006)', 5)

Unnamed: 0,recommended_movies,cosine_similarity
0,"Departed, The (2006)",1.0
1,"Prestige, The (2006)",0.61
2,"Dark Knight, The (2008)",0.6
3,Catch Me If You Can (2002),0.6
4,Batman Begins (2005),0.6


In [43]:
recommend_movies('Lord of the Rings: The Fellowship of the Ring, The (2001)', 5)

Unnamed: 0,recommended_movies,cosine_similarity
0,"Lord of the Rings: The Fellowship of the Ring,...",1.0
1,"Lord of the Rings: The Return of the King, The...",0.87
2,"Lord of the Rings: The Two Towers, The (2002)",0.87
3,Pirates of the Caribbean: The Curse of the Bla...,0.66
4,"Matrix, The (1999)",0.65
