In [2]:
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [8]:
movies.sample(10)

Unnamed: 0,movieId,title,genres
1316,1770,B. Monkey (1998),Crime|Romance|Thriller
473,540,Sliver (1993),Thriller
4047,5749,Ghost Story (1981),Drama|Horror
7319,77364,"Losers, The (2010)",Action|Adventure|Drama|Mystery|Thriller
5639,27373,61* (2001),Drama
417,479,Judgment Night (1993),Action|Crime|Thriller
4383,6427,"Railway Children, The (1970)",Children|Drama
1434,1957,Chariots of Fire (1981),Drama
7166,71899,Mary and Max (2009),Animation|Comedy|Drama
3067,4116,Hollywood Shuffle (1987),Comedy


In [9]:
ratings.sample(10)

Unnamed: 0,userId,movieId,rating,timestamp
63982,414,5540,3.0,1034001913
272,3,1275,3.5,1306463323
91473,593,5952,4.0,1181007502
84764,549,589,3.0,1464282651
72740,469,2302,4.0,965336263
28755,199,2329,3.0,1023654611
37771,256,4167,4.0,1446581320
18060,113,2166,3.0,980051734
19398,125,71530,2.0,1474295157
57760,380,53125,3.0,1494803831


In [10]:
movies.shape

(9742, 3)

In [11]:
ratings.shape

(100836, 4)

In [12]:
df = ratings.pivot(index='movieId', columns='userId', values='rating')

In [13]:
df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [14]:
df.fillna(0, inplace=True)

In [15]:
df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
no_user_votes = ratings.groupby('movieId')['rating'].agg('count')
no_movie_votes = ratings.groupby('userId')['rating'].agg('count')


In [18]:
final_df = df.loc[no_user_votes[no_user_votes > 10].index, :]


In [19]:
final_df = final_df.loc[:, no_movie_votes[no_movie_votes > 50].index]

In [20]:
final_df.head()

userId,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,3.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0


In [21]:
final_df.shape

(2121, 378)

In [22]:
csr_df = csr_matrix(final_df.values)
final_df.reset_index(inplace=True)

In [23]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)


In [24]:
knn.fit(csr_df)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [29]:
def get_similar_movies(name, n):
    movie_list = movies[movies['title'].str.contains(name)]
    if len(movie_list):
        movie_idx = movie_list.iloc[0]['movieId']
        movie_idx = final_df[final_df['movieId'] == movie_idx].index[0]
        dist, ind = knn.kneighbors(csr_df[movie_idx], n_neighbors=n+1)
        rec_movie_idx = sorted(list(zip(ind.squeeze().tolist(), 
                               dist.squeeze().tolist())),
                               key=lambda x: x[1])[:0:-1]
        rec_frame = []
        for val in rec_movie_idx:
            movie_idx = final_df.iloc[val[0]]['movieId']
            idx = movies[movies['movieId'] == movie_idx].index
            rec_frame.append({'Title': movies.iloc[idx]['title'].values[0],
                             'Distance':val[1]})
        recommendations = pd.DataFrame(rec_frame, index=range(1, n+1))
        return recommendations
    else:
        return "No movies found."

In [37]:
get_similar_movies('Guardians of the Galaxy', 6)

Unnamed: 0,Title,Distance
1,Deadpool (2016),0.340952
2,Ant-Man (2015),0.333083
3,Captain America: The First Avenger (2011),0.33137
4,Iron Man 2 (2010),0.327615
5,X-Men: Days of Future Past (2014),0.307888
6,"Avengers, The (2012)",0.241876
