# Movie Recommendation using KNN

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the columns needed to map a movieId to its title.
movie_columns = ['movieId', 'title']
movies_df = pd.read_csv('movies.csv', usecols=movie_columns)
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [18]:
# Dimensions of the movies table as (rows, columns).
movies_df.shape

(9742, 2)

In [3]:
# Load the individual ratings: which user gave which movie what rating.
rating_columns = ['userId', 'movieId', 'rating']
ratings_df = pd.read_csv('ratings.csv', usecols=rating_columns)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [21]:
# Dimensions of the ratings table as (rows, columns).
ratings_df.shape

(100836, 3)

In [4]:
# Attach each rating's movie title by joining the two tables on movieId
# (default inner join: only movieIds present in both tables are kept).
merge_df = ratings_df.merge(movies_df, on='movieId')
merge_df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [58]:
# Dimensions of the merged ratings-with-titles table as (rows, columns).
merge_df.shape

(100836, 4)

In [59]:
# Count how many ratings each title received, then show the most-rated titles.
ratings_per_title = merge_df.groupby('title')['rating'].count()
movie_rating_df = ratings_per_title.reset_index(name='total_rating_count')
movie_rating_df.sort_values('total_rating_count', ascending=False).head()

Unnamed: 0,title,total_rating_count
3158,Forrest Gump (1994),329
7593,"Shawshank Redemption, The (1994)",317
6865,Pulp Fiction (1994),307
7680,"Silence of the Lambs, The (1991)",279
5512,"Matrix, The (1999)",278


Here we see the total number of ratings for each movie, with the most-rated movies first.

In [10]:
# Add each title's total rating count to every rating row.
# A left join keeps every row of merge_df.
total_df = merge_df.merge(movie_rating_df, on='title', how='left')
total_df.head()

Unnamed: 0,userId,movieId,rating,title,total_rating_count
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


Here we merge the rating-count column into the ratings dataframe, so every rating row carries its movie's total number of ratings.

In [42]:
# Summary statistics (count, mean, std, quartiles, min/max) of the per-title rating counts.
movie_rating_df['total_rating_count'].describe()

count    9719.000000
mean       10.375141
std        22.406220
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       329.000000
Name: total_rating_count, dtype: float64

In [28]:
# Minimum number of ratings a title must have to be kept.
# The filter below reads this variable (previously it hard-coded 50 and the
# variable was unused), so changing the value here actually changes the filter.
popularity_threshhold=50
rating_popular_movie=total_df.loc[total_df['total_rating_count'] >= popularity_threshhold]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,total_rating_count
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


Here we keep only the movies that have at least 50 ratings.

In [29]:
# Dimensions of the filtered (popular movies only) ratings table as (rows, columns).
rating_popular_movie.shape

(41362, 5)

In [31]:
# Reshape to a title x user matrix: one row per title, one column per userId,
# cell = that user's rating. Missing (unrated) cells are filled with 0.
movies_features_df = (
    rating_popular_movie
    .pivot_table(index='title', columns='userId', values='rating')
    .fillna(0)
)
movies_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


Here we convert the dataframe into a pivot table (movies as rows, users as columns) so that movies can be recommended based on user ratings.

In [34]:
# Dimensions of the feature matrix as (number of titles, number of user columns).
movies_features_df.shape

(450, 606)

In [33]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Store the title x user rating matrix in compressed sparse row form
# (unrated cells were filled with 0 when the table was built).
movies_features_df_matrix = csr_matrix(movies_features_df.values)

# Brute-force nearest-neighbour search over the title rows using cosine distance.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movies_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [60]:
# Pick a random row position in the title-indexed feature table.
# This is a positional index, not a movieId. No random seed is set here,
# so a different movie can be chosen on every run.
query_index = np.random.choice(movies_features_df.shape[0])
print(query_index)

# Ask the fitted model for the 6 nearest rows to the query row.
# The row is reshaped to (1, n_features) so it is passed as a single sample.
# `distances` holds the distances and `indices` the matching row positions.
query_vector = movies_features_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

288


In [52]:
# Title of the query movie: the feature table's index label at row position query_index.
movies_features_df.index[query_index]

'Inception (2010)'

In [54]:
# Flatten the (1, k) neighbour arrays once instead of on every iteration.
neighbor_distances = distances.flatten()
neighbor_titles = movies_features_df.index[indices.flatten()]

# Position 0 is used only to print the header (presumably the query movie itself,
# at distance ~0); the remaining positions are printed as ranked recommendations.
for rank, (title, distance) in enumerate(zip(neighbor_titles, neighbor_distances)):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(movies_features_df.index[query_index]))
        continue
    print('{0}: {1},with distance of {2} :'.format(rank, title, distance))

Recommendations for Inception (2010):

1: Dark Knight, The (2008),with distance of 0.27273692797625093 :
2: Inglourious Basterds (2009),with distance of 0.35389678083836207 :
3: Shutter Island (2010),with distance of 0.38226361086909444 :
4: Dark Knight Rises, The (2012),with distance of 0.3824955344787052 :
5: Fight Club (1999),with distance of 0.38458257285337005 :
