In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import seaborn as sns
sns.set_style('white')
%matplotlib inline

In [6]:
def get_data(movies_path='data/movies.csv', ratings_path='data/ratings.csv'):
    """Load ratings and movie titles and attach a per-title rating count.

    Parameters
    ----------
    movies_path : str or path-like, default 'data/movies.csv'
        CSV file that must contain the columns ``movieId`` and ``title``.
    ratings_path : str or path-like, default 'data/ratings.csv'
        CSV file that must contain the columns ``userId``, ``movieId`` and
        ``rating``.

    Returns
    -------
    pandas.DataFrame
        One row per rating with the columns ``userId``, ``movieId``,
        ``rating``, ``title`` and ``totalRatingCount``, where
        ``totalRatingCount`` is the number of ratings recorded for that title.

    Notes
    -----
    Counts are grouped by ``title`` rather than ``movieId``, so two distinct
    movies that share an identical title string also share one count.
    """
    # Only the columns that are actually used downstream are read.
    movies_df = pd.read_csv(movies_path, usecols=['movieId', 'title'])
    rating_df = pd.read_csv(ratings_path, usecols=['userId', 'movieId', 'rating'])

    # Attach the title to every rating (default inner join on movieId).
    movie_data = pd.merge(rating_df, movies_df, on='movieId')

    # Ratings without a title cannot be shown in a recommendation; drop them.
    combine_movie_rating = movie_data.dropna(axis=0, subset=['title'])

    # Number of ratings each title received.
    movie_ratingCount = (
        combine_movie_rating
        .groupby(by=['title'])['rating']
        .count()
        .reset_index()
        .rename(columns={'rating': 'totalRatingCount'})
    )[['title', 'totalRatingCount']]

    # Broadcast the per-title count back onto every rating row.
    rating_with_totalRatingCount = combine_movie_rating.merge(
        movie_ratingCount, on='title', how='left'
    )

    return rating_with_totalRatingCount
# Build the merged ratings table once; later cells reuse `movie_data`.
movie_data = get_data()
# Preview the first rows to check the merged columns.
movie_data.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [13]:
def create_recommendation_model(rating_with_totalRatingCount, popularity_threshold=50):
    """Fit a cosine-distance nearest-neighbour model over popular movies.

    Parameters
    ----------
    rating_with_totalRatingCount : pandas.DataFrame
        Ratings table with at least the columns ``title``, ``userId``,
        ``rating`` and ``totalRatingCount``.
    popularity_threshold : int, default 50
        Minimum ``totalRatingCount`` a row needs in order to be kept.

    Returns
    -------
    tuple
        ``(model_knn, movie_features_df)`` where ``movie_features_df`` is the
        title-by-user rating matrix (missing ratings filled with 0) and
        ``model_knn`` is the ``NearestNeighbors`` instance fitted on it.

    Raises
    ------
    ValueError
        If no row reaches ``popularity_threshold``, so there is nothing to fit.
        NOTE(review): the original presumably also raised ValueError here, from
        inside the estimator's fit — confirm against scikit-learn.
    """
    # Keep only ratings of movies that are rated often enough to be meaningful.
    is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
    rating_popular_movie = rating_with_totalRatingCount[is_popular]
    if rating_popular_movie.empty:
        raise ValueError(
            'No movie has at least {0} ratings; lower popularity_threshold.'.format(
                popularity_threshold
            )
        )

    # One row per title, one column per user; an unrated pair becomes 0.
    # Duplicate (title, userId) pairs are combined by pivot_table's default
    # aggregation.
    movie_features_df = rating_popular_movie.pivot_table(
        index='title', columns='userId', values='rating'
    ).fillna(0)

    # Sparse representation of the (mostly zero) rating matrix.
    movie_features_df_matrix = csr_matrix(movie_features_df.values)

    # Brute-force search with cosine distance between movie rating vectors.
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(movie_features_df_matrix)
    return model_knn, movie_features_df

# Fit the model once and keep the feature matrix it was fitted on for lookups.
model, movie_features_df = create_recommendation_model(movie_data)

In [14]:
def get_recommendation(model_knn, movie_features_df, query_index=None, n_recommendations=5):
    """Print the nearest neighbours of one movie in the feature matrix.

    Parameters
    ----------
    model_knn : object
        Fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``
        that returns ``(distances, indices)``.
    movie_features_df : pandas.DataFrame
        Feature matrix whose index holds the movie titles, one row per movie.
    query_index : int, optional
        Row position of the movie to query. When omitted, a row is picked at
        random with ``np.random.choice``.
    n_recommendations : int, default 5
        Maximum number of neighbours to print.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    ValueError
        If the feature matrix has no rows or ``n_recommendations`` is < 1.
    IndexError
        If ``query_index`` is outside ``[0, number of rows)``.
    """
    n_movies = movie_features_df.shape[0]
    if n_movies == 0:
        raise ValueError('movie_features_df has no rows to recommend from.')
    if n_recommendations < 1:
        raise ValueError('n_recommendations must be at least 1.')

    if query_index is None:
        query_index = np.random.choice(n_movies)
    elif not 0 <= query_index < n_movies:
        raise IndexError(
            'query_index {0} is out of range for {1} movies.'.format(query_index, n_movies)
        )

    # Ask for one extra neighbour because the query movie normally comes back
    # as its own closest match; never ask for more rows than exist.
    n_neighbors = min(n_recommendations + 1, n_movies)
    query_vector = movie_features_df.iloc[query_index, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query_vector, n_neighbors=n_neighbors)

    print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))

    # Drop the query by index, not by position: with tied distances the query
    # is not guaranteed to be the first neighbour returned.
    neighbours = [
        (neighbour_index, distance)
        for neighbour_index, distance in zip(indices.flatten(), distances.flatten())
        if neighbour_index != query_index
    ][:n_recommendations]

    for rank, (neighbour_index, distance) in enumerate(neighbours, start=1):
        print('{0}: {1}, with distance of {2}:'.format(
            rank, movie_features_df.index[neighbour_index], distance))

In [15]:
# The query movie is chosen at random, so the output changes between runs.
get_recommendation(model, movie_features_df)

Recommendations for Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002):

1: X-Men: The Last Stand (2006), with distance of 0.4120607788570363:
2: I, Robot (2004), with distance of 0.41606357553095485:
3: Bruce Almighty (2003), with distance of 0.4163162506979303:
4: Star Wars: Episode III - Revenge of the Sith (2005), with distance of 0.4682346425000936:
5: Shrek 2 (2004), with distance of 0.46941838224020926:
