### Importing the required libraries

In [89]:
import os
import pandas as pd

### Loading the dataset
Loading movies data and ratings data

In [90]:
# Location of the MovieLens "ml-latest-small" folder. Set the
# MOVIELENS_DATA_PATH environment variable to point at a different copy of the
# dataset; when it is unset the original hard-coded location is used.
data_path = os.environ.get(
    'MOVIELENS_DATA_PATH',
    os.path.join('C://Users/Saurabh/Downloads/New_Movies_Data/', 'ml-latest-small/'),
)
# File names of the two CSVs read from data_path by the loading cells.
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

In [91]:
# Load the movies CSV, keeping only the id and title columns.
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
    encoding="ISO-8859-1",
    usecols=['movieId', 'title'],
    dtype=dict(movieId='int32', title='str'),
)


In [92]:
# Load the ratings CSV, keeping only the user id, movie id and rating columns.
df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype=dict(userId='int32', movieId='int32', rating='float32'),
)

### Printing the head of data to console

In [93]:
# Preview the first rows of the movies table as a sanity check on the load.
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [94]:
# Preview the first rows of the ratings table as a sanity check on the load.
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


### Filtering to movies and users that have at least 50 ratings in the dataset

In [95]:
# Number of ratings per movie, as a one-column frame indexed by movieId.
df_movies_cnt = df_ratings.groupby('movieId').size().to_frame('count')

In [96]:
# Collect the ids of all movies that have at least 50 ratings (count >= 50).
popular_movies = list(set(df_movies_cnt[df_movies_cnt['count'] >= 50].index))  # noqa

In [97]:
# Boolean array, one entry per ratings row: True where the row's movieId is in popular_movies.
movies_filter = df_ratings['movieId'].isin(popular_movies).values

In [98]:
# Number of ratings per user, as a one-column frame indexed by userId.
df_users_cnt = df_ratings.groupby('userId').size().to_frame('count')
# Ids of users who have submitted at least 50 ratings (count >= 50).
active_users = list(set(df_users_cnt[df_users_cnt['count'] >= 50].index))  # noqa
# Boolean array, one entry per ratings row: True where the row's userId is in active_users.
users_filter = df_ratings['userId'].isin(active_users).values

In [99]:
# Keep only the ratings rows that pass both the popular-movie and the active-user mask.
df_ratings_filtered = df_ratings.loc[movies_filter & users_filter]

In [100]:
# Preview the first rows of the filtered ratings.
df_ratings_filtered.head()

Unnamed: 0,userId,movieId,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


In [101]:
# Reshape the filtered ratings into a movie x user matrix: one row per movieId,
# one column per userId, cell = rating. Missing (movie, user) pairs become 0.
movie_user_mat = (
    df_ratings_filtered
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

In [102]:
# Map each movie title to its row position in movie_user_mat (titles are looked
# up in the order of movie_user_mat.index). If two rows share a title, the
# later row position overwrites the earlier one.
hashmap = {
    title: row_pos
    for row_pos, title in enumerate(
        df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
    )
}

### Importing csr_matrix and NearestNeighbors

In [103]:
from scipy.sparse import csr_matrix

In [104]:
from sklearn.neighbors import NearestNeighbors
#model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

In [105]:
# Create the estimator with library defaults; its hyperparameters are applied
# in a later cell through model.set_params(...).
model = NearestNeighbors()

In [106]:
# Minimum number of ratings for a movie / a user to be kept.
# NOTE(review): neither name is referenced in the visible cells; the filtering
# cells above hard-code 50 in their query strings instead of using these.
movie_rating_thres = 50
user_rating_thres = 50

In [107]:
# Point joblib's temp folder at /tmp, then configure the estimator:
# brute-force search with cosine distance, 20 neighbours, n_jobs=-1.
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
model.set_params(n_neighbors=20, algorithm='brute', metric='cosine', n_jobs=-1)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [108]:
# Convert the dense movie x user matrix to a SciPy CSR sparse matrix; row i of
# the sparse matrix corresponds to row i of movie_user_mat.
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [109]:
# Drop the intermediate frames that the visible cells below no longer use
# (they work from hashmap and movie_user_mat_sparse), then force a GC pass.
# Cells above this one must be re-run before any of the deleted names can be used again.
import gc
del (
    df_movies, df_movies_cnt, df_users_cnt,
    df_ratings, df_ratings_filtered, movie_user_mat,
)
gc.collect()

154

In [110]:
from fuzzywuzzy import fuzz
def _fuzzy_matching(hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

In [111]:
# Fit the nearest-neighbour model on the sparse movie x user matrix, so each
# movie (row) is an indexed sample.
model.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [112]:
# Title (or partial title) of the movie to base the recommendations on.
fav_movie = 'Jumanji'

In [113]:
# Resolve the free-text title to a row position in the movie x user matrix.
# NOTE(review): _fuzzy_matching returns None when nothing matches, and the
# kneighbors cell below indexes with idx without checking for that.
idx =_fuzzy_matching(hashmap, fav_movie)

Found possible matches in our database: ['Jumanji (1995)']



In [114]:
# How many recommendations to produce for the chosen movie.
n_recommendations = 20

In [115]:
# Query the fitted model with the chosen movie's row. One neighbour more than
# n_recommendations is requested because the next cell discards one result.
distances, indices = model.kneighbors(movie_user_mat_sparse[idx], n_neighbors=n_recommendations + 1)

In [116]:
# Pair each neighbour's row position with its distance and sort ascending by
# distance. The [:0:-1] slice then reverses the list and drops the entry that
# sorted first (the smallest distance -- presumably the query movie itself),
# so raw_recommends runs from the largest distance down to the smallest.
raw_recommends = sorted(
    zip(indices.squeeze().tolist(), distances.squeeze().tolist()),
    key=lambda pair: pair[1],
)[:0:-1]

In [117]:
# Invert the title -> row-position map so row positions resolve back to titles.
reverse_hashmap = {v: k for k, v in hashmap.items()}
print('Recommendations for {}:'.format(fav_movie))
# The loop variable is deliberately NOT called `idx`: that name holds the query
# movie's row position at notebook level, and overwriting it here would make a
# re-run of the kneighbors cell silently query a different movie.
for rank, (movie_idx, dist) in enumerate(raw_recommends, start=1):
    print('{0}: {1}, with distance of {2}'.format(rank, reverse_hashmap[movie_idx], dist))

Recommendations for Jumanji:
1: Waterworld (1995), with distance of 0.5631301403045654
2: Beetlejuice (1988), with distance of 0.5627744793891907
3: Batman Returns (1992), with distance of 0.5608212947845459
4: Lion King, The (1994), with distance of 0.5605781078338623
5: Stargate (1994), with distance of 0.5578533411026001
6: Goonies, The (1985), with distance of 0.556730329990387
7: Labyrinth (1986), with distance of 0.5536227226257324
8: Teenage Mutant Ninja Turtles (1990), with distance of 0.5535423755645752
9: Jurassic Park (1993), with distance of 0.5484600067138672
10: Space Jam (1996), with distance of 0.5411847829818726
11: Willow (1988), with distance of 0.5362755060195923
12: Star Wars: Episode I - The Phantom Menace (1999), with distance of 0.5318625569343567
13: Willy Wonka and the Chocolate Factory (1971), with distance of 0.5277585983276367
14: Mask, The (1994), with distance of 0.5231635570526123
15: Santa Clause, The (1994), with distance of 0.5206214189529419
16: Neve