import libraries

In [45]:
#source: https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea
#item based collaborative filtering with knn


import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz

import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

read data

In [46]:
df_movies = pd.read_csv('../knn/database/ml-latest-small/ml-latest-small/movies.csv')
df_ratings = pd.read_csv('../knn/database/ml-latest-small/ml-latest-small/ratings.csv')
#movies = pd.read_csv('D:/BİTİRME/knn/database/ml-100k/u.csv')
#credits = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv') credits okunmadı

take a look at the data

In [47]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [48]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


removing timestamp and genres columns

In [49]:
df_ratings = df_ratings.loc[:, df_ratings.columns != 'timestamp']
#dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float64'}
df_movies = df_movies.loc[:, df_movies.columns != 'genres']
dtype={'movieId': 'int32', 'title': 'str'}

In [50]:
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [51]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


sizes of matirices

In [52]:
print(df_movies.shape)
df_ratings.shape

(9742, 2)


(100836, 3)

In [53]:
from scipy.sparse import csr_matrix


# pivot ratings into movie features
df_movie_features = df_ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0) #filling unknown values to zero to create a sparse matrix


# convert dataframe of movie features to scipy sparse matrix
mat_movie_features = csr_matrix(df_movie_features.values) #has 'movies x users'

In [54]:
# pivot and create movie-user matrix
movie_user_mat = df_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)

# create mapper from movie title to index
movie_to_idx = {
    movie: i for i, movie in 
    enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
}
# transform matrix to scipy sparse matrix
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [55]:
#since our data has high dimensionality we will be using cosine similarity instead of euclidean
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(movie_user_mat_sparse)

In [56]:
#number of users
df_ratings['userId'].nunique()

610

In [57]:
movie_user_mat_sparse.shape #our sparse matrix is in the shpae of 'movies x users'

(9724, 610)

In [58]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    return the closest match via fuzzy ratio. If no match found, return None
    
    Parameters
    ----------    
    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie
    
    verbose: bool, print log if True

    Return
    ------
    index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]

In [59]:
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    return top n similar movie recommendations based on user's input movie


    Parameters
    ----------
    model_knn: sklearn model, knn model

    data: movie-user matrix

    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    Return
    ------
    list of top n similar movie recommendations
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # get list of raw idx of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [60]:

my_favorite = 'Iron Man'

make_recommendation(
    model_knn=model_knn,
    data=mat_movie_features,
    fav_movie=my_favorite,
    mapper=movie_to_idx,
    n_recommendations=10)

You have input movie: Iron Man
Found possible matches in our database: ['Iron Man (1931)', 'Iron Man (2008)', 'Iron Man 3 (2013)', 'Iron Man 2 (2010)']

Recommendation system start to make inference
......

Recommendations for Iron Man:
1: Iron Man 2 (2010), with distance of 0.8378452406436285
2: Deadpool (2016), with distance of 0.8287028225431171
3: Captain America: The First Avenger (2011), with distance of 0.8284499215114449
4: Thor (2011), with distance of 0.8100078666989077
5: Iron Man 3 (2013), with distance of 0.8084347429557697
6: Easy A (2010), with distance of 0.8012997197858369
7: 21 Jump Street (2012), with distance of 0.8012997197858369
8: Ex Machina (2015), with distance of 0.7914855859429253
9: Avengers: Age of Ultron (2015), with distance of 0.7903430326556163
10: Iron Man (1931), with distance of 0.0


In [64]:

my_favorite = 'John Wicmk'

make_recommendation(
    model_knn=model_knn,
    data=mat_movie_features,
    fav_movie=my_favorite,
    mapper=movie_to_idx,
    n_recommendations=10)

You have input movie: John Wicmk
Found possible matches in our database: ['John Wick (2014)']

Recommendation system start to make inference
......

Recommendations for John Wicmk:
1: Kingsman: The Secret Service (2015), with distance of 0.4586720292564803
2: Suicide Squad (2016), with distance of 0.458205226713179
3: John Wick: Chapter Two (2017), with distance of 0.4521944603296988
4: This Is the End (2013), with distance of 0.4491682129020782
5: Dredd (2012), with distance of 0.44694313705193744
6: Deadpool (2016), with distance of 0.43940792888296076
7: Rogue One: A Star Wars Story (2016), with distance of 0.43465005008595414
8: Fast Five (Fast and the Furious 5, The) (2011), with distance of 0.43439615349671235
9: Snowpiercer (2013), with distance of 0.3812274376562589
10: Mad Max: Fury Road (2015), with distance of 0.36147672673719033
