import libraries

In [1]:
#source: https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea
#item based collaborative filtering with knn

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors



read data

In [2]:
# Folder that holds the MovieLens CSV files. Edit this single line to run the
# notebook against a copy of the data stored somewhere else.
DATA_DIR = 'C:/Users/burak/Desktop/bitirme_proje/movielens-large'

# movies.csv supplies the movie titles, ratings.csv the individual user ratings.
df_movies = pd.read_csv(DATA_DIR + '/movies.csv')
df_ratings = pd.read_csv(DATA_DIR + '/ratings.csv')

take a look at the data

In [3]:
# Preview the first rows of the movies table as loaded from disk.
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table as loaded from disk.
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


removing timestamp and genres columns

In [5]:
# Drop the rating timestamp; errors='ignore' keeps the cell safe to re-run
# once the column is already gone.
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')
# Drop the genre list from the movies table in the same way.
df_movies = df_movies.drop(columns=['genres'], errors='ignore')

In [6]:
# Confirm that the genres column is gone from the movies table.
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [7]:
# Confirm that the timestamp column is gone from the ratings table.
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


sizes of matrices

In [8]:
# (rows, columns) of the movies table is printed; the ratings shape is shown
# as the cell's output value.
print(df_movies.shape)
df_ratings.shape

(58098, 2)


(27753444, 3)

In [9]:
# Number of distinct users and movies that appear in the ratings table.
num_users = len(df_ratings.userId.unique())
num_items = len(df_ratings.movieId.unique())
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))

# How many times each rating value was given.
df_ratings_cnt_tmp = pd.DataFrame(df_ratings.groupby('rating').size(), columns=['count'])

# Every (user, movie) pair without a rating is counted as an implicit 0.0 rating.
total_cnt = num_users * num_items
rating_zero_cnt = total_cnt - df_ratings.shape[0]
# Add the zero-rating row. DataFrame.append is deprecated and was removed in
# pandas 2.0, so pd.concat is used; verify_integrity=True still raises if a
# 0.0 rating row already exists.
df_ratings_cnt = pd.concat(
    [df_ratings_cnt_tmp, pd.DataFrame({'count': rating_zero_cnt}, index=[0.0])],
    verify_integrity=True,
).sort_index()

# Natural log of the counts, to compare values that differ by orders of magnitude.
df_ratings_cnt['log_count'] = np.log(df_ratings_cnt['count'])

# Number of ratings received by each movie.
df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(), columns=['count'])

# Keep only ratings of movies that were rated at least `popularity_thres` times.
popularity_thres = 50
popular_movies = list(set(df_movies_cnt.query('count >= @popularity_thres').index))
df_ratings_drop_movies = df_ratings[df_ratings.movieId.isin(popular_movies)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', df_ratings_drop_movies.shape)



There are 283228 unique users and 53889 unique movies in this data set


  df_ratings_cnt = df_ratings_cnt_tmp.append(


shape of original ratings data:  (27753444, 3)
shape of ratings data after dropping unpopular movies:  (27430748, 3)


In [10]:
# Ratings submitted per user, counted after the unpopular movies were removed.
df_users_cnt = df_ratings_drop_movies.groupby('userId').size().to_frame('count')

# Users with fewer than `ratings_thres` ratings are treated as inactive and dropped.
ratings_thres = 50
active_users = list(set(df_users_cnt.index[df_users_cnt['count'] >= ratings_thres]))
df_ratings_drop_users = df_ratings_drop_movies.loc[
    df_ratings_drop_movies['userId'].isin(active_users)
]
print('shape of original ratings data: ', df_ratings.shape)
print(
    'shape of ratings data after dropping both unpopular movies and inactive users: ',
    df_ratings_drop_users.shape,
)

shape of original ratings data:  (27753444, 3)
shape of ratings data after dropping both unpopular movies and inactive users:  (24178982, 3)


In [11]:
# Reshape the filtered ratings into a table with one row per movie and one
# column per user; (movie, user) pairs without a rating are filled with 0.
# NOTE: this intermediate table is dense, so it is by far the most
# memory-hungry object in the notebook.
movie_user_mat = (
    df_ratings_drop_users
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
# Map each movie title to the position of its row in the matrix. Titles are
# looked up in the same order as the matrix rows.
movie_to_idx = {
    title: row_pos
    for row_pos, title in enumerate(
        df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
    )
}
# Compressed sparse row copy of the matrix, used for fitting and querying the model.
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [12]:
# Each movie is a vector with one coordinate per user, so the data is very
# high-dimensional; cosine distance is used instead of Euclidean distance.
# NearestNeighbors is already imported in the notebook's first cell, so it is
# not imported again here.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(movie_user_mat_sparse)

In [13]:
# Number of distinct users in the ratings table (before the inactive-user
# filter; df_ratings itself is not filtered above).
df_ratings['userId'].nunique()

283228

In [14]:
movie_user_mat_sparse.shape #our sparse matrix is in the shape of 'movies x users'

(13360, 109483)

In [15]:
def fuzzy_matching(mapper, fav_movie, verbose=True, threshold=60):
    """
    Return the index of the title that best matches the user's input.

    Every title in ``mapper`` is compared case-insensitively with
    ``fav_movie`` using ``fuzz.ratio``. Titles scoring below ``threshold``
    are ignored.

    Parameters
    ----------
    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    verbose: bool, print the list of candidate titles if True

    threshold: int, minimum fuzzy ratio a title needs to count as a match
        (defaults to 60, the cutoff that used to be hard-coded)

    Return
    ------
    index of the closest match, or None if no title reaches the threshold
    """
    # The lowered query is the same for every title, so compute it once.
    query = fav_movie.lower()
    candidates = []
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), query)
        if ratio >= threshold:
            candidates.append((title, idx, ratio))
    if not candidates:
        # This message is printed regardless of `verbose`, as before.
        print('Oops! No match is found')
        return None
    # Stable ascending sort followed by a reversal puts the best score first.
    # Titles with equal scores end up in reverse mapper order, which keeps the
    # tie-breaking identical to the previous implementation.
    candidates.sort(key=lambda match: match[2])
    candidates.reverse()
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in candidates]))
    return candidates[0][1]

In [16]:
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    Print the movies most similar to the user's input movie.

    The input title is resolved to a matrix row with ``fuzzy_matching``, the
    nearest neighbours of that row are looked up with ``model_knn``, and the
    input movie itself is removed from the result before printing.

    Parameters
    ----------
    model_knn: sklearn model, knn model

    data: movie-user matrix

    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    Return
    ------
    None. The recommendations are printed, ordered from the largest distance
    to the smallest, so the LAST printed line is the closest match. Nothing
    is printed after the "no match" message when the title cannot be resolved.
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    if idx is None:
        # fuzzy_matching has already reported the failure; indexing the matrix
        # with None would not select a movie row.
        return
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    # One extra neighbour is requested because the query movie is normally its
    # own nearest neighbour. Never ask for more neighbours than there are rows.
    n_neighbors = min(n_recommendations + 1, data.shape[0])
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_neighbors)
    # np.ravel (unlike squeeze) still yields a list when only one neighbour is returned.
    neighbours = sorted(
        zip(np.ravel(indices).tolist(), np.ravel(distances).tolist()),
        key=lambda pair: pair[1],
    )
    # Remove the query movie by index rather than by position, then keep the
    # n closest remaining movies.
    closest = [(i, d) for i, d in neighbours if i != idx][:n_recommendations]
    # Display order is farthest first, matching the previous output.
    raw_recommends = closest[::-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for rank, (movie_idx, dist) in enumerate(raw_recommends, start=1):
        print('{0}: {1}, with distance of {2}'.format(rank, reverse_mapper[movie_idx], dist))

In [17]:

# Query title; it is fuzzy-matched against the titles in movie_to_idx.
my_favorite = 'Iron Man'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have input movie: Iron Man
Found possible matches in our database: ['Iron Man (2008)', 'Iron Man 3 (2013)', 'Iron Man 2 (2010)']

Recommendation system start to make inference
......

Recommendations for Iron Man:
1: Bourne Ultimatum, The (2007), with distance of 0.42180498388093424
2: Sherlock Holmes (2009), with distance of 0.41908921829125967
3: Inception (2010), with distance of 0.3929465932048053
4: Avatar (2009), with distance of 0.3832303054444628
5: WALL·E (2008), with distance of 0.3831652390647853
6: Star Trek (2009), with distance of 0.375028104539211
7: Batman Begins (2005), with distance of 0.3701491860846239
8: Iron Man 2 (2010), with distance of 0.3701113597259079
9: Avengers, The (2012), with distance of 0.35798574751515333
10: Dark Knight, The (2008), with distance of 0.30102135145953235


In [18]:

# Lower-case query without parentheses, to exercise the fuzzy title matching.
my_favorite = 'now you see me 2013'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have input movie: now you see me 2013
Found possible matches in our database: ['Now You See Me (2013)', 'Now You See Me 2 (2016)', 'In Your Eyes (2014)', 'Man of Steel (2013)', 'Company You Keep, The (2012)']

Recommendation system start to make inference
......

Recommendations for now you see me 2013:
1: Avengers, The (2012), with distance of 0.5758924619035004
2: Sherlock Holmes: A Game of Shadows (2011), with distance of 0.5706007564099471
3: Avengers: Age of Ultron (2015), with distance of 0.5655001319622228
4: Captain America: The Winter Soldier (2014), with distance of 0.5647702067643393
5: Guardians of the Galaxy (2014), with distance of 0.5622876565983974
6: Iron Man 3 (2013), with distance of 0.5616610096312503
7: Deadpool (2016), with distance of 0.5583411086302532
8: Limitless (2011), with distance of 0.5543582241474001
9: Kingsman: The Secret Service (2015), with distance of 0.5373726743526237
10: Now You See Me 2 (2016), with distance of 0.48478975972882266


In [20]:

# Partial title without a release year.
my_favorite = 'little mermaid'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have input movie: little mermaid
Found possible matches in our database: ['Little Mermaid, The (1989)', 'The Mermaid (2016)']

Recommendation system start to make inference
......

Recommendations for little mermaid:
1: Mulan (1998), with distance of 0.5328251842054996
2: 101 Dalmatians (One Hundred and One Dalmatians) (1961), with distance of 0.5290031017232699
3: Aladdin (1992), with distance of 0.5203074865337098
4: Mary Poppins (1964), with distance of 0.5198336906092028
5: Peter Pan (1953), with distance of 0.5063648185319208
6: Beauty and the Beast (1991), with distance of 0.5053706947067479
7: Cinderella (1950), with distance of 0.5032474026006502
8: Jungle Book, The (1967), with distance of 0.5010069489685072
9: Sleeping Beauty (1959), with distance of 0.4965028390874614
10: Lady and the Tramp (1955), with distance of 0.456374037093525
