In [1]:
import pandas as pd
import numpy as np 
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [2]:
from surprise import KNNBasic, accuracy
from surprise.model_selection import train_test_split
from surprise.dump import dump

In [3]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


ratings from 0 to 5

In [6]:
ratings.userId.nunique(), ratings.movieId.nunique()

(610, 9724)

In [7]:
sorted(ratings.rating.unique())

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

In [8]:
movieIds = np.sort(ratings.movieId.unique())
movie_id_to_idx_map = {v: i for i, v in enumerate(movieIds)}
movie_idx_to_id_map = {i: v for i, v in enumerate(movieIds)}

In [9]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# KNN Based approach Item-Item Similarity

In [10]:
sim_options = {'user_based': False} # item item based
knn = KNNBasic(k=30, sim_options=sim_options)

In [11]:
# evaluate using cross validation
cv_knn = cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9164  0.9159  0.9145  0.9159  0.9011  0.9128  0.0059  
MAE (testset)     0.7065  0.7042  0.7048  0.7070  0.6968  0.7039  0.0037  
Fit time          6.30    6.54    6.41    6.50    6.50    6.45    0.09    
Test time         9.83    9.67    9.75    9.87    9.76    9.78    0.07    


In [12]:
# build model on whole data
train = data.build_full_trainset()

In [13]:
knn.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1210e54f0>

In [14]:
knn.get_neighbors(0,5)

[96, 280, 287, 326, 367]

# Last rated movie for every user

In [15]:
# last rated - last movie rated as per timestamp having more than 3 rating
last_rated = ratings[ratings.rating>=3].sort_values(['userId', 'timestamp'], ascending=[True, False]).groupby('userId').head(1)
last_rated.head()

Unnamed: 0,userId,movieId,rating,timestamp
161,1,2492,4.0,965719662
247,2,80489,4.5,1445715340
283,3,3024,4.5,1306464054
502,4,4246,4.0,1007574542
527,5,247,5.0,847435337


In [16]:
def movieId_to_movieIdx(movieId):
    return movie_id_to_idx_map.get(movieId, -1)

def movieIdx_to_movieId(movieIdx):
    return movie_idx_to_id_map.get(movieIdx, -1)

In [17]:
def get_top_n_movies_ids_itemitem(knn, movieId, n=5):
    '''
    using knn basedd item-item model,  find the top n neighbours based on similarity scores
    returns: list of n movieIds
    '''
    midx = movieId_to_movieIdx(movieId)
    if midx < 0:
        return "Movie not found"
    return [movieIdx_to_movieId(i) for i in knn.get_neighbors(midx, n)]

In [18]:
# get top 5 recommendation based on last rated movie (rated 3  or above)
last_rated['last_rated_rec'] = last_rated.movieId.apply(lambda x: get_top_n_movies_ids_itemitem(knn, x))

In [19]:
# create id to title mapping
movie_title_map = movies[['movieId','title']].set_index('movieId').to_dict()['title']

In [20]:
# derive titles from the ids - to manually check
last_rated['last_rated_movie'] = last_rated['movieId'].apply(lambda x: movie_title_map.get(x,""))
last_rated['recommended_movies'] = last_rated['last_rated_rec'].apply(lambda x: [movie_title_map.get(i,"") for i in x])

In [21]:
last_rated

Unnamed: 0,userId,movieId,rating,timestamp,last_rated_rec,last_rated_movie,recommended_movies
161,1,2492,4.0,965719662,"[24, 53, 121, 149, 151]",20 Dates (1998),"[Powder (1995), Lamerica (1994), Boys of St. V..."
247,2,80489,4.5,1445715340,"[6, 8, 70, 83, 118]","Town, The (2010)","[Heat (1995), Tom and Huck (1995), From Dusk T..."
283,3,3024,4.5,1306464054,"[27, 36, 38, 45, 64]",Piranha (1978),"[Now and Then (1995), Dead Man Walking (1995),..."
502,4,4246,4.0,1007574542,"[103, 151, 229, 304, 384]",Bridget Jones's Diary (2001),"[Unforgettable (1996), Rob Roy (1995), Death a..."
527,5,247,5.0,847435337,"[47, 108, 128, 166, 178]",Heavenly Creatures (1994),"[Seven (a.k.a. Se7en) (1995), Catwalk (1996), ..."
...,...,...,...,...,...,...,...
97754,606,2355,3.0,1368460577,"[151, 558, 606, 680, 700]","Bug's Life, A (1998)","[Rob Roy (1995), Pagemaster, The (1994), Candy..."
98664,607,4069,3.0,997847203,"[9, 10, 44, 52, 53]","Wedding Planner, The (2001)","[Sudden Death (1995), GoldenEye (1995), Mortal..."
99494,608,52245,3.0,1189563917,"[6, 7, 18, 25, 42]",Blades of Glory (2007),"[Heat (1995), Sabrina (1995), Four Rooms (1995..."
99523,609,650,3.0,847221080,"[151, 193, 248, 257, 261]",Moll Flanders (1996),"[Rob Roy (1995), Showgirls (1995), Houseguest ..."
