In [1]:
import pandas as pd
import numpy as np 
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from collections import defaultdict

In [2]:
from surprise import KNNBasic, accuracy, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise.dump import dump

In [3]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


ratings from 0 to 5

In [6]:
ratings.userId.nunique(), ratings.movieId.nunique()

(610, 9724)

In [7]:
sorted(ratings.rating.unique())

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

In [8]:
movieIds = np.sort(ratings.movieId.unique())
movie_id_to_idx_map = {v: i for i, v in enumerate(movieIds)}
movie_idx_to_id_map = {i: v for i, v in enumerate(movieIds)}

In [9]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# KNN Based approach Item-Item Similarity

In [10]:
sim_options = {'user_based': False} # item item based
knn = KNNBasic(k=30, sim_options=sim_options)
knn2 = KNNWithMeans(k=30, sim_options=sim_options)

In [11]:
# evaluate using cross validation
cv_knn = cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9147  0.9058  0.9131  0.9152  0.9121  0.9122  0.0034  
MAE (testset)     0.7023  0.6981  0.7034  0.7063  0.7040  0.7028  0.0027  
Fit time          6.77    6.71    6.94    6.83    6.44    6.74    0.17    
Test time         10.14   9.82    9.86    10.08   9.79    9.94    0.14    


In [12]:
cv_knn2 = cross_validate(knn2, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8961  0.8940  0.8862  0.9013  0.9008  0.8957  0.0055  
MAE (testset)     0.6866  0.6853  0.6770  0.6866  0.6882  0.6848  0.0040  
Fit time          6.64    6.80    6.83    8.79    10.72   7.96    1.59    
Test time         10.05   9.92    10.05   12.94   12.34   11.06   1.31    


KNNWithMeans works better than KNNBasic

Hence, would train using KNNWithMeans  using full data

In [13]:
# build model on whole data
train = data.build_full_trainset()

In [14]:
knn2.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1229dbd30>

In [15]:
knn2.get_neighbors(0,5)

[96, 280, 287, 326, 367]

# Last rated movie for every user

In [16]:
# last rated - last movie rated as per timestamp having more than 3 rating
last_rated = ratings[ratings.rating>=3].sort_values(['userId', 'timestamp'], ascending=[True, False]).groupby('userId').head(1)
last_rated.head()

Unnamed: 0,userId,movieId,rating,timestamp
161,1,2492,4.0,965719662
247,2,80489,4.5,1445715340
283,3,3024,4.5,1306464054
502,4,4246,4.0,1007574542
527,5,247,5.0,847435337


In [17]:
def movieId_to_movieIdx(movieId):
    return movie_id_to_idx_map.get(movieId, -1)

def movieIdx_to_movieId(movieIdx):
    return movie_idx_to_id_map.get(movieIdx, -1)

In [18]:
def get_top_n_movies_ids_itemitem(knn, movieId, n=5):
    '''
    using knn basedd item-item model,  find the top n neighbours based on similarity scores
    returns: list of n movieIds
    '''
    midx = movieId_to_movieIdx(movieId)
    if midx < 0:
        return "Movie not found"
    return [movieIdx_to_movieId(i) for i in knn.get_neighbors(midx, n)]

In [19]:
# get top 5 recommendation based on last rated movie (rated 3  or above)
last_rated['last_rated_rec'] = last_rated.movieId.apply(lambda x: get_top_n_movies_ids_itemitem(knn2, x))

In [20]:
# create id to title mapping
movie_title_map = movies[['movieId','title']].set_index('movieId').to_dict()['title']

In [21]:
# derive titles from the ids - to manually check
last_rated['last_rated_movie'] = last_rated['movieId'].apply(lambda x: movie_title_map.get(x,""))
last_rated['recommended_movies'] = last_rated['last_rated_rec'].apply(lambda x: [movie_title_map.get(i,"") for i in x])

In [22]:
last_rated

Unnamed: 0,userId,movieId,rating,timestamp,last_rated_rec,last_rated_movie,recommended_movies
161,1,2492,4.0,965719662,"[24, 53, 121, 149, 151]",20 Dates (1998),"[Powder (1995), Lamerica (1994), Boys of St. V..."
247,2,80489,4.5,1445715340,"[6, 8, 70, 83, 118]","Town, The (2010)","[Heat (1995), Tom and Huck (1995), From Dusk T..."
283,3,3024,4.5,1306464054,"[27, 36, 38, 45, 64]",Piranha (1978),"[Now and Then (1995), Dead Man Walking (1995),..."
502,4,4246,4.0,1007574542,"[103, 151, 229, 304, 384]",Bridget Jones's Diary (2001),"[Unforgettable (1996), Rob Roy (1995), Death a..."
527,5,247,5.0,847435337,"[47, 108, 128, 166, 178]",Heavenly Creatures (1994),"[Seven (a.k.a. Se7en) (1995), Catwalk (1996), ..."
...,...,...,...,...,...,...,...
97754,606,2355,3.0,1368460577,"[151, 558, 606, 680, 700]","Bug's Life, A (1998)","[Rob Roy (1995), Pagemaster, The (1994), Candy..."
98664,607,4069,3.0,997847203,"[9, 10, 44, 52, 53]","Wedding Planner, The (2001)","[Sudden Death (1995), GoldenEye (1995), Mortal..."
99494,608,52245,3.0,1189563917,"[6, 7, 18, 25, 42]",Blades of Glory (2007),"[Heat (1995), Sabrina (1995), Four Rooms (1995..."
99523,609,650,3.0,847221080,"[151, 193, 248, 257, 261]",Moll Flanders (1996),"[Rob Roy (1995), Showgirls (1995), Houseguest ..."


# Recommend 5 user for movie starting with S and A

In [23]:
sa_movies = movies[(movies.title.str.startswith('A'))|(movies.title.str.startswith('S'))]
sa_ratings = ratings[ratings.movieId.isin(sa_movies.movieId.unique())][['userId', 'movieId', 'rating']]
sa_data = Dataset.load_from_df(sa_ratings, reader=reader)
sa_train = sa_data.build_full_trainset()

In [24]:
sa_test = sa_train.build_anti_testset()
predictions = knn2.test(sa_test)

In [25]:
def get_top_n(predictions, n=5):
    """Return the top-N recommended for each movie from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each movie. Default
            is 5.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each movie.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[iid].append((uid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for iid, movie_ratings in top_n.items():
        movie_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[iid] = movie_ratings[:n]

    return top_n

In [26]:
recommended_users = get_top_n(predictions)

In [27]:
sa_movies['rec_users'] = sa_movies.movieId.apply(lambda x: [u for u, r in recommended_users[x]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sa_movies['rec_users'] = sa_movies.movieId.apply(lambda x: [u for u, r in recommended_users[x]])


In [28]:
sa_movies.head()

Unnamed: 0,movieId,title,genres,rec_users
6,7,Sabrina (1995),Comedy|Romance,"[53, 543, 584, 586, 171]"
8,9,Sudden Death (1995),Action,"[43, 53, 594, 543, 452]"
10,11,"American President, The (1995)",Comedy|Drama|Romance,"[53, 452, 1, 12, 584]"
16,17,Sense and Sensibility (1995),Drama|Romance,"[43, 53, 12, 1, 452]"
18,19,Ace Ventura: When Nature Calls (1995),Comedy,"[43, 53, 452, 171, 93]"
