In [1]:
import pandas as pd
import numpy as np 
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from collections import defaultdict

In [2]:
from surprise import KNNBasic, accuracy, KNNWithMeans
from surprise.model_selection import train_test_split, GridSearchCV

In [3]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


ratings from 0 to 5

In [6]:
ratings.userId.nunique(), ratings.movieId.nunique()

(610, 9724)

In [7]:
sorted(ratings.rating.unique())

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

In [8]:
movieIds = np.sort(ratings.movieId.unique())

In [9]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# KNN Based approach Item-Item Similarity

In [10]:
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 5],
    "user_based": [False]
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.9058940856880865
{'sim_options': {'name': 'msd', 'min_support': 3, 'user_based': False}}


KNNWithMeans works better than KNNBasic

Hence, would train using KNNWithMeans  using full data

In [11]:
# build model on whole data
train = data.build_full_trainset()

In [12]:
knn2 = KNNWithMeans(sim_options={'name': 'msd', 'min_support': 3, 'user_based': False})
knn2.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x12183bdc0>

In [13]:
sim = knn2.compute_similarities()

Computing the msd similarity matrix...
Done computing similarity matrix.


In [14]:
def get_neighbors(knn, train, riid, n=5, verbose=False):
    try:    
        neighbor_innner_iids = knn.get_neighbors(train.to_inner_iid(riid),n)
        if verbose:
            print('neighbor_innner_iids  - ', neighbor_innner_iids)
    except:
        print(riid)
        return []
    neighbor_raw_iids = [train.to_raw_iid(i) for i in neighbor_innner_iids]
    if verbose:
        print('neighbor_raw_iids  - ', neighbor_raw_iids)

    return neighbor_raw_iids

In [15]:
get_neighbors(knn2, train, 1)

[7766, 2940, 2967, 178061, 5833]

# Last rated movie for every user

In [16]:
# last rated - last movie rated as per timestamp having more than 3 rating
last_rated = ratings[ratings.rating>=3].sort_values(['userId', 'timestamp'], ascending=[True, False]).groupby('userId').head(1)
last_rated.head()

Unnamed: 0,userId,movieId,rating,timestamp
161,1,2492,4.0,965719662
247,2,80489,4.5,1445715340
283,3,3024,4.5,1306464054
502,4,4246,4.0,1007574542
527,5,247,5.0,847435337


In [17]:
last_rated['inner_iid'] = last_rated.movieId.apply(lambda x: train.to_inner_iid(x))

In [18]:
# get top 5 recommendation based on last rated movie (rated 3  or above)
last_rated['last_rated_rec'] = last_rated.movieId.apply(lambda x: get_neighbors(knn2, train, x))

In [19]:
# create id to title mapping
movie_title_map = movies[['movieId','title']].set_index('movieId').to_dict()['title']

In [20]:
# derive titles from the ids - to manually check
last_rated['last_rated_movie'] = last_rated['movieId'].apply(lambda x: movie_title_map.get(x,""))
last_rated['recommended_movies'] = last_rated['last_rated_rec'].apply(lambda x: [movie_title_map.get(i,"") for i in x])

In [21]:
last_rated

Unnamed: 0,userId,movieId,rating,timestamp,inner_iid,last_rated_rec,last_rated_movie,recommended_movies
161,1,2492,4.0,965719662,161,"[367, 2054, 329, 1527, 1375]",20 Dates (1998),"[Mask, The (1994), Honey, I Shrunk the Kids (1..."
247,2,80489,4.5,1445715340,245,"[2944, 1084, 161582, 1997, 10]","Town, The (2010)","[Dirty Dozen, The (1967), Bonnie and Clyde (19..."
283,3,3024,4.5,1306464054,275,"[2641, 1876, 1375, 2699, 316]",Piranha (1978),"[Superman II (1980), Deep Impact (1998), Star ..."
502,4,4246,4.0,1007574542,446,"[428, 4880, 2819, 2889, 72605]",Bridget Jones's Diary (2001),"[Bronx Tale, A (1993), Life as a House (2001),..."
527,5,247,5.0,847435337,306,"[1057, 1885, 8529, 3189, 61132]",Heavenly Creatures (1994),"[Everyone Says I Love You (1996), Opposite of ..."
...,...,...,...,...,...,...,...,...
97754,606,2355,3.0,1368460577,1600,"[493, 104913, 1011, 2940, 3985]","Bug's Life, A (1998)","[Menace II Society (1993), Rush (2013), Herbie..."
98664,607,4069,3.0,997847203,3311,"[362, 2430, 6006, 6264, 6294]","Wedding Planner, The (2001)","[Jungle Book, The (1994), Mighty Joe Young (19..."
99494,608,52245,3.0,1189563917,2985,"[4990, 84772, 42725, 48520, 31433]",Blades of Glory (2007),"[Jimmy Neutron: Boy Genius (2001), Paul (2011)..."
99523,609,650,3.0,847221080,7423,"[62, 1059, 592, 648, 110]",Moll Flanders (1996),"[Mr. Holland's Opus (1995), William Shakespear..."


In [22]:
get_neighbors(knn2,train,2492, verbose=True)

neighbor_innner_iids  -  [22, 125, 599, 1030, 1139]
neighbor_raw_iids  -  [367, 2054, 329, 1527, 1375]


[367, 2054, 329, 1527, 1375]

In [23]:
movies[movies.movieId.isin([2492,367, 2054, 329, 1527, 1375])]

Unnamed: 0,movieId,title,genres
287,329,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi
325,367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
1058,1375,Star Trek III: The Search for Spock (1984),Action|Adventure|Sci-Fi
1158,1527,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi
1522,2054,"Honey, I Shrunk the Kids (1989)",Adventure|Children|Comedy|Fantasy|Sci-Fi
1874,2492,20 Dates (1998),Comedy|Romance


In [24]:
def get_common_reviews(train, m1, m2):
    m1_ratings = train.ir[train.to_inner_iid(m1)]
    m2_ratings = train.ir[train.to_inner_iid(m2)]
    
    m1_ratings_dict = {i:j for i, j in m1_ratings}
    m2_ratings_dict = {i:j for i, j in m2_ratings}
    common = set(m1_ratings_dict.keys()).intersection(set(m2_ratings_dict.keys()))
    print('m1 ratings: ', {k: v for k, v in m1_ratings_dict.items() if k in common})
    print('m2 ratings: ', {k: v for k, v in m2_ratings_dict.items() if k in common})

In [25]:
get_common_reviews(train, 2492, 1375)

m1 ratings:  {44: 4.0, 185: 4.0, 554: 3.0}
m2 ratings:  {44: 4.0, 185: 4.0, 554: 3.0}


In [26]:
last_rated.to_pickle('last_rated.pkl')

# Recommend 5 user for movie starting with S and A

In [27]:
sa_movies = movies[(movies.title.str.startswith('A'))|(movies.title.str.startswith('S'))]
sa_ratings = ratings[ratings.movieId.isin(sa_movies.movieId.unique())][['userId', 'movieId', 'rating']]
sa_data = Dataset.load_from_df(sa_ratings, reader=reader)
sa_train = sa_data.build_full_trainset()

In [28]:
# predicting only for the pairs which are not rated already
sa_test = sa_train.build_anti_testset()
predictions = knn2.test(sa_test)

In [29]:
def get_top_n(predictions, n=5):
    """Return the top-N recommended for each movie from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each movie. Default
            is 5.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each movie.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[iid].append((uid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for iid, movie_ratings in top_n.items():
        movie_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[iid] = movie_ratings[:n]

    return top_n

In [30]:
recommended_users = get_top_n(predictions)

In [31]:
sa_movies['rec_users'] = sa_movies.movieId.apply(lambda x: [u for u, r in recommended_users[x]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sa_movies['rec_users'] = sa_movies.movieId.apply(lambda x: [u for u, r in recommended_users[x]])


In [32]:
sa_movies.head()

Unnamed: 0,movieId,title,genres,rec_users
6,7,Sabrina (1995),Comedy|Romance,"[53, 543, 452, 584, 171]"
8,9,Sudden Death (1995),Action,"[53, 543, 236, 43, 475]"
10,11,"American President, The (1995)",Comedy|Drama|Romance,"[53, 452, 154, 12, 584]"
16,17,Sense and Sensibility (1995),Drama|Romance,"[53, 276, 43, 12, 154]"
18,19,Ace Ventura: When Nature Calls (1995),Comedy,"[53, 43, 452, 594, 543]"


In [33]:
sa_movies.to_pickle('sa_movies.pkl')