https://github.com/yjeong5126/movie_recommender/blob/master/item_based_collaborative_filtering/item_based_collaborative_filtering.ipynb

# Item-based Collaborative Filtering

### Main idea: For item i, find other similar items

### Method: Estimate rating for item i based on ratings for similar items

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
all_ratings = pd.read_csv("all_ratings.csv", index_col=False)
len(all_ratings)

1031175

In [4]:
all_ratings = all_ratings[:40000] # subset chosen to address memory constraints in later parts of pipeline
len(all_ratings)

40000

In [5]:
unique_user_ids = all_ratings.userId.unique()
# reindex the userId
user_id_mapping = {user_id: i + 1 for i, user_id in enumerate(unique_user_ids)}

# Map the 'userId' column using the mapping dictionary
all_ratings['userId'] = all_ratings['userId'].map(user_id_mapping)


In [6]:
# all_ratings = all_ratings.drop(['Unnamed: 0'],axis=1)
# all_ratings = all_ratings.reset_index(drop=True)
all_ratings.columns

Index(['userId', 'ISBN', 'bookRating', 'bookTitle', 'bookAuthor',
       'yearOfPublication', 'publisher', 'location', 'age'],
      dtype='object')

In [7]:
all_ratings.head()

Unnamed: 0,userId,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication,publisher,location,age
0,1,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",
1,2,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,"cincinnati, ohio, usa",23.0
3,2,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
4,2,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0


In [10]:
item_to_check = "The Great Gatsby"

In [11]:
def users_read_book(book_title):
    users_list = all_ratings[all_ratings['bookTitle'] == book_title]['userId'].tolist()
    return users_list

users_list_for_book = users_read_book(item_to_check)

print(f"Users who have read '{item_to_check}':")
print(users_list_for_book)

Users who have read 'The Great Gatsby':
[29, 70, 86]


In [12]:
all_ratings.userId.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [13]:
all_ratings = all_ratings[['bookTitle','userId','bookRating']]

In [15]:
import pandas as pd
from surprise import Dataset
from surprise import Reader

In [16]:
all_ratings.userId.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [17]:
reader = Reader(rating_scale=(1, 10))

In [18]:
all_ratings = all_ratings[all_ratings['bookRating']!=0]
df = all_ratings[['bookTitle','userId','bookRating']]
df.head()

In [19]:
all_ratings.bookRating.unique()

array([ 5,  9,  8,  7,  6, 10,  3,  4,  2,  1])

In [21]:
desired_bookTitle = 'Seal it with a Kiss'
# desired_bookTitle = 'The Great Gatsby'
# specified_userid = 2313
specified_userid = 2
already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])

In [22]:
from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

In [23]:
algo = KNNWithMeans(sim_options=sim_options)

reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(df, reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f6b78fda110>

In [24]:
prediction = algo.predict(specified_userid, desired_bookTitle)
prediction.est

7.999938979741274

## User-based Collaborative Filtering

In [25]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Gridsearch to get the best User-based Collaborative filtering model
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [True],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

In [26]:
sim_options = gs.best_params["rmse"]['sim_options']
algo = KNNWithMeans(sim_options=sim_options)

all_ratings = all_ratings[all_ratings['bookRating']!=0]
reader = Reader(rating_scale=(1, 10))
reader = Dataset.load_from_df(all_ratings[['userId', 'bookTitle', 'bookRating']], reader).build_full_trainset()
algo.fit(reader)

data = Dataset.load_from_df(all_ratings[["userId", "bookTitle", "bookRating"]], reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f6b78fda7d0>

In [27]:
# Get the user's top N recommendations
k = 5  # k num of neighbours
num_recs = 10 # N num of recommendations

user_inner_id = algo.trainset.to_inner_uid(specified_userid)
user_neighbors = algo.get_neighbors(user_inner_id, k=k)

# Convert inner IDs back to the original user IDs
user_neighbors = [algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors]

#Get list of user's nearest neighbors based on the trained model
# -->Recommend books from these nearest neighbors' rated books
recommended_books = []

for neighbor_userid in user_neighbors:
    
    neighbor_ratings = all_ratings[all_ratings["userId"] == neighbor_userid]

    # Filter out books that the specified user has already rated
    # already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])
    neighbor_ratings = neighbor_ratings[~neighbor_ratings["bookTitle"].isin([already_read_list])] 
    

    # Sort by bookRating in descending order to recommend the highest-rated books
    neighbor_ratings = neighbor_ratings.sort_values(by="bookRating", ascending=False)

    # take the top-rated books from each neighbor and add them to the recommended_books list
    top_n_books = neighbor_ratings.head(num_recs)  
    recommended_books.extend(top_n_books["bookTitle"].tolist())



In [28]:
len(recommended_books)

50

In [29]:
from pprint import pprint

print("Recommended books for user", specified_userid, ":")
pprint(recommended_books)

Recommended books for user 2 :
['Go Ask Alice (AvonFlare Book)',
 'The Cat Who Went up the Creek',
 'Riding Shotgun',
 'The World According to Garp',
 'Considering Kate (The Stanislaskis) (Silhouette Special Edition)',
 'Last Days of Summer',
 'The Trial and Death of Socrates : Four Dialogues (Dover Thrift Editions)',
 "The Quilter's Apprentice",
 'Walking Across Egypt',
 'Paris to the Moon',
 'Roses Are Red (Alex Cross Novels)',
 'Toot & Puddle',
 'A Little Look-See:  Mutts 6',
 'The Highwayman',
 'The Essential Calvin and Hobbes',
 "Lamb : The Gospel According to Biff, Christ's Childhood Pal",
 'Howl and Other Poems (Pocket Poets)',
 'A Dog Year: Twelve Months, Four Dogs, and Me',
 'Basket Case',
 'Shinju',
 'E-Wally and the Quest',
 'Border Music',
 'Twas the Night Before: A Love Story',
 'Quentins',
 'Standing in the Rainbow : A Novel',
 'Five Quarters of the Orange',
 '1984',
 'Flesh Tones: A Novel',
 'Wuthering Heights',
 'She Shall Have Murder (Perennial Library, P638)',
 'Flesh