https://github.com/yjeong5126/movie_recommender/blob/master/item_based_collaborative_filtering/item_based_collaborative_filtering.ipynb

# Item-based Collaborative Filtering

### Main idea: For item i, find other similar items

### Method: Estimate rating for item i based on ratings for similar items

In [1]:
import pandas as pd
import random
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
all_ratings = pd.read_csv("all_ratings.csv", index_col=False)
len(all_ratings)

1031175

In [4]:
all_ratings = all_ratings[:30000] # subset chosen to address memory constraints in later parts of pipeline
len(all_ratings)

30000

In [5]:
unique_user_ids = all_ratings.userId.unique()
# reindex the userId
user_id_mapping = {user_id: i + 1 for i, user_id in enumerate(unique_user_ids)}

# Map the 'userId' column using the mapping dictionary
all_ratings['userId'] = all_ratings['userId'].map(user_id_mapping)


In [6]:
# all_ratings = all_ratings.drop(['Unnamed: 0'],axis=1)
# all_ratings = all_ratings.reset_index(drop=True)
all_ratings.columns

Index(['userId', 'ISBN', 'bookRating', 'bookTitle', 'bookAuthor',
       'yearOfPublication', 'publisher', 'location', 'age'],
      dtype='object')

In [7]:
all_ratings.head()

Unnamed: 0,userId,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication,publisher,location,age
0,1,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",
1,2,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,"cincinnati, ohio, usa",23.0
3,2,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
4,2,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0


In [10]:
item_to_check = "The Great Gatsby"

In [11]:
def users_read_book(book_title):
    users_list = all_ratings[all_ratings['bookTitle'] == book_title]['userId'].tolist()
    return users_list

users_list_for_book = users_read_book(item_to_check)

print(f"Users who have read '{item_to_check}':")
print(users_list_for_book)

Users who have read 'The Great Gatsby':
[29, 70]


In [12]:
all_ratings.userId.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70])

# To prepare the items embeddings matrix

In [13]:
all_ratings = all_ratings[['bookTitle','userId','bookRating']]

In [14]:
all_ratings.bookTitle.nunique()

22481

In [15]:
import pandas as pd
from surprise import Dataset
from surprise import Reader

In [16]:
all_ratings.userId.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70])

In [18]:
all_ratings = all_ratings[all_ratings['bookRating']!=0]
df = all_ratings[['bookTitle','userId','bookRating']]
df.head()

Unnamed: 0,bookTitle,userId,bookRating
1,Flesh Tones: A Novel,2,5
2,Ender's Game (Ender Wiggins Saga (Paperback)),2,9
3,In Cold Blood (Vintage International),2,8
4,Divine Secrets of the Ya-Ya Sisterhood : A Novel,2,9
5,The Mistress of Spices,2,5


In [19]:
all_ratings.bookRating.unique()

array([ 5,  9,  8,  7,  6, 10,  3,  4,  2,  1])

In [20]:
all_ratings.bookTitle.unique()

array(['Flesh Tones: A Novel',
       "Ender's Game (Ender Wiggins Saga (Paperback))",
       'In Cold Blood (Vintage International)', ..., 'Heartburn',
       'Martina', 'The First Salute'], dtype=object)

In [21]:
desired_bookTitle = 'Seal it with a Kiss'
# desired_bookTitle = 'The Great Gatsby'
specified_userid = 2
already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])

## Item-based Collaborative Filtering

In [22]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df, reader)
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
1.681557963795985
{'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}


In [23]:
sim_options = gs.best_params["rmse"]['sim_options']
algo = KNNWithMeans(sim_options=sim_options)

all_ratings = all_ratings[all_ratings['bookRating']!=0]
# reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(all_ratings[["userId", "bookTitle", "bookRating"]], reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f6fa43ba650>

In [24]:
# Get the user's top N recommendations
k = 10  
num_recs = 10 # = N


# Get the inner user ID for the specified user
user_inner_id = algo.trainset.to_inner_uid(specified_userid)

# Get the raw item IDs and their predicted ratings for the specified user
user_items_ratings = []
for item_inner_id in algo.trainset.all_items():
    if not algo.trainset.knows_user(user_inner_id) or algo.trainset.to_raw_iid(item_inner_id) in algo.trainset.ur[user_inner_id]:
        continue
    predicted_rating = algo.predict(user_inner_id, item_inner_id).est

    # Add a random perturbation factor to the predicted rating
    random_factor = random.uniform(0.001, 0.01)  # You can adjust this range as needed
    predicted_rating += random_factor

    item_raw_id = algo.trainset.to_raw_iid(item_inner_id)
    user_items_ratings.append((item_raw_id, predicted_rating))

# Sort the items by predicted ratings in descending order
user_items_ratings.sort(key=lambda x: x[1], reverse=True)

user_items_ratings

[('Captured Innocence (Avon Romantic Treasure)', 8.2338081065983),
 ('The Ghost of Carnal Cove (Candleglow)', 8.233806995591971),
 ('In Full Bloom', 8.23380586977698),
 ('Born in Shame', 8.233805561697121),
 ("The Senator's Daughter", 8.233803645110248),
 ('Blowout', 8.233802612467048),
 ('The Ambiguity of Murder', 8.23380216837963),
 ('Blues in the Night', 8.233800705007193),
 ('Gentleman Caller', 8.233800563234421),
 ('Bel Canto: A Novel', 8.233800550499915),
 ('Hot & Bothered (Mira)', 8.233800279621718),
 ('The Search for Signs of Intelligent Life in the Universe',
  8.233800239220736),
 ('A Cold Heart: An Alex Delaware Novel', 8.23379964050807),
 ("Once upon a Winter's Night (Roc Fantasy)", 8.233799478911),
 ("Joe Torre's Ground Rules for Winners : 12 Keys to Managing Team Players, Tough Bosses, Setbacks, and Success",
  8.23379888484103),
 ('The Bible and Kundalini Energy, The New Testament, Deep Secrets of the',
  8.233796067773735),
 ('Why Does It Fly? (Arvetis, Chris. Just Ask 

In [25]:
# Extract the top-N recommended item raw IDs (book titles)
top_n_items = user_items_ratings[:num_recs]

# 'top_n_items' contains tuples of (book title, predicted rating)
# Extract the book titles from the tuples and print the recommended book titles
recommended_books = [item[0] for item in top_n_items]

print("Recommended books for user", specified_userid, ":")
for book_title in recommended_books:
    print(book_title)

Recommended books for user 2 :
Captured Innocence (Avon Romantic Treasure)
The Ghost of Carnal Cove (Candleglow)
In Full Bloom
Born in Shame
The Senator's Daughter
Blowout
The Ambiguity of Murder
Blues in the Night
Gentleman Caller
Bel Canto: A Novel
