https://github.com/yjeong5126/movie_recommender/blob/master/item_based_collaborative_filtering/item_based_collaborative_filtering.ipynb

# Item-based Collaborative Filtering

### Main idea: For item i, find other similar items

### Method: Estimate rating for item i based on ratings for similar items

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
all_ratings = pd.read_csv("all_ratings.csv", index_col=False)
len(all_ratings)

1031175

In [4]:
all_ratings = all_ratings[:40000] # subset chosen to address memory constraints in later parts of pipeline
len(all_ratings)

40000

In [5]:
unique_user_ids = all_ratings.userId.unique()
# reindex the userId
user_id_mapping = {user_id: i + 1 for i, user_id in enumerate(unique_user_ids)}

# Map the 'userId' column using the mapping dictionary
all_ratings['userId'] = all_ratings['userId'].map(user_id_mapping)


In [6]:
# all_ratings = all_ratings.drop(['Unnamed: 0'],axis=1)
# all_ratings = all_ratings.reset_index(drop=True)
all_ratings.columns

Index(['userId', 'ISBN', 'bookRating', 'bookTitle', 'bookAuthor',
       'yearOfPublication', 'publisher', 'location', 'age'],
      dtype='object')

In [7]:
all_ratings.head()

Unnamed: 0,userId,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication,publisher,location,age
0,1,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",
1,2,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,"cincinnati, ohio, usa",23.0
3,2,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
4,2,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0


In [8]:
item_to_check = "The Great Gatsby"

In [9]:
def users_read_book(book_title):
    users_list = all_ratings[all_ratings['bookTitle'] == book_title]['userId'].tolist()
    return users_list

users_list_for_book = users_read_book(item_to_check)

print(f"Users who have read '{item_to_check}':")
print(users_list_for_book)

Users who have read 'The Great Gatsby':
[29, 70, 86]


In [10]:
all_ratings.userId.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [11]:
all_ratings = all_ratings[['bookTitle','userId','bookRating']]

In [12]:
all_ratings.bookTitle.nunique()

29082

In [13]:
import pandas as pd
from surprise import Dataset
from surprise import Reader

In [14]:
all_ratings.userId.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [15]:
all_ratings = all_ratings[all_ratings['bookRating']!=0]
df = all_ratings[['bookTitle','userId','bookRating']]
df.head()

Unnamed: 0,bookTitle,userId,bookRating
1,Flesh Tones: A Novel,2,5
2,Ender's Game (Ender Wiggins Saga (Paperback)),2,9
3,In Cold Blood (Vintage International),2,8
4,Divine Secrets of the Ya-Ya Sisterhood : A Novel,2,9
5,The Mistress of Spices,2,5


In [16]:
desired_bookTitle = 'Seal it with a Kiss'
# desired_bookTitle = 'The Great Gatsby'
specified_userid = 2
already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])

In [17]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df, reader)

In [18]:
# check best values for SVD algo (matrix factorization algo)
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

param_grid = {
    "n_factors":[i for i in range(30)],
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

1.642649148339057
{'n_factors': 7, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [19]:
from surprise import KNNWithMeans

sim_options = gs.best_params["rmse"]
algo = KNNWithMeans(sim_options=sim_options)

# all_ratings = all_ratings[all_ratings['bookRating']!=0]
reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(all_ratings[["userId", "bookTitle", "bookRating"]], reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fd3bf4c7e20>

In [20]:

# Get the user's top N recommendations
k = 10  
num_recs = 10
user_inner_id = algo.trainset.to_inner_uid(specified_userid)
user_neighbors = algo.get_neighbors(user_inner_id, k=k)

# Convert inner IDs back to the original user IDs
user_neighbors = [algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors]

# Now, you have a list of user's nearest neighbors based on the trained model
# You can recommend books from these nearest neighbors' rated books
recommended_books = []

for neighbor_userid in user_neighbors:
    # Assuming you have a DataFrame 'all_ratings' with columns 'userId', 'bookTitle', and 'bookRating'
    neighbor_ratings = all_ratings[all_ratings["userId"] == neighbor_userid]

    # Filter out books that the specified user has already rated
    # already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])
    neighbor_ratings = neighbor_ratings[~neighbor_ratings["bookTitle"].isin([already_read_list])] 
    

# Sort by bookRating in descending order to recommend the highest-rated books
neighbor_ratings = neighbor_ratings.sort_values(by="bookRating", ascending=False)

# take the top-rated books from each neighbor and add them to the recommended_books list
top_n_books = neighbor_ratings.head(num_recs)  # You can change the number of recommendations as needed
recommended_books = top_n_books["bookTitle"].tolist()

# 'recommended_books' now contains the recommended books for the specified user based on their nearest neighbors' preferences
print("Recommended books for user", specified_userid, ":")
print(recommended_books)

Recommended books for user 2 :
['The Poisonwood Bible: A Novel', 'Deception on His Mind', "Complications: A Surgeon's Notes on an Imperfect Science", 'The Other Boleyn Girl', 'Mortal Sins', 'The Poyson Garden: An Elizabethan I Mystery (Elizabeth I Mysteries (Paperback))', "Help I'm a Parent", 'Dark Nantucket Noon: A Homer Kelly Mystery', "The Quilter's Apprentice", 'The English Assassin']


In [21]:
from pprint import pprint
pprint(recommended_books)

['The Poisonwood Bible: A Novel',
 'Deception on His Mind',
 "Complications: A Surgeon's Notes on an Imperfect Science",
 'The Other Boleyn Girl',
 'Mortal Sins',
 'The Poyson Garden: An Elizabethan I Mystery (Elizabeth I Mysteries '
 '(Paperback))',
 "Help I'm a Parent",
 'Dark Nantucket Noon: A Homer Kelly Mystery',
 "The Quilter's Apprentice",
 'The English Assassin']
