https://github.com/yjeong5126/movie_recommender/blob/master/item_based_collaborative_filtering/item_based_collaborative_filtering.ipynb

# 3 Approaches for Recommender Systems
## 1. User-based Collaborative Filtering
## 2. Item-based Collaborative Filtering
## 3. Latent Factor Approach: Matrix Factorization (using SVC)

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
all_ratings = pd.read_csv("all_ratings.csv", index_col=False)
len(all_ratings)

1031175

In [4]:
all_ratings = all_ratings[:30000] # subset chosen to address memory constraints in later parts of pipeline

In [5]:
unique_user_ids = all_ratings.userId.unique()
# reindex the userId
user_id_mapping = {user_id: i + 1 for i, user_id in enumerate(unique_user_ids)}

# Map the 'userId' column using the mapping dictionary
all_ratings['userId'] = all_ratings['userId'].map(user_id_mapping)


In [6]:
# all_ratings = all_ratings.drop(['Unnamed: 0'],axis=1)
# all_ratings = all_ratings.reset_index(drop=True)
all_ratings.columns

Index(['userId', 'ISBN', 'bookRating', 'bookTitle', 'bookAuthor',
       'yearOfPublication', 'publisher', 'location', 'age'],
      dtype='object')

In [7]:
all_ratings.head()

Unnamed: 0,userId,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication,publisher,location,age
0,1,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",
1,2,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,"cincinnati, ohio, usa",23.0
3,2,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
4,2,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0


In [8]:
item_to_check = "The Great Gatsby"

In [9]:
def users_read_book(book_title):
    users_list = all_ratings[all_ratings['bookTitle'] == book_title]['userId'].tolist()
    return users_list

users_list_for_book = users_read_book(item_to_check)

print(f"Users who have read '{item_to_check}':")
print(users_list_for_book)

Users who have read 'The Great Gatsby':
[29, 70]


In [10]:
all_ratings.userId.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70])

In [11]:
all_ratings = all_ratings[['bookTitle','userId','bookRating']]

In [12]:
all_ratings.bookTitle.nunique()

22481

In [13]:
import pandas as pd
from surprise import Dataset
from surprise import Reader

In [14]:
all_ratings.userId.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70])

In [15]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(all_ratings[["bookTitle","userId",  "bookRating"]], reader)

In [16]:
from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

In [17]:
all_ratings = all_ratings[all_ratings['bookRating']!=0]

In [18]:
all_ratings.bookRating.unique()

array([ 5,  9,  8,  7,  6, 10,  3,  4,  2,  1])

In [19]:
all_ratings.bookTitle.unique()

array(['Flesh Tones: A Novel',
       "Ender's Game (Ender Wiggins Saga (Paperback))",
       'In Cold Blood (Vintage International)', ..., 'Heartburn',
       'Martina', 'The First Salute'], dtype=object)

In [20]:
desired_bookTitle = 'Seal it with a Kiss'
# desired_bookTitle = 'The Great Gatsby'
specified_userid = 2
already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])

In [21]:
df = all_ratings[['bookTitle','userId','bookRating']]

In [22]:
df.head()

Unnamed: 0,bookTitle,userId,bookRating
1,Flesh Tones: A Novel,2,5
2,Ender's Game (Ender Wiggins Saga (Paperback)),2,9
3,In Cold Blood (Vintage International),2,8
4,Divine Secrets of the Ya-Ya Sisterhood : A Novel,2,9
5,The Mistress of Spices,2,5


In [23]:
algo = KNNWithMeans(sim_options=sim_options)

# all_ratings = all_ratings[all_ratings['bookRating']!=0]
reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(all_ratings[["userId", "bookTitle", "bookRating"]], reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f796c4bfbb0>

# Example of a predicted book rating for a book which a specified user has not read

In [None]:
# data = Dataset.load_from_df(all_ratings[["bookTitle","userId",  "bookRating"]], reader)
prediction = algo.predict(specified_userid, desired_bookTitle)
prediction.est

print(f"Book Rating Prediction is {prediction.est} for {desired_bookTitle} for user:{specified_userid}")

Book Rating Prediction is 6.0 for Seal it with a Kiss for user:2


In [25]:
from pprint import pprint
# Get the user's top N recommendations
k = 10  
num_recs = 10
user_inner_id = algo.trainset.to_inner_uid(specified_userid)
user_neighbors = algo.get_neighbors(user_inner_id, k=k)

# Convert inner IDs back to the original user IDs
user_neighbors = [algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors]

# Get list of user's nearest neighbors based on the trained model
# --> Recommend books from these nearest neighbors' rated books
recommended_books = []

for neighbor_userid in user_neighbors:
    neighbor_ratings = all_ratings[all_ratings["userId"] == neighbor_userid]

    # Filter out books that the specified user has already rated
    # already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])
    neighbor_ratings = neighbor_ratings[~neighbor_ratings["bookTitle"].isin([already_read_list])] 
    

    # Sort by bookRating in descending order to recommend the highest-rated books
    neighbor_ratings = neighbor_ratings.sort_values(by="bookRating", ascending=False)

    # take the top-rated books from each neighbor and add them to the recommended_books list
    top_n_books = neighbor_ratings.head(num_recs)  # You can change the number of recommendations as needed
    recommended_books.extend(top_n_books["bookTitle"].tolist())

# 'recommended_books' now contains the recommended books for the specified user based on their nearest neighbors' preferences
print("Recommended books for user", specified_userid, ":")
pprint(recommended_books)

Recommended books for user 2 :
['Roses Are Red (Alex Cross Novels)',
 'Toot & Puddle',
 'A Little Look-See:  Mutts 6',
 'The Highwayman',
 'The Essential Calvin and Hobbes',
 "Lamb : The Gospel According to Biff, Christ's Childhood Pal",
 'Howl and Other Poems (Pocket Poets)',
 'A Dog Year: Twelve Months, Four Dogs, and Me',
 'Basket Case',
 'Shinju',
 'E-Wally and the Quest',
 'Border Music',
 'Twas the Night Before: A Love Story',
 'Quentins',
 'Standing in the Rainbow : A Novel',
 'Five Quarters of the Orange',
 '1984',
 'Flesh Tones: A Novel',
 'Wuthering Heights',
 'She Shall Have Murder (Perennial Library, P638)',
 'Flesh Tones: A Novel',
 'To Kill a Mockingbird',
 'Little Altars Everywhere: A Novel',
 'The Red Tent (Bestselling Backlist)',
 'Good in Bed',
 'Lucky Man: A Memoir',
 "The Liar's Club: A Memoir",
 'Brothel: Mustang Ranch and Its Women',
 'Rosie: A Novel',
 'The Territory of Men',
 'Blues Dancing: A Novel',
 'Every Womans Health the Complete Guide to Body',
 'DK Handb

## User-based Collaborative Filtering

In [26]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Gridsearch to get the best User-based Collaborative filtering model
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [True],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

In [27]:
sim_options = gs.best_params["rmse"]['sim_options']
algo = KNNWithMeans(sim_options=sim_options)

all_ratings = all_ratings[all_ratings['bookRating']!=0]
reader = Reader(rating_scale=(1, 10))
reader = Dataset.load_from_df(all_ratings[['userId', 'bookTitle', 'bookRating']], reader).build_full_trainset()
algo.fit(reader)

data = Dataset.load_from_df(all_ratings[["userId", "bookTitle", "bookRating"]], reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f796c4be6e0>

In [28]:
# Get the user's top N recommendations
k = 5  # k num of neighbours
num_recs = 10 # N num of recommendations

user_inner_id = algo.trainset.to_inner_uid(specified_userid)
user_neighbors = algo.get_neighbors(user_inner_id, k=k)

# Convert inner IDs back to the original user IDs
user_neighbors = [algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors]

#Get list of user's nearest neighbors based on the trained model
# -->Recommend books from these nearest neighbors' rated books
recommended_books = []

for neighbor_userid in user_neighbors:
    
    neighbor_ratings = all_ratings[all_ratings["userId"] == neighbor_userid]

    # Filter out books that the specified user has already rated
    # already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])
    neighbor_ratings = neighbor_ratings[~neighbor_ratings["bookTitle"].isin([already_read_list])] 
    

    # Sort by bookRating in descending order to recommend the highest-rated books
    neighbor_ratings = neighbor_ratings.sort_values(by="bookRating", ascending=False)

    # take the top-rated books from each neighbor and add them to the recommended_books list
    top_n_books = neighbor_ratings.head(num_recs)  
    recommended_books.extend(top_n_books["bookTitle"].tolist())



In [29]:
len(recommended_books)

50

In [30]:
from pprint import pprint

print("Recommended books for user", specified_userid, ":")
pprint(recommended_books)

Recommended books for user 2 :
['Go Ask Alice (AvonFlare Book)',
 'The First Salute',
 'The Notebook',
 'A Painted House',
 'Mythology',
 'Flatland (Shambhala Pocket Classics)',
 'Waiting For Nick (Silhouette Special Edition)',
 "Girls' Poker Night",
 'Up & Out (Red Dress Ink)',
 'Geek Love',
 'Roses Are Red (Alex Cross Novels)',
 'Toot & Puddle',
 'A Little Look-See:  Mutts 6',
 'The Highwayman',
 'The Essential Calvin and Hobbes',
 "Lamb : The Gospel According to Biff, Christ's Childhood Pal",
 'Howl and Other Poems (Pocket Poets)',
 'A Dog Year: Twelve Months, Four Dogs, and Me',
 'Basket Case',
 'Shinju',
 'E-Wally and the Quest',
 'Border Music',
 'Twas the Night Before: A Love Story',
 'Quentins',
 'Standing in the Rainbow : A Novel',
 'Five Quarters of the Orange',
 '1984',
 'Flesh Tones: A Novel',
 'Wuthering Heights',
 'She Shall Have Murder (Perennial Library, P638)',
 'Flesh Tones: A Novel',
 'To Kill a Mockingbird',
 'Little Altars Everywhere: A Novel',
 'The Red Tent (Best

## Item-based Collaborative Filtering

In [31]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV

data = Dataset.load_from_df(df, reader)
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

In [32]:
sim_options = gs.best_params["rmse"]['sim_options']
algo = KNNWithMeans(sim_options=sim_options)

all_ratings = all_ratings[all_ratings['bookRating']!=0]
reader = Reader(rating_scale=(1, 10))
reader = Dataset.load_from_df(all_ratings[['userId', 'bookTitle', 'bookRating']], reader).build_full_trainset()
algo.fit(reader)

data = Dataset.load_from_df(all_ratings[["userId", "bookTitle", "bookRating"]], reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f796c4bf340>

In [33]:
import random
# Get the user's top N recommendations
k = 10  
num_recs = 10 # = N


# Get the inner user ID for the specified user
user_inner_id = algo.trainset.to_inner_uid(specified_userid)

# Get the raw item IDs and their predicted ratings for the specified user
user_items_ratings = []
for item_inner_id in algo.trainset.all_items():
    if not algo.trainset.knows_user(user_inner_id) or algo.trainset.to_raw_iid(item_inner_id) in algo.trainset.ur[user_inner_id]:
        continue
    predicted_rating = algo.predict(user_inner_id, item_inner_id).est

    # Add a random perturbation factor to the predicted rating
    random_factor = random.uniform(0.001, 0.01)  # You can adjust this range as needed
    predicted_rating += random_factor

    item_raw_id = algo.trainset.to_raw_iid(item_inner_id)
    user_items_ratings.append((item_raw_id, predicted_rating))

# Sort the items by predicted ratings in descending order
user_items_ratings.sort(key=lambda x: x[1], reverse=True)

user_items_ratings

[("Murder in Scorpio (St. Martin's Dead Letter Mysteries,)",
  8.233807789953273),
 ('In the Spirit', 8.233807582495153),
 ('The Hundredth Man', 8.233807233561642),
 ('Love for Sale : A Grace & Favor Mystery (Grace & Favor Mysteries (Hardcover))',
  8.233807152714514),
 ('The Vampire Armand: The Vampire Chronicles (Rice, Anne, Vampire Chronicles.)',
  8.233806616376832),
 ("Ahab's Wife: Or, The Star-Gazer: A Novel", 8.233804114523924),
 ('Transcension', 8.233802919435),
 ('Sadako and the Thousand Paper Cranes', 8.233801693749507),
 ('Second Touch (A. D. Chronicles #2)', 8.233801586081743),
 ('In the Company of Soldiers : A Chronicle of Combat', 8.233801559082298),
 ('Is that a Moose in Your Pocket?', 8.233800673721543),
 ('Playing For Keeps', 8.233800589878834),
 ('The Sea Hunters II', 8.233800220561681),
 ('Le Grand Meaulnes (Classiques De Poche)', 8.233800040567276),
 ('Dangerous Dilemmas', 8.233799525026312),
 ('McVicar by Himself', 8.23379941477722),
 ('Darknesses (The Corean Chron

In [34]:
# Extract the top-N recommended item raw IDs (book titles)
top_n_items = user_items_ratings[:num_recs]

# 'top_n_items' contains tuples of (book title, predicted rating)
# Extract the book titles from the tuples and print the recommended book titles
recommended_books = [item[0] for item in top_n_items]

print("Recommended books for user", specified_userid, ":")
for book_title in recommended_books:
    print(book_title)

Recommended books for user 2 :
Murder in Scorpio (St. Martin's Dead Letter Mysteries,)
In the Spirit
The Hundredth Man
Love for Sale : A Grace & Favor Mystery (Grace & Favor Mysteries (Hardcover))
The Vampire Armand: The Vampire Chronicles (Rice, Anne, Vampire Chronicles.)
Ahab's Wife: Or, The Star-Gazer: A Novel
Transcension
Sadako and the Thousand Paper Cranes
Second Touch (A. D. Chronicles #2)
In the Company of Soldiers : A Chronicle of Combat


## Latent Factor Approach: Matrix Factorization

In [35]:
# check best values for SVD algo (matrix factorization algo)
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

param_grid = {
    "n_factors":[i for i in range(30)],
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

1.5041296228170777
{'n_factors': 7, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [36]:
from surprise import KNNWithMeans

sim_options = gs.best_params["rmse"]
algo = KNNWithMeans(sim_options=sim_options)

# all_ratings = all_ratings[all_ratings['bookRating']!=0]
reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(all_ratings[["userId", "bookTitle", "bookRating"]], reader)
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f796c4beb30>

In [37]:

# Get the user's top N recommendations
k = 10  
num_recs = 10
user_inner_id = algo.trainset.to_inner_uid(specified_userid)
user_neighbors = algo.get_neighbors(user_inner_id, k=k)

# Convert inner IDs back to the original user IDs
user_neighbors = [algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors]

# Now, you have a list of user's nearest neighbors based on the trained model
# You can recommend books from these nearest neighbors' rated books
recommended_books = []

for neighbor_userid in user_neighbors:
    # Assuming you have a DataFrame 'all_ratings' with columns 'userId', 'bookTitle', and 'bookRating'
    neighbor_ratings = all_ratings[all_ratings["userId"] == neighbor_userid]

    # Filter out books that the specified user has already rated
    # already_read_list = list(all_ratings[all_ratings['userId'] == specified_userid]['bookTitle'])
    neighbor_ratings = neighbor_ratings[~neighbor_ratings["bookTitle"].isin([already_read_list])] 
    

# Sort by bookRating in descending order to recommend the highest-rated books
neighbor_ratings = neighbor_ratings.sort_values(by="bookRating", ascending=False)

# take the top-rated books from each neighbor and add them to the recommended_books list
top_n_books = neighbor_ratings.head(num_recs)  # You can change the number of recommendations as needed
recommended_books = top_n_books["bookTitle"].tolist()

# 'recommended_books' now contains the recommended books for the specified user based on their nearest neighbors' preferences
print("Recommended books for user", specified_userid, ":")
print(recommended_books)

Recommended books for user 2 :
['The Poisonwood Bible: A Novel', 'Deception on His Mind', "Complications: A Surgeon's Notes on an Imperfect Science", 'The Other Boleyn Girl', 'Mortal Sins', 'The Poyson Garden: An Elizabethan I Mystery (Elizabeth I Mysteries (Paperback))', "Help I'm a Parent", 'Dark Nantucket Noon: A Homer Kelly Mystery', "The Quilter's Apprentice", 'The English Assassin']


In [38]:
from pprint import pprint
pprint(recommended_books)

['The Poisonwood Bible: A Novel',
 'Deception on His Mind',
 "Complications: A Surgeon's Notes on an Imperfect Science",
 'The Other Boleyn Girl',
 'Mortal Sins',
 'The Poyson Garden: An Elizabethan I Mystery (Elizabeth I Mysteries '
 '(Paperback))',
 "Help I'm a Parent",
 'Dark Nantucket Noon: A Homer Kelly Mystery',
 "The Quilter's Apprentice",
 'The English Assassin']
