<a href="https://colab.research.google.com/github/simon-bouchard/Book_Recommendation_KNN_with_FastAPI/blob/master/model_notebooks/book_recommendation_EmbeddingNN_for_users.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.nn import *
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from fastai.collab import *
from fastai.tabular.all import *


In [3]:
ratings = pd.read_csv("/kaggle/input/goodbooks-10k/ratings.csv")
ratings = ratings[['user_id', 'book_id', 'rating']]
print(ratings)
print(ratings.shape)

        user_id  book_id  rating
0           314        1       5
1           439        1       3
2           588        1       5
3          1169        1       4
4          1185        1       4
...         ...      ...     ...
981751    48386    10000       5
981752    49007    10000       4
981753    49383    10000       5
981754    50124    10000       5
981755    51328    10000       1

[981756 rows x 3 columns]
(981756, 3)


In [6]:
books = pd.read_csv("/kaggle/input/goodbooks-10k/books.csv")
#books = books[['id', 'original_title', 'authors']]
print(books.head)
books.columns

<bound method NDFrame.head of          id  book_id  best_book_id   work_id  books_count        isbn  \
0         1  2767052       2767052   2792775          272   439023483   
1         2        3             3   4640799          491   439554934   
2         3    41865         41865   3212258          226   316015849   
3         4     2657          2657   3275794          487    61120081   
4         5     4671          4671    245494         1356   743273567   
...     ...      ...           ...       ...          ...         ...   
9995   9996  7130616       7130616   7392860           19   441019455   
9996   9997   208324        208324   1084709           19  067973371X   
9997   9998    77431         77431   2393986           60  039330762X   
9998   9999  8565083       8565083  13433613            7    61711527   
9999  10000     8914          8914     11817           31   375700455   

            isbn13                      authors  original_publication_year  \
0     9.780439e

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

In [5]:
user_ratings_count = ratings['user_id'].value_counts()

valid_users = user_ratings_count[user_ratings_count >= 10].index

movie_ratings_count = ratings['book_id'].value_counts()

valid_movies = movie_ratings_count[movie_ratings_count >= 50].index

ratings = ratings[(ratings['user_id'].isin(valid_users)) & (ratings['book_id'].isin(valid_movies))]
ratings.shape

(857437, 3)

In [6]:
data = ratings.merge(books, left_on='book_id', right_on='id')
data = data[['user_id', 'original_title', 'rating']]

dls = CollabDataLoaders.from_df(data, item_name='original_title', bs=150)
dls.show_batch()

Unnamed: 0,user_id,original_title,rating
0,47692,Polgara the Sorceress,5
1,44683,The Door into Summer,5
2,42793,Death on the Nile,5
3,9131,Two Graves,3
4,18718,Truce,5
5,47059,The Peripheral,4
6,2477,#na#,4
7,43985,The Hunger Games,4
8,24347,Amadeus,3
9,11840,The Wizard Heir,4


In [7]:
embs = get_emb_sz(dls)
embs

[(24406, 458), (9270, 266)]

In [None]:
def create_params(size):
    return nn.Parameter(torch.zeros(*size).normal_(0, 0.01))

In [None]:
class DotProductBias(Module):
    def __init__(self, n_users, n_books, n_factors, y_range=(1, 5.5)):
        super().__init__()

        self.user_factors = nn.Embedding(n_users, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.book_factors = nn.Embedding(n_books, n_factors)
        self.book_bias = nn.Embedding(n_books, 1)
        self.y_range = y_range

    def forward(self, x):
        users = self.user_factors(x[:, 0])
        books = self.book_factors(x[:, 1])
        res = (users * books).sum(dim=1)

        res += self.user_bias(x[:, 0]).squeeze() + self.book_bias(x[:, 1]).squeeze()

        return sigmoid_range(res, *self.y_range)

In [None]:
n_users = len(dls.classes['user_id'])
n_books = len(dls.classes['book_id'])

In [None]:
model = DotProductBias(n_users, n_books, 200)
learn = Learner(dls, model, loss_func=lambda pred, target: SmoothL1Loss()(pred, target.squeeze(-1)), metrics=[rmse, mae])
learn.fit_one_cycle(5, 5e-3, wd=0.3)

In [None]:
book_bias = learn.model.book_bias.weight.squeeze()
idxs = book_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

In [None]:
class CollabNN(Module):
    def __init__(self, user_sz, item_sz, y_range=(1,5.5), n_act=100, dropout=0.3):
        super().__init__()
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        self.layers = nn.Sequential(
            nn.Linear(user_sz[1]+item_sz[1], n_act),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_act, n_act // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_act // 2, n_act // 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_act // 4, n_act // 8),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_act // 8, 1))
        self.y_range = y_range

    def forward(self, x):
        embs = self.user_factors(x[:,0]),self.item_factors(x[:,1])
        x = self.layers(torch.cat(embs, dim=1))
        return sigmoid_range(x, *self.y_range)

In [8]:
class CollabNN(Module):
    def __init__(self, user_sz, item_sz, y_range=(1,5.5), n_act=200, dropout=0.3):
        super().__init__()
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        self.layers = nn.Sequential(
            nn.Linear(user_sz[1]+item_sz[1], n_act),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_act, n_act // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_act // 2, 1))
        self.y_range = y_range

    def forward(self, x):
        embs = self.user_factors(x[:,0]),self.item_factors(x[:,1])
        x = self.layers(torch.cat(embs, dim=1))
        return sigmoid_range(x, *self.y_range)

In [9]:
#model = CollabNN((n_users, 100), (n_books, 100))
n_act = max(embs[0][1], embs[1][1])

model = CollabNN(*embs, n_act=n_act)
learn = Learner(dls, model, loss_func=MSELossFlat(), metrics=[rmse, mae])
#learn.lr_find()
learn.fit_one_cycle(5, 5e-3, wd=0.4)

epoch,train_loss,valid_loss,_rmse,mae,time
0,0.774635,0.805598,0.897551,0.717408,00:57
1,0.790819,0.804562,0.896974,0.710549,00:56
2,0.772842,0.785423,0.886241,0.698018,00:56
3,0.683601,0.736601,0.858254,0.666942,00:56
4,0.633744,0.712256,0.843952,0.650155,00:56


In [11]:
device = torch.device("cpu")
print(f"Using device: {device}")

learn.model.to(device)

Using device: cpu


CollabNN(
  (user_factors): Embedding(24406, 458)
  (item_factors): Embedding(9270, 266)
  (layers): Sequential(
    (0): Linear(in_features=724, out_features=458, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=458, out_features=229, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=229, out_features=1, bias=True)
  )
)

In [None]:
def get_recommends(user_id, n=5):
    if user_id not in ratings['user_id'].values:
        raise ValueError(f"User {user_id} not found in ratings dataset")

    device = next(learn.model.parameters()).device

    # Get books the user has rated
    rated_books = ratings[ratings['user_id'] == user_id]['book_id'].tolist()

    # Ensure only books seen in training are used
    trained_books = set(dls.classes['book_id'])  # Only books in model
    unrated_books = [b for b in trained_books if b not in rated_books]

    # Debug: Print first few unrated books
    print(f"unrated_books length: {len(unrated_books)}")
    print(f"First 10 unrated_books: {unrated_books[:10]}")
    print(f"Data type of unrated_books elements: {type(unrated_books[0]) if len(unrated_books) > 0 else 'Empty List'}")

    # Ensure book IDs are within the valid trained range
    book_ids = [int(b) for b in dls.classes['book_id'] if str(b).isdigit()]  # Convert to int, filter out strings
    min_book_id, max_book_id = min(book_ids), max(book_ids)
    unrated_books = [int(b) for b in unrated_books if str(b).isdigit() and min_book_id <= int(b) <= max_book_id]

    if len(unrated_books) == 0:
        raise ValueError(f"User {user_id} has rated all available books!")

    # Convert to tensor (use CPU first for debugging)
    unrated_books = torch.tensor(unrated_books, dtype=torch.int64).to(device)

    # Create user-book pairs
    user_tensor = torch.full((len(unrated_books),), user_id, dtype=torch.int64, device=device)
    user_book_pairs = torch.stack([user_tensor, unrated_books], dim=1)

    # Debug: Print shape before prediction
    print(f"user_book_pairs shape: {user_book_pairs.shape}")

    # Predict ratings
    preds = learn.model(user_book_pairs).detach().cpu().numpy()

    # Get top N recommendations
    top_indices = preds.argsort()[::-1][:n]
    top_books = unrated_books.cpu().numpy()[top_indices]

    return books[books['id'].isin(top_books)][['id', 'original_title']]

# Test
print(get_recommends(314))


In [12]:
import torch

def get_recommendations(user_id, model=model, dls=dls, books=books, top_n=5):
    
    # Get all unique book IDs
    all_book_titles = data['original_title'].unique()
    
    # Create a tensor for the user ID (repeated for all books)
    user_tensor = torch.tensor([user_id] * len(all_book_titles), dtype=torch.int64)
    
    # Create a tensor for all book titles (using their indices in the DataLoader's vocab)
    book_indices = torch.tensor([dls.classes['original_title'].o2i[title] for title in all_book_titles], dtype=torch.int64)
    
    # Combine into a batch for prediction
    batch = torch.stack([user_tensor, book_indices], dim=1)
    
    # Set model to evaluation mode
    model.eval()
    
    # Get predictions
    with torch.no_grad():
        predictions = model(batch)
    
    # Convert predictions to a numpy array
    predictions = predictions.squeeze().numpy()
    
    # Create a DataFrame with book titles and predicted ratings
    recommendations = pd.DataFrame({
        'original_title': all_book_titles,
        'predicted_rating': predictions
    })
    
    # Merge with book information to get authors
    recommendations = recommendations.merge(books, on='original_title')
    
    # Filter out books already rated by the user
    rated_books = data[data['user_id'] == user_id]['original_title'].unique()
    recommendations = recommendations[~recommendations['original_title'].isin(rated_books)]
    
    # Sort by predicted rating in descending order
    recommendations = recommendations.sort_values(by='predicted_rating', ascending=False)
    
    # Return the top N recommendations
    return recommendations.head(top_n)


get_recommendations(314)

Unnamed: 0,original_title,predicted_rating,id,authors
9200,Preach My Gospel (A Guide to Missionary Service),4.911574,9076,The Church of Jesus Christ of Latter-day Saints
5340,Jesus the Christ: A Study of the Messiah and His Mission according to Holy Scriptures both Ancient and Modern,4.904326,4868,James E. Talmage
5662,The Days Are Just Packed: A Calvin and Hobbes Collection,4.898662,5207,Bill Watterson
2365,The Calvin and Hobbes Tenth Anniversary Book,4.868118,1788,Bill Watterson
6031,The Calvin and Hobbes Lazy Sunday Book,4.860455,5580,Bill Watterson
