In [1]:
import pandas as pd
import torch
import torch.nn as nn
import os
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Data Processing

In [6]:
data = pd.read_csv("../datasets/clean_dataset.csv")

# fixing random seed
seed = 42
torch.manual_seed(seed)  # PyTorch CPU
if torch.backends.mps.is_available():
    torch.manual_seed(seed)  # Seed for MPS devices

# split data
train, test = train_test_split(
    data, test_size=0.1, random_state=seed, stratify=data.rating.values
)
train, validation = train_test_split(
    train, test_size=0.1, random_state=seed, stratify=train.rating.values
)

In [41]:
# Convert data into pytorch tensors
train_users = torch.tensor(train["user"].values, dtype=torch.long)
train_books = torch.tensor(train["book_id"].values, dtype=torch.long)
train_ratings = torch.tensor(train["rating"].values, dtype=torch.float32)

val_users = torch.tensor(validation["user"].values, dtype=torch.long)
val_books = torch.tensor(validation["book_id"].values, dtype=torch.long)
val_ratings = torch.tensor(validation["rating"].values, dtype=torch.float32)

test_users = torch.tensor(test["user"].values, dtype=torch.long)
test_books = torch.tensor(test["book_id"].values, dtype=torch.long)
test_ratings = torch.tensor(test["rating"].values, dtype=torch.float32)

# Matrix Factorization - SVD

In [19]:
from surprise import Reader, Dataset

reader = Reader(rating_scale=(1, 5))
# build trainset
SVD_train_set = Dataset.load_from_df(
    train[["user", "book_id", "rating"]], reader
).build_full_trainset()

# make validation and test set in the right format: list of tuples
SVD_val_set = list(
    validation[["user", "book_id", "rating"]].itertuples(index=False, name=None)
)
SVD_test_set = list(
    test[["user", "book_id", "rating"]].itertuples(index=False, name=None)
)

In [20]:
from surprise.prediction_algorithms.matrix_factorization import SVD

algo = SVD(
    n_factors=95,
    lr_all=0.015,
    init_std_dev=0.05,
    reg_all=0.015,
    n_epochs=300,
    random_state=seed,
)
algo.fit(SVD_train_set)
val_predictions = algo.test(SVD_val_set)
test_predictions = algo.test(SVD_test_set)

In [21]:
from surprise.accuracy import rmse
from surprise.accuracy import mse

print(f"test set MSE: {mse(test_predictions, verbose=False)}")
print(f"test set RMSE: {rmse(test_predictions, verbose=False)}")

test set MSE: 0.44198889647470835
test set RMSE: 0.6648224548514501


In [22]:
from surprise import dump

dump.dump("../models/svd_model.pkl", algo=algo)

# Neural Collaborative Filtering (NCF)

In [28]:
from torch.utils.data import Dataset, DataLoader

class Dataset(Dataset):
    def __init__(self, user_ids, book_ids, ratings):
        self.user_ids = user_ids
        self.book_ids = book_ids
        self.ratings = ratings

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        return {
            "user_id": self.user_ids[idx],
            "book_id": self.book_ids[idx],
            "rating": self.ratings[idx],
        }

# instatiate datasets objects
train_dataset = Dataset(train_users, train_books, train_ratings)
val_dataset = Dataset(val_users, val_books, val_ratings)
test_dataset = Dataset(test_users, test_books, test_ratings)

# instatiate dataloader objects
batch_size = 32 # 64?
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [27]:
class CollaborativeFilteringModel(nn.Module):
    def __init__(self, n_users, n_books, hidden_dim, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.book_embedding = nn.Embedding(n_books, embedding_dim)
        self.hidden_dim = hidden_dim
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, user_id, book_id):
        user_embedded = self.user_embedding(user_id)
        book_embedded = self.book_embedding(book_id)
        x = torch.cat([user_embedded, book_embedded], dim=-1)

        return self.fc(x).squeeze()

In [29]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device:", device)
else:
    device = torch.device("cpu")  # Fallback to CPU if MPS is not available
    print("MPS is not available. Using CPU instead.")

Using MPS device: mps


In [31]:
class EarlyStopper:
    def __init__(self, patience, min_delta):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float("inf")

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [32]:
from torch.optim import Adam

n_users = data["user"].nunique()
n_books = data["title"].nunique()

# hyperparameters
embedding_dim = 256
hidden_dim = 256
epochs = 50

# Initialize early stopping
early_stopping = EarlyStopper(patience=8, min_delta=0.001)

# Initialize the model
model = CollaborativeFilteringModel(
    n_users=n_users, n_books=n_books, hidden_dim=hidden_dim, embedding_dim=embedding_dim
)

# move model to device
model = model.to(device)

# Define the optimizer and loss function
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.MSELoss()

In [30]:
def test_model(model, loader, device):
    y_pred = []
    y_test = []

    model.eval()

    with torch.no_grad():
        for batch in loader:
            user_ids = batch["user_id"].to(device)
            book_ids = batch["book_id"].to(device)
            y_pred.append(model(user_ids, book_ids))
            y_test.append(batch["rating"].to(device))

        y_pred = torch.cat(y_pred, dim=0)
        y_test = torch.cat(y_test, dim=0)
        
    return y_pred, y_test

In [14]:
def train_model(
    model,
    criterion,
    optimizer,
    n_epochs,
    train_loader,
    val_loader,
    early_stopper,
    device,
):
    best_valid_loss = float("inf")  # infinite
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            user_ids = batch["user_id"].to(device)
            book_ids = batch["book_id"].to(device)

            # Forward pass
            predictions = model(user_ids, book_ids)

            ratings = batch['rating'].to(device)

            # compute training loss
            loss = criterion(predictions, ratings)
            optimizer.zero_grad()

            # backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_train_loss = total_loss / len(train_loader)
        ratings, preds = test_model(model, val_loader, device)
        loss_val = criterion(preds, ratings)
        print(
            f"epoch {epoch}: training loss: {average_train_loss}, validation loss: {loss_val}"
        )

        # save best model
        if loss_val.item() < best_valid_loss:
            best_valid_loss = loss_val.item()
            if not os.path.exists("models"):
                os.makedirs("models")
            torch.save(model.state_dict(), "models/collaborative_model.pth")

        # early stopper
        if early_stopper.early_stop(loss_val):             
            break

In [33]:
if os.path.exists("models/collaborative_model.pth"):
   state_dict = torch.load("models/collaborative_model.pth")
   model.load_state_dict(state_dict)
else:
   train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    n_epochs=epochs,
    train_loader=train_loader,
    val_loader=val_loader,
    early_stopper=early_stopping,
    device=device,
)

  state_dict = torch.load("models/collaborative_model.pth")


#### Test NCF on the the test set

In [34]:
ratings, preds = test_model(model=model, loader=val_loader, device=device)
loss_test = criterion(preds, ratings)

# Compute RMSE
rmse = torch.sqrt(loss_test)
print(f"MSE validation set: {loss_test.item()} \nRMSE validation set: {rmse.item()}")

MSE validation set: 0.5652198791503906 
RMSE validation set: 0.7518110871315002


# Content based filtering

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [9]:
# Download NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")


def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r"[^\w\s]", " ", text)
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1122)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1122)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1122)>


#### Train the Word2Vec model

In [10]:
import re

book_train = train.drop_duplicates(subset="book_id", keep="first")
book_train = book_train.drop(["user", "rating"], axis=1)
book_train["categories"] = (
    book_train["categories"]
    .fillna("unknown")
    .apply(lambda x: re.sub(r"[\[\]']", "", x))
)
book_train["full_info"] = book_train["categories"] + " " + book_train["description"]
book_train["processed_text"] = book_train["full_info"].apply(preprocess_text)

train_sentences = book_train["processed_text"].tolist()

In [11]:
from gensim.models import Phrases

bigram_transformer = Phrases(train_sentences)

bi_train_sentences = [bigram_transformer[sentence] for sentence in train_sentences]

In [None]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(
    vector_size=400,
    window=5,
    min_count=2,
    sg=1,
    workers=5,
    seed=seed,
)
word2vec_model.build_vocab(bi_train_sentences)

word2vec_model.train(
    bi_train_sentences,
    total_examples=word2vec_model.corpus_count,
    epochs=30,
    compute_loss=True,
    start_alpha=0.01,
    end_alpha=0.001,
)

print("Training Loss:", word2vec_model.get_latest_training_loss())

# Save the fine-tuned model
word2vec_model.save("../models/local_word2vec.model")

Training Loss: 69432048.0


#### Hybrid model function

In [25]:
from gensim.models import KeyedVectors, Word2Vec

local_model = Word2Vec.load("../models/local_word2vec.model")
glove_input_file = "../models/glove.6B.300d.txt"
pretrained_vectors = KeyedVectors.load_word2vec_format(
    glove_input_file, binary=False, no_header=True
)

In [26]:
def hybrid_model(
    user_id,
    book_id,
    algo,
    books_df,
    ratings_df,
    similarity_matrix,
    alpha,
    min_rating,
    only_CB=False,
):
    # compute the svd rating
    svd_rating = algo.predict(user_id, book_id).est

    rating_for_item = ratings_df["book_id"].value_counts().get(book_id, 0)

    if rating_for_item < min_rating or only_CB == True:
        # compute the content-based rating
        user_rated_books = ratings_df[
            (ratings_df["user"] == user_id) & (ratings_df["book_id"] != book_id)
        ]  # books the user has rated

        weighted_sum = 0
        similarity_sum = 0

        # index of the input book in the similarity matrix
        input_index = books_df.index[books_df["book_id"] == book_id].tolist()[0]

        # iterate over books the user has rated
        for _, rated_item in user_rated_books.iterrows():
            rated_book_id = rated_item["book_id"]
            rated_book_rating = rated_item["rating"]

            # index of the rated book for the similarity matrix
            rated_index = books_df.index[books_df["book_id"] == rated_book_id].tolist()[
                0
            ]

            # get cosine similarity between the input book and the rated book
            content_similarity = similarity_matrix[input_index][rated_index]

            # update weighted sum and similarity sum
            weighted_sum += content_similarity * rated_book_rating
            similarity_sum += content_similarity

        # calculate the content-based predicted rating (weighted average)
        if similarity_sum > 0:
            cb_rating = weighted_sum / similarity_sum
        else:
            cb_rating = 0  # No content similarity available

        if only_CB:
            return cb_rating

        if cb_rating != 0:
            # combine SVD and CBF predictions
            return alpha * svd_rating + (1 - alpha) * cb_rating
        else:
            return svd_rating
    else:
        return svd_rating

## fine-tuning on validation

##### pre-process the data

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate the average embedding for a book
def get_book_embedding(text, model):
    embeddings = []
    if isinstance(model, KeyedVectors): 
        for word in text:
            if word == "unknown":
                embeddings.append(
                    np.zeros(model.vector_size)
                )  # if the word is unknown put a neutral vector
            elif word in model:  # Check if the word is in the model vocabulary
                embeddings.append(model[word])
    elif isinstance(model, Word2Vec):
        for word in text:
            if word == "unknown":
                embeddings.append(
                    np.zeros(model.wv.vector_size)
                )  # if the word is unknown, put a neutral vector
            elif word in model.wv:  # Check if the word is in the model vocabulary
                embeddings.append(model.wv[word])
    if embeddings:
        return np.mean(embeddings, axis=0)  # Average of all word embeddings
    else:
        return np.zeros(
            model.vector_size
        )  # Return a zero vector if no embeddings exist

def build_similarity_matrix(dataframe, similarity_measure, model):
    df = dataframe.copy()
    if model == 'pre-trained': 
        df["book_embedding"] = df["processed_text"].apply(
            lambda x: get_book_embedding(x, pretrained_vectors)
        )
    elif model == 'local':
        df["book_embedding"] = df["processed_text"].apply(
            lambda x: get_book_embedding(x, local_model)
        )

    embedding_matrix = np.vstack(df["book_embedding"].values)
    if similarity_measure == 'cosine':
        similarity_matrix = cosine_similarity(embedding_matrix)
    return similarity_matrix

In [40]:
import re
book_val = validation.drop_duplicates(subset="book_id", keep="first")
book_val = book_val.drop(["user", "rating"], axis=1)
book_val["categories"] = (
    book_val["categories"].fillna("unknown").apply(lambda x: re.sub(r"[\[\]']", "", x))
)
book_val["full_info"] = book_val["categories"] + " " + book_val["description"]
book_val["processed_text"] = book_val["full_info"].apply(preprocess_text)
book_val = book_val.reset_index()
book_val["processed_text"] = book_val["processed_text"].apply(
    lambda x: bigram_transformer[x]
)
ratings_val = validation[["user", "book_id", "rating"]]

# matrix based on pre-trained vectors
val_similarity_matrix_1 = build_similarity_matrix(book_val, "cosine", model="pre-trained")

# matrix based on fine-tuned model
val_similarity_matrix_2 = build_similarity_matrix(book_val, "cosine", model="local")

In [33]:
from surprise import dump

# Load the saved model
_, svd_model = dump.load("../models/svd_model.pkl")

## GridSearch

In [39]:
def grid_search_hybrid(
    X, y, param_grid, algo, books_df, ratings_df, only_CB=False
):
    best_params = None
    best_score = float("inf")  
    results = [] 
    # hyper-parameters
    alpha_values = param_grid["alpha"]
    min_rating_values = param_grid["min_rating"]
    similarity_matrix_values = param_grid["similarity_matrix"]

    for name, matrix in similarity_matrix_values:
        for alpha in alpha_values:
            for min_rating in min_rating_values:
                predictions = []

                for user_id, book_id in X:
                    prediction = hybrid_model(
                        user_id=user_id,
                        book_id=book_id,
                        algo=algo,
                        books_df=books_df,
                        ratings_df=ratings_df,
                        similarity_matrix=matrix,
                        alpha=alpha,
                        min_rating=min_rating,
                        only_CB=only_CB,
                    )
                    predictions.append(prediction)

                # RMSE for this combination of parameters
                rmse = np.sqrt(mean_squared_error(y, predictions))
                results.append(
                    {
                        "alpha": alpha,
                        "min_rating": min_rating,
                        "similarity_matrix": name,
                        "rmse": rmse,
                    }
                )

            # update best parameters if RMSE is lower
            if rmse < best_score:
                best_score = rmse
                best_params = {
                    "alpha": alpha,
                    "min_rating": min_rating,
                    "similarity_matrix": name,
                }

    return best_params, best_score, results

In [40]:
param_grid = {
    "alpha": [0.1, 0.3, 0.5, 0.7, 0.9],
    "min_rating": [6, 7, 8, 9, 10],
    "similarity_matrix": [(1, val_similarity_matrix_1), (2, val_similarity_matrix_2)],
}

X_val = list(zip(validation["user"], validation["book_id"]))
y_val = list(validation["rating"])

best_params, best_score, results = grid_search_hybrid(
    X=X_val,
    y=y_val,
    param_grid=param_grid,
    algo=svd_model,  
    books_df=book_val[["book_id"]],
    ratings_df=ratings_val,
    only_CB=False,
)

print("Best Parameters:", best_params)
print("Best RMSE:", best_score)

for res in results:
    print(res)

Best Parameters: {'alpha': 0.9, 'min_rating': 10, 'similarity_matrix': 2}
Best RMSE: 0.6671597842933592
{'alpha': 0.1, 'min_rating': 6, 'similarity_matrix': 1, 'rmse': 0.7660784440917381}
{'alpha': 0.1, 'min_rating': 7, 'similarity_matrix': 1, 'rmse': 0.7751021320840445}
{'alpha': 0.1, 'min_rating': 8, 'similarity_matrix': 1, 'rmse': 0.7817632792423339}
{'alpha': 0.1, 'min_rating': 9, 'similarity_matrix': 1, 'rmse': 0.7873011245379007}
{'alpha': 0.1, 'min_rating': 10, 'similarity_matrix': 1, 'rmse': 0.7930643338604606}
{'alpha': 0.3, 'min_rating': 6, 'similarity_matrix': 1, 'rmse': 0.7245567615991725}
{'alpha': 0.3, 'min_rating': 7, 'similarity_matrix': 1, 'rmse': 0.7303520756207648}
{'alpha': 0.3, 'min_rating': 8, 'similarity_matrix': 1, 'rmse': 0.7345427949031311}
{'alpha': 0.3, 'min_rating': 9, 'similarity_matrix': 1, 'rmse': 0.7381758284346297}
{'alpha': 0.3, 'min_rating': 10, 'similarity_matrix': 1, 'rmse': 0.7419670380268131}
{'alpha': 0.5, 'min_rating': 6, 'similarity_matrix': 1

In [46]:
def grid_search_hybrid(X, y, param_grid, algo, books_df, ratings_df, matrix, only_CB=False):
    best_params = None
    best_score = float("inf")
    results = []
    # hyper-parameters
    alpha_values = param_grid["alpha"]
    min_rating_values = param_grid["min_rating"]

    for alpha in alpha_values:
        for min_rating in min_rating_values:
            predictions = []
            for user_id, book_id in X:
             prediction = hybrid_model(
                        user_id=user_id,
                        book_id=book_id,
                        algo=algo,
                        books_df=books_df,
                        ratings_df=ratings_df,
                        similarity_matrix=matrix,
                        alpha=alpha,
                        min_rating=min_rating,
                        only_CB=only_CB,
                    )
             predictions.append(prediction)

            # RMSE for this combination of parameters
        rmse = np.sqrt(mean_squared_error(y, predictions))
        results.append(
                    {
                        "alpha": alpha,
                        "min_rating": min_rating,
                        "rmse": rmse,
                    }
                )

            # update best parameters if RMSE is lower
        if rmse < best_score:
                best_score = rmse
                best_params = {
                    "alpha": alpha,
                    "min_rating": min_rating,

                }

    return best_params, best_score, results

#### test best parameters on test set

In [37]:
import re

book_test = test.drop_duplicates(subset="book_id", keep="first")
book_test = book_test.drop(["user", "rating"], axis=1)
book_test["categories"] = (
    book_test["categories"].fillna("unknown").apply(lambda x: re.sub(r"[\[\]']", "", x))
)
book_test["full_info"] = book_test["categories"] + " " + book_test["description"]
book_test["processed_text"] = book_test["full_info"].apply(preprocess_text)
book_test = book_test.reset_index()
book_test["processed_text"] = book_test["processed_text"].apply(lambda x: bigram_transformer[x])
test_similarity_matrix_2 = build_similarity_matrix(book_test, "cosine", "local")
ratings_test = test[["user", "book_id", "rating"]]

In [38]:
X_test = list(zip(test["user"], test["book_id"]))
y_test = list(test["rating"])

predictions = []
for user_id, book_id in X_test:
    prediction = hybrid_model(
        user_id=user_id,
        book_id=book_id,
        algo=svd_model,
        books_df=book_test,
        ratings_df=ratings_test,
        similarity_matrix=test_similarity_matrix_2,
        alpha=0.9,
        min_rating=10,
        only_CB=False,
    )
    predictions.append(prediction)

mae = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mae)
print(mae, rmse)

0.4407001753364538 0.6638525252919159


### test only the content-based

In [39]:
X_test = list(zip(test["user"], test["book_id"]))
y_test = list(test["rating"])

predictions = []
for user_id, book_id in X_test:
    prediction = hybrid_model(
        user_id=user_id,
        book_id=book_id,
        algo=svd_model,
        books_df=book_test,
        ratings_df=ratings_test,
        similarity_matrix=test_similarity_matrix_2,
        alpha=0.9,
        min_rating=8,
        only_CB=True,
    )
    predictions.append(prediction)

mae = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mae)
print(mae, rmse)

3.7756023918681647 1.9430909376218513
