In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import random

random.seed(42)
np.random.seed(42)
# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load data
print("Loading dataset...")
books = pd.read_csv('Books.csv')
ratings = pd.read_csv('Ratings.csv')

# 1. Retain users with enough ratings (at least 5 ratings)
user_counts = ratings['User-ID'].value_counts()
active_users = user_counts[user_counts >= 5].index
filtered_ratings = ratings[ratings['User-ID'].isin(active_users)]

# 2. Retain books that have been rated by enough users (at least 5 users)
book_counts = filtered_ratings['ISBN'].value_counts()
popular_books = book_counts[book_counts >= 5].index
filtered_ratings = filtered_ratings[filtered_ratings['ISBN'].isin(popular_books)]

# 3. Retain only high-rated data (ratings above 7)
filtered_ratings = filtered_ratings[filtered_ratings['Book-Rating'] >= 7]

# Sample data
sampled_ratings = filtered_ratings.sample(n=5000, random_state=42)
sampled_book_isbns = sampled_ratings['ISBN'].unique()

# Create ISBN to index mapping
isbn_to_idx = {isbn: idx for idx, isbn in enumerate(books['ISBN'])}
idx_to_isbn = {idx: isbn for isbn, idx in isbn_to_idx.items()}

#  Step 2: Prepare data in the format required by the Surprise library
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(sampled_ratings[['User-ID', 'ISBN', 'Book-Rating']], reader)

# Split into training and testing datasets
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Use SVD model
print("Training SVD model...")
algo = SVD()
algo.fit(trainset)

# Step 4: Use BERT model to get book title embeddings
def get_bert_embeddings(titles):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)
    
    embeddings = []
    for title in tqdm(titles, desc="Tokenizing titles"):
        if isinstance(title, str):  # Ensure the title is a string
            inputs = tokenizer(title, padding=True, truncation=True, return_tensors="pt", max_length=128).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
        else:
            # For non-string titles, use zero vectors
            embeddings.append(np.zeros((1, 768)))
    
    return np.vstack(embeddings)

# Get BERT embeddings for all book titles
print("Getting BERT embeddings for book titles...")
# Process only the sampled books
sampled_books = books[books['ISBN'].isin(sampled_book_isbns)]
book_titles = sampled_books['Book-Title'].tolist()
book_embeddings = get_bert_embeddings(book_titles)

# Compute cosine similarity matrix for book titles
cosine_sim = cosine_similarity(book_embeddings, book_embeddings)

# Step 5: Define content-based recommendation function
def recommend_books_based_on_content(user_id, num_recommendations=5):
    # Get books rated by the user
    user_rated_books = sampled_ratings[sampled_ratings['User-ID'] == user_id]['ISBN'].tolist()
    
    # Get indices of rated books in the sampled dataset
    rated_books_indices = []
    for isbn in user_rated_books:
        if isbn in sampled_books['ISBN'].values:
            idx = sampled_books[sampled_books['ISBN'] == isbn].index[0]
            rated_books_indices.append(idx - sampled_books.index[0])  # Adjust index to match cosine_sim matrix
    
    # Store all recommendations
    content_based_recommendations = []
    
    for idx in rated_books_indices:
        if idx < len(cosine_sim):  # Ensure the index is within range
            similar_books = list(enumerate(cosine_sim[idx]))
            similar_books = sorted(similar_books, key=lambda x: x[1], reverse=True)
            
            for book_idx, sim_score in similar_books[1:]:  # Skip itself
                if book_idx < len(sampled_books):
                    isbn = sampled_books.iloc[book_idx]['ISBN']
                    if isbn not in user_rated_books:
                        content_based_recommendations.append((isbn, sim_score))
    
    # Sort by similarity and return recommendations
    content_based_recommendations = sorted(content_based_recommendations, key=lambda x: x[1], reverse=True)
    recommended_isbns = [isbn for isbn, _ in content_based_recommendations[:num_recommendations]]
    recommended_books = sampled_books[sampled_books['ISBN'].isin(recommended_isbns)]
    
    return recommended_books

# Step 6: Define collaborative filtering recommendation function
def recommend_books_for_user(user_id, num_recommendations=5):
    user_rated_books = sampled_ratings[sampled_ratings['User-ID'] == user_id]['ISBN'].tolist()
    books_to_predict = [isbn for isbn in sampled_book_isbns if isbn not in user_rated_books]
    
    predicted_ratings = []
    for isbn in books_to_predict:
        predicted_ratings.append((isbn, algo.predict(user_id, isbn).est))
    
    top_books = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:num_recommendations]
    recommended_books = sampled_books[sampled_books['ISBN'].isin([isbn for isbn, _ in top_books])]
    return recommended_books

# Step 7: Hybrid recommendation function
def hybrid_recommend_books(user_id, num_recommendations=5, alpha=0.5):
    try:
        if user_id not in sampled_ratings['User-ID'].unique():
            # Get the 5 books with the highest average rating
            top_books = sampled_books.sort_values('Book-Rating', ascending=False).head(num_recommendations)
            return top_books[['ISBN', 'Book-Title', 'Book-Author']]
        collaborative_recommendations = recommend_books_for_user(user_id, num_recommendations)
        content_based_recommendations = recommend_books_based_on_content(user_id, num_recommendations)
        
        # merge recommendation result
        all_recommendations = pd.concat([collaborative_recommendations, content_based_recommendations])
        all_recommendations = all_recommendations.drop_duplicates(subset=['ISBN'])
        try:
            return all_recommendations.sample(num_recommendations,random_state=42)
        except Exception as e:
            return all_recommendations.head(num_recommendations)
    except Exception as e:
        print(f"Error in hybrid_recommend_books for user {user_id}: {str(e)}")
        return pd.DataFrame(columns=['ISBN', 'Book-Title', 'Book-Author'])

Using device: cuda
Loading dataset...


  books = pd.read_csv('Books.csv')


Training SVD model...
Getting BERT embeddings for book titles...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Tokenizing titles: 100%|███████████████████████████████████████████████████████████| 3899/3899 [01:17<00:00, 50.23it/s]


In [2]:

# Step 8: Evaluate the hybrid recommendation model
def precision_recall_at_k(test_data, num_recommendations=5, sample_size=100, similarity_threshold=0.7):
    hits, total_relevant, total_recommended = 0, 0, 0
    
    # Get the ISBN to index mapping for the sampled data
    isbn_to_index = {isbn: idx for idx, isbn in enumerate(sampled_books['ISBN'])}
    
    # Only consider users who have rated books in the sampled data
    test_data = test_data[test_data['ISBN'].isin(sampled_books['ISBN'])]
    
    # Randomly select users for evaluation
    user_ids = test_data['User-ID'].unique()
    if len(user_ids) > sample_size:
        np.random.seed(42)
        user_ids = np.random.choice(user_ids, sample_size, replace=False)

    for user_id in user_ids:
        try:
            # Get the user's ratings in the test set
            user_test_ratings = test_data[test_data['User-ID'] == user_id]
            if len(user_test_ratings) == 0:
                continue
                
            avg_rating = user_test_ratings['Book-Rating'].mean()

            # Get books the user likes (ratings above the average)
            relevant_read_books = user_test_ratings[user_test_ratings['Book-Rating'] >= avg_rating]
            relevant_read_books = pd.merge(relevant_read_books, sampled_books[['ISBN', 'Book-Title']], on='ISBN')
            
            if len(relevant_read_books) == 0:
                continue

            # Get books the user likes (ratings above the average)
            # relevant_books = user_test_ratings[user_test_ratings['Book-Rating'] >= avg_rating]
            # relevant_books = pd.merge(relevant_books, sampled_books[['ISBN', 'Book-Title']], on='ISBN')
            
            # if len(relevant_books) == 0:
            #     continue
                
            # Calculate relevant books based on average ratings of all users
            recommended_books = hybrid_recommend_books(user_id, num_recommendations)
            if len(recommended_books) == 0:
                continue
            
            relevant_books = pd.DataFrame(columns=recommended_books.columns)
            for _, rec_book in recommended_books.iterrows():
                rec_isbn = rec_book['ISBN']
                avg_book_rating = test_data[test_data['ISBN'] == rec_isbn]['Book-Rating'].mean()
                
                if avg_book_rating > avg_rating:
                    relevant_books = pd.concat([relevant_books, pd.DataFrame([rec_book])], ignore_index=True)  # 使用 pd.concat 替代 append
            
            if len(relevant_books) == 0:
                continue

            # print("rec",len(recommended_books))
            # print("rel", len(relevant_books))
            # Get recommended books
            user_hits = 0

            # for every recommanded book
            for _, rec_book in relevant_books.iterrows():
                if rec_book['ISBN'] not in isbn_to_index:
                    continue
                
                rec_idx = isbn_to_index[rec_book['ISBN']]
                counted = False
                # for every book user liked (filter by average ratings)
                for _, rel_book in relevant_read_books.iterrows():
                    if rel_book['ISBN'] not in isbn_to_index:
                        continue
                        
                    rel_idx = isbn_to_index[rel_book['ISBN']]
                    
                    # ensure the index is in the size of cosine_sim matrix
                    if rec_idx >= len(cosine_sim) or rel_idx >= len(cosine_sim):
                        continue
                    
                    # calculate the title similarity
                    similarity = cosine_sim[rec_idx][rel_idx]
                    
                    # if the similarity is higher than the threshold, then hit
                    if similarity >= similarity_threshold and counted == False:
                        user_hits += 1
                        counted = True
                        break  # one book only counted to one hit
            
            hits += user_hits
            total_relevant += len(relevant_books)
            total_recommended += len(recommended_books)

            # print("hits",hits)
            # print("total_relevant",total_relevant)
            # print("total_recommended",total_recommended)
            
        except Exception as e:
            print(f"Error processing user {user_id}: {str(e)}")
            continue
    
    precision = hits / total_recommended if total_recommended > 0 else 0
    recall = hits / total_relevant if total_relevant > 0 else 0
    
    return precision, recall

# Test different similarity threshold
print("Calculating precision and recall...")
thresholds = [0.7, 0.75, 0.8, 0.85, 0.9]
results = []

for threshold in thresholds:
    print(f"\nCalculating threshold {threshold}...")
    precision, recall = precision_recall_at_k(sampled_ratings, 
                                            num_recommendations=5, 
                                            sample_size=100, 
                                            similarity_threshold=threshold)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    results.append({
        'threshold': threshold,
        'precision': precision,
        'recall': recall,
        'f1': f1
    })
    print(f"Threshold: {threshold:.2f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

Calculating precision and recall...

Calculating threshold 0.7...
Threshold: 0.70
Precision: 0.8557
Recall: 0.9857
F1-score: 0.9161

Calculating threshold 0.75...
Threshold: 0.75
Precision: 0.8433
Recall: 0.9713
F1-score: 0.9028

Calculating threshold 0.8...
Threshold: 0.80
Precision: 0.7637
Recall: 0.8797
F1-score: 0.8176

Calculating threshold 0.85...
Threshold: 0.85
Precision: 0.5174
Recall: 0.5960
F1-score: 0.5539

Calculating threshold 0.9...
Threshold: 0.90
Precision: 0.1741
Recall: 0.2006
F1-score: 0.1864


In [4]:
# Randomly select a user ID from the sampled ratings
random.seed(42)
random_user_id = random.choice(sampled_ratings['User-ID'].unique())

# Get the books rated by the selected user
user_rated_books = sampled_ratings[sampled_ratings['User-ID'] == random_user_id]['ISBN'].tolist()

# Get the books rated by the user with titles (join on ISBN)
rated_books = sampled_books[sampled_books['ISBN'].isin(user_rated_books)]

# Get the books recommended for this user using the hybrid recommendation system
recommended_books = hybrid_recommend_books(random_user_id, num_recommendations=5)

# Display the results
print(f"User ID: {random_user_id}")
print("\nBooks rated by the user:")

# Display ISBN and Book-Rating for the books rated by the user, along with the Book Title
rated_books_info = rated_books[['ISBN', 'Book-Title']]
print(rated_books_info)

print("\nBooks recommended for the user:")

# Display ISBN and Book-Title for the recommended books
print(recommended_books[['ISBN', 'Book-Title']])

User ID: 216343

Books rated by the user:
              ISBN                                         Book-Title
105269  0060162546  Small Victories: The Real World of a Teacher, ...

Books recommended for the user:
             ISBN                                         Book-Title
1105   0060928336    Divine Secrets of the Ya-Ya Sisterhood: A Novel
14953  0553250426  The Clan of the Cave Bear (Earth's Children (P...
1284   0380813815  Lamb : The Gospel According to Biff, Christ's ...
1029   0345348036  The Princess Bride: S Morgenstern's Classic Ta...
4244   0802130208           A Confederacy of Dunces (Evergreen Book)
