In [None]:
!pip install pandas numpy scikit-learn scikit-surprise datasets joblib

In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch the goodbooks-10k dataset through the `datasets` library.
print("📚 Loading dataset...")
dataset = load_dataset("zygmunt/goodbooks-10k")

# Turn the 'train' entry of each table into a DataFrame
# (ratings first, then books).
ratings_df, books_df = (
    pd.DataFrame(dataset[table]['train']) for table in ('ratings', 'books')
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Text used for content similarity: original title and authors joined by a
# single space, with missing values treated as empty strings.
# NOTE: despite its name, the 'genres' column holds title + author text.
books_df['genres'] = (
    books_df['original_title']
    .fillna('')
    .str.cat(books_df['authors'].fillna(''), sep=" ")
)

# TF-IDF vectors over that text (English stop words removed), followed by the
# full pairwise cosine-similarity matrix between all books.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_df['genres'])
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
def content_based_recommendations(book_title, n=5):
    """Return up to ``n`` books most similar to ``book_title``.

    Similarity is read from the precomputed ``cosine_sim`` matrix, whose rows
    and columns are expected to follow the row order of ``books_df``.

    Parameters
    ----------
    book_title : str
        Exact value to look up in ``books_df['title']``. If several rows share
        the title, the first one is used.
    n : int, default 5
        Maximum number of recommendations; values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``authors`` and ``original_publication_year``,
        ordered from most to least similar. The frame is empty (with the same
        columns) when the title is not present in ``books_df``.
    """
    columns = ['title', 'authors', 'original_publication_year']

    title_matches = (books_df['title'] == book_title).to_numpy()
    if not title_matches.any():
        # Same columns as the normal return so callers see one schema.
        return pd.DataFrame(columns=columns)

    # Row *position* of the first match. cosine_sim and .iloc are positional,
    # so an index label would be wrong whenever books_df does not carry a
    # default 0..N-1 index.
    pos = int(title_matches.argmax())
    scores = cosine_sim[pos]

    # Remove the query book explicitly instead of assuming it sorts first:
    # another book with identical text ties at the same similarity, and a book
    # with an all-zero vector has similarity 0 with itself.
    candidates = [i for i in range(len(scores)) if i != pos]
    # list.sort is stable, so tied scores keep ascending row order.
    candidates.sort(key=lambda i: scores[i], reverse=True)

    top_positions = candidates[:max(n, 0)]
    return books_df.iloc[top_positions][columns]

In [None]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)

# Fixed seeds so the split, the fitted model and the reported metrics are the
# same on every Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

svd = SVD(random_state=42)
svd.fit(trainset)

predictions = svd.test(testset)
# accuracy.rmse / accuracy.mae print the metric themselves by default;
# verbose=False avoids showing each value twice.
print("RMSE:", accuracy.rmse(predictions, verbose=False))
print("MAE:", accuracy.mae(predictions, verbose=False))

In [None]:
def hybrid_recommend(user_id, book_title=None, n=5, max_candidates=1000):
    """Recommend books for ``user_id``, optionally anchored on a seed title.

    With ``book_title`` given, ``2 * n`` content-based neighbours of that book
    are fetched from ``content_based_recommendations`` and re-ranked by the
    rating ``svd`` predicts for the user. Without it, a fixed-seed random
    sample of books the user has not rated is scored with ``svd`` and the
    best ones are returned.

    Parameters
    ----------
    user_id
        User identifier as it appears in ``ratings_df['user_id']``.
    book_title : str, optional
        Seed title for the content-based step. ``None`` or an empty string
        selects the collaborative-only path.
    n : int, default 5
        Number of recommendations to return.
    max_candidates : int, default 1000
        Upper bound on how many unrated books are scored in the
        collaborative-only path.

    Returns
    -------
    pandas.DataFrame
        Seeded path: the columns returned by ``content_based_recommendations``
        plus ``predicted_rating``, sorted by predicted rating (highest first).
        Collaborative-only path: ``title``, ``authors`` and
        ``original_publication_year``, also ordered by predicted rating.
    """
    if book_title:
        content_recs = content_based_recommendations(book_title, n * 2)
        if content_recs.empty:
            return content_recs.assign(predicted_rating=pd.Series(dtype='float64'))

        # content_recs rows are taken from books_df, so their index labels
        # identify the source rows. Looking book_id up by label avoids a merge
        # on 'title', which pairs ids with the wrong rows when titles repeat.
        # Assumes books_df has unique index labels.
        book_ids = books_df.loc[content_recs.index, 'book_id']
        predicted = [svd.predict(user_id, book_id).est for book_id in book_ids]

        # assign() returns a new frame, so the slice of books_df is not mutated.
        scored = content_recs.assign(predicted_rating=predicted)
        return scored.sort_values(by='predicted_rating', ascending=False).head(n)

    rated_ids = ratings_df.loc[ratings_df['user_id'] == user_id, 'book_id']
    unrated_books = books_df[~books_df['book_id'].isin(rated_ids)]

    # Score only a reproducible random subset to keep the call fast.
    pool = unrated_books.sample(
        n=min(max_candidates, len(unrated_books)), random_state=42
    )
    predicted = [svd.predict(user_id, book_id).est for book_id in pool['book_id']]

    # Sort the scored rows themselves so the result keeps the ranking;
    # filtering books_df with isin() would return catalogue order instead.
    ranked = (
        pool.assign(predicted_rating=predicted)
        .sort_values(by='predicted_rating', ascending=False, kind='stable')
        .head(n)
    )
    return ranked[['title', 'authors', 'original_publication_year']]

In [None]:
# Demo: run both recommendation modes for one sample user.
user_id = 10
book_title = "The Hobbit"

# Seeded by a book title (content-based neighbours re-ranked by the model).
print("\n📘 Hybrid Recommendations Based on Book:")
hybrid_recs = hybrid_recommend(user_id, book_title)
print(hybrid_recs)

# No seed title: collaborative filtering only.
print("\n📕 Collaborative Filtering Recommendations:")
collaborative_recs = hybrid_recommend(user_id)
print(collaborative_recs)