In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Content-based filtering

In [None]:
class BookRecommender:
    """Content-based book recommender.

    Every book is described by one text blob (title, author, description and,
    when present, genre). The blobs are vectorized with TF-IDF and books are
    ranked by the pairwise similarity of those vectors.
    """

    def __init__(self, books_data):
        """
        Initialize the recommender with book data.

        Args:
            books_data (dict or DataFrame): Contains book information with keys:
                - 'title': Book titles
                - 'author': Book authors
                - 'description': Book descriptions/summaries
                - 'genre': Book genres (optional)
        """
        # Always work on a private frame so the caller's data is never mutated
        if isinstance(books_data, dict):
            self.books = pd.DataFrame(books_data)
        else:
            self.books = books_data.copy()

        # Preprocess the data
        self._preprocess_data()

        # Create TF-IDF matrix
        self._create_tfidf_matrix()

        # Compute cosine similarity matrix
        self._compute_similarity_matrix()

    def _preprocess_data(self):
        """Fill missing text fields and build the 'combined_features' column."""
        # Fill missing values
        self.books['description'] = self.books['description'].fillna('')
        self.books['author'] = self.books['author'].fillna('')

        # Combine features into a single text for analysis. The title is
        # filled only inside this expression: a missing title would otherwise
        # turn the whole concatenation into NaN, which is not a usable text
        # document, while the stored 'title' column is left as the caller
        # supplied it.
        self.books['combined_features'] = (
            self.books['title'].fillna('') + ' ' +
            self.books['author'] + ' ' +
            self.books['description']
        )

        # If genre is available, include it
        if 'genre' in self.books.columns:
            self.books['genre'] = self.books['genre'].fillna('')
            self.books['combined_features'] += ' ' + self.books['genre']

    def _create_tfidf_matrix(self):
        """Fit a TF-IDF vectorizer on 'combined_features' and store the matrix."""
        self.tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=10000,
            ngram_range=(1, 2)
        )
        self.tfidf_matrix = self.tfidf.fit_transform(self.books['combined_features'])

    def _compute_similarity_matrix(self):
        """Store the book-by-book similarity matrix in ``self.cosine_sim``."""
        self.cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

    def recommend_books(self, book_title, n=5):
        """
        Get book recommendations based on content similarity.

        Args:
            book_title (str): Title of the book to get recommendations for
                (matched case-insensitively; the first match is used)
            n (int): Number of recommendations to return

        Returns:
            DataFrame: Recommended books ('title', 'author',
            'similarity_score'), most similar first. Rows keep the index
            labels they have in ``self.books``.

        Raises:
            ValueError: If no book with that title exists.
        """
        # Locate the book by *position*: rows of the similarity matrix are
        # positional, so an index label is only valid for a default RangeIndex.
        title_matches = (
            self.books['title'].str.lower().eq(book_title.lower()).fillna(False)
        )
        positions = np.flatnonzero(title_matches.to_numpy(dtype=bool))

        if positions.size == 0:
            raise ValueError(f"Book '{book_title}' not found in database.")

        idx = int(positions[0])

        # Rank every book by similarity, best first. The stable sort keeps
        # catalogue order among ties, matching a stable descending sort of
        # (position, score) pairs.
        scores = np.asarray(self.cosine_sim[idx], dtype=float)
        ranked = np.argsort(-scores, kind='stable')

        # Remove the query book explicitly instead of assuming it sorts first;
        # a duplicate or identical-feature book can tie with it and come first.
        ranked = ranked[ranked != idx][:max(int(n), 0)]

        # Return top n most similar books
        recommendations = self.books.iloc[ranked].copy()
        recommendations['similarity_score'] = scores[ranked]

        return recommendations[['title', 'author', 'similarity_score']]


In [None]:
if __name__ == "__main__":
    # Toy catalogue: one (title, author, description, genre) record per book.
    # A real application would load this from a database or CSV instead.
    catalogue_rows = [
        ('The Hobbit', 'J.R.R. Tolkien',
         'A hobbit goes on an adventure with dwarves to reclaim treasure from a dragon.',
         'Fantasy'),
        ('The Lord of the Rings', 'J.R.R. Tolkien',
         'A fellowship embarks on a quest to destroy a powerful ring and defeat the Dark Lord.',
         'Fantasy'),
        ("Harry Potter and the Philosopher's Stone", 'J.K. Rowling',
         "A young boy discovers he's a wizard and attends a magical school.",
         'Fantasy'),
        ('The Catcher in the Rye', 'J.D. Salinger',
         'A teenager experiences alienation and rebellion in New York City.',
         'Literary Fiction'),
        ('To Kill a Mockingbird', 'Harper Lee',
         'A lawyer defends a black man accused of rape in the American South.',
         'Literary Fiction'),
        ('1984', 'George Orwell',
         'A dystopian novel about totalitarianism and surveillance.',
         'Dystopian'),
        ('Animal Farm', 'George Orwell',
         'An allegorical novella about animals taking over a farm.',
         'Dystopian'),
        ('The Great Gatsby', 'F. Scott Fitzgerald',
         'A story of wealth, love, and the American Dream in the 1920s.',
         'Literary Fiction'),
        ('Pride and Prejudice', 'Jane Austen',
         'A romantic novel about the spirited Elizabeth Bennet and proud Mr. Darcy.',
         'Romance'),
        ('The Hunger Games', 'Suzanne Collins',
         'A dystopian novel where teenagers fight to the death in a televised event.',
         'Dystopian'),
    ]

    # Pivot the records into the column-oriented dict the recommender accepts
    field_names = ('title', 'author', 'description', 'genre')
    books_data = {
        field: [row[pos] for row in catalogue_rows]
        for pos, field in enumerate(field_names)
    }

    # Build the content-based model over the whole catalogue
    recommenderforcbf = BookRecommender(books_data)

    # Ask for the books most similar to one seed title
    target_book = "The Hobbit"
    recommendationsforcbf = recommenderforcbf.recommend_books(target_book)

    print(f"Recommendations for '{target_book}':")
    print(recommendationsforcbf)


Recommendations for 'The Hobbit':
                                      title          author  similarity_score
1                     The Lord of the Rings  J.R.R. Tolkien          0.055273
2  Harry Potter and the Philosopher's Stone    J.K. Rowling          0.024791
3                    The Catcher in the Rye   J.D. Salinger          0.000000
4                     To Kill a Mockingbird      Harper Lee          0.000000
5                                      1984   George Orwell          0.000000


# Collaborative Filtering

In [None]:
class CollaborativeBookRecommender:
    """User-based collaborative-filtering recommender.

    Users are compared through the cosine similarity of their rating vectors;
    a book's predicted rating for a user is the similarity-weighted average of
    the ratings given to it by that user's most similar neighbours.
    """

    def __init__(self, ratings_data):
        """
        Initialize the recommender with ratings data.

        Args:
            ratings_data (dict or DataFrame): Contains:
                - 'user_id': User identifiers
                - 'book_id': Book identifiers
                - 'rating': Ratings (1-5 scale)
                - 'title': Book titles (optional, used for display only)
        """
        # Always work on a private frame so the caller's data is never mutated
        if isinstance(ratings_data, dict):
            self.ratings = pd.DataFrame(ratings_data)
        else:
            self.ratings = ratings_data.copy()

        # Preprocess the data
        self._preprocess_data()

        # Create user-book matrix
        self._create_user_book_matrix()

        # Compute user similarity matrix
        self._compute_user_similarity()

    def _preprocess_data(self):
        """Drop duplicate (user, book) rows and build the optional title lookup."""
        # pivot() below requires each (user_id, book_id) pair to be unique
        self.ratings = self.ratings.drop_duplicates(['user_id', 'book_id'])

        # Create mapping from book_id to title for display purposes
        if 'title' in self.ratings.columns:
            self.book_id_to_title = dict(zip(self.ratings['book_id'], self.ratings['title']))

    def _create_user_book_matrix(self):
        """Create the user x book rating matrix (0 marks 'not rated')."""
        self.user_book_matrix = self.ratings.pivot(
            index='user_id',
            columns='book_id',
            values='rating'
        ).fillna(0)

        # Convert to sparse matrix for efficient computation
        self.sparse_user_book = csr_matrix(self.user_book_matrix.values)

    def _compute_user_similarity(self):
        """Compute cosine similarity between users, labelled by user_id."""
        self.user_similarity = cosine_similarity(self.sparse_user_book)
        self.user_similarity_df = pd.DataFrame(
            self.user_similarity,
            index=self.user_book_matrix.index,
            columns=self.user_book_matrix.index
        )

    def recommend_books(self, user_id, n=5):
        """
        Get book recommendations for a user based on similar users' preferences.

        The ``n`` most similar other users act as neighbours. For each book the
        predicted rating is the similarity-weighted mean of the neighbours'
        ratings, taken only over neighbours who actually rated that book; a
        book none of them rated with positive similarity gets 0.0.

        Args:
            user_id: The user to get recommendations for
            n (int): Number of recommendations to return (also the number of
                neighbours consulted)

        Returns:
            DataFrame: Recommended books with predicted ratings ('book_id',
            'predicted_rating' and, when titles are known, 'title'), best
            first, with a fresh 0..k-1 index.

        Raises:
            ValueError: If the user has no row in the rating matrix.
        """
        if user_id not in self.user_book_matrix.index:
            raise ValueError(f"User {user_id} not found in database.")

        limit = max(int(n), 0)

        # Remove the target user explicitly rather than assuming they sort
        # first: another user with an equally high similarity can tie.
        neighbours = (
            self.user_similarity_df[user_id]
            .drop(labels=user_id)
            .sort_values(ascending=False)
            .head(limit)
        )

        weights = neighbours.to_numpy(dtype=float)
        neighbour_ratings = self.user_book_matrix.loc[neighbours.index].to_numpy(dtype=float)

        # Zero cells mean "not rated" (see _create_user_book_matrix), so they
        # must not take part in the average. Normalise each book by the
        # similarity mass of the neighbours who rated it.
        weighted_sum = weights @ neighbour_ratings
        rater_weight = weights @ (neighbour_ratings > 0).astype(float)

        # Guarded division: books without any similar rater stay at 0.0
        # instead of becoming NaN through 0/0.
        predicted = np.divide(
            weighted_sum,
            rater_weight,
            out=np.zeros_like(weighted_sum),
            where=rater_weight > 0,
        )

        # Create recommendations DataFrame
        recommendations = pd.DataFrame({
            'book_id': self.user_book_matrix.columns,
            'predicted_rating': predicted
        })

        # Filter out books already rated by the user
        user_rated = set(self.ratings.loc[self.ratings['user_id'] == user_id, 'book_id'])
        recommendations = recommendations[~recommendations['book_id'].isin(user_rated)]

        # Sort by predicted rating (stable, so ties keep book-column order)
        recommendations = recommendations.sort_values(
            'predicted_rating', ascending=False, kind='stable'
        ).head(limit)

        # Add book titles if available; assign() returns a new frame, which
        # avoids writing a column into a filtered slice.
        if hasattr(self, 'book_id_to_title'):
            recommendations = recommendations.assign(
                title=recommendations['book_id'].map(self.book_id_to_title)
            )

        return recommendations.reset_index(drop=True)

In [None]:
if __name__ == "__main__":
    # Toy ratings: one (user_id, book_id, rating) triple per rating event.
    # A real application would load this from a database or CSV instead.
    rating_triples = [
        (1, 101, 5), (1, 102, 4), (1, 103, 3),
        (2, 101, 4), (2, 102, 5), (2, 104, 2),
        (3, 102, 5), (3, 103, 4), (3, 105, 1),
        (4, 101, 3), (4, 103, 5), (4, 105, 4),
        (5, 102, 4), (5, 104, 3), (5, 105, 5),
    ]

    # Display title for every book id that appears in the triples
    titles_by_book = {
        101: 'The Hobbit',
        102: 'The Lord of the Rings',
        103: "Harry Potter and the Philosopher's Stone",
        104: 'To Kill a Mockingbird',
        105: '1984',
    }

    # Column-oriented dict in the layout the recommender accepts
    ratings_data = {
        'user_id': [user for user, _, _ in rating_triples],
        'book_id': [book for _, book, _ in rating_triples],
        'rating': [score for _, _, score in rating_triples],
        'title': [titles_by_book[book] for _, book, _ in rating_triples],
    }

    # Build the user-based collaborative model
    recommenderforcf = CollaborativeBookRecommender(ratings_data)

    # Recommend unseen books to user 1
    user_id = 1
    recommendationsforcf = recommenderforcf.recommend_books(user_id)

    print(f"Recommendations for user {user_id}:")
    print(recommendationsforcf.to_string(index=False))

Recommendations for user 1:
                                   title         author  similarity_score
                   The Lord of the Rings J.R.R. Tolkien          0.055273
Harry Potter and the Philosopher's Stone   J.K. Rowling          0.024791
                  The Catcher in the Rye  J.D. Salinger          0.000000
                   To Kill a Mockingbird     Harper Lee          0.000000
                                    1984  George Orwell          0.000000


# Hybrid Filtering

In [None]:
class HybridBookRecommender:
    """Hybrid recommender blending content similarity and collaborative scores.

    The content side ranks books by the cosine similarity of their TF-IDF text
    vectors; the collaborative side scores books from the ratings of similar
    users. When both a book and a user are supplied, the two scores are
    combined as a weighted sum.
    """

    def __init__(self, books_data, ratings_data, content_weight=0.5, collab_weight=0.5):
        """
        Initialize the hybrid recommender with both book content and ratings data.

        Args:
            books_data: Contains book information (book_id, title, author,
                description and optionally genre)
            ratings_data: Contains user-book ratings (user_id, book_id, rating)
            content_weight (float): Weight of 'content_score' in the hybrid score
            collab_weight (float): Weight of 'collab_score' in the hybrid score
        """
        # Hybrid weights; both remain plain attributes and can be re-tuned later
        self.content_weight = content_weight
        self.collab_weight = collab_weight

        # Initialize content-based components
        if isinstance(books_data, dict):
            self.books = pd.DataFrame(books_data)
        else:
            self.books = books_data.copy()
        self._preprocess_book_data()
        self._create_content_matrix()

        # Initialize collaborative filtering components
        if isinstance(ratings_data, dict):
            self.ratings = pd.DataFrame(ratings_data)
        else:
            self.ratings = ratings_data.copy()
        self._preprocess_rating_data()
        self._create_user_book_matrix()

    # Content-based methods
    def _preprocess_book_data(self):
        """Fill missing text, build 'combined_features' and the title lookup."""
        self.books['description'] = self.books['description'].fillna('')
        self.books['author'] = self.books['author'].fillna('')

        # Combine features for content analysis. The title is filled only
        # inside this expression so that a missing title cannot turn the whole
        # concatenation into NaN.
        self.books['combined_features'] = (
            self.books['title'].fillna('') + ' ' +
            self.books['author'] + ' ' +
            self.books['description']
        )

        if 'genre' in self.books.columns:
            self.books['genre'] = self.books['genre'].fillna('')
            self.books['combined_features'] += ' ' + self.books['genre']

        # Create book_id to title mapping
        self.book_id_to_title = dict(zip(self.books['book_id'], self.books['title']))

    def _create_content_matrix(self):
        """Create the TF-IDF matrix and the book-by-book similarity matrix."""
        self.tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=10000,
            ngram_range=(1, 2)
        )
        self.content_matrix = self.tfidf.fit_transform(self.books['combined_features'])
        self.content_sim = cosine_similarity(self.content_matrix)

    # Collaborative filtering methods
    def _preprocess_rating_data(self):
        """Drop duplicate (user, book) rows and build id -> ordinal mappings."""
        # pivot() in _create_user_book_matrix needs unique (user, book) pairs
        self.ratings = self.ratings.drop_duplicates(['user_id', 'book_id'])

        # Ordinals follow first appearance in the ratings frame
        self.user_id_to_idx = {uid: i for i, uid in enumerate(self.ratings['user_id'].unique())}
        self.book_id_to_idx = {bid: i for i, bid in enumerate(self.ratings['book_id'].unique())}

    def _create_user_book_matrix(self):
        """Create the user x book matrix (0 = not rated) and user similarities."""
        self.user_book_matrix = self.ratings.pivot(
            index='user_id',
            columns='book_id',
            values='rating'
        ).fillna(0)

        # Convert to sparse matrix
        self.sparse_user_book = csr_matrix(self.user_book_matrix.values)

        # Compute user similarity
        self.user_sim = cosine_similarity(self.sparse_user_book)
        self.user_sim_df = pd.DataFrame(
            self.user_sim,
            index=self.user_book_matrix.index,
            columns=self.user_book_matrix.index
        )

    # Hybrid recommendation methods
    def recommend_books(self, user_id=None, book_id=None, n=5):
        """
        Get hybrid recommendations.

        With only ``book_id`` the result is content-based, with only
        ``user_id`` it is collaborative; with both, the two score sets are
        merged into a weighted 'hybrid_score'.

        Args:
            user_id: For collaborative filtering (optional)
            book_id: For content-based filtering (optional)
            n: Number of recommendations

        Returns:
            DataFrame with recommendations

        Raises:
            ValueError: If neither id is given, or a given id is unknown.
        """
        if user_id is None and book_id is None:
            raise ValueError("Must provide either user_id or book_id")

        content_rec = pd.DataFrame()
        collab_rec = pd.DataFrame()

        # Get content-based recommendations if book_id provided
        if book_id is not None:
            content_rec = self._get_content_recommendations(book_id)

        # Get collaborative recommendations if user_id provided
        if user_id is not None:
            collab_rec = self._get_collaborative_recommendations(user_id)

        # Combine recommendations
        return self._combine_recommendations(content_rec, collab_rec, n)

    def _get_content_recommendations(self, book_id, n=50):
        """Return up to ``n`` books most similar in content to ``book_id``.

        Returns a frame with 'book_id', 'title' and 'content_score', best
        first. Raises ValueError for an unknown book.
        """
        # Locate the book by *position*: rows of content_sim are positional,
        # so an index label is only valid for a default RangeIndex.
        matches = np.flatnonzero((self.books['book_id'] == book_id).to_numpy())
        if matches.size == 0:
            raise ValueError(f"Book {book_id} not found in database.")
        book_pos = int(matches[0])

        # Rank all books, best first; the stable sort keeps catalogue order
        # among ties.
        scores = np.asarray(self.content_sim[book_pos], dtype=float)
        order = np.argsort(-scores, kind='stable')

        # Drop the query book explicitly; a tie with an identical book means
        # it is not guaranteed to sort first.
        order = order[order != book_pos][:max(int(n), 0)]

        recommendations = self.books.iloc[order].copy()
        recommendations['content_score'] = scores[order]

        return recommendations[['book_id', 'title', 'content_score']]

    def _get_collaborative_recommendations(self, user_id, n=50):
        """Score the user's unrated books from the ``n`` most similar users.

        'collab_score' is the neighbours' ratings weighted by similarity and
        divided by the neighbours' total similarity; unrated cells count as 0.
        Returns 'book_id', 'title', 'collab_score'. Books without a catalogue
        title are dropped. An empty frame is returned when no neighbour has a
        positive similarity. Raises ValueError for an unknown user.
        """
        if user_id not in self.user_book_matrix.index:
            raise ValueError(f"User {user_id} not found in database.")

        # Remove the user explicitly instead of slicing off the first sorted
        # entry, which is only the user themselves when nobody ties with them.
        neighbours = (
            self.user_sim_df[user_id]
            .drop(labels=user_id)
            .sort_values(ascending=False)
            .head(max(int(n), 0))
        )

        # With no similarity mass the weighted average is undefined (0/0).
        # Return an explicit empty result rather than NaN scores.
        total_similarity = float(neighbours.sum())
        if not total_similarity > 0:
            return pd.DataFrame(columns=['book_id', 'title', 'collab_score'])

        # Calculate weighted average
        neighbour_ratings = self.user_book_matrix.loc[neighbours.index].to_numpy(dtype=float)
        weighted_ratings = (neighbours.to_numpy(dtype=float) @ neighbour_ratings) / total_similarity

        # Create recommendations
        recommendations = pd.DataFrame({
            'book_id': self.user_book_matrix.columns,
            'collab_score': weighted_ratings
        })

        # Filter out already rated books
        user_rated = set(self.ratings.loc[self.ratings['user_id'] == user_id, 'book_id'])
        recommendations = recommendations[~recommendations['book_id'].isin(user_rated)]

        # Add titles; assign() builds a new frame instead of writing into a slice
        recommendations = recommendations.assign(
            title=recommendations['book_id'].map(self.book_id_to_title)
        )

        return recommendations[['book_id', 'title', 'collab_score']].dropna()

    def _combine_recommendations(self, content_rec, collab_rec, n=5):
        """Combine content and collaborative recommendations.

        If only one side has rows, it is returned sorted by its own score.
        Otherwise the sides are outer-merged on ('book_id', 'title'), missing
        scores become 0, and rows are ranked by the weighted 'hybrid_score'.
        """
        # Nothing to rank on either side: return an empty frame, preferring
        # one that still carries column labels. Sorting a column-less empty
        # frame by score would raise KeyError.
        if content_rec.empty and collab_rec.empty:
            for frame in (collab_rec, content_rec):
                if len(frame.columns) > 0:
                    return frame.head(0)
            return pd.DataFrame(columns=['book_id', 'title'])

        # If we only have one type of recommendation, return it
        if content_rec.empty:
            return collab_rec.sort_values('collab_score', ascending=False).head(n)
        if collab_rec.empty:
            return content_rec.sort_values('content_score', ascending=False).head(n)

        # Merge both recommendation types
        hybrid_rec = pd.merge(
            content_rec,
            collab_rec,
            on=['book_id', 'title'],
            how='outer'
        ).fillna(0)

        # Calculate hybrid score
        hybrid_rec['hybrid_score'] = (
            self.content_weight * hybrid_rec['content_score'] +
            self.collab_weight * hybrid_rec['collab_score']
        )

        # Return top n recommendations
        return hybrid_rec.sort_values('hybrid_score', ascending=False).head(n)

In [None]:
if __name__ == "__main__":
    # Toy catalogue: one (book_id, title, author, description, genre) record
    # per book, pivoted below into the column-oriented dict the model accepts.
    book_records = [
        (101, 'The Hobbit', 'J.R.R. Tolkien',
         'A hobbit goes on an adventure with dwarves to reclaim treasure from a dragon.',
         'Fantasy'),
        (102, 'The Lord of the Rings', 'J.R.R. Tolkien',
         'A fellowship embarks on a quest to destroy a powerful ring and defeat the Dark Lord.',
         'Fantasy'),
        (103, "Harry Potter and the Philosopher's Stone", 'J.K. Rowling',
         "A young boy discovers he's a wizard and attends a magical school.",
         'Fantasy'),
        (104, 'To Kill a Mockingbird', 'Harper Lee',
         'A lawyer defends a black man accused of rape in the American South.',
         'Literary Fiction'),
        (105, '1984', 'George Orwell',
         'A dystopian novel about totalitarianism and surveillance.',
         'Dystopian'),
    ]
    book_fields = ('book_id', 'title', 'author', 'description', 'genre')
    books_data = {
        field: [record[pos] for record in book_records]
        for pos, field in enumerate(book_fields)
    }

    # Toy ratings: one (user_id, book_id, rating) triple per rating event
    rating_triples = [
        (1, 101, 5), (1, 102, 4), (1, 103, 3),
        (2, 101, 4), (2, 102, 5), (2, 104, 2),
        (3, 102, 5), (3, 103, 4), (3, 105, 1),
        (4, 101, 3), (4, 103, 5), (4, 105, 4),
        (5, 102, 4), (5, 104, 3), (5, 105, 5),
    ]
    ratings_data = {
        'user_id': [user for user, _, _ in rating_triples],
        'book_id': [book for _, book, _ in rating_triples],
        'rating': [score for _, _, score in rating_triples],
    }

    # Build the hybrid model from both data sets
    recommenderforhf = HybridBookRecommender(books_data, ratings_data)

    # Example 1: seed with a book only. No user_id is passed, so the
    # collaborative side stays empty and the ranking is content-based.
    print("Hybrid recommendations similar to 'The Hobbit':")
    book_recs = recommenderforhf.recommend_books(book_id=101)
    print(book_recs.to_string(index=False))

    # Example 2: seed with a user only. No book_id is passed, so the content
    # side stays empty and the ranking is collaborative.
    print("\nHybrid recommendations for user 1:")
    user_recs = recommenderforhf.recommend_books(user_id=1)
    print(user_recs.to_string(index=False))

    # Example 3: cold start, where no rating history is available for the
    # requester. Only a book_id can be supplied, so this is the same
    # content-only call as example 1.
    print("\nCold start recommendations (content-based) for new book 'The Hobbit':")
    cold_start_recs = recommenderforhf.recommend_books(book_id=101)
    print(cold_start_recs.to_string(index=False))

Hybrid recommendations similar to 'The Hobbit':
 book_id                                    title  content_score
     102                    The Lord of the Rings       0.048002
     103 Harry Potter and the Philosopher's Stone       0.020233
     104                    To Kill a Mockingbird       0.000000
     105                                     1984       0.000000

Hybrid recommendations for user 1:
 book_id                 title  collab_score
     105                  1984      1.908658
     104 To Kill a Mockingbird      1.075146

Cold start recommendations (content-based) for new book 'The Hobbit':
 book_id                                    title  content_score
     102                    The Lord of the Rings       0.048002
     103 Harry Potter and the Philosopher's Stone       0.020233
     104                    To Kill a Mockingbird       0.000000
     105                                     1984       0.000000
