## Installing necessary libraries

In [None]:
pip install transformers

In [None]:
pip install torch torchvision torchaudio

In [None]:
import numpy as np
import pandas as pd
import string
import re
import scipy.sparse as sp

import nltk #using the nltk library for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from transformers import BertTokenizer, BertModel

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import vstack, hstack
from sklearn.metrics.pairwise import cosine_similarity

In [146]:
# Initializing BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

## Reading datasets from their respective CSV files

In [4]:
bx_cleaned = pd.read_csv('/Users/sanjeevani1109/Desktop/Book_Recommendation_System/bx_cleaned')

In [5]:
gbx_cleaned = pd.read_csv('/Users/sanjeevani1109/Desktop/Book_Recommendation_System/gbx_cleaned')

## Helper-functions

In [6]:
#Function to clean text data
def clean_text(text):
    text = text.lower()     #Converting the text to lowercase
    text = re.sub(r'\d+', ' ', text)    # Removing digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Removing special characters
    text = text.translate(str.maketrans('', '', string.punctuation))    #Removing punctuation     
    tokens = word_tokenize(text) #Tokenization of the text
    non_stopwords = []  #Creating an empty list to store the non-stopwords
    stop_words = set(stopwords.words('english')) #Initializing the NLTK English stopwords
    #Lemmatization of the words/tokens
    lemmatizer = WordNetLemmatizer()    #Initialization of the NLTK WordNet Lemmatizer

    #Iterating through the list of tokens, lemmatizing them, and adding it to the list of non-stopwords
    for token in tokens:
        if token not in stop_words:
            lemma = lemmatizer.lemmatize(token) #Performing lemmatization
            non_stopwords.append(lemma)  # Appending the stemmed token back to the list of non-stopwords

    # Joining the tokens/words from the list of non-stopwords into a string
    filtered_text = ' '.join(non_stopwords)
    return filtered_text

In [7]:
def clean_genres(text):
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [147]:
# Generate BERT embeddings for a list of texts in batches

def get_bert_embeddings_batch(text_list, batch_size=32):
    embeddings = [] #initializign an empty list to store the BERT embeddings for all the texts
    for i in range(0, len(text_list), batch_size):  #batch processing the texts from the list of texts
        batch = text_list[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512) #tokenization of the texts in the batch and returning the PyTorch tensors
        outputs = model(**inputs)   #passing the tokenized inputs to the BERT model to retrieve the outputs
        embeddings.extend(outputs.last_hidden_state.mean(dim=1).detach().numpy()) 
    return np.vstack(embeddings)    #vertical stacking of of the embeddings to create a single matrix of embeddings for the input texts

In [8]:
bx_cleaned['cleaned_genres'] = bx_cleaned['Genres'].apply(clean_genres)

In [9]:
gbx_cleaned['cleaned_genres'] = gbx_cleaned['Genres'].apply(clean_genres)

In [10]:
bx_cleaned['title_genres_combined'] = bx_cleaned['cleaned_title'] + ' ' + bx_cleaned['cleaned_genres']
gbx_cleaned['title_genres_combined'] = gbx_cleaned['cleaned_title'] + ' ' + gbx_cleaned['cleaned_genres']

In [148]:
bx_cleaned.head(1)

Unnamed: 0,UserID,ISBN,User_Rating,Title,Author,Average_Rating,cleaned_title,cleaned_author,Genres,new_combined,title_embeddings,combined_embeddings,genres_embeddings,combined_cluster_label,genre_cluster_label,cleaned_genres,title_genres_combined,authors_embeddings
0,276733,2080674722,1,Les Particules Elementaires,Michel Houellebecq,2,le particules elementaires,michel houellebecq,"Fiction, France, Literature, Novels, French Li...",le particules elementaires michel houellebecq ...,"[-0.22314462, 0.42006478, -0.28728843, -0.1840...","[-0.21087638, 0.38165882, -0.00066206243, -0.3...","[0.11841526, 0.20468897, -0.028159915, -0.2183...",1,3,fiction france literature novels french litera...,le particules elementaires fiction france lite...,"[-0.3414879, -0.07702848, -0.051483676, -0.296..."


### Creating BERT embeddings for relevant columns

#### (uncomment to regenerate embeddings)

##### 'Title' columns

In [None]:
# unique_title_bx = bx_cleaned['Title'].drop_duplicates().reset_index(drop=True)
# title_embeddings_bx = get_bert_embeddings_batch(bx_cleaned['Title'].tolist())
# genres_to_embedding_bx = dict(zip(unique_title_bx, title_embeddings_bx))
# # bx_cleaned.head(3)

In [None]:
# unique_title_gbx = gbx_cleaned['Title'].drop_duplicates().reset_index(drop=True)
# title_embeddings_gbx = get_bert_embeddings_batch(gbx_cleaned['Title'].tolist())
# genres_to_embedding_bx = dict(zip(unique_title_gbx, title_embeddings_gbx))
# # bx_cleaned.head(3)

##### 'new_combined' columns (cleaned_title + cleaned_author + cleaned_genres)

In [None]:
# unique_combined_bx = bx_cleaned['new_combined'].drop_duplicates().reset_index(drop=True)
# combined_embeddings_bx = get_bert_embeddings_batch(bx_cleaned['new_combined'].tolist())
# combined_to_embedding_bx = dict(zip(unique_combined_bx, combined_embeddings_bx))
# # bx_cleaned.head(3)

In [None]:
# unique_combined_gbx = gbx_cleaned['new_combined'].drop_duplicates().reset_index(drop=True)
# combined_embeddings_gbx = get_bert_embeddings_batch(gbx_cleaned['new_combined'].tolist())
# combined_to_embedding_gbx = dict(zip(unique_combined_gbx, combined_embeddings_gbx))
# # bx_cleaned.head(3)

##### 'Genres' columns

In [None]:
# unique_genres_bx = bx_cleaned['Genres'].drop_duplicates().reset_index(drop=True)
# genres_embeddings_bx = get_bert_embeddings_batch(bx_cleaned['Genres'].tolist())
# combined_to_embedding_bx = dict(zip(unique_genres_bx, genres_embeddings_bx))
# # bx_cleaned.head(3)

In [None]:
# unique_genres_gbx = gbx_cleaned['Genres'].drop_duplicates().reset_index(drop=True)
# genres_embeddings_gbx = get_bert_embeddings_batch(gbx_cleaned['Genres'].tolist())
# combined_to_embedding_gbx = dict(zip(unique_genres_gbx, genres_embeddings_gbx))
# # bx_cleaned.head(3)

##### 'Author' columns

In [13]:
# unique_author_bx = bx_cleaned['Author'].drop_duplicates().reset_index(drop=True)
# author_embeddings_bx = get_bert_embeddings_batch(bx_cleaned['Author'].tolist())
# genres_to_embedding_bx = dict(zip(unique_author_bx, author_embeddings_bx))
# # bx_cleaned.head(3)

In [14]:
# unique_author_gbx = gbx_cleaned['Author'].drop_duplicates().reset_index(drop=True)
# author_embeddings_gbx = get_bert_embeddings_batch(gbx_cleaned['Author'].tolist())
# genres_to_embedding_gbx = dict(zip(unique_author_gbx, author_embeddings_gbx))
# # gbx_cleaned.head(3)

### Saving the embeddings as CSV files

##### This is done to avoid long recalculations everytime we restart the kernel ((uncomment to resave the embeddings))

In [None]:
# unique_title_bx.to_csv('unique_titles_bx.csv', index=False)
# unique_titles_gbx.to_csv('unique_titles_gbx.csv', index=False)

# unique_genres_bx.to_csv('unique_genres_bx.csv', index=False)
# unique_genres_gbx.to_csv('unique_genres_gbx.csv', index=False)

# unique_combined_bx.to_csv('unique_combined_bx.csv', index=False)
# unique_combined_gbx.to_csv('unique_combined_gbx.csv', index=False)

# unique_author_bx.to_csv('unique_author_bx.csv', index=False)
# unique_author_gbx.to_csv('unique_author_gbx.csv', index=False)

In [None]:
# np.save('title_embeddings_bx.npy', title_embeddings_bx)
# np.save('title_embeddings_gbx.npy', title_embeddings_gbx)

# np.save('combined_embeddings_bx.npy', combined_embeddings_bx)
# np.save('combined_embeddings_gbx.npy', combined_embeddings_gbx)

# np.save('genres_embeddings_bx.npy', genres_embeddings_bx)
# np.save('genres_embeddings_gbx.npy', genres_embeddings_gbx)

# np.save('author_embeddings_bx.npy', author_embeddings_bx)
# np.save('author_embeddings_gbx.npy', author_embeddings_gbx)


### Saving the data frames as csv files

In [17]:
# gbx_cleaned.to_csv('/Users/sanjeevani1109/Desktop/Book_Recommendation_System/gbx_cleaned', index=False)

In [18]:
# bx_cleaned.to_csv('/Users/sanjeevani1109/Desktop/Book_Recommendation_System/bx_cleaned', index=False)

### Loading the embeddings from the Numpys

In [19]:
bx_title_embeddings_array = np.load('title_embeddings_bx.npy')
gbx_title_embeddings_array = np.load('title_embeddings_gbx.npy')

bx_combined_embeddings_array = np.load('combined_embeddings_bx.npy')
gbx_combined_embeddings_array = np.load('combined_embeddings_gbx.npy')

bx_genres_embeddings_array = np.load('genres_embeddings_bx.npy')
gbx_genres_embeddings_array = np.load('genres_embeddings_gbx.npy')

bx_authors_embeddings_array = np.load('author_embeddings_bx.npy')
gbx_authors_embeddings_array = np.load('author_embeddings_gbx.npy')

### Loading the unique embeddings

In [20]:
unique_titles_gbx = pd.read_csv('unique_titles_gbx.csv')['Title']
unique_titles_bx = pd.read_csv('unique_titles_bx.csv')['Title']

unique_combined_gbx = pd.read_csv('unique_combined_gbx.csv')['new_combined']
unique_combined_bx = pd.read_csv('unique_combined_bx.csv')['new_combined']

unique_genres_gbx = pd.read_csv('unique_genres_gbx.csv')['Genres']
unique_genres_bx = pd.read_csv('unique_genres_bx.csv')['Genres']

unique_authors_bx = pd.read_csv('unique_author_bx.csv')['Author']
unique_authors_gbx = pd.read_csv('unique_author_gbx.csv')['Author']

### Updating the data frame with the embedding values

In [21]:
title_to_embedding_gbx = dict(zip(unique_titles_gbx, gbx_title_embeddings_array))
gbx_cleaned['title_embeddings'] = gbx_cleaned['Title'].map(title_to_embedding_gbx)

title_to_embedding_bx = dict(zip(unique_titles_bx, bx_title_embeddings_array))
bx_cleaned['title_embeddings'] = bx_cleaned['Title'].map(title_to_embedding_bx)

combined_to_embedding_gbx = dict(zip(unique_combined_gbx, gbx_combined_embeddings_array))
gbx_cleaned['combined_embeddings'] = gbx_cleaned['new_combined'].map(combined_to_embedding_gbx)

combined_to_embedding_bx = dict(zip(unique_combined_bx, bx_combined_embeddings_array))
bx_cleaned['combined_embeddings'] = bx_cleaned['new_combined'].map(combined_to_embedding_bx)

genres_to_embedding_gbx = dict(zip(unique_genres_gbx, gbx_genres_embeddings_array))
gbx_cleaned['genres_embeddings'] = gbx_cleaned['Genres'].map(genres_to_embedding_gbx)

genres_to_embedding_bx = dict(zip(unique_genres_bx, bx_genres_embeddings_array))
bx_cleaned['genres_embeddings'] = bx_cleaned['Genres'].map(genres_to_embedding_bx)

authors_to_embedding_bx = dict(zip(unique_authors_bx, bx_authors_embeddings_array))
bx_cleaned['authors_embeddings'] = bx_cleaned['Author'].map(authors_to_embedding_bx)

authors_to_embedding_gbx = dict(zip(unique_authors_gbx, gbx_authors_embeddings_array))
gbx_cleaned['authors_embeddings'] = gbx_cleaned['Author'].map(authors_to_embedding_gbx)

# 1. Content-based Filtering Model

##### Using BERT embeddings, TFIDF vectorization, and kNN

In [22]:
relevant_columns = ['Title', 'Author', 'Genres', 'cleaned_title', 'cleaned_author', 'new_combined', 'title_embeddings', 'authors_embeddings', 'genres_embeddings', 'combined_embeddings', 'cleaned_genres']  
relevant_columns_gbx = ['BookID', 'Title', 'Author', 'Genres', 'cleaned_title', 'cleaned_author', 'new_combined', 'title_embeddings', 'authors_embeddings', 'genres_embeddings', 'combined_embeddings', 'cleaned_genres']

# Dropping duplicates based on the 'Title' column and only selecting the relevant columns for bx_cleaned and gbx_cleaned
gbx_content = gbx_cleaned.drop_duplicates(subset='Title')[relevant_columns_gbx]
bx_content = bx_cleaned.drop_duplicates(subset='Title')[relevant_columns]

In [25]:
gbx_content.head(1)

Unnamed: 0,BookID,Title,Author,Genres,cleaned_title,cleaned_author,new_combined,title_embeddings,authors_embeddings,genres_embeddings,combined_embeddings,cleaned_genres
0,2767052,The Hunger Games,Suzanne Collins,"Young Adult, Fiction, Fantasy, Dystopia, Scien...",hunger game,suzanne collins,hunger game suzanne collins young adult fictio...,"[-0.35120225, -0.6337399, -0.2950579, -0.18312...","[-0.12804408, 0.11276019, -0.25651056, -0.0526...","[-0.03419616, 0.14870769, 0.5226705, -0.080204...","[-0.10558018, 0.06420389, 0.4107097, 0.1490544...",young adult fiction fantasy dystopia science f...


In [26]:
bx_content.head(1)

Unnamed: 0,Title,Author,Genres,cleaned_title,cleaned_author,new_combined,title_embeddings,authors_embeddings,genres_embeddings,combined_embeddings,cleaned_genres
0,Les Particules Elementaires,Michel Houellebecq,"Fiction, France, Literature, Novels, French Li...",le particules elementaires,michel houellebecq,le particules elementaires michel houellebecq ...,"[-0.22314462, 0.42006478, -0.28728843, -0.1840...","[-0.3414879, -0.07702848, -0.051483676, -0.296...","[0.11841526, 0.20468897, -0.028159915, -0.2183...","[-0.21087638, 0.38165882, -0.00066206243, -0.3...",fiction france literature novels french litera...


In [99]:
# Function to compute cosine similarity (based on Genres)

def cosine_sims(df, book_title):

    all_titles = df['Genres'].values
    np.append(all_titles, (np.array([book_title])))
    all_titles = all_titles.astype("U")

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(all_titles)

    # Converting the TFIDF matrix to CSR format
    tfidf_matrix_csr = sp.csr_matrix(tfidf_matrix)

    # Computing cosine similarity between the query and all titles
    query_vector = tfidf_matrix_csr[-1]
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix_csr).flatten()

    # Adding cosine similarity values to a new column in the DataFrame
    df['Cosine Similarity'] = cosine_similarities

    return df

In [28]:
# Fuction to compute combined features (combination of BERT embeddings and TFIDF vector representations)

def compute_combined_features(df, column_type, embeddings):

    #Computing TF-IDF Features
    tfidf_vectorizer = TfidfVectorizer(max_features=100)  
    tfidf_features = tfidf_vectorizer.fit_transform(df[column_type])

    #Concatenating BERT and TF-IDF Features
    bert_features = np.stack(df[embeddings].to_numpy())  # Using stack for an array of arrays
    combined_features = hstack([bert_features, tfidf_features]) # Horizontally stacking the BERT embeddings features and TFIDF vectors features

    return combined_features, tfidf_vectorizer

In [130]:
def content_based_recommendations(query, df, type = "combined", n_recommendations=5): # Providing the default type of recommendations as combined (based on title+author+genres)

    if 'BookID' in df.columns:      # Checking for BookID in the dataframes to determine which data frame to use
        df = gbx_content
    else:
        df = bx_content

    if type.lower() == "combined":
        column_type = "new_combined"
        embeddings = "combined_embeddings"
    elif type.lower() == "title":
        column_type = "Title"
        embeddings = "title_embeddings"
    elif type.lower() == "author":
        column_type = "Author"
        embeddings = "authors_embeddings"
    elif type.lower() == "genres":
        column_type = "cleaned_genres"
        embeddings = "genres_embeddings"

    # Computing combined features for the specified DataFrame
    combined_features, tfidf_vectorizer = compute_combined_features(df, column_type, embeddings)

    # Using k-Nearest Neighbors on the combined features
    knn_model = NearestNeighbors(n_neighbors=n_recommendations, metric='cosine')
    knn_model.fit(combined_features)

    query_embedding = get_bert_embeddings_batch([query])[0].astype(np.float64)
    query_tfidf = tfidf_vectorizer.transform([query])
    query_combined = hstack([np.array([query_embedding]), query_tfidf])

    distances, indices = knn_model.kneighbors(query_combined, n_neighbors=n_recommendations*10)

    # Flattening indices and fetching recommendations (based on the nearest neighbors)
    recommendations = df.iloc[indices.flatten()]

    # Retrieving unique recommendations based on Title and Author (excluding the query itself)
    unique_titles = set()
    unique_recommendations = []
    for _, row in recommendations.iterrows():
        unique_key = (row['Title'].lower(), row['Author'].lower())
        if unique_key not in unique_titles and row['Title'].lower() != query.lower():
            unique_titles.add(unique_key)
            unique_recommendations.append(row)
            if len(unique_recommendations) >= n_recommendations:
                break    
    print("Your query:-")
    print("Query:", query)

    combined_recs = pd.DataFrame(unique_recommendations, columns=['Title', 'Author', 'Genres']).reset_index(drop=True)

    # Calculating cosine similarity for the recommendations retrieved using kNN
    combined_recommendations = cosine_sims(combined_recs, book_title=query)  

    # Sorting the recommendations based on the cosine similarity values
    sorted_recommendations = combined_recommendations.sort_values(by='Cosine Similarity', ascending=False)

    # Retrieving the top 'n_recommendations' as specified by the user
    top_recommendations = sorted_recommendations.head(n_recommendations)

    # Creating a new data frame of top_recommendations with specified columns
    final_recs = top_recommendations.loc[:, ['Title', 'Author', 'Genres']]

    return final_recs

# 2. Collaborative-Filtering Model

#### Memory-based item-item similarity CF using kNN

In [30]:
bx_cols_to_drop = ['ISBN', 'title_embeddings', 'combined_embeddings', 'genres_embeddings', 'combined_cluster_label', 'genre_cluster_label']
bx_cf = bx_cleaned.drop(columns=bx_cols_to_drop)
bx_cf.head(1)

Unnamed: 0,UserID,User_Rating,Title,Author,Average_Rating,cleaned_title,cleaned_author,Genres,new_combined,cleaned_genres,title_genres_combined,authors_embeddings
0,276733,1,Les Particules Elementaires,Michel Houellebecq,2,le particules elementaires,michel houellebecq,"Fiction, France, Literature, Novels, French Li...",le particules elementaires michel houellebecq ...,fiction france literature novels french litera...,le particules elementaires fiction france lite...,"[-0.3414879, -0.07702848, -0.051483676, -0.296..."


In [31]:
gbx_cols_to_drop = ['id', 'ISBN', 'title_embeddings', 'combined_embeddings', 'genres_embeddings', 'combined_cluster_label', 'genre_cluster_label']
gbx_cf = gbx_cleaned.drop(columns=gbx_cols_to_drop)
gbx_cf.head(1)

Unnamed: 0,BookID,Author,Title,Average_Rating,UserID,User_Rating,cleaned_title,cleaned_author,Genres,new_combined,cleaned_genres,title_genres_combined,authors_embeddings
0,2767052,Suzanne Collins,The Hunger Games,4,314,5,hunger game,suzanne collins,"Young Adult, Fiction, Fantasy, Dystopia, Scien...",hunger game suzanne collins young adult fictio...,young adult fiction fantasy dystopia science f...,hunger game young adult fiction fantasy dystop...,"[-0.12804408, 0.11276019, -0.25651056, -0.0526..."


### Calculating User-Item matrices 

###### (Uncomment to recalculate the user-item matrices)

##### Title values as columns

In [None]:
# user_item_matrix_bx = bx_cf.pivot_table(index='UserID', columns='cleaned_title', values='User_Rating').fillna(0)

In [24]:
# user_item_matrix_gbx = gbx_cf.pivot_table(index='UserID', columns='cleaned_title', values='User_Rating').fillna(0)

##### Genres values as columns

In [25]:
# genres_user_item_matrix_gbx = gbx_cf.pivot_table(index='UserID', columns='cleaned_genres', values='User_Rating').fillna(0)

In [None]:
# genres_user_item_matrix_bx = bx_cf.pivot_table(index='UserID', columns='cleaned_genres', values='User_Rating').fillna(0)

### Saving the user-item matrices as CSV files 

##### This is done to avoid long recalculations everytime we restart the kernel ((uncomment to resave the matrices))

In [116]:
# user_item_matrix_bx.to_csv('user_item_matrix_bx.csv')

In [117]:
# user_item_matrix_gbx.to_csv('user_item_matrix_gbx.csv')

In [118]:
# genres_user_item_matrix_gbx.to_csv('genres_user_item_matrix_gbx.csv')

In [119]:
# genres_user_item_matrix_bx.to_csv('genres_user_item_matrix_bx.csv')

### Loading the User-Item matrices

In [32]:
user_item_matrix_bx = pd.read_csv('user_item_matrix_bx.csv', index_col=0)

In [None]:
user_item_matrix_gbx = pd.read_csv('user_item_matrix_gbx.csv', index_col=0)

In [None]:
genres_user_item_matrix_bx = pd.read_csv('genres_user_item_matrix_bx.csv', index_col=0)

In [33]:
genres_user_item_matrix_gbx = pd.read_csv('genres_user_item_matrix_gbx.csv', index_col=0)

In [157]:
# Function to retrieve the popular books from a dataframe (based on user ratings)
def get_popular_books(df, n=5):
    book_popularity = df.groupby('cleaned_title')['User_Rating'].count().sort_values(ascending=False).head(n)
    popular_books_details = df[df['cleaned_title'].isin(book_popularity.index)][['Title', 'Author', 'Genres']].drop_duplicates().head(n)
    return popular_books_details

In [132]:
# def item_collaborative_recommendations(book_title, df, n_recommendations=5):
#     cleaned_book_title = clean_text(book_title)
    
#     # Assuming user_item_matrix is predefined and contains your user-item interaction data
#     if 'BookID' in df.columns:
#         df = gbx_cf
#         user_item_matrix = user_item_matrix_gbx
#     else:
#         df = bx_cf
#         user_item_matrix = user_item_matrix_bx
    
#     if cleaned_book_title not in user_item_matrix.columns:
#     # if cleaned_book_title not in df['cleaned_title'].values:
#         print("Book title not found. Recommending popular books instead.")
#         return get_popular_books(df, n_recommendations)
    
#     # Initialize and train kNN model
#     model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_recommendations, n_jobs=-1)
#     model_knn.fit(user_item_matrix.T.values)  # Transpose to get item-feature matrix

#     # Locate the index of the book title
#     book_index = list(user_item_matrix.columns).index(cleaned_book_title)
#     distances, indices = model_knn.kneighbors(user_item_matrix.T.iloc[book_index, :].values.reshape(1, -1), n_neighbors=n_recommendations*2)

#     # Fetch indices for the recommended books, excluding the book itself
#     recommended_indices = indices.flatten()[1:]

#     # Fetch book titles for the recommended indices
#     recommended_books_titles = [user_item_matrix.columns[i] for i in recommended_indices]
    
#     # Fetch the details for the recommended books
#     recommended_books_details = df[df['cleaned_title'].isin(recommended_books_titles)].drop_duplicates(subset=['cleaned_title'])
#     recommended_books_details = recommended_books_details[['Title', 'Author', 'Genres']].head(n_recommendations)
    
#     # print("Your query:-")
#     # print("Query:", book_title)
#     # print("Title:", row['Title'])
#     # print("Author:", row['Author'])
#     # print("Genres:", df[df['Genres']])
    
#     return recommended_books_details

In [139]:
def collaborative_recommendations(query, df, type="Title", n_recommendations=5):    # Providing the default type of recommendations as Title (based on book title)
    
    cleaned_book_title = clean_text(query)  # Applying the clean_text function to the query to reduce it to its base form

    if type.lower() == "title":     # Checking if the user wishes for title-based recommendations

        if 'BookID' in df.columns:
            df = gbx_cf
            user_item_matrix = user_item_matrix_gbx
        else:
            df = bx_cf
            user_item_matrix = user_item_matrix_bx

        if cleaned_book_title not in user_item_matrix.columns:
            print("Book title not found. Recommending popular books instead.")
            return get_popular_books(df, n_recommendations)
        
        # Using k-Nearest Neighbors on the transpose of the item-feature matrix
        model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_recommendations, n_jobs=-1)
        model_knn.fit(user_item_matrix.T.values) 

        book_index = list(user_item_matrix.columns).index(cleaned_book_title)
        distances, indices = model_knn.kneighbors(user_item_matrix.T.iloc[book_index, :].values.reshape(1, -1), n_neighbors=n_recommendations*2)

        # Flattening indices and fetching recommendations (based on the nearest neighbors)
        recommended_indices = indices.flatten()[1:]
        recommended_books_titles = [user_item_matrix.columns[i] for i in recommended_indices]

        # Dropping the duplicates from the retrieved books based on titles
        recommended_books_details = df[df['cleaned_title'].isin(recommended_books_titles)].drop_duplicates(subset=['cleaned_title'])

        recommended_books_details = recommended_books_details[['Title', 'Author', 'Genres']].head(n_recommendations)


    elif type.lower() == "genres":      # Checking if the user wishes for genres-based recommendations
        if 'BookID' in df.columns:
            df = gbx_cf
            user_item_matrix = genres_user_item_matrix_gbx
        else:
            df = bx_cf
            user_item_matrix = genres_user_item_matrix_bx

        if cleaned_book_title not in df['cleaned_title'].values:
            print("Book title not found. Recommending popular books instead.")
            return get_popular_books(df, n_recommendations)
        
        book_genres_set = set(df.loc[df['cleaned_title'] == cleaned_book_title, 'cleaned_genres'].iloc[0].lower().split())

        relevant_columns = []
        for col in user_item_matrix.columns:
            if set(col.split()) & book_genres_set:
                relevant_columns.append(col)

        if not relevant_columns:
            print("Matching genres not found in the matrix. Recommending popular books instead.")
            return get_popular_books(df, n_recommendations)

        # Using k-Nearest Neighbors on the transpose of the item-feature matrix
        model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_recommendations, n_jobs=-1)
        model_knn.fit(user_item_matrix[relevant_columns].values.T)  

        query_vector = user_item_matrix[relevant_columns].mean(axis=1).values.reshape(1, -1)
        distances, indices = model_knn.kneighbors(query_vector, n_neighbors=n_recommendations*2)

        recommended_indices = indices.flatten()[1:]
        recommended_genres = [user_item_matrix.columns[i] for i in recommended_indices]

        recommended_books = df[df['cleaned_genres'].apply(lambda x: any(genre in x for genre in recommended_genres))].drop_duplicates(subset=['cleaned_title'])
        recommended_books_details = recommended_books[['Title', 'Author', 'Genres']].head(n_recommendations)


    # Calculating cosine similarity for the retrieved recommendations 
    combined_recommendations = cosine_sims(recommended_books_details, book_title=query)  

    # Sorting the recommendations based on the cosine similarity values
    sorted_recommendations = combined_recommendations.sort_values(by='Cosine Similarity', ascending=False)

    # Retrieving the top 'n_recommendations' from the sorted recommendations
    top_recommendations = sorted_recommendations.head(n_recommendations)

    # Creating a new dataframe of the top 'n_recommendations' with specified columns
    final_recs = top_recommendations.loc[:, ['Title', 'Author', 'Genres']]

    return final_recs


# Hybrid-Filtering Models

In [39]:
bx_cols = ['ISBN', 'combined_cluster_label', 'genre_cluster_label']
bx_hybrid = bx_cleaned.drop(columns=bx_cols)
bx_hybrid.head(1)

Unnamed: 0,UserID,User_Rating,Title,Author,Average_Rating,cleaned_title,cleaned_author,Genres,new_combined,title_embeddings,combined_embeddings,genres_embeddings,cleaned_genres,title_genres_combined,authors_embeddings
0,276733,1,Les Particules Elementaires,Michel Houellebecq,2,le particules elementaires,michel houellebecq,"Fiction, France, Literature, Novels, French Li...",le particules elementaires michel houellebecq ...,"[-0.22314462, 0.42006478, -0.28728843, -0.1840...","[-0.21087638, 0.38165882, -0.00066206243, -0.3...","[0.11841526, 0.20468897, -0.028159915, -0.2183...",fiction france literature novels french litera...,le particules elementaires fiction france lite...,"[-0.3414879, -0.07702848, -0.051483676, -0.296..."


In [40]:
gbx_cols = ['id', 'ISBN', 'combined_cluster_label', 'genre_cluster_label']
gbx_hybrid = gbx_cleaned.drop(columns=bx_cols)
gbx_hybrid.head(1)

Unnamed: 0,id,BookID,Author,Title,Average_Rating,UserID,User_Rating,cleaned_title,cleaned_author,Genres,new_combined,title_embeddings,combined_embeddings,genres_embeddings,cleaned_genres,title_genres_combined,authors_embeddings
0,1,2767052,Suzanne Collins,The Hunger Games,4,314,5,hunger game,suzanne collins,"Young Adult, Fiction, Fantasy, Dystopia, Scien...",hunger game suzanne collins young adult fictio...,"[-0.35120225, -0.6337399, -0.2950579, -0.18312...","[-0.10558018, 0.06420389, 0.4107097, 0.1490544...","[-0.03419616, 0.14870769, 0.5226705, -0.080204...",young adult fiction fantasy dystopia science f...,hunger game young adult fiction fantasy dystop...,"[-0.12804408, 0.11276019, -0.25651056, -0.0526..."


## 3. Cascade Hybrid Model

In [160]:
# def cosine_sim(df, book_title):
#     # if book_title not in df['Title'].values:
#     #     print(f"Query title '{book_title}' not found in the dataset.")
#     #     return df
    
# # try creating new_cobijned with clean_text features for tfidf
    
#     # all_titles = df['Title'].values
#     all_titles = df['Genres'].values
#     np.append(all_titles, (np.array([book_title])))
#     all_titles = all_titles.astype("U")

#     tfidf_vectorizer = TfidfVectorizer()
#     tfidf_matrix = tfidf_vectorizer.fit_transform(all_titles)

#     # Convert the TF-IDF matrix to CSR format
#     tfidf_matrix_csr = sp.csr_matrix(tfidf_matrix)

#     # Get the index of the query title
#     # query_index = df.index[df['Title'] == book_title].tolist()[0]

#     # Compute cosine similarity between the query title and all titles
#     query_vector = tfidf_matrix_csr[-1]
#     cosine_similarities = cosine_similarity(query_vector, tfidf_matrix_csr).flatten()


#     # cosine_similarities = cosine_similarity(query_index, tfidf_matrix_csr).flatten()

#     # Add cosine similarity values to a new column in the DataFrame
#     df['Cosine Similarity'] = cosine_similarities

#     return df

In [164]:
def cascade_hybrid_recss(query, df, type="combined", n_recommendations=5):
    
    recs_list = []
    # combined_recommendations = pd.DataFrame()

    if 'BookID' in df.columns:
        content_df = gbx_content
        cf_df = gbx_cf
    else:
        content_df = bx_content
        cf_df = bx_cf
    
    if type.lower() == "title":
        cont_type = "title"
        cf_type = "title"
    elif type.lower() == "author":
        cont_type = "author"
        cf_type = "title"
    elif type.lower() == "genres":
        cont_type = "genres"
        cf_type = "genres"
    elif type.lower() == "combined":
        cont_type = "combined"
        cf_type = "title"


    # initial_recommendations = content_based_recommendations(query=book_title, df=content_df, new_type, n_recommendations=n_recommendations)
    initial_recommendations = content_based_recommendations(query=query, df=content_df, type=cont_type, n_recommendations=5)

    if initial_recommendations.empty:
        return get_popular_books(df, n=n_recommendations)

    if not initial_recommendations.empty:
        # combined_recommendations = pd.DataFrame()
        for index,row in initial_recommendations.iterrows():
            # print(initial_recommendations)
            title = row['Title']
            # return book_title


            cf_recommendations = collaborative_recommendations(query=title, df=cf_df, type=cf_type, n_recommendations=2)

            # cf_recommendations = collaborative_recommendations(query=query, df=content_df, type=new_type, n_recommendations=2)

            recs_list.append(cf_recommendations)
            # print(recs_list)
        combined_recommendations = pd.concat(recs_list, ignore_index=True)

    combined_recommendations = cosine_sims(combined_recommendations, book_title=query)  # Calculate cosine similarity
    sorted_recommendations = combined_recommendations.sort_values(by='Cosine Similarity', ascending=False)
    top_recommendations = sorted_recommendations.head(n_recommendations)
    final_recs = top_recommendations.loc[:, ['Title', 'Author', 'Genres']]
    return final_recs

# Weighted Hybrid

In [165]:
def weighted_hybrid_recommendations(query, df, type="combined", n_recommendations=5, content_weight=0.5, cf_weight=0.5):
    # Adjust the number of recommendations fetched from each model to account for overlaps
    n_content = int(n_recommendations * (content_weight / (content_weight + cf_weight)))
    n_cf = n_recommendations - n_content
    
    # Select the appropriate DataFrame based on df_identifier
    if 'BookID' in df.columns:
        df_content = gbx_content
        df_cf = gbx_cf
    else:
        df_content = bx_content
        df_cf = bx_cf

    if type.lower() == "title":
        cont_type = "title"
        cf_type = "title"
    elif type.lower() == "author":
        cont_type = "author"
        cf_type = "title"
    elif type.lower() == "genres":
        cont_type = "genres"
        cf_type = "genres"
    elif type.lower() == "combined":
        cont_type = "combined"
        cf_type = "title"
    
    # Get recommendations from content-based model
    content_recs = content_based_recommendations(query, df_content, type = cont_type, n_recommendations=n_content)
    
    # Get recommendations from collaborative filtering model
    cf_recs = collaborative_recommendations(query, df_cf, type = cf_type, n_recommendations=n_cf)
    # cf_recs = collaborative_recommendations(book_title, df_cf, type="genres", n_recommendations=n_cf)
    
    # Combine recommendations
    combined_recs = pd.concat([content_recs, cf_recs]).drop_duplicates(subset=['Title', 'Author'])
    # print(combined_recs)
    

    combined_recommendations = cosine_sims(combined_recs, book_title=query)  # Calculate cosine similarity
    sorted_recommendations = combined_recommendations.sort_values(by='Cosine Similarity', ascending=False)
    top_recommendations = sorted_recommendations.head(n_recommendations).reset_index(drop=True)
    final_recs = top_recommendations.loc[:, ['Title', 'Author', 'Genres']]
    return final_recs




    # # If there are more recommendations than needed, prioritize content-based recommendations
    # if len(combined_recs) > n_recommendations:
    #     combined_recs = combined_recs.head(n_recommendations)
    
    # return combined_recs.reset_index(drop=True)


In [83]:
book_title = "Fifty Shades of Grey"
weighted_hybrid_recommendations(book_title, df = gbx_hybrid, n_recommendations=5)

Your query:-
Query: Fifty Shades of Grey
Genres: Fiction, Contemporary, British Literature, Novels, LGBT, Coming Of Age, 20th Century


Unnamed: 0,Title,Author,Genres,Cosine Similarity
8800,Fifty Shades Freed,E.L. James,"Romance, Erotica, Fiction, BDSM, Adult, Contem...",1.0
9100,Fifty Shades Darker,E.L. James,"Romance, Fiction, Erotica, BDSM, Adult, Contem...",1.0
0,What I Loved,Siri Hustvedt,"Fiction, Contemporary, Art, Literary Fiction, ...",0.166593
3800,"Eat, pray, love: one woman's search for everyt...",Elizabeth Gilbert,"Nonfiction, Memoir, Travel, Biography, Romance...",0.142316
1,The Buddha of Suburbia,Hanif Kureishi,"Fiction, Contemporary, British Literature, Nov...",0.137919


In [117]:
weighted_hybrid_recommendations("Fifty Shades Of Grey", df=gbx_hybrid, type="Genres", n_recommendations=5)

Your query:-
Query: Fifty Shades Of Grey


Unnamed: 0,Title,Author,Genres,Cosine Similarity
0,Fifty Shades Freed,E.L. James,"Romance, Erotica, Fiction, BDSM, Adult, Contem...",1.0
1,Fifty Shades Darker,E.L. James,"Romance, Fiction, Erotica, BDSM, Adult, Contem...",1.0
2,"Eat, pray, love: one woman's search for everyt...",Elizabeth Gilbert,"Nonfiction, Memoir, Travel, Biography, Romance...",0.130593
3,A Hat Full of Sky,Terry Pratchett,"Fantasy, Fiction, Young Adult, Humor, Children...",0.103518
4,Words of Radiance,Brandon Sanderson,"Fantasy, Fiction, Epic Fantasy, High Fantasy, ...",0.079192


In [119]:
weighted_hybrid_recommendations("Fifty Shades Of Grey", df=gbx_hybrid, type="Genres", n_recommendations=5)

Your query:-
Query: Fifty Shades Of Grey


Unnamed: 0,Title,Author,Genres,Cosine Similarity
0,Fifty Shades Freed,E.L. James,"Romance, Erotica, Fiction, BDSM, Adult, Contem...",1.0
1,Fifty Shades Darker,E.L. James,"Romance, Fiction, Erotica, BDSM, Adult, Contem...",1.0
2,"Eat, pray, love: one woman's search for everyt...",Elizabeth Gilbert,"Nonfiction, Memoir, Travel, Biography, Romance...",0.130593
3,A Hat Full of Sky,Terry Pratchett,"Fantasy, Fiction, Young Adult, Humor, Children...",0.103518
4,Words of Radiance,Brandon Sanderson,"Fantasy, Fiction, Epic Fantasy, High Fantasy, ...",0.079192


In [122]:
weighted_hybrid_recommendations("Fifty Shades Of Grey", df=gbx_hybrid, type="combined", n_recommendations=5)

Your query:-
Query: Fifty Shades Of Grey


Unnamed: 0,Title,Author,Genres,Cosine Similarity
0,Fifty Shades Freed,E.L. James,"Romance, Erotica, Fiction, BDSM, Adult, Contem...",1.0
1,Fifty Shades Darker,E.L. James,"Romance, Fiction, Erotica, BDSM, Adult, Contem...",1.0
2,What I Loved,Siri Hustvedt,"Fiction, Contemporary, Art, Literary Fiction, ...",0.166593
3,"Eat, pray, love: one woman's search for everyt...",Elizabeth Gilbert,"Nonfiction, Memoir, Travel, Biography, Romance...",0.142316
4,The Buddha of Suburbia,Hanif Kureishi,"Fiction, Contemporary, British Literature, Nov...",0.137919


In [81]:
book_title = "Fifty Shades of Grey"
weighted_hybrid_recommendations(book_title, df = gbx_hybrid, n_recommendations=5, content_weight=0.3, cf_weight=0.7)

Unnamed: 0,Title,Author,Genres
0,What I Loved,Siri Hustvedt,"Fiction, Contemporary, Art, Literary Fiction, ..."
1,90 Minutes in Heaven,"Don Piper, Cecil Murphey","Christian, Nonfiction, Religion, Spirituality,..."
2,Les Fleurs du mal,"Charles Baudelaire, Richard Howard","Classics, France, Poetry, French Literature, F..."
3,Ben-Hur: A Tale of the Christ,Lew Wallace,"Classics, Historical Fiction, Fiction, Christi..."
4,"Darkness, Take My Hand",Dennis Lehane,"Mystery, Fiction, Crime, Thriller, Mystery Thr..."


In [167]:
book_title = "Fifty Shades of Grey"
weighted_hybrid_recommendations(book_title, df = gbx_hybrid, type="combined", n_recommendations=5, content_weight=0.3, cf_weight=0.7)

Your query:-
Query: Fifty Shades of Grey


Unnamed: 0,Title,Author,Genres
0,"Eat, pray, love: one woman's search for everyt...",Elizabeth Gilbert,"Nonfiction, Memoir, Travel, Biography, Romance..."
1,Fifty Shades Freed,E.L. James,"Romance, Erotica, Fiction, BDSM, Adult, Contem..."
2,Fifty Shades Darker,E.L. James,"Romance, Fiction, Erotica, BDSM, Adult, Contem..."
3,Divergent,Veronica Roth,"Young Adult, Dystopia, Fantasy, Fiction, Scien..."
4,What I Loved,Siri Hustvedt,"Fiction, Contemporary, Art, Literary Fiction, ..."


# DISPLAYING RECOMMENDATIONS

In [52]:
import ipywidgets as widgets
from IPython.display import display, clear_output

In [80]:
def display_book_recommendations():
    # Text field input for the query
    query_input = widgets.Text(
        value='',
        placeholder='Enter query',
        description='Name:',
        disabled=False
    )
    
    # Dropdown to select the recommendation type
    rec_model = widgets.Dropdown(
        options=['Select', 'Content-Based', 'Collaborative', 'Cascade Hybrid', 'Weighted Hybrid'],
        value='Select',
        description='Model:',
    )

    data_frame = widgets.Dropdown(
        options = ['Select', 'Book-Crossing', 'Goodbooks-10k'],
        value = 'Select',
        description = 'Dataset'
    )

    type_rec = widgets.Dropdown(
        options = ['Select', 'Title-based', 'Author-based', 'Genre-based', 'Combined'],
        value = 'Select',
        description = 'Recommendation Type'
    )

    num_recs = widgets.Dropdown(
        options = ['Select', '1', '2','3','4','5','6','7','8','9','10'],
        value = 'Select',
        description = 'Number'
    )

    
    # Button to generate recommendations
    generate_button = widgets.Button(description="Generate Recommendations")
    
    # Output widget to display the recommendations
    output = widgets.Output()
    
    def on_generate_button_clicked(b):
        with output:
            clear_output()
            query = query_input.value
            model = rec_model.value
            dataframe = data_frame.value
            rec_type = type_rec.value
            n_recommendations = int(num_recs.value)  # You can modify this to be user-selectable as well
            # similarity_threshold = 0.02  # Default threshold, adjust as necessary

            # recommendations, message = pd.DataFrame(), ""
            recommendations = pd.DataFrame()
            

            if query:  # Check if query is not empty

                if dataframe == "Book-Crossing":
                    df = bx_hybrid
                elif dataframe == "Goodbooks-10k":
                    df = gbx_hybrid
                
                if rec_type == "Title-based":
                    cont_type = "title"
                    cf_type = "title"
                elif rec_type == "Author-based":
                    cont_type = "author"
                    cf_type = "title"
                elif rec_type == "Genre-based":
                    cont_type = "genres"
                    cf_type = "genres"
                elif rec_type == "Combined":
                    cont_type = "combined"
                    cf_type = "title"


                if model == 'Content-Based':
                    recommendations = content_based_recommendations(query, df, cont_type, n_recommendations)
                elif model == 'Collaborative':  # Collaborative
                    recommendations= collaborative_recommendations(query, df, cf_type, n_recommendations)
                elif model == 'Cascade Hybrid': 
                    recommendations = cascade_hybrid_recss(query, df, cont_type, n_recommendations)
                elif model == 'Weighted Hybrid': 
                    recommendations= weighted_hybrid_recommendations(query, df, cont_type, n_recommendations)
                else:
                    print("Please select a valid recommendation model.")
                
                if recommendations.empty:
                    print("No recommendations found")
                else:
                    display(recommendations)

    
    generate_button.on_click(on_generate_button_clicked)
    
    # Display the widgets
    display(query_input, rec_model, data_frame, type_rec, num_recs, generate_button, output)

In [144]:
display_book_recommendations()

Text(value='', description='Name:', placeholder='Enter query')

Dropdown(description='Model:', options=('Select', 'Content-Based', 'Collaborative', 'Cascade Hybrid', 'Weighte…

Dropdown(description='Dataset', options=('Select', 'Book-Crossing', 'Goodbooks-10k'), value='Select')

Dropdown(description='Recommendation Type', options=('Select', 'Title-based', 'Author-based', 'Genre-based', '…

Dropdown(description='Number', options=('Select', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'), value='S…

Button(description='Generate Recommendations', style=ButtonStyle())

Output()