# Import Libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
!pip install --upgrade lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
import scipy.sparse as sp
from sklearn.metrics import mean_squared_error



# Data Preprocessing

Load the dataset

In [None]:
# Read the book catalogue and preview its first five rows.
books_df = pd.read_csv('data/books.csv')
print("Books DataFrame:\n", books_df.head(5))

Books DataFrame:
    id  book_id  best_book_id  work_id  books_count       isbn        isbn13  \
0   1  2767052       2767052  2792775          272  439023483  9.780439e+12   
1   2        3             3  4640799          491  439554934  9.780440e+12   
2   3    41865         41865  3212258          226  316015849  9.780316e+12   
3   4     2657          2657  3275794          487   61120081  9.780061e+12   
4   5     4671          4671   245494         1356  743273567  9.780743e+12   

                       authors  original_publication_year  \
0              Suzanne Collins                     2008.0   
1  J.K. Rowling, Mary GrandPré                     1997.0   
2              Stephenie Meyer                     2005.0   
3                   Harper Lee                     1960.0   
4          F. Scott Fitzgerald                     1925.0   

                             original_title  ... ratings_count  \
0                          The Hunger Games  ...       4780653   
1  Harry

In [None]:
# Lower-cased "title authors" text per book; the TF-IDF vectoriser further
# down is fitted on this column.
books_df['combined_features'] = (
    books_df['title'].str.lower()
    .str.cat(books_df['authors'].str.lower(), sep=' ')
)

In [None]:
# Read the per-book tag counts and preview the first five rows.
book_tags_df = pd.read_csv('data/book_tags.csv')
print("\nBook Tags DataFrame:\n", book_tags_df.head(5))


Book Tags DataFrame:
    goodreads_book_id  tag_id   count
0                  1   30574  167697
1                  1   11305   37174
2                  1   11557   34173
3                  1    8717   12986
4                  1   33114   12716


In [None]:
# Read the user/book ratings and preview the first five rows.
ratings_df = pd.read_csv('data/ratings.csv')
print("\nRatings DataFrame:\n", ratings_df.head(5))


Ratings DataFrame:
    book_id  user_id  rating
0        1      314       5
1        1      439       3
2        1      588       5
3        1     1169       4
4        1     1185       4


In [None]:
# Read the tag_id -> tag_name lookup and preview the first five rows.
tags_df = pd.read_csv('data/tags.csv')
print("\nTags DataFrame:\n", tags_df.head(5))


Tags DataFrame:
    tag_id tag_name
0       0        -
1       1     --1-
2       2    --10-
3       3    --12-
4       4   --122-


In [None]:
# Read the users' to-read lists and preview the first five rows.
to_read_df = pd.read_csv('data/to_read.csv')
print("\nTo Read DataFrame:\n", to_read_df.head(5))


To Read DataFrame:
    user_id  book_id
0        1      112
1        1      235
2        1      533
3        1     1198
4        1     1874


Remove unnecessary columns

In [None]:
# Column pruning for books_df stays disabled: later cells read
# books_df['combined_features'], which this selection would drop.
#books_df = books_df[['book_id', 'title', 'authors', 'original_publication_year']]

# Keep only the columns used downstream, in a fixed order.
ratings_df = ratings_df.loc[:, ['user_id', 'book_id', 'rating']]
tags_df = tags_df.loc[:, ['tag_id', 'tag_name']]
book_tags_df = book_tags_df.loc[:, ['goodreads_book_id', 'tag_id', 'count']]
to_read_df = to_read_df.loc[:, ['user_id', 'book_id']]

Handling Missing Values

In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): for books_df this also removes books whose only gap is in a
# column the recommender never reads -- confirm that this is intended.
books_df = books_df.dropna(axis=0, how='any')
ratings_df = ratings_df.dropna(axis=0, how='any')
tags_df = tags_df.dropna(axis=0, how='any')
book_tags_df = book_tags_df.dropna(axis=0, how='any')
to_read_df = to_read_df.dropna(axis=0, how='any')

Drop Duplicates

In [None]:
# Remove exact duplicate rows from every table.
#
# books_df is additionally re-indexed to 0..n-1. Rows were removed from it
# above, so without the reset its index labels no longer equal row positions,
# while the content-based cells use books_df.index values as positions into
# the similarity matrix built from this frame.
books_df = books_df.drop_duplicates().reset_index(drop=True)
ratings_df = ratings_df.drop_duplicates()
tags_df = tags_df.drop_duplicates()
book_tags_df = book_tags_df.drop_duplicates()
to_read_df = to_read_df.drop_duplicates()

Convert data types if necessary

In [None]:
# Publication years become integers; ratings become floats.
books_df = books_df.astype({'original_publication_year': int})
ratings_df = ratings_df.astype({'rating': float})

Merge dataframes

In [None]:
# Attach the readable tag name to every (book, tag) count row.
booktags_with_names_df = book_tags_df.merge(tags_df, on='tag_id')

# Ratings joined to book metadata, then left-joined to the tag rows, which
# yields one output row per (rating, tag) pair.
# NOTE(review): ratings are matched on books_df['book_id'], but books_df also
# has a separate small-integer 'id' column -- confirm which of the two the
# ratings file actually refers to.
merged_df = ratings_df.merge(books_df, on='book_id').merge(
    booktags_with_names_df,
    left_on='book_id',
    right_on='goodreads_book_id',
    how='left',
)

print(merged_df.head())

   user_id  book_id  rating  id  best_book_id   work_id  books_count  \
0      314        1     5.0  27             1  41335427          275   
1      314        1     5.0  27             1  41335427          275   
2      314        1     5.0  27             1  41335427          275   
3      314        1     5.0  27             1  41335427          275   
4      314        1     5.0  27             1  41335427          275   

        isbn        isbn13                      authors  ...  ratings_3  \
0  439785960  9.780440e+12  J.K. Rowling, Mary GrandPré  ...     136333   
1  439785960  9.780440e+12  J.K. Rowling, Mary GrandPré  ...     136333   
2  439785960  9.780440e+12  J.K. Rowling, Mary GrandPré  ...     136333   
3  439785960  9.780440e+12  J.K. Rowling, Mary GrandPré  ...     136333   
4  439785960  9.780440e+12  J.K. Rowling, Mary GrandPré  ...     136333   

  ratings_4 ratings_5                                          image_url  \
0    459028   1161491  https://images.gr

Encoding

In [None]:
# One encoder per column, so each integer code can still be mapped back to its
# original label via inverse_transform. Re-fitting a single encoder would
# overwrite the author classes with the tag classes.
author_encoder = LabelEncoder()
merged_df['authors'] = author_encoder.fit_transform(merged_df['authors'])

# Tags come from a left join and may be missing; casting to str turns missing
# values into a regular label before encoding.
tag_encoder = LabelEncoder()
merged_df['tag_name'] = tag_encoder.fit_transform(merged_df['tag_name'].astype(str))

# Kept for backward compatibility: `label_encoder` ends up fitted on the tag
# names, exactly as before.
label_encoder = tag_encoder

Normalization (ratings are only cast to float; no rescaling is applied)

In [None]:
# Ensure the rating column is float. The values themselves are left unchanged
# (no rescaling or centring happens in this cell).
merged_df['rating'] = merged_df['rating'].astype(float)

Create a user-item matrix

In [None]:
# Users x books matrix of ratings (NaN where a user has not rated a book).
#
# merged_df repeats every rating once per tag of the rated book, so the three
# relevant columns are de-duplicated before pivoting. Every distinct rating of
# a (user, book) pair is repeated the same number of times, so the default
# mean aggregation is unchanged while the pivot runs over far fewer rows.
user_item_matrix = (
    merged_df[['user_id', 'book_id', 'rating']]
    .drop_duplicates()
    .pivot_table(index='user_id', columns='book_id', values='rating')
)
print(user_item_matrix.head())

book_id  1     2     3     5     6     10    11    13    21    24    ...  \
user_id                                                              ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
7         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
10        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
11        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

book_id  9844  9864  9865  9912  9913  9914  9915  9943  9957  9998  
user_id                                                              
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
7         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
10        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
11        NaN   NaN   NaN   NaN   NaN   NaN   N

# Content-based Filtering

In [None]:
# Vectorise each book's combined title/author text with TF-IDF, ignoring
# English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_df['combined_features'])

# Pairwise dot products between all book vectors; entry [i, j] compares the
# i-th and j-th rows of books_df by position.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# unit-length -- confirm the vectoriser's normalisation setting.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Preview the top-left 5x5 corner of the similarity matrix.
print(cosine_sim[:5, :5])

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [None]:
# Show the catalogue row of the book used as the query in the cells below.
book_id = 2767052
print(books_df.loc[books_df['book_id'] == book_id])

   id  book_id  best_book_id  work_id  books_count       isbn        isbn13  \
0   1  2767052       2767052  2792775          272  439023483  9.780439e+12   

           authors  original_publication_year    original_title  ...  \
0  Suzanne Collins                       2008  The Hunger Games  ...   

  work_ratings_count work_text_reviews_count  ratings_1  ratings_2  ratings_3  \
0            4942365                  155254      66715     127936     560092   

   ratings_4  ratings_5                                          image_url  \
0    1481305    2706317  https://images.gr-assets.com/books/1447303603m...   

                                     small_image_url  \
0  https://images.gr-assets.com/books/1447303603s...   

                                   combined_features  
0  the hunger games (the hunger games, #1) suzann...  

[1 rows x 24 columns]


In [None]:
def get_content_based_recommendations(book_id, cosine_sim, top_n=10):
    """Return the ``top_n`` books most similar to ``book_id``.

    Parameters
    ----------
    book_id
        Value to look up in ``books_df['book_id']``.
    cosine_sim
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of the module-level ``books_df``.
    top_n : int, default 10
        Maximum number of similar books to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books_df`` ordered from most to least similar, never
        including the query book itself. An empty frame is returned (and a
        message printed) when ``book_id`` is unknown.
    """
    # Locate the book by row *position*: the similarity matrix is positional,
    # whereas books_df's index labels may have gaps after rows were dropped.
    matches = np.flatnonzero(books_df['book_id'].to_numpy() == book_id)
    if matches.size == 0:
        print(f"Book ID {book_id} not found in books_df")
        return pd.DataFrame()

    position = int(matches[0])
    similarities = np.asarray(cosine_sim[position]).ravel()

    # Rank every book by descending similarity, then remove the query book
    # explicitly instead of assuming it is always ranked first (another book
    # with identical text would tie with it).
    ranked = np.argsort(-similarities, kind='stable')
    ranked = ranked[ranked != position][:max(top_n, 0)]

    return books_df.iloc[ranked]


# Ten closest books to the query book, shown with identifying columns only.
recommended_books = get_content_based_recommendations(2767052, cosine_sim, top_n=10)
print(recommended_books.loc[:, ['book_id', 'title', 'authors']])

       book_id                                              title  \
506    7938275  The Hunger Games Trilogy Boxset (The Hunger Ga...   
16     6148028               Catching Fire (The Hunger Games, #2)   
19     7260188                  Mockingjay (The Hunger Games, #3)   
1819   5047880                                  Hunger (Gone, #2)   
3114     14384   A Hunger Like No Other (Immortals After Dark #2)   
5945    215540                  The Quillan Games (Pendragon, #7)   
2324     18630                  The Player of Games (Culture, #2)   
4743     49176                                  Games People Play   
8576   8046680  The Girl Who Was on Fire: Your Favorite Author...   
5662  25065629                      Hunger Makes Me a Modern Girl   

                                                authors  
506                                     Suzanne Collins  
16                                      Suzanne Collins  
19                                      Suzanne Collins  
1819    

# Collaborative Filtering

In [None]:
# NOTE(review): `ratings_long` is not defined in any cell shown above; it has
# to exist (with 'user_id' and 'book_id' columns) before this cell runs.
# Register every user and book id so LightFM can assign internal indices.
dataset = Dataset()
dataset.fit(ratings_long['user_id'], ratings_long['book_id'])

# Build the interaction matrix from (user_id, book_id) pairs. Zipping the two
# columns yields the same pairs as iterating row by row with iterrows(), but
# without constructing a Series object for every rating.
(interactions, _) = dataset.build_interactions(
    zip(ratings_long['user_id'], ratings_long['book_id'])
)

# WARP-loss model trained on the full interaction matrix.
model = LightFM(loss='warp')
model.fit(interactions, epochs=30, num_threads=2)

def evaluate_model(model, interactions, k=10):
    """Print the mean precision@k of ``model`` on ``interactions``.

    Parameters
    ----------
    model
        Fitted model accepted by ``precision_at_k``.
    interactions
        Interaction matrix the precision is measured against.
    k : int, default 10
        Cut-off rank; the default reproduces the previous fixed behaviour.

    Returns
    -------
    None
        The score is only printed, formatted to two decimals.
    """
    precision = precision_at_k(model, interactions, k=k).mean()
    print(f'Precision at k={k}: {precision:.2f}')

# Scored on the same interactions the model was fitted on, so this is a
# training-set figure rather than a held-out estimate.
evaluate_model(model=model, interactions=interactions)

Precision at k=10: 0.19


In [None]:
def get_collaborative_recommendations(model, user_id, interactions, n=10, dataset=None):
    """Return the ``n`` highest-scoring items for one user.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(user, item_indices)``.
    user_id
        Without ``dataset``: the model's internal user index. With
        ``dataset``: the raw user id as passed to ``dataset.fit``.
    interactions
        Interaction matrix; only ``shape[1]`` (the item count) is read.
    n : int, default 10
        Number of items to return.
    dataset : optional
        Object exposing ``mapping()`` that returns
        ``(user_id_map, user_feature_map, item_id_map, item_feature_map)``.
        When given, ``user_id`` is translated to its internal index and the
        returned items are translated back to raw item ids.

    Returns
    -------
    list
        Without ``dataset``: internal item indices (ints), best first.
        With ``dataset``: raw item ids, best first; an empty list (and a
        printed message) if the user is unknown to ``dataset``.
    """
    num_items = interactions.shape[1]

    index_to_item = None
    user_index = user_id
    if dataset is not None:
        mappings = dataset.mapping()
        user_id_map, item_id_map = mappings[0], mappings[2]
        if user_id not in user_id_map:
            print(f"User ID {user_id} not found in dataset")
            return []
        user_index = user_id_map[user_id]
        index_to_item = {index: raw_id for raw_id, index in item_id_map.items()}

    scores = np.asarray(model.predict(user_index, np.arange(num_items)))
    # Stable descending sort: equal scores keep ascending item order, matching
    # the ordering of a stable sorted(..., reverse=True) over (index, score).
    ranked = np.argsort(-scores, kind='stable')[:n]
    top_n_items = [int(index) for index in ranked]

    if index_to_item is None:
        return top_n_items
    return [index_to_item[index] for index in top_n_items]

def print_user_friendly_recommendations(df):
    """Print one short, human-readable entry per recommended book.

    ``df`` must provide 'book_id', 'title' and 'authors' columns. An empty
    frame produces a single "no recommendations" line instead of a listing.
    """
    if not df.empty:
        print("\nTop Book Recommendations:\n")
        divider = "-" * 40
        fields = (("Book ID", "book_id"), ("Title", "title"), ("Author(s)", "authors"))
        for _, book in df.iterrows():
            for label, column in fields:
                print(f"{label}: {book[column]}")
            print(divider)
    else:
        print("No recommendations found.")

user_id = 14

# LightFM scores users and items by the contiguous internal indices that
# `dataset` assigned, not by the raw ids. Translate the raw user id before
# predicting and translate the returned item indices back afterwards, so they
# can be matched against the catalogue.
id_maps = dataset.mapping()
user_index_by_id, item_index_by_id = id_maps[0], id_maps[2]
book_id_by_index = {index: raw_id for raw_id, index in item_index_by_id.items()}

if user_id in user_index_by_id:
    top_item_indices = get_collaborative_recommendations(
        model, user_index_by_id[user_id], interactions, n=10
    )
    top_n_books = [book_id_by_index[index] for index in top_item_indices]
else:
    print(f"User ID {user_id} not found in the interaction dataset")
    top_n_books = []

# NOTE(review): the raw item ids are matched against books_df['book_id'] as
# before -- confirm this is the column the interaction ids were taken from.
matched_books = books_df[books_df['book_id'].isin(top_n_books)]

# Restore the model's ranking; isin() alone returns catalogue order.
rank_by_book_id = {raw_id: rank for rank, raw_id in enumerate(top_n_books)}
recommended_books = matched_books.iloc[
    matched_books['book_id'].map(rank_by_book_id).to_numpy().argsort(kind='stable')
]

if len(recommended_books) < 10:
    print("Warning: Fewer than 10 recommendations found.")

k = min(10, len(recommended_books))
print(f"Top {k} Book Recommendations for User {user_id}:\n")
print_user_friendly_recommendations(recommended_books)

Top 5 Book Recommendations for User 14:


Top Book Recommendations:

Book ID: 33
Title: The Lord of the Rings (The Lord of the Rings, #1-3)
Author(s): J.R.R. Tolkien
----------------------------------------
Book ID: 28
Title: Notes from a Small Island
Author(s): Bill Bryson
----------------------------------------
Book ID: 24
Title: In a Sunburned Country
Author(s): Bill Bryson
----------------------------------------
Book ID: 105
Title: Chapterhouse: Dune (Dune Chronicles #6)
Author(s): Frank Herbert
----------------------------------------
Book ID: 29
Title: The Mother Tongue: English and How It Got That Way
Author(s): Bill Bryson
----------------------------------------


# Hybrid Recommendation System

In [None]:
def hybrid_recommendations(user_id, book_id, model, interactions, cosine_sim, top_n=10, dataset=None):
    """Union of collaborative and content-based recommendations.

    Parameters
    ----------
    user_id
        User to score with the collaborative model. Without ``dataset`` it is
        passed through unchanged; with ``dataset`` it is treated as a raw id
        and translated to the model's internal index first.
    book_id
        Query book for the content-based recommender.
    model, interactions
        Forwarded to ``get_collaborative_recommendations``.
    cosine_sim
        Forwarded to ``get_content_based_recommendations``.
    top_n : int, default 10
        Number of recommendations requested from each recommender.
    dataset : optional
        Object exposing ``mapping()`` that returns
        ``(user_id_map, user_feature_map, item_id_map, item_feature_map)``.
        When given, the collaborative results are translated from internal
        item indices to raw item ids before being combined, so both sets are
        in the same id space as ``books_df['book_id']``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books_df`` (in catalogue order) whose 'book_id' is in either
        recommendation set; an empty frame if the content-based result has no
        'book_id' column (e.g. the query book is unknown).
    """
    user_index = user_id
    index_to_book_id = None
    user_known = True
    if dataset is not None:
        mappings = dataset.mapping()
        user_id_map, item_id_map = mappings[0], mappings[2]
        index_to_book_id = {index: raw_id for raw_id, index in item_id_map.items()}
        if user_id in user_id_map:
            user_index = user_id_map[user_id]
        else:
            # Unknown user: fall back to the content-based results only.
            print(f"User ID {user_id} not found in dataset")
            user_known = False

    collab_recs = []
    if user_known:
        collab_recs = get_collaborative_recommendations(model, user_index, interactions, top_n)
        if index_to_book_id is not None:
            collab_recs = [index_to_book_id[index] for index in collab_recs]

    content_recs = get_content_based_recommendations(book_id, cosine_sim, top_n)
    if 'book_id' not in content_recs.columns:
        print("Content recommendations are missing 'book_id' column")
        return pd.DataFrame()

    hybrid_recs = set(collab_recs) | set(content_recs['book_id'])
    return books_df[books_df['book_id'].isin(hybrid_recs)]


def print_user_friendly_recommendations(df):
    """Print one short, human-readable entry per recommended book.

    This definition replaces the earlier function of the same name from this
    point on. ``df`` must provide 'book_id', 'title' and 'authors' columns.
    An empty frame (which ``hybrid_recommendations`` can return) prints a
    single "no recommendations" line instead of a bare header.
    """
    if df.empty:
        print("No recommendations found.")
        return

    print("\nTop Book Recommendations:\n")
    for _, row in df.iterrows():
        print(f"Book ID: {row['book_id']}")
        print(f"Title: {row['title']}")
        print(f"Authors: {row['authors']}")
        print("-" * 50)


# Combine both recommenders for one user / query-book pair and list the result.
hybrid_recommendations_df = hybrid_recommendations(
    user_id=1,
    book_id=2767052,
    model=model,
    interactions=interactions,
    cosine_sim=cosine_sim,
    top_n=10,
)
print_user_friendly_recommendations(hybrid_recommendations_df)


Top Book Recommendations:

Book ID: 2767052
Title: The Hunger Games (The Hunger Games, #1)
Authors: Suzanne Collins
--------------------------------------------------
Book ID: 6148028
Title: Catching Fire (The Hunger Games, #2)
Authors: Suzanne Collins
--------------------------------------------------
Book ID: 7260188
Title: Mockingjay (The Hunger Games, #3)
Authors: Suzanne Collins
--------------------------------------------------
Book ID: 6
Title: Harry Potter and the Goblet of Fire (Harry Potter, #4)
Authors: J.K. Rowling, Mary GrandPré
--------------------------------------------------
Book ID: 21
Title: A Short History of Nearly Everything
Authors: Bill Bryson
--------------------------------------------------
Book ID: 7938275
Title: The Hunger Games Trilogy Boxset (The Hunger Games, #1-3)
Authors: Suzanne Collins
--------------------------------------------------
Book ID: 5047880
Title: Hunger (Gone, #2)
Authors: Michael  Grant
-------------------------------------------------