### Dependencies

In [6]:
import kagglehub
import pandas as pd
import seaborn as sns
import numpy as np
from transformers import pipeline
import matplotlib.pyplot as plt
from tqdm import tqdm

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import os
from dotenv import load_dotenv

load_dotenv()


%matplotlib inline

### Data load

In [7]:
# Fetch the newest published version of the Kaggle dataset into the local kagglehub cache
path = kagglehub.dataset_download("dylanjcastillo/7k-books-with-metadata")

print("Path to dataset files:", path)

# The book metadata lives in books.csv inside the downloaded folder
books_csv = os.path.join(path, "books.csv")
books = pd.read_csv(books_csv)

Path to dataset files: C:\Users\Salus X Labs\.cache\kagglehub\datasets\dylanjcastillo\7k-books-with-metadata\versions\3


In [8]:
df = pd.DataFrame(books)

In [9]:
df.shape, df.dtypes

((6810, 12),
 isbn13              int64
 isbn10             object
 title              object
 subtitle           object
 authors            object
 categories         object
 thumbnail          object
 description        object
 published_year    float64
 average_rating    float64
 num_pages         float64
 ratings_count     float64
 dtype: object)

In [10]:
df.head(5)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [11]:
df.isna().sum()

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64

In [12]:
# 1 where the description is missing, 0 otherwise
df["missing_desc"] = df["description"].isna().astype(int)

In [13]:
df.head(5)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_desc
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,0


In [14]:
df.isna().sum()

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
missing_desc         0
dtype: int64

There are a lot of missing values in the subtitle column, so we cannot simply drop those rows; we need to find a way to fill the missing values instead. The same applies to the categories and thumbnail columns.

In [15]:
# Keep only rows where every one of these fields is present.
# .copy() turns the filtered result into an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
required_columns = ["description", "num_pages", "average_rating", "published_year", "ratings_count"]
df = df.dropna(subset=required_columns).copy()
df.shape

(6507, 13)

filter the short description rows

In [16]:
df["word_counts_in_desc"] = df["description"].str.split().str.len()

In [17]:
df.shape

(6507, 14)

In [18]:
# Keep descriptions of at least 25 words; .copy() detaches the result from the
# parent frame so new columns can be assigned without SettingWithCopyWarning.
df = df[df["word_counts_in_desc"] >= 25].copy()
df.shape

(5197, 14)

create a column by merging the title and subtitle to resolve missing value issue

In [19]:
# Rows without a subtitle keep the bare title; every other row becomes "title:subtitle"
joined_titles = df["title"].astype(str).str.cat(df["subtitle"].astype(str), sep=":")
df["title_and_subtitle"] = np.where(df["subtitle"].isna(), df["title"], joined_titles)
df.shape

(5197, 15)

In [20]:
df.head(4)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_desc,word_counts_in_desc,title_and_subtitle
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,0,199,Gilead
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,0,205,Spider's Web:A Novel
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,0,57,Rage of angels
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,0,45,The Four Loves


Fix the imbalance and missing values in the categories column

In [21]:
df.categories.unique().shape

(480,)

In [22]:
df["categories"].value_counts()[:20]

categories
Fiction                      2111
Juvenile Fiction              390
Biography & Autobiography     311
History                       207
Literary Criticism            124
Philosophy                    117
Religion                      117
Comics & Graphic Novels       116
Drama                          86
Juvenile Nonfiction            57
Science                        56
Poetry                         51
Literary Collections           50
Business & Economics           49
Social Science                 48
Performing Arts                40
Cooking                        35
Psychology                     33
Travel                         32
Art                            32
Name: count, dtype: int64

In [23]:
# Raw categories that are folded into the coarse "Fiction" bucket
_fiction_like = (
    "Fiction",
    "Juvenile Fiction",
    "Comics & Graphic Novels",
    "Drama",
    "Poetry",
    "Literary Collections",
    "Literary Criticism",
)

# Raw categories that are folded into the coarse "Nonfiction" bucket
_nonfiction_like = (
    "Biography & Autobiography",
    "History",
    "Philosophy",
    "Religion",
    "Social Science",
    "Political Science",
    "Psychology",
    "Self-Help",
    "Health & Fitness",
)

category_mapping = {
    **dict.fromkeys(_fiction_like, "Fiction"),
    **dict.fromkeys(_nonfiction_like, "Nonfiction"),
}

# Categories absent from the mapping come out as NaN
df["simple_categories"] = df["categories"].map(category_mapping)

In [24]:
df.shape

(5197, 16)

In [25]:
df["simple_categories"].isna().value_counts()

simple_categories
False    3847
True     1350
Name: count, dtype: int64

### LLM utilization to fill missing values in simple_categories column

In [26]:
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use cpu


In [27]:
fiction_categories = ["Fiction", "Nonfiction"]

In [28]:
def generate_predictions(sequence, fiction_categories):
    """Return the candidate label that the zero-shot classifier scores highest.

    Args:
        sequence: Text passed to the module-level ``pipe`` classifier.
        fiction_categories: Candidate labels handed to the classifier.

    Returns:
        The entry of the classifier's ``"labels"`` list whose ``"scores"``
        value is the largest.
    """
    result = pipe(sequence, fiction_categories)
    scores, labels = result["scores"], result["labels"]
    best_position = np.argmax(scores)
    return labels[best_position]

In [29]:
isbns = []
predicted_categories = []    

In [30]:
missing_categories = df.loc[df["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)
missing_categories.shape

(1350, 2)

In [31]:
# Classify every uncategorised description and record its ISBN next to the label
for row in tqdm(range(len(missing_categories))):
    label = generate_predictions(missing_categories["description"][row], fiction_categories)
    predicted_categories.append(label)
    isbns.append(missing_categories["isbn13"][row])

100%|██████████| 1350/1350 [11:00:48<00:00, 29.37s/it]      


In [32]:
pred_df = pd.DataFrame({"isbn13": isbns, "predicted_category": predicted_categories})
pred_df["predicted_category"].value_counts()

predicted_category
Nonfiction    905
Fiction       445
Name: count, dtype: int64

In [33]:
# Attach the predictions by ISBN, then use them only where the manual mapping left a gap
df = df.merge(pred_df, how="left", on="isbn13")
df["simple_categories"] = df["simple_categories"].fillna(df["predicted_category"])


In [34]:
df["simple_categories"].value_counts()

simple_categories
Fiction       3373
Nonfiction    1824
Name: count, dtype: int64

In [35]:
df["simple_categories"].isna().sum()

0

In [36]:
# Remove the intermediate helper columns now that they have served their purpose.
# `columns=` already selects the axis, so the stray `axis=-1` (not a valid
# DataFrame axis) is gone; reassigning instead of `inplace=True` keeps the
# statement chainable.
df = df.drop(columns=["subtitle", "missing_desc", "word_counts_in_desc", "predicted_category"])

In [37]:
df.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'title_and_subtitle', 'simple_categories'],
      dtype='object')

## sentiment analysis

In [38]:
emotion_classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k = None)

Device set to use cpu


In [39]:
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

def calculate_max_emotion_scores(predictions):
    """Return, for each emotion, the highest score found across all predictions.

    Args:
        predictions: Iterable with one entry per classified text. Each entry is
            a list of ``{"label": ..., "score": ...}`` dicts containing every
            label in ``emotion_labels``.

    Returns:
        Dict mapping each label in ``emotion_labels`` to its maximum score.

    Raises:
        KeyError: If a prediction lacks one of the labels in ``emotion_labels``.
        ValueError: If ``predictions`` is empty (``np.max`` of an empty list).
    """
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        # Look scores up by label name. Matching by position after an
        # alphabetical sort is wrong because emotion_labels is not alphabetical
        # ("neutral" is last), which crossed the sadness/surprise/neutral scores.
        score_by_label = {item["label"]: item["score"] for item in prediction}
        for label in emotion_labels:
            per_emotion_scores[label].append(score_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [40]:
isbns = []
emotion_scores = {label: [] for label in emotion_labels}

In [41]:
# Score each description sentence by sentence and keep the per-emotion maxima.
# Iterating over the column values (rather than df[col][i]) does not depend on
# the frame having a 0..n-1 index.
for isbn, description in tqdm(zip(df["isbn13"], df["description"]), total=len(df)):
    # split(".") leaves empty/whitespace-only fragments (e.g. after the final
    # period); drop them so blank input never contributes a score.
    sentences = [part for part in description.split(".") if part.strip()]
    # Fall back to the whole description if nothing but blanks remained
    predictions = emotion_classifier(sentences or [description])
    max_scores = calculate_max_emotion_scores(predictions)
    # Record the ISBN only after scoring succeeded so all lists stay the same length
    isbns.append(isbn)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [13:59<00:00,  6.19it/s]


In [42]:
emotions_df = pd.DataFrame( emotion_scores)
emotions_df["isbn13"] = isbns

In [43]:
df = pd.merge(df, emotions_df, on="isbn13", how= "left")

In [44]:
# Build "<isbn13> <description>" for every row
df["tagged_description"] = df["isbn13"].astype(str).str.cat(df["description"].astype(str), sep=" ")

In [45]:
df["tagged_description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5192    9788172235222 On A Train Journey Home To North...
5193    9788173031014 This book tells the tale of a ma...
5194    9788179921623 Wisdom to Create a Life of Passi...
5195    9788185300535 This collection of the timeless ...
5196    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5197, dtype: object

In [46]:
df.columns, df.shape

(Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
        'description', 'published_year', 'average_rating', 'num_pages',
        'ratings_count', 'title_and_subtitle', 'simple_categories', 'anger',
        'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral',
        'tagged_description'],
       dtype='object'),
 (5197, 21))

## save to cleaned data db table

In [47]:
ratings_mean_count = df[["isbn13", "title", "average_rating", "ratings_count"]].sort_values(by="ratings_count" ,ascending=False)
ratings_mean_count

Unnamed: 0,isbn13,title,average_rating,ratings_count
2125,9780439554930,Harry Potter and the Sorcerer's Stone (Book 1),4.47,5629932.0
1341,9780316015844,Twilight,3.59,4367341.0
2910,9780618260300,"The Hobbit, Or, There and Back Again",4.26,2364968.0
4502,9781416524793,Angels & Demons,3.88,2279854.0
2134,9780439655484,Harry Potter and the Prisoner of Azkaban (Book 3),4.55,2149872.0
...,...,...,...,...
1419,9780333666388,City on the Seine,4.17,0.0
1334,9780313309335,The Fantastic Vampire,0.00,0.0
4448,9781403904560,"V.S. Naipaul, Second Edition",2.00,0.0
3438,9780744005615,Juiced,0.00,0.0


In [48]:
# Popular picks: average rating above 4.1 and more than 700000 ratings, most-rated first
is_well_rated = ratings_mean_count["average_rating"] > 4.1
is_widely_rated = ratings_mean_count["ratings_count"] > 700000
pop_recommendations = ratings_mean_count[is_well_rated & is_widely_rated].sort_values(by="ratings_count", ascending=False)

In [49]:
pop_recommendations

Unnamed: 0,isbn13,title,average_rating,ratings_count
2125,9780439554930,Harry Potter and the Sorcerer's Stone (Book 1),4.47,5629932.0
2910,9780618260300,"The Hobbit, Or, There and Back Again",4.26,2364968.0
2134,9780439655484,Harry Potter and the Prisoner of Azkaban (Book 3),4.55,2149872.0
2097,9780439064866,Harry Potter and the Chamber of Secrets (Book 2),4.41,2115562.0
2913,9780618346257,The Fellowship of the Ring,4.35,2009749.0
2108,9780439358071,Harry Potter and the Order of the Phoenix (Boo...,4.49,1996446.0
2142,9780439785969,Harry Potter and the Half-Blood Prince (Book 6),4.56,1944099.0
1880,9780385732550,The Giver,4.12,1464909.0
425,9780064410939,Charlotte's Web (full color),4.16,1229902.0
2347,9780450040184,The Shining,4.21,911271.0


#### save to popular recommendations db table

## create embeddings of tagged description

In [50]:
api_key = os.getenv("OPENAI_API_KEY")

In [52]:
df["tagged_description"].to_csv("tagged_description.txt",
                                      sep="\n",
                                      index=False,
                                      header=False
                                      )

In [None]:
# pandas wrote tagged_description.txt as UTF-8 (the to_csv default); read it back
# with the same encoding instead of the platform default, which is not UTF-8 on
# every OS (e.g. Windows).
raw_doc = TextLoader("tagged_description.txt", encoding="utf-8").load()
# Split on newlines with the smallest positive chunk size so each line stays its
# own chunk. NOTE(review): recent langchain-text-splitters releases appear to
# reject chunk_size <= 0 - confirm against the installed version.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")

In [None]:
documents = text_splitter.split_documents(raw_doc)

In [None]:
# Embed every chunk with OpenAI embeddings and index the vectors in a Chroma store
embedding_model = OpenAIEmbeddings(api_key=api_key)
db_books = Chroma.from_documents(documents=documents, embedding=embedding_model)

In [None]:
query = "book about astronomy"
docs = db_books.similarity_search(query, k=10)
docs

In [None]:
def retrieve_sementic_recommendations(books_df, query:str, top_k:int=10) -> pd.DataFrame:
    """Return the books most similar to ``query``, best match first.

    Runs a similarity search on the module-level ``db_books`` store, reads the
    ISBN from the first whitespace-separated token of each returned chunk, and
    looks those ISBNs up in ``books_df``.

    Args:
        books_df: DataFrame with an ``isbn13`` column.
        query: Free-text search query.
        top_k: Maximum number of rows to return.

    Returns:
        Up to ``top_k`` rows of ``books_df``, ordered by similarity rank.
    """
    # Fetch at least 50 candidates, or more when the caller asks for more rows
    docs = db_books.similarity_search(query, k=max(50, top_k))

    # ISBN -> position in the similarity ranking (first occurrence wins)
    rank_by_isbn = {}
    for doc in docs:
        tokens = doc.page_content.strip('"').split()
        # Skip chunks that do not start with an ISBN instead of crashing on int()
        if not tokens or not tokens[0].isdecimal():
            continue
        rank_by_isbn.setdefault(int(tokens[0]), len(rank_by_isbn))

    matches = books_df[books_df["isbn13"].isin(list(rank_by_isbn))]
    if matches.empty:
        return matches

    # Restore the similarity order; a plain isin() filter returns rows in frame order
    ranked = matches.sort_values(by="isbn13", key=lambda col: col.map(rank_by_isbn), kind="stable")
    return ranked.head(top_k)

In [None]:
# `books_df` is never defined in this notebook; the cleaned book table is `df`
retrieve_sementic_recommendations(df, "book about astronomy")

## Save in a text file or db table

#### similarity search