In [1]:
import os

import pandas as pd
import numpy as np

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from transformers import pipeline

from tqdm import tqdm
tqdm.pandas()

  _torch_pytree._register_pytree_node(
2025-10-15 15:51:35.746357: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-15 15:51:35.805632: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-15 15:51:44.376469: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  _torch_pytree._register_pytree_node(


In [2]:
# Silence the text splitter's "Created a chunk of size ..." warnings

import logging


class _ChunkSizeWarningFilter(logging.Filter):
    """Drop the splitter's oversized-chunk warnings; let every other record through."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False for "Created a chunk of size ..." records, True otherwise."""
        try:
            message = record.getMessage()
        except Exception:
            # A record whose args do not fit its format string is not ours to drop.
            return True
        return not message.startswith("Created a chunk of size")


_splitter_logger = logging.getLogger("langchain_text_splitters.base")
_splitter_logger.setLevel(logging.ERROR)

# A filter on this one logger replaces the former process-wide record factory,
# which returned None for matching messages. A record factory must always
# return a LogRecord, and replacing the global factory also stacked one more
# wrapper each time this cell was re-run.
# The name comparison keeps the cell idempotent even though re-running it
# creates a new class object.
if not any(type(existing).__name__ == "_ChunkSizeWarningFilter"
           for existing in _splitter_logger.filters):
    _splitter_logger.addFilter(_ChunkSizeWarningFilter())

# Creating an embeddings vector DB

In [3]:
# Read data table and only write documents content

books = pd.read_csv("../data/books_cleaned.csv")

# Write one tagged description per line as plain UTF-8 text. `to_csv` is avoided
# here because its CSV quoting wraps any description containing a double quote
# in quotes and doubles the inner ones, which corrupts the text that is later
# embedded and stored. Embedded line breaks are flattened to spaces so that
# each book stays on exactly one line.
tagged_lines = (
    books["tagged_description"]
    .fillna("")
    .astype(str)
    .str.replace(r"[\r\n]+", " ", regex=True)
)
with open("../data/tagged_description.txt", "w", encoding="utf-8") as handle:
    handle.write("\n".join(tagged_lines) + "\n")

In [4]:
# Load documents, configure text splitter, and perform splitting

# Read the file explicitly as UTF-8: the descriptions contain non-ASCII
# characters (e.g. typographic apostrophes), and without an explicit encoding
# the loader falls back to the platform default.
raw_documents = TextLoader("../data/tagged_description.txt", encoding="utf-8").load()

# Split on newlines only. chunk_size=1 is smaller than any line, so lines are
# never merged back together (the recorded run yields one document per book).
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

In [5]:
# Peek at the first split document to check its content and metadata
documents[0]

Document(metadata={'source': '../data/tagged_description.txt'}, page_content='9780002005883: A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s de

In [6]:
# Sentence-embedding model used to vectorise the descriptions and, later, the queries
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  return torch._C._cuda_getDeviceCount() > 0
  _torch_pytree._register_pytree_node(


In [7]:
def _unquote_csv_field(text: str) -> str:
    """Undo CSV quoting on a single line.

    A line that the CSV writer wrapped in double quotes has its outer quotes
    removed and every doubled inner quote collapsed back to one. Any other
    line is returned unchanged, so genuine quotation marks at the end of a
    description are preserved.
    """
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        return text[1:-1].replace('""', '"')
    return text


# Extract texts
texts = [_unquote_csv_field(doc.page_content) for doc in documents]

# Process embeddings in batches so progress can be shown while the model runs
batch_size = 16
embeddings = []

for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
    batch = texts[i:i + batch_size]
    batch_embeddings = embedding_model.embed_documents(batch)
    embeddings.extend(batch_embeddings)

Embedding batches: 100%|█████████████████████████████████████████████████████████| 325/325 [01:26<00:00,  3.75it/s]


In [8]:
PERSIST_DIR = '../data/db_books_embeddings'

# Each text is expected to look like "<13-character ISBN>: <description>"
ISBN13_LENGTH = 13
DESCRIPTION_OFFSET = ISBN13_LENGTH + len(": ")

db_books = Chroma(
    embedding_function=embedding_model, # Used for queries
    persist_directory = PERSIST_DIR     # Optional: set a folder if you want it saved
)

# Write the precomputed vectors straight into the underlying collection so the
# descriptions are not embedded a second time.
# `upsert` is used instead of `add`: when PERSIST_DIR already holds records from
# an earlier run, `add` skips the existing IDs and leaves their old content in
# place, whereas `upsert` overwrites them with the current data.
db_books._collection.upsert(
    ids=[text[:ISBN13_LENGTH] for text in texts],
    embeddings=embeddings,
    documents=[text[DESCRIPTION_OFFSET:] for text in texts],
    metadatas=[doc.metadata for doc in documents]
)

print(f"Successfully created and saved {len(embeddings)} documents to '{PERSIST_DIR}'")
print(f"Current document count: {db_books._collection.count()}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Successfully created and saved 5197 documents to '../data/db_books_embeddings'
Current document count: 5197


In [9]:
# Re-open the persisted store to confirm it can be loaded from disk

db_books_loaded = Chroma(persist_directory=PERSIST_DIR, embedding_function=embedding_model)

count = db_books_loaded._collection.count()
print(f"Successfully loaded database with {count} documents from '{PERSIST_DIR}'.")

# Run one query against the re-opened store and show the best match
results = db_books_loaded.similarity_search("Classic literature", k=1)
top_hit = results[0]
print(f"Query Result: {top_hit.page_content}")

Successfully loaded database with 5197 documents from '../data/db_books_embeddings'.
Query Result: ""A boon for classicists and general readers alike. For the reader who comes to tragedy for the first time, these translations are eminently 'accessible,' and consummately American in tone and feeling. For the classicist, these versions constitute an ambitious reinterpretation of traditional masterpieces; after 2,500 years, the poetry of Euripides and Aeschylus has found a new voice—in fact, ten of them.""—The Boston Book Review


In [10]:
# Free-text query: fetch the 5 stored descriptions closest to it
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k=5)
# Display the matching documents (id, metadata and description text)
docs

[Document(id='9780786808069', metadata={'source': 'data/tagged_description.txt'}, page_content='Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(id='9780786808380', metadata={'source': 'data/tagged_description.txt'}, page_content="Introduce your babies to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to expose little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies each image, introducing the baby to some basic -- and sometimes playful -- information about the subjects."),
 Document(id='9780786808397', metadata={'source': 'data/tagged_description.txt'}, page_content="Introduce your baby to birds, cats, dogs, 

In [11]:
%%time
query = "A book for teaching kids about nature"
docs = db_books.similarity_search(query, k=5)
ids = [int(doc.id) for doc in docs]
books.query("isbn13.isin(@ids)")

CPU times: user 440 ms, sys: 35.3 ms, total: 475 ms
Wall time: 30.6 ms


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
324,9780060959036,0060959037,Prodigal Summer,Barbara Kingsolver,Fiction,http://books.google.com/books/content?id=06IwG...,Barbara Kingsolver's fifth novel is a hymn to ...,2001.0,4.0,444.0,85440.0,Prodigal Summer: A Novel,9780060959036: Barbara Kingsolver's fifth nove...
1642,9780374522599,0374522596,The Control of Nature,John McPhee,Nature,http://books.google.com/books/content?id=p1qKQ...,The Control of Nature is John McPhee's bestsel...,1990.0,4.24,288.0,3365.0,The Control of Nature,9780374522599: The Control of Nature is John M...
3747,9780786808069,0786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069: Children will discover the exci...
3749,9780786808380,0786808381,Baby Einstein: Babies,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=jv4NA...,"Introduce your babies to birds, cats, dogs, an...",2002.0,4.03,20.0,29.0,Baby Einstein: Babies,"9780786808380: Introduce your babies to birds,..."
3750,9780786808397,078680839X,Baby Einstein: Dogs,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=qut8t...,"Introduce your baby to birds, cats, dogs, and ...",2002.0,3.81,20.0,26.0,Baby Einstein: Dogs,"9780786808397: Introduce your baby to birds, c..."


# Zero-shot categorization

In [12]:
# Mapping of top-12 categories, grouped by the simplified label they collapse into
simple_to_raw = {
    "Fiction": ['Fiction', 'Comics & Graphic Novels', 'Drama', 'Poetry'],
    "Children's Fiction": ['Juvenile Fiction'],
    "Nonfiction": [
        'Biography & Autobiography',
        'History',
        'Literary Criticism',
        'Philosophy',
        'Religion',
        'Science',
    ],
    "Children's Nonfiction": ['Juvenile Nonfiction'],
}

# Invert the grouping into the raw -> simplified lookup used by `.map`
category_mapping = {
    raw: simple
    for simple, raw_names in simple_to_raw.items()
    for raw in raw_names
}

# Raw categories outside the mapping become NaN in the new column
books["simple_categories"] = books["categories"].map(category_mapping)

# Proportions of null values in category fields
books[['categories', 'simple_categories']].isna().mean()

categories           0.005773
simple_categories    0.279777
dtype: float64

In [13]:
# Candidate labels offered to the zero-shot classifier
fiction_categories = ["Fiction", "Nonfiction"]

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint
pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

  _torch_pytree._register_pytree_node(


In [14]:
# Try the zero-shot pipeline on a single description (row 50 by position)
sequence = books['description'].iloc[50]

# Returns the input text together with the candidate labels and their scores
pipe(sequence, fiction_categories)

{'sequence': 'In an absorbing narrative about personalities and social history, Menand discusses the Metaphysical Club, an informal group that met in Cambridge, Massachusetts, in 1872, to talk about ideas. Members included Oliver Wendell Holmes, Jr., William James, and Charles Sanders Peirce. 21 photos.',
 'labels': ['Nonfiction', 'Fiction'],
 'scores': [0.8759342432022095, 0.12406571954488754]}

In [15]:
def generate_predictions(sequence: str, categories: list[str]) -> str:
    """Return the candidate label that the zero-shot pipeline scores highest.

    Args:
        sequence: Text to classify.
        categories: Candidate labels handed to the pipeline.

    Returns:
        The label whose score is the maximum of the pipeline's "scores" list.
    """
    result = pipe(sequence, categories)
    best_position = np.argmax(result["scores"])
    return result["labels"][best_position]


def f(sequence):
    """Classify `sequence` against the module-level `fiction_categories`."""
    return generate_predictions(sequence, fiction_categories)

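Let us now predict the category for every book. The predictions cover the books whose category is missing, and the books that already have a label let us check how accurate the classifier is: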

In [16]:
# Classify every description as Fiction / Nonfiction with the zero-shot pipeline.
# Slow: the recorded run took about 50 minutes for 5197 rows.
pred_categories = books["description"].progress_apply(f)

100%|██████████████████████████████████████████████████████████████████████████| 5197/5197 [49:35<00:00,  1.75it/s]


In [17]:
# Distribution of the predicted labels across all books
pred_categories.value_counts()

description
Nonfiction    2816
Fiction       2381
Name: count, dtype: int64

In [18]:
# Share of books whose original category is missing
books['categories'].isna().mean()

0.005772561092938233

In [19]:
# Attach the predictions to the catalogue (adds a column to `books` in place)
books['predicted_categories'] = pred_categories

In [20]:
cat = "Fiction"

# Restrict to books whose simplified label is `cat`, then measure how often
# the prediction agrees with that label
labelled = books[books["simple_categories"] == cat]
acc = (labelled["simple_categories"] == labelled["predicted_categories"]).mean()
print(f"Category prediction accuracy for {cat}: {acc:.2%}")

Category prediction accuracy for Fiction: 67.89%


In [21]:
cat = "Nonfiction"

# Restrict to books whose simplified label is `cat`, then measure how often
# the prediction agrees with that label
labelled = books[books["simple_categories"] == cat]
acc = (labelled["simple_categories"] == labelled["predicted_categories"]).mean()
print(f"Category prediction accuracy for {cat}: {acc:.2%}")

Category prediction accuracy for Nonfiction: 87.34%


In [22]:
# Persist the catalogue, now including the simple_categories and predicted_categories columns
books.to_csv('../data/books_pred_categories.csv',index=False)

# Sentiment Analysis

The goal is to classify the dominant emotion in the book among 7 categories: anger, disgust, fear, joy, neutral, sadness, and surprise. Instead of using zero-shot classification, we will use a fine-tuned model on this task.

In [23]:
%%time

from transformers import pipeline
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k=None)
classifier("I love this!")



CPU times: user 2.32 s, sys: 516 ms, total: 2.84 s
Wall time: 920 ms


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528676815330982},
  {'label': 'neutral', 'score': 0.005764594301581383},
  {'label': 'anger', 'score': 0.004419781267642975},
  {'label': 'sadness', 'score': 0.0020923952106386423},
  {'label': 'disgust', 'score': 0.001611993182450533},
  {'label': 'fear', 'score': 0.0004138521908316761}]]

In [24]:
# Classify the whole description of the first book in a single call
desc = books['description'][0]
print(desc)
# One list of label/score dicts is returned for the single input text
classifier(desc)

A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world has

[[{'label': 'fear', 'score': 0.654840886592865},
  {'label': 'neutral', 'score': 0.1698521077632904},
  {'label': 'sadness', 'score': 0.11640918999910355},
  {'label': 'surprise', 'score': 0.020700626075267792},
  {'label': 'disgust', 'score': 0.019100768491625786},
  {'label': 'joy', 'score': 0.015161294490098953},
  {'label': 'anger', 'score': 0.003935155458748341}]]

Descriptions contain a mix of different feelings. Let's split the description into individual sentences and pass each one to the classifier:

In [25]:
# Split the first description on full stops and keep only the non-empty pieces
stripped_parts = (part.strip() for part in books['description'][0].split("."))
sentences = [part for part in stripped_parts if part]

# One prediction list per sentence, in the same order as `sentences`
predictions = classifier(sentences)

# Show each sentence followed by its emotion scores
for text, scores in zip(sentences, predictions):
    print(text)
    display(scores)
    print('')

A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives


[{'label': 'surprise', 'score': 0.7296024560928345},
 {'label': 'neutral', 'score': 0.14038565754890442},
 {'label': 'fear', 'score': 0.06816236674785614},
 {'label': 'joy', 'score': 0.047942519187927246},
 {'label': 'anger', 'score': 0.009156353771686554},
 {'label': 'disgust', 'score': 0.0026284779887646437},
 {'label': 'sadness', 'score': 0.0021221619099378586}]


John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers


[{'label': 'neutral', 'score': 0.4662502110004425},
 {'label': 'disgust', 'score': 0.33823898434638977},
 {'label': 'joy', 'score': 0.08201287686824799},
 {'label': 'sadness', 'score': 0.06111671030521393},
 {'label': 'anger', 'score': 0.029641302302479744},
 {'label': 'surprise', 'score': 0.017968814820051193},
 {'label': 'fear', 'score': 0.004771149251610041}]


It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up


[{'label': 'neutral', 'score': 0.6978455781936646},
 {'label': 'sadness', 'score': 0.201043039560318},
 {'label': 'disgust', 'score': 0.03659290447831154},
 {'label': 'surprise', 'score': 0.029497869312763214},
 {'label': 'joy', 'score': 0.012942732311785221},
 {'label': 'fear', 'score': 0.012581037357449532},
 {'label': 'anger', 'score': 0.009496787562966347}]


Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist


[{'label': 'fear', 'score': 0.9839727878570557},
 {'label': 'neutral', 'score': 0.004363983869552612},
 {'label': 'sadness', 'score': 0.004239339847117662},
 {'label': 'anger', 'score': 0.0027069870848208666},
 {'label': 'surprise', 'score': 0.002331699011847377},
 {'label': 'disgust', 'score': 0.001794762327335775},
 {'label': 'joy', 'score': 0.0005904841236770153}]


He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption


[{'label': 'sadness', 'score': 0.9560651183128357},
 {'label': 'neutral', 'score': 0.021593980491161346},
 {'label': 'disgust', 'score': 0.009161856956779957},
 {'label': 'fear', 'score': 0.006611268036067486},
 {'label': 'surprise', 'score': 0.002416071016341448},
 {'label': 'anger', 'score': 0.0021669946145266294},
 {'label': 'joy', 'score': 0.0019846975337713957}]


Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world has to offer


[{'label': 'joy', 'score': 0.9490272998809814},
 {'label': 'disgust', 'score': 0.02581697329878807},
 {'label': 'neutral', 'score': 0.014153186231851578},
 {'label': 'sadness', 'score': 0.005034740082919598},
 {'label': 'anger', 'score': 0.0040159691125154495},
 {'label': 'surprise', 'score': 0.0014610494254156947},
 {'label': 'fear', 'score': 0.0004908551345579326}]


At its heart is a tale of the sacred bonds between fathers and sons, pitch-perfect in style and story, set to dazzle critics and readers alike


[{'label': 'joy', 'score': 0.670195460319519},
 {'label': 'neutral', 'score': 0.24651286005973816},
 {'label': 'surprise', 'score': 0.05786948651075363},
 {'label': 'sadness', 'score': 0.009629090316593647},
 {'label': 'disgust', 'score': 0.0076239705085754395},
 {'label': 'anger', 'score': 0.004351059906184673},
 {'label': 'fear', 'score': 0.0038179662078619003}]




For each book, let's have a separate column for each sentiment class. For each class, we then take the highest probability found across all sentences of the description.

In [26]:
def calculate_max_emotion_scores(predictions):
    """Return, for each emotion label, its highest score over all sentences.

    Args:
        predictions: One list per sentence, each holding dicts with the keys
            'label' and 'score'.

    Returns:
        A Series named 'score', indexed by 'emotion' in sorted label order.
    """
    # Flatten the per-sentence lists into a single sequence of label/score entries
    entries = [entry for sentence_scores in predictions for entry in sentence_scores]
    frame = pd.DataFrame(
        {
            "emotion": [entry["label"] for entry in entries],
            "score": [entry["score"] for entry in entries],
        }
    )
    return frame.groupby("emotion")["score"].max()

# Apply it to the per-sentence predictions of the first book
calculate_max_emotion_scores(predictions)

emotion
anger       0.029641
disgust     0.338239
fear        0.983973
joy         0.949027
neutral     0.697846
sadness     0.956065
surprise    0.729602
Name: score, dtype: float64

In [27]:
# Repeat the sentence-level scoring for the second book
desc = books['description'][1]
print(desc)

# Split on full stops and keep only the non-empty pieces
stripped_parts = (part.strip() for part in desc.split("."))
sentences = [part for part in stripped_parts if part]

predictions = classifier(sentences)
calculate_max_emotion_scores(predictions)

A new 'Christie for Christmas' -- a full-length novel adapted from her acclaimed play by Charles Osborne Following BLACK COFFEE and THE UNEXPECTED GUEST comes the final Agatha Christie play novelisation, bringing her superb storytelling to a new legion of fans. Clarissa, the wife of a Foreign Office diplomat, is given to daydreaming. 'Supposing I were to come down one morning and find a dead body in the library, what should I do?' she muses. Clarissa has her chance to find out when she discovers a body in the drawing-room of her house in Kent. Desperate to dispose of the body before her husband comes home with an important foreign politician, Clarissa persuades her three house guests to become accessories and accomplices. It seems that the murdered man was not unknown to certain members of the house party (but which ones?), and the search begins for the murderer and the motive, while at the same time trying to persuade a police inspector that there has been no murder at all... SPIDER'S

emotion
anger       0.594469
disgust     0.461992
fear        0.935215
joy         0.704422
neutral     0.891110
sadness     0.051413
surprise    0.212222
Name: score, dtype: float64

In [28]:
def compute_emotion_scores(desc):
    """Score one description: classify each sentence, keep the maximum per emotion.

    Args:
        desc: Description text; it is split on "." into sentences.

    Returns:
        Whatever `calculate_max_emotion_scores` returns for the per-sentence
        predictions of the module-level `classifier`.
    """
    # Pieces that are empty after stripping whitespace are discarded
    stripped_parts = (part.strip() for part in desc.split("."))
    sentences = [part for part in stripped_parts if part]
    return calculate_max_emotion_scores(classifier(sentences))

In [29]:
# Score every description, keyed by ISBN so the result can be joined back to the catalogue
descriptions_by_isbn = books.set_index("isbn13")["description"]
emotion_scores = descriptions_by_isbn.progress_apply(compute_emotion_scores).reset_index()

# validate='1:1' makes the merge fail loudly if an ISBN occurs more than once on either side
books_with_sentiment_analysis = books.merge(emotion_scores, on="isbn13", validate="1:1")

100%|██████████████████████████████████████████████████████████████████████████| 5197/5197 [05:38<00:00, 15.35it/s]


In [30]:
# Save the catalogue merged with the emotion scores
books_with_sentiment_analysis.to_csv('../data/books_for_dashboard.csv',index=False)