In [None]:
import os

import pandas as pd
import numpy as np

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from transformers import pipeline

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Silence the text splitter's "Created a chunk of size ..." warnings.
#
# This is done with a logging.Filter instead of a custom log-record factory:
# a record factory must always return a LogRecord (returning None makes the
# logging machinery fail when it handles the record), and wrapping the current
# factory again on every re-run of the cell makes the wrapper call itself.

import logging

CHUNK_SIZE_WARNING_PREFIX = "Created a chunk of size"


class ChunkSizeWarningFilter(logging.Filter):
    """Discard log records whose message starts with the chunk-size warning."""

    def filter(self, record):
        # A falsy return value tells the logger to drop the record.
        return not record.getMessage().startswith(CHUNK_SIZE_WARNING_PREFIX)


splitter_logger = logging.getLogger("langchain_text_splitters.base")
splitter_logger.setLevel(logging.ERROR)

# Keep the cell idempotent: drop filters left by earlier runs before adding one.
for attached_filter in list(splitter_logger.filters):
    if type(attached_filter).__name__ == "ChunkSizeWarningFilter":
        splitter_logger.removeFilter(attached_filter)
splitter_logger.addFilter(ChunkSizeWarningFilter())

# Creating an embeddings vector DB

In [None]:
# Read the cleaned books table and write only the tagged descriptions to a
# text file, one record per row, with no index and no header.

books = pd.read_csv("../data/books_cleaned.csv")
# Write next to the input CSV: the loading cell reads
# "../data/tagged_description.txt", so the file must not go to "data/...".
books['tagged_description'].to_csv("../data/tagged_description.txt", sep="\n", index=False, header=False)

In [None]:
# Read the tagged-description text file and split it on newlines

description_loader = TextLoader("../data/tagged_description.txt")
raw_documents = description_loader.load()

# NOTE(review): chunk_size=1 with a newline separator presumably keeps every
# line as its own chunk instead of merging neighbours - confirm.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Peek at the first chunk produced by the splitter
documents[0]

In [None]:
# Sentence-transformers model used to embed both the descriptions and the queries
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# Take the text of every chunk, removing double quotes at either end
texts = [chunk.page_content.strip('"') for chunk in documents]

# Embed in fixed-size slices so tqdm can report progress
batch_size = 16
embeddings = []

batch_starts = range(0, len(texts), batch_size)
for start in tqdm(batch_starts, desc="Embedding batches"):
    stop = start + batch_size
    embeddings += embedding_model.embed_documents(texts[start:stop])

In [None]:
PERSIST_DIR = '../data/db_books_embeddings'

# Vector store backed by PERSIST_DIR; the embedding model is passed so that
# queries can be embedded later.
db_books = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embedding_model,
)

# Each text is sliced by position: the first 13 characters become the record
# id and everything from character 15 onwards becomes the stored document.
# NOTE(review): presumably a 13-digit ISBN followed by a separator - confirm
# that offset 15 does not drop the first character of the description.
record_ids = [text[:13] for text in texts]
record_bodies = [text[15:] for text in texts]
record_metadatas = [chunk.metadata for chunk in documents]

# Insert the precomputed embeddings directly into the underlying collection
db_books._collection.add(
    ids=record_ids,
    embeddings=embeddings,
    documents=record_bodies,
    metadatas=record_metadatas,
)

print(f"Successfully created and saved {len(embeddings)} documents to '{PERSIST_DIR}'")
print(f"Current document count: {db_books._collection.count()}")

In [None]:
# Re-open the store from PERSIST_DIR to check that it can be loaded

db_books_loaded = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embedding_model,
)

count = db_books_loaded._collection.count()
print(f"Successfully loaded database with {count} documents from '{PERSIST_DIR}'.")

# Smoke test: a single nearest-neighbour lookup against the reloaded store
results = db_books_loaded.similarity_search(
    "Classic literature",
    k=1,
)
print(f"Query Result: {results[0].page_content}")

In [None]:
# Retrieve the five chunks closest to a free-text query and display them
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k=5)
docs

In [None]:
%%time
query = "A book for teaching kids about nature"
docs = db_books.similarity_search(query, k=5)
ids = [int(doc.id) for doc in docs]
books.query("isbn13.isin(@ids)")

# Zero-shot categorization

In [None]:
# The 12 mapped categories, listed under the simplified label each one gets
categories_by_group = {
    "Fiction": ['Fiction', 'Comics & Graphic Novels', 'Drama', 'Poetry'],
    "Children's Fiction": ['Juvenile Fiction'],
    "Nonfiction": [
        'Biography & Autobiography',
        'History',
        'Literary Criticism',
        'Philosophy',
        'Religion',
        'Science',
    ],
    "Children's Nonfiction": ['Juvenile Nonfiction'],
}

# Flatten into the original-category -> simplified-category lookup
category_mapping = {
    category: group
    for group, members in categories_by_group.items()
    for category in members
}

# Categories absent from the lookup become NaN in the new column
books["simple_categories"] = books["categories"].map(category_mapping)

# Share of missing values in the raw and the simplified category columns
books[['categories', 'simple_categories']].isna().mean()

In [None]:
# Candidate labels offered to the zero-shot classifier
fiction_categories = ['Fiction', 'Nonfiction']

# Zero-shot classification pipeline built on the BART-large MNLI checkpoint
pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Try the zero-shot classifier on a single description (row position 50)
sequence = books['description'].iloc[50]

pipe(sequence, fiction_categories)

In [None]:
def generate_predictions(sequence: str, categories: list[str]) -> str:
    """Return the candidate label that the zero-shot pipeline scores highest.

    Args:
        sequence: Text to classify.
        categories: Candidate labels handed to the module-level ``pipe``.

    Returns:
        The entry of the pipeline's ``"labels"`` found at the position of the
        largest value in its ``"scores"``.
    """
    result = pipe(sequence, categories)
    best_position = int(np.argmax(result["scores"]))
    return result["labels"][best_position]


def f(sequence):
    """Classify ``sequence`` against the module-level ``fiction_categories``."""
    return generate_predictions(sequence, fiction_categories)

Let us now predict the category of every book from its description (including the books whose category is missing):

In [None]:
# Run the zero-shot classifier on every description, with a progress bar
pred_categories = books["description"].progress_apply(f)

In [None]:
# How many books were assigned to each predicted label
pred_categories.value_counts()

In [None]:
# Share of books whose original `categories` value is missing
books['categories'].isna().mean()

In [None]:
# Store the predictions next to the original labels (adds a column to `books`)
books['predicted_categories'] = pred_categories

In [None]:
cat = "Fiction"
# Keep the books whose simplified label is `cat`, then measure how often the
# predicted label agrees with it.
labelled_subset = books[books["simple_categories"] == cat]
acc = (labelled_subset["simple_categories"] == labelled_subset["predicted_categories"]).mean()
print(f"Category prediction accuracy for {cat}: {acc:.2%}")

In [None]:
cat = "Nonfiction"
# Keep the books whose simplified label is `cat`, then measure how often the
# predicted label agrees with it.
labelled_subset = books[books["simple_categories"] == cat]
acc = (labelled_subset["simple_categories"] == labelled_subset["predicted_categories"]).mean()
print(f"Category prediction accuracy for {cat}: {acc:.2%}")

In [None]:
# Save the table, including the category columns added above, without the index
books.to_csv('../data/books_pred_categories.csv',index=False)

# Sentiment Analysis

The goal is to classify the dominant emotion of each book description into one of 7 categories: anger, disgust, fear, joy, neutral, sadness, and surprise. Instead of using zero-shot classification, we will use a model fine-tuned for this task.

In [None]:
%%time

from transformers import pipeline
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k=None)
classifier("I love this!")

In [None]:
# Classify the whole description stored under index label 0 in a single call
desc = books['description'][0]
print(desc)
classifier(desc)

Descriptions contain a mix of different feelings. Let's split the description into sentences and pass each one to the classifier:

In [None]:
# Split the first description on "." and keep the non-empty, trimmed pieces
first_description = books['description'][0]
trimmed_pieces = (piece.strip() for piece in first_description.split("."))
sentences = [piece for piece in trimmed_pieces if len(piece) > 0]
predictions = classifier(sentences)

# Show each sentence followed by the classifier output for it
for sentence, pred in zip(sentences, predictions):
    print(sentence)
    display(pred)
    print('')

For each book, let's create a separate column for each sentiment class. For each class, we then take the highest probability across all sentences of the description.

In [None]:
def calculate_max_emotion_scores(predictions):
    """Return, for every emotion label, its highest score over all sentences.

    Args:
        predictions: An iterable of per-sentence results, each an iterable of
            dicts with a ``'label'`` and a ``'score'`` key.

    Returns:
        A pandas Series named ``'score'`` indexed by ``'emotion'`` holding the
        maximum score seen for each label.
    """
    label_score_pairs = []
    for sentence_scores in predictions:
        for entry in sentence_scores:
            label_score_pairs.append((entry['label'], entry['score']))

    scores_frame = pd.DataFrame(label_score_pairs, columns=['emotion', 'score'])
    return scores_frame.groupby("emotion").max()['score']

# Per-emotion maximum over the sentence-level predictions computed above
calculate_max_emotion_scores(predictions)

In [None]:
# Repeat the sentence-level scoring for the description under index label 1
desc = books['description'][1]
print(desc)

trimmed_pieces = (piece.strip() for piece in desc.split("."))
sentences = [piece for piece in trimmed_pieces if len(piece) > 0]
predictions = classifier(sentences)
calculate_max_emotion_scores(predictions)

In [None]:
def compute_emotion_scores(desc):
    """Score one description sentence by sentence and keep the maximum per emotion.

    Args:
        desc: Description text; it is split on ``"."`` and empty or
            whitespace-only pieces are dropped.

    Returns:
        Whatever ``calculate_max_emotion_scores`` returns for the classifier
        output on the remaining sentences.
    """
    trimmed_pieces = (piece.strip() for piece in desc.split("."))
    sentences = [piece for piece in trimmed_pieces if piece]
    return calculate_max_emotion_scores(classifier(sentences))

In [None]:
# Score every description, keyed by isbn13 so the result can be joined back
descriptions_by_isbn = books.set_index("isbn13")['description']
emotion_scores = descriptions_by_isbn.progress_apply(compute_emotion_scores).reset_index()

# validate='1:1' makes the merge raise if isbn13 is duplicated on either side
books_with_sentiment_analysis = books.merge(
    emotion_scores,
    on='isbn13',
    validate='1:1',
)

In [None]:
# Save the merged table (books plus emotion scores) without the index
books_with_sentiment_analysis.to_csv('../data/books_for_dashboard.csv',index=False)