In [None]:
import pandas as pd

data = pd.read_csv("../data/books_emotions.csv")
data.categories.value_counts().reset_index()

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Conspiracies,1
475,Brothers and sisters,1
476,Rock musicians,1
477,Community life,1


In [3]:
def map_category(cat):
    if not isinstance(cat, str):
        return cat
    lower_cat = cat.lower()
    if (
        "literary criticism" in lower_cat
        or "poetry" in lower_cat
        or "Literary Collections" in cat
        or "History" in cat
    ):
        return "Literary"
    if "comics & graphic novels" in lower_cat:
        return "Comedy"
    if (
        "religion" in lower_cat
        or "philosophy" in lower_cat
        or "Psychology" in cat
        or "Family & Relationships" in cat
        or "Self-Help" in cat
        or "Humor" in cat
    ):
        return "Philosophy"
    if "fiction" in lower_cat:
        return "Fiction"
    if "Biography & Autobiography" in cat:
        return "Biography"
    if (
        "Business & Economics" in cat
        or "science" in lower_cat
        or "Computers" in cat
        or "Education" in cat
        or "Medical" in cat
        or "Nature" in cat
        or "Cooking" in cat
        or "Body, Mind & Spirit" in cat
        or "Health & Fitness" in cat
        or "Law" in cat
        or "Foreign Language Study" in cat
        or "Architecture" in cat
    ):
        return "Science"
    if "art" in lower_cat:
        return "Art"
    if "mystery" in lower_cat:
        return "Mystery"
    if (
        "stories" in lower_cat
        or "Travel" in cat
        or "Games" in cat
        or "Music" in cat
        or "Sports & Recreation" in cat
        or "True Crime" in cat
        or "Comedy" in cat
        or "Photography" in cat
        or "Drama" in cat
    ):
        return "Intertainment"
    return cat


data["categories"] = data["categories"].apply(map_category)
data["categories"].value_counts().reset_index()

Unnamed: 0,categories,count
0,Fiction,2648
1,Literary,443
2,Science,392
3,Intertainment,358
4,Philosophy,355
...,...,...
371,"Authors, Canadian",1
372,"Poets, Chilean",1
373,Zero (The number),1
374,Babytime resource,1


In [None]:
from ollama import chat


def classify(book_description: str, candidate: str) -> str:
    prompt = f"""
You are a highly specialized zero-shot classifier with deep expertise in narrative forms, literary styles, scientific themes, and philosophical explorations.

=== TASK ===
Analyze the book description below and determine which category best captures its primary focus. Choose one from the available categories.

=== BOOK DESCRIPTION ===
{book_description}

=== AVAILABLE CATEGORIES ===
- Fiction
- Literary
- Science
- Philosophy
- Intertainment

=== PRELIMINARY IMPRESSION ===
{candidate}

=== REFERENCE EXAMPLES ===
- **Fiction:** A narrative set in a fantastical world with imaginative characters and creative storytelling.
- **Literary:** A work characterized by poetic language.
- **Science:** A text that discusses scientific theories, discoveries, or futuristic innovations.
- **Philosophy:** A book that explores deep philosophical questions, ethical dilemmas, or abstract ideas.
- **Intertainment:** A lively and engaging story crafted to entertain and amuse.

=== INSTRUCTIONS ===
1. Read the book description carefully.
2. If the Preliminary Impression exactly matches one of the Available Categories, immediately select and return that category.
3. Otherwise, evaluate the themes, tone, narrative structure, and any subtle cues in the description.
4. Choose the single category that most accurately represents the dominant focus of the book.
5. Your final response must contain only the chosen category label (exactly as listed), without any additional commentary or explanation.

Your answer:"""

    messages = [
        {
            "role": "user",
            "content": prompt,
        },
    ]

    response = chat("llama3.2", messages=messages)
    return response["message"]["content"].strip()


print(classify(data.description[8], data.categories[8]))

Fiction


In [6]:
top_7 = data["categories"].value_counts().index[:7].tolist()

data["refined_category"] = data.apply(
    lambda row: (
        classify(row["description"], row["categories"])
        if row["categories"] not in top_7
        else row["categories"]
    ),
    axis=1,
)

In [11]:
data["refined_category"].value_counts().reset_index()

Unnamed: 0,refined_category,count
0,Fiction,2735
1,Literary,745
2,Intertainment,460
3,Science,456
4,Philosophy,386
5,Biography,311
6,Art,104


In [12]:
data.to_csv("./data/books_refined.csv", index=False)