In [None]:
import pandas as pd
from transformers import pipeline

# Zero-shot classification pipeline; the cells below call it to label book descriptions.
pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Read books_cleaned.csv (path is relative to the working directory) into a DataFrame.
books = pd.read_csv(filepath_or_buffer="books_cleaned.csv")

In [None]:
# The 15 most frequent raw category labels together with their row counts.
(
    books["categories"]
    .value_counts()
    .reset_index()
    .head(15)
)

In [None]:
# Raw category label -> simplified category. The table is written grouped by
# simplified category and then inverted, so each bucket's members sit side by side.
category_maping = {
    raw_label: simple_label
    for simple_label, raw_labels in {
        "Fiction": ("Fiction", "Comics & Graphic Novels", "Drama", "Poetry"),
        "Children's Fiction": ("Juvenile Fiction",),
        "Nonfiction": (
            "Biography & Autobiography",
            "History",
            "Literary Criticism",
            "Philosophy",
            "Religion",
            "Science",
        ),
        "Children's Nonfiction": ("Juvenile Nonfiction",),
    }.items()
    for raw_label in raw_labels
}

# Translate each raw category through the mapping; labels that are not keys of
# the mapping come out as NaN.
raw_categories = books["categories"]
books["simple_category"] = raw_categories.map(category_maping)

In [None]:
# Preview the first five rows, including the new simple_category column.
books.head(n=5)

In [None]:
# Number of rows that received a simplified category from the mapping.
len(books.loc[books["simple_category"].notna()])

In [None]:
import numpy as np

In [None]:
def classify_sequence(sequence, categories=None):
    """Return the candidate label that the zero-shot pipeline scores highest.

    Args:
        sequence: Text handed to the notebook-level ``pipe`` as its first argument.
        categories: Candidate labels handed to ``pipe`` as its second argument.
            When omitted or ``None`` the labels ``["Fiction", "Nonfiction"]``
            are used.

    Returns:
        The element of ``out["labels"]`` located at the position of the
        largest value in ``out["scores"]``.
    """
    if categories is None:
        # Create the default list per call instead of sharing one mutable
        # default object across every invocation.
        categories = ["Fiction", "Nonfiction"]
    out = pipe(sequence, categories)
    # Pick the best label explicitly rather than relying on the order of the output.
    best_index = int(np.argmax(out["scores"]))
    return out["labels"][best_index]

In [None]:
from tqdm import tqdm

# Accumulators for the evaluation run: ground-truth label and model prediction
# for each sampled description. They are (re)created here, so this cell starts
# a fresh evaluation every time it runs.
actual_categories = []
predicted_categories = []
labels = ["Fiction", "Nonfiction"]

fiction_df = books.loc[books["simple_category"] == "Fiction", "description"].reset_index(drop=True)
# head(200) caps the sample at 200 descriptions but, unlike indexing over
# range(0, 200), does not raise when fewer Fiction rows are available.
for seq in tqdm(fiction_df.head(200)):
    predicted_categories.append(classify_sequence(seq, labels))
    actual_categories.append("Fiction")

In [None]:
# Show the predictions accumulated so far in predicted_categories.
predicted_categories

In [None]:
nonfiction_df = books.loc[books["simple_category"] == "Nonfiction", "description"].reset_index(drop=True)
# NOTE: this extends the lists created in the Fiction cell, so re-running this
# cell on its own would add the Nonfiction sample a second time.
# head(200) caps the sample at 200 descriptions but, unlike indexing over
# range(0, 200), does not raise when fewer Nonfiction rows are available.
for seq in tqdm(nonfiction_df.head(200)):
    predicted_categories.append(classify_sequence(seq, labels))
    actual_categories.append("Nonfiction")

In [None]:
# One row per evaluated description: ground-truth label next to the prediction.
predictions_df = pd.DataFrame(
    {
        "actual_categories": actual_categories,
        "predicted_categories": predicted_categories,
    }
)
# 1 where the prediction equals the ground truth, 0 otherwise.
predictions_df["is_correct"] = np.where(
    predictions_df["actual_categories"].eq(predictions_df["predicted_categories"]),
    1,
    0,
)
# Share of correct predictions over all evaluated rows.
predictions_df["is_correct"].sum() / len(predictions_df)

In [None]:
# Rows that received no simplified category from the mapping, reduced to the
# identifier and the text the classifier will read.
missing_category_data = (
    books.loc[books["simple_category"].isna(), ["isbn13", "description"]]
    .reset_index(drop=True)
)

# Parallel accumulators filled by the prediction loop in the next cell.
isbns, predicted_cats = [], []

In [None]:
# Reset the accumulators here so that re-running this (long) cell, for example
# after an interrupted run, cannot append duplicate ISBNs; duplicates would
# multiply book rows in the later left merge on isbn13.
isbns = []
predicted_cats = []

# Walk the two columns in lockstep instead of doing a chained label lookup per
# row; total= keeps the progress bar sized even though zip() has no length.
for isbn, sequence in tqdm(
    zip(missing_category_data["isbn13"], missing_category_data["description"]),
    total=len(missing_category_data),
):
    predicted_cats.append(classify_sequence(sequence, labels))
    isbns.append(isbn)

In [None]:
# Pair every ISBN with the label predicted for its description.
missing_predicted_df = pd.DataFrame(
    data={
        "isbn13": isbns,
        "predicted_categories": predicted_cats,
    }
)

In [None]:
# Inspect the isbn13 / predicted_categories pairs before merging them into books.
missing_predicted_df

In [None]:
# Left merge keeps every book; only ISBNs present in missing_predicted_df get
# a value in the new predicted_categories column.
books = books.merge(missing_predicted_df, how="left", on="isbn13")

In [None]:
# Display the books frame after the merge above added predicted_categories.
books

In [None]:
# Keep the mapped category where one exists; otherwise fall back to the
# model's prediction.
books["simple_category"] = np.where(
    books["simple_category"].notna(),
    books["simple_category"],
    books["predicted_categories"],
)
# The helper column has been folded into simple_category and is no longer needed.
books.drop("predicted_categories", axis="columns", inplace=True)

In [None]:
# Per-column count of missing values.
books.isnull().sum()