# Zero Shot Text Classification

In [56]:
import pandas as pd
import numpy as np
# Load the cleaned book records; the path is relative to the notebook's working directory.
books = pd.read_csv("./ds/cleaned_data.csv")

In [57]:
from transformers import pipeline
from typing import List

import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing at load time.
classifier_device = "cuda" if torch.cuda.is_available() else "cpu"

# Zero-shot classification pipeline using the facebook/bart-large-mnli checkpoint.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=classifier_device)


Device set to use cuda


KeyboardInterrupt: 

### Manually map top categories into Fiction or Non Fiction to establish a basis

In [58]:
# Hand-assigned coarse class for each of the top raw `categories` values,
# grouped by target class for readability.
_raw_categories_by_class = {
    "Fiction": ["Fiction", "Comics & Graphic Novels", "Drama", "Poetry"],
    "Children's Fiction": ["Juvenile Fiction"],
    "Nonfiction": [
        "Biography & Autobiography",
        "History",
        "Literary Criticism",
        "Philosophy",
        "Religion",
        "Science",
    ],
    "Children's Nonfiction": ["Juvenile Nonfiction"],
}

# Invert the grouping into the raw-category -> simplified-class lookup.
category_mapping = {
    raw_category: simplified
    for simplified, raw_categories in _raw_categories_by_class.items()
    for raw_category in raw_categories
}

# Raw categories that are not keys of the mapping become NaN here.
books["simplified_category"] = books["categories"].map(category_mapping)
books["simplified_category"]

0          Fiction
1              NaN
2          Fiction
3              NaN
4              NaN
           ...    
5688           NaN
5689           NaN
5690    Nonfiction
5691    Nonfiction
5692    Nonfiction
Name: simplified_category, Length: 5693, dtype: object

> We don't have enough samples in our dataset to check whether the model would correctly classify other, more niche labels such as "romance", etc.

> We could have the model make predictions for more classes, but we wouldn't be able to verify whether those predictions are right.

> However, we have enough Fiction and Nonfiction samples, so we can verify the model (by calculating accuracy) on those samples and, if it proves accurate enough, use it to make predictions for those classes.

> We could also assume that the classifier is apt enough to classify the niche descriptions, but that would be risky.

> Still, I'm keeping Children's Fiction and Children's Nonfiction as possible classes.

In [71]:
# Candidate labels passed to the zero-shot classifier.
category_classes = ["Fiction", "Nonfiction", "Children's Fiction", "Children's Nonfiction"]

In [5]:
# Hand-written example description.
sample_text = "a book about startups run by school children. it has stories of little founders of 500 companies registered in the USA from 2010 to 2025. Motivates kids by acquainting them with children of their age. aimed at children"
# NOTE: this reassignment replaces the hand-written example above, so later
# cells see the description of the first row of `books` instead.
sample_text = books.iloc[0]["description"]

### Continue classification

In [6]:
# eg: classify one description and pick the highest-scoring label.
# Run the classifier once and reuse the result instead of running inference
# twice on the same input.
sample_result = classifier(sample_text, category_classes)
max_index = np.argmax(sample_result["scores"])
label_or_category = sample_result["labels"][max_index]
label_or_category

'Fiction'

In [33]:
from typing import List
def classify_text(text: str, labels: List[str] = category_classes):
    """Return the label the zero-shot classifier ranks first for ``text``.

    Args:
        text: Text to classify.
        labels: Candidate labels; defaults to ``category_classes``.

    Returns:
        The first entry of the classifier's ``"labels"`` output.
    """
    result = classifier(text, labels)
    # The labels and scores were found to come back sorted in descending order,
    # so the first label is the top prediction and no argmax is needed.
    ranked_labels = result["labels"]
    return ranked_labels[0]

In [34]:
# Quick manual check of classify_text on a hand-written description.
text = "a book about the king of london, king charles and queen elizabeth. Case study of how they run politics and the country"
classify_text(text)

'Nonfiction'

> Now that we have our classifier function ready, we can go on and check the accuracy of the classifications to see if it is usable

In [39]:
from tqdm.notebook import tqdm

def get_classification_accuracy(description_series: List[str], actual_category: str) -> float:
    """Return the fraction of descriptions that classify_text labels as ``actual_category``.

    Args:
        description_series: Descriptions that are all expected to belong to
            ``actual_category``. Must support ``len()`` and iteration (the
            notebook passes a pandas Series).
        actual_category: The label every description is expected to receive.

    Returns:
        Number of correct predictions divided by the number of descriptions,
        or NaN when ``description_series`` is empty.
    """
    total = len(description_series)
    if total == 0:
        # Accuracy is undefined without samples; return NaN instead of
        # failing with ZeroDivisionError.
        return float("nan")

    # Each comparison is a bool, so summing them counts the correct predictions.
    correct_prediction = sum(
        classify_text(description) == actual_category
        for description in tqdm(description_series)
    )
    return correct_prediction / total

In [40]:
# Show the column names of `books`.
books.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'title_and_subtitle', 'tagged_description',
       'simplified_category'],
      dtype='object')

In [43]:
n = 300

# Boolean masks selecting the rows of each manually mapped class.
is_fiction = books["simplified_category"] == "Fiction"
is_nonfiction = books["simplified_category"] == "Nonfiction"

# First n descriptions of each class.
fiction_series = books.loc[is_fiction, "description"].head(n)
nonfiction_series = books.loc[is_nonfiction, "description"].head(n)

In [44]:
# Share of the sampled Fiction descriptions that the classifier labels "Fiction".
get_classification_accuracy(fiction_series, "Fiction")

  0%|          | 0/300 [00:00<?, ?it/s]

0.6566666666666666

In [45]:
# Share of the sampled Nonfiction descriptions that the classifier labels "Nonfiction".
get_classification_accuracy(nonfiction_series, "Nonfiction")

  0%|          | 0/300 [00:00<?, ?it/s]

0.87

> Not very good accuracies, but I'm going with the classifier anyway.

### Finding categories for the records with NaN in categories field.

In [51]:
# Rows whose simplified category is still missing; keep only the key and the text.
needs_prediction = books["simplified_category"].isna()
records_with_nan_category = (
    books.loc[needs_prediction, ["isbn13", "description"]]
    .reset_index(drop=True)
)
records_with_nan_category

Unnamed: 0,isbn13,description
0,9780002261982,A new 'Christie for Christmas' -- a full-lengt...
1,9780006280897,Lewis' work on the nature of love divides love...
2,9780006280934,"""In The Problem of Pain, C.S. Lewis, one of th..."
3,9780006380832,Until Vasco da Gama discovered the sea-route t...
4,9780006470229,A new-cover reissue of the fourth book in the ...
...,...,...
1571,9788125026600,Not only does Nietzsche for Beginners delve in...
1572,9788171565641,"Forster's lively, informed originality and wit..."
1573,9788172235222,On A Train Journey Home To North India After L...
1574,9788173031014,This book tells the tale of a man who goes on ...


In [52]:
# Predict a class for every unlabelled description (classify_text is called once per row).
predicted_labels = records_with_nan_category["description"].apply(classify_text)
records_with_nan_category = records_with_nan_category.assign(predicted_category=predicted_labels)
records_with_nan_category

Unnamed: 0,isbn13,description,predicted_category
0,9780002261982,A new 'Christie for Christmas' -- a full-lengt...,Fiction
1,9780006280897,Lewis' work on the nature of love divides love...,Nonfiction
2,9780006280934,"""In The Problem of Pain, C.S. Lewis, one of th...",Nonfiction
3,9780006380832,Until Vasco da Gama discovered the sea-route t...,Nonfiction
4,9780006470229,A new-cover reissue of the fourth book in the ...,Fiction
...,...,...,...
1571,9788125026600,Not only does Nietzsche for Beginners delve in...,Nonfiction
1572,9788171565641,"Forster's lively, informed originality and wit...",Fiction
1573,9788172235222,On A Train Journey Home To North India After L...,Fiction
1574,9788173031014,This book tells the tale of a man who goes on ...,Nonfiction


In [60]:
# Attach the predictions to `books` by ISBN.
# - Dropping any existing `predicted_category` first keeps the cell idempotent:
#   re-running it would otherwise produce `predicted_category_x` / `_y` columns.
# - validate="many_to_one" raises if an isbn13 appears more than once in the
#   predictions, instead of silently duplicating rows of `books`.
books = pd.merge(
    books.drop(columns=["predicted_category"], errors="ignore"),
    records_with_nan_category[["isbn13", "predicted_category"]],
    on="isbn13",
    how="left",
    validate="many_to_one",
)

> This will cause the books df to have a predicted_category column.  
> We want the values from predicted_category to be in simplified_category wherever simplified_category is null.

In [61]:
# Display the merged frame to check the new predicted_category column.
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simplified_category,predicted_category
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction,
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction,
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,,Nonfiction
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",,Nonfiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5688,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...,,Nonfiction
5689,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,,Fiction
5690,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction,
5691,9789027712059,9027712050,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,History,http://books.google.com/books/content?id=Vy7Sk...,Since the three volume edition ofHegel's Philo...,1981.0,0.00,210.0,0.0,The Berlin Phenomenology,9789027712059 Since the three volume edition o...,Nonfiction,


In [63]:
# Keep the manual label where it exists; otherwise take the classifier's prediction.
manual_category = books["simplified_category"]
books["simplified_category"] = manual_category.where(
    manual_category.notna(), books["predicted_category"]
)
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simplified_category,predicted_category
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction,
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction,
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Nonfiction,Nonfiction
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Nonfiction,Nonfiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5688,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...,Nonfiction,Nonfiction
5689,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,Fiction,Fiction
5690,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction,
5691,9789027712059,9027712050,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,History,http://books.google.com/books/content?id=Vy7Sk...,Since the three volume edition ofHegel's Philo...,1981.0,0.00,210.0,0.0,The Berlin Phenomenology,9789027712059 Since the three volume edition o...,Nonfiction,


In [None]:
# The helper column has been folded into simplified_category, so remove it.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
books = books.drop(columns=["predicted_category"], errors="ignore")

In [69]:
# Count the rows that still have no simplified category.
books["simplified_category"].isna().sum()

np.int64(0)

In [70]:
# List the distinct values present in simplified_category.
books["simplified_category"].unique()

array(['Fiction', 'Nonfiction', "Children's Fiction",
       "Children's Nonfiction"], dtype=object)

> Now we have no NaNs and only 4 categories in total.

In [72]:
# Save the result. index=False keeps the positional index out of the file;
# without it, reading the CSV back yields an extra "Unnamed: 0" column.
books.to_csv("./ds/cleaned_data_2[category_fixed].csv", index=False)