In [1]:
from langchain_community.document_loaders import TextLoader #raw_text description -> langchain workable format
from langchain_text_splitters import CharacterTextSplitter #splitting into meaningful chunks
from langchain_openai import OpenAIEmbeddings #using open ai apis to covert them
from langchain_chroma import Chroma #Chroma -> vector database

In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
import pandas as pd
books = pd.read_csv(r"D:\Projects\Semantic_book_recommender\data\processed\books.csv")
books.head(5)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."


In [4]:
books["tagged_description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5192    9788172235222 On A Train Journey Home To North...
5193    9788173031014 This book tells the tale of a ma...
5194    9788179921623 Wisdom to Create a Life of Passi...
5195    9788185300535 This collection of the timeless ...
5196    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5197, dtype: object

In [5]:
with open(r"D:\Projects\Semantic_book_recommender\data\processed\tagged_description.txt",
          encoding="utf-8") as f:
    descriptions = [line.strip() for line in f.readlines() if line.strip()]


In [6]:
from langchain_core.documents import Document
documents = [Document(page_content=d) for d in descriptions]

In [7]:
documents[0]

Document(metadata={}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and 

In [8]:
db_books = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings())


In [9]:
query = "A book about Christianity"
docs = db_books.similarity_search(query, k = 10)
docs

[Document(id='cc28905f-4499-4146-9f4c-26afec323c29', metadata={}, page_content='9780310263456 A guide to living an authentic Christian life urges readers to seek an expression of faith that is personal, rather than in accordance with the belief systems of others, in a handbook that cites the examples of Jesus while offering a perspective on the unlimited nature of God.'),
 Document(id='dd03b4a3-2c3d-4367-af29-f461250a370e', metadata={}, page_content='9780830832941 Focusing on the key aspects of Christian life--prayer, character, and wisdom--this one-year devotional shows believers how to know God more deeply and discover how he works in them.'),
 Document(id='d8d400d8-6812-49b1-acfc-3f693ffe98f1', metadata={}, page_content="9780060652920 A forceful and accessible discussion of Christian belief that has become one of the most popular introductions to Christianity and one of the most popular of Lewis's books. Uncovers common ground upon which all Christians can stand together."),
 Docume

In [10]:
books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
1165,9780310263456,031026345X,Velvet Elvis,Rob Bell,Religion,http://books.google.com/books/content?id=TLZO9...,A guide to living an authentic Christian life ...,2005.0,3.78,194.0,19646.0,Velvet Elvis: Repainting the Christian Faith,9780310263456 A guide to living an authentic C...


In [11]:
def retrieve_semantic_recommendations(
    query: str,
    top_k: int = 10,
) ->pd.DataFrame:
    recs = db_books.similarity_search(query, k = 50)

    books_list = []

    for i in range(0, len(recs)):
        books_list += [int(recs[i].page_content.strip('"').split()[0])] #cuz some desc is enclosed in quotations

    return books[books["isbn13"].isin(books_list)].head(top_k)

In [12]:
retrieve_semantic_recommendations("A book about life and philosophy")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
74,9780007195718,7195710,Discover Your Destiny with the Monk Who Sold H...,Robin Sharma,Conduct of life,http://books.google.com/books/content?id=4hVbN...,A potent pathway to self-awakening that will h...,2004.0,3.9,240.0,1956.0,Discover Your Destiny with the Monk Who Sold H...,9780007195718 A potent pathway to self-awakeni...
170,9780060589462,60589469,Zen and the Art of Motorcycle Maintenance,Robert M. Pirsig,Psychology,http://books.google.com/books/content?id=KMRRe...,One of the most important and influential book...,2006.0,3.77,540.0,157734.0,Zen and the Art of Motorcycle Maintenance: An ...,9780060589462 One of the most important and in...
304,9780060931360,60931361,"Plato, Not Prozac!",Lou Marinoff,Philosophy,http://books.google.com/books/content?id=CZgAC...,If you're facing a dilemma -- whether it's han...,2000.0,3.39,320.0,836.0,"Plato, Not Prozac!: Applying Eternal Wisdom to...",9780060931360 If you're facing a dilemma -- wh...
322,9780060957605,60957603,The Making of a Philosopher,Colin McGinn,Philosophy,http://books.google.com/books/content?id=oJ5dL...,"Part memoir, part study, The Making of a Philo...",2003.0,3.65,256.0,219.0,The Making of a Philosopher: My Journey Throug...,"9780060957605 Part memoir, part study, The Mak..."
396,9780062511119,62511114,A Manual for Living,Epictetus,Religion,http://books.google.com/books/content?id=cofP7...,The essence of perennial Stoic wisdom in aphor...,1994.0,4.22,96.0,148.0,A Manual for Living,9780062511119 The essence of perennial Stoic w...
581,9780140159950,140159959,Ludwig Wittgenstein,Ray Monk,Biography & Autobiography,http://books.google.com/books/content?id=NCfy6...,Wittgenstein possessed one of the most acute p...,1991.0,4.35,654.0,3498.0,Ludwig Wittgenstein: The Duty of Genius,9780140159950 Wittgenstein possessed one of th...
609,9780140264920,140264922,Existentialists and Mystics,Iris Murdoch,Philosophy,http://books.google.com/books/content?id=gvsK6...,A collection of the author's most influential ...,1999.0,4.1,576.0,176.0,Existentialists and Mystics: Writings on Philo...,9780140264920 A collection of the author's mos...
692,9780140448009,140448004,Three Tales,Gustave Flaubert;Roger Whitehouse;Geoffrey Wall,Fiction,http://books.google.com/books/content?id=XFzga...,Features short fiction by the French naturalis...,2005.0,3.71,110.0,3050.0,Three Tales,9780140448009 Features short fiction by the Fr...
701,9780140449143,140449140,The Republic,Plato;Sir Henry Desmond Pritchard Lee,Philosophy,http://books.google.com/books/content?id=R9Paw...,A model for the ideal state includes discussio...,2003.0,3.93,416.0,127393.0,The Republic,9780140449143 A model for the ideal state incl...
896,9780156013031,156013037,The Tale of the Unknown Island,José Saramago,Fiction,http://books.google.com/books/content?id=LBcvk...,A moving and eloquent fable from the 1998 winn...,2000.0,3.89,64.0,3138.0,The Tale of the Unknown Island,9780156013031 A moving and eloquent fable from...


In [13]:
category_mapping = {
    'Fiction': "Fiction",
    'Juvenile Fiction': "Children's Fiction",
    'Biography & Autobiography': "Nonfiction",
    'History': "Nonfiction",
    'Literary Criticism': "Nonfiction",
    'Philosophy': "Nonfiction",
    'Religion': "Nonfiction",
    'Comics & Graphic Novels': "Fiction",
    'Drama': "Fiction",
    'Juvenile Nonfiction': "Children's Nonfiction",
    'Science': "Nonfiction",
    'Poetry': "Fiction"
}

books["simple_categories"] = books["categories"].map(category_mapping)

In [14]:
books.head(5)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",


In [15]:
books[~(books["simple_categories"].isna())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079 Tricked once more by his wily ha...,Fiction
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006 Joe and his parents are enjoying...,Children's Fiction
46,9780007121014,0007121016,Taken at the Flood,Agatha Christie,Fiction,http://books.google.com/books/content?id=3gWlx...,A Few Weeks After Marrying An Attractive Young...,2002.0,3.71,352.0,8852.0,Taken at the Flood,9780007121014 A Few Weeks After Marrying An At...,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,9781933648279,1933648279,Night Has a Thousand Eyes,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,Night Has a Thousand Eyes,"9781933648279 ""Cornell Woolrich's novels defin...",Fiction
5188,9784770028969,4770028962,Coin Locker Babies,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,Coin Locker Babies,9784770028969 Rescued from the lockers in whic...,Fiction
5189,9788122200850,8122200850,"Cry, the Peacock",Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,"Cry, the Peacock",9788122200850 This book is the story of a youn...,Fiction
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [16]:
#for zero-shot classification
from transformers import pipeline
fiction_categories = ["Fiction", "Nonfiction"]
pipe = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")

Device set to use cpu


In [17]:
sequence = books.loc[books["simple_categories"]== "Fiction", "description"].reset_index(drop=True)[0]

In [18]:
result = pipe(sequence, fiction_categories) # testing out a sample description to see how well the model performs

In [19]:
import numpy as np
max_index = np.argmax(result["scores"])
max_label = result["labels"][max_index]
max_label

'Fiction'

In [20]:
def generate_predictions(sequence, categories):
    results = pipe(sequence, categories)
    max_index = np.argmax(results["scores"])
    max_label = results["labels"][max_index]
    return max_label

In [21]:
from tqdm import tqdm
#checking how good the model is at predicting
actual_cats = []
predicted_cats = []
#checking out fiction and nonfiction predictions
for cat in ["Fiction", "Nonfiction"]:
    subset = books.loc[books["simple_categories"] == cat, "description"].reset_index(drop=True)
    for i in tqdm(range(100), desc=f"Processing {cat}"):
        sequence = subset[i]
        predicted_cats.append(generate_predictions(sequence, fiction_categories))
        actual_cats.append(cat)

Processing Fiction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [03:06<00:00,  1.87s/it]
Processing Nonfiction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:55<00:00,  1.76s/it]


In [22]:
predictions_df = pd.DataFrame()
predictions_df["actual_categories"] = actual_cats
predictions_df["predicted_categories"] = predicted_cats
predictions_df["correct_prediction"] = np.where(predictions_df["actual_categories"] == predictions_df["predicted_categories"], 1, 0)

In [23]:
predictions_df

Unnamed: 0,actual_categories,predicted_categories,correct_prediction
0,Fiction,Fiction,1
1,Fiction,Fiction,1
2,Fiction,Fiction,1
3,Fiction,Nonfiction,0
4,Fiction,Fiction,1
...,...,...,...
195,Nonfiction,Nonfiction,1
196,Nonfiction,Nonfiction,1
197,Nonfiction,Nonfiction,1
198,Nonfiction,Nonfiction,1


In [24]:
predictions_df["correct_prediction"].sum() / len(predictions_df)

np.float64(0.805)

In [25]:
#using zero shot to now predict missing categories
missing_cats = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop = True)

In [26]:
isbns = []
predicted_cats = []
for i in tqdm(range(0,len(missing_cats)), desc = "Classifying"):
    sequence = missing_cats["description"][i]
    predicted_cats.append(generate_predictions(sequence, fiction_categories))
    isbns.append(missing_cats["isbn13"][i])

Classifying: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1454/1454 [36:34<00:00,  1.51s/it]


In [27]:
missing_predictions = pd.DataFrame()
missing_predictions["isbn13"] = isbns
missing_predictions["predicted_categories"] = predicted_cats
missing_predictions

Unnamed: 0,isbn13,predicted_categories
0,9780002261982,Fiction
1,9780006280897,Nonfiction
2,9780006280934,Nonfiction
3,9780006380832,Nonfiction
4,9780006470229,Fiction
...,...,...
1449,9788125026600,Nonfiction
1450,9788171565641,Fiction
1451,9788172235222,Fiction
1452,9788173031014,Nonfiction


In [28]:
#merging with the original dataset
books = pd.merge(books, missing_predictions, on = "isbn13", how = "left")

In [29]:
books.head(2)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories,predicted_categories
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction,
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,,Fiction


In [30]:
#using predicted categories wherever simple categories is missing.
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories"], books["simple_categories"])
books = books.drop(columns = ["predicted_categories"])

In [31]:
books["simple_categories"].isna().sum() # to check if any missing exists

np.int64(0)

In [32]:
books.to_csv(r"D:\Projects\Semantic_book_recommender\data\processed\books_with_categories.csv", index = False)