In [11]:
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
# Load the books table from the CSV file under the local 'book-dataset' folder.
df = pd.read_csv('book-dataset/books.csv')

In [3]:
# Report the number of rows in the loaded frame.
print('Length of dataset: ', df.shape[0])

Length of dataset:  52478


In [4]:
# Shrink the dataset: keep only the first 1000 rows.

df = df.head(1000)

In [5]:
# Columns to keep; everything else in the frame is dropped.
selected_columns = [
    'title',
    'series',
    'author',
    'description',
    'language',
    'genres',
    'characters',
    'setting',
    'coverImg',
]

df = df.loc[:, selected_columns]

In [6]:
# Count the missing values in each column.

df.isna().sum()

title            0
series         526
author           0
description      7
language        10
genres           0
characters       0
setting          0
coverImg         1
dtype: int64

In [7]:
# Replace every missing value with an empty string.
df = df.fillna(value='')

In [8]:
# Re-count missing values per column after the fill.
df.isna().sum()

title          0
series         0
author         0
description    0
language       0
genres         0
characters     0
setting        0
coverImg       0
dtype: int64

In [9]:
# Fields whose string forms are concatenated, in this order, into the text
# that gets embedded for each book.
text_fields = ['title', 'series', 'author', 'description',
               'language', 'genres', 'characters', 'setting']

# One space-separated string per row, stored in a new 'embedding_text' column.
df['embedding_text'] = df.apply(
    lambda row: ' '.join(str(row[field]) for field in text_fields),
    axis=1,
)

In [10]:
# Display the frame's index (the document ids assigned below are plain row positions).
df.index

RangeIndex(start=0, stop=1000, step=1)

In [23]:
# Pull the combined text column out as a plain Python list.
embedding_texts = df['embedding_text'].tolist()

In [13]:
# Embedding function: a HuggingFaceEmbeddings instance configured with the
# "all-MiniLM-L6-v2" model name; it is passed to Chroma when the store is built.

sentence_transformer_ef = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
from langchain.docstore.document import Document

In [25]:
# Wrap every embedding text in a Document whose 'id' metadata is the text's
# position in `embedding_texts` (the same position the row has in `df`).
# Iterating with enumerate covers exactly the texts that exist, instead of
# assuming a fixed count of 1000: a shorter list no longer raises IndexError
# and a longer one is no longer silently truncated.
documents = [
    Document(page_content=text, metadata={'id': position})
    for position, text in enumerate(embedding_texts)
]

In [28]:
# Build the Chroma collection 'books' from the documents, embedding them with
# the sentence-transformer function and using 'db' as the persist directory.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=sentence_transformer_ef,
    collection_name='books',
    persist_directory='db',
)

In [29]:
# Persist the vector store; presumably this writes the collection to the 'db'
# directory given at creation — TODO confirm against the installed Chroma version.
vectordb.persist()

In [45]:
# Save the processed frame (including the 'embedding_text' column) to CSV,
# leaving the index out of the file.
df.to_csv('books_df.csv', index=False)