Create and store embeddings in persistant dir (PDF)

In [2]:
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings

# Load PDF documents
pdf_loader = PyPDFDirectoryLoader("./DataDocs")
loaders = [pdf_loader]

documents = []
for loader in loaders:
    documents.extend(loader.load())
# print(documents)
# Split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
all_documents = text_splitter.split_documents(documents)

print(f"Total number of documents: {len(all_documents)}")

# Setup embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs = {'device':'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Initialize Chroma database
db = Chroma(embedding_function=embeddings, persist_directory="./chromadb")
retv = db.as_retriever()

searchDocs = db.similarity_search("CHAUHAN JAIMIN VIJAYKUMAR")
print(searchDocs)
print(searchDocs[0].page_content)

# Set the batch size
batch_size = 96

# Calculate the number of batches
num_batches = len(all_documents) // batch_size + (len(all_documents) % batch_size > 0)

# Iterate over batches
for batch_num in range(num_batches):
    # Calculate start and end indices for the current batch
    start_index = batch_num * batch_size
    end_index = (batch_num + 1) * batch_size
    # Extract documents for the current batch
    batch_documents = all_documents[start_index:end_index]
    # Add documents to Chroma
    retv.add_documents(batch_documents)
    print(start_index, end_index)

# Persist the collection
print("Success!")
db.persist()


Total number of documents: 534
[Document(page_content='L D College of Engineering, Ahmedabad    20         CHAUHAN NIKHILKUMAR KANUBHAI     15...'), Document(page_content='L D College of Engineering, Ahmedabad    19      CHAUHAN MANISHKUMAR GHANSHYAMBHAI   15...'), Document(metadata={'page': 1, 'source': 'DataDocs\\PLACEMENT-2019 (1).pdf'}, page_content='85 CHAUHAN NIKUNJ BHARATBHAI 160283106004 CIVIL 2019 ON CAMPUS TORRENT'), Document(metadata={'page': 8, 'source': 'DataDocs\\PLACEMENT-2019 (1).pdf'}, page_content='356 KATESHIYA RAJ CHANDULAL 150280117022 I.C. 2019 ON CAMPUS GUJARAT GAS\n357 BRAHAMKSTRIYA BHAVESHKUMAR S. 150280117004 I.C. 2019 ON CAMPUS GUJARAT GAS\n358 VYAS SAGARKUMAR BHAGAVANBHAI 150280117062 I.C. 2019 ON CAMPUS GUJARAT GAS\n359 CHAUHAN VISHALKUMAR BHARATBHAI 150280117008 I.C. 2019 ON CAMPUS AMUL FED DAIRY\n360 LORIYA VIVEKKUMAR SURESHBHAI 150280117024 I.C. 2019 ON CAMPUS AMUL FED DAIRY\n361 PATEL RAHULKUMAR VIJAYBHAI 150280117036 I.C. 2019 EMPLOYED Sani Chem Corpor

  warn_deprecated(


Create and store embeddings in persistant dir (CSV)

In [3]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Define a function to load CSV files from a directory and convert to Document objects
def load_csv_directory(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path)
            # Convert each row of the dataframe to a Document
            for index, row in df.iterrows():
                content = row.to_string()
                data.append(Document(page_content=content))
    return data

# Load CSV documents
directory = "./DataDocs/CSV/"
documents = load_csv_directory(directory)

# Split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
all_documents = text_splitter.split_documents(documents)

print(f"Total number of documents: {len(all_documents)}")

# Setup embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Initialize Chroma database
db = Chroma(embedding_function=embeddings, persist_directory="./chromadb")
retv = db.as_retriever()

searchDocs = db.similarity_search("CHAUHAN RIDHAM VIJAYKUMAR")
print(searchDocs)
if searchDocs:
    print(searchDocs[0].page_content)

# Set the batch size
batch_size = 100

# Calculate the number of batches
num_batches = len(all_documents) // batch_size + (len(all_documents) % batch_size > 0)

# Iterate over batches
for batch_num in range(num_batches):
    # Calculate start and end indices for the current batch
    start_index = batch_num * batch_size
    end_index = (batch_num + 1) * batch_size
    # Extract documents for the current batch
    batch_documents = all_documents[start_index:end_index]
    # Add documents to Chroma
    retv.add_documents(batch_documents)
    print(start_index, end_index)

# Persist the collection
print("Success!")
db.persist()

Total number of documents: 4350
[Document(metadata={'page': 19, 'source': 'DataDocs\\PLACEMENT-2022 (3).pdf'}, page_content='649 CHAUHAN JAIMIN VIJAYKUMAR 180280109012 ELECTRICAL 2022 On campus GFL\n650 PHANASE MIT TUSHAR 180280109081 ELECTRICAL 2022 On campus HITACHI\n651 BHATT KASHYAP JAYANTBHAI 180280117002 IC 2022 On campus EINFOCHIPS\n652 JOSHI MAYURKUMAR ASHOKBHAI 180280117021 IC 2022 On campus EINFOCHIPS\n653 AHIR PARTHKUMAR JAYARAMBHAI 180280117001 IC 2022 On campus TCS\n654 JOSHI HIMANSHU VASANTBHAI 180280117020 IC 2022 On campus TCS\n655 KOTADIYA RONAK BHARATBHAI 180280117026 IC 2022 On campus TCS\n656 MISTRY MIRAL JASHWANTBHAI 180280117031 IC 2022 On campus TCS'), Document(metadata={'page': 9, 'source': 'DataDocs\\PLACEMENT-2023.pdf'}, page_content='324 CHAUHAN CHIRAG ASHVINBHAI 190280111015 EC 2023On Campus TCS\n325 GUPTA NIKHIL SARVESH 190280111030 EC 2023On Campus TCS\n326 MAYANK PATEL 190280111090 EC 2023On Campus TCS'), Document(metadata={'page': 2, 'source': 'DataDocs\