In [36]:
# Loader
from langchain_community.document_loaders import PyPDFLoader

# Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings
from langchain_community.embeddings import SentenceTransformerEmbeddings

# Vector Store
from langchain_community.vectorstores import Chroma

# additional imports
import os
import logging
from uuid import uuid4 # for unique ids

## **Vector Store Chroma**



### **Load and split documents**

In [2]:
# Shared splitter that cuts loaded documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
    add_start_index=False,
)

# Load and split documents
def load_pdfs(pdf_path):
    """Load every PDF in a directory and split it into text chunks.

    Only the top level of ``pdf_path`` is scanned; sub-directories are not
    visited. Each PDF is read with ``PyPDFLoader`` and split with the
    module-level ``text_splitter``.

    Args:
        pdf_path: Path of the directory that contains the PDF files.

    Returns:
        A list with the chunks of all successfully processed PDFs, in
        file-name order. The list is empty when ``pdf_path`` does not exist,
        is not a directory, or yields no loadable PDF.

    Problems are logged instead of raised, so one unreadable file does not
    abort the processing of the remaining files.
    """
    chunks = []
    if not os.path.exists(pdf_path):
        logging.error(f"Filepath {pdf_path} does not exist")
        return chunks
    if not os.path.isdir(pdf_path):
        # os.listdir would raise NotADirectoryError on a regular file.
        logging.error(f"Filepath {pdf_path} is not a directory")
        return chunks

    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order reproducible between runs.
    for file in sorted(os.listdir(pdf_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not file.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(pdf_path, file)
        try:
            loader = PyPDFLoader(filepath)
            docs = loader.load()
            split_texts = text_splitter.split_documents(docs)
            chunks.extend(split_texts)
            logging.info(f"Processed {filepath} successfully with {len(split_texts)} chunks")
        except Exception as e:
            # Best effort: report the failing file and continue with the next one.
            logging.error(f"Failed to process {filepath}: {e}")
    return chunks

In [None]:
# Directory that holds the source PDF files
pdf_path = "/home/ssever/rag-llm-demo/data/files"

# Load each PDF in that directory and split it into chunks
chunks = load_pdfs(pdf_path=pdf_path)

In [21]:
# Directory handed to Chroma as its persist_directory
destination = "/home/ssever/rag-llm-demo/data/vector_store/Chroma"

# Embedding function used for the chunks
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L12-v2")

# Build the Chroma vector store from the chunks
db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    persist_directory=destination,
)

### **Test query**

In [None]:
# Sentence to search for in the vector store
query = "Taking social and environmental responsibility for all we do is an integral part of how we perceive ourselves as a company."

# Request up to five (document, score) pairs for the query
docs = db.similarity_search_with_score(query, k=5)

# Keep only the text of each returned document; the scores are not used here
found_chunks = [doc.page_content for doc, _score in docs]

# Display the text of the first returned chunk
found_chunks[0]