## Pre-requisites

In [None]:
# Import libraries
import os
import sys
import shutil

# Resolve the kit and repository roots relative to the notebook's working directory
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))  # absolute path for ekr_rag directory
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))  # absolute path for starter-kit directory
print(f'kit_dir: {kit_dir}')
print(f'repo_dir: {repo_dir}')

# Put both roots on the import path ahead of the project imports that follow
for import_root in (kit_dir, repo_dir):
    sys.path.append(import_root)

from src.document_retrieval import DocumentRetrieval
from utils.parsing.sambaparse import SambaParse, parse_doc_universal

# Paths under kit_dir: the kit's YAML config file and the directory used for the vector database.
# Components are passed separately so os.path.join picks the platform's separator.
CONFIG_PATH = os.path.join(kit_dir, 'config.yaml')
PERSIST_DIRECTORY = os.path.join(kit_dir, 'data', 'my-vector-db')

## Document loading and splitting

In [None]:
# Specify PDF folder location (joined with os.path.join rather than string concatenation)
pdf_folder = os.path.join(kit_dir, 'data', 'test')

# Initialize DocumentRetrieval class
documentRetrieval = DocumentRetrieval()

# Parse and chunk the documents; only the third returned value (the chunks) is used here
additional_metadata = {}  # empty: nothing extra is passed to the parser
_, _, text_chunks = parse_doc_universal(doc=pdf_folder, additional_metadata=additional_metadata)
print(f'Nb of chunks: {len(text_chunks)}')


## Vectorization and storage

In [None]:
# Load the embedding model that will be handed to the vector store
embeddings = documentRetrieval.load_embedding_model()

# Remove whatever already exists at PERSIST_DIRECTORY before rebuilding the store
if os.path.exists(PERSIST_DIRECTORY):
    shutil.rmtree(PERSIST_DIRECTORY)
    print("The directory Chroma has been deleted.")

# Create the vector store from the chunks, with PERSIST_DIRECTORY as its output_db.
# Alternative: pass output_db=None instead (presumably keeps the store in memory only;
# confirm against create_vector_store).
vectorstore = documentRetrieval.create_vector_store(
    text_chunks,
    embeddings,
    output_db=PERSIST_DIRECTORY,
)

## Retrieval and generation

In [None]:
# Create conversation chain
# The retriever is initialised from the vector store first; the QA retrieval chain is then
# obtained from the same DocumentRetrieval instance and kept in `conversation`.
documentRetrieval.init_retriever(vectorstore)
conversation = documentRetrieval.get_qa_retrieval_chain()

In [None]:
# Ask questions about your data
user_question = "What is a composition of experts?"

# Invoke the chain with the question, then echo the question and the answer from the response
response = conversation.invoke({"question": user_question})
print(response['question'])
print(response['answer'])

# List every source document in the response, numbered from 1
for source_number, source_document in enumerate(response['source_documents'], start=1):
    print(f'\nSource #{source_number}:')
    print(source_document.page_content)
    print(source_document.metadata)