# Q&A with RAG

## Pre-requisites

In [1]:
# Import libraries
import os
import sys
import logging
import pickle # remove
import shutil
from PyPDF2 import PdfReader
from langchain_community.document_loaders import UnstructuredPDFLoader

# Resolve the kit and repository roots from the notebook's working directory
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))  # absolute path for ekr_rag directory
repo_dir = os.path.abspath(
    os.path.join(kit_dir, os.pardir, os.pardir, os.pardir)
)  # absolute path for starter-kit directory
print('kit_dir: %s' % kit_dir)
print('repo_dir: %s' % repo_dir)

# Both roots must be on sys.path before the project import below can resolve
sys.path.extend([kit_dir, repo_dir])

from src.document_retrieval import DocumentRetrieval

# Locations used by the rest of the notebook
CONFIG_PATH = os.path.join(kit_dir, 'config.yaml')
PERSIST_DIRECTORY = os.path.join(kit_dir, "data/my-vector-db")

kit_dir: /Users/varunk/repo/ai-starter-kit/workshops/ai_engineer_2024/ekr_rag
repo_dir: /Users/varunk/repo/ai-starter-kit


## Document loading and splitting

In [2]:
# Specify PDF file
pdf_file = os.path.join(kit_dir, 'data', 'tmp', 'SN40LPaper2Pages.pdf')

# Initialize DocumentRetrieval class
documentRetrieval = DocumentRetrieval()
# Read the configured PDF loader name once; every branch below compares against it
pdf_loader_name = documentRetrieval.loaders['pdf']
print('Loader: %s'%pdf_loader_name)

# Get pdf text: raw_text[k] and meta_data[k] describe the same extracted piece
raw_text = []
meta_data = []
if pdf_loader_name == "unstructured":
    loader = UnstructuredPDFLoader(pdf_file)
    docs_unstructured = loader.load()
    for doc in docs_unstructured:
        raw_text.append(doc.page_content)
        meta_data.append({"filename": pdf_file})
elif pdf_loader_name == "pypdf2":
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        raw_text.append(page.extract_text())
        meta_data.append({"filename": pdf_file})
else:
    # There is no `self` at notebook top level: referencing it here used to raise
    # NameError and hide the real problem, so report the loader name directly.
    raise ValueError(f"{pdf_loader_name} is not a valid pdf loader")

# Get the text chunks
text_chunks = documentRetrieval.get_text_chunks_with_metadata(docs=raw_text, meta_data=meta_data) # lst of langchain_core.documents.base.Document
print('Nb of chunks: %d'%len(text_chunks))


Loader: pypdf2
Nb of chunks: 12


## Vectorization and storage

In [3]:
# Load the embedding model that will vectorize the chunks
embeddings = documentRetrieval.load_embedding_model()

# Remove any directory left at PERSIST_DIRECTORY by a previous run
if os.path.exists(PERSIST_DIRECTORY):
    shutil.rmtree(PERSIST_DIRECTORY)
    print("The directory Chroma has been deleted.")

# Create vector store, passing PERSIST_DIRECTORY as the output location
vectorstore = documentRetrieval.create_vector_store(
    text_chunks,
    embeddings,
    output_db=PERSIST_DIRECTORY,
)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

2024-06-24 14:17:46,714 [INFO] - Load pretrained SentenceTransformer: intfloat/e5-large-v2


load INSTRUCTOR_Transformer


2024-06-24 14:17:48,794 [INFO] - Use pytorch device: cpu


max_seq_length  512
The directory Chroma has been deleted.


2024-06-24 14:17:52,183 [INFO] - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-06-24 14:18:11,095 [INFO] - Vector store saved to /Users/varunk/repo/ai-starter-kit/workshops/ai_engineer_2024/ekr_rag/data/my-vector-db


## Retrieval and generation

In [4]:
# Create conversation chain
# The retriever is initialised from the vector store first; the chain returned by
# get_qa_retrieval_chain() is what later cells call with a {"question": ...} dict.
documentRetrieval.init_retriever(vectorstore)
conversation = documentRetrieval.get_qa_retrieval_chain()

In [5]:
# Ask questions about your data
user_question = "What is a monolithic model?"

# Run the chain; the response is read back through its 'question', 'answer'
# and 'source_documents' keys
response = conversation.invoke({"question": user_question})
print(response['question'])
print(response['answer'])

# Show each returned source document (numbered from 1) with its text and metadata
for i, source_doc in enumerate(response['source_documents']):
    print('\nSource #%d:'%(i+1))
    print(source_doc.page_content)
    print(source_doc.metadata)

What is a monolithic model?
A monolithic model is a large language model that has billions or trillions of parameters and is trained with curated datasets that consist of trillions of tokens scraped from the web.

Source #1:
Source: /Users/varunk/repo/ai-starter-kit/workshops/ai_engineer_2024/ekr_rag/data/tmp/SN40LPaper2Pages, Text: 
from the web. However, training and serving a state-of-
the-art monolithic LLM is both an extraordinarily expensive
affair and a complex systems engineering challenge. Training
requires building and operating a supercomputer composed ofthousands of hosts, purpose-built networks, power and cooling
infrastructure, and thousands of accelerators – typically
GPUs [ 29], [30] or TPUs [ 46]–[49]. The prohibitive cost
and expertise required to train and serve 100s of billions of
parameters put state-of-the-art AI capabilities out of reach
for many academic researchers and smaller organizations,
especially when on-premise deployments are needed. For
instance, compu