## RAG (LangChain)

In [6]:
import os
from dotenv import load_dotenv
# Load variables from a .env file (if one exists) so the os.environ lookups
# further down (MILVUS_URI, HF_API_KEY) can be configured without exporting
# them in the shell.
load_dotenv()

from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):
    """LangChain loader that converts files to Markdown with Docling.

    Each configured source is run through a shared ``DocumentConverter`` and
    exposed as one LangChain document whose ``page_content`` is the Markdown
    export of the converted file.
    """

    def __init__(self, file_path: str | list[str]) -> None:
        """Remember the sources to convert and create the converter.

        Args:
            file_path: A single path (``str`` or ``os.PathLike``) or an
                iterable of paths (list, tuple, ...). An empty iterable is
                allowed and makes the loader yield nothing.
        """
        if isinstance(file_path, (str, os.PathLike)):
            # A lone path: wrap it so lazy_load can always iterate.
            self._file_paths = [file_path]
        else:
            # Copy into a fresh list so that later mutation of the caller's
            # collection cannot change what this loader converts, and so that
            # tuples/generators work as well as lists.
            self._file_paths = list(file_path)
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert the sources one at a time.

        Yields:
            One document per source, in the order the sources were given,
            holding the Markdown export of the converted file.
        """
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document
            yield LCDocument(page_content=dl_doc.export_to_markdown())


FILE_PATH = "docs/sample.pdf"  # the contract PDF that is indexed and queried below

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter: target size of a chunk and how
# much neighbouring chunks overlap. Named here so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

loader = DoclingPDFLoader(file_path=FILE_PATH)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Convert the file to Markdown documents, then cut them into overlapping
# chunks; `splits` is what gets embedded and stored in the vector store.
docs = loader.load()
splits = text_splitter.split_documents(docs)
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorise the chunks (and, through the retriever,
# the incoming questions).
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

from tempfile import TemporaryDirectory

from langchain_milvus import Milvus

# Location of the Milvus store. MILVUS_URI from the environment wins when it
# is set to a non-empty value; otherwise a database file inside a throwaway
# temporary directory is used.
MILVUS_URI = os.environ.get("MILVUS_URI")
# The TemporaryDirectory object is bound to a module-level name so the
# directory is not cleaned up while the store is in use. It is only created
# when the fallback is actually needed and stays None otherwise.
tmp_dir = None
if not MILVUS_URI:
    tmp_dir = TemporaryDirectory()
    MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"

# Embed every chunk and insert it into the store.
vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args={"uri": MILVUS_URI},
    # Start from a clean collection so re-running the cell does not pile up
    # duplicate chunks from earlier runs.
    drop_old=True,
)



Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 100395.57it/s]
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [7]:
from langchain_huggingface import HuggingFaceEndpoint

# Model served through the Hugging Face endpoint, and the token used to call
# it. HF_API_KEY is None when the environment variable is not set.
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
HF_API_KEY = os.getenv("HF_API_KEY")

llm = HuggingFaceEndpoint(
    huggingfacehub_api_token=HF_API_KEY,
    repo_id=HF_LLM_MODEL_ID,
)

from typing import Iterable

from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: Iterable[LCDocument]):
    """Join the text of the given documents into one string.

    The ``page_content`` of each document is taken in iteration order and the
    pieces are separated by a blank line.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Retriever view over the vector store, used to fetch context for a question.
retriever = vectorstore.as_retriever()

# Prompt with two slots: {context} for the retrieved text and {question} for
# the user's query. The template is split across lines for readability only;
# the adjacent literals concatenate to a single string.
prompt = PromptTemplate.from_template(
    "Context information is below.\n"
    "---------------------\n"
    "{context}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {question}\n"
    "Answer:\n"
)

# Pipeline: build the prompt inputs -> fill the template -> call the LLM ->
# parse the output to a string.
rag_chain = (
    {
        # The question is sent to the retriever and the hits are merged into one text block.
        "context": retriever | format_docs,
        # The question itself is forwarded unchanged.
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)


In [11]:
# Ask a question: the chain retrieves context for it, fills the prompt,
# calls the LLM and passes the result through the string output parser.
rag_chain.invoke("Explain Duties?")

'Under the Professional Services Agreement, the Consultant (represented by \\_\\_\\_\\_\\_\\_\\_ in this case) agrees to perform specific tasks and deliver results that are satisfactory to the Santa Cruz County Regional Transportation Commission (Commission). These tasks, along with the details of the work, are specified in Exhibit A: Scope of Services, which is incorporated into this Agreement. The Consultant will provide'

In [9]:
# Another question answered by the same retrieval-augmented chain.
rag_chain.invoke("What about insurance?")

'The Consultant is required to obtain and maintain certain insurance coverage for the full term of the Agreement and any extensions. This includes:\n\n1) Comprehensive or Commercial General Liability Insurance in the minimum amount of one million dollars ($1,000,000) combined single limit (CSL), including coverage for bodily injury, personal injury, broad form property damage, contractual liability, and cross-liability.\n\n2) If requested by the Commission, Professional Liability Insurance in the minimum amount of one million dollars ($1,000,000) combined single limit.\n\n3) If any insurance policy of the Consultant includes language conditioning the insurer\'s legal obligation to defend or indemnify the Commission, the policy shall also name the Commission as a named insured.\n\n4) The Consultant shall do all things required to be performed by it pursuant to its insurance policies, including paying all deductibles and self-insured retentions (SIR) required to be paid under any insuran