In [1]:
import os
import shutil

from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader

In [2]:
# Directory that holds the source files to ingest.
DATA_PATH = "data"
# Directory where the Chroma vector store is written.
CHROMA_PATH = "chroma"

In [4]:
# Build a loader for the files matching *.pdf in DATA_PATH, parsing each one
# with PyPDFLoader, then read them all into memory.
loader = DirectoryLoader(
    DATA_PATH,
    loader_cls=PyPDFLoader,
    glob="*.pdf",
)
documents = loader.load()

In [7]:
# Sanity check: how many Document objects the loader returned.
len(documents)

17

In [8]:
# Splitter configuration. Sizes are counted with len(), i.e. in characters.
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n", "\n", ".", ",", " ", ""],
    add_start_index=True,
)

In [9]:
# Break the loaded documents into chunks using the splitter configured above.
chunks = text_splitter.split_documents(documents)

In [11]:
# Start from a clean slate: remove any existing store at CHROMA_PATH so the
# next cell rebuilds it from scratch. This deletes the whole directory tree.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

In [15]:
# OpenAIEmbeddings() needs the OpenAI API key to be available. Load .env here
# so this cell works on a fresh kernel; otherwise the only load_dotenv() call
# in the notebook sits in a later cell, after the embeddings are requested.
# By default load_dotenv() leaves already-set environment variables untouched.
load_dotenv()

# Embed every chunk and build the Chroma store under CHROMA_PATH.
db = Chroma.from_documents(
    chunks,
    OpenAIEmbeddings(),
    persist_directory=CHROMA_PATH,
)

In [16]:
# Flush the vector store to disk and report how many chunks were written.
# NOTE(review): newer Chroma releases persist automatically and deprecate
# persist() — confirm against the installed version before removing this call.
db.persist()
print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

Saved 73 chunks to chroma.


# LLM

In [17]:
from dataclasses import dataclass
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import os
from dotenv import load_dotenv

In [18]:
# Load environment variables from a .env file, if one is present.
load_dotenv()
# Same value as the CHROMA_PATH defined in the ingestion section; repeated
# here so this section can be run without the ingestion cells.
CHROMA_PATH = "chroma"

# Prompt with two placeholders: {context} and {question}.
# Presumably filled in through ChatPromptTemplate (imported above) — the
# code that uses it is not visible in this part of the notebook.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [19]:
# Read the user's question interactively and echo it back.
# NOTE(review): input() blocks until someone types, so an unattended
# "Run All" will stall here — consider a config constant for batch runs.
query_text = input("Enter your question: \n")
print(query_text)

A
