In [5]:
#python3 -m venv cenv
# source cenv/bin/activate
# source ~/.zshrc

In [4]:
import hashlib
from pathlib import Path

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader


In [5]:
# Relative path of the folder handed to load_documents() when the index is built.
DATA_DIR = "data"

def load_documents(directory):
    """Load the PDF files found in ``directory`` as LangChain documents.

    Args:
        directory: Folder that is searched with the ``*.pdf`` glob pattern.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(
        directory,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    loaded = pdf_loader.load()
    return loaded

def chunk_documents(documents, chunk_size=1000, overlap=50):
    """Split ``documents`` into smaller pieces for embedding.

    Args:
        documents: Documents to split (e.g. the output of ``load_documents``).
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = text_splitter.split_documents(documents)
    return pieces

def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used for indexing and querying.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return embedding_model

def store_chroma(docs, embeddings, persist_dir="./chroma_db"):
    """Embed ``docs`` and write them to a persistent Chroma store.

    Every chunk gets a deterministic id derived from its source, page and
    text. Without explicit ids each call inserts the chunks under new random
    ids, so re-running ingestion against the same ``persist_dir`` stores every
    chunk again and similarity search returns identical hits several times.
    With stable ids a re-run overwrites the existing entries instead.
    Entries written earlier under random ids are not removed; delete
    ``persist_dir`` once to get rid of them.

    Args:
        docs: Iterable of documents exposing ``page_content`` and ``metadata``.
        embeddings: Embedding function handed to Chroma.
        persist_dir: Directory in which Chroma persists the collection.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    docs = list(docs)

    occurrences = {}
    ids = []
    for doc in docs:
        metadata = getattr(doc, "metadata", None) or {}
        fingerprint = "\x1f".join(
            (
                str(metadata.get("source", "")),
                str(metadata.get("page", "")),
                doc.page_content,
            )
        )
        digest = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
        # Identical chunks inside one batch (e.g. repeated page headers) still
        # need distinct ids, so append a per-digest occurrence counter.
        nth = occurrences.get(digest, 0)
        occurrences[digest] = nth + 1
        ids.append(f"{digest}-{nth}")

    vectorstore = Chroma.from_documents(
        docs,
        embeddings,
        ids=ids,
        persist_directory=persist_dir,
    )
    return vectorstore

def load_chroma(persist_dir="./chroma_db", embeddings=None):
    """Construct a Chroma vector store bound to an existing persist directory.

    Args:
        persist_dir: Passed to Chroma as ``persist_directory``.
        embeddings: Passed to Chroma as ``embedding_function``.
            NOTE(review): presumably this should be the same embedding model
            that was used when the store was built — confirm before relying
            on the ``None`` default for querying.

    Returns:
        The ``Chroma`` instance.
    """
    store_kwargs = dict(persist_directory=persist_dir, embedding_function=embeddings)
    return Chroma(**store_kwargs)

def query_chroma(query, vectorstore, k=3):
    """Return the chunks most similar to ``query``.

    Args:
        query: Text to search for.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        k: Number of hits requested from the store. Defaults to 3, the value
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        A list of ``(metadata, page_content)`` tuples, one per hit, in the
        order the store returned them.
    """
    hits = vectorstore.similarity_search(query, k=k)
    return [(hit.metadata, hit.page_content) for hit in hits]


In [11]:
# rm -rf ./chroma_db
# lsof | grep chroma_db

In [6]:
# Build the Chroma index only when it does not exist yet. Re-running this cell
# used to re-embed the whole corpus, insert every chunk into the same persist
# directory again, and then re-open the store that had just been created.
# Delete CHROMA_DIR to force a rebuild (e.g. after changing the PDFs in
# DATA_DIR or the chunking settings).
CHROMA_DIR = "./chroma_db"

# Generate embeddings
embeddings = get_embeddings()

chroma_path = Path(CHROMA_DIR)
index_exists = chroma_path.is_dir() and any(chroma_path.iterdir())

if index_exists:
    # Load ChromaDB for querying
    vectorstore = load_chroma(persist_dir=CHROMA_DIR, embeddings=embeddings)
else:
    # Load and process data
    documents = load_documents(DATA_DIR)
    chunks = chunk_documents(documents)

    # Store in ChromaDB
    vectorstore = store_chroma(chunks, embeddings, persist_dir=CHROMA_DIR)

  return Chroma(persist_directory=persist_dir, embedding_function=embeddings)


In [7]:
# Show the vector store object as a quick check that the previous cell produced one.
display(vectorstore)

<langchain_community.vectorstores.chroma.Chroma at 0x11a924a50>

In [8]:
# Retrieval-only smoke test: fetch the closest chunks for a sample question.
query = "what is Pinus greggii"
results = query_chroma(query, vectorstore)

In [9]:
# Inspect the (metadata, page_content) pairs returned by query_chroma.
results

[({'author': 'umax',
   'creationdate': 'D:20001016124829',
   'creator': 'Adobe PageMaker 6.0',
   'keywords': '',
   'moddate': 'D:20001019130050',
   'page': 88,
   'page_label': '89',
   'producer': 'Acrobat Distiller 2.1 for Power Macintosh',
   'source': 'data/Camcore_BookAllChapters.pdf',
   'subject': '',
   'title': 'Final camcore book pt 1',
   'total_pages': 250},
  'PINUS GREGGII\n73'),
 ({'author': 'umax',
   'creationdate': 'D:20001016124829',
   'creator': 'Adobe PageMaker 6.0',
   'keywords': '',
   'moddate': 'D:20001019130050',
   'page': 88,
   'page_label': '89',
   'producer': 'Acrobat Distiller 2.1 for Power Macintosh',
   'source': 'data/Camcore_BookAllChapters.pdf',
   'subject': '',
   'title': 'Final camcore book pt 1',
   'total_pages': 250},
  'PINUS GREGGII\n73'),
 ({'author': 'umax',
   'creationdate': 'D:20001016124829',
   'creator': 'Adobe PageMaker 6.0',
   'keywords': '',
   'moddate': 'D:20001019130050',
   'page': 68,
   'page_label': '69',
   'prod

In [10]:
# from langchain.prompts import PromptTemplate
#
# # Define the Prompt Template
# prompt = PromptTemplate(
#     input_variables=["context", "question"],
#     template="Use the following context to answer the question:\n\n{context}\n\nQuestion: {question}\nAnswer:"
# )

In [1]:
from langchain_ollama import OllamaLLM

# Model name passed to OllamaLLM when get_llm() is called without arguments.
LLM_MODEL = "deepseek-llm"


def get_llm(model_name=LLM_MODEL):
    """Create an Ollama-backed LLM client.

    Args:
        model_name: Value passed to ``OllamaLLM`` as ``model``.

    Returns:
        The ``OllamaLLM`` instance.
    """
    llm_client = OllamaLLM(model=model_name)
    return llm_client

# RAG Pipeline Function
def rag_pipeline(query, vectorstore, llm=None):
    """Answer ``query`` with an LLM, using retrieved chunks as context.

    Args:
        query: The user's question.
        vectorstore: Store that ``query_chroma`` searches for relevant chunks.
        llm: Optional object exposing ``invoke(prompt)``. When omitted a client
            is created with ``get_llm()``, as before. Passing one in lets
            callers reuse a single client across many queries instead of
            constructing a new one on every call.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    retrieved_docs = query_chroma(query, vectorstore)
    # query_chroma yields (metadata, text) pairs; only the text goes into the prompt.
    context = "\n".join(text for _metadata, text in retrieved_docs)

    if llm is None:
        llm = get_llm()

    prompt = f"Answer the question based on the following context:\n{context}\n\nQuestion: {query}"
    return llm.invoke(prompt)

# TODO: grid search over chunk size (hyperparameter tuning)

ModuleNotFoundError: No module named 'ollama'

In [33]:
# End-to-end check: retrieve context for a sample question, send it to the LLM, and print the reply.
query = "What is Pinus greggii?"
response = rag_pipeline(query, vectorstore)
print(response)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10577ee40>>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt

