In [14]:
import os
import shutil
import weaviate
from weaviate.classes.config import Configure
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain_ollama import OllamaEmbeddings

In [15]:
# Root folder for the notebook's data, relative to the working directory.
DATA_DIR = "../data"
# Name of the sub-folder that holds the files to ingest.
UPLOAD_DIR = "uploads"
# Full path of the upload folder.
UPLOAD_PATH = os.path.join(DATA_DIR, UPLOAD_DIR)

# Create the folder (and any missing parents); a no-op when it already exists.
os.makedirs(UPLOAD_PATH, exist_ok=True)

In [16]:
def process_files(upload_path=None):
    """Load every supported document found in the upload folder.

    ``.pdf`` files are read with ``PyPDFLoader`` and ``.txt`` files with
    ``TextLoader`` (UTF-8). Extensions are matched case-insensitively.
    Entries that are not regular files, or that have any other extension,
    are skipped. Loading is best-effort: a file that fails to load is
    reported and skipped so one bad file does not abort the whole run.

    Args:
        upload_path: Folder to scan. Defaults to the notebook-level
            ``UPLOAD_PATH`` when omitted.

    Returns:
        list: The loaded documents, in file-name order. The list is empty
        when nothing could be loaded, so callers can always slice it or
        take its ``len()``.
    """
    if upload_path is None:
        upload_path = UPLOAD_PATH

    docs = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(upload_path)):
        file_path = os.path.join(upload_path, file_name)

        try:
            print(f"🔍 Processing file: {file_path}")

            lowered_name = file_name.lower()
            if not os.path.isfile(file_path):
                # Sub-directories (even ones named like "x.pdf") are not loadable.
                loader = None
            elif lowered_name.endswith(".pdf"):
                loader = PyPDFLoader(file_path)
            elif lowered_name.endswith(".txt"):
                loader = TextLoader(file_path, encoding="utf-8")
            else:
                loader = None

            if loader is None:
                print(f"⚠️ Skipping unsupported file: {file_name}")
                continue

            loaded_docs = loader.load()
            print(f"✅ Successfully loaded {file_name}")

            docs.extend(loaded_docs)

        except Exception as e:
            # Deliberately broad: report the failure and move on to the next file.
            print(f"❌ Error loading {file_name}: {str(e)}")

    if not docs:
        print("❌ No valid documents found!")

    # Always a list (possibly empty), never None.
    return docs

In [17]:
# Load every supported file from the upload folder into `docs`.
docs = process_files()

🔍 Processing file: ../data/uploads/genetics101.pdf
✅ Successfully loaded genetics101.pdf


In [18]:
# Preview the first 15 loaded documents, then show how many were loaded in total.
display(docs[:15], len(docs))

[Document(metadata={'producer': 'Adobe Acrobat Pro DC 20.6.20042', 'creator': 'Adobe Acrobat Pro DC 20.6.20042', 'creationdate': '2020-04-27T13:33:09-04:00', 'moddate': '2020-04-27T15:49:48-04:00', 'title': '', 'source': '../data/uploads/genetics101.pdf', 'total_pages': 265, 'page': 0, 'page_label': ''}, page_content=''),
 Document(metadata={'producer': 'Adobe Acrobat Pro DC 20.6.20042', 'creator': 'Adobe Acrobat Pro DC 20.6.20042', 'creationdate': '2020-04-27T13:33:09-04:00', 'moddate': '2020-04-27T15:49:48-04:00', 'title': '', 'source': '../data/uploads/genetics101.pdf', 'total_pages': 265, 'page': 1, 'page_label': '1'}, page_content='GENETICS 101'),
 Document(metadata={'producer': 'Adobe Acrobat Pro DC 20.6.20042', 'creator': 'Adobe Acrobat Pro DC 20.6.20042', 'creationdate': '2020-04-27T13:33:09-04:00', 'moddate': '2020-04-27T15:49:48-04:00', 'title': '', 'source': '../data/uploads/genetics101.pdf', 'total_pages': 265, 'page': 2, 'page_label': '2'}, page_content=''),
 Document(meta

265

In [19]:
# Connect to the local Weaviate instance. The connection stays open for the
# remaining cells; call `weaviate_client.close()` once the notebook is finished.
weaviate_client = weaviate.connect_to_local()

# Embedding model, requested from Ollama by model name.
embeddings = OllamaEmbeddings(model="llama3.2:3b")

            Please make sure to close the connection using `client.close()`.
  embeddings = OllamaEmbeddings(


In [20]:
# Smoke test: embed one sentence and print the first three components of its vector.
input_text = "The meaning of life is 42"
vector = embeddings.embed_query(input_text)
print(vector[:3])

[-0.0005777967, 0.009332829, 0.0055234805]


In [None]:
def update_vector_store(client, embeddings, documents, tenant=None,
                        chunk_size=500, chunk_overlap=50):
    """Split documents into chunks and index them in a Weaviate vector store.

    The documents are split with ``RecursiveCharacterTextSplitter`` and the
    resulting chunks are handed, together with the embedding model, to
    ``WeaviateVectorStore.from_documents``.

    Args:
        client: Weaviate client passed to ``WeaviateVectorStore.from_documents``.
        embeddings: Embedding model passed to ``from_documents``.
        documents: Documents to split and index.
        tenant: Optional tenant name, passed through to ``from_documents``.
        chunk_size: Splitter ``chunk_size`` (default 500, the value previously
            hard-coded here).
        chunk_overlap: Splitter ``chunk_overlap`` (default 50, the value
            previously hard-coded here).

    Returns:
        The ``WeaviateVectorStore`` returned by ``from_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    # Named `chunks` so the notebook-level `docs` is not shadowed inside the function.
    chunks = text_splitter.split_documents(documents)

    print("✅ text splitting done")

    vector_store = WeaviateVectorStore.from_documents(
        chunks, embeddings, client=client, tenant=tenant
    )

    print("✅ Knowledge base updated successfully!")
    return vector_store


In [22]:
# %%time
# Build the vector store for the "genetics" tenant. The query cells below read
# `db`, so this assignment must run on a fresh kernel.
# NOTE(review): the logged index name suggests each run creates a new index —
# confirm before re-running against a populated Weaviate instance.
db = update_vector_store(client=weaviate_client, embeddings=embeddings, documents=docs, tenant="genetics")

2025-Mar-01 07:23 PM - langchain_weaviate.vectorstores - INFO - Tenant genetics does not exist in index LangChain_a685b460f26b46478fbe90d4016ee64d. Creating tenant.


✅ Knowledge base updated successfully!


In [26]:
query = "What is RNA?"
# Keep the hits under their own name: rebinding `docs` here would overwrite the
# documents loaded by process_files().
results = db.similarity_search(query, tenant="genetics")

# Print the first 100 characters of each result
for i, doc in enumerate(results, start=1):
    print(f"\nDocument {i}:")
    print(doc.page_content[:100] + "...")


Document 1:
scribed into an mRNA (messenger RNA), and from there a ribo -
some translates the instructions into ...

Document 2:
actually translating the genetic code of A’s, U’s, G’s, and C’s into 
protein. They can do this beca...

Document 3:
strand of DNA. First, let’s think about when and where this happens....

Document 4:
a hybrid between a cocker span -
iel and a poodle.
 Hydrogen bond
 An attraction between mol -
ecule...


In [27]:
from pydantic import BaseModel
from typing import List, Dict, Optional

from langchain import hub
from langchain_ollama import ChatOllama


# Prompt template "rlm/rag-prompt" pulled from the LangChain hub; generate() fills in
# its "question" and "context" inputs.
prompt = hub.pull("rlm/rag-prompt")


from typing import Any, TypedDict


class State(TypedDict):
    """State dictionary shared by the `retrieve` and `generate` steps.

    Declared as a ``TypedDict`` because both steps read it with
    ``state[...]`` and are called with plain dicts; a pydantic model
    instance would not support that subscript access.
    """

    question: str  # question to answer
    context: Optional[List[Any]]  # documents returned by retrieve(); None before retrieval
    answer: str  # generated answer; empty string until generate() has run
    tenant: str  # tenant name forwarded to the similarity search


# Chat model used by generate(); temperature is set to 0.
llm = ChatOllama(model="llama3.2:3b", temperature=0)


def retrieve(state: State):
    """Look up the documents most similar to the question in ``state``.

    Runs ``db.similarity_search`` with ``state["question"]``, restricted to
    ``state["tenant"]``, and returns the hits under the ``"context"`` key.
    """
    question, tenant = state["question"], state["tenant"]
    return {"context": db.similarity_search(question, tenant=tenant)}


def generate(state: State):
    """Answer ``state["question"]`` from the documents in ``state["context"]``.

    The page contents of the context documents are joined with blank lines,
    inserted into the prompt together with the question, and sent to the
    LLM. The response text is returned under the ``"answer"`` key.
    """
    page_texts = [doc.page_content for doc in state["context"]]
    prompt_input = {"question": state["question"], "context": "\n\n".join(page_texts)}
    response = llm.invoke(prompt.invoke(prompt_input))
    return {"answer": response.content}



In [31]:
%%time

# Initial state as a plain dict: no context and no answer yet.
state = {'question': "what is the shape of DNA?", 'context': None, 'answer': '', 'tenant': "genetics"}

# Step 1: retrieve the matching documents and store them in the state.
context = retrieve(state)
print("got the context!!")
state['context'] = context['context']

# Step 2: generate the answer from the question and the retrieved context.
print(generate(state)['answer'])

got the context!!
I don't know the shape of DNA. The text describes a spiral staircase, but doesn't explicitly state what the shape of DNA is. It does mention that the resulting shape is called a double helix, which implies a specific structure, but it's not directly stated in this context.
CPU times: user 58 ms, sys: 41.2 ms, total: 99.2 ms
Wall time: 1min 52s
