In [1]:
import os
import shutil
import weaviate
from weaviate.classes.config import Configure
from langchain_community.document_loaders import PyPDFLoader, TextLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain_ollama import OllamaEmbeddings

In [2]:
# Folder layout: everything lives under DATA_DIR, uploads go in a sub-folder.
DATA_DIR = '../data'
UPLOAD_DIR = "uploads"
UPLOAD_PATH = os.path.join(DATA_DIR, UPLOAD_DIR)

# Create the upload folder (and any missing parents); a no-op if it exists.
os.makedirs(name=UPLOAD_PATH, exist_ok=True)

In [34]:
def process_files(file_path):
    """Load a single PDF, text or JSON file into a list of documents.

    The loader is picked from the file extension (compared case-insensitively):
    ``.pdf`` uses ``PyPDFLoader``, ``.txt`` uses ``TextLoader`` with UTF-8, and
    ``.json`` uses ``JSONLoader`` with the jq schema ``.[].answer``.

    Args:
        file_path: Path of the file to load. A bare file name without any
            directory part is accepted.

    Returns:
        The list of loaded documents, or ``None`` when the extension is not
        supported, loading raised an error, or the loader produced nothing.
    """
    # os.path.basename handles bare names and the platform's separators;
    # the previous rsplit('/', 1)[1] raised IndexError when no '/' was present.
    file_name = os.path.basename(file_path)
    lowered_name = file_name.lower()

    docs = []
    try:
        print(f"🔍 Processing file: {file_path}")

        if lowered_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith(".txt"):
            loader = TextLoader(file_path, encoding="utf-8")
        elif lowered_name.endswith(".json"):
            loader = JSONLoader(file_path, jq_schema='.[].answer')
        else:
            print(f"⚠️ Skipping unsupported file: {file_name}")
            return None

        docs.extend(loader.load())
        print(f"✅ Successfully loaded {file_name}")

    except Exception as e:
        # Best effort: report the failure and fall through to the empty check.
        print(f"❌ Error loading {file_name}: {str(e)}")

    if not docs:
        print("❌ No valid documents found!")
        return None

    return docs

In [35]:
# Load the solar-system JSON file that sits directly under DATA_DIR.
file_path = os.path.join(DATA_DIR, 'solar-system.json')
docs = process_files(file_path=file_path)

🔍 Processing file: ../data/solar-system.json
✅ Successfully loaded solar-system.json


In [37]:
# Show a sample of the loaded documents, followed by the total count.
display(docs[:15], len(docs))

[Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 1}, page_content='The reddish color is from the rocks that contain iron.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 2}, page_content='The moon casts a shadow on the Earth.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 3}, page_content='It is Uranus.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 4}, page_content='There are 7 large groups.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 5}, page_content='Jupiter, Saturn, Uranus, Neptune, and Pluto are the 5 outer planets.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 's

20

In [38]:
# Client for the local Weaviate instance (default connection settings).
weaviate_client = weaviate.connect_to_local()

# Ollama-served embedding model shared by indexing and querying.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [20]:
# Smoke test: embed one sentence and peek at the first three components.
input_text = "The meaning of life is 42"
vector = embeddings.embed_query(text=input_text)
print(vector[0:3])

[-0.0005777967, 0.009332829, 0.0055234805]


In [39]:
def update_vector_store(client, embeddings, documents, tenant=None,
                        chunk_size=500, chunk_overlap=50):
    """Split documents into chunks and build a Weaviate vector store from them.

    Args:
        client: Weaviate client passed to ``WeaviateVectorStore.from_documents``.
        embeddings: Embedding model passed to ``WeaviateVectorStore.from_documents``.
        documents: Documents to split and index.
        tenant: Optional tenant name forwarded to the vector store.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The ``WeaviateVectorStore`` created from the split documents.

    Raises:
        ValueError: If ``documents`` is ``None``.
    """
    # A failed load yields None; fail here with a clear message instead of
    # letting the splitter raise an opaque "NoneType is not iterable".
    if documents is None:
        raise ValueError(
            "documents is None: nothing to index (did loading the file fail?)"
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(documents)

    print("✅ text splitting done")

    db_with_mt = WeaviateVectorStore.from_documents(
        chunks, embeddings, client=client, tenant=tenant
    )

    print("✅ Knowledge base updated successfully!")
    return db_with_mt


In [40]:
# Index the loaded documents under the "solar-system" tenant.
db = update_vector_store(
    client=weaviate_client,
    embeddings=embeddings,
    documents=docs,
    tenant="solar-system",
)

✅ text splitting done


2025-Mar-01 10:32 PM - langchain_weaviate.vectorstores - INFO - Tenant solar-system does not exist in index LangChain_0b0b3243518a4c268131915f0ebe04cb. Creating tenant.


✅ Knowledge base updated successfully!


In [43]:
query = "Hottest planet"
# Keep the hits under their own name: rebinding `docs` here would replace the
# loaded source documents that the indexing cell passes to update_vector_store.
search_results = db.similarity_search(query, tenant="solar-system")

# Print the full text of each result.
for i, doc in enumerate(search_results):
    print(f"\nDocument {i+1}:")
    print(doc.page_content)


Document 1:
Venus is the hottest planet.

Document 2:
There are 4 planets made of gas.

Document 3:
It is made of rock and the others are gas planets.

Document 4:
Jupiter has a small core and has a thick layer of gas around it.


In [44]:
from langchain import hub
from pydantic import BaseModel
from typing import List, Dict

# Fetch the RAG prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Render it with placeholder values to inspect the final prompt text.
example_messages = prompt.invoke(
    dict(context="(context goes here)", question="(question goes here)")
).to_messages()

# The template is expected to render to exactly one message.
assert len(example_messages) == 1
print(example_messages[0].content)



You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: (question goes here) 
Context: (context goes here) 
Answer:


In [48]:
from pydantic import BaseModel
from typing import List, Dict, Optional

from langchain import hub
from langchain_ollama import ChatOllama


# Pull the RAG prompt template used by generate() below
# (same prompt id as the earlier inspection cell).
prompt = hub.pull("rlm/rag-prompt")


class State(BaseModel):
    """Data carried through the retrieve and generate steps.

    Only ``question`` and ``tenant`` have to be supplied up front; ``context``
    and ``answer`` start empty and are produced by the later steps.
    """
    question: str
    # Left as an untyped list: the retrieved items are accessed through
    # ``.page_content`` in generate(), so they are not plain dicts.
    # Explicit default so the field is optional to supply, not just nullable.
    context: Optional[List] = None
    # Empty until an answer has been generated.
    answer: str = ""
    tenant: str


# Chat model served by Ollama; temperature is pinned to 0.
llm = ChatOllama(model="llama3.2:3b", temperature=0)


def retrieve(state: State):
    """Search the module-level ``db`` store for documents similar to the question.

    Args:
        state: Mapping that provides the ``question`` text and the ``tenant``
            to search in.

    Returns:
        A dict with a single ``context`` key holding the search results.
    """
    question, tenant = state["question"], state['tenant']
    matches = db.similarity_search(question, tenant=tenant)
    return {"context": matches}


def generate(state: State):
    """Answer the question with the LLM, using the retrieved context.

    The ``page_content`` of every context document is joined with blank lines,
    inserted into the module-level ``prompt`` together with the question, and
    sent to the module-level ``llm``.

    Args:
        state: Mapping that provides ``question`` and ``context``. ``context``
            may be ``None`` or empty, in which case the prompt is rendered
            with an empty context string.

    Returns:
        A dict with a single ``answer`` key holding the model's reply text.
    """
    # The initial state carries context=None; treat that as "no documents"
    # instead of failing on iteration.
    retrieved = state["context"] or []
    docs_content = "\n\n".join(doc.page_content for doc in retrieved)
    messages = prompt.invoke(
        {"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}



In [49]:
# Initial state: only the question and the tenant are known up front.
state = dict(question="hottest planet", context=None, answer='', tenant="solar-system")

# Step 1: retrieval, then store the hits back on the state.
context = retrieve(state)
print("got the context!!")
state.update(context=context['context'])

# Step 2: generate the answer from the retrieved context.
print(generate(state)['answer'])

Venus is the hottest planet, with surface temperatures reaching as high as 462°C (863°F). It's primarily composed of rock and has a thick atmosphere that traps heat. This makes Venus significantly hotter than other planets in our solar system.


In [62]:
# Inspect the store's attributes (e.g. the index name it was created with).
print(vars(db))

{'_client': <weaviate.client.WeaviateClient object at 0x11e144ce0>, '_index_name': 'LangChain_0b0b3243518a4c268131915f0ebe04cb', '_embedding': OllamaEmbeddings(model='nomic-embed-text', base_url=None, client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None), '_text_key': 'text', '_query_attrs': ['text', 'source', 'seq_num'], 'relevance_score_fn': <function _default_score_normalizer at 0x11c233c40>, '_collection': <weaviate.collections.collection.sync.Collection object at 0x11c84bcb0>, '_multi_tenancy_enabled': True}


In [63]:
# Re-open the index that `db` created. The index name is taken from `db`
# rather than pasted from an earlier run's output: update_vector_store does
# not pass an index name, so a hardcoded "LangChain_<hex>" copy goes stale
# whenever the store is rebuilt.
# NOTE(review): `_index_name` is a private attribute of WeaviateVectorStore.
vec_db = WeaviateVectorStore(
    client=weaviate_client,
    index_name=db._index_name,
    text_key='text',
    embedding=embeddings
)

In [64]:
def retrieve(db, state: State):
    """Search the given store for documents similar to the question.

    Note: this redefinition replaces the earlier one-argument ``retrieve``;
    once this cell has run, ``retrieve(state)`` no longer works.

    Args:
        db: Store object exposing ``similarity_search``.
        state: Mapping that provides the ``question`` text and the ``tenant``
            to search in.

    Returns:
        A dict with a single ``context`` key holding the search results.
    """
    question, tenant = state["question"], state['tenant']
    matches = db.similarity_search(question, tenant=tenant)
    return {"context": matches}

In [67]:
# Fresh state for the run against the re-opened store.
state = dict(question="hottest planet", context=None, answer='', tenant="solar-system")

# The store is passed in explicitly this time.
context = retrieve(vec_db, state)
state.update(context=context['context'])

# Print the full text of each retrieved document.
for i, doc in enumerate(context['context']):
    print(f"\nDocument {i+1}:")
    print(doc.page_content)


Document 1:
Venus is the hottest planet.

Document 2:
There are 4 planets made of gas.

Document 3:
It is made of rock and the others are gas planets.

Document 4:
Jupiter has a small core and has a thick layer of gas around it.


In [68]:
# Generate and print the answer for the state filled in by the previous cell.
print(generate(state)["answer"])

Venus is the hottest planet, with surface temperatures reaching as high as 462°C (863°F). It is composed primarily of rock and metal, unlike the other gas giants in our solar system. Jupiter is not considered one of the four gas planets.
