In [9]:
import os
import shutil
import faiss
from langchain_community.document_loaders import PyPDFLoader, TextLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_ollama import OllamaEmbeddings



In [10]:
# Base folder for input data, relative to the notebook's working directory.
DATA_DIR = "../data"
# Name of the sub-folder (inside DATA_DIR) used for uploads.
UPLOAD_DIR = "uploads"
UPLOAD_PATH = os.path.join(DATA_DIR, UPLOAD_DIR)

# Create the upload folder up front; exist_ok keeps a re-run of this cell from failing.
os.makedirs(UPLOAD_PATH, exist_ok=True)

In [11]:
def process_files(file_path):
    """Load a single PDF, text, or JSON file into a list of documents.

    The loader is picked from the file extension, matched case-insensitively:
    ``.pdf`` uses PyPDFLoader, ``.txt`` uses TextLoader (UTF-8), and ``.json``
    uses JSONLoader with the jq expression ``.[].answer``, i.e. only the
    ``answer`` field of each top-level entry is kept.

    Args:
        file_path: Path of the file to load. A bare file name without any
            directory component is accepted.

    Returns:
        The list of loaded documents, or ``None`` when the extension is
        unsupported, loading raised an exception, or nothing was loaded.
        Problems are reported with ``print`` instead of being raised.
    """
    docs = []

    # os.path.basename handles bare file names and the platform separator;
    # splitting on '/' raised IndexError for a path that contained no slash.
    file_name = os.path.basename(file_path)
    lowered_name = file_name.lower()

    try:
        print(f"🔍 Processing file: {file_path}")

        if lowered_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith(".txt"):
            loader = TextLoader(file_path, encoding="utf-8")
        elif lowered_name.endswith(".json"):
            loader = JSONLoader(file_path, jq_schema='.[].answer')
        else:
            print(f"⚠️ Skipping unsupported file: {file_name}")
            return None

        loaded_docs = loader.load()
        print(f"✅ Successfully loaded {file_name}")

        docs.extend(loaded_docs)

    except Exception as e:
        # Deliberately best-effort: report the failure and fall through to the
        # empty-result handling below instead of aborting the notebook.
        print(f"❌ Error loading {file_name}: {str(e)}")

    if not docs:
        print("❌ No valid documents found!")
        return None

    return docs

In [12]:
# Load the solar-system sample file from DATA_DIR; `docs` is None if loading failed.
file_path = os.path.join(DATA_DIR, "solar-system.json")
docs = process_files(file_path=file_path)

🔍 Processing file: ../data/solar-system.json
✅ Successfully loaded solar-system.json


In [13]:
# Show the first 15 loaded documents, followed by the total count.
display(docs[:15], len(docs))

[Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 1}, page_content='The reddish color is from the rocks that contain iron.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 2}, page_content='The moon casts a shadow on the Earth.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 3}, page_content='It is Uranus.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 4}, page_content='There are 7 large groups.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 'seq_num': 5}, page_content='Jupiter, Saturn, Uranus, Neptune, and Pluto are the 5 outer planets.'),
 Document(metadata={'source': '/Users/anurags/Projects/PersonalProjects/book-rag/data/solar-system.json', 's

20

In [14]:
# Embedding model used both for indexing documents and for embedding queries.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Embed a throwaway string once to learn the vector length, then size the
# flat L2 index to match it.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# Start from an empty store: no documents and no index-to-id mappings yet.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [15]:
# Sanity check: embed one sentence and look at its first three components.
input_text = "The meaning of life is 42"
vector = embeddings.embed_query(input_text)
print(vector[0:3])

[0.008603463, -0.008584372, -0.15307632]


In [16]:
def update_vector_store(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into chunks and add them to the module-level ``vector_store``.

    Args:
        documents: Documents to index. ``None`` or an empty collection is
            accepted and leaves the store untouched.
        chunk_size: Maximum chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between neighbouring chunks passed to the splitter.

    Returns:
        The module-level ``vector_store`` (the same object that was updated).
    """
    # process_files returns None when nothing could be loaded; skip the work
    # instead of failing inside the splitter.
    if not documents:
        print("⚠️ No documents to add; knowledge base unchanged.")
        return vector_store

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(documents)

    print("✅ text splitting done")

    # The value returned by add_documents is not needed here.
    vector_store.add_documents(documents=chunks)

    print("✅ Knowledge base updated successfully!")
    return vector_store


In [17]:
# Index the loaded documents.
# NOTE(review): re-running this cell presumably adds the same chunks again — confirm.
vector_store = update_vector_store(docs)

✅ text splitting done
✅ Knowledge base updated successfully!


In [19]:
query = "Hottest planet"
retrieved_docs = vector_store.similarity_search(query)

# Print the full text of every hit, numbered from 1.
for position, hit in enumerate(retrieved_docs, start=1):
    print(f"\nDocument {position}:")
    print(hit.page_content)


Document 1:
Venus is the hottest planet.

Document 2:
There are 4 planets made of gas.

Document 3:
It is made of rock and the others are gas planets.

Document 4:
Jupiter has a small core and has a thick layer of gas around it.


In [20]:
from langchain import hub
from pydantic import BaseModel
from typing import List, Dict

# Fetch the "rlm/rag-prompt" template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Render it with placeholder values to inspect the wording.
example_messages = prompt.invoke(
    {"question": "(question goes here)", "context": "(context goes here)"}
).to_messages()

# The rendered prompt is expected to consist of exactly one message.
assert len(example_messages) == 1
print(example_messages[0].content)



You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: (question goes here) 
Context: (context goes here) 
Answer:


In [26]:
from pydantic import BaseModel
from typing import List, Dict, Optional

from langchain import hub
from langchain_ollama import ChatOllama


# Pulls the same "rlm/rag-prompt" template that an earlier cell already assigned to `prompt`.
prompt = hub.pull("rlm/rag-prompt")


class State(BaseModel):
    """Data carried through the retrieve and generate steps of the RAG flow.

    ``context`` and ``answer`` have defaults so a state can be created from
    just a question and a tenant, before retrieval or generation has run.
    """

    # The user's question; used as the similarity-search query.
    question: str
    # Retrieved documents, or None before retrieval. The element type is left
    # open because retrieve() stores the search results as returned (document
    # objects, not plain dicts).
    context: Optional[List] = None
    # Generated answer text; empty until generation has run.
    answer: str = ""
    # Not read by any function visible in this notebook.
    # NOTE(review): presumably identifies which knowledge base to query — confirm.
    tenant: str


# Chat model used by generate() and stream(); temperature is pinned to 0.
llm = ChatOllama(temperature=0, model="llama3.2:3b")


def retrieve(vector_store, state: State):
    """Look up the documents most similar to the question held in ``state``.

    Args:
        vector_store: Object exposing ``similarity_search(query)``.
        state: Either a plain dict with a ``"question"`` key (what the
            notebook cells pass) or an object with a ``question`` attribute,
            such as a ``State`` instance.

    Returns:
        A dict of the form ``{"context": <search results>}``.
    """
    # A State model instance is not subscriptable, so only index into real
    # dicts and fall back to attribute access otherwise.
    question = state["question"] if isinstance(state, dict) else state.question
    return {"context": vector_store.similarity_search(question)}


def _build_messages(state):
    """Render the module-level RAG ``prompt`` from the question and context in ``state``."""
    # Accept both plain dicts and attribute-style objects (e.g. a State
    # instance, which is not subscriptable).
    if isinstance(state, dict):
        question, context = state["question"], state["context"]
    else:
        question, context = state.question, state.context

    # context is None before retrieval has run; treat that as "no context".
    docs_content = "\n\n".join(doc.page_content for doc in (context or []))
    return prompt.invoke({"question": question, "context": docs_content})


def generate(state: State):
    """Answer the question in ``state`` with a single, blocking LLM call.

    Args:
        state: Dict or object providing ``question`` and ``context``; each
            context item must expose ``page_content``.

    Returns:
        A dict of the form ``{"answer": <response text>}``.
    """
    response = llm.invoke(_build_messages(state))
    return {"answer": response.content}


def stream(state: State):
    """Answer the question in ``state``, yielding LLM output as it is produced.

    Args:
        state: Dict or object providing ``question`` and ``context``; each
            context item must expose ``page_content``.

    Yields:
        The items produced by ``llm.stream`` for the rendered prompt, unchanged.
    """
    yield from llm.stream(_build_messages(state))



In [23]:
# One-shot run: retrieve context for the question, then print the complete answer.
state = dict(question="hottest planet", context=None, answer="", tenant="solar-system")

context = retrieve(vector_store, state)
print("got the context!!")
state["context"] = context["context"]

print(generate(state)["answer"])

got the context!!
Venus is the hottest planet, with surface temperatures reaching as high as 462°C (863°F). It is composed primarily of rock and metal, unlike the other gas giants in our solar system. Jupiter is not considered the hottest planet.


In [28]:
# Streaming run: same retrieval as the one-shot run, but the answer is printed piece by piece.
state = {
    "question": "hottest planet",
    "context": None,
    "answer": "",
    "tenant": "solar-system",
}

context = retrieve(vector_store, state)
print("got the context!!")
state["context"] = context["context"]

# Emit each streamed piece immediately, with no newline between pieces.
for resp in stream(state):
    print(resp.content, end="", flush=True)

got the context!!
Venus is the hottest planet, with surface temperatures reaching as high as 462°C (863°F). It's primarily composed of rock and has a thick atmosphere that traps heat. This makes Venus the hottest planet in our solar system.

In [31]:
%%timeit

state = {'question': "hottest planet", 'context': None, 'answer': '', 'tenant': "solar-system"}

context = retrieve(vector_store, state)
state['context'] = context['context']

185 ms ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# Debug dump of the vector store's instance attributes.
print(vars(vector_store))

{'embedding_function': OllamaEmbeddings(model='nomic-embed-text', base_url=None, client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None), 'index': <faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x103ad5f20> >, 'docstore': <langchain_community.docstore.in_memory.InMemoryDocstore object at 0x103ad5a90>, 'index_to_docstore_id': {0: 'cac0e650-6639-4cb5-b28c-412db1f1b4c9', 1: 'bfeb6dbb-698c-41af-9aa1-4f332459ab93', 2: 'bc1404d7-78f2-4ed6-b6b8-5a0982a35627', 3: 'a344ff14-e4c0-4f5d-819f-e07171104a53', 4: '3f0f3354-8dd2-45c8-97c6-b881f924bf1c', 5: '3a42927c-345f-4c76-9e36-60e136b9f0df', 6: '54a75b6a-0284-4d41-b1be-a944b386f8c5', 7: '1f1e162a-8f40-422e-be10-65293aff5986', 8: '9c2b7a42-e194-4d50-a847-25f655a9ea3b', 9: '12bf1a66-07ff-4541-9c1e-bc7c0d60fb66', 10: '1de389da-9e04-4a1b-81e8-deea5633