In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [8]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

Settings.llm = OpenAI(model="gpt-4.1-mini")
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [3]:
from llama_index.core import StorageContext, load_index_from_storage

try: 
    storage_context = StorageContext.from_defaults(
        persist_dir='./storage/vllm'
    )

    vllm_index = load_index_from_storage(storage_context)
    index_loaded = True
except: 
    index_loaded = False


In [14]:
from llama_index.core import StorageContext, load_index_from_storage

query_engine_tools = []

try: 
    storage_context = StorageContext.from_defaults(
        persist_dir='./storage/'
    )

    indices = load_index_from_storage(storage_context)
    index_loaded = True
except: 
    index_loaded = False


In [16]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.tools import QueryEngineTool

if not index_loaded:
    # Group documents by filename
    docs_by_file = {}
    docs = SimpleDirectoryReader(
        input_dir='./data',
        recursive=True,
        filename_as_id=True
    ).load_data()
    
    # Group documents by their filename
    for doc in docs:
        file_name = doc.metadata.get('file_name', 'unknown')
        if file_name not in docs_by_file:
            docs_by_file[file_name] = []
        docs_by_file[file_name].append(doc)
    
    # Create one index and tool per file
    for file_name, file_docs in docs_by_file.items():
        print(f"Processing {file_name}")
        
        # Create index from all chunks of the same file
        doc_index = VectorStoreIndex.from_documents(file_docs)
        
        persist_dir = f'./storage/{file_name}'
        doc_index.storage_context.persist(persist_dir=persist_dir)
        
        doc_engine = doc_index.as_query_engine(similarity_top_k=3)
        tool = QueryEngineTool.from_defaults(
            query_engine=doc_engine,
            name=file_name,
            description=f"A tool to answer questions about the {file_name} paper. If asked about a specific part, provide the exact text from the paper."
        )
        
        query_engine_tools.append(tool)


Processing pipeswitch.pdf
Processing vLLM.pdf


In [17]:
from llama_index.core.agent.workflow import ReActAgent
from llama_index.core.workflow import Context

print(query_engine_tools)
agent = ReActAgent(
    tools=query_engine_tools,
    llm=Settings.llm,
    system_prompt="""You are a helpful RAG agent that can answer questions about multiple documents. 
    When answering questions:
    1. First determine which document(s) are most relevant to the question
    2. Use the appropriate tool(s) to search those documents
    3. Always specify which document the information came from
    4. If information comes from multiple documents, clearly indicate this
    5. Keep all text in English
    6. Provide exact quotes when relevant"""
)

ctx = Context(agent)

[<llama_index.core.tools.query_engine.QueryEngineTool object at 0x3108411f0>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x3106114f0>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x3101dff50>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310808a10>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310808c50>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310809070>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310809430>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310809610>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310809790>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310808710>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310809a90>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310809eb0>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x310809fd0>

In [19]:
from llama_index.core.agent.workflow import ToolCallResult, AgentStream

handler = agent.run(input("Enter a question: "), ctx=ctx)

async for ev in handler.stream_events():
    if isinstance(ev, ToolCallResult):
        print(f"\nCall {ev.tool_name} with {ev.tool_kwargs}\nReturned: {ev.tool_output}")
    if isinstance(ev, AgentStream):
        print(f"{ev.delta}", end="", flush=True)

response = await handler

Thought: The user is asking about "pipeswitch." I need to use a tool to help me answer the question.
Action: pipeswitch.pdf
Action Input: {"input":"What is pipeswitch?"}
Call pipeswitch.pdf with {'input': 'What is pipeswitch?'}
Returned: PipeSwitch is a system designed to enable fast pipelined context switching specifically for deep learning applications. It optimizes the process of switching tasks on GPUs by pipelining model transmission and task execution, reducing overhead and latency. PipeSwitch achieves efficient task switching by grouping model layers in a model-aware manner to balance the trade-off between pipelining overhead and efficiency. This grouping minimizes the number of PCIe calls and synchronization overhead, allowing computation to start as soon as parameters for a group are transmitted. Additionally, PipeSwitch incorporates unified memory management to further reduce overhead during task switching. Overall, it significantly improves throughput and latency for deep le