In [3]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import KnowledgeGraphIndex
from llama_index.core import Settings
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.core import StorageContext
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load environment variables from a .env file first; the OpenAI client
# presumably reads its API key from the environment (confirm for your setup).
load_dotenv()

# Model identifiers, named here so they are easy to spot and swap.
llm_model_name = "gpt-4.1-mini"
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Register the LLM and the embedding model on the global Settings object so
# later LlamaIndex calls use them without passing them explicitly.
Settings.llm = OpenAI(model=llm_model_name)
Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

In [5]:
# Read the source files from the local ./data directory.
reader = SimpleDirectoryReader('./data')
documents = reader.load_data()

# Create a SimpleGraphStore and wrap it in a storage context so it can be
# handed to the index constructor.
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(
    graph_store=graph_store,
)

In [6]:
# Options for knowledge-graph construction: a cap of 3 triplets per chunk,
# with embeddings requested alongside the triplets.
kg_build_options = {
    "max_triplets_per_chunk": 3,
    "include_embeddings": True,
}

# Build the knowledge graph index from the loaded documents, storing the
# graph in the storage context created earlier.
index = KnowledgeGraphIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    **kg_build_options,
)

In [7]:
# Question to ask against the knowledge graph index.
question = "What is pipeswitch and how does it relate to GPU memory management?"

# Build a query engine over the index.
# - include_text=True: pass the source text along with the matched triplets.
# - response_mode="tree_summarize": response synthesis strategy.
# - retriever_mode="hybrid": the knowledge-graph retriever selects its mode
#   through the `retriever_mode` keyword; an unrecognized keyword such as
#   `embedding_mode` is absorbed by **kwargs and has no effect.
# - similarity_top_k=5: number of embedding matches to consider.
query_engine = index.as_query_engine(
    include_text=True,
    response_mode="tree_summarize",
    retriever_mode="hybrid",
    similarity_top_k=5,
)
response = query_engine.query(question)

# print() shows the plain-text answer (str of the response object).
print(response)

PipeSwitch is a system designed to enable GPU-efficient fine-grained time-sharing for multiple deep learning applications, achieving millisecond-scale context switching latencies and high throughput. It addresses the challenge of high overhead in switching tasks on GPUs by introducing pipelined context switching, which leverages the layered structure of neural network models to pipeline model transmission over PCIe and task execution on the GPU.

Regarding GPU memory management, PipeSwitch employs a dedicated memory daemon that pre-allocates GPU memory and dynamically allocates it to worker processes at runtime. This approach minimizes the overhead of GPU memory allocation by avoiding repeated calls to expensive GPU memory management functions. The memory daemon stores each deep neural network model only once in host memory, reducing memory footprint, and directly transmits models to GPU memory for task startup, eliminating extra memory copies. Additionally, PipeSwitch uses unified mem