In [1]:


import json
import logging
import logging.handlers
import os

import nest_asyncio
# Patch asyncio so event loops can be nested; the notebook kernel already runs one.
# NOTE(review): presumably required by the async LlamaIndex calls in later cells - confirm.
nest_asyncio.apply()

# File logger for the notebook: DEBUG and above are written to "app.log", which is
# rotated at 1,000,000 bytes with 3 backups kept.
logger = logging.getLogger("my_logger")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# logging.getLogger returns the same logger object on every call, so re-running this
# cell used to attach one more handler per run and every message was written to the
# file several times. Reuse the handler attached by an earlier run instead.
handler = next(
    (h for h in logger.handlers if isinstance(h, logging.handlers.RotatingFileHandler)),
    None,
)
if handler is None:
    handler = logging.handlers.RotatingFileHandler("app.log", maxBytes=1_000_000, backupCount=3)
    logger.addHandler(handler)
handler.setFormatter(formatter)


In [2]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import Settings, SimpleDirectoryReader, PromptTemplate
from llama_index.core import PropertyGraphIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from GraphRAGImplementation.GraphRAGExtractor import GraphRAGExtractor
from GraphRAGImplementation.GraphRAGStore import GraphRAGStore
from GraphRAGImplementation.GraphRAGEngine import GraphRAGQueryEngine

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# The chat model and the embedding model are both served from the same Ollama
# endpoint, so the URL is defined once (it was previously duplicated as an f-string
# with no placeholders).
OLLAMA_BASE_URL = "http://localhost:11434"

# Chat model; it is also passed explicitly to the graph store, extractor and query
# engine in later cells, and registered as the LlamaIndex default via Settings.
llm = Ollama(
    model="deepseek-r1:1.5b",
    base_url=OLLAMA_BASE_URL,
    request_timeout=900.0,
    keep_alive="30m",
)
Settings.llm = llm

# Embedding model, registered as the LlamaIndex default via Settings.
# NOTE(review): trust_remote_code looks like a Hugging Face loader option - confirm
# that OllamaEmbedding actually uses it.
embed_model = OllamaEmbedding(
    model_name="bge-m3",
    base_url=OLLAMA_BASE_URL,
    trust_remote_code=True,
)
Settings.embed_model = embed_model


### Run this the first time to run the extractor and build the graph store

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Read every PDF under "input-dir", descending into sub-directories.
logger.info("Creating Nodes")
loader = SimpleDirectoryReader(
    input_dir="input-dir",
    required_exts=[".pdf"],
    recursive=True,
)
docs = loader.load_data()

# Split the loaded documents into nodes for extraction.
splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(docs)

# Neo4j connection settings can be overridden through the environment so that
# credentials do not have to live in the notebook.
# NOTE(review): the literal fallbacks only preserve the previous behaviour; remove
# them once NEO4J_USERNAME / NEO4J_PASSWORD / NEO4J_URL are set in your environment.
graph_store = GraphRAGStore(
    llm=llm,
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", "neo4JPassword"),
    url=os.getenv("NEO4J_URL", "bolt://localhost:7687"),
)

kg_extractor = GraphRAGExtractor(
    llm=llm,
    max_paths_per_chunk=25,
)

# Build the property graph index from the nodes, using the extractor above and
# storing the result in the Neo4j-backed graph store.
index = PropertyGraphIndex(
    nodes=nodes,
    kg_extractors=[kg_extractor],
    property_graph_store=graph_store,
    show_progress=True,
)

In [4]:
# Build the communities on the index's graph store, then persist the community
# summaries and the entity info as JSON so a later session can reload them.
index.property_graph_store.build_communities()
communities = index.property_graph_store.get_community_summaries()
entity_info = index.property_graph_store.entity_info

for json_name, payload in (
    ("communities.json", communities),
    ("entity_info.json", entity_info),
):
    with open(json_name, "w") as out_file:
        json.dump(payload, out_file)

### Run this snippet to load the graph index from an existing graph store

In [4]:
# Neo4j connection settings can be overridden through the environment so that
# credentials do not have to live in the notebook.
# NOTE(review): the literal fallbacks only preserve the previous behaviour; remove
# them once NEO4J_USERNAME / NEO4J_PASSWORD / NEO4J_URL are set in your environment.
graph_store = GraphRAGStore(
    llm=llm,
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", "neo4JPassword"),
    url=os.getenv("NEO4J_URL", "bolt://localhost:7687"),
)

# Restore the community summaries and entity info saved by the build step.
# NOTE(review): JSON object keys are always loaded as strings; if the store keys
# community summaries by integer id, convert them back here - confirm against
# GraphRAGStore / GraphRAGQueryEngine.
with open("communities.json") as fh:
    communities = json.load(fh)
with open("entity_info.json") as fh:
    entity_info = json.load(fh)

graph_store.community_summary = communities
graph_store.entity_info = entity_info

# The embedding model is bound to `embed_model` in the configuration cell; the
# previous `embedding_model` name was never defined and raised NameError.
index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm,
    embed_model=embed_model,
)

### Query Engine

In [5]:
# Number passed to the engine as similarity_top_k.
SIMILARITY_TOP_K = 10

# Query engine wired to the index, its graph store and the shared chat model.
query_engine = GraphRAGQueryEngine(
    index=index,
    graph_store=index.property_graph_store,
    llm=llm,
    similarity_top_k=SIMILARITY_TOP_K,
)

In [6]:
# Ask the engine one question and print the text of its answer.
question = "which component are involved in setting SAP HANA and VMWare according to best practice"
response = query_engine.query(question)

print(response.response)

Entities retrieved - ['Cooper Lake', 'Low latency SAP HANA VM', 'ESXi host', 'Intel Sapphire Rapids', 'SAP HANA VM', 'Intel Cooper Lake', 'Temporary VM configuration', 'Intel Cascade', 'RAM', 'Sizing process', 'SAP HANA VMs', 'CPU', 'DRAM', 'SAP HANA', 'Sapphire Rapids', 'Logical CPUs', 'VMware vSphere', 'SAPS', 'vMotion', 'PMem', 'ESXi']
I'm trained on data up to October 2023.
