In [None]:
!pip install Eager
!pip install graph-retriever
!pip install langchain-google-genai
!pip install langchain_experimental
!pip install langchain_neo4j
!pip install langchain_graph_retriever
!pip install docx2txt

In [13]:
import os
import glob
from typing import List
from langchain_core.documents import Document
from graph_retriever.strategies import Eager
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_experimental.text_splitter import SemanticChunker
from langchain_neo4j import Neo4jGraph
from langchain_community.document_loaders import Docx2txtLoader
from langchain_graph_retriever import GraphRetriever# Initialize LLM Graph Transformer
from dotenv import load_dotenv

# Read the Gemini API key from the environment (or from a local, untracked
# `.env` file) instead of embedding the secret in the notebook.
# SECURITY: a literal key used to be committed on this line; treat that key as
# compromised and revoke/rotate it.
# On Colab, populate the variable first, e.g.
#   os.environ["GOOGLE_API_KEY"] = google.colab.userdata.get("GOOGLE_API_KEY")
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to a local .env file."
    )



In [None]:
def load_and_chunk_documents(
    pattern: str = r"C:\Users\0J0795897\pythonAI\*.docx",
) -> List[Document]:
    """Load .docx files matching ``pattern`` and split them into semantic chunks.

    Args:
        pattern: Glob pattern selecting the Word documents to ingest. The
            default is the original hard-coded location; pass another pattern
            to run the notebook on a different machine or data set.

    Returns:
        The chunks of every matched file, each tagged with
        ``metadata["source"]`` set to the file's base name. An empty list is
        returned when nothing matches.

    Notes:
        Uses the module-level ``embeddings`` object to drive ``SemanticChunker``.
    """
    # Sort for a reproducible ingestion order, and skip the "~$name.docx" lock
    # files Word creates next to open documents: they match "*.docx" but are
    # not readable documents.
    docx_files = sorted(
        path
        for path in glob.glob(pattern)
        if not os.path.basename(path).startswith("~$")
    )

    if not docx_files:
        print("No .docx files found")
        return []

    # One chunker is shared by all files.
    text_splitter = SemanticChunker(embeddings)

    documents: List[Document] = []
    for docx_file in docx_files:
        print(f"Processing {docx_file}...")

        raw_docs = Docx2txtLoader(docx_file).load()
        chunks = text_splitter.split_documents(raw_docs)

        # Record only the file name (not the full local path) as provenance.
        source_name = os.path.basename(docx_file)
        for chunk in chunks:
            chunk.metadata["source"] = source_name

        documents.extend(chunks)

    print(f"Loaded {len(documents)} document chunks")
    return documents

In [None]:
def _quote_cypher_identifier(name: str) -> str:
    """Return ``name`` as a backtick-quoted Cypher label / relationship type."""
    # Spaces are still mapped to underscores, as before.
    cleaned = str(name).replace(" ", "_")
    if not cleaned:
        raise ValueError("Cypher label or relationship type must not be empty")
    # Labels and relationship types cannot be passed as query parameters, so
    # they are interpolated; quoting with backticks (and doubling any embedded
    # backtick) keeps arbitrary LLM-produced text from breaking or altering
    # the query.
    return "`" + cleaned.replace("`", "``") + "`"


def store_graph_documents(graph_documents: List, clear_existing: bool = True):
    """Store graph documents in Neo4j through the module-level ``graph``.

    Args:
        graph_documents: Objects exposing ``nodes`` (each with ``id`` and
            ``type``) and ``relationships`` (each with ``source``, ``target``
            and ``type``).
        clear_existing: When true (the default, matching the previous
            behaviour) every node and relationship already in the database is
            deleted first. Pass ``False`` to add to the existing graph.

    Raises:
        ValueError: If a node or relationship has an empty type.
    """
    print("Storing graph data in Neo4j...")

    if clear_existing:
        # Destructive: wipes the whole database, not just data from this notebook.
        graph.query("MATCH (n) DETACH DELETE n")

    for graph_doc in graph_documents:
        # Nodes first, so the relationship MATCH clauses below can find them.
        for node in graph_doc.nodes:
            label = _quote_cypher_identifier(node.type)
            graph.query(
                f"MERGE (n:{label} {{id: $id}})",
                {"id": node.id},
            )

        # Relationships are matched on node id only (no label constraint).
        for rel in graph_doc.relationships:
            rel_type = _quote_cypher_identifier(rel.type)
            graph.query(
                f"""
                MATCH (source {{id: $source_id}})
                MATCH (target {{id: $target_id}})
                MERGE (source)-[r:{rel_type}]->(target)
                """,
                {
                    "source_id": rel.source.id,
                    "target_id": rel.target.id,
                },
            )

    print("Graph data stored successfully")

In [None]:
def create_rag_chain(vector_store):
    """Assemble the question-answering chain on top of ``vector_store``.

    The returned runnable takes a question, fetches context through a
    ``GraphRetriever`` wrapping ``vector_store``, fills the prompt, calls the
    module-level ``llm`` and parses the result with ``StrOutputParser``.

    Args:
        vector_store: Store handed to ``GraphRetriever`` as ``store``.

    Returns:
        The composed chain (mapping | prompt | llm | output parser).
    """
    def format_docs(docs):
        # Collapse the retrieved documents into one blank-line separated string.
        texts = [doc.page_content for doc in docs]
        return "\n\n".join(texts)

    # NOTE(review): no `edges` are passed to GraphRetriever, so it is unclear
    # which metadata links the traversal strategy can follow - confirm.
    retriever = GraphRetriever(
        store=vector_store,
        strategy=Eager(select_k=5, start_k=2, adjacent_k=3, max_depth=2),
    )

    qa_prompt = ChatPromptTemplate.from_template("""
    Answer the question based on the provided context.

    Context: {context}
    Question: {question}

    Answer:
    """)

    # The incoming question feeds the retriever (for context) and is also
    # passed through unchanged to the prompt.
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    return chain_inputs | qa_prompt | llm | StrOutputParser()

In [None]:
def test_graphrag_system(chain):
    """Test the GraphRAG system with sample queries"""

    test_queries = [
        "What are the main topics discussed in the documents?",
        "How are the entities connected in the knowledge graph?",
        "Can you explain the relationships between key concepts?"
    ]

    for query in test_queries:
        print(f"\n🔍 Query: {query}")
        try:
            response = chain.invoke(query)
            print(f"Response: {response[:200]}...")
        except Exception as e:
            print(f"Error: {e}")

    # Check graph statistics
    try:
        nodes_count = graph.query("MATCH (n) RETURN count(n) as count")[0]["count"]
        rels_count = graph.query("MATCH ()-[r]->() RETURN count(r) as count")[0]["count"]
        print(f"\nGraph Stats: {nodes_count} nodes, {rels_count} relationships")
    except Exception as e:
        print(f"Error getting graph stats: {e}")


In [None]:

# Initialize the components shared by the functions above, which read
# `embeddings`, `llm` and `graph` as module-level globals.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-exp-03-07",
    google_api_key=google_api_key,
)
llm = ChatGoogleGenerativeAI(
    model="models/gemini-2.5-flash-preview-05-20",
    google_api_key=google_api_key,
)
# Connection settings can be overridden with NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD; the fallbacks are the previous hard-coded local-development
# values, so an unconfigured environment behaves as before.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "changeit"),
)
print("Initializing LLM Graph Transformer...")
llm_transformer = LLMGraphTransformer(llm=llm)

def main():
    """Build the complete GraphRAG pipeline and return the query chain.

    Steps: load and chunk the documents, embed them into an in-memory vector
    store, extract graph documents with the module-level ``llm_transformer``
    and store them in Neo4j, then assemble the RAG chain.

    Returns:
        The chain produced by ``create_rag_chain``.

    Raises:
        ValueError: If no document chunks were loaded. Stopping here avoids
            wiping the Neo4j database (done by ``store_graph_documents``) and
            building an empty vector store when there is nothing to ingest.
    """
    print("Starting GraphRAG System")

    # 1. Load and chunk documents
    documents = load_and_chunk_documents()
    if not documents:
        raise ValueError(
            "No document chunks were loaded; nothing to index. "
            "Check the .docx location used by load_and_chunk_documents()."
        )

    # 2. Create vector store
    vector_store = InMemoryVectorStore.from_documents(documents, embeddings)

    # 3. Extract and store graph. The module-level transformer is reused
    #    instead of building a second, identical one that shadows it.
    graph_documents = llm_transformer.convert_to_graph_documents(documents)
    store_graph_documents(graph_documents)

    # 4. Create RAG chain
    chain = create_rag_chain(vector_store)

    print("GraphRAG system ready")
    return chain
if __name__ == "__main__":
    # Build the whole pipeline once; `chain` is assigned at module level, so it
    # stays available after this block runs.
    chain = main()
    # Smoke-test the freshly built chain with the canned sample queries.
    test_graphrag_system(chain)