# Demo - Continued

This notebook continues the `Demo.ipynb` notebook and demonstrates more advanced features:
- Running pipeline from a config file
- Running entity resolution as a separate step, outside the pipeline run
- Customizing components

In [4]:
import os

from dotenv import load_dotenv
import neo4j
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

In [5]:
# Load variables from a local .env file (if one is found) into the process environment,
# so that the os.getenv(...) lookups in the cells below can read them.
load_dotenv()

True

In [6]:
# Input PDF; this same path is passed to every pipeline and loader run in this notebook.
file_path = "./data/Climate change - Wikipedia long.pdf"

In [7]:
# Neo4j connection settings come from the environment, with local fallbacks.
neo4j_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
neo4j_auth = (
    os.getenv("NEO4J_USERNAME", "neo4j"),
    os.getenv("NEO4J_PASSWORD", "neo4j"),
)
driver = neo4j.GraphDatabase.driver(neo4j_uri, auth=neo4j_auth)

# Chat model; the model parameters request a JSON-object response format.
json_response_params = {"response_format": {"type": "json_object"}}
llm = OpenAILLM(model_name="gpt-4o", model_params=json_response_params)

# Embedding model.
embedder = OpenAIEmbeddings(model="text-embedding-3-small")

In [18]:
# Optional reset, left disabled: uncommenting the next line runs a Cypher query that
# deletes every node (DETACH also removes their relationships) in the connected database.
# driver.execute_query("MATCH (n) DETACH DELETE n")

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x78a861648c20>, keys=[])

In [9]:
from neo4j_graphrag.experimental.components.schema import GraphSchema
# Load a graph schema from a JSON file (path relative to the working directory).
# NOTE(review): `new_schema` is not referenced by any later cell in this notebook —
# confirm whether it should be passed to a pipeline or whether this cell can be removed.
new_schema = GraphSchema.from_file("refined_schema.json")

## Config file

In [26]:
from neo4j_graphrag.experimental.pipeline.config.runner import PipelineRunner

pipeline = PipelineRunner.from_config_file("simple_kg_pipeline_config.yaml")
await pipeline.run({
    "file_path": file_path,
    "document_metadata": {
        "source": "Wikipedia",
    }
});

## Entity Resolution

In [12]:
from neo4j_graphrag.experimental.components.resolver import FuzzyMatchResolver

In [None]:
resolver = FuzzyMatchResolver(
    driver=driver,
)
await resolver.run();

In [28]:
resolver = FuzzyMatchResolver(
    driver=driver,
    filter_query="""
    MATCH (entity)-[:FROM_CHUNK]->(:Chunk)-[:FROM_DOCUMENT]->(d:Document)
    WHERE d.source = "Wikipedia"
    """
)
await resolver.run();

## Custom components

In [None]:
import pymupdf4llm
from typing import Optional, Union
from pathlib import Path
from neo4j_graphrag.experimental.components.pdf_loader import DataLoader
from neo4j_graphrag.experimental.components.types import PdfDocument, DocumentInfo


class LoaderToMarkdown(DataLoader):
    """Data loader that converts a document to Markdown text with ``pymupdf4llm``."""

    async def run(
        self,
        filepath: Union[str, Path],
        metadata: Optional[dict[str, str]] = None,
    ) -> PdfDocument:
        """Convert the file at ``filepath`` to Markdown and wrap it in a ``PdfDocument``.

        Args:
            filepath: Location of the file to load. Both plain strings (which is
                what this notebook passes) and ``pathlib.Path`` objects are accepted.
            metadata: Optional key/value metadata attached to the document info.
                ``None`` (or an empty dict) is stored as ``{}``.

        Returns:
            A ``PdfDocument`` whose ``text`` is the Markdown produced by
            ``pymupdf4llm.to_markdown`` and whose ``document_info.path`` is the
            string form of ``filepath``.
        """
        # NOTE: to_markdown is a synchronous call; it runs inline in this coroutine.
        markdown_text = pymupdf4llm.to_markdown(filepath)
        return PdfDocument(
            text=markdown_text,
            document_info=DocumentInfo(
                # Stored as a string regardless of the input type.
                path=str(filepath),
                metadata=metadata or {},
            ),
        )


In [None]:
# Try the custom loader on its own, outside any pipeline. The resulting document's
# `.text` is what the splitter try-out cell further down consumes.
my_loader = LoaderToMarkdown()
new_document = await my_loader.run(file_path)

In [None]:
from neo4j_graphrag.experimental.components.types import TextChunks, TextChunk, DocumentInfo
from neo4j_graphrag.experimental.components.text_splitters.base import TextSplitter


class SectionSplitter(TextSplitter):
    """Text splitter that cuts Markdown text into one chunk per heading-delimited section."""

    async def run(self, text: str) -> TextChunks:
        """Split ``text`` at every line that starts with ``#``.

        Args:
            text: Markdown text to split.

        Returns:
            A ``TextChunks`` holding one ``TextChunk`` per non-empty section, in
            document order, with contiguous indices starting at 0. Each section
            keeps its full heading marker (e.g. ``## Causes``). Text that is empty
            or whitespace-only yields no chunks.
        """
        # Any line beginning with '#' is treated as a section start (this includes
        # '#' lines that are not real headings, e.g. inside code fences).
        first, *following = text.split('\n#')
        # str.split consumes the '\n#' delimiter, so restore the '#' it removed;
        # otherwise '## Causes' would become '# Causes' and '# Title' just 'Title'.
        sections = [first, *('#' + part for part in following)]
        # Drop sections that are empty after trimming so no blank chunk is emitted.
        non_empty = [sec for sec in (raw.strip() for raw in sections) if sec]
        return TextChunks(chunks=[
            TextChunk(text=sec, index=k)
            for k, sec in enumerate(non_empty)
        ])


In [None]:
# Try the custom splitter on the text produced by the loader try-out cell above,
# then print how many chunks came out as a quick sanity check.
my_splitter = SectionSplitter()
chunks = await my_splitter.run(text=new_document.text)
print(len(chunks.chunks))

In [None]:
pipeline = SimpleKGPipeline(
    driver=driver,
    llm=llm,
    embedder=embedder,
    text_splitter=my_splitter,
    pdf_loader=my_loader,
)
await pipeline.run_async(
    file_path=file_path,
    document_metadata={
        "source": "Wikipedia",
    },
);