# Import data from unstructured text

## Automatic schema extraction (with LLM)

In [1]:
import neo4j
from dotenv import load_dotenv
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.llm import OpenAILLM
import os
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
# import logging

In [2]:
# This example requires an OPENAI_API_KEY env variable
# load_dotenv() is called first so that the os.getenv() lookups below can
# see values defined in a local .env file.
load_dotenv()

# Create the embedder instance
embedder = OpenAIEmbeddings()

# Create the llm instance
llm = OpenAILLM(
    model_name="gpt-4o",
    model_params={
        "max_tokens": 2000,
        "response_format": {"type": "json_object"},
        "temperature": 0,
    },
)

# Initialize the Neo4j driver
# This example requires a local Neo4j dbms instance up and running with APOC plugin being enabled
URI = os.getenv("NEO4J_URI")
if not URI:
    # os.getenv returns None for a missing variable; fail here with a clear
    # message instead of passing None on to the driver constructor.
    raise RuntimeError(
        "NEO4J_URI is not set. Define it in the environment or in a .env file "
        "(e.g. NEO4J_URI=neo4j://localhost:7687) before running this cell."
    )
# Credentials are passed through unchanged (either value is None when the
# corresponding variable is unset).
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)

In [3]:
# Path of the PDF document to import. Left empty as a placeholder: set it
# before running the cells below that pass `file_path` to the loader/pipeline.
file_path = ""

In [5]:
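# Optional: set the log level to DEBUG for all neo4j_graphrag.* loggers.
# To enable, uncomment the two lines below and the `import logging` line in the imports cell.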
# logging.basicConfig()
# logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)

In [6]:
kg_builder = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    embedder=embedder,
    from_pdf=True,
)

# Run the pipeline on one PDF file
await kg_builder.run_async(file_path=file_path)

DEBUG:neo4j_graphrag.experimental.pipeline.config.runner:PIPELINE_RUNNER: instantiating Pipeline from config type: PipelineType.SIMPLE_KG_PIPELINE
DEBUG:neo4j_graphrag.experimental.pipeline.config.runner:PIPELINE_CONFIG: start parsing config...
DEBUG:neo4j_graphrag.experimental.pipeline.config.pipeline_config:PIPELINE_CONFIG: resolved 'extras': {'extras': {}}
DEBUG:neo4j_graphrag.experimental.pipeline.config.pipeline_config:PIPELINE_CONFIG: resolved globals: {'extras': {}, 'neo4j_config': {'default': <neo4j._sync.driver.Neo4jDriver object at 0x125a312d0>}, 'llm_config': {'default': <neo4j_graphrag.llm.openai_llm.OpenAILLM object at 0x1258e9ad0>}, 'embedder_config': {'default': <neo4j_graphrag.embeddings.openai.OpenAIEmbeddings object at 0x1258e9bd0>}}
DEBUG:neo4j_graphrag.experimental.pipeline.config.template_pipeline.base:TEMPLATE_PIPELINE: resolved component name='pdf_loader' component=<neo4j_graphrag.experimental.components.pdf_loader.PdfLoader object at 0x12631fe50> run_params={}
D

PipelineResult(run_id='63d29ef2-9c1a-4c7c-92fc-015c54fad0f1', result={'resolver': {'number_of_nodes_to_resolve': 93, 'number_of_created_nodes': 20}})

## Customize schema

### Start from automatic schema extraction

In [2]:
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
from neo4j_graphrag.experimental.components.schema import SchemaFromTextExtractor, GraphSchema

In [None]:
# Load the PDF at `file_path` with the standalone loader component.
# `document.text` is reused by the schema-extraction cell and by the final
# pipeline run, so this cell must be executed before them.
loader = PdfLoader()
document = await loader.run(file_path)

In [None]:
# Run the LLM-backed schema extractor on the text of the loaded document.
schema_builder = SchemaFromTextExtractor(llm)
schema = await schema_builder.run(text=document.text)
# Bare last expression: display the extracted schema as the cell output.
schema

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell so a fresh "Restart & Run All" surfaces missing dependencies early.
from neo4j_graphrag.experimental.utils.schema import schema_visualization
# Visualize the current `schema`; the call's return value is displayed as the cell output.
schema_visualization(schema)

In [None]:
# Write the schema to "my_schema.json" (relative to the working directory) so
# it can be edited by hand and reloaded with `GraphSchema.from_file` in the next section.
schema.save("my_schema.json")

### Make changes to JSON schema and reload it

In [None]:
schema = GraphSchema.from_file("my_schema.json")

kg_builder = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    embedder=embedder,
    schema=schema, # USE OUR SCHEMA
    from_pdf=False,  # RUN FROM ALREADY EXTRACTED TEXT
)

# Run the pipeline on the extracted text
await kg_builder.run_async(text=document.text)