# Import data from unstructured text

## Automatic schema extraction (with LLM)

In [1]:
import neo4j
from dotenv import load_dotenv
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.llm import OpenAILLM
import os
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
# import logging

In [2]:
# This example requires an OPENAI_API_KEY env variable.
# load_dotenv() picks up variables from a local .env file, if there is one.
load_dotenv()

# Create the embedder instance
embedder = OpenAIEmbeddings()

# Create the llm instance
llm = OpenAILLM(
    model_name="gpt-4o",
    model_params={
        "max_tokens": 2000,
        "response_format": {"type": "json_object"},
        "temperature": 0,
    },
)

# Initialize the Neo4j driver
# This example requires a local Neo4j DBMS instance up and running with the APOC plugin enabled
# Fail fast with a readable message: os.getenv() returns None for unset variables,
# which would otherwise only surface later as an obscure error inside the driver.
_NEO4J_ENV_VARS = ("NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD")
_missing_env_vars = [name for name in _NEO4J_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in your environment or in a .env file."
    )

URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)

In [8]:
# PDF document used as the input of the examples below
file_path = "AA21-287A.pdf"

# Alternative input: uncomment to work from an inline text sample instead of the PDF
# text = """On July 10, 2024, CyberSecure Inc. observed a targeted phishing and malware campaign affecting multiple financial institutions across North America and Europe linked to the threat actor group Shadow Falcon.
# The campaign began with spear-phishing emails impersonating executives and delivering a malicious attachment that exploited CVE-2024-12345 in Microsoft Exchange Server to achieve remote code execution; once executed,
# the FalconSpy trojan established persistent C2 connections (notably communicating with 198.51.100.44 and domains like falcon-secure[.]com) and performed credential harvesting, lateral movement, and data exfiltration.
# Analysis shows attackers leveraged misconfigured RDP access for persistence and used obfuscation techniques to evade detection.
# Immediate mitigations include patching affected Exchange servers, enforcing multi-factor authentication for remote access, hardening RDP exposure, and running organization-wide phishing awareness training,
# while monitoring the listed indicators of compromise and sharing telemetry with partners. Continuous monitoring and rapid intelligence sharing are recommended to reduce exposure to similar campaigns.
# """

In [None]:
# Optional: uncomment the lines below (and `import logging` in the imports cell)
# to set the log level to DEBUG for all neo4j_graphrag.* loggers
# logging.basicConfig()
# logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)

In [10]:
kg_builder = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    embedder=embedder,
    from_pdf=True,
)

# Run the pipeline on one PDF file
await kg_builder.run_async(file_path=file_path)

# Run the pipeline on a text
# await kg_builder.run_async(text=text)

PipelineResult(run_id='48710bab-98cf-4d24-ade0-b9386ce27d11', result={'resolver': {'number_of_nodes_to_resolve': 32, 'number_of_created_nodes': 18}})

## Customize schema

### Start from automatic schema extraction

In [11]:
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
from neo4j_graphrag.experimental.components.schema import SchemaFromTextExtractor, GraphSchema

In [12]:
# Load the PDF once; the resulting `document.text` is reused by the cells below
loader = PdfLoader()
document = await loader.run(file_path)

In [13]:
# Extract a graph schema from the text of the loaded document, using the LLM configured above
schema_builder = SchemaFromTextExtractor(llm)
schema = await schema_builder.run(text=document.text)
# Alternative: extract the schema from the inline `text` sample (requires `text` to be defined)
# schema = await schema_builder.run(text=text)
# Last expression of the cell: display the extracted schema
schema

GraphSchema(node_types=(NodeType(label='Facility', description='', properties=[PropertyType(name='location', type='STRING', description='', required=False)], additional_properties=False), NodeType(label='ThreatActor', description='', properties=[], additional_properties=True), NodeType(label='Ransomware', description='', properties=[PropertyType(name='variant', type='STRING', description='', required=False)], additional_properties=False), NodeType(label='Employee', description='', properties=[PropertyType(name='role', type='STRING', description='', required=False)], additional_properties=False), NodeType(label='Organization', description='', properties=[PropertyType(name='name', type='STRING', description='', required=False)], additional_properties=False)), relationship_types=(RelationshipType(label='TARGETS', description='', properties=[], additional_properties=True), RelationshipType(label='USES', description='', properties=[], additional_properties=True), RelationshipType(label='EMP

In [14]:
from neo4j_graphrag.experimental.utils.schema import schema_visualization
# Build a visualization of the extracted schema; the returned object is the cell output
schema_visualization(schema)

<neo4j_viz.visualization_graph.VisualizationGraph at 0x119f4d4d0>

In [15]:
# Write the schema to a JSON file so it can be edited and reloaded with GraphSchema.from_file
schema.save("my_schema.json")

### Make changes to the JSON schema and reload it

In [16]:
schema = GraphSchema.from_file("my_schema.json")

kg_builder = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    embedder=embedder,
    schema=schema, # USE OUR SCHEMA
    from_pdf=False,  # RUN FROM ALREADY EXTRACTED TEXT
)

# Run the pipeline on the extracted text
await kg_builder.run_async(text=document.text)

PipelineResult(run_id='5c58a973-dd00-450e-8787-b181239d251c', result={'resolver': {'number_of_nodes_to_resolve': 67, 'number_of_created_nodes': 20}})