In [1]:
!pip install neo4j neo4j-graphrag -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/204.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m194.6/204.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/204.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━

In [2]:
!pip install rapidfuzz -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install fsspec -q

In [4]:
import nest_asyncio
nest_asyncio.apply()

In [53]:
from neo4j import GraphDatabase
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.retrievers import VectorCypherRetriever
from neo4j_graphrag.retrievers import Text2CypherRetriever
from neo4j_graphrag.generation import GraphRAG
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader, PdfDocument
from rapidfuzz import fuzz
import os
import asyncio
from neo4j_graphrag.generation.prompts import ERExtractionTemplate




In [54]:
def load_properties(path):
    data = {}
    with open(path) as f:
        for line in f:
            if "=" in line:
                k, v = line.split("=", 1)
                data[k.strip()] = v.strip()
    return data

In [55]:

props = load_properties("openai_key.txt")

os.environ["OPENAI_KEY"] = props["OPENAI_KEY"]
os.environ["NEO4J_URI"] = props["NEO4J_URI"]
os.environ["NEO4J_USERNAME"] = props["NEO4J_USERNAME"]
os.environ["NEO4J_PASSWORD"] = props["NEO4J_PASSWORD"]
#os.environ["NEO4J_DATABASE"] = props["NEO4J_DATABASE"]


In [56]:
neo4j_driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI"),
    auth=(os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))
)

In [57]:
neo4j_driver.verify_connectivity()

In [58]:
llm = OpenAILLM(
    model_name="gpt-4o",
    api_key=os.environ["OPENAI_KEY"],
    model_params={
        "temperature": 0,
        "response_format": {"type": "json_object"},
    }
)

In [59]:
embedder = OpenAIEmbeddings(
    api_key=os.environ["OPENAI_KEY"],
    model="text-embedding-ada-002"
    )

In [60]:
#using text splitter for creating data chunks
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter

text_splitter = FixedSizeSplitter(chunk_size=500, chunk_overlap=50)

#Loading data from PDFs One file at a time

In [61]:
from typing import Union, Dict, Optional
from pathlib import Path
import re
from fsspec.spec import AbstractFileSystem

class CustomPDFLoader(PdfLoader):
    async def run(self,filepath: Union[str, Path], metadata: Optional[Dict[str, str]] = None,fs: Optional[Union[AbstractFileSystem, str]] = None,) -> PdfDocument:

        pdf_document = await super().run(filepath, metadata, fs)

        # Process the PDF document
        # remove asciidoc attribute lines like :id:
        pdf_document.text = re.sub(r':*:.*\n?', '', pdf_document.text, flags=re.MULTILINE)

        return pdf_document



In [62]:
data_loader = CustomPDFLoader()

In [63]:
NODE_TYPES = [
    "Job Title",
    "Skills",
    "Roles",
    {
        "label": "Responsibilities",
        "description": "descriptive details about the responsibilities handled"
    },
    {
        "label": "Awards",
        "description": "Awarded with"
    }
  ]
RELATIONSHIP_TYPES = [
    "RESPONSIBILITIES_HANDLED",
    "AWARDED_WITH"
]

PATTERNS = [
    ("Roles", "RESPONSIBILITIES_HANDLED", "Responsibilities"),
    ("Job Title", "AWARDED_WITH", "Awards"),

   ]


In [64]:
domain_instructions = (
    "Only extract entities that are related to the Job"
    "These include companies, Job Title, responsiblities in Job, Awards given"
    "\n"
)

prompt_template = ERExtractionTemplate(
    template = domain_instructions + ERExtractionTemplate.DEFAULT_TEMPLATE
)


In [65]:
kg_builder = SimpleKGPipeline(
    llm=llm,
    driver=neo4j_driver,
    neo4j_database=os.getenv("NEO4J_DATABASE"),
    embedder=embedder,
    prompt_template=prompt_template,
    from_pdf=True,
    pdf_loader=data_loader,
    schema={
        "node_types": NODE_TYPES,
        "relationship_types": RELATIONSHIP_TYPES,
        "patterns": PATTERNS
    },
)

In [66]:
pdf_file = "Datafiles/11121498.pdf"
doc = asyncio.run(data_loader.run(pdf_file))
#print(doc.text)

In [67]:
print(f"Processing {pdf_file}")
result = asyncio.run(kg_builder.run_async(file_path=pdf_file))
print(result.result)

Processing Datafiles/11121498.pdf
{'resolver': {'number_of_nodes_to_resolve': 39, 'number_of_created_nodes': 6}}
