In [1]:
import json
import os
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode, Dataset
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, split_uri

from openai import OpenAI
from langchain_community.graphs import RdfGraph
import re

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import pandas as pd

In [2]:
# Load API credentials from the local JSON file `config` and export them as
# environment variables. The file must contain one entry per key read below:
# {"openai_api_key": "...", "gemini_api_key": "...", "xai_api_key": "...",
#  "nvidia_api_key": "...", "deepseek_api_key": "...", "claude_api_key": "...",
#  "dashscope_api_key": "..."}

# A context manager closes the file handle deterministically, and the handle
# no longer shares a name with the parsed dictionary.
with open('config', 'r') as config_file:
    config = json.load(config_file)

os.environ['OPENAI_API_KEY'] = config['openai_api_key']
os.environ['GEMINI_API_KEY'] = config['gemini_api_key']
os.environ['XAI_API_KEY'] = config['xai_api_key']
os.environ['NVIDIA_API_KEY'] = config['nvidia_api_key']
os.environ['DEEPSEEK_API_KEY'] = config['deepseek_api_key']
os.environ['ANTHROPIC_API_KEY'] = config['claude_api_key']
os.environ['DASHSCOPE_API_KEY'] = config['dashscope_api_key']

In [3]:
# Read the competency-question benchmark from disk.
with open('benchmarks/CQs_SPARQL_ea.json', 'r') as benchmark_file:
    qa = json.load(benchmark_file)

# Keep a direct handle on the list of question/query entries.
qa_pairs = qa['qa_pairs']

In [4]:
def load_graph(data):
    """Parse Turtle text held in memory into a new rdflib graph.

    Args:
        data: Turtle-serialized RDF, passed to ``Graph.parse(data=...)``.

    Returns:
        The newly created ``rdflib.Graph`` populated from ``data``.
    """
    parsed_graph = rdflib.Graph()
    parsed_graph.parse(format="turtle", data=data)
    return parsed_graph

In [5]:
def print_rdf(rdf):
    """Print every triple of a Turtle document, one triple per line.

    Args:
        rdf: Turtle-serialized RDF, passed to ``Graph.parse(data=...)``.
    """
    triples = rdflib.Graph()
    triples.parse(format="turtle", data=rdf)

    for triple in triples:
        # Unpacking prints subject, predicate and object separated by spaces.
        print(*triple)

In [6]:
# Namespaces
# rdflib Namespace objects for the vocabularies referenced in this notebook.
# Each one is bound to a short module-level name.
she = Namespace("https://soilwise-he.github.io/soil-health#")
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc/")
agrontology = Namespace("http://aims.fao.org/aos/agrontology#")
sio = Namespace("http://semanticscience.org/resource/")
glosis_lh = Namespace("http://w3id.org/glosis/model/layerhorizon/")
glosis_sp = Namespace("http://w3id.org/glosis/model/siteplot/")
qudt = Namespace("http://qudt.org/schema/qudt/")
unit = Namespace("http://qudt.org/vocab/unit/")
iso11074 = Namespace("https://data.geoscience.earth/ncl/ISO11074v2025/")
obo = Namespace("http://purl.obolibrary.org/obo/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
biolink = Namespace("https://w3id.org/biolink/vocab/")
afox = Namespace("http://purl.allotrope.org/ontologies/property#")
afor = Namespace("http://purl.allotrope.org/ontologies/result#")
sorelsc = Namespace("http://sweetontology.net/relaSci/")
sorelpr = Namespace("http://sweetontology.net/relaProvenance/")
sohuj = Namespace("http://sweetontology.net/humanJurisdiction/")
sorelph = Namespace("http://sweetontology.net/relaPhysical/")
sorelm = Namespace("http://sweetontology.net/relaMath/")
sorepsg = Namespace("http://sweetontology.net/reprSpaceGeometry/")
bao = Namespace("http://www.bioassayontology.org/bao#")
# NOTE(review): this rebinds the name of the builtin `repr()` for the rest of
# the notebook; any later `repr(x)` call would fail. Consider renaming
# once all usages are known.
repr = Namespace("https://w3id.org/reproduceme#")
sorelch = Namespace("http://sweetontology.net/relaChemical/")
sorelsp = Namespace("http://sweetontology.net/relaSpace/")
om = Namespace("http://www.ontology-of-units-of-measure.org/resource/om-2/")
afop = Namespace("http://purl.allotrope.org/ontologies/process#")
gemet = Namespace("http://www.eionet.europa.eu/gemet/concept/")
inrae = Namespace("http://opendata.inrae.fr/thesaurusINRAE/")

### QA over soil health KG by RAG

#### Entity extraction from query

In [7]:
# System prompt for the entity-extraction call: asks the model to return each
# entity as it appears in the query together with a standardized term.
system_prompt_ee = """You are a highly specialized entity extraction agent. Your primary function is to analyze a user's natural language query and extract all mentions of entities (people, organizations, dates, products, technical concepts, geographical locations, etc.). The ultimate goal is to use these entities to sample a subgraph from a domain knowledge graph.

Your task is to identify entities and provide a standardized canonical term for each entity that would exist in a formal knowledge graph.

**# Extraction & Standardization Rules**

1. **Identify Entities:** Carefully read the user's query and identify all possible entities.
2. **Preserve Original Text:** For each entity found, you must capture the exact text as it appeared in the query.
3. **Standardize Term:** For each entity, you must also provide a standardized, canonical term. This term should be the most common or formal name for the entity, as one might find in a knowledge base or encyclopedia.

**# Output Format**

Your output MUST be a valid JSON array of objects."""

In [8]:
# User message for entity extraction, built from the second benchmark entry.
extraction_question = qa_pairs[1]['competency_question']
prompt_cq = (
    "Now please extract all entities from the following query:\n"
    f"'{extraction_question}'"
)

In [9]:
# JSON schema for the structured response: the echoed query plus a list of
# {entity, standardized_term} objects.
entity_extraction_schema = {
    "type": "object",
    "properties": {
        "user_query": {
            "type": "string",
            "description": "The natural language query provided by the user for which entities need to be extracted.",
            "minLength": 1,
        },
        "extracted_entities": {
            "type": "array",
            "description": "A list of entities extracted from the user query.",
            "items": {
                "type": "object",
                "properties": {
                    "entity": {
                        "type": "string",
                        "description": "The original entity extracted from the query.",
                    },
                    "standardized_term": {
                        "type": "string",
                        "description": "A standardized term corresponding to the original entity, possibly inferred.",
                    },
                },
                "required": ["entity", "standardized_term"],
                "additionalProperties": False,
            },
        },
    },
    "required": ["user_query", "extracted_entities"],
    "additionalProperties": False,
}

client = OpenAI()

# Request a schema-constrained completion for the entity-extraction prompt.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": system_prompt_ee},
        {"role": "user", "content": prompt_cq},
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "entity_extraction",
            "schema": entity_extraction_schema,
            "strict": True,
        },
    },
)

print(completion.choices[0].message.content)

{"user_query":"What knowledge is needed in order to manage soil sustainably and protect it?","extracted_entities":[{"entity":"manage soil","standardized_term":"Soil Management"},{"entity":"protect it","standardized_term":"Soil Protection"}]}


In [10]:
# Decode the model's JSON reply and show only the list of extracted entities.
extraction_reply = json.loads(completion.choices[0].message.content)
print(extraction_reply["extracted_entities"])

[{'entity': 'manage soil', 'standardized_term': 'Soil Management'}, {'entity': 'protect it', 'standardized_term': 'Soil Protection'}]


#### Sample subgraph from soil health KG

In [11]:
def _extract_search_terms(json_string: str):
    """Return the lower-cased entity terms in ``json_string``, or None on bad input.

    The payload is first parsed as strict JSON. Only if that fails is the
    single-quote -> double-quote rewrite attempted, so valid JSON containing
    apostrophes (e.g. "farmer's field") is no longer corrupted by the rewrite.
    """
    try:
        payload = json.loads(json_string)
    except json.JSONDecodeError:
        try:
            # Fallback for payloads that use single quotes as string delimiters.
            payload = json.loads(json_string.replace("'", "\""))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return None

    if not isinstance(payload, dict) or "extracted_entities" not in payload:
        print("Error decoding JSON: missing 'extracted_entities' key")
        return None

    search_terms = set()
    for item in payload["extracted_entities"]:
        if 'entity' in item:
            search_terms.add(item['entity'].lower())
        if 'standardized_term' in item:
            search_terms.add(item['standardized_term'].lower())
    return search_terms


def sample_subgraph_from_json(json_string: str, turtle_file_path: str) -> str:
    """
    Samples a subgraph from an RDF graph based on terms in a JSON object,
    with special handling for blank nodes and ignoring mapping predicates.

    A node is matched when one of its skos:prefLabel / skos:altLabel literals
    equals (case-insensitively) an extracted entity or standardized term. The
    sampled subgraph contains:
      * every outgoing triple of a matched node,
      * every incoming triple whose subject is not a blank node,
      * one level of outgoing triples for blank nodes reached from a matched node,
    always skipping skos:exactMatch and skos:closeMatch.

    Args:
        json_string: A string containing the JSON data with entities and terms
            (an object with an "extracted_entities" list).
        turtle_file_path: The file path to the RDF knowledge graph in Turtle format.

    Returns:
        A string representing the sampled subgraph in Turtle format. Returns ""
        when the JSON or the Turtle file cannot be parsed, and a "# ..." comment
        string when the file is missing or no node matches.
    """
    # --- Step 1: Get all terms from JSON and convert to lowercase ---
    search_terms = _extract_search_terms(json_string)
    if search_terms is None:
        return ""

    print(f"Searching for terms: {search_terms}\n")

    # --- Step 2: Load the knowledge graph from file and find matched nodes ---
    try:
        if not os.path.exists(turtle_file_path):
            return f"# Error: File not found at '{turtle_file_path}'"
        g = Graph()
        g.parse(source=turtle_file_path, format="turtle")
    except Exception as e:
        print(f"Error parsing Turtle file: {e}")
        return ""

    matched_nodes = set()
    for label_property in (SKOS.prefLabel, SKOS.altLabel):
        # Pattern lookup visits only label triples instead of the whole graph.
        for s, _, o in g.triples((None, label_property, None)):
            # str(o) is the lexical form, so non-string literal values cannot
            # raise AttributeError the way `o.value.lower()` could.
            if isinstance(o, Literal) and str(o).lower() in search_terms:
                print(f"Found match: Subject <{s}> has label '{o}'")
                matched_nodes.add(s)

    if not matched_nodes:
        print("No matching nodes found in the knowledge graph.")
        return "# No matching nodes found."

    print(f"\nMatched nodes: {[str(node) for node in matched_nodes]}\n")

    # --- Step 3: Sample the subgraph with special blank node and predicate handling ---
    subgraph = Graph()
    for prefix, namespace in g.namespace_manager.namespaces():
        subgraph.bind(prefix, namespace)

    # Mapping predicates are excluded from the sample.
    ignored_predicates = {SKOS.exactMatch, SKOS.closeMatch}
    b_nodes_to_expand = set()

    for node in matched_nodes:
        # Outgoing triples of the matched node.
        for s, p, o in g.triples((node, None, None)):
            if p in ignored_predicates:
                continue
            subgraph.add((s, p, o))
            # Blank-node objects are expanded one level further below.
            if isinstance(o, BNode):
                b_nodes_to_expand.add(o)

        # Incoming triples are only considered for URI nodes, and triples
        # whose subject is a blank node are skipped.
        if isinstance(node, URIRef):
            for s, p, o in g.triples((None, None, node)):
                if isinstance(s, BNode) or p in ignored_predicates:
                    continue
                subgraph.add((s, p, o))

    # Expand the collected blank nodes one level deeper.
    if b_nodes_to_expand:
        print(f"Expanding blank nodes: {[str(bnode) for bnode in b_nodes_to_expand]}\n")
        for bnode in b_nodes_to_expand:
            for s, p, o in g.triples((bnode, None, None)):
                if p not in ignored_predicates:
                    subgraph.add((s, p, o))

    # --- Step 4: Return the subgraph in Turtle format ---
    return subgraph.serialize(format="turtle")


In [15]:
# Path of the Turtle knowledge graph that the subgraph is sampled from.
ttl_file_name = "soil_health_KG.ttl"

# Sample around the entities contained in the model's JSON reply.
sampled_turtle = sample_subgraph_from_json(
    json_string=completion.choices[0].message.content,
    turtle_file_path=ttl_file_name,
)

# Show the sampled subgraph.
print("--- Sampled Subgraph (Turtle Format) ---")
print(sampled_turtle)

Searching for terms: {'soil protection', 'protect it', 'manage soil', 'soil management'}

Found match: Subject <https://soilwise-he.github.io/soil-health#SoilProtection> has label 'soil protection'
Found match: Subject <https://soilwise-he.github.io/soil-health#SoilManagement> has label 'soil management'

Matched nodes: ['https://soilwise-he.github.io/soil-health#SoilManagement', 'https://soilwise-he.github.io/soil-health#SoilProtection']

--- Sampled Subgraph (Turtle Format) ---
@prefix af-x: <http://purl.allotrope.org/ontologies/property#> .
@prefix agrontology: <http://aims.fao.org/aos/agrontology#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix she: <https://soilwise-he.github.io/soil-health#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

she:DynamicSoilParameters she:characterises she:SoilManagement .

she:SoilScreeningValues obo:RO_0002502 she:SoilProtection .

she:FilterFunction agrontology:isPartOf she:SoilP

In [16]:
# --- Configuration ---
# Define the new namespace for your human-readable terms.
# NOTE: this is the same IRI as the lowercase `she` namespace bound earlier in
# the notebook; both names refer to equivalent Namespace values.
SHE = Namespace("https://soilwise-he.github.io/soil-health#")

# File paths for mapping files (relative to the working directory).
# main() reads column index 1 as the URI and column index 2 as the label.
CLASSES_CSV_PATH = 'ontologies/classes.csv'
PROPERTIES_CSV_PATH = 'ontologies/properties.csv'

# --- Helper Functions for Case Conversion ---

def to_pascal_case(text: str) -> str:
    """Turn a label into PascalCase.

    The label is split on runs of whitespace, underscores and hyphens. Each
    piece is passed through ``str.capitalize`` (first letter upper-cased, the
    rest lower-cased) and the pieces are concatenated.
    Example: "soil health" -> "SoilHealth"

    Returns "" for any non-string input.
    """
    if isinstance(text, str):
        words = re.split(r'[\s_-]+', text)
        return ''.join(map(str.capitalize, words))
    return ""

def to_camel_case(text: str) -> str:
    """Converts a string to camelCase.
    Example: "has measurement" -> "hasMeasurement"

    The text is split on runs of whitespace, underscores and hyphens. Empty
    pieces (produced by leading or trailing separators) are discarded so the
    first real word is always the lower-cased one; previously a leading
    separator yielded PascalCase (" has measurement" -> "HasMeasurement").

    Returns "" for non-string input or when the text contains no words.
    """
    if not isinstance(text, str):
        return ""
    # re.split never returns an empty list, so filter out empty pieces
    # explicitly to make the "no words" guard below meaningful.
    parts = [part for part in re.split(r'[\s_-]+', text) if part]
    if not parts:
        return ""
    return parts[0].lower() + ''.join(word.capitalize() for word in parts[1:])

# --- Main Transformation Logic ---

def transform_rdf_graph(input_turtle_string: str, class_map: dict, prop_map: dict) -> str:
    """
    Rewrite the URIs of a Turtle graph according to two lookup tables.

    Subjects are looked up in ``class_map``. Predicates other than rdf:type
    are looked up in ``prop_map``. Objects are looked up in ``class_map``
    first and then in ``prop_map``. Nodes that are not URIs, or that have no
    mapping, are copied unchanged.

    Args:
        input_turtle_string: Turtle text of the graph to transform.
        class_map: Maps URI strings to their replacement nodes.
        prop_map: Maps property URI strings to their replacement nodes.

    Returns:
        The transformed graph as a Turtle string, or "" if the input cannot
        be parsed.
    """
    def _remap(node, *lookups):
        # Return the first mapping found for a URI node; otherwise the node itself.
        if isinstance(node, URIRef):
            key = str(node)
            for lookup in lookups:
                if key in lookup:
                    return lookup[key]
        return node

    # 1. Parse the input into a source graph; results go into a separate graph.
    source_graph = Graph()
    result_graph = Graph()

    print("Parsing input Turtle string...")
    try:
        source_graph.parse(data=input_turtle_string, format="turtle")
        print(f"Successfully parsed. Found {len(source_graph)} triples.")
    except Exception as e:
        print(f"Error parsing the Turtle string: {e}")
        return ""

    # Bind 'she' first, then carry over every prefix known to the source graph.
    result_graph.bind("she", SHE)
    for prefix, namespace in source_graph.namespace_manager.namespaces():
        result_graph.bind(prefix, namespace)

    # 2. Copy each triple, substituting mapped URIs.
    print("Transforming URIs...")
    for subj, pred, obj in source_graph:
        # rdf:type is never rewritten, even if it appears in prop_map.
        mapped_pred = pred if pred == RDF.type else _remap(pred, prop_map)
        result_graph.add((
            _remap(subj, class_map),
            mapped_pred,
            _remap(obj, class_map, prop_map),
        ))

    # 3. Serialize the result graph and hand it back as text.
    print("Serialization complete!")
    return result_graph.serialize(format="turtle")


def main():
    """
    Load the class/property URI mappings and transform the sampled subgraph.

    Reads the two mapping CSVs (no header row; column index 1 is used as the
    URI and column index 2 as the label), builds URI -> SHE-term dictionaries,
    applies them to the module-level ``sampled_turtle`` and prints the result.

    Returns:
        The transformed Turtle string, or None when a mapping file is missing
        or cannot be read.
    """
    def _build_mapping(frame, convert_label):
        # One entry per CSV row: original URI -> SHE term derived from the label.
        return {row[1]: SHE[convert_label(row[2])] for _, row in frame.iterrows()}

    # Bail out early when either mapping file is absent.
    required_files = [CLASSES_CSV_PATH, PROPERTIES_CSV_PATH]
    if any(not os.path.exists(path) for path in required_files):
        print("Error: One or more required files are missing.")
        print(f"Please ensure '{CLASSES_CSV_PATH}' and '{PROPERTIES_CSV_PATH}' are in the same directory as this script.")
        return

    # 1. Load mappings from CSV files
    print("Loading URI mappings from CSV files...")
    try:
        classes_df = pd.read_csv(CLASSES_CSV_PATH, header=None, usecols=[1, 2])
        properties_df = pd.read_csv(PROPERTIES_CSV_PATH, header=None, usecols=[1, 2])

        # Classes become PascalCase terms, properties camelCase terms.
        class_mappings = _build_mapping(classes_df, to_pascal_case)
        property_mappings = _build_mapping(properties_df, to_camel_case)

        print(f"Loaded {len(class_mappings)} class mappings and {len(property_mappings)} property mappings.")

    except FileNotFoundError as e:
        print(f"Error: Could not find the CSV file - {e.filename}")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV files: {e}")
        return

    # 2. Transform the sampled subgraph with the loaded mappings.
    transformed_turtle_string = transform_rdf_graph(sampled_turtle, class_mappings, property_mappings)

    # 3. Print and return the transformed Turtle text.
    print("\n--- Transformed Turtle Data ---")
    print(transformed_turtle_string)

    return transformed_turtle_string

# Run the transformation and keep its result in `sampled_subgraph`, which the
# RAG prompt cell below interpolates. main() returns None on its error paths.
if __name__ == "__main__":
    sampled_subgraph = main()

Loading URI mappings from CSV files...
Loaded 21 class mappings and 206 property mappings.
Parsing input Turtle string...
Successfully parsed. Found 26 triples.
Transforming URIs...
Serialization complete!

--- Transformed Turtle Data ---
@prefix she: <https://soilwise-he.github.io/soil-health#> .

she:DynamicSoilParameters she:characterises she:SoilManagement .

she:SoilScreeningValues she:dependsOn she:SoilProtection .

she:FilterFunction she:isPartOf she:SoilProtection .

she:ProductionService she:isPartOf she:SoilProtection .

she:SoilDwellingOrganismHabitat she:isPartOf she:SoilProtection .

she:SoilManagement a she:Concept ;
    she:preferredLabel "soil management"@en ;
    she:source she:Vogel2020 .

she:SoilThreats she:isPreventedBy she:SoilProtection .

she:SoilProtection a she:Concept ;
    she:hasPart she:FilterFunction,
        she:ProductionService,
        she:SoilDwellingOrganismHabitat ;
    she:hasScope she:ArableCroppingSystems,
        she:GroundwaterBodies,
        

#### RAG based on sampled graph

In [17]:
# System prompt for the RAG step: the model must answer only from the supplied
# Turtle segment, cite the triples used, and reply with a fixed refusal
# sentence when the segment lacks the answer.
# The backslashes in the "Source:" example line are written as `\\_` so the
# string keeps its literal `\_` text without relying on an invalid escape
# sequence (which triggers a warning at compile time).
system_prompt_rag = """### Knowledge Graph Question Answering

**Your Role:** You are a highly specialized Question Answering AI. Your sole purpose is to answer a user's query based *only* on the information contained within a given knowledge graph segment. You must not use any external knowledge or information you were trained on.

**Instructions:**

1. **Analyze the User Query:** First, carefully examine the user's question to identify the key entities and the specific relationship or piece of information being asked for.
2. **Consult the Knowledge Graph:** You will be provided with a knowledge graph segment serialized in **Turtle (TTL) format**. This is your **only** source of truth.
3. **Synthesize the Answer:**
   * If you can find the information by traversing the relationships in the provided graph, formulate a concise, natural-language answer based directly on those facts.
   * Cite the specific triple(s) you used to derive the answer, referencing the subject, predicate, and object.
4. **Handle Missing Information:**
   * If the knowledge graph segment does not contain the entities or relationships needed to answer the query, you **must** respond with the following exact phrase: `The provided knowledge graph does not cover this information, therefore I cannot answer.`
   * Do not apologize, attempt to search elsewhere, or use your general knowledge.

**Example 1: Information Found**

**Input:**

* **User Query:** "Who directed the movie Inception?"
* **Knowledge Graph Segment:**
  ```turtle
  @prefix ex: <http://example.org/entity/> .
  @prefix prop: <http://example.org/property/> .
  
  ex:Inception prop:has_director ex:Christopher_Nolan ;
               prop:has_genre "Science Fiction" .
  
  ex:Christopher_Nolan prop:born_in "London" .
  ```

**Correct Output:**

Christopher Nolan directed the movie Inception.

Source: ex:Inception prop:has\\_director ex:Christopher\\_Nolan

**Example 2: Information Not Found**

**Input:**

* **User Query:** "What is the budget of the movie Inception?"
* **Knowledge Graph Segment:**
  ```turtle
  @prefix ex: <http://example.org/entity/> .
  @prefix prop: <http://example.org/property/> .
  
  ex:Inception prop:has_director ex:Christopher_Nolan ;
               prop:has_genre "Science Fiction" .
  
  ex:Christopher_Nolan prop:born_in "London" .
  ```

**Correct Output:**

The provided knowledge graph does not cover this information, therefore I cannot answer."""

In [18]:
# User message for the RAG call: the second benchmark question followed by
# the transformed subgraph. Note the trailing space after the comma on the
# question line.
rag_question = qa_pairs[1]['competency_question']
prompt_rag = (
    "Now please answer the following question:\n"
    f"'{rag_question}', \n"
    "based on the provided knowledge graph segment:\n"
    f"'{sampled_subgraph}'"
)

In [19]:
client = OpenAI()

# Conversation for the RAG step: grounding instructions, then the question
# together with the sampled knowledge-graph segment.
rag_messages = [
    {"role": "system", "content": system_prompt_rag},
    {"role": "user", "content": prompt_rag},
]

# NOTE: this rebinds `completion`, which the subgraph-sampling cell also reads.
completion = client.chat.completions.create(model="gpt-4o", messages=rag_messages)

print(completion.choices[0].message.content)

To manage soil sustainably and protect it, knowledge is needed about Critical Levels at Preventive Activity, Critical Levels at Restorative Activity, Critical Levels at Soil Functions, and the State of Soils.

Source: she:SoilProtection she:requiresKnowledgeAbout she:CriticalLevelAtPreventiveActivity, she:CriticalLevelAtRestorativeActivity, she:CriticalLevelAtSoilFunctions, she:StateOfSoils.


### QA over soil health KG by SPARQL

In [39]:
# Wrap the full Turtle knowledge graph in LangChain's RdfGraph so it can be
# queried with SPARQL and asked for its schema in the cells below.
graph_all = RdfGraph(
    source_file="soil_health_KG.ttl",
    serialization="ttl",
)

In [40]:
# Re-read the benchmark and pick the reference SPARQL query of the first entry.
with open('benchmarks/CQs_SPARQL_ea.json', 'r') as benchmark_file:
    qa = json.load(benchmark_file)

first_entry = qa['qa_pairs'][0]
query = first_entry['sparql_query']

print(query)

PREFIX agrontology: <http://aims.fao.org/aos/agrontology#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?soilLabel ?predicate ?objectLabel
WHERE {
  ?soil skos:prefLabel "soils"@en ;
        skos:prefLabel ?soilLabel . 

  {
    ?soil agrontology:isComposedOf ?object .
    BIND(agrontology:isComposedOf AS ?predicate)
    ?object skos:prefLabel ?objectLabel .
  }
  UNION
  {
    ?soilFunctions skos:prefLabel "soil functions"@en ;
                   skos:prefLabel ?objectLabel . 
    ?soil ?predicate ?soilFunctions .
  }
}

ORDER BY ?predicate ?objectLabel


In [41]:
# Execute the SPARQL query and print one comma-separated line per result row.
for result_row in graph_all.query(query):
    fields = (result_row.soilLabel, result_row.predicate, result_row.objectLabel)
    print(", ".join(f"{field}" for field in fields))

soils, http://aims.fao.org/aos/agrontology#isComposedOf, air
soils, http://aims.fao.org/aos/agrontology#isComposedOf, living organisms
soils, http://aims.fao.org/aos/agrontology#isComposedOf, mineral compounds
soils, http://aims.fao.org/aos/agrontology#isComposedOf, organic compounds
soils, http://aims.fao.org/aos/agrontology#isComposedOf, water
soils, http://aims.fao.org/aos/agrontology#performs, soil functions
soils, http://semanticscience.org/resource/SIO_000064, soil functions


In [43]:
# Print the schema description exposed by the graph wrapper; `get_schema` is
# read as an attribute here, not called.
print(graph_all.get_schema)

In the following, each IRI is followed by the local name and optionally its description in parentheses. 
The RDF graph supports the following node types:
<http://www.w3.org/2004/02/skos/core#Concept> (Concept, None), <http://semanticscience.org/resource/SIO_000510> (SIO_000510, None), <http://purl.org/dc/terms/BibliographicResource> (BibliographicResource, None), <http://semanticscience.org/resource/SIO_000999> (SIO_000999, None), <http://sweetontology.net/humanJurisdiction/Regulation> (Regulation, None), <https://schema.org/Country> (Country, None), <https://schema.org/City> (City, None), <http://purl.org/dc/terms/Policy> (Policy, None), <http://purl.obolibrary.org/obo/NCIT_C61419> (NCIT_C61419, None), <http://qudt.org/schema/qudt/Unit> (Unit, None), <http://sweetontology.net/reprSpaceGeometry/Region> (Region, None), <http://semanticscience.org/resource/SIO_000368> (SIO_000368, None), <http://purl.obolibrary.org/obo/NCIT_C80234> (NCIT_C80234, None), <http://purl.org/dc/terms/Standard>