In [None]:
import json
import os
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode, Dataset
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, split_uri

from openai import OpenAI
import re

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import pandas as pd

In [2]:
# Load API credentials from the local JSON file named 'config'.
# Expected structure (one entry per key read below):
# {"openai_api_key": "...", "gemini_api_key": "...", "xai_api_key": "...",
#  "nvidia_api_key": "...", "deepseek_api_key": "...", "claude_api_key": "...",
#  "dashscope_api_key": "..."}

# A context manager guarantees the file handle is closed once the JSON is parsed
# (the handle was previously left open and its name reused for the parsed dict).
with open('config', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Expose every credential as an environment variable; a missing entry raises
# KeyError here rather than failing later inside an API call.
# (`OpenAI()` is constructed further down without an explicit key.)
os.environ['OPENAI_API_KEY'] = config['openai_api_key']
os.environ['GEMINI_API_KEY'] = config['gemini_api_key']
os.environ['XAI_API_KEY'] = config['xai_api_key']
os.environ['NVIDIA_API_KEY'] = config['nvidia_api_key']
os.environ['DEEPSEEK_API_KEY'] = config['deepseek_api_key']
# Note the naming difference: the config entry is 'claude_api_key'.
os.environ['ANTHROPIC_API_KEY'] = config['claude_api_key']
os.environ['DASHSCOPE_API_KEY'] = config['dashscope_api_key']

In [3]:
def load_graph(data):
    """Parse Turtle-serialised RDF text into a new rdflib graph.

    Args:
        data: RDF content in Turtle syntax, handed to ``Graph.parse(data=...)``.

    Returns:
        rdflib.Graph: A freshly created graph holding the parsed triples.
    """
    parsed_graph = rdflib.Graph()
    parsed_graph.parse(format="turtle", data=data)
    return parsed_graph

In [4]:
def print_rdf(rdf):
    """Print every triple found in a Turtle document.

    Args:
        rdf: RDF content in Turtle syntax.

    Each triple is printed on its own line as subject, predicate and object,
    separated by single spaces.
    """
    parsed_graph = rdflib.Graph()
    parsed_graph.parse(format="turtle", data=rdf)

    # Iterating a graph yields (subject, predicate, object) tuples.
    for triple in parsed_graph:
        print(*triple)

In [5]:
# Namespaces
# Each name below is an rdflib Namespace bound to the base IRI of one vocabulary,
# so that terms can be built as `prefix.term` or `prefix["term"]`.

# Project namespace for the soil-health knowledge graph
she = Namespace("https://soilwise-he.github.io/soil-health#")
# FAO vocabularies
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc/")
agrontology = Namespace("http://aims.fao.org/aos/agrontology#")
sio = Namespace("http://semanticscience.org/resource/")
# GloSIS model modules
glosis_lh = Namespace("http://w3id.org/glosis/model/layerhorizon/")
glosis_sp = Namespace("http://w3id.org/glosis/model/siteplot/")
# QUDT schema and unit vocabulary
qudt = Namespace("http://qudt.org/schema/qudt/")
unit = Namespace("http://qudt.org/vocab/unit/")
iso11074 = Namespace("https://data.geoscience.earth/ncl/ISO11074v2025/")
obo = Namespace("http://purl.obolibrary.org/obo/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
biolink = Namespace("https://w3id.org/biolink/vocab/")
# Allotrope ontologies (property / result)
afox = Namespace("http://purl.allotrope.org/ontologies/property#")
afor = Namespace("http://purl.allotrope.org/ontologies/result#")
# SWEET ontology modules
sorelsc = Namespace("http://sweetontology.net/relaSci/")
sorelpr = Namespace("http://sweetontology.net/relaProvenance/")
sohuj = Namespace("http://sweetontology.net/humanJurisdiction/")
sorelph = Namespace("http://sweetontology.net/relaPhysical/")
sorelm = Namespace("http://sweetontology.net/relaMath/")
sorepsg = Namespace("http://sweetontology.net/reprSpaceGeometry/")
bao = Namespace("http://www.bioassayontology.org/bao#")
# NOTE(review): this binding shadows Python's built-in repr() for every cell that
# runs afterwards. Renaming it would require updating all uses of the prefix, so
# it is left unchanged here — confirm whether a rename (e.g. `reproduceme`) is feasible.
repr = Namespace("https://w3id.org/reproduceme#")
# Further SWEET ontology modules
sorelch = Namespace("http://sweetontology.net/relaChemical/")
sorelsp = Namespace("http://sweetontology.net/relaSpace/")
om = Namespace("http://www.ontology-of-units-of-measure.org/resource/om-2/")
# Allotrope ontology (process)
afop = Namespace("http://purl.allotrope.org/ontologies/process#")
gemet = Namespace("http://www.eionet.europa.eu/gemet/concept/")
inrae = Namespace("http://opendata.inrae.fr/thesaurusINRAE/")

### Vocabs or not vocabs

In [7]:
def extract_skos_concepts_with_matches(ttl_file_path, output_csv_path):
    """
    Collect the URIs of all subjects that carry a skos:exactMatch or
    skos:closeMatch statement and write them to a one-column CSV file.

    Args:
        ttl_file_path (str): Location of the Turtle file holding the knowledge graph.
        output_csv_path (str): Destination of the CSV file (header row: concept_uri).

    Returns:
        list[str] | None: The sorted, de-duplicated concept URIs, or None when
        the Turtle file could not be loaded or the CSV file could not be written.
    """
    kg = Graph()
    try:
        kg.parse(ttl_file_path, format='turtle')
        print(f"Successfully loaded {len(kg)} triples from {ttl_file_path}")
    except Exception as e:
        print(f"Error loading TTL file: {e}")
        return None

    skos_ns = Namespace("http://www.w3.org/2004/02/skos/core#")

    # One pass per mapping property; the set removes concepts that have both
    # kinds of match, and blank-node subjects are skipped.
    matched_uris = {
        str(subject)
        for match_property in (skos_ns.exactMatch, skos_ns.closeMatch)
        for subject in kg.subjects(match_property, None)
        if isinstance(subject, URIRef)
    }

    # Sorting gives a stable order in the output file.
    concepts_list = sorted(matched_uris)

    print(f"Found {len(concepts_list)} unique concepts with exactMatch or closeMatch properties")

    try:
        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['concept_uri'])
            writer.writerows([uri] for uri in concepts_list)

        print(f"Successfully saved concept URIs to {output_csv_path}")

    except Exception as e:
        print(f"Error saving CSV file: {e}")
        return None

    return concepts_list

def extract_with_match_details(ttl_file_path, output_csv_path):
    """
    Extract every skos:exactMatch / skos:closeMatch relationship from a Turtle
    file and save one row per relationship to a CSV file.

    Args:
        ttl_file_path (str): Path to the TTL file containing the RDF knowledge graph.
        output_csv_path (str): Path where the CSV file will be saved. Columns:
            concept_uri, match_type ('exactMatch' or 'closeMatch'), matched_uri.

    Returns:
        list[dict] | None: One dict per relationship (exactMatch rows first,
        then closeMatch rows), or None when the Turtle file could not be loaded
        or the CSV file could not be written. A graph without any match yields
        an empty list and a header-only CSV file.
    """
    # Create a graph and load the TTL file
    g = Graph()
    try:
        g.parse(ttl_file_path, format='turtle')
        print(f"Successfully loaded {len(g)} triples from {ttl_file_path}")
    except Exception as e:
        print(f"Error loading TTL file: {e}")
        return None

    skos_ns = Namespace("http://www.w3.org/2004/02/skos/core#")
    match_properties = (
        ('exactMatch', skos_ns.exactMatch),
        ('closeMatch', skos_ns.closeMatch),
    )

    # One record per relationship whose subject is a URI (blank nodes are skipped).
    match_details = [
        {
            'concept_uri': str(subject),
            'match_type': match_type,
            'matched_uri': str(obj),
        }
        for match_type, predicate in match_properties
        for subject, _, obj in g.triples((None, predicate, None))
        if isinstance(subject, URIRef)
    ]

    print(f"Found {len(match_details)} total match relationships")

    # Save detailed information to CSV
    try:
        # Explicit columns keep the frame well-formed when there are no matches.
        # Without them an empty frame has no 'concept_uri' column, and the
        # summary below raised KeyError after the file was already written.
        df = pd.DataFrame(
            match_details,
            columns=['concept_uri', 'match_type', 'matched_uri'],
        )
        df.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"Successfully saved detailed match information to {output_csv_path}")

        # Also print summary statistics
        unique_concepts = df['concept_uri'].nunique()
        exact_matches = len(df[df['match_type'] == 'exactMatch'])
        close_matches = len(df[df['match_type'] == 'closeMatch'])

        print(f"\nSummary:")
        print(f"- Unique concepts with matches: {unique_concepts}")
        print(f"- Total exactMatch relationships: {exact_matches}")
        print(f"- Total closeMatch relationships: {close_matches}")

    except Exception as e:
        print(f"Error saving CSV file: {e}")
        return None

    return match_details

# Example usage: runs when this cell is executed as the main module; the names
# assigned here become module-level variables.
if __name__ == "__main__":
    # Basic version - just concept URIs
    ttl_file = "soil_health_KG.ttl"  # Replace with your TTL file path
    output_csv = "skos_concepts_with_matches.csv"
    
    # `concepts` is the sorted URI list, or None if loading or saving failed.
    concepts = extract_skos_concepts_with_matches(ttl_file, output_csv)
    
    # Detailed version - with match information
    # Uncomment the lines below if you want detailed match information
    # detailed_output_csv = "skos_concepts_detailed_matches.csv"
    # match_details = extract_with_match_details(ttl_file, detailed_output_csv)

Successfully loaded 10990 triples from soil_health_KG.ttl
Found 494 unique concepts with exactMatch or closeMatch properties
Successfully saved concept URIs to skos_concepts_with_matches.csv


In [11]:
def analyze_uri_sets(input_csv_path, output_dir="./"):
    """
    Compare the URIs in the 'keywords' and 'thesauri' columns of a CSV file and
    write each set-operation result to its own CSV file.

    Four files are written inside ``output_dir``: uri_union.csv,
    uri_intersection.csv, uri_keywords_only.csv and uri_thesauri_only.csv.
    Each contains a single 'URI' column with the values in sorted order.

    Args:
        input_csv_path (str): Path to the input CSV file; it must contain the
            columns 'keywords' and 'thesauri'.
        output_dir (str): Directory to save output files. A trailing path
            separator is optional.

    Returns:
        None. Progress, counts and errors are reported with print().
    """
    # Read the CSV file
    try:
        df = pd.read_csv(input_csv_path)
        print(f"Successfully loaded CSV with {len(df)} rows")
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # Check if required columns exist
    if 'keywords' not in df.columns or 'thesauri' not in df.columns:
        print("Error: Required columns 'keywords' and 'thesauri' not found in CSV")
        return

    # The columns may differ in length, so empty (NaN) cells are dropped first.
    keywords_set = set(df['keywords'].dropna().astype(str))
    thesauri_set = set(df['thesauri'].dropna().astype(str))

    print(f"Number of unique URIs in keywords column: {len(keywords_set)}")
    print(f"Number of unique URIs in thesauri column: {len(thesauri_set)}")

    # Perform set operations
    sets_data = {
        'union': keywords_set | thesauri_set,
        'intersection': keywords_set & thesauri_set,
        'keywords_only': keywords_set - thesauri_set,
        'thesauri_only': thesauri_set - keywords_set,
    }

    print(f"\nSet operation results:")
    print(f"Union (all unique URIs): {len(sets_data['union'])}")
    print(f"Intersection (URIs in both columns): {len(sets_data['intersection'])}")
    print(f"Keywords only: {len(sets_data['keywords_only'])}")
    print(f"Thesauri only: {len(sets_data['thesauri_only'])}")

    # Save each set to a separate CSV file
    failed_files = []
    for set_name, uri_set in sets_data.items():
        # os.path.join adds the separator when output_dir has no trailing one;
        # plain concatenation turned "out" into "outuri_union.csv".
        filename = os.path.join(output_dir, f"uri_{set_name}.csv")
        try:
            pd.DataFrame({'URI': sorted(uri_set)}).to_csv(filename, index=False)
            print(f"Saved {len(uri_set)} URIs to {filename}")
        except Exception as e:
            print(f"Error saving {filename}: {e}")
            failed_files.append(filename)

    # Only claim success when every file was actually written.
    if failed_files:
        print(f"\nFailed to save {len(failed_files)} of {len(sets_data)} files to directory: {output_dir}")
    else:
        print(f"\nAll files saved successfully to directory: {output_dir}")

# Example usage: runs when this cell is executed as the main module.
if __name__ == "__main__":
    # Replace with your actual file paths
    # The input CSV must provide the columns 'keywords' and 'thesauri'.
    input_file = "matched_concepts.csv"
    output_directory = "./"  # Current directory, change as needed
    
    # Writes uri_union.csv, uri_intersection.csv, uri_keywords_only.csv and
    # uri_thesauri_only.csv and prints the size of each set.
    analyze_uri_sets(input_file, output_directory)

Successfully loaded CSV with 684 rows
Number of unique URIs in keywords column: 683
Number of unique URIs in thesauri column: 494

Set operation results:
Union (all unique URIs): 788
Intersection (URIs in both columns): 389
Keywords only: 294
Thesauri only: 105
Saved 788 URIs to ./uri_union.csv
Saved 389 URIs to ./uri_intersection.csv
Saved 294 URIs to ./uri_keywords_only.csv
Saved 105 URIs to ./uri_thesauri_only.csv

All files saved successfully to directory: ./


#### LLM-as-a-judge

In [7]:
# System prompt for the entity-extraction step: the model is asked to find every
# entity mentioned in a user query, keep its original wording, and supply a
# canonical term for it.
# NOTE(review): the last line of the prompt requests a JSON *array* of objects,
# whereas the `response_format` schema passed to the chat-completion call in this
# notebook requires a JSON *object* with `user_query` and `extracted_entities` —
# confirm which is intended and align the two.
system_prompt_ee = """You are a highly specialized entity extraction agent. Your primary function is to analyze a user's natural language query and extract all mentions of entities (people, organizations, dates, products, technical concepts, geographical locations, etc.). The ultimate goal is to use these entities to sample a subgraph from a domain knowledge graph.

Your task is to identify entities and provide a standardized canonical term for each entity that would exist in a formal knowledge graph.

**# Extraction & Standardization Rules**

1. **Identify Entities:** Carefully read the user's query and identify all possible entities.
2. **Preserve Original Text:** For each entity found, you must capture the exact text as it appeared in the query.
3. **Standardize Term:** For each entity, you must also provide a standardized, canonical term. This term should be the most common or formal name for the entity, as one might find in a knowledge base or encyclopedia.

**# Output Format**

Your output MUST be a valid JSON array of objects."""

In [8]:
# User message for the entity-extraction call: embeds one competency question
# inside single quotes.
# NOTE(review): `qa_pairs` is not defined in the cells visible here — presumably a
# list of dicts with a 'competency_question' key that is loaded elsewhere; confirm
# it is defined before this cell on a fresh kernel. The index 23 is hard-coded.
prompt_cq = f"""Now please extract all entities from the following query:
'{qa_pairs[23]['competency_question']}'"""

In [9]:
# No key is passed explicitly to the client constructor.
client = OpenAI()

# Ask the model for entities and constrain the reply with a strict JSON schema:
# an object holding the echoed query plus a list of
# {entity, standardized_term} records.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": system_prompt_ee},
        {"role": "user", "content": prompt_cq},
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "entity_extraction",
            "schema": {
                "type": "object",
                "properties": {
                    "user_query": {
                        "type": "string",
                        "description": "The natural language query provided by the user for which entities need to be extracted.",
                        "minLength": 1,
                    },
                    "extracted_entities": {
                        "type": "array",
                        "description": "A list of entities extracted from the user query.",
                        "items": {
                            "type": "object",
                            "properties": {
                                "entity": {
                                    "type": "string",
                                    "description": "The original entity extracted from the query.",
                                },
                                "standardized_term": {
                                    "type": "string",
                                    "description": "A standardized term corresponding to the original entity, possibly inferred.",
                                },
                            },
                            "required": ["entity", "standardized_term"],
                            "additionalProperties": False,
                        },
                    },
                },
                "required": ["user_query", "extracted_entities"],
                "additionalProperties": False,
            },
            "strict": True,
        },
    },
)

# The reply body is a JSON document serialised as text.
print(completion.choices[0].message.content)

{"user_query":"What is SOM and what is it made from, and how to convert measured SOC to SOM?","extracted_entities":[{"entity":"SOM","standardized_term":"Soil Organic Matter"},{"entity":"SOC","standardized_term":"Soil Organic Carbon"}]}


In [10]:
# Decode the JSON reply and show only the list of
# {'entity': ..., 'standardized_term': ...} records.
print(json.loads(completion.choices[0].message.content)["extracted_entities"])

[{'entity': 'SOM', 'standardized_term': 'Soil Organic Matter'}, {'entity': 'SOC', 'standardized_term': 'Soil Organic Carbon'}]
