In [25]:
import json
import os
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode, Dataset
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, split_uri

from openai import OpenAI
import re

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import pandas as pd

import time
from typing import Dict, Any, Optional
import logging

In [2]:
# Load API credentials from the local JSON file named `config`.
# The file must contain one entry per provider key read below:
# {
#   "openai_api_key": "...", "gemini_api_key": "...", "xai_api_key": "...",
#   "nvidia_api_key": "...", "deepseek_api_key": "...", "claude_api_key": "...",
#   "dashscope_api_key": "..."
# }
# A missing entry raises KeyError here, before any API client is created.

# Use a context manager so the file handle is closed as soon as parsing is done.
with open('config', 'r') as config_file:
    config = json.load(config_file)

# Export the keys as environment variables for the SDK clients used later.
os.environ['OPENAI_API_KEY'] = config['openai_api_key']
os.environ['GEMINI_API_KEY'] = config['gemini_api_key']
os.environ['XAI_API_KEY'] = config['xai_api_key']
os.environ['NVIDIA_API_KEY'] = config['nvidia_api_key']
os.environ['DEEPSEEK_API_KEY'] = config['deepseek_api_key']
os.environ['ANTHROPIC_API_KEY'] = config['claude_api_key']
os.environ['DASHSCOPE_API_KEY'] = config['dashscope_api_key']

In [3]:
def load_graph(data):
    """Parse Turtle-serialized RDF text into a fresh rdflib graph and return it."""
    graph = rdflib.Graph()
    graph.parse(data=data, format="turtle")
    return graph

In [4]:
def print_rdf(rdf):
    """Parse Turtle-serialized RDF text and print one `subject predicate object` line per triple."""
    graph = rdflib.Graph()
    graph.parse(data=rdf, format="turtle")

    # Each item yielded by the graph is a (subject, predicate, object) tuple.
    for triple in graph:
        print(*triple)

In [5]:
# Namespaces
# Module-level rdflib Namespace objects, one per external vocabulary/ontology
# base IRI. Attribute or item access on a Namespace yields a URIRef under that base.

# Project namespace for the soil-health knowledge graph.
she = Namespace("https://soilwise-he.github.io/soil-health#")
# FAO (aims.fao.org) namespaces.
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc/")
agrontology = Namespace("http://aims.fao.org/aos/agrontology#")
sio = Namespace("http://semanticscience.org/resource/")
# GloSIS model namespaces (layer-horizon and site-plot modules).
glosis_lh = Namespace("http://w3id.org/glosis/model/layerhorizon/")
glosis_sp = Namespace("http://w3id.org/glosis/model/siteplot/")
# QUDT schema and unit vocabulary.
qudt = Namespace("http://qudt.org/schema/qudt/")
unit = Namespace("http://qudt.org/vocab/unit/")
iso11074 = Namespace("https://data.geoscience.earth/ncl/ISO11074v2025/")
obo = Namespace("http://purl.obolibrary.org/obo/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
biolink = Namespace("https://w3id.org/biolink/vocab/")
# Allotrope namespaces (property / result; the process namespace is bound further down).
afox = Namespace("http://purl.allotrope.org/ontologies/property#")
afor = Namespace("http://purl.allotrope.org/ontologies/result#")
# sweetontology.net namespaces.
sorelsc = Namespace("http://sweetontology.net/relaSci/")
sorelpr = Namespace("http://sweetontology.net/relaProvenance/")
sohuj = Namespace("http://sweetontology.net/humanJurisdiction/")
sorelph = Namespace("http://sweetontology.net/relaPhysical/")
sorelm = Namespace("http://sweetontology.net/relaMath/")
sorepsg = Namespace("http://sweetontology.net/reprSpaceGeometry/")
bao = Namespace("http://www.bioassayontology.org/bao#")
# NOTE(review): this rebinds the name `repr`, shadowing Python's builtin repr()
# for every cell executed after this one. Renaming it would require updating
# any cell that references the namespace by this name.
repr = Namespace("https://w3id.org/reproduceme#")
sorelch = Namespace("http://sweetontology.net/relaChemical/")
sorelsp = Namespace("http://sweetontology.net/relaSpace/")
om = Namespace("http://www.ontology-of-units-of-measure.org/resource/om-2/")
afop = Namespace("http://purl.allotrope.org/ontologies/process#")
gemet = Namespace("http://www.eionet.europa.eu/gemet/concept/")
inrae = Namespace("http://opendata.inrae.fr/thesaurusINRAE/")

### Vocabs or not vocabs

In [7]:
def extract_skos_concepts_with_matches(ttl_file_path, output_csv_path):
    """
    Collect the URIs of all subjects that carry a skos:exactMatch or
    skos:closeMatch statement and write them, sorted, to a one-column CSV.

    Args:
        ttl_file_path (str): Path to the Turtle file holding the knowledge graph.
        output_csv_path (str): Destination path of the CSV (header `concept_uri`).

    Returns:
        list[str] | None: The sorted, de-duplicated concept URIs, or None when
        the Turtle file cannot be parsed or the CSV cannot be written (the
        error is printed in both cases).
    """
    graph = Graph()
    try:
        graph.parse(ttl_file_path, format='turtle')
        print(f"Successfully loaded {len(graph)} triples from {ttl_file_path}")
    except Exception as load_error:
        print(f"Error loading TTL file: {load_error}")
        return None

    skos_ns = Namespace("http://www.w3.org/2004/02/skos/core#")

    # One pass per mapping predicate; blank-node subjects are ignored and the
    # set removes concepts that carry both kinds of match.
    matched_uris = {
        str(subject)
        for predicate in (skos_ns.exactMatch, skos_ns.closeMatch)
        for subject in graph.subjects(predicate, None)
        if isinstance(subject, URIRef)
    }

    # Sorted so that repeated runs produce identical files.
    concepts_list = sorted(matched_uris)

    print(f"Found {len(concepts_list)} unique concepts with exactMatch or closeMatch properties")

    try:
        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['concept_uri'])
            writer.writerows([uri] for uri in concepts_list)

        print(f"Successfully saved concept URIs to {output_csv_path}")

    except Exception as save_error:
        print(f"Error saving CSV file: {save_error}")
        return None

    return concepts_list

def extract_with_match_details(ttl_file_path, output_csv_path):
    """
    Extract every skos:exactMatch / skos:closeMatch statement whose subject is
    a URI, and save one CSV row per statement.

    Args:
        ttl_file_path (str): Path to the Turtle file holding the knowledge graph.
        output_csv_path (str): Destination path of the CSV. Columns are
            `concept_uri`, `match_type` ('exactMatch' or 'closeMatch') and
            `matched_uri`.

    Returns:
        list[dict] | None: One dict per match (same keys as the CSV columns),
        exactMatch entries first. An empty list is returned when the graph
        contains no matches (a header-only CSV is still written). None is
        returned when the Turtle file cannot be parsed or the CSV cannot be
        written; the error is printed in both cases.
    """
    columns = ['concept_uri', 'match_type', 'matched_uri']

    graph = Graph()
    try:
        graph.parse(ttl_file_path, format='turtle')
        print(f"Successfully loaded {len(graph)} triples from {ttl_file_path}")
    except Exception as e:
        print(f"Error loading TTL file: {e}")
        return

    skos_ns = Namespace("http://www.w3.org/2004/02/skos/core#")
    match_predicates = (
        ('exactMatch', skos_ns.exactMatch),
        ('closeMatch', skos_ns.closeMatch),
    )

    match_details = []
    for match_type, predicate in match_predicates:
        for subject, _, obj in graph.triples((None, predicate, None)):
            # Skip blank-node subjects: only named concepts are reported.
            if isinstance(subject, URIRef):
                match_details.append({
                    'concept_uri': str(subject),
                    'match_type': match_type,
                    'matched_uri': str(obj)
                })

    print(f"Found {len(match_details)} total match relationships")

    # Passing the column names explicitly keeps the frame well-formed when no
    # matches were found; without them the column lookups below raise KeyError,
    # which used to be misreported as a CSV save failure.
    df = pd.DataFrame(match_details, columns=columns)

    try:
        df.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"Successfully saved detailed match information to {output_csv_path}")
    except Exception as e:
        print(f"Error saving CSV file: {e}")
        return

    # Summary statistics
    unique_concepts = df['concept_uri'].nunique()
    exact_matches = int((df['match_type'] == 'exactMatch').sum())
    close_matches = int((df['match_type'] == 'closeMatch').sum())

    print("\nSummary:")
    print(f"- Unique concepts with matches: {unique_concepts}")
    print(f"- Total exactMatch relationships: {exact_matches}")
    print(f"- Total closeMatch relationships: {close_matches}")

    return match_details

# Example usage
# Runs the basic extraction when this cell is executed as the main module;
# the names assigned here (ttl_file, output_csv, concepts) become globals.
if __name__ == "__main__":
    # Basic version - just concept URIs
    ttl_file = "soil_health_KG.ttl"  # Replace with your TTL file path
    output_csv = "skos_concepts_with_matches.csv"
    
    # `concepts` is the sorted URI list, or None if loading/saving failed.
    concepts = extract_skos_concepts_with_matches(ttl_file, output_csv)
    
    # Detailed version - with match information
    # Uncomment the lines below if you want detailed match information
    # detailed_output_csv = "skos_concepts_detailed_matches.csv"
    # match_details = extract_with_match_details(ttl_file, detailed_output_csv)

Successfully loaded 10990 triples from soil_health_KG.ttl
Found 494 unique concepts with exactMatch or closeMatch properties
Successfully saved concept URIs to skos_concepts_with_matches.csv


In [11]:
def analyze_uri_sets(input_csv_path, output_dir="./"):
    """
    Compare the URIs in the `keywords` and `thesauri` columns of a CSV and
    write one CSV per set-operation result.

    Four files are written into `output_dir`, each with a single sorted `URI`
    column: `uri_union.csv`, `uri_intersection.csv`, `uri_keywords_only.csv`
    and `uri_thesauri_only.csv`.

    Args:
        input_csv_path (str): Path to the input CSV file. It must contain the
            columns `keywords` and `thesauri`; empty cells are ignored.
        output_dir (str): Existing directory to save the output files in. A
            trailing path separator is optional.

    Returns:
        None. Progress, counts and errors are printed. The function returns
        early (after printing the reason) if the CSV cannot be read or a
        required column is missing.
    """
    try:
        df = pd.read_csv(input_csv_path)
        print(f"Successfully loaded CSV with {len(df)} rows")
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    if 'keywords' not in df.columns or 'thesauri' not in df.columns:
        print("Error: Required columns 'keywords' and 'thesauri' not found in CSV")
        return

    # The two columns have different numbers of filled cells, so drop the
    # NaN padding before building the sets.
    keywords_set = set(df['keywords'].dropna().astype(str))
    thesauri_set = set(df['thesauri'].dropna().astype(str))

    print(f"Number of unique URIs in keywords column: {len(keywords_set)}")
    print(f"Number of unique URIs in thesauri column: {len(thesauri_set)}")

    sets_data = {
        'union': keywords_set | thesauri_set,
        'intersection': keywords_set & thesauri_set,
        'keywords_only': keywords_set - thesauri_set,
        'thesauri_only': thesauri_set - keywords_set
    }

    print("\nSet operation results:")
    print(f"Union (all unique URIs): {len(sets_data['union'])}")
    print(f"Intersection (URIs in both columns): {len(sets_data['intersection'])}")
    print(f"Keywords only: {len(sets_data['keywords_only'])}")
    print(f"Thesauri only: {len(sets_data['thesauri_only'])}")

    failed_files = []
    for set_name, uri_set in sets_data.items():
        # os.path.join inserts the separator when output_dir lacks a trailing
        # one; plain concatenation turned "out" into "outuri_union.csv".
        filename = os.path.join(output_dir, f"uri_{set_name}.csv")
        try:
            pd.DataFrame({'URI': sorted(uri_set)}).to_csv(filename, index=False)
            print(f"Saved {len(uri_set)} URIs to {filename}")
        except Exception as e:
            failed_files.append(filename)
            print(f"Error saving {filename}: {e}")

    # Only claim success when every file was actually written.
    if failed_files:
        print(f"\n{len(failed_files)} of {len(sets_data)} files could not be saved to directory: {output_dir}")
    else:
        print(f"\nAll files saved successfully to directory: {output_dir}")

# Example usage
# Runs the set comparison when this cell is executed as the main module;
# input_file and output_directory become globals.
if __name__ == "__main__":
    # Replace with your actual file paths
    # The input CSV must provide the 'keywords' and 'thesauri' columns.
    input_file = "matched_concepts.csv"
    output_directory = "./"  # Current directory, change as needed
    
    analyze_uri_sets(input_file, output_directory)

Successfully loaded CSV with 684 rows
Number of unique URIs in keywords column: 683
Number of unique URIs in thesauri column: 494

Set operation results:
Union (all unique URIs): 788
Intersection (URIs in both columns): 389
Keywords only: 294
Thesauri only: 105
Saved 788 URIs to ./uri_union.csv
Saved 389 URIs to ./uri_intersection.csv
Saved 294 URIs to ./uri_keywords_only.csv
Saved 105 URIs to ./uri_thesauri_only.csv

All files saved successfully to directory: ./


#### LLM-as-a-judge

In [44]:
# System prompt for the LLM judge. It defines the two classes ("Vocabulary" /
# "Un-vocabulary"), the criteria for each, and asks for a 0-1 confidence score
# and a JSON-only reply. The text is passed to the model as-is, so any edit to
# the wording below changes the classification behaviour.
system_prompt_voc = """You are an expert AI assistant specializing in soil science and controlled vocabulary development. Your task is to analyze a given term and determine if it is suitable for inclusion in a formal, standardized soil science vocabulary.

**Your Goal:**
Classify each term you receive into one of two categories: "Vocabulary" or "Un-vocabulary".

**Definitions and Rules:**

1. **"Vocabulary" Term:**
   
   * Represents a standardized, reusable, and generic concept *within the domain of soil science*.
   * It is often a general concept that can have specific instances, values, or measurements.
   * It can be singular or plural.
   * Abbreviations or standard acronyms that refer directly to those concepts (e.g. `SOC`, `DDT`).
   * It should be a noun or a noun phrase that is broadly recognized and used in soil science literature, without evaluative or descriptive adjectives (avoid “high”, “moderate”, “low”, etc.).
   * *Examples of Vocabulary Terms:* `soil organic carbon`, `cation exchange capacity`, `soil texture`, `bulk density`, `soil horizon`, `parent material`, `silt loam`.
2. **"Un-vocabulary" Term:**
   A term is classified as "Un-vocabulary" if it meets **any** of the following criteria:
   
   * **Evaluative/descriptive instances:** It represents a specific *measurement*, *qualitative state*, or *quantitative description* of a vocabulary term (e.g. “moderate soil organic carbon content”, “high bulk density”, “poor CEC”).
   * **Too broad or out of scope:** The term is a generic concept that is not specific to soil science and lacks a direct, unique meaning within the domain (e.g. “time”, “location” when unqualified).
   * **Context-specific phrases:** The term is phrased as a statement or sentence fragment rather than a standardized standalone noun concept (e.g. “agricultural area under severe erosion”).
3. **Confidence:**
   
   * Provide a confidence score between 0 and 1 reflecting how certain you are in your Vocabulary/Un-vocabulary decision.

**Output Format:**
For every term you are given, you MUST respond in the strict JSON format. Do not add any extra conversation or pleasantries."""

In [None]:
# Set up logging: INFO level with "<timestamp> - <level> - <message>" records.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers, so re-running this cell will not change an existing configuration.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by TermProcessor below.
logger = logging.getLogger(__name__)

class TermProcessor:
    """
    Classify the terms of a CSV file with an LLM judge and store the verdicts.

    Each input row is expected to hold, by position, a URL, a preferred label
    and an alternative label. For every row with a usable label the model is
    asked for a structured verdict (`term`, `is_vocab_term`,
    `confidence_score`), which is written to an output CSV together with the
    original row data and the row position.
    """

    # Structured-output schema sent with every request so that the reply is a
    # JSON object with exactly these three fields.
    _RESPONSE_FORMAT = {
        "type": "json_schema",
        "json_schema": {
            "name": "soil_vocab_review",
            "schema": {
                "type": "object",
                "properties": {
                    "term": {
                        "type": "string",
                        "description": "The specific term being evaluated, exactly as input."
                    },
                    "is_vocab_term": {
                        "type": "boolean",
                        "description": "Whether the term should be included in the controlled vocabulary for soil science."
                    },
                    "confidence_score": {
                        "type": "number",
                        "description": "Confidence score of the judgement, from 0 to 1 (inclusive).",
                        "minimum": 0,
                        "maximum": 1
                    }
                },
                "required": [
                    "term",
                    "is_vocab_term",
                    "confidence_score"
                ],
                "additionalProperties": False
            },
            "strict": True
        }
    }

    def __init__(self, system_prompt: str, user_prompt_template: str, model: str = "gpt-4.1"):
        """
        Initialize the processor with prompts

        Args:
            system_prompt: The system prompt (unchanged for all calls)
            user_prompt_template: Template for user prompt with {term} placeholder
            model: Name of the chat-completions model to query
        """
        self.client = OpenAI()
        self.system_prompt = system_prompt
        self.user_prompt_template = user_prompt_template
        self.model = model

    def get_term_from_row(self, row: pd.Series) -> str:
        """
        Extract the term from a CSV row (preferred label, fallback to alternative label)

        Args:
            row: Pandas Series representing a CSV row, read by position as
                (URL, preferred_label, alternative_label)

        Returns:
            The stripped term to process, or "" when both labels are missing or blank
        """
        preferred_label = row.iloc[1] if len(row) > 1 else ""
        alternative_label = row.iloc[2] if len(row) > 2 else ""

        # First non-missing, non-blank label wins.
        for label in (preferred_label, alternative_label):
            if pd.notna(label) and str(label).strip():
                return str(label).strip()
        return ""

    @staticmethod
    def _row_fields(row: pd.Series) -> Dict[str, Any]:
        """Return the original row data (by position) that is copied into every result record."""
        return {
            'original_url': row.iloc[0] if len(row) > 0 else "",
            'preferred_label': row.iloc[1] if len(row) > 1 else "",
            'alternative_label': row.iloc[2] if len(row) > 2 else "",
        }

    def call_llm_api(self, term: str) -> Optional[Dict[str, Any]]:
        """
        Call the LLM API with the given term

        Args:
            term: The term to evaluate

        Returns:
            Parsed JSON response from the LLM, or None if the request or the
            parsing failed (the error is logged, never raised)
        """
        try:
            # Create the user prompt with the term
            prompt_voc = self.user_prompt_template.format(term=term)

            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": prompt_voc}
                ],
                response_format=self._RESPONSE_FORMAT
            )

            # Parse the JSON response
            response_content = completion.choices[0].message.content
            return json.loads(response_content)

        except Exception as e:
            # Deliberately best-effort: one failing term must not abort the batch.
            logger.error(f"Error calling LLM API for term '{term}': {str(e)}")
            return None

    def process_csv(self, input_csv_path: str, output_csv_path: str,
                   delay_seconds: float = 1.0, resume_from_row: int = 0):
        """
        Process the CSV file and generate results

        Rows without a usable label are skipped. Failed API calls are recorded
        with empty verdict fields and an `error` entry. Results are saved every
        10 records and once more at the end.

        Args:
            input_csv_path: Path to input CSV file
            output_csv_path: Path to output CSV file
            delay_seconds: Delay between API calls to respect rate limits
            resume_from_row: Row number to resume from (0-indexed). Existing
                results with a smaller `row_index` are kept; rows from this
                position onwards are processed again and replace older entries.
        """
        # Read the input CSV
        try:
            df = pd.read_csv(input_csv_path)
            logger.info(f"Loaded CSV with {len(df)} rows")
        except Exception as e:
            logger.error(f"Error reading CSV file: {str(e)}")
            return

        # Prepare results list
        results = []

        # Load existing results if resuming
        if resume_from_row > 0:
            try:
                existing_df = pd.read_csv(output_csv_path)
                # Keep only rows that will not be re-processed; otherwise the
                # output ends up with duplicate entries for the same row.
                if 'row_index' in existing_df.columns:
                    existing_df = existing_df[existing_df['row_index'] < resume_from_row]
                results = existing_df.to_dict('records')
                logger.info(f"Resuming from row {resume_from_row}, loaded {len(results)} existing results")
            except (FileNotFoundError, pd.errors.EmptyDataError):
                logger.warning(f"Output file {output_csv_path} not found, starting fresh")
                resume_from_row = 0

        # Process each row starting from resume_from_row; `actual_idx` is the
        # 0-based position of the row in the input file.
        remaining_rows = df.iloc[resume_from_row:]
        for actual_idx, (_, row) in enumerate(remaining_rows.iterrows(), start=resume_from_row):
            # Get the term from the row
            term = self.get_term_from_row(row)

            if not term:
                logger.warning(f"Row {actual_idx}: No valid term found, skipping")
                continue

            logger.info(f"Processing row {actual_idx}: '{term}'")

            # Call LLM API
            result = self.call_llm_api(term)

            if result:
                # Add original row data to the result
                result.update(self._row_fields(row))
                result['row_index'] = actual_idx

                results.append(result)
                logger.info(f"Row {actual_idx}: Success - is_vocab_term: {result['is_vocab_term']}, confidence: {result['confidence_score']}")
            else:
                # Add error entry
                error_result = {
                    'term': term,
                    'is_vocab_term': None,
                    'confidence_score': None,
                    **self._row_fields(row),
                    'row_index': actual_idx,
                    'error': 'API call failed'
                }
                results.append(error_result)
                logger.error(f"Row {actual_idx}: Failed to process term '{term}'")

            # Save results periodically (every 10 rows)
            if len(results) % 10 == 0:
                self.save_results(results, output_csv_path)
                logger.info(f"Saved intermediate results ({len(results)} rows)")

            # Delay between API calls
            if delay_seconds > 0:
                time.sleep(delay_seconds)

        # Save final results
        self.save_results(results, output_csv_path)
        logger.info(f"Processing complete. Results saved to {output_csv_path}")

        # Print summary. Records reloaded from CSV carry NaN (not None) for a
        # missing verdict, so test with pd.notna instead of `is not None`.
        verdicts = [r.get('is_vocab_term') for r in results]
        successful_calls = sum(1 for v in verdicts if pd.notna(v))
        failed_calls = len(results) - successful_calls
        vocab_terms = sum(1 for v in verdicts if pd.notna(v) and bool(v))

        logger.info(f"Summary: {successful_calls} successful, {failed_calls} failed, {vocab_terms} vocab terms identified")

    def save_results(self, results: list, output_csv_path: str):
        """Save results to CSV file; a write error is logged instead of raised"""
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(output_csv_path, index=False)
        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")

def main():
    """Classify the terms of the configured input CSV with the judge prompt and save the verdicts."""
    # User message template; {term} is filled in per row. The trailing newline
    # and indentation are part of the text sent to the model.
    user_prompt_template = """Now please determine if the following term is a vocabulary term or un-vocabulary term: {term}
    """

    # Run settings: adjust the paths, the pause between API calls (seconds)
    # and the resume position (0 = start from the first row) as needed.
    run_settings = {
        "input_csv_path": "ontovocabs/soil_health_KG.csv",
        "output_csv_path": "llm_results.csv",
        "delay_seconds": 1.0,
        "resume_from_row": 0,
    }

    term_processor = TermProcessor(system_prompt_voc, user_prompt_template)
    term_processor.process_csv(**run_settings)

# Start the classification run when this cell is executed as the main module.
# Each processed row triggers one API request.
if __name__ == "__main__":
    main()

2025-07-14 15:46:11,361 - INFO - Loaded CSV with 2 rows
2025-07-14 15:46:11,363 - INFO - Processing row 0: 'soil screening values'
2025-07-14 15:46:12,336 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 15:46:12,336 - INFO - Row 0: Success - is_vocab_term: True, confidence: 0.93
2025-07-14 15:46:13,345 - INFO - Processing row 1: 'soil phosphorus loss'
2025-07-14 15:46:13,988 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 15:46:13,991 - INFO - Row 1: Success - is_vocab_term: True, confidence: 0.93
2025-07-14 15:46:14,998 - INFO - Processing complete. Results saved to test_results.csv
2025-07-14 15:46:14,999 - INFO - Summary: 2 successful, 0 failed, 2 vocab terms identified
