In [37]:
import json
import os
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode, Dataset
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, OWL, split_uri
from rdflib.collection import Collection
from datetime import datetime

from openai import OpenAI
from pydantic import BaseModel
import re

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import pandas as pd

import time
from typing import Dict, Any, Optional, Tuple, List
import logging
import sys
from collections import defaultdict

from pathlib import Path
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# Load the API keys from the local JSON file named `config`.
# The file must contain one entry per provider key read below:
# {"openai_api_key": "...", "gemini_api_key": "...", "xai_api_key": "...",
#  "nvidia_api_key": "...", "deepseek_api_key": "...", "claude_api_key": "...",
#  "dashscope_api_key": "..."}
# A missing entry raises KeyError here, before any API client is created.

# Use a context manager so the file handle is closed as soon as the JSON is parsed.
with open('config', 'r') as config_file:
    config = json.load(config_file)

# Export each key under the environment-variable name set below.
os.environ['OPENAI_API_KEY'] = config['openai_api_key']
os.environ['GEMINI_API_KEY'] = config['gemini_api_key']
os.environ['XAI_API_KEY'] = config['xai_api_key']
os.environ['NVIDIA_API_KEY'] = config['nvidia_api_key']
os.environ['DEEPSEEK_API_KEY'] = config['deepseek_api_key']
os.environ['ANTHROPIC_API_KEY'] = config['claude_api_key']
os.environ['DASHSCOPE_API_KEY'] = config['dashscope_api_key']

In [3]:
def load_graph(data):
    """Parse a Turtle document given as a string and return it as a new rdflib Graph."""
    graph = Graph()
    graph.parse(format="turtle", data=data)
    return graph

In [4]:
def print_rdf(rdf):
    """Parse a Turtle string and print every triple as `subject predicate object`."""
    parsed = Graph()
    parsed.parse(format="turtle", data=rdf)

    # Each triple is a 3-tuple, so unpacking it prints s, p and o space-separated.
    for triple in parsed:
        print(*triple)

In [5]:
# Namespaces
# rdflib Namespace objects for the vocabularies/ontologies referenced in this
# notebook. Attribute access on a Namespace (e.g. `agrontology.hasAbbreviation`)
# builds the full URI by appending the attribute name to the base IRI below.
she = Namespace("https://soilwise-he.github.io/soil-health#")
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc/")
agrontology = Namespace("http://aims.fao.org/aos/agrontology#")
sio = Namespace("http://semanticscience.org/resource/")
glosis_lh = Namespace("http://w3id.org/glosis/model/layerhorizon/")
glosis_sp = Namespace("http://w3id.org/glosis/model/siteplot/")
qudt = Namespace("http://qudt.org/schema/qudt/")
unit = Namespace("http://qudt.org/vocab/unit/")
iso11074 = Namespace("https://data.geoscience.earth/ncl/ISO11074v2025/")
obo = Namespace("http://purl.obolibrary.org/obo/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
biolink = Namespace("https://w3id.org/biolink/vocab/")
afox = Namespace("http://purl.allotrope.org/ontologies/property#")
afor = Namespace("http://purl.allotrope.org/ontologies/result#")
sorelsc = Namespace("http://sweetontology.net/relaSci/")
sorelpr = Namespace("http://sweetontology.net/relaProvenance/")
sohuj = Namespace("http://sweetontology.net/humanJurisdiction/")
sorelph = Namespace("http://sweetontology.net/relaPhysical/")
sorelm = Namespace("http://sweetontology.net/relaMath/")
sorepsg = Namespace("http://sweetontology.net/reprSpaceGeometry/")
bao = Namespace("http://www.bioassayontology.org/bao#")
# NOTE(review): this name shadows Python's built-in `repr()` for the rest of the
# kernel session; any later cell calling `repr(obj)` will hit this Namespace
# instead. Consider renaming (e.g. `reproduceme`) once all usages are known.
repr = Namespace("https://w3id.org/reproduceme#")
sorelch = Namespace("http://sweetontology.net/relaChemical/")
sorelsp = Namespace("http://sweetontology.net/relaSpace/")
om = Namespace("http://www.ontology-of-units-of-measure.org/resource/om-2/")
gemet = Namespace("http://www.eionet.europa.eu/gemet/concept/")
inrae = Namespace("http://opendata.inrae.fr/thesaurusINRAE/")

### Vocabs or not vocabs

In [7]:
def extract_skos_concepts_to_csv(ttl_file_path, output_csv_path):
    """
    Dump every skos:Concept of a Turtle file into a three-column CSV.

    Each output row holds the concept URI, one skos:prefLabel (the first one
    the graph yields, or an empty string when there is none) and all
    skos:altLabel plus agrontology:hasAbbreviation values joined with ';'.
    Rows are sorted by URI.

    Args:
        ttl_file_path (str): Path to the input TTL file
        output_csv_path (str): Path to the output CSV file

    Returns:
        int: Number of concepts written.
    """
    graph = Graph()
    graph.parse(ttl_file_path, format='turtle')

    # Prefix bindings only matter for serialisation; kept as in the original flow.
    graph.bind("skos", SKOS)
    graph.bind("agrontology", agrontology)

    rows = []
    for concept in graph.subjects(predicate=RDF.type, object=SKOS.Concept):
        # First prefLabel wins; fall back to "" when the concept has none.
        label = next(
            (str(candidate) for candidate in graph.objects(concept, SKOS.prefLabel)),
            "",
        )

        # altLabels first, then abbreviations, in the order the graph yields them.
        alternatives = [str(alt) for alt in graph.objects(concept, SKOS.altLabel)]
        alternatives.extend(
            str(abbrev) for abbrev in graph.objects(concept, agrontology.hasAbbreviation)
        )

        rows.append([str(concept), label, ";".join(alternatives)])

    # Stable ordering by URI so repeated exports are comparable.
    rows.sort(key=lambda entry: entry[0])

    with open(output_csv_path, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(['URI', 'prefLabel', 'altLabel_abbreviation'])
        out.writerows(rows)

    total = len(rows)
    print(f"Extracted {total} SKOS concepts to {output_csv_path}")
    return total

# Driver for the extraction above. In a notebook kernel `__name__` is
# "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    # Replace with your actual file paths
    ttl_file = "soil_health_KG.ttl"
    csv_output = "shkg.csv"
    
    try:
        # Export all SKOS concepts of the TTL file to the CSV file
        count = extract_skos_concepts_to_csv(ttl_file, csv_output)
        
        print(f"Successfully processed {count} concepts")
        
    # NOTE(review): this catches every exception (missing file, Turtle syntax
    # error, unwritable output path, ...) and prints an rdflib installation
    # hint regardless of the actual cause; the error is not re-raised.
    except Exception as e:
        print(f"Error processing file: {e}")
        print("Make sure you have rdflib installed: pip install rdflib")

Extracted 1785 SKOS concepts to shkg.csv
Successfully processed 1785 concepts


In [7]:
def analyze_uris_from_csv(input_csv_path, output_csv_path):
    """
    Rank the URIs found in a CSV file by how often they occur.

    Every cell of every column is split on commas; each piece that contains
    the substring 'http' is treated as a URI. For each distinct URI the
    function records the total number of occurrences and the set of columns
    it appeared in, then writes the ranking (most frequent first) to
    `output_csv_path` with the columns `uri`, `total_count`, `columns`.

    The output file always carries that header row, even when no URI was
    found, so it can be read back with `pd.read_csv` in either case.

    Args:
        input_csv_path (str): Path to the input CSV file
        output_csv_path (str): Path to save the analysis results

    Returns:
        list[dict] | None: One dict per URI with keys 'uri', 'total_count'
        and 'columns' (column names sorted and joined with ', '), ordered by
        descending count. None when the input file could not be read.
    """
    result_columns = ['uri', 'total_count', 'columns']

    # Read the CSV file
    try:
        df = pd.read_csv(input_csv_path)
        print(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns")
        print(f"Columns: {list(df.columns)}")
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # {uri: {'total_count': int, 'columns': set of column names}}
    uri_stats = defaultdict(lambda: {'total_count': 0, 'columns': set()})

    # Row-major traversal: first-seen order decides the order of equally
    # frequent URIs in the final ranking (the sort below is stable).
    for _, row in df.iterrows():
        for col_name in df.columns:
            raw_value = row[col_name]
            cell_value = str(raw_value)

            # Skip missing values
            if pd.isna(raw_value) or cell_value == 'nan':
                continue

            # A single cell may hold several comma-separated URIs
            for piece in cell_value.split(','):
                uri = piece.strip()
                # Only count pieces that look like a URI (contain 'http')
                if uri and 'http' in uri:
                    uri_stats[uri]['total_count'] += 1
                    uri_stats[uri]['columns'].add(col_name)

    results = [
        {
            'uri': uri,
            'total_count': stats['total_count'],
            'columns': ', '.join(sorted(stats['columns'])),
        }
        for uri, stats in uri_stats.items()
    ]

    # Sort by total count in descending order
    results.sort(key=lambda entry: entry['total_count'], reverse=True)

    # Passing the column names explicitly keeps the header when `results` is empty.
    output_df = pd.DataFrame(results, columns=result_columns)
    output_df.to_csv(output_csv_path, index=False)

    print("\nAnalysis complete!")
    print(f"Total unique URIs found: {len(results)}")
    print(f"Results saved to: {output_csv_path}")

    # Display top 10 results
    print("\nTop 10 most frequent URIs:")
    print("=" * 80)
    for i, result in enumerate(results[:10], 1):
        print(f"{i:2d}. {result['uri']}")
        print(f"    Count: {result['total_count']}, Appears in columns: {result['columns']}")
        print()

    return results

# Run the analysis
# Both paths are relative to the notebook's working directory.
input_file = "candidate_vocabs/candidate_concepts.csv"
output_file = "candidate_vocabs/candidate_concepts_ranking.csv"

# `results` is the ranked list of per-URI dicts, or None if the input CSV
# could not be read (the function prints the error instead of raising).
results = analyze_uris_from_csv(input_file, output_file)

Loaded CSV with 714 rows and 4 columns
Columns: ['keywords', 'thesauri', 'gpt-4o', 'gpt-4.1']

Analysis complete!
Total unique URIs found: 1093
Results saved to: candidate_vocabs/candidate_concepts_ranking.csv

Top 10 most frequent URIs:
 1. https://soilwise-he.github.io/soil-health#MicrobialBiomass
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 2. https://soilwise-he.github.io/soil-health#Leptosols
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 3. https://soilwise-he.github.io/soil-health#Clay
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 4. https://soilwise-he.github.io/soil-health#SoilPollution
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 5. https://soilwise-he.github.io/soil-health#SoilDegradation
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 6. https://soilwise-he.github.io/soil-health#SoilOrganicCarbon
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, 

#### LLM-as-a-judge

In [6]:
# System prompt for the LLM-as-a-judge step: it is passed unchanged as the
# system message of every classification request (see SYSTEM_PROMPT in main()).
# The prompt speaks of "Vocabulary" / "Un-vocabulary" labels, while the JSON
# schema sent with each request encodes the decision as the boolean
# `is_vocab_term` plus a `confidence_score` in [0, 1].
# The text is sent to the model verbatim, so any edit changes model behaviour.
system_prompt_voc = """You are an expert AI assistant specializing in soil science and controlled vocabulary development. Your task is to analyze a given term and determine if it is suitable for inclusion in a formal, standardized soil science vocabulary.

**Your Goal:**
Classify each term you receive into one of two categories: "Vocabulary" or "Un-vocabulary".

**Definitions and Rules:**

1. **"Vocabulary" Term:**
   
   * Represents a standardized, reusable, and generic concept *within the domain of soil science*.
   * It is often a general concept that can have specific instances, values, or measurements.
   * It can be singular or plural.
   * Abbreviations or standard acronyms that refer directly to those concepts (e.g. `SOC`, `DDT`).
   * It should be a noun or a noun phrase that is broadly recognized and used in soil science literature, without evaluative or descriptive adjectives (avoid “high”, “moderate”, “low”, etc.).
   * *Examples of Vocabulary Terms:* `soil organic carbon`, `cation exchange capacity`, `soil texture`, `bulk density`, `soil horizon`, `parent material`, `silt loam`.
2. **"Un-vocabulary" Term:**
   A term is classified as "Un-vocabulary" if it meets **any** of the following criteria:
   
   * **Evaluative/descriptive instances:** It represents a specific *measurement*, *qualitative state*, or *quantitative description* of a vocabulary term (e.g. “moderate soil organic carbon content”, “high bulk density”, “poor CEC”).
   * **Too broad or out of scope:** The term is a generic concept that is not specific to soil science and lacks a direct, unique meaning within the domain (e.g. “time”, “location” when unqualified).
   * **Context-specific phrases:** The term is phrased as a statement or sentence fragment rather than a standardized standalone noun concept (e.g. “agricultural area under severe erosion”).
3. **Confidence:**
   
   * Provide a confidence score between 0 and 1 reflecting how certain you are in your Vocabulary/Un-vocabulary decision.

**Output Format:**
For every term you are given, you MUST respond in the strict JSON format. Do not add any extra conversation or pleasantries."""

In [8]:
# Set up logging
# Root logger at INFO level; each record is rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the TermProcessor methods defined in this cell.
logger = logging.getLogger(__name__)

class TermProcessor:
    """
    Classify the terms of a CSV file with an OpenAI chat model.

    The input CSV is read by position: column 0 is the URL/URI, column 1 the
    preferred label and column 2 the alternative label. For every row one
    request is sent to the model; the JSON answer (`term`, `is_vocab_term`,
    `confidence_score`) is stored together with the original row data and
    written to an output CSV.

    Results are saved every `SAVE_EVERY` collected rows and, additionally,
    whenever processing stops - normally or through an interrupt/exception -
    so a cancelled run never loses the rows already classified.
    """

    # Number of collected results between two intermediate saves.
    SAVE_EVERY = 10

    def __init__(self, system_prompt: str, user_prompt_template: str,
                 model: str = "gpt-4.1"):
        """
        Initialize the processor with prompts

        Args:
            system_prompt: The system prompt (unchanged for all calls)
            user_prompt_template: Template for user prompt with {term} placeholder
            model: Name of the chat model to query (defaults to "gpt-4.1")
        """
        # OpenAI() is constructed without arguments, so credentials are taken
        # from the client's own defaults.
        self.client = OpenAI()
        self.system_prompt = system_prompt
        self.user_prompt_template = user_prompt_template
        self.model = model

    def get_term_from_row(self, row: pd.Series) -> str:
        """
        Extract the term from a CSV row (preferred label, fallback to alternative label)

        Args:
            row: Pandas Series representing a CSV row

        Returns:
            The stripped term to process, or "" when both labels are missing/blank
        """
        # Columns are addressed by position: 1 = preferred label, 2 = alternative label.
        for position in (1, 2):
            if len(row) <= position:
                continue
            candidate = row.iloc[position]
            if pd.notna(candidate) and str(candidate).strip():
                return str(candidate).strip()
        return ""

    def call_llm_api(self, term: str) -> Optional[Dict[str, Any]]:
        """
        Call the LLM API with the given term

        Args:
            term: The term to evaluate

        Returns:
            Parsed JSON response (keys `term`, `is_vocab_term`,
            `confidence_score`) or None if the request or the parsing failed
        """
        try:
            # Create the user prompt with the term
            prompt_voc = self.user_prompt_template.format(term=term)

            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": prompt_voc}
                ],
                # Strict JSON schema: the answer must contain exactly these three fields.
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "soil_vocab_review",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "term": {
                                    "type": "string",
                                    "description": "The specific term being evaluated, exactly as input."
                                },
                                "is_vocab_term": {
                                    "type": "boolean",
                                    "description": "Whether the term should be included in the controlled vocabulary for soil science."
                                },
                                "confidence_score": {
                                    "type": "number",
                                    "description": "Confidence score of the judgement, from 0 to 1 (inclusive).",
                                    "minimum": 0,
                                    "maximum": 1
                                }
                            },
                            "required": [
                                "term",
                                "is_vocab_term",
                                "confidence_score"
                            ],
                            "additionalProperties": False
                        },
                        "strict": True
                    }
                }
            )

            # Parse the JSON response
            response_content = completion.choices[0].message.content
            return json.loads(response_content)

        except Exception as e:
            # Best effort: a failed call is logged and reported as None so the
            # batch continues with the next term.
            logger.error(f"Error calling LLM API for term '{term}': {str(e)}")
            return None

    @staticmethod
    def _row_metadata(row: pd.Series, row_index: int) -> Dict[str, Any]:
        """Return the original row fields (by position) plus the row index."""
        return {
            'original_url': row.iloc[0] if len(row) > 0 else "",
            'preferred_label': row.iloc[1] if len(row) > 1 else "",
            'alternative_label': row.iloc[2] if len(row) > 2 else "",
            'row_index': row_index,
        }

    @staticmethod
    def _summarize(results: list) -> Tuple[int, int, int]:
        """Return (successful, failed, vocab) counts for a list of result dicts."""
        # Results reloaded from CSV carry NaN (not None) for failed rows, so
        # missing values are detected with pd.notna rather than `is not None`.
        verdicts = [r.get('is_vocab_term') for r in results]
        judged = [v for v in verdicts if pd.notna(v)]
        # Compare on the textual form so Python bools, numpy bools and values
        # round-tripped through CSV are all recognised.
        vocab_terms = sum(1 for v in judged if str(v).strip().lower() == 'true')
        return len(judged), len(results) - len(judged), vocab_terms

    def process_csv(self, input_csv_path: str, output_csv_path: str,
                   delay_seconds: float = 1.0, resume_from_row: int = 0):
        """
        Process the CSV file and generate results

        Args:
            input_csv_path: Path to input CSV file
            output_csv_path: Path to output CSV file
            delay_seconds: Delay between API calls to respect rate limits
            resume_from_row: Row number to resume from (0-indexed). Existing
                results in `output_csv_path` are kept, except rows whose
                `row_index` is >= this value: those are processed again and
                would otherwise appear twice.
        """
        # Read the input CSV
        try:
            df = pd.read_csv(input_csv_path)
            logger.info(f"Loaded CSV with {len(df)} rows")
        except Exception as e:
            logger.error(f"Error reading CSV file: {str(e)}")
            return

        # Prepare results list
        results = []

        # Load existing results if resuming
        if resume_from_row > 0:
            try:
                existing_df = pd.read_csv(output_csv_path)
            except FileNotFoundError:
                logger.warning(f"Output file {output_csv_path} not found, starting fresh")
                resume_from_row = 0
            else:
                if 'row_index' in existing_df.columns:
                    # Drop rows that are about to be re-processed; rows without
                    # a row_index (NaN) are kept.
                    already_pending = existing_df['row_index'] >= resume_from_row
                    existing_df = existing_df[~already_pending]
                results = existing_df.to_dict('records')
                logger.info(f"Resuming from row {resume_from_row}, loaded {len(results)} existing results")

        pending_rows = df.iloc[resume_from_row:]

        try:
            # enumerate() yields the 0-based position of each row in the input file.
            for actual_idx, (_, row) in enumerate(pending_rows.iterrows(), start=resume_from_row):
                # Get the term from the row
                term = self.get_term_from_row(row)

                if not term:
                    logger.warning(f"Row {actual_idx}: No valid term found, skipping")
                    continue

                logger.info(f"Processing row {actual_idx}: '{term}'")

                # Call LLM API
                result = self.call_llm_api(term)
                metadata = self._row_metadata(row, actual_idx)

                if result:
                    # Add original row data to the result
                    result.update(metadata)
                    results.append(result)
                    logger.info(f"Row {actual_idx}: Success - is_vocab_term: {result['is_vocab_term']}, confidence: {result['confidence_score']}")
                else:
                    # Add error entry
                    results.append({
                        'term': term,
                        'is_vocab_term': None,
                        'confidence_score': None,
                        **metadata,
                        'error': 'API call failed'
                    })
                    logger.error(f"Row {actual_idx}: Failed to process term '{term}'")

                # Save results periodically
                if len(results) % self.SAVE_EVERY == 0:
                    self.save_results(results, output_csv_path)
                    logger.info(f"Saved intermediate results ({len(results)} rows)")

                # Delay between API calls
                if delay_seconds > 0:
                    time.sleep(delay_seconds)
        finally:
            # Runs on normal completion AND on KeyboardInterrupt/unexpected
            # errors, so everything collected so far reaches the disk.
            self.save_results(results, output_csv_path)

        logger.info(f"Processing complete. Results saved to {output_csv_path}")

        # Print summary
        successful_calls, failed_calls, vocab_terms = self._summarize(results)
        logger.info(f"Summary: {successful_calls} successful, {failed_calls} failed, {vocab_terms} vocab terms identified")

    def save_results(self, results: list, output_csv_path: str):
        """Save results to CSV file; failures are logged, not raised."""
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(output_csv_path, index=False)
        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")

def main():
    """Run the vocabulary classification over the configured input CSV."""
    # Prompts: the system prompt is shared by all requests, the user prompt
    # is filled with one term per request.
    SYSTEM_PROMPT = system_prompt_voc
    USER_PROMPT_TEMPLATE = """Now please determine if the following term is a vocabulary term or un-vocabulary term: {term}
    """

    # Run settings - adjust paths, pacing and resume point here.
    run_settings = {
        "input_csv_path": "ontovocabs/soil_health_KG.csv",  # input file path
        "output_csv_path": "llm_results.csv",               # output file path
        "delay_seconds": 1.0,                               # pause between API calls (rate limits)
        "resume_from_row": 0,                               # row number when resuming an interrupted run
    }

    # Build the processor and classify every row of the input file.
    term_processor = TermProcessor(SYSTEM_PROMPT, USER_PROMPT_TEMPLATE)
    term_processor.process_csv(**run_settings)

if __name__ == "__main__":
    main()

2025-07-14 17:11:37,022 - INFO - Loaded CSV with 1787 rows
2025-07-14 17:11:37,025 - INFO - Processing row 0: 'abiotic environment'
2025-07-14 17:11:37,940 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:37,957 - INFO - Row 0: Success - is_vocab_term: False, confidence: 0.8
2025-07-14 17:11:38,960 - INFO - Processing row 1: 'abundance'
2025-07-14 17:11:39,875 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:39,896 - INFO - Row 1: Success - is_vocab_term: False, confidence: 0.85
2025-07-14 17:11:40,899 - INFO - Processing row 2: 'abundance of species populations'
2025-07-14 17:11:42,676 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:42,690 - INFO - Row 2: Success - is_vocab_term: False, confidence: 0.85
2025-07-14 17:11:43,702 - INFO - Processing row 3: 'acceptable risk levels'
2025-07-14 17:11:44,241 - INFO - HTTP 

In [None]:
# Set up logging
# NOTE(review): same configuration as in the TermProcessor cell. If that cell
# already ran in this kernel, this basicConfig() call presumably has no
# further effect (the root logger is already configured) - confirm, and
# consider moving the logging setup into a single configuration cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the LLMResultsPostProcessor methods below.
logger = logging.getLogger(__name__)

class LLMResultsPostProcessor:
    """
    Interactive filter for the CSV produced by the LLM classification step.

    The CSV is loaded and cleaned on construction. `interactive_process()`
    then prints summary statistics, asks on stdin which verdicts and which
    confidence range to keep, and writes the selection to a new CSV.

    Note: `load_data()` terminates via `sys.exit(1)` when the file cannot be
    loaded or lacks a required column.
    """

    # Columns that must be present and non-null for a row to be usable.
    REQUIRED_COLUMNS = ['term', 'is_vocab_term', 'confidence_score', 'original_url']

    def __init__(self, input_csv_path: str):
        """
        Initialize the post-processor with the LLM results CSV file

        Args:
            input_csv_path: Path to the CSV file containing LLM results
        """
        self.input_csv_path = input_csv_path
        self.df = None
        self.load_data()

    @staticmethod
    def _percentage(part: int, whole: int) -> float:
        """Return part/whole in percent, or 0.0 when `whole` is zero."""
        return part / whole * 100 if whole else 0.0

    @staticmethod
    def _print_banner(title: str):
        """Print a section title framed by two 50-character rules."""
        print("\n" + "="*50)
        print(title)
        print("="*50)

    def load_data(self):
        """Load the CSV data and validate it; exits with status 1 on any error."""
        try:
            self.df = pd.read_csv(self.input_csv_path)
            logger.info(f"Loaded {len(self.df)} rows from {self.input_csv_path}")

            # Validate required columns
            missing_columns = [col for col in self.REQUIRED_COLUMNS if col not in self.df.columns]

            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")

            # Clean the data
            self.clean_data()

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            sys.exit(1)

    def clean_data(self):
        """Drop rows with missing fields or a confidence score outside [0, 1]."""
        original_count = len(self.df)

        # Remove rows with missing essential data
        cleaned = self.df.dropna(subset=self.REQUIRED_COLUMNS)

        # Convert confidence_score to numeric; unparsable values become NaN.
        # assign() builds a new frame, so no column is set on a filtered copy.
        cleaned = cleaned.assign(
            confidence_score=pd.to_numeric(cleaned['confidence_score'], errors='coerce')
        )

        # Remove rows with invalid confidence scores
        cleaned = cleaned.dropna(subset=['confidence_score'])

        # Ensure confidence scores are within valid range [0, 1] (both ends included)
        cleaned = cleaned[cleaned['confidence_score'].between(0, 1)]

        self.df = cleaned

        cleaned_count = len(self.df)
        removed_count = original_count - cleaned_count

        if removed_count > 0:
            logger.warning(f"Removed {removed_count} rows with invalid data")

        logger.info(f"Data cleaned: {cleaned_count} valid rows remaining")

    def display_summary(self):
        """Display summary statistics of the data (safe for an empty dataset)."""
        self._print_banner("DATA SUMMARY")

        scores = self.df['confidence_score']
        total_rows = len(self.df)
        vocab_terms = len(self.df[self.df['is_vocab_term'] == True])
        non_vocab_terms = len(self.df[self.df['is_vocab_term'] == False])

        print(f"Total processed terms: {total_rows}")
        print(f"Vocab terms (True): {vocab_terms} ({self._percentage(vocab_terms, total_rows):.1f}%)")
        print(f"Non-vocab terms (False): {non_vocab_terms} ({self._percentage(non_vocab_terms, total_rows):.1f}%)")

        print("\nConfidence Score Statistics:")
        print(f"Mean: {scores.mean():.3f}")
        print(f"Median: {scores.median():.3f}")
        print(f"Min: {scores.min():.3f}")
        print(f"Max: {scores.max():.3f}")
        print(f"Std: {scores.std():.3f}")

        # Half-open bins [low, high); the first has no lower and the last no
        # upper bound, so every score falls in exactly one bin.
        distribution = [
            ("0.0-0.2", scores < 0.2),
            ("0.2-0.4", (scores >= 0.2) & (scores < 0.4)),
            ("0.4-0.6", (scores >= 0.4) & (scores < 0.6)),
            ("0.6-0.8", (scores >= 0.6) & (scores < 0.8)),
            ("0.8-1.0", scores >= 0.8),
        ]
        print("\nConfidence Score Distribution:")
        for label, mask in distribution:
            print(f"{label}: {len(self.df[mask])}")

    def get_vocab_choice(self) -> Optional[bool]:
        """
        Get user's choice for vocab term filtering

        Returns:
            True (only vocab terms), False (only non-vocab terms) or None (both).
            Exits with status 0 when the prompt is interrupted.
        """
        self._print_banner("VOCAB TERM FILTERING")
        print("Choose which terms to include:")
        print("1. Only vocab terms (is_vocab_term = True)")
        print("2. Only non-vocab terms (is_vocab_term = False)")
        print("3. Both vocab and non-vocab terms")

        choices = {'1': True, '2': False, '3': None}

        while True:
            try:
                choice = input("\nEnter your choice (1/2/3): ").strip()
                if choice in choices:
                    return choices[choice]
                print("Invalid choice. Please enter 1, 2, or 3.")
            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                sys.exit(0)

    def get_confidence_range(self) -> Tuple[float, float]:
        """
        Get user's choice for confidence score range

        Returns:
            (min_score, max_score) with 0 <= min_score <= max_score <= 1.
            Exits with status 0 when the prompt is interrupted.
        """
        self._print_banner("CONFIDENCE SCORE FILTERING")
        print("Current confidence score range: {:.3f} - {:.3f}".format(
            self.df['confidence_score'].min(),
            self.df['confidence_score'].max()
        ))

        while True:
            try:
                print("\nEnter confidence score range (0.0 to 1.0):")
                min_score = float(input("Minimum confidence score: "))
                max_score = float(input("Maximum confidence score: "))

                if min_score < 0 or min_score > 1:
                    print("Minimum score must be between 0.0 and 1.0")
                    continue
                if max_score < 0 or max_score > 1:
                    print("Maximum score must be between 0.0 and 1.0")
                    continue
                if min_score > max_score:
                    print("Minimum score cannot be greater than maximum score")
                    continue

                return min_score, max_score

            except ValueError:
                print("Invalid input. Please enter numeric values.")
            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                sys.exit(0)

    def apply_filters(self, vocab_filter: Optional[bool],
                     confidence_range: Tuple[float, float]) -> pd.DataFrame:
        """
        Apply the selected filters to the data

        Args:
            vocab_filter: Keep only rows with this `is_vocab_term` value; None keeps both
            confidence_range: Inclusive (min, max) bounds for `confidence_score`

        Returns:
            A filtered copy; `self.df` is left unchanged
        """
        filtered_df = self.df.copy()

        # Apply vocab term filter
        if vocab_filter is not None:
            filtered_df = filtered_df[filtered_df['is_vocab_term'] == vocab_filter]

        # Apply confidence score range filter
        min_conf, max_conf = confidence_range
        filtered_df = filtered_df[
            (filtered_df['confidence_score'] >= min_conf) &
            (filtered_df['confidence_score'] <= max_conf)
        ]

        return filtered_df

    def save_filtered_results(self, filtered_df: pd.DataFrame, output_path: str):
        """
        Save the filtered results to a CSV file

        Only `original_url`, `term`, `is_vocab_term` and `confidence_score`
        are written, sorted by descending confidence.

        Returns:
            True on success, False when writing failed (the error is logged)
        """
        try:
            # Create output dataframe with selected columns
            output_df = filtered_df[['original_url', 'term', 'is_vocab_term', 'confidence_score']].copy()

            # Sort by confidence score (descending) for better organization
            output_df = output_df.sort_values('confidence_score', ascending=False)

            # Save to CSV
            output_df.to_csv(output_path, index=False)
            logger.info(f"Filtered results saved to {output_path}")

            return True

        except Exception as e:
            logger.error(f"Error saving filtered results: {str(e)}")
            return False

    def display_filter_summary(self, filtered_df: pd.DataFrame,
                             vocab_filter: Optional[bool],
                             confidence_range: Tuple[float, float]):
        """Display summary of filtered results (safe for an empty dataset)."""
        self._print_banner("FILTER RESULTS SUMMARY")

        print("Applied filters:")
        if vocab_filter is None:
            print("  - Vocab terms: Both True and False")
        else:
            print(f"  - Vocab terms: {vocab_filter}")

        min_conf, max_conf = confidence_range
        print(f"  - Confidence range: {min_conf:.3f} - {max_conf:.3f}")

        print("\nResults:")
        print(f"  - Original dataset: {len(self.df)} terms")
        print(f"  - Filtered dataset: {len(filtered_df)} terms")
        print(f"  - Percentage retained: {self._percentage(len(filtered_df), len(self.df)):.1f}%")

        if len(filtered_df) > 0:
            vocab_count = len(filtered_df[filtered_df['is_vocab_term'] == True])
            non_vocab_count = len(filtered_df[filtered_df['is_vocab_term'] == False])

            print("\nFiltered results breakdown:")
            print(f"  - Vocab terms: {vocab_count}")
            print(f"  - Non-vocab terms: {non_vocab_count}")
            print(f"  - Average confidence: {filtered_df['confidence_score'].mean():.3f}")

    def interactive_process(self):
        """Run the interactive post-processing workflow"""
        print("LLM Results Post-Processor")
        print("="*50)

        # Display summary
        self.display_summary()

        # Get user preferences
        vocab_filter = self.get_vocab_choice()
        confidence_range = self.get_confidence_range()

        # Apply filters
        filtered_df = self.apply_filters(vocab_filter, confidence_range)

        # Display filter summary
        self.display_filter_summary(filtered_df, vocab_filter, confidence_range)

        if len(filtered_df) == 0:
            print("\nNo terms match the specified criteria.")
            return

        # Get output filename
        self._print_banner("SAVE RESULTS")

        while True:
            try:
                output_filename = input("Enter output filename (e.g., 'filtered_terms.csv'): ").strip()
                if not output_filename:
                    print("Filename cannot be empty.")
                    continue

                if not output_filename.endswith('.csv'):
                    output_filename += '.csv'

                break

            except KeyboardInterrupt:
                # Unlike the two filter prompts, cancelling here only aborts
                # the workflow; it does not exit the interpreter.
                print("\nOperation cancelled.")
                return

        # Save results
        if self.save_filtered_results(filtered_df, output_filename):
            print(f"\nSuccess! {len(filtered_df)} terms saved to '{output_filename}'")
            print("The output file contains: original_url, term, is_vocab_term, confidence_score")
        else:
            print("Error: Failed to save results.")

def main(input_csv_path="llm_results_v2.csv"):
    """Run the post-processor on an LLM results CSV.

    Args:
        input_csv_path: Path to the LLM results CSV file. Defaults to
            "llm_results_v2.csv", resolved against the working directory.

    Returns:
        None. Prints an error and returns without creating the processor
        when the path is not an existing file.
    """
    print("LLM Results Post-Processor (IDE Version)")
    print("="*50)
    print(f"Input file: {input_csv_path}")

    # Check that the input is an existing file (a directory is rejected too)
    if not os.path.isfile(input_csv_path):
        print(f"Error: File '{input_csv_path}' not found.")
        print("Please pass the correct path to main() or update its default input_csv_path.")
        return

    # Create and run the post-processor
    processor = LLMResultsPostProcessor(input_csv_path)
    processor.interactive_process()

if __name__ == "__main__":
    main()

2025-07-14 22:11:50,374 - INFO - Loaded 1786 rows from llm_results_v2.csv
2025-07-14 22:11:50,383 - INFO - Data cleaned: 1786 valid rows remaining


LLM Results Post-Processor (IDE Version)
Input file: llm_results_v2.csv
LLM Results Post-Processor

DATA SUMMARY
Total processed terms: 1786
Vocab terms (True): 714 (40.0%)
Non-vocab terms (False): 1072 (60.0%)

Confidence Score Statistics:
Mean: 0.901
Median: 0.920
Min: 0.150
Max: 1.000
Std: 0.069

Confidence Score Distribution:
0.0-0.2: 1
0.2-0.4: 0
0.4-0.6: 0
0.6-0.8: 67
0.8-1.0: 1718

VOCAB TERM FILTERING
Choose which terms to include:
1. Only vocab terms (is_vocab_term = True)
2. Only non-vocab terms (is_vocab_term = False)
3. Both vocab and non-vocab terms

CONFIDENCE SCORE FILTERING
Current confidence score range: 0.150 - 1.000

Enter confidence score range (0.0 to 1.0):

FILTER RESULTS SUMMARY
Applied filters:
  - Vocab terms: True
  - Confidence range: 0.000 - 1.000

Results:
  - Original dataset: 1786 terms
  - Filtered dataset: 714 terms
  - Percentage retained: 40.0%

Filtered results breakdown:
  - Vocab terms: 714
  - Non-vocab terms: 0
  - Average confidence: 0.915

SAVE

2025-07-14 22:12:14,675 - INFO - Filtered results saved to filtered_terms_2.csv



Success! 714 terms saved to 'filtered_terms_2.csv'
The output file contains: original_url, term, is_vocab_term, confidence_score


### Soil Property and Process ontology

In [23]:
# Load the ontology Turtle file into a fresh graph; the following cell queries `g`.
# NOTE(review): the OWL-to-SKOS cell further down reads
# "ontovocabs/spp/soil_property_process.ttl" — confirm which path is current.
g = Graph()
# parse() is the cell's last expression, so the notebook echoes the graph repr.
g.parse("ontovocabs/soil_property_process.ttl", format='turtle')

<Graph identifier=N22edf6a00b28422bbec508194b81a6a8 (<class 'rdflib.graph.Graph'>)>

In [24]:
# Enhanced version to properly parse owl:propertyChainAxiom from blank nodes
def parse_property_chain(g, blank_node):
    """Render a property chain stored as an RDF list as 'p1 → p2 → ...'.

    Args:
        g: Graph that contains the RDF list.
        blank_node: Head node of the RDF list (the object of the
            propertyChainAxiom triple).

    Returns:
        str: Short names of the list members joined with " → ". If reading
        the list raises an error, the string form of ``blank_node`` is
        returned instead.
    """
    try:
        chain = []
        # Collection exposes the RDF list rooted at blank_node as an iterable
        for item in Collection(g, blank_node):
            item_text = str(item)
            # Short name: the URI fragment if present, else the last path segment
            if '#' in item_text:
                chain.append(item_text.split('#')[-1])
            else:
                chain.append(item_text.split('/')[-1])
        return " → ".join(chain)
    except Exception:
        # Best-effort fallback. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return str(blank_node)

# Query for all object properties with enhanced property chain parsing
def _short_name(term):
    """Return the part after '#' if the term has one, else the last '/' segment."""
    text = str(term)
    return text.split('#')[-1] if '#' in text else text.split('/')[-1]


object_properties = []

for subject in g.subjects(predicate=RDF.type, object=OWL.ObjectProperty):
    extra_pairs = []
    chain_texts = []

    # Walk every (predicate, object) pair attached to this object property
    for pred, obj in g.predicate_objects(subject):
        # Skip only the triple that declares the subject an owl:ObjectProperty
        if pred == RDF.type and obj == OWL.ObjectProperty:
            continue

        pred_label = _short_name(pred)

        if pred_label == "propertyChainAxiom":
            # The object is the head of an RDF list; render it as a chain
            chain_texts.append(parse_property_chain(g, obj))
        else:
            extra_pairs.append((pred_label, _short_name(obj)))

    object_properties.append({
        'uri': str(subject),
        'label': _short_name(subject),
        'properties': extra_pairs,
        'property_chains': chain_texts,
    })

# Print results with enhanced property chain display
print(f"Found {len(object_properties)} owl:ObjectProperty entries in the TTL file:\n")
print("=" * 80)

for idx, info in enumerate(object_properties, 1):
    print(f"{idx:2d}. {info['label']}")
    print(f"    URI: {info['uri']}")

    if info['properties']:
        print("    Additional properties:")
        for pred_label, obj_label in info['properties']:
            print(f"      - {pred_label}: {obj_label}")

    if info['property_chains']:
        print("    Property chain axioms:")
        for chain_text in info['property_chains']:
            print(f"      - {chain_text}")

    print()

print("=" * 80)
print(f"Total: {len(object_properties)} owl:ObjectProperty entries")

# Summary of properties with property chains
properties_with_chains = [info for info in object_properties if info['property_chains']]
if properties_with_chains:
    print(f"\nProperties with property chain axioms ({len(properties_with_chains)}):")
    for info in properties_with_chains:
        print(f"  - {info['label']}:")
        for chain_text in info['property_chains']:
            print(f"    • {chain_text}")

Found 16 owl:ObjectProperty entries in the TTL file:

 1. by
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#by

 2. dependsOn
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#dependsOn
    Additional properties:
      - subPropertyOf: influencedBy

 3. from
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#from

 4. hasComponent
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#hasComponent
    Additional properties:
      - subPropertyOf: hasPart

 5. hasImpactOn
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#hasImpactOn
    Additional properties:
      - type: TransitiveProperty
      - inverseOf: influencedBy
    Property chain axioms:
      - hasImpactOn → partOf
      - inverseProcessOf → hasImpactOn
      - measuredBy → hasImpactOn

 6. hasPart
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#hasPart
    Additional properties:
      - type: TransitiveProperty
      - inverseOf: partO

#### Convert OWL Class Hierarchy to SKOS Vocabulary

In [9]:
def convert_owl_to_skos(owl_file_path, skos_output_path):
    """
    Extract the named OWL class hierarchy and write it out as a SKOS vocabulary.

    Mapping:
    - owl:Class (named) -> skos:Concept in the SHE namespace, with a
      skos:exactMatch link back to the original class
    - rdfs:subClassOf (between named classes) -> skos:broader / skos:narrower
    - rdfs:comment -> skos:definition
    - classes without a named parent class -> top concepts of the scheme

    Args:
        owl_file_path: Path to the OWL ontology file (parsed as Turtle)
        skos_output_path: Path to save the SKOS vocabulary (written as Turtle)

    Returns:
        The rdflib Graph holding the generated SKOS vocabulary
    """

    def split_camel_case(text):
        """
        Split a CamelCase name into lowercase words, keeping acronyms together.
        Examples: 'SoilPhysicalProcess' -> 'soil physical process'
                  'SoilPHValue' -> 'soil ph value'
        """
        # Insert a space (a) between a lowercase letter/digit and a capital and
        # (b) before the last capital of an acronym run when a lowercase letter
        # follows. A plain "space before every capital" would shred acronyms
        # into single letters.
        spaced = re.sub(r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', text)
        # Clean up and convert to lowercase
        return spaced.strip().lower()

    def local_name(uri):
        """Return the part after '#' if present, else the last '/' segment."""
        uri_text = str(uri)
        return uri_text.split('#')[-1] if '#' in uri_text else uri_text.split('/')[-1]

    # Load the OWL ontology
    g = Graph()
    g.parse(owl_file_path, format='turtle')
    print(f"Loaded OWL ontology with {len(g)} triples")

    # Create a new graph for SKOS vocabulary
    skos_g = Graph()

    # Define namespaces
    SPP = Namespace("http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#")
    SHE = Namespace("https://soilwise-he.github.io/soil-health#")

    # Bind namespaces
    skos_g.bind("skos", SKOS)
    skos_g.bind("dcterms", DCTERMS)
    skos_g.bind("rdfs", RDFS)
    skos_g.bind("spp", SPP)
    skos_g.bind("she", SHE)
    skos_g.bind("owl", OWL)

    # Create a ConceptScheme for the vocabulary
    scheme_uri = URIRef("https://soilwise-he.github.io/soil-health/ConceptScheme")
    skos_g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    skos_g.add((scheme_uri, SKOS.prefLabel, Literal("Soil Property and Process Vocabulary", lang="en")))
    skos_g.add((scheme_uri, DCTERMS.title, Literal("Soil Property and Process Vocabulary", lang="en")))
    skos_g.add((scheme_uri, DCTERMS.description, Literal(
        "SKOS vocabulary derived from the Soil Property and Process OWL ontology. "
        "Describes soil physical properties and processes, as well as how they affect each other.",
        lang="en")))
    skos_g.add((scheme_uri, DCTERMS.creator, Literal("Heshan Du, University of Leeds")))
    skos_g.add((scheme_uri, DCTERMS.created, Literal("April, 2016")))
    skos_g.add((scheme_uri, DCTERMS.license, Literal("Creative Commons Attribution 4.0 International (CC BY 4.0)")))

    # Extract all owl:Class instances (only URIRefs, not blank nodes)
    all_classes = list(g.subjects(RDF.type, OWL.Class))
    classes = [c for c in all_classes if isinstance(c, URIRef)]
    print(f"Found {len(all_classes)} OWL classes total, {len(classes)} named classes (excluding blank nodes)")

    # Create mapping from SPP URIs to SHE URIs
    uri_mapping = {}

    # Pass 1: convert each class to a SKOS concept. The mapping must be
    # complete before any hierarchy link is written, hence two passes.
    for spp_class_uri in classes:
        # Extract class label from URI
        class_label = local_name(spp_class_uri)

        # Create new SHE URI for this concept
        she_concept_uri = SHE[class_label]
        uri_mapping[spp_class_uri] = she_concept_uri

        # Add as skos:Concept
        skos_g.add((she_concept_uri, RDF.type, SKOS.Concept))
        skos_g.add((she_concept_uri, SKOS.inScheme, scheme_uri))

        # Add exactMatch to original OWL class
        skos_g.add((she_concept_uri, SKOS.exactMatch, spp_class_uri))

        # Split camelCase and add as prefLabel
        readable_label = split_camel_case(class_label)
        skos_g.add((she_concept_uri, SKOS.prefLabel, Literal(readable_label, lang="en")))

        # Check for rdfs:comment and convert to skos:definition
        for comment in g.objects(spp_class_uri, RDFS.comment):
            skos_g.add((she_concept_uri, SKOS.definition, comment))

    # Pass 2: add broader/narrower links and, in the same traversal, mark
    # classes that have no named parent as top concepts.
    for spp_class_uri in classes:
        she_concept_uri = uri_mapping[spp_class_uri]
        has_parent = False

        for parent_class in g.objects(spp_class_uri, RDFS.subClassOf):
            # Only include if parent is a named class (not a blank node) and is in our mapping
            if isinstance(parent_class, URIRef) and parent_class in uri_mapping:
                has_parent = True
                she_parent_uri = uri_mapping[parent_class]
                skos_g.add((she_concept_uri, SKOS.broader, she_parent_uri))
                # Add inverse relationship
                skos_g.add((she_parent_uri, SKOS.narrower, she_concept_uri))

        if not has_parent:
            skos_g.add((scheme_uri, SKOS.hasTopConcept, she_concept_uri))
            skos_g.add((she_concept_uri, SKOS.topConceptOf, scheme_uri))

    # Save the SKOS vocabulary
    skos_g.serialize(destination=skos_output_path, format='turtle')
    print(f"SKOS vocabulary saved to {skos_output_path}")
    print(f"Total triples in SKOS vocabulary: {len(skos_g)}")

    # Print statistics
    concepts = list(skos_g.subjects(RDF.type, SKOS.Concept))
    top_concepts = list(skos_g.objects(scheme_uri, SKOS.hasTopConcept))
    broader_relations = list(skos_g.subject_objects(SKOS.broader))

    print("\nStatistics:")
    print(f"  - Total concepts: {len(concepts)}")
    print(f"  - Top concepts: {len(top_concepts)}")
    print(f"  - Broader relationships: {len(broader_relations)}")

    return skos_g

# Run the conversion
# Input ontology, read as Turtle by convert_owl_to_skos.
owl_file = "ontovocabs/spp/soil_property_process.ttl"
# Output file for the generated SKOS vocabulary.
skos_output = "SoilVoc.ttl"

# Keep the returned graph: the hierarchy display cell below passes it to
# display_skos_hierarchy.
skos_graph = convert_owl_to_skos(owl_file, skos_output)

Loaded OWL ontology with 8606 triples
Found 606 OWL classes total, 592 named classes (excluding blank nodes)
SKOS vocabulary saved to SoilVoc.ttl
Total triples in SKOS vocabulary: 3677

Statistics:
  - Total concepts: 592
  - Top concepts: 16
  - Broader relationships: 635
SKOS vocabulary saved to SoilVoc.ttl
Total triples in SKOS vocabulary: 3677

Statistics:
  - Total concepts: 592
  - Top concepts: 16
  - Broader relationships: 635


In [13]:
# Display all top concepts with their one-layer narrower concepts
def display_skos_hierarchy(skos_graph, scheme_uri="https://soilwise-he.github.io/soil-health/ConceptScheme"):
    """
    Print every top concept of a concept scheme together with its direct
    narrower concepts (one layer only).

    Args:
        skos_graph: Graph holding the SKOS vocabulary; it is queried through
            ``skos_graph.objects(subject, predicate)``
        scheme_uri: URI of the concept scheme whose ``skos:hasTopConcept``
            entries are listed. Defaults to the scheme URI that
            convert_owl_to_skos writes.

    Returns:
        None. Everything is printed to stdout.
    """
    scheme_uri = URIRef(scheme_uri)

    def label_of(node):
        """First skos:prefLabel of the node, else the text after '#' in its URI."""
        labels = list(skos_graph.objects(node, SKOS.prefLabel))
        return str(labels[0]) if labels else str(node).split('#')[-1]

    print("="*80)
    print("TOP CONCEPTS AND THEIR DIRECT NARROWER CONCEPTS")
    print("="*80)

    # Get all top concepts, sorted by label for consistent display
    top_concepts_with_labels = [
        (concept, label_of(concept))
        for concept in skos_graph.objects(scheme_uri, SKOS.hasTopConcept)
    ]
    top_concepts_with_labels.sort(key=lambda pair: pair[1])

    print(f"\nTotal Top Concepts: {len(top_concepts_with_labels)}")
    print("="*80)

    for i, (concept, label_str) in enumerate(top_concepts_with_labels, 1):
        print(f"\n{i}. {label_str.upper()}")
        print(f"   URI: {concept}")

        # Labels of all direct narrower concepts, sorted alphabetically
        narrower_labels = sorted(
            label_of(narrow) for narrow in skos_graph.objects(concept, SKOS.narrower)
        )

        if narrower_labels:
            print(f"   Narrower concepts ({len(narrower_labels)}):")
            for narrow_label in narrower_labels:
                print(f"      • {narrow_label}")
        else:
            print("   Narrower concepts: None")

    print("\n" + "="*80)
    print(f"Summary: {len(top_concepts_with_labels)} top-level concepts displayed")
    print("="*80)

# Display the hierarchy of `skos_graph`, the graph returned by the
# convert_owl_to_skos cell above (that cell must have been run first)
display_skos_hierarchy(skos_graph)

TOP CONCEPTS AND THEIR DIRECT NARROWER CONCEPTS

Total Top Concepts: 16

1. HUMAN ACTIVITY
   URI: https://soilwise-he.github.io/soil-health#HumanActivity
   Narrower concepts (3):
      • land use
      • soil management
      • traffic

2. PHENOMENA
   URI: https://soilwise-he.github.io/soil-health#Phenomena
   Narrower concepts (2):
      • planetary phenomena
      • soil phenomena

3. PROCESS
   URI: https://soilwise-he.github.io/soil-health#Process
   Narrower concepts (3):
      • force
      • plant process
      • soil process

4. PROPERTY
   URI: https://soilwise-he.github.io/soil-health#Property
   Narrower concepts (5):
      • air property
      • general property
      • meteorological property
      • soil property
      • water property

5. SOIL FLUID RETENTION
   URI: https://soilwise-he.github.io/soil-health#SoilFluidRetention
   Narrower concepts: None

6. SOIL FLUID TRANSPORT
   URI: https://soilwise-he.github.io/soil-health#SoilFluidTransport
   Narrower concepts: 

#### Visualize SoilVoc as a mind map

In [13]:
def parse_skos_vocabulary(ttl_file_path):
    """
    Parse a SKOS vocabulary from a Turtle file and extract the hierarchy.

    Top concepts are taken from skos:hasTopConcept / skos:topConceptOf; if the
    scheme declares none, every skos:Concept without a skos:broader link is
    used instead. Children of a concept are collected from skos:narrower and
    from the inverse direction of skos:broader.

    Args:
        ttl_file_path: Path to the .ttl file

    Returns:
        dict: ``{'scheme_uri', 'scheme_label', 'top_concepts'}`` where each
        concept is a dict with keys 'uri', 'label', 'notation', 'definition'
        and 'narrower' (a list of nested concept dicts).

    Raises:
        ValueError: If the file contains no skos:ConceptScheme.
    """
    # Load the graph
    g = Graph()
    g.parse(ttl_file_path, format='turtle')

    # Find the ConceptScheme
    concept_schemes = list(g.subjects(RDF.type, SKOS.ConceptScheme))

    if not concept_schemes:
        raise ValueError("No SKOS ConceptScheme found in the file")

    # Use the first ConceptScheme
    scheme = concept_schemes[0]

    # Scheme label: prefLabel, then rdfs:label, then the last URI segment
    scheme_label = str(g.value(scheme, SKOS.prefLabel) or
                       g.value(scheme, RDFS.label) or
                       scheme.split('/')[-1].split('#')[-1])

    # Top concepts declared on the scheme (hasTopConcept) ...
    top_concepts = list(g.objects(scheme, SKOS.hasTopConcept))

    # ... plus those declared from the concept side (topConceptOf)
    for top_concept in g.subjects(SKOS.topConceptOf, scheme):
        if top_concept not in top_concepts:
            top_concepts.append(top_concept)

    # If no top concepts found, find concepts with no broader concepts
    if not top_concepts:
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        concepts_with_broader = set(g.subjects(SKOS.broader, None))
        top_concepts = list(all_concepts - concepts_with_broader)

    # Build the hierarchy
    def get_concept_info(concept_uri, ancestors=frozenset()):
        """Extract a concept and, recursively, its narrower concepts.

        ``ancestors`` holds the concepts on the path from the top concept down
        to (but excluding) this one. It is used to break broader/narrower
        cycles, which would otherwise recurse until RecursionError.
        """
        pref_label = g.value(concept_uri, SKOS.prefLabel)
        alt_label = g.value(concept_uri, SKOS.altLabel)
        definition = g.value(concept_uri, SKOS.definition)
        notation = g.value(concept_uri, SKOS.notation)

        label = str(pref_label or alt_label or concept_uri.split('/')[-1].split('#')[-1])

        # Get narrower concepts
        narrower = list(g.objects(concept_uri, SKOS.narrower))

        # Also check for concepts that have this as broader (inverse)
        for concept in g.subjects(SKOS.broader, concept_uri):
            if concept not in narrower:
                narrower.append(concept)

        # Skip children already on the current path (cycle). The same concept
        # may still appear under several different parents.
        path = ancestors | {concept_uri}
        children = [n for n in narrower if n not in path]

        return {
            'uri': str(concept_uri),
            'label': label,
            'notation': str(notation) if notation else None,
            'definition': str(definition) if definition else None,
            'narrower': [get_concept_info(n, path) for n in children]
        }

    # Build the structure
    vocabulary = {
        'scheme_uri': str(scheme),
        'scheme_label': scheme_label,
        'top_concepts': [get_concept_info(tc) for tc in top_concepts]
    }

    return vocabulary


def generate_html_mindmap(vocabulary_data, output_file='mindmap.html'):
    """
    Generate an interactive HTML mind map from the vocabulary data.
    
    Args:
        vocabulary_data: Dictionary containing the vocabulary structure
        output_file: Output HTML file path
    """
    html_content = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{vocabulary_data['scheme_label']} - Interactive Mind Map</title>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }}
        
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }}
        
        .container {{
            max-width: 1400px;
            margin: 0 auto;
            background: white;
            border-radius: 12px;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
            overflow: hidden;
        }}
        
        .header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 30px;
            text-align: center;
        }}
        
        .header h1 {{
            font-size: 2em;
            margin-bottom: 10px;
        }}
        
        .header p {{
            opacity: 0.9;
            font-size: 1.1em;
        }}
        
        .mindmap {{
            padding: 40px;
            overflow-x: auto;
        }}
        
        .concept {{
            margin: 10px 0;
            animation: fadeIn 0.3s ease-in;
        }}
        
        @keyframes fadeIn {{
            from {{ opacity: 0; transform: translateX(-10px); }}
            to {{ opacity: 1; transform: translateX(0); }}
        }}
        
        .concept-header {{
            display: flex;
            align-items: center;
            padding: 12px 16px;
            background: #f8f9fa;
            border-left: 4px solid #667eea;
            border-radius: 6px;
            cursor: pointer;
            transition: all 0.2s ease;
            margin-bottom: 5px;
        }}
        
        .concept-header:hover {{
            background: #e9ecef;
            border-left-color: #764ba2;
            transform: translateX(5px);
        }}
        
        .concept-header.active {{
            background: #667eea;
            color: white;
            border-left-color: #764ba2;
        }}
        
        .concept-header.highlighted {{
            background: #ffc107;
            border-left-color: #ff9800;
            animation: pulse 1s ease-in-out;
        }}
        
        @keyframes pulse {{
            0%, 100% {{ transform: scale(1); }}
            50% {{ transform: scale(1.02); }}
        }}
        
        .concept-header.no-children {{
            cursor: default;
            border-left-color: #adb5bd;
        }}
        
        .concept-header.no-children:hover {{
            background: #f8f9fa;
            transform: none;
        }}
        
        .toggle-icon {{
            width: 24px;
            height: 24px;
            margin-right: 12px;
            display: flex;
            align-items: center;
            justify-content: center;
            font-size: 18px;
            font-weight: bold;
            transition: transform 0.2s ease;
        }}
        
        .toggle-icon.expanded {{
            transform: rotate(90deg);
        }}
        
        .concept-label {{
            flex: 1;
            font-weight: 500;
            font-size: 1.05em;
        }}
        
        .concept-notation {{
            background: rgba(102, 126, 234, 0.1);
            color: #667eea;
            padding: 4px 10px;
            border-radius: 4px;
            font-size: 0.9em;
            font-weight: 600;
            margin-right: 10px;
        }}
        
        .concept-header.active .concept-notation {{
            background: rgba(255, 255, 255, 0.2);
            color: white;
        }}
        
        .concept-count {{
            background: rgba(102, 126, 234, 0.1);
            color: #667eea;
            padding: 4px 10px;
            border-radius: 12px;
            font-size: 0.85em;
            font-weight: 600;
        }}
        
        .concept-header.active .concept-count {{
            background: rgba(255, 255, 255, 0.2);
            color: white;
        }}
        
        .concept-children {{
            margin-left: 30px;
            border-left: 2px solid #e9ecef;
            padding-left: 20px;
            display: none;
        }}
        
        .concept-children.expanded {{
            display: block;
        }}
        
        .concept-definition {{
            margin: 5px 0 10px 54px;
            padding: 10px 15px;
            background: #f8f9fa;
            border-radius: 4px;
            font-size: 0.9em;
            color: #495057;
            font-style: italic;
            display: none;
        }}
        
        .concept-definition.show {{
            display: block;
        }}
        
        .top-level {{
            margin-left: 0;
            padding-left: 0;
            border-left: none;
        }}
        
        .stats {{
            padding: 20px 40px;
            background: #f8f9fa;
            border-top: 1px solid #dee2e6;
            display: flex;
            justify-content: space-around;
            flex-wrap: wrap;
        }}
        
        .stat-item {{
            text-align: center;
            padding: 10px;
        }}
        
        .stat-value {{
            font-size: 2em;
            font-weight: bold;
            color: #667eea;
        }}
        
        .stat-label {{
            color: #6c757d;
            font-size: 0.9em;
            margin-top: 5px;
        }}
        
        .search-box {{
            padding: 20px 40px;
            background: #f8f9fa;
            border-bottom: 1px solid #dee2e6;
        }}
        
        .search-input {{
            width: 100%;
            padding: 12px 20px;
            font-size: 1em;
            border: 2px solid #dee2e6;
            border-radius: 6px;
            transition: all 0.2s ease;
        }}
        
        .search-input:focus {{
            outline: none;
            border-color: #667eea;
            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
        }}
        
        .search-results {{
            margin-top: 15px;
            display: none;
        }}
        
        .search-results.show {{
            display: block;
        }}
        
        .search-result-item {{
            padding: 10px 15px;
            background: white;
            border: 1px solid #dee2e6;
            border-radius: 6px;
            margin-bottom: 8px;
            cursor: pointer;
            transition: all 0.2s ease;
        }}
        
        .search-result-item:hover {{
            background: #f8f9fa;
            border-color: #667eea;
            transform: translateX(5px);
        }}
        
        .search-result-label {{
            font-weight: 500;
            color: #212529;
        }}
        
        .search-result-notation {{
            display: inline-block;
            background: rgba(102, 126, 234, 0.1);
            color: #667eea;
            padding: 2px 8px;
            border-radius: 3px;
            font-size: 0.85em;
            font-weight: 600;
            margin-right: 8px;
        }}
        
        .search-result-path {{
            font-size: 0.85em;
            color: #6c757d;
            margin-top: 5px;
        }}
        
        .search-info {{
            padding: 10px 15px;
            background: #e7f3ff;
            border: 1px solid #b8daff;
            border-radius: 6px;
            color: #004085;
            font-size: 0.9em;
            margin-bottom: 10px;
        }}
        
        .no-results {{
            text-align: center;
            padding: 40px;
            color: #6c757d;
            font-style: italic;
        }}
        
        .clear-search {{
            display: inline-block;
            margin-top: 10px;
            padding: 8px 16px;
            background: #667eea;
            color: white;
            border-radius: 6px;
            cursor: pointer;
            font-size: 0.9em;
            transition: all 0.2s ease;
        }}
        
        .clear-search:hover {{
            background: #764ba2;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>SoilVoc</h1>
            <p>Interactive SKOS Vocabulary Mind Map</p>
        </div>
        
        <div class="search-box">
            <input type="text" class="search-input" id="searchInput" placeholder="Search all concepts by label or notation...">
            <div class="search-results" id="searchResults"></div>
        </div>
        
        <div class="mindmap" id="mindmap">
            <!-- Mind map will be generated here -->
        </div>
        
        <div class="stats" id="stats">
            <!-- Statistics will be generated here -->
        </div>
    </div>
    
    <script>
        const vocabularyData = {json.dumps(vocabulary_data, indent=2)};
        
        let allConcepts = [];
        let uniqueConceptUris = new Set();
        let conceptMap = new Map(); // Maps URI to concept object with path info
        
        function buildConceptMap(concepts, path = []) {{
            concepts.forEach(concept => {{
                const currentPath = [...path, concept];
                
                // Store concept with its path
                if (!conceptMap.has(concept.uri)) {{
                    conceptMap.set(concept.uri, {{
                        concept: concept,
                        path: currentPath
                    }});
                }}
                
                uniqueConceptUris.add(concept.uri);
                allConcepts.push(concept);
                
                if (concept.narrower && concept.narrower.length > 0) {{
                    buildConceptMap(concept.narrower, currentPath);
                }}
            }});
        }}
        
        function countConcepts(concepts) {{
            concepts.forEach(concept => {{
                uniqueConceptUris.add(concept.uri);
                allConcepts.push(concept);
                if (concept.narrower && concept.narrower.length > 0) {{
                    countConcepts(concept.narrower);
                }}
            }});
            return uniqueConceptUris.size;
        }}
        
        function getMaxDepth(concepts, depth = 1) {{
            let maxDepth = depth;
            concepts.forEach(concept => {{
                if (concept.narrower && concept.narrower.length > 0) {{
                    maxDepth = Math.max(maxDepth, getMaxDepth(concept.narrower, depth + 1));
                }}
            }});
            return maxDepth;
        }}
        
        function renderConcept(concept, level = 0) {{
            const hasNarrower = concept.narrower && concept.narrower.length > 0;
            const notation = concept.notation ? `<span class="concept-notation">${{concept.notation}}</span>` : '';
            const count = hasNarrower ? `<span class="concept-count">${{concept.narrower.length}}</span>` : '';
            const noChildClass = hasNarrower ? '' : 'no-children';
            const toggleIcon = hasNarrower ? '▶' : '●';
            
            let html = `
                <div class="concept" data-uri="${{concept.uri}}">
                    <div class="concept-header ${{noChildClass}}" onclick="toggleConcept(this)">
                        <span class="toggle-icon">${{toggleIcon}}</span>
                        ${{notation}}
                        <span class="concept-label">${{concept.label}}</span>
                        ${{count}}
                    </div>
            `;
            
            if (concept.definition) {{
                html += `<div class="concept-definition">${{concept.definition}}</div>`;
            }}
            
            if (hasNarrower) {{
                html += `<div class="concept-children">`;
                concept.narrower.forEach(narrower => {{
                    html += renderConcept(narrower, level + 1);
                }});
                html += `</div>`;
            }}
            
            html += `</div>`;
            return html;
        }}
        
        function toggleConcept(header) {{
            if (header.classList.contains('no-children')) return;
            
            const concept = header.parentElement;
            const children = concept.querySelector('.concept-children');
            const definition = concept.querySelector('.concept-definition');
            const icon = header.querySelector('.toggle-icon');
            
            if (children) {{
                children.classList.toggle('expanded');
                header.classList.toggle('active');
                icon.classList.toggle('expanded');
                
                if (definition) {{
                    definition.classList.toggle('show');
                }}
            }}
        }}
        
        function renderMindmap() {{
            const mindmapDiv = document.getElementById('mindmap');
            let html = '<div class="top-level">';
            
            vocabularyData.top_concepts.forEach(concept => {{
                html += renderConcept(concept, 0);
            }});
            
            html += '</div>';
            mindmapDiv.innerHTML = html;
        }}
        
        function renderStats() {{
            const totalConcepts = countConcepts(vocabularyData.top_concepts);
            const maxDepth = getMaxDepth(vocabularyData.top_concepts);
            const topConceptsCount = vocabularyData.top_concepts.length;
            
            const statsDiv = document.getElementById('stats');
            statsDiv.innerHTML = `
                <div class="stat-item">
                    <div class="stat-value">${{topConceptsCount}}</div>
                    <div class="stat-label">Top Concepts</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">${{totalConcepts}}</div>
                    <div class="stat-label">Total Concepts</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">${{maxDepth}}</div>
                    <div class="stat-label">Max Depth</div>
                </div>
            `;
        }}
        
        function searchConcepts() {{
            const searchTerm = document.getElementById('searchInput').value.toLowerCase().trim();
            const searchResultsDiv = document.getElementById('searchResults');
            
            if (searchTerm === '') {{
                searchResultsDiv.classList.remove('show');
                searchResultsDiv.innerHTML = '';
                clearHighlights();
                return;
            }}
            
            // Search in all concepts
            const matches = [];
            conceptMap.forEach((data, uri) => {{
                const concept = data.concept;
                const label = concept.label.toLowerCase();
                const notation = concept.notation ? concept.notation.toLowerCase() : '';
                
                if (label.includes(searchTerm) || notation.includes(searchTerm)) {{
                    matches.push({{
                        uri: uri,
                        concept: concept,
                        path: data.path
                    }});
                }}
            }});
            
            // Display results
            if (matches.length > 0) {{
                let html = `<div class="search-info">Found ${{matches.length}} matching concept(s). Click to navigate.</div>`;
                
                matches.forEach(match => {{
                    const pathLabels = match.path.map(c => c.notation ? `${{c.notation}} ${{c.label}}` : c.label).join(' → ');
                    const notation = match.concept.notation ? `<span class="search-result-notation">${{match.concept.notation}}</span>` : '';
                    
                    html += `
                        <div class="search-result-item" onclick="navigateToConcept('${{match.uri}}')">
                            <div class="search-result-label">
                                ${{notation}}${{match.concept.label}}
                            </div>
                            <div class="search-result-path">${{pathLabels}}</div>
                        </div>
                    `;
                }});
                
                html += `<div class="clear-search" onclick="clearSearch()">Clear Search</div>`;
                searchResultsDiv.innerHTML = html;
                searchResultsDiv.classList.add('show');
            }} else {{
                searchResultsDiv.innerHTML = `
                    <div class="search-info">No concepts found matching "${{searchTerm}}".</div>
                    <div class="clear-search" onclick="clearSearch()">Clear Search</div>
                `;
                searchResultsDiv.classList.add('show');
            }}
        }}
        
        function navigateToConcept(targetUri) {{
            // Get the path to this concept
            const conceptData = conceptMap.get(targetUri);
            if (!conceptData) return;
            
            // First, collapse everything
            document.querySelectorAll('.concept-children.expanded').forEach(el => {{
                el.classList.remove('expanded');
            }});
            document.querySelectorAll('.concept-header.active').forEach(el => {{
                el.classList.remove('active');
            }});
            document.querySelectorAll('.toggle-icon.expanded').forEach(el => {{
                el.classList.remove('expanded');
            }});
            document.querySelectorAll('.concept-definition.show').forEach(el => {{
                el.classList.remove('show');
            }});
            
            // Clear previous highlights
            clearHighlights();
            
            // Expand the path to the target concept
            const path = conceptData.path;
            for (let i = 0; i < path.length - 1; i++) {{
                const conceptUri = path[i].uri;
                const conceptElement = document.querySelector(`.concept[data-uri="${{conceptUri}}"]`);
                
                if (conceptElement) {{
                    const header = conceptElement.querySelector('.concept-header');
                    const children = conceptElement.querySelector('.concept-children');
                    const icon = header.querySelector('.toggle-icon');
                    
                    if (children && !children.classList.contains('expanded')) {{
                        children.classList.add('expanded');
                        header.classList.add('active');
                        icon.classList.add('expanded');
                    }}
                }}
            }}
            
            // Highlight and scroll to the target concept
            const targetElement = document.querySelector(`.concept[data-uri="${{targetUri}}"]`);
            if (targetElement) {{
                const targetHeader = targetElement.querySelector('.concept-header');
                targetHeader.classList.add('highlighted');
                
                // Show definition if exists
                const definition = targetElement.querySelector('.concept-definition');
                if (definition) {{
                    definition.classList.add('show');
                }}
                
                // Scroll to the target
                targetElement.scrollIntoView({{ behavior: 'smooth', block: 'center' }});
                
                // Remove highlight after animation
                setTimeout(() => {{
                    targetHeader.classList.remove('highlighted');
                }}, 2000);
            }}
        }}
        
        function clearHighlights() {{
            document.querySelectorAll('.concept-header.highlighted').forEach(el => {{
                el.classList.remove('highlighted');
            }});
        }}
        
        function clearSearch() {{
            document.getElementById('searchInput').value = '';
            searchConcepts();
        }}
        
        // Initialize
        buildConceptMap(vocabularyData.top_concepts);
        renderMindmap();
        renderStats();
        
        // Search functionality with debounce
        let searchTimeout;
        document.getElementById('searchInput').addEventListener('input', () => {{
            clearTimeout(searchTimeout);
            searchTimeout = setTimeout(searchConcepts, 300);
        }});
        
        // Keyboard navigation
        document.addEventListener('keydown', (e) => {{
            if (e.key === 'Escape') {{
                clearSearch();
            }}
        }});
    </script>
</body>
</html>'''
    
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)
    
    print(f"Interactive mind map generated: {output_file}")


# Example usage
# Demo driver: parse a SKOS vocabulary from a Turtle file and write it out
# as an interactive HTML mind map.
if __name__ == '__main__':
    # Input vocabulary file to parse
    ttl_file = 'SoilVoc.ttl'  # Replace with your .ttl file path
    
    try:
        print(f"Parsing SKOS vocabulary from: {ttl_file}")
        # The parsed vocabulary is indexed below by 'scheme_label' and 'top_concepts'
        vocabulary = parse_skos_vocabulary(ttl_file)
        
        print(f"Found ConceptScheme: {vocabulary['scheme_label']}")
        print(f"Number of top concepts: {len(vocabulary['top_concepts'])}")
        
        # generate_html_mindmap writes the HTML page to this path
        output_file = 'soilvoc_mindmap.html'
        generate_html_mindmap(vocabulary, output_file)
        
        print(f"\nSuccess! Open {output_file} in your web browser to view the interactive mind map.")
        
    except FileNotFoundError:
        # Missing input file: print a targeted hint instead of a traceback
        print(f"Error: File '{ttl_file}' not found.")
        print("\nPlease ensure your .ttl file is in the same directory as this script,")
        print("or provide the full path to the file.")
    except Exception as e:
        # Any other failure: report it and dump the full traceback for debugging
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

Parsing SKOS vocabulary from: SoilVoc.ttl
Found ConceptScheme: soil-health
Number of top concepts: 16
Interactive mind map generated: soilvoc_mindmap.html

Success! Open soilvoc_mindmap.html in your web browser to view the interactive mind map.


### GloSIS ontology

### SubKG extractor

In [10]:
def query_agrovoc_one_hop_neighbors(seed_label, endpoint="https://agrovoc.fao.org/sparql"):
    """
    Query all one-hop neighbors of a seed concept in AGROVOC using its prefLabel.

    The seed is matched case-insensitively against English prefLabels. Both
    outgoing (seed -> neighbor) and incoming (neighbor -> seed) triples are
    followed. The label is escaped before being embedded in the query, so
    labels containing quotes, backslashes or line breaks are handled safely.

    Args:
        seed_label: Preferred label of the seed concept (e.g., "soil")
        endpoint: SPARQL endpoint URL

    Returns:
        list: One dictionary per unique neighbor URI (deduplicated by URI) with keys
            'uri', 'prefLabel', 'altLabels' (list of str), 'displayName'
            ("prefLabel (alt1, alt2, ...)" or just the prefLabel) and 'relations'
            (list of {'relation', 'relationLabel'} dicts, without duplicates).
            An empty list is returned if the query fails; the error is printed.
    """

    sparql = SPARQLWrapper(endpoint)

    # Escape the label so it always forms a valid SPARQL double-quoted string
    # literal. Backslashes must be escaped first so the escapes added for the
    # other characters are not doubled.
    escaped_label = (
        seed_label.lower()
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    # SPARQL query to get all one-hop neighbors
    # This includes both outgoing and incoming relationships
    # Only returns concepts with exactly one English prefLabel
    # Excludes specific predicates and predicates without English labels
    # Also retrieves altLabels if available
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT ?neighbor ?prefLabel (GROUP_CONCAT(DISTINCT ?altLabel; separator="; ") AS ?altLabels) ?relation ?relationLabel
    WHERE {{
        # First, find the seed concept by its prefLabel
        ?seed skos:prefLabel ?seedLabel .
        FILTER(lcase(str(?seedLabel)) = "{escaped_label}")
        FILTER(lang(?seedLabel) = "en")

        {{
            # Outgoing relationships (seed -> neighbor)
            ?seed ?relation ?neighbor .
            ?neighbor skos:prefLabel ?prefLabel .
            FILTER(lang(?prefLabel) = "en")
            OPTIONAL {{ ?relation rdfs:label ?relationLabel . FILTER(lang(?relationLabel) = "en") }}
        }}
        UNION
        {{
            # Incoming relationships (neighbor -> seed)
            ?neighbor ?relation ?seed .
            ?neighbor skos:prefLabel ?prefLabel .
            FILTER(lang(?prefLabel) = "en")
            OPTIONAL {{ ?relation rdfs:label ?relationLabel . FILTER(lang(?relationLabel) = "en") }}
        }}

        # Get alternative labels (optional - only English ones)
        OPTIONAL {{ 
            ?neighbor skos:altLabel ?altLabel . 
            FILTER(lang(?altLabel) = "en")
        }}

        # Filter to only include SKOS concepts (not literals)
        FILTER(isURI(?neighbor))
        FILTER(?neighbor != ?seed)

        # Exclude specific predicates
        FILTER(?relation != skos:inScheme)
        FILTER(?relation != skos:exactMatch)
        FILTER(?relation != skos:closeMatch)

        # Only include predicates that either:
        # 1. Have an English label, OR
        # 2. Are standard SKOS predicates (which may not have explicit labels in the endpoint)
        FILTER(
            EXISTS {{
                ?relation rdfs:label ?relLabelCheck .
                FILTER(lang(?relLabelCheck) = "en")
            }}
            ||
            ?relation IN (skos:broader, skos:narrower, skos:related, skos:broaderTransitive, 
                         skos:narrowerTransitive, skos:relatedMatch, skos:broadMatch, skos:narrowMatch)
        )

        # Ensure the neighbor has exactly one English prefLabel
        FILTER EXISTS {{
            ?neighbor skos:prefLabel ?enLabel .
            FILTER(lang(?enLabel) = "en")
        }}
        FILTER NOT EXISTS {{
            ?neighbor skos:prefLabel ?enLabel1 .
            ?neighbor skos:prefLabel ?enLabel2 .
            FILTER(lang(?enLabel1) = "en" && lang(?enLabel2) = "en" && ?enLabel1 != ?enLabel2)
        }}
    }}
    GROUP BY ?neighbor ?prefLabel ?relation ?relationLabel
    ORDER BY ?prefLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()

        # Use a dictionary to deduplicate by URI (insertion order keeps the query's ordering)
        neighbors_dict = {}

        for result in results["results"]["bindings"]:
            uri = result['neighbor']['value']

            # Only build the node record the first time this URI is seen;
            # later rows for the same URI only contribute extra relations.
            if uri not in neighbors_dict:
                pref_label = result['prefLabel']['value']
                alt_labels_str = result.get('altLabels', {}).get('value', '')

                # Parse altLabels (joined by the query with "; ") into a clean list
                alt_labels = [label.strip() for label in alt_labels_str.split('; ') if label.strip()] if alt_labels_str else []

                # Create display name: "prefLabel (altLabel_1, altLabel_2, ...)" or just "prefLabel" if no altLabel
                if alt_labels:
                    alt_labels_formatted = ', '.join(alt_labels)
                    display_name = f"{pref_label} ({alt_labels_formatted})"
                else:
                    display_name = pref_label

                neighbors_dict[uri] = {
                    'uri': uri,
                    'prefLabel': pref_label,
                    'altLabels': alt_labels,
                    'displayName': display_name,
                    'relations': []  # Store all relations for this neighbor
                }

            # Add the relation to this neighbor's relations list; when the predicate
            # has no English label the predicate URI itself is used as the label.
            relation_uri = result['relation']['value']
            relation_info = {
                'relation': relation_uri,
                'relationLabel': result.get('relationLabel', {}).get('value', relation_uri)
            }

            # Avoid duplicate relations
            if relation_info not in neighbors_dict[uri]['relations']:
                neighbors_dict[uri]['relations'].append(relation_info)

        # Convert dictionary to list
        return list(neighbors_dict.values())

    except Exception as e:
        # Best-effort: a failing endpoint or malformed response yields no neighbors
        print(f"Error querying AGROVOC: {e}")
        return []


def find_concept_uri_by_label(label, endpoint="https://agrovoc.fao.org/sparql"):
    """
    Find the URI of a concept by its preferred label.

    The label is matched case-insensitively against prefLabels that are either
    English or carry no language tag. It is escaped before being embedded in the
    query, so labels containing quotes, backslashes or line breaks are safe.
    At most 10 matches are fetched; when several match they are all printed and
    the first one is returned.

    Args:
        label: The preferred label to search for (e.g., "soil")
        endpoint: SPARQL endpoint URL

    Returns:
        str: URI of the concept, or None if not found or if the query fails
    """

    sparql = SPARQLWrapper(endpoint)

    # Escape the label so it always forms a valid SPARQL double-quoted string
    # literal. Backslashes go first so later escapes are not doubled.
    escaped_label = (
        label.lower()
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?concept ?prefLabel
    WHERE {{
        ?concept skos:prefLabel ?prefLabel .
        FILTER(lcase(str(?prefLabel)) = "{escaped_label}")
        FILTER(lang(?prefLabel) = "en" || lang(?prefLabel) = "")
    }}
    LIMIT 10
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
        bindings = results["results"]["bindings"]

        if not bindings:
            print(f"No concept found with label '{label}'")
            return None

        concepts = [
            {
                'uri': binding['concept']['value'],
                'prefLabel': binding['prefLabel']['value']
            }
            for binding in bindings
        ]

        # If multiple results, show them so the caller can see the ambiguity
        if len(concepts) > 1:
            print(f"Found {len(concepts)} concepts matching '{label}':")
            for i, concept in enumerate(concepts, 1):
                print(f"  {i}. {concept['prefLabel']} - {concept['uri']}")

        # The first match is the one returned
        return concepts[0]['uri']

    except Exception as e:
        # Best-effort lookup: report the failure and signal "not found"
        print(f"Error searching for concept: {e}")
        return None


# Example usage
# Demo driver: fetch the one-hop AGROVOC neighborhood of a seed label and
# print a preview plus summary statistics. `seed_label` and `neighbors`
# are left in the notebook namespace and reused by later cells.
if __name__ == "__main__":
    # Query one-hop neighbors using the prefLabel directly
    seed_label = "soil"
    
    print(f"Querying one-hop neighbors of '{seed_label}' in AGROVOC...")
    print("=" * 80)
    
    # Returns an empty list if the endpoint query fails
    neighbors = query_agrovoc_one_hop_neighbors(seed_label)
    
    print(f"\nFound {len(neighbors)} unique one-hop neighbors:")
    print("=" * 80)
    
    # Display first 20 neighbors as examples
    for i, neighbor in enumerate(neighbors[:20], 1):
        print(f"{i:3d}. {neighbor['displayName']}")
        print(f"      URI: {neighbor['uri']}")
        if neighbor['altLabels']:
            print(f"      Alt Labels: {', '.join(neighbor['altLabels'])}")
        print(f"      Relations ({len(neighbor['relations'])}):")
        for rel in neighbor['relations']:
            # relationLabel may be a full predicate URI (fallback when no English
            # label exists); keep only the part after the last '#' or '/'.
            relation_short = rel['relationLabel'].split('#')[-1].split('/')[-1]
            print(f"        - {relation_short}")
        print()
    
    if len(neighbors) > 20:
        print(f"... and {len(neighbors) - 20} more neighbors")
    
    # Summary statistics
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total unique concepts found: {len(neighbors)}")
    
    # Count concepts with altLabels
    concepts_with_alt = sum(1 for n in neighbors if n['altLabels'])
    print(f"Concepts with alternative labels: {concepts_with_alt}")
    
    # Count total relations
    total_relations = sum(len(n['relations']) for n in neighbors)
    print(f"Total relation instances: {total_relations}")
    # The conditional wraps the whole argument: with no neighbors the division is
    # never evaluated and only the bare text "N/A" is printed.
    print(f"Average relations per concept: {total_relations / len(neighbors):.2f}" if neighbors else "N/A")

Querying one-hop neighbors of 'soil' in AGROVOC...

Found 23 unique one-hop neighbors:
  1. edaphic factors (soil factors)
      URI: http://aims.fao.org/aos/agrovoc/c_15617
      Alt Labels: soil factors
      Relations (2):
        - Includes
        - Is included in

  2. evapotranspiration zone
      URI: http://aims.fao.org/aos/agrovoc/c_25346
      Relations (2):
        - Affects
        - Is affected by

  3. fine soil
      URI: http://aims.fao.org/aos/agrovoc/c_5749f32e
      Relations (2):
        - broader
        - narrower

  4. growing media (growth media, rooting media, soil substrates)
      URI: http://aims.fao.org/aos/agrovoc/c_3393
      Alt Labels: growth media, rooting media, soil substrates
      Relations (2):
        - Is use of
        - Is used as

  5. land cover
      URI: http://aims.fao.org/aos/agrovoc/c_37897
      Relations (2):
        - broader
        - narrower

  6. leachates
      URI: http://aims.fao.org/aos/agrovoc/c_35164
      Relations (2):
 

In [51]:
# System prompt for the sub-knowledge-graph relevance classifier. It tells the
# model to label each candidate node INCLUDE or EXCLUDE, with a confidence
# score, relative to a seed topic; it is sent as the "system" message of the
# LLM call.
# NOTE(review): the prompt asks for a bare JSON array, whereas the call parses
# the reply into SubKGExtractorResponse (an object with an `evaluations` list) —
# confirm the structured-output schema takes precedence over this instruction.
system_prompt_subkg = """# Sub-Knowledge Graph Node Relevance Classifier

## Task

You are a knowledge graph curator specializing in extracting topic-focused subgraphs. Evaluate multiple candidate nodes to determine whether each belongs in a subgraph centered around a seed topic.

## Input Format

- **Seed Topic**: [The central concept defining the subgraph scope]
- **Candidate Nodes**: [List of nodes, each formatted as: prefLabel (altLabel1, altLabel2, ...)]

## Core Inclusion Principle
If the seed topic appears as a primary modifier in a compound term, the term should generally be INCLUDED unless it clearly belongs to a completely different domain.

## Classification Criteria

Classify the candidate node as INCLUDE or EXCLUDE based on these principles:

### INCLUDE if the node represents:

1. **Core Concepts**: Direct properties, components, or types of the seed topic
2. **Essential Processes**: Intrinsic processes that define or transform the seed topic
3. **Direct Measurements**: Specific quantifiable properties of the seed topic
4. **Taxonomic Members**: Specific instances or subtypes within the seed topic's domain
5. **Constitutive Relations**: What the seed topic is made of or necessarily contains

### EXCLUDE if the node represents:

1. **Semantic Irrelevance**: No meaningful connection to the seed topic
2. **Overly Broad Terms**: Would encompass many unrelated concepts
3. **Domain Drift**: Belongs primarily to an adjacent but distinct domain
4. **Generic Terms**: Could apply to any scientific domain without specific connection
5. **Contextual Ambiguity**: Terms where the non-topic meaning dominates
6. **Administrative/Social Constructs**: Human organizational frameworks around the topic

## Decision Framework

1. **Primary Domain Test**: Does this node's primary conceptual home align with the seed topic?
2. **Essentiality Test**: Would removing this node leave a significant gap in understanding the seed topic?
3. **Scope Test**: Does including this node maintain appropriate granularity without scope creep?
4. **Coherence Test**: Does this node strengthen or dilute the semantic coherence of the subgraph?

Consider ALL provided labels (prefLabel and altLabels) when making decisions. If any label strongly indicates relevance, weight that appropriately.

## Output Format

For each candidate node, provide:

- **Node**: [prefLabel exactly as given]
- **Classification**: INCLUDE or EXCLUDE
- **Confidence**: [0.0-1.0]

Format as a JSON array for easy parsing.

## Example

**Seed Topic**: soil
**Candidate Nodes**: [agricultural economics (farm economics, agribusiness economics), pH meter (soil pH meter, pH measurement device), clay soils (heavy soils)]

**Output**:

```json
[
  {"node": "agricultural economics", "classification": "EXCLUDE", "confidence": 0.85},
  {"node": "pH meter", "classification": "EXCLUDE", "confidence": 0.70},
  {"node": "clay soils", "classification": "INCLUDE", "confidence": 0.94}
]
```

## Confidence Score Guidelines

- **0.9-1.0**: Clear, unambiguous fit or misfit with seed topic
- **0.7-0.89**: Strong indication but some edge cases exist
- **0.5-0.69**: Borderline case, could reasonably go either way
- **0.3-0.49**: Leaning toward classification but significant uncertainty
- **0.0-0.29**: Very uncertain, requires additional context"""

In [27]:
# Generate formatted list of all neighbor nodes for LLM evaluation
def format_neighbors_list(neighbors):
    """
    Render each neighbor as a single label string for the LLM prompt.

    A neighbor with alternative labels becomes "prefLabel (alt1, alt2, ...)";
    a neighbor without any becomes just "prefLabel".

    Args:
        neighbors: List of neighbor dictionaries; each must have 'prefLabel'
            and may have 'altLabels' (a list of strings)

    Returns:
        list: One formatted string per neighbor, in input order
    """
    def _render(entry):
        # Read the mandatory label first, then the optional alternatives
        main_label = entry['prefLabel']
        extras = entry.get('altLabels', [])
        if not extras:
            return main_label
        joined = ', '.join(extras)
        return f"{main_label} ({joined})"

    return [_render(entry) for entry in neighbors]

# Generate the formatted list.
# Depends on `neighbors` produced by the AGROVOC one-hop query cell, so that
# cell must have been run first in this kernel.
candidate_nodes_list = format_neighbors_list(neighbors)

# Preview: total count plus the first 10 formatted entries
print(f"Generated list of {len(candidate_nodes_list)} candidate nodes for evaluation")
print(f"\nFirst 10 nodes as preview:")
for i, node in enumerate(candidate_nodes_list[:10], 1):
    print(f"  {i}. {node}")

# Indicate how many entries were left out of the preview
if len(candidate_nodes_list) > 10:
    print(f"  ... and {len(candidate_nodes_list) - 10} more nodes")

Generated list of 23 candidate nodes for evaluation

First 10 nodes as preview:
  1. edaphic factors (soil factors)
  2. evapotranspiration zone
  3. fine soil
  4. growing media (growth media, rooting media, soil substrates)
  5. land cover
  6. leachates
  7. losses from soil (losses from soil systems)
  8. minerals
  9. mud
  10. pedon
  ... and 13 more nodes


In [31]:
# User message for the classifier: the seed topic plus all candidate nodes.
# `candidate_nodes_list` is a Python list, so it is interpolated as its list
# repr (square brackets, each item quoted) rather than as a hand-formatted string.
user_prompt_subkg = f"""**Seed Topic**: {seed_label}

**Candidate Nodes**: {candidate_nodes_list}

Please evaluate each candidate node and classify whether it should be INCLUDED or EXCLUDED from the sub-knowledge graph centered around "{seed_label}". Provide your response in the JSON array format specified in the system prompt."""

In [32]:
# OpenAI client created with default settings (no arguments passed here)
client = OpenAI()

# Define the model for a single node evaluation
# (fields mirror the keys requested in system_prompt_subkg)
class NodeEvaluation(BaseModel):
    node: str  # candidate's label as echoed back by the model
    classification: str  # compared against "INCLUDE" / "EXCLUDE" case-insensitively when summarising
    confidence: float  # the prompt asks for a score in the 0.0-1.0 range

# Define the model for the complete response (list of evaluations)
class SubKGExtractorResponse(BaseModel):
    evaluations: List[NodeEvaluation]  # one entry per evaluated candidate node

# Ask the model to classify every candidate node in one call; the reply is
# parsed into the SubKGExtractorResponse schema.
response = client.responses.parse(
    model="gpt-5",
    input=[
        {"role": "system", "content": system_prompt_subkg},
        {
            "role": "user",
            "content": user_prompt_subkg,
        },
    ],
    text_format=SubKGExtractorResponse,
)

result = response.output_parsed
print(f"Evaluated {len(result.evaluations)} nodes:")
print("=" * 80)

# Display all evaluations.
# The loop variable is deliberately not called `eval`: at cell level that would
# replace the builtin `eval` for the rest of the kernel session.
for i, evaluation in enumerate(result.evaluations, 1):
    print(f"{i:3d}. {evaluation.node}")
    print(f"      Classification: {evaluation.classification}")
    print(f"      Confidence: {evaluation.confidence}")
    print()

# Summary statistics (classification is compared case-insensitively)
included = [e for e in result.evaluations if e.classification.upper() == "INCLUDE"]
excluded = [e for e in result.evaluations if e.classification.upper() == "EXCLUDE"]

print("=" * 80)
print(f"SUMMARY: {len(included)} INCLUDE, {len(excluded)} EXCLUDE")

Evaluated 23 nodes:
  1. edaphic factors
      Classification: INCLUDE
      Confidence: 0.95

  2. evapotranspiration zone
      Classification: EXCLUDE
      Confidence: 0.68

  3. fine soil
      Classification: INCLUDE
      Confidence: 0.9

  4. growing media
      Classification: INCLUDE
      Confidence: 0.72

  5. land cover
      Classification: EXCLUDE
      Confidence: 0.9

  6. leachates
      Classification: INCLUDE
      Confidence: 0.75

  7. losses from soil
      Classification: INCLUDE
      Confidence: 0.92

  8. minerals
      Classification: EXCLUDE
      Confidence: 0.8

  9. mud
      Classification: EXCLUDE
      Confidence: 0.7

 10. pedon
      Classification: INCLUDE
      Confidence: 0.95

 11. plant litter
      Classification: INCLUDE
      Confidence: 0.9

 12. rhizosphere
      Classification: INCLUDE
      Confidence: 0.97

 13. soil air
      Classification: INCLUDE
      Confidence: 0.95

 14. soil improvement
      Classification: INCLUDE
      Confi

In [None]:
# Iterative Sub-Knowledge Graph Extraction
# This cell runs the complete extraction process in one go

def iterative_subkg_extraction(seed_label, endpoint="https://agrovoc.fao.org/sparql", max_iterations=10, batch_size=30):
    """
    Iteratively extract a sub-knowledge graph centered around a seed concept.

    The one-hop neighbors of the seed are the first candidates. In every
    iteration the not-yet-visited candidates are split into batches, each batch
    is classified by the LLM (INCLUDE / anything else is treated as EXCLUDE),
    and the one-hop neighbors of the included nodes become the candidates of
    the next iteration. The loop ends when no unvisited candidate is left, when
    an iteration includes no node, or after ``max_iterations`` iterations.

    Args:
        seed_label: The preferred label of the seed concept
        endpoint: SPARQL endpoint URL
        max_iterations: Maximum number of iterations to prevent infinite loops
        batch_size: Maximum number of nodes to evaluate in one LLM call (default: 30)

    Returns:
        dict: Always contains the keys 'all_visited', 'all_included',
        'all_pruned' (lists of node dicts; included/pruned nodes carry a
        'confidence' entry), 'iterations', 'iteration_details' (one dict per
        evaluated iteration with 'iteration', 'evaluated', 'included',
        'excluded' and 'unclassified' counts) and 'seed_label'.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Function-scope import: resolved once per call instead of on every loop pass.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Fail fast, before any SPARQL or LLM request is issued.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    print("="*100)
    print(f"ITERATIVE SUB-KNOWLEDGE GRAPH EXTRACTION")
    print(f"Seed Topic: {seed_label}")
    print(f"Batch Size: {batch_size} nodes per LLM call")
    print("="*100)

    # Node bookkeeping, keyed by URI so that every concept is stored only once
    all_visited_nodes = {}    # URI -> node info
    all_included_nodes = {}   # URI -> node info (+ confidence)
    all_pruned_nodes = {}     # URI -> node info (+ confidence)

    iteration = 0
    iteration_details = []

    def build_results():
        """Assemble the result dict; shared by every exit path so the keys always match."""
        return {
            'all_visited': list(all_visited_nodes.values()),
            'all_included': list(all_included_nodes.values()),
            'all_pruned': list(all_pruned_nodes.values()),
            'iterations': iteration,
            'iteration_details': iteration_details,
            'seed_label': seed_label
        }

    def evaluate_batch(batch_info):
        """
        Evaluate a single batch of nodes using LLM.
        This function is submitted to the thread pool once per batch and never
        raises: failures are reported through the 'success' flag.
        """
        batch_idx = batch_info['idx']

        # Format nodes for LLM evaluation
        candidate_nodes_list = format_neighbors_list(batch_info['nodes'])

        # Create user prompt
        user_prompt = f"""**Seed Topic**: {seed_label}

**Candidate Nodes**: {candidate_nodes_list}

Please evaluate each candidate node and classify whether it should be INCLUDED or EXCLUDED from the sub-knowledge graph centered around "{seed_label}". Provide your response in the JSON array format specified in the system prompt."""

        # Call LLM for evaluation
        try:
            response = client.responses.parse(
                model="gpt-5",
                input=[
                    {"role": "system", "content": system_prompt_subkg},
                    {"role": "user", "content": user_prompt},
                ],
                text_format=SubKGExtractorResponse
            )

            result = response.output_parsed
            return {
                'success': True,
                'batch_idx': batch_idx,
                'evaluations': result.evaluations,
                'count': len(result.evaluations)
            }

        except Exception as e:
            return {
                'success': False,
                'batch_idx': batch_idx,
                'error': str(e)
            }

    def find_node(node_label, lookup_by_pref, lookup_by_display):
        """
        Try to find a node by various matching strategies.
        Handles cases where LLM outputs prefLabel, displayName, or prefLabel (altLabels) format.
        """
        # Direct match by prefLabel
        if node_label in lookup_by_pref:
            return lookup_by_pref[node_label]

        # Direct match by displayName
        if node_label in lookup_by_display:
            return lookup_by_display[node_label]

        # Try to extract prefLabel from "prefLabel (altLabel1, altLabel2)" format
        if '(' in node_label and ')' in node_label:
            extracted_pref = node_label.split('(')[0].strip()
            if extracted_pref in lookup_by_pref:
                return lookup_by_pref[extracted_pref]

        # Case-insensitive fallback
        node_label_lower = node_label.lower()
        for pref, node in lookup_by_pref.items():
            if pref.lower() == node_label_lower:
                return node

        return None

    # Add seed node to included list
    print(f"\n{'='*100}")
    print(f"INITIALIZATION: Adding seed node to included list")
    print(f"{'='*100}")

    # A single neighbor query serves both the seed registration and iteration 0
    initial_neighbors = query_agrovoc_one_hop_neighbors(seed_label, endpoint)
    if initial_neighbors:
        # Get the seed node URI by querying it
        try:
            seed_uri = find_concept_uri_by_label(seed_label, endpoint)
            if seed_uri:
                seed_node = {
                    'uri': seed_uri,
                    'prefLabel': seed_label,
                    'altLabels': [],
                    'displayName': seed_label,
                    'relations': [],
                    'confidence': 1.0  # Seed node always has confidence 1.0
                }
                all_visited_nodes[seed_uri] = seed_node
                all_included_nodes[seed_uri] = seed_node
                print(f"Seed node '{seed_label}' added to included list (URI: {seed_uri})")
        except Exception as e:
            print(f"Warning: Could not add seed node: {e}")

    print(f"\n{'='*100}")
    print(f"ITERATION 0: Querying neighbors of seed node '{seed_label}'")
    print(f"{'='*100}")
    print(f"Found {len(initial_neighbors or [])} initial neighbors")

    if not initial_neighbors:
        print("No neighbors found for seed node. Exiting.")
        return build_results()

    # Nodes to evaluate in the next iteration
    nodes_to_evaluate = list(initial_neighbors)

    while iteration < max_iterations:
        iteration += 1

        print(f"\n{'='*100}")
        print(f"ITERATION {iteration}")
        print(f"{'='*100}")

        # Filter out nodes we've already visited
        new_nodes_to_evaluate = [n for n in nodes_to_evaluate if n['uri'] not in all_visited_nodes]

        if not new_nodes_to_evaluate:
            print(f"No new nodes to evaluate. All {len(nodes_to_evaluate)} nodes have been visited before.")
            print("Terminating: No unvisited nodes remaining.")
            break

        print(f"Evaluating {len(new_nodes_to_evaluate)} new nodes (filtered from {len(nodes_to_evaluate)} total)")

        # Mark these nodes as visited
        for node in new_nodes_to_evaluate:
            all_visited_nodes[node['uri']] = node

        # Split nodes into batches of at most `batch_size` nodes
        batches = [
            {'idx': batch_idx + 1, 'nodes': new_nodes_to_evaluate[start_idx:start_idx + batch_size]}
            for batch_idx, start_idx in enumerate(range(0, len(new_nodes_to_evaluate), batch_size))
        ]
        num_batches = len(batches)
        if num_batches > 1:
            print(f"Splitting into {num_batches} batches of max {batch_size} nodes each")
            print(f"Processing batches in parallel...")

        # Execute batches in parallel using ThreadPoolExecutor
        all_evaluations = []

        with ThreadPoolExecutor(max_workers=min(num_batches, 5)) as executor:
            # Submit all batch evaluation tasks
            future_to_batch = {executor.submit(evaluate_batch, batch): batch for batch in batches}

            # Process results as they complete
            completed = 0
            for future in as_completed(future_to_batch):
                completed += 1
                batch = future_to_batch[future]

                try:
                    result = future.result()

                    if result['success']:
                        all_evaluations.extend(result['evaluations'])
                        print(f"✓ Batch {result['batch_idx']}/{num_batches} completed: "
                              f"{result['count']} evaluations received "
                              f"[{completed}/{num_batches} batches finished]")
                    else:
                        print(f"✗ Batch {result['batch_idx']}/{num_batches} failed: "
                              f"{result['error']} "
                              f"[{completed}/{num_batches} batches finished]")

                except Exception as e:
                    print(f"✗ Batch {batch['idx']}/{num_batches} exception: {e} "
                          f"[{completed}/{num_batches} batches finished]")

        print(f"\nAll batches completed. Total evaluations collected: {len(all_evaluations)}")

        # Process results from all batches
        iteration_included = []
        iteration_excluded = []
        classified_uris = set()  # URIs that received a classification in this iteration

        # Lookup tables for matching the labels echoed by the LLM back to nodes
        node_lookup_by_pref = {n['prefLabel']: n for n in new_nodes_to_evaluate}
        node_lookup_by_display = {n['displayName']: n for n in new_nodes_to_evaluate}

        for eval_result in all_evaluations:
            node_label = eval_result.node

            # Find the corresponding node using flexible matching
            node_info = find_node(node_label, node_lookup_by_pref, node_lookup_by_display)

            if node_info is None:
                print(f"Warning: Node '{node_label}' not found in original list")
                continue

            node_uri = node_info['uri']

            # A node evaluated twice must not end up both included and pruned,
            # nor have its neighbors expanded twice: the first verdict wins.
            if node_uri in classified_uris:
                print(f"Warning: Node '{node_label}' was evaluated more than once; keeping the first classification")
                continue
            classified_uris.add(node_uri)

            # Store node info with confidence score
            node_with_confidence = node_info.copy()
            node_with_confidence['confidence'] = eval_result.confidence
            iteration_record = {
                'node': node_info,
                'confidence': eval_result.confidence
            }

            if eval_result.classification.upper() == "INCLUDE":
                all_included_nodes[node_uri] = node_with_confidence
                iteration_included.append(iteration_record)
            else:  # EXCLUDE
                all_pruned_nodes[node_uri] = node_with_confidence
                iteration_excluded.append(iteration_record)

        print(f"\nResults: {len(iteration_included)} INCLUDED, {len(iteration_excluded)} EXCLUDED")

        # Nodes that stay visited without a verdict (failed batch, label mismatch)
        unclassified_count = sum(1 for n in new_nodes_to_evaluate if n['uri'] not in classified_uris)
        if unclassified_count:
            print(f"Warning: {unclassified_count} visited nodes received no classification in this iteration")

        # Display included nodes
        if iteration_included:
            print(f"\nIncluded nodes:")
            for i, item in enumerate(iteration_included[:10], 1):
                print(f"  {i}. {item['node']['displayName']} (confidence: {item['confidence']:.2f})")
            if len(iteration_included) > 10:
                print(f"  ... and {len(iteration_included) - 10} more")

        # Store iteration details
        iteration_details.append({
            'iteration': iteration,
            'evaluated': len(new_nodes_to_evaluate),
            'included': len(iteration_included),
            'excluded': len(iteration_excluded),
            'unclassified': unclassified_count
        })

        # Check termination condition: no nodes included
        if not iteration_included:
            print("\nTerminating: No nodes were included in this iteration.")
            break

        # Query AGROVOC for neighbors of all included nodes
        print(f"\nQuerying neighbors for {len(iteration_included)} included nodes...")
        next_iteration_nodes = []

        for item in iteration_included:
            node_label = item['node']['prefLabel']
            try:
                neighbors = query_agrovoc_one_hop_neighbors(node_label, endpoint)
                next_iteration_nodes.extend(neighbors)
            except Exception as e:
                print(f"Error querying neighbors for '{node_label}': {e}")

        # Deduplicate by URI (first occurrence wins)
        unique_next_nodes = {}
        for node in next_iteration_nodes:
            unique_next_nodes.setdefault(node['uri'], node)

        nodes_to_evaluate = list(unique_next_nodes.values())
        print(f"Collected {len(nodes_to_evaluate)} unique neighbors for next iteration")

    # Final summary
    print(f"\n{'='*100}")
    print(f"EXTRACTION COMPLETE")
    print(f"{'='*100}")
    print(f"Total iterations: {iteration}")
    print(f"Total nodes visited: {len(all_visited_nodes)}")
    print(f"Total nodes included: {len(all_included_nodes)}")
    print(f"Total nodes pruned: {len(all_pruned_nodes)}")

    # Iteration summary
    print(f"\nIteration summary:")
    for detail in iteration_details:
        print(f"  Iteration {detail['iteration']}: {detail['evaluated']} evaluated, "
              f"{detail['included']} included, {detail['excluded']} excluded")

    return build_results()


# Run the extraction from the seed concept "soil": at most 20 iterations, with
# up to 30 candidate nodes per LLM call. The returned dict is kept in
# `extraction_results`, which the analysis and CSV-export cells below read.
extraction_results = iterative_subkg_extraction(seed_label="soil", max_iterations=20, batch_size=30)

ITERATIVE SUB-KNOWLEDGE GRAPH EXTRACTION
Seed Topic: soil
Batch Size: 30 nodes per LLM call

INITIALIZATION: Adding seed node to included list
Seed node 'soil' added to included list (URI: http://aims.fao.org/aos/agrovoc/c_7156)

ITERATION 0: Querying neighbors of seed node 'soil'
Seed node 'soil' added to included list (URI: http://aims.fao.org/aos/agrovoc/c_7156)

ITERATION 0: Querying neighbors of seed node 'soil'
Found 23 initial neighbors

ITERATION 1
Evaluating 23 new nodes (filtered from 23 total)
Found 23 initial neighbors

ITERATION 1
Evaluating 23 new nodes (filtered from 23 total)
✓ Batch 1/1 completed: 23 evaluations received [1/1 batches finished]

All batches completed. Total evaluations collected: 23

Results: 16 INCLUDED, 7 EXCLUDED

Included nodes:
  1. edaphic factors (soil factors) (confidence: 0.95)
  2. fine soil (confidence: 0.85)
  3. losses from soil (losses from soil systems) (confidence: 0.92)
  4. minerals (confidence: 0.80)
  5. pedon (confidence: 0.95)
  6.

In [47]:
# Display and analyze extraction results

print("="*100)
print("DETAILED RESULTS ANALYSIS")
print("="*100)

# `seed_label` is read with a fallback so that a result dict without that key
# still produces a report instead of a KeyError.
print(f"\nSeed Topic: {extraction_results.get('seed_label', 'unknown')}")
print(f"Total Iterations: {extraction_results['iterations']}")

# One entry per listing: (heading, result key, intro line, max rows or None for all)
listing_sections = [
    ("ALL VISITED NODES", 'all_visited', "First 20 nodes:", 20),
    ("ALL INCLUDED NODES (SUB-KG)", 'all_included', "All included nodes:", None),
    ("ALL PRUNED NODES", 'all_pruned', "First 20 pruned nodes:", 20),
]

for section_title, section_key, section_intro, section_limit in listing_sections:
    section_nodes = extraction_results[section_key]

    print(f"\n{'='*100}")
    print(section_title)
    print(f"{'='*100}")
    print(f"Total: {len(section_nodes)}")
    print(f"\n{section_intro}")

    section_shown = section_nodes if section_limit is None else section_nodes[:section_limit]
    for i, node in enumerate(section_shown, 1):
        print(f"  {i:3d}. {node['displayName']}")
        print(f"       URI: {node['uri']}")

    if section_limit is not None and len(section_nodes) > section_limit:
        print(f"  ... and {len(section_nodes) - section_limit} more")

# Statistics
print(f"\n{'='*100}")
print("STATISTICS")
print(f"{'='*100}")
total = len(extraction_results['all_visited'])
included = len(extraction_results['all_included'])
pruned = len(extraction_results['all_pruned'])

# Guard the percentages: an extraction that visited nothing has total == 0
included_pct = included / total * 100 if total else 0.0
pruned_pct = pruned / total * 100 if total else 0.0

print(f"Total nodes visited: {total}")
print(f"Nodes included in sub-KG: {included} ({included_pct:.1f}%)")
print(f"Nodes pruned: {pruned} ({pruned_pct:.1f}%)")

# Count nodes with altLabels
included_with_alt = sum(1 for n in extraction_results['all_included'] if n.get('altLabels'))
pruned_with_alt = sum(1 for n in extraction_results['all_pruned'] if n.get('altLabels'))

print(f"\nIncluded nodes with altLabels: {included_with_alt}")
print(f"Pruned nodes with altLabels: {pruned_with_alt}")

DETAILED RESULTS ANALYSIS

Seed Topic: soil
Total Iterations: 11

ALL VISITED NODES
Total: 1768

First 20 nodes:
    1. soil
       URI: http://aims.fao.org/aos/agrovoc/c_7156
    2. edaphic factors (soil factors)
       URI: http://aims.fao.org/aos/agrovoc/c_15617
    3. evapotranspiration zone
       URI: http://aims.fao.org/aos/agrovoc/c_25346
    4. fine soil
       URI: http://aims.fao.org/aos/agrovoc/c_5749f32e
    5. growing media (growth media, rooting media, soil substrates)
       URI: http://aims.fao.org/aos/agrovoc/c_3393
    6. land cover
       URI: http://aims.fao.org/aos/agrovoc/c_37897
    7. leachates
       URI: http://aims.fao.org/aos/agrovoc/c_35164
    8. losses from soil (losses from soil systems)
       URI: http://aims.fao.org/aos/agrovoc/c_36778
    9. minerals
       URI: http://aims.fao.org/aos/agrovoc/c_4857
   10. mud
       URI: http://aims.fao.org/aos/agrovoc/c_4973
   11. pedon
       URI: http://aims.fao.org/aos/agrovoc/c_7f60f575
   12. plant litter (

In [49]:
# Save included and pruned nodes to separate CSV files with confidence scores
def save_classification_results_to_csv(extraction_results, included_filename=None, pruned_filename=None):
    """
    Save included and pruned nodes to separate CSV files with confidence scores.
    Also identify and print nodes that were visited but not classified.

    Both files share the columns prefLabel, altLabels (joined with "; "),
    displayName, uri and confidence, and are sorted by confidence in
    descending order. A missing or None confidence is written as 0.0.

    Args:
        extraction_results: Results dictionary from iterative_subkg_extraction
            (reads 'all_included', 'all_pruned', 'all_visited' and, for the
            default filenames, 'seed_label').
        included_filename: Optional custom filename for included nodes. If None, generates timestamp-based name.
        pruned_filename: Optional custom filename for pruned nodes. If None, generates timestamp-based name.

    Returns:
        tuple: (included_csv_path, pruned_csv_path)
    """
    fieldnames = ['prefLabel', 'altLabels', 'displayName', 'uri', 'confidence']

    def prepare_csv_data(nodes):
        """Turn node dicts into CSV rows, highest confidence first."""
        csv_data = []
        for node in nodes:
            confidence = node.get('confidence')
            csv_data.append({
                'prefLabel': node['prefLabel'],
                'altLabels': '; '.join(node.get('altLabels') or []),
                'displayName': node['displayName'],
                'uri': node['uri'],
                # None would break both the sort and the ':.2f' formatting below
                'confidence': 0.0 if confidence is None else confidence
            })
        csv_data.sort(key=lambda row: row['confidence'], reverse=True)
        return csv_data

    def write_csv(filename, rows):
        """Write one result file (header + rows) as UTF-8."""
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

    def print_file_summary(heading, filename, nodes, rows, top_heading):
        """Print filename, size, confidence range and the five top rows of one file."""
        print(heading)
        print(f"   Filename: {filename}")
        print(f"   Total nodes: {len(nodes)}")
        if rows:
            print(f"   Confidence range: {rows[-1]['confidence']:.2f} - {rows[0]['confidence']:.2f}")
            print(top_heading)
            for i, row in enumerate(rows[:5], 1):
                print(f"      {i}. {row['prefLabel']} (confidence: {row['confidence']:.2f})")

    # Generate filenames if not provided. Whitespace and characters that are
    # path separators or reserved on common filesystems become '_' so that a
    # seed label such as "soil/water" cannot point into a non-existent folder.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    unsafe_filename_chars = set(' \t\\/:*?"<>|')
    seed = ''.join(
        '_' if ch in unsafe_filename_chars else ch
        for ch in str(extraction_results.get('seed_label', 'unknown'))
    )

    if included_filename is None:
        included_filename = f"subkg_{seed}_INCLUDED_{timestamp}.csv"
    if pruned_filename is None:
        pruned_filename = f"subkg_{seed}_PRUNED_{timestamp}.csv"

    included_nodes = extraction_results['all_included']
    pruned_nodes = extraction_results['all_pruned']
    all_visited_nodes = extraction_results['all_visited']

    # Prepare and write both CSV files
    included_csv_data = prepare_csv_data(included_nodes)
    pruned_csv_data = prepare_csv_data(pruned_nodes)
    write_csv(included_filename, included_csv_data)
    write_csv(pruned_filename, pruned_csv_data)

    # Identify unclassified nodes (visited but neither included nor pruned)
    classified_uris = {node['uri'] for node in included_nodes} | {node['uri'] for node in pruned_nodes}
    unclassified_nodes = [node for node in all_visited_nodes if node['uri'] not in classified_uris]

    # Print summary
    print("="*100)
    print("SAVED CLASSIFICATION RESULTS TO CSV FILES")
    print("="*100)

    print_file_summary(f"\n📁 INCLUDED NODES FILE", included_filename, included_nodes,
                       included_csv_data, f"   Top 5 nodes by confidence:")
    print_file_summary(f"\n📁 PRUNED NODES FILE", pruned_filename, pruned_nodes,
                       pruned_csv_data, f"   Top 5 pruned nodes by confidence:")

    print(f"\n📊 CSV STRUCTURE")
    print(f"   - prefLabel: The preferred label of the concept")
    print(f"   - altLabels: Alternative labels (semicolon-separated)")
    print(f"   - displayName: Formatted display name (prefLabel with altLabels)")
    print(f"   - uri: The unique URI of the concept")
    print(f"   - confidence: Confidence score from LLM classification (0.0-1.0)")
    print(f"   - Sorted by: Confidence score (descending)")

    # Print unclassified nodes
    print(f"\n{'='*100}")
    print(f"UNCLASSIFIED NODES (Visited but not classified)")
    print(f"{'='*100}")
    print(f"Total: {len(unclassified_nodes)}")

    if unclassified_nodes:
        print(f"\nThese nodes were visited but not classified by the LLM:")
        for i, node in enumerate(unclassified_nodes, 1):
            print(f"   {i:3d}. {node['displayName']}")
            print(f"        URI: {node['uri']}")

        print(f"\n⚠ Note: Unclassified nodes may occur if:")
        print(f"   - The seed node was automatically added to included list")
        print(f"   - There were errors during LLM evaluation")
        print(f"   - Node matching failed between LLM output and original list")
    else:
        print(f"\n✓ All visited nodes have been classified (either included or pruned)")

    print(f"\n{'='*100}")
    print(f"Files saved successfully!")
    print(f"{'='*100}")

    return included_filename, pruned_filename


# Export the classified nodes; the helper hands back the two CSV paths it wrote
included_csv, pruned_csv = save_classification_results_to_csv(extraction_results)

# Short recap of the extraction run
print(f"\n✅ EXTRACTION COMPLETE")
print(f"   Total iterations: {extraction_results['iterations']}")
for recap_caption, recap_key in (
    ("visited", 'all_visited'),
    ("included", 'all_included'),
    ("pruned", 'all_pruned'),
):
    print(f"   Total nodes {recap_caption}: {len(extraction_results[recap_key])}")


SAVED CLASSIFICATION RESULTS TO CSV FILES

📁 INCLUDED NODES FILE
   Filename: subkg_soil_INCLUDED_20251111_013208.csv
   Total nodes: 881
   Confidence range: 0.00 - 0.00
   Top 5 nodes by confidence:
      1. soil (confidence: 0.00)
      2. edaphic factors (confidence: 0.00)
      3. fine soil (confidence: 0.00)
      4. losses from soil (confidence: 0.00)
      5. minerals (confidence: 0.00)

📁 PRUNED NODES FILE
   Filename: subkg_soil_PRUNED_20251111_013208.csv
   Total nodes: 885
   Confidence range: 0.00 - 0.00
   Top 5 pruned nodes by confidence:
      1. evapotranspiration zone (confidence: 0.00)
      2. growing media (confidence: 0.00)
      3. land cover (confidence: 0.00)
      4. leachates (confidence: 0.00)
      5. mud (confidence: 0.00)

📊 CSV STRUCTURE
   - prefLabel: The preferred label of the concept
   - altLabels: Alternative labels (semicolon-separated)
   - displayName: Formatted display name (prefLabel with altLabels)
   - uri: The unique URI of the concept
   -

#### Query a local KG/SKOS vocab

In [53]:
# ===========================================================================================
# LOCAL KNOWLEDGE GRAPH VERSION - Query from SoilVoc.ttl
# ===========================================================================================

def query_local_skos_one_hop_neighbors(seed_label, ttl_file="SoilVoc.ttl"):
    """
    Query all one-hop neighbors of a seed concept from a local SKOS vocabulary file.

    The seed is the first concept whose skos:prefLabel equals ``seed_label``
    (case-insensitive). Neighbors are URI resources linked to the seed, in
    either direction, through one of the hierarchical, associative or mapping
    predicates listed in ``skos_predicates``; resources without a
    skos:prefLabel are skipped.

    Args:
        seed_label: Preferred label of the seed concept (e.g., "Soil")
        ttl_file: Path to the TTL file containing the SKOS vocabulary

    Returns:
        list: List of unique dictionaries containing neighbor information
        (deduplicated by URI). Each dict has the keys 'uri', 'prefLabel',
        'altLabels', 'displayName' and 'relations'; every relation entry holds
        'predicate', 'label' and 'direction' ('outgoing' or 'incoming').
        An empty list is returned when the seed concept is not found.
    """

    # Load the RDF graph from the TTL file
    g = Graph()
    g.parse(ttl_file, format="turtle")

    # Find the seed concept by its prefLabel (case-insensitive)
    seed_label_lower = seed_label.lower()
    seed_uri = next(
        (s for s, _, o in g.triples((None, SKOS.prefLabel, None)) if str(o).lower() == seed_label_lower),
        None
    )

    if not seed_uri:
        print(f"Seed concept '{seed_label}' not found in {ttl_file}")
        return []

    print(f"Found seed concept: {seed_uri}")

    # Predicates to follow. skos:inScheme, skos:exactMatch and skos:closeMatch
    # are not listed here and are therefore never traversed.
    skos_predicates = [
        SKOS.broader, SKOS.narrower, SKOS.related,
        SKOS.broaderTransitive, SKOS.narrowerTransitive,
        SKOS.relatedMatch, SKOS.broadMatch, SKOS.narrowMatch
    ]

    # The label of a predicate does not depend on the neighbor, so resolve it
    # once per predicate: rdfs:label if the graph has one, else the local name.
    relation_label_by_predicate = {}
    for relation in skos_predicates:
        relation_labels = list(g.objects(relation, RDFS.label))
        relation_label_by_predicate[relation] = (
            str(relation_labels[0]) if relation_labels
            else str(relation).split('#')[-1].split('/')[-1]
        )

    # Collect all neighbors
    neighbors_dict = {}  # Use dict to deduplicate by URI

    def record_neighbor(neighbor, relation, direction):
        """Register `neighbor` (first sighting) and append the relation that reached it."""
        if neighbor == seed_uri or not isinstance(neighbor, URIRef):
            return

        pref_labels = list(g.objects(neighbor, SKOS.prefLabel))
        if not pref_labels:
            return  # Skip if no prefLabel

        neighbor_uri = str(neighbor)
        if neighbor_uri not in neighbors_dict:
            pref_label = str(pref_labels[0])
            alt_labels = [str(alt) for alt in g.objects(neighbor, SKOS.altLabel)]
            neighbors_dict[neighbor_uri] = {
                'uri': neighbor_uri,
                'prefLabel': pref_label,
                'altLabels': alt_labels,
                'displayName': f"{pref_label} ({', '.join(alt_labels)})" if alt_labels else pref_label,
                'relations': []
            }

        neighbors_dict[neighbor_uri]['relations'].append({
            'predicate': str(relation),
            'label': relation_label_by_predicate[relation],
            'direction': direction
        })

    # Query outgoing relationships (seed -> neighbor)
    for relation in skos_predicates:
        for neighbor in g.objects(seed_uri, relation):
            record_neighbor(neighbor, relation, 'outgoing')

    # Query incoming relationships (neighbor -> seed)
    for relation in skos_predicates:
        for neighbor in g.subjects(relation, seed_uri):
            record_neighbor(neighbor, relation, 'incoming')

    # Convert dict to list
    neighbors_list = list(neighbors_dict.values())

    print(f"Found {len(neighbors_list)} unique neighbors")

    return neighbors_list


def find_local_concept_uri_by_label(label, ttl_file="SoilVoc.ttl"):
    """
    Look up a concept in a local Turtle file by its skos:prefLabel.

    The comparison ignores case; the first matching concept wins.

    Args:
        label: The preferred label to search for (e.g., "Soil")
        ttl_file: Path to the TTL file containing the SKOS vocabulary

    Returns:
        str: URI of the concept, or None if not found
    """
    vocabulary = Graph()
    vocabulary.parse(ttl_file, format="turtle")

    # Lazily scan the prefLabel triples and stop at the first hit
    matching_uris = (
        str(concept)
        for concept, _, pref_label in vocabulary.triples((None, SKOS.prefLabel, None))
        if str(pref_label).lower() == label.lower()
    )
    return next(matching_uris, None)


def iterative_subkg_extraction_local(seed_label, ttl_file="SoilVoc.ttl", max_iterations=10, batch_size=30):
    """
    Iteratively extract a sub-knowledge graph centered around a seed concept from a LOCAL TTL file.

    Starting from the one-hop neighbors of the seed, every iteration sends the
    not-yet-visited candidates to the LLM in parallel batches, records each
    candidate as included or pruned, and then collects the one-hop neighbors of
    the newly included nodes as candidates for the next iteration. The loop ends
    when no unvisited candidates remain, when an iteration includes no node, or
    after ``max_iterations`` iterations.

    Args:
        seed_label: The preferred label of the seed concept
        ttl_file: Path to the local TTL file containing the SKOS vocabulary
        max_iterations: Maximum number of iterations to prevent infinite loops
        batch_size: Maximum number of nodes to evaluate in one LLM call (default: 30)

    Returns:
        dict with the keys
            'all_visited', 'all_included', 'all_pruned': lists of node dicts
                (included/pruned entries carry an extra 'confidence' key),
            'all_unevaluated': visited nodes for which no usable LLM verdict was
                obtained (failed batch, or label that could not be matched back),
            'iterations': number of loop iterations started,
            'iteration_details': per-iteration counters,
            'seed_label', 'source_file': echo of the inputs.
        The same keys are returned when the seed has no neighbors (all lists empty).

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    # Imported once per call rather than once per loop iteration.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("="*100)
    print("ITERATIVE SUB-KNOWLEDGE GRAPH EXTRACTION (LOCAL TTL FILE)")
    print(f"Source File: {ttl_file}")
    print(f"Seed Topic: {seed_label}")
    print(f"Batch Size: {batch_size} nodes per LLM call")
    print("="*100)

    # Node bookkeeping, keyed by URI so every concept is tracked exactly once.
    all_visited_nodes = {}      # URI -> node info
    all_included_nodes = {}     # URI -> node info (+ confidence)
    all_pruned_nodes = {}       # URI -> node info (+ confidence)
    all_unevaluated_nodes = {}  # URI -> node info (visited, but no usable verdict)

    print(f"\n{'='*100}")
    print("INITIALIZATION: Adding seed node to included list")
    print(f"{'='*100}")

    # The seed's neighbors are fetched once and reused below as the first
    # candidate set (the TTL file is re-parsed on every helper call).
    initial_neighbors = query_local_skos_one_hop_neighbors(seed_label, ttl_file)
    if initial_neighbors:
        try:
            seed_uri = find_local_concept_uri_by_label(seed_label, ttl_file)
            if seed_uri:
                seed_node = {
                    'uri': seed_uri,
                    'prefLabel': seed_label,
                    'altLabels': [],
                    'displayName': seed_label,
                    'relations': [],
                    'confidence': 1.0  # Seed node always has confidence 1.0
                }
                all_visited_nodes[seed_uri] = seed_node
                all_included_nodes[seed_uri] = seed_node
                print(f"Seed node '{seed_label}' added to included list (URI: {seed_uri})")
        except Exception as e:
            # Best effort: the extraction can proceed without the seed entry.
            print(f"Warning: Could not add seed node: {e}")

    print(f"\n{'='*100}")
    print(f"ITERATION 0: Querying neighbors of seed node '{seed_label}'")
    print(f"{'='*100}")
    print(f"Found {len(initial_neighbors)} initial neighbors")

    if not initial_neighbors:
        print("No neighbors found for seed node. Exiting.")
        # Same key set as the regular return so callers can index uniformly.
        return {
            'all_visited': [],
            'all_included': [],
            'all_pruned': [],
            'all_unevaluated': [],
            'iterations': 0,
            'iteration_details': [],
            'seed_label': seed_label,
            'source_file': ttl_file
        }

    def evaluate_batch(batch_info):
        """
        Evaluate a single batch of nodes using the LLM (run in a worker thread).
        Never raises: failures are reported through the 'success' flag.
        """
        batch_idx = batch_info['idx']
        batch_nodes = batch_info['nodes']

        # Format nodes for LLM evaluation
        candidate_nodes_list = format_neighbors_list(batch_nodes)

        user_prompt = f"""**Seed Topic**: {seed_label}

**Candidate Nodes**: {candidate_nodes_list}

Please evaluate each candidate node and classify whether it should be INCLUDED or EXCLUDED from the sub-knowledge graph centered around "{seed_label}". Provide your response in the JSON array format specified in the system prompt."""

        try:
            response = client.responses.parse(
                model="gpt-5",
                input=[
                    {"role": "system", "content": system_prompt_subkg},
                    {"role": "user", "content": user_prompt},
                ],
                text_format=SubKGExtractorResponse
            )

            result = response.output_parsed
            return {
                'success': True,
                'batch_idx': batch_idx,
                'evaluations': result.evaluations,
                'count': len(result.evaluations)
            }

        except Exception as e:
            return {
                'success': False,
                'batch_idx': batch_idx,
                'error': str(e)
            }

    def find_node(node_label, by_pref, by_display, by_pref_lower):
        """
        Map an LLM-reported label back to a candidate node.
        Handles prefLabel, displayName, "prefLabel (altLabels)" and case differences.
        """
        # Direct match by prefLabel
        if node_label in by_pref:
            return by_pref[node_label]

        # Direct match by displayName
        if node_label in by_display:
            return by_display[node_label]

        # Try to extract prefLabel from "prefLabel (altLabel1, altLabel2)" format
        if '(' in node_label and ')' in node_label:
            extracted_pref = node_label.split('(')[0].strip()
            if extracted_pref in by_pref:
                return by_pref[extracted_pref]

        # Case-insensitive fallback
        return by_pref_lower.get(node_label.lower())

    # Nodes to evaluate in the next iteration
    nodes_to_evaluate = list(initial_neighbors)

    iteration = 0
    iteration_details = []

    while iteration < max_iterations:
        iteration += 1

        print(f"\n{'='*100}")
        print(f"ITERATION {iteration}")
        print(f"{'='*100}")

        # Filter out nodes we've already visited
        new_nodes_to_evaluate = [n for n in nodes_to_evaluate if n['uri'] not in all_visited_nodes]

        if not new_nodes_to_evaluate:
            print(f"No new nodes to evaluate. All {len(nodes_to_evaluate)} nodes have been visited before.")
            print("Terminating: No unvisited nodes remaining.")
            break

        print(f"Evaluating {len(new_nodes_to_evaluate)} new nodes (filtered from {len(nodes_to_evaluate)} total)")

        # Mark these nodes as visited
        for node in new_nodes_to_evaluate:
            all_visited_nodes[node['uri']] = node

        # Slice the candidates into consecutive batches of at most batch_size nodes.
        batches = [
            {
                'idx': start_idx // batch_size + 1,
                'nodes': new_nodes_to_evaluate[start_idx:start_idx + batch_size],
                'start': start_idx + 1,
                'end': min(start_idx + batch_size, len(new_nodes_to_evaluate))
            }
            for start_idx in range(0, len(new_nodes_to_evaluate), batch_size)
        ]
        num_batches = len(batches)
        if num_batches > 1:
            print(f"Splitting into {num_batches} batches of max {batch_size} nodes each")
            print("Processing batches in parallel...")

        # Results are stored per batch index and merged in batch order afterwards,
        # so the evaluation order does not depend on thread completion order.
        evaluations_by_batch = {}

        with ThreadPoolExecutor(max_workers=min(num_batches, 5)) as executor:
            future_to_batch = {executor.submit(evaluate_batch, batch): batch for batch in batches}

            completed = 0
            for future in as_completed(future_to_batch):
                completed += 1
                batch = future_to_batch[future]

                try:
                    result = future.result()
                except Exception as e:
                    print(f"✗ Batch {batch['idx']}/{num_batches} exception: {e} "
                          f"[{completed}/{num_batches} batches finished]")
                    continue

                if result['success']:
                    evaluations_by_batch[result['batch_idx']] = result['evaluations']
                    print(f"✓ Batch {result['batch_idx']}/{num_batches} completed: "
                          f"{result['count']} evaluations received "
                          f"[{completed}/{num_batches} batches finished]")
                else:
                    print(f"✗ Batch {result['batch_idx']}/{num_batches} failed: "
                          f"{result['error']} "
                          f"[{completed}/{num_batches} batches finished]")

        all_evaluations = []
        for batch_idx in sorted(evaluations_by_batch):
            all_evaluations.extend(evaluations_by_batch[batch_idx])

        print(f"\nAll batches completed. Total evaluations collected: {len(all_evaluations)}")

        # Process results from all batches
        iteration_included = []
        iteration_excluded = []

        # Lookup tables used to map LLM-reported labels back to candidate nodes.
        node_lookup_by_pref = {n['prefLabel']: n for n in new_nodes_to_evaluate}
        node_lookup_by_display = {n['displayName']: n for n in new_nodes_to_evaluate}
        node_lookup_by_pref_lower = {}
        for pref, node in node_lookup_by_pref.items():
            # setdefault keeps the first entry when labels differ only by case.
            node_lookup_by_pref_lower.setdefault(pref.lower(), node)

        judged_uris = set()  # URIs that already received a verdict in this iteration

        for eval_result in all_evaluations:
            node_label = eval_result.node

            node_info = find_node(node_label, node_lookup_by_pref,
                                  node_lookup_by_display, node_lookup_by_pref_lower)

            if node_info is None:
                print(f"Warning: Node '{node_label}' not found in original list")
                continue

            node_uri = node_info['uri']

            # A repeated verdict for the same node must not count twice or put the
            # node into both the included and the pruned collection.
            if node_uri in judged_uris:
                print(f"Warning: Duplicate evaluation for node '{node_label}' ignored")
                continue
            judged_uris.add(node_uri)

            # Store node info with confidence score
            node_with_confidence = node_info.copy()
            node_with_confidence['confidence'] = eval_result.confidence
            entry = {
                'node': node_info,
                'confidence': eval_result.confidence
            }

            # The prompt asks for "INCLUDED or EXCLUDED", so accept both the
            # "INCLUDE" and "INCLUDED" spellings; anything else is treated as EXCLUDE.
            verdict = eval_result.classification.strip().upper()
            if verdict.startswith("INCLUDE"):
                all_included_nodes[node_uri] = node_with_confidence
                iteration_included.append(entry)
            else:  # EXCLUDE
                all_pruned_nodes[node_uri] = node_with_confidence
                iteration_excluded.append(entry)

        # Candidates without a usable verdict (failed batch, unmatched label) stay
        # marked as visited; surface them instead of dropping them silently.
        iteration_unevaluated = [n for n in new_nodes_to_evaluate if n['uri'] not in judged_uris]
        for node in iteration_unevaluated:
            all_unevaluated_nodes[node['uri']] = node

        print(f"\nResults: {len(iteration_included)} INCLUDED, {len(iteration_excluded)} EXCLUDED")
        if iteration_unevaluated:
            print(f"Warning: {len(iteration_unevaluated)} nodes received no usable evaluation "
                  f"and remain unclassified")

        # Display included nodes
        if iteration_included:
            print("\nIncluded nodes:")
            for i, item in enumerate(iteration_included[:10], 1):
                print(f"  {i}. {item['node']['displayName']} (confidence: {item['confidence']:.2f})")
            if len(iteration_included) > 10:
                print(f"  ... and {len(iteration_included) - 10} more")

        # Store iteration details
        iteration_details.append({
            'iteration': iteration,
            'evaluated': len(new_nodes_to_evaluate),
            'included': len(iteration_included),
            'excluded': len(iteration_excluded),
            'unevaluated': len(iteration_unevaluated)
        })

        # Check termination condition: no nodes included
        if not iteration_included:
            print("\nTerminating: No nodes were included in this iteration.")
            break

        # Query local TTL for neighbors of all included nodes
        print(f"\nQuerying neighbors for {len(iteration_included)} included nodes...")
        unique_next_nodes = {}

        for item in iteration_included:
            node_label = item['node']['prefLabel']
            try:
                neighbors = query_local_skos_one_hop_neighbors(node_label, ttl_file)
            except Exception as e:
                print(f"Error querying neighbors for '{node_label}': {e}")
                continue
            # Deduplicate by URI, keeping the first occurrence
            for neighbor in neighbors:
                unique_next_nodes.setdefault(neighbor['uri'], neighbor)

        nodes_to_evaluate = list(unique_next_nodes.values())
        print(f"Collected {len(nodes_to_evaluate)} unique neighbors for next iteration")

    # Final summary
    print(f"\n{'='*100}")
    print("EXTRACTION COMPLETE")
    print(f"{'='*100}")
    print(f"Total iterations: {iteration}")
    print(f"Total nodes visited: {len(all_visited_nodes)}")
    print(f"Total nodes included: {len(all_included_nodes)}")
    print(f"Total nodes pruned: {len(all_pruned_nodes)}")
    if all_unevaluated_nodes:
        print(f"Total nodes without evaluation: {len(all_unevaluated_nodes)}")

    # Iteration summary
    print("\nIteration summary:")
    for detail in iteration_details:
        print(f"  Iteration {detail['iteration']}: {detail['evaluated']} evaluated, "
              f"{detail['included']} included, {detail['excluded']} excluded")

    return {
        'all_visited': list(all_visited_nodes.values()),
        'all_included': list(all_included_nodes.values()),
        'all_pruned': list(all_pruned_nodes.values()),
        'all_unevaluated': list(all_unevaluated_nodes.values()),
        'iterations': iteration,
        'iteration_details': iteration_details,
        'seed_label': seed_label,
        'source_file': ttl_file
    }


# Example usage: this call is live (NOT commented out) and runs the full extraction,
# including LLM calls, for the seed "soil property" against SoilVoc.ttl.
# Comment it out to skip the run when re-executing the notebook.
extraction_results_local = iterative_subkg_extraction_local(seed_label="soil property", ttl_file="SoilVoc.ttl", max_iterations=20, batch_size=30)

ITERATIVE SUB-KNOWLEDGE GRAPH EXTRACTION (LOCAL TTL FILE)
Source File: SoilVoc.ttl
Seed Topic: soil property
Batch Size: 30 nodes per LLM call

INITIALIZATION: Adding seed node to included list
Found seed concept: https://soilwise-he.github.io/soil-health#SoilProperty
Found 4 unique neighbors
Found seed concept: https://soilwise-he.github.io/soil-health#SoilProperty
Found 4 unique neighbors
Seed node 'soil property' added to included list (URI: https://soilwise-he.github.io/soil-health#SoilProperty)

ITERATION 0: Querying neighbors of seed node 'soil property'
Found seed concept: https://soilwise-he.github.io/soil-health#SoilProperty
Found 4 unique neighbors
Found 4 initial neighbors

ITERATION 1
Evaluating 4 new nodes (filtered from 4 total)
Seed node 'soil property' added to included list (URI: https://soilwise-he.github.io/soil-health#SoilProperty)

ITERATION 0: Querying neighbors of seed node 'soil property'
Found seed concept: https://soilwise-he.github.io/soil-health#SoilProperty