In [None]:
import json
import os
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode, Dataset
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, OWL, split_uri
from rdflib.collection import Collection
from datetime import datetime

from openai import OpenAI
from pydantic import BaseModel
import re

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import pandas as pd

import time
from typing import Dict, Any, Optional, Tuple, List
import logging
import sys
from collections import defaultdict

from pathlib import Path
from SPARQLWrapper import SPARQLWrapper, JSON
from concurrent.futures import ThreadPoolExecutor, as_completed
import traceback

In [3]:
# Opening config file, the config structure is:
# {"openai_api_key":"......"}

config = open('config', 'r')
config = json.load(config)

os.environ['OPENAI_API_KEY'] = config['openai_api_key']
os.environ['GEMINI_API_KEY'] = config['gemini_api_key']
os.environ['XAI_API_KEY'] = config['xai_api_key']
os.environ['NVIDIA_API_KEY'] = config['nvidia_api_key']
os.environ['DEEPSEEK_API_KEY'] = config['deepseek_api_key']
os.environ['ANTHROPIC_API_KEY'] = config['claude_api_key']
os.environ['DASHSCOPE_API_KEY'] = config['dashscope_api_key']

In [4]:
def load_graph(data):
    g = Graph()
    g.parse(data=data, format="turtle")
    return g

In [5]:
def print_rdf(rdf):
    g = Graph()
    g.parse(data=rdf, format="turtle")

    for s, p, o in g:
        print(s, p, o)

In [6]:
# Namespaces
she = Namespace("https://soilwise-he.github.io/soil-health#")
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc/")
agrontology = Namespace("http://aims.fao.org/aos/agrontology#")
sio = Namespace("http://semanticscience.org/resource/")
glosis_lh = Namespace("http://w3id.org/glosis/model/layerhorizon/")
glosis_sp = Namespace("http://w3id.org/glosis/model/siteplot/")
glosis_cm = Namespace("http://w3id.org/glosis/model/common/")
glosis_pr = Namespace("http://w3id.org/glosis/model/profile/")
glosis_su = Namespace("http://w3id.org/glosis/model/surface/")
glosis_cl = Namespace("http://w3id.org/glosis/model/codelists/")
glosis_proc = Namespace("http://w3id.org/glosis/model/procedure/")
qudt = Namespace("http://qudt.org/schema/qudt/")
unit = Namespace("http://qudt.org/vocab/unit/")
iso11074 = Namespace("https://data.geoscience.earth/ncl/ISO11074v2025/")
obo = Namespace("http://purl.obolibrary.org/obo/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
biolink = Namespace("https://w3id.org/biolink/vocab/")
afox = Namespace("http://purl.allotrope.org/ontologies/property#")
afor = Namespace("http://purl.allotrope.org/ontologies/result#")
sorelsc = Namespace("http://sweetontology.net/relaSci/")
sorelpr = Namespace("http://sweetontology.net/relaProvenance/")
sohuj = Namespace("http://sweetontology.net/humanJurisdiction/")
sorelph = Namespace("http://sweetontology.net/relaPhysical/")
sorelm = Namespace("http://sweetontology.net/relaMath/")
sorepsg = Namespace("http://sweetontology.net/reprSpaceGeometry/")
bao = Namespace("http://www.bioassayontology.org/bao#")
repr = Namespace("https://w3id.org/reproduceme#")
sorelch = Namespace("http://sweetontology.net/relaChemical/")
sorelsp = Namespace("http://sweetontology.net/relaSpace/")
om = Namespace("http://www.ontology-of-units-of-measure.org/resource/om-2/")
gemet = Namespace("http://www.eionet.europa.eu/gemet/concept/")
inrae = Namespace("http://opendata.inrae.fr/thesaurusINRAE/")

# Scheme URI
scheme_uri = URIRef("https://soilwise-he.github.io/soil-health")

### Vocabs or not vocabs

In [7]:
def extract_skos_concepts_to_csv(ttl_file_path, output_csv_path):
    """
    Extract SKOS concepts from a TTL file and save to CSV.
    
    Args:
        ttl_file_path (str): Path to the input TTL file
        output_csv_path (str): Path to the output CSV file
    """
    
    # Create a graph and parse the TTL file
    g = Graph()
    g.parse(ttl_file_path, format='turtle')
    
    # Bind namespaces for cleaner output (optional)
    g.bind("skos", SKOS)
    g.bind("agrontology", agrontology)
    
    # Query for all SKOS concepts
    concepts_data = []
    
    # Find all subjects that are of type skos:Concept
    for concept_uri in g.subjects(predicate=URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), 
                                 object=SKOS.Concept):
        
        # Get prefLabel (should be unique)
        pref_label = ""
        for pref in g.objects(concept_uri, SKOS.prefLabel):
            pref_label = str(pref)
            break  # Take the first one if multiple exist
        
        # Get altLabels and abbreviations
        alt_labels = []
        
        # Collect skos:altLabel
        for alt in g.objects(concept_uri, SKOS.altLabel):
            alt_labels.append(str(alt))
        
        # Collect agrontology:hasAbbreviation
        for abbrev in g.objects(concept_uri, agrontology.hasAbbreviation):
            alt_labels.append(str(abbrev))
        
        # Join alternative labels with semicolon
        alt_labels_str = ";".join(alt_labels) if alt_labels else ""
        
        # Add to results
        concepts_data.append([
            str(concept_uri),  # Full URI
            pref_label,        # Preferred label
            alt_labels_str     # Alternative labels/abbreviations
        ])
    
    # Sort by URI for consistent output
    concepts_data.sort(key=lambda x: x[0])
    
    # Write to CSV
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        
        # Write header
        writer.writerow(['URI', 'prefLabel', 'altLabel_abbreviation'])
        
        # Write data
        writer.writerows(concepts_data)
    
    print(f"Extracted {len(concepts_data)} SKOS concepts to {output_csv_path}")
    return len(concepts_data)

if __name__ == "__main__":
    # Replace with your actual file paths
    ttl_file = "soil_health_KG.ttl"
    csv_output = "shkg.csv"
    
    try:
        # Use the first method (iterative approach)
        count = extract_skos_concepts_to_csv(ttl_file, csv_output)
        
        print(f"Successfully processed {count} concepts")
        
    except Exception as e:
        print(f"Error processing file: {e}")
        print("Make sure you have rdflib installed: pip install rdflib")

Extracted 1785 SKOS concepts to shkg.csv
Successfully processed 1785 concepts


In [7]:
def analyze_uris_from_csv(input_csv_path, output_csv_path):
    """
    Analyze URIs from the matched_concepts.csv file:
    1. Collect all different URIs that appear in the CSV
    2. Count how many times each URI appears in total
    3. Track which columns each URI appears in
    4. Rank URIs in descending order by frequency
    
    Args:
        input_csv_path (str): Path to the input CSV file
        output_csv_path (str): Path to save the analysis results
    """
    
    # Dictionary to store URI statistics
    # Structure: {uri: {'total_count': count, 'columns': set_of_columns}}
    uri_stats = defaultdict(lambda: {'total_count': 0, 'columns': set()})
    
    # Read the CSV file
    try:
        df = pd.read_csv(input_csv_path)
        print(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns")
        print(f"Columns: {list(df.columns)}")
        
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return
    
    # Process each row and column
    for row_idx, row in df.iterrows():
        for col_name in df.columns:
            cell_value = str(row[col_name])
            
            # Skip NaN values
            if pd.isna(row[col_name]) or cell_value == 'nan':
                continue
            
            # Split by comma to handle multiple URIs in one cell
            uris_in_cell = [uri.strip() for uri in cell_value.split(',') if uri.strip()]
            
            for uri in uris_in_cell:
                # Only process if it looks like a URI (contains http)
                if 'http' in uri:
                    uri_stats[uri]['total_count'] += 1
                    uri_stats[uri]['columns'].add(col_name)
    
    # Convert to list and sort by total count (descending)
    results = []
    for uri, stats in uri_stats.items():
        results.append({
            'uri': uri,
            'total_count': stats['total_count'],
            'columns': ', '.join(sorted(stats['columns']))
        })
    
    # Sort by total count in descending order
    results.sort(key=lambda x: x['total_count'], reverse=True)
    
    # Save results to CSV
    output_df = pd.DataFrame(results)
    output_df.to_csv(output_csv_path, index=False)
    
    print(f"\nAnalysis complete!")
    print(f"Total unique URIs found: {len(results)}")
    print(f"Results saved to: {output_csv_path}")
    
    # Display top 10 results
    print(f"\nTop 10 most frequent URIs:")
    print("=" * 80)
    for i, result in enumerate(results[:10], 1):
        print(f"{i:2d}. {result['uri']}")
        print(f"    Count: {result['total_count']}, Appears in columns: {result['columns']}")
        print()
    
    return results

# Run the analysis
input_file = "candidate_vocabs/candidate_concepts.csv"
output_file = "candidate_vocabs/candidate_concepts_ranking.csv"

results = analyze_uris_from_csv(input_file, output_file)

Loaded CSV with 714 rows and 4 columns
Columns: ['keywords', 'thesauri', 'gpt-4o', 'gpt-4.1']

Analysis complete!
Total unique URIs found: 1093
Results saved to: candidate_vocabs/candidate_concepts_ranking.csv

Top 10 most frequent URIs:
 1. https://soilwise-he.github.io/soil-health#MicrobialBiomass
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 2. https://soilwise-he.github.io/soil-health#Leptosols
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 3. https://soilwise-he.github.io/soil-health#Clay
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 4. https://soilwise-he.github.io/soil-health#SoilPollution
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 5. https://soilwise-he.github.io/soil-health#SoilDegradation
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, keywords, thesauri

 6. https://soilwise-he.github.io/soil-health#SoilOrganicCarbon
    Count: 4, Appears in columns: gpt-4.1, gpt-4o, 

#### LLM-as-a-judge

In [6]:
system_prompt_voc = """You are an expert AI assistant specializing in soil science and controlled vocabulary development. Your task is to analyze a given term and determine if it is suitable for inclusion in a formal, standardized soil science vocabulary.

**Your Goal:**
Classify each term you receive into one of two categories: "Vocabulary" or "Un-vocabulary".

**Definitions and Rules:**

1. **"Vocabulary" Term:**
   
   * Represents a standardized, reusable, and generic concept *within the domain of soil science*.
   * It is often a general concept that can have specific instances, values, or measurements.
   * It can be singular or plural.
   * Abbreviations or standard acronyms that refer directly to those concepts (e.g. `SOC`, `DDT`).
   * It should be a noun or a noun phrase that is broadly recognized and used in soil science literature, without evaluative or descriptive adjectives (avoid “high”, “moderate”, “low”, etc.).
   * *Examples of Vocabulary Terms:* `soil organic carbon`, `cation exchange capacity`, `soil texture`, `bulk density`, `soil horizon`, `parent material`, `silt loam`.
2. **"Un-vocabulary" Term:**
   A term is classified as "Un-vocabulary" if it meets **any** of the following criteria:
   
   * **Evaluative/descriptive instances:** It represents a specific *measurement*, *qualitative state*, or *quantitative description* of a vocabulary term (e.g. “moderate soil organic carbon content”, “high bulk density”, “poor CEC”).
   * **Too broad or out of scope:** The term is a generic concept that is not specific to soil science and lacks a direct, unique meaning within the domain (e.g. “time”, “location” when unqualified).
   * **Context-specific phrases:** The term is phrased as a statement or sentence fragment rather than a standardized standalone noun concept (e.g. “agricultural area under severe erosion”).
3. **Confidence:**
   
   * Provide a confidence score between 0 and 1 reflecting how certain you are in your Vocabulary/Un-vocabulary decision.

**Output Format:**
For every term you are given, you MUST respond in the strict JSON format. Do not add any extra conversation or pleasantries."""

In [8]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class TermProcessor:
    def __init__(self, system_prompt: str, user_prompt_template: str):
        """
        Initialize the processor with prompts
        
        Args:
            system_prompt: The system prompt (unchanged for all calls)
            user_prompt_template: Template for user prompt with {term} placeholder
        """
        self.client = OpenAI()
        self.system_prompt = system_prompt
        self.user_prompt_template = user_prompt_template
        
    def get_term_from_row(self, row: pd.Series) -> str:
        """
        Extract the term from a CSV row (preferred label, fallback to alternative label)
        
        Args:
            row: Pandas Series representing a CSV row
            
        Returns:
            The term to process
        """
        # Assuming columns are: URL, preferred_label, alternative_label
        preferred_label = row.iloc[1] if len(row) > 1 else ""
        alternative_label = row.iloc[2] if len(row) > 2 else ""
        
        # Use preferred label if available and not empty, otherwise use alternative
        if pd.notna(preferred_label) and str(preferred_label).strip():
            return str(preferred_label).strip()
        elif pd.notna(alternative_label) and str(alternative_label).strip():
            return str(alternative_label).strip()
        else:
            return ""
    
    def call_llm_api(self, term: str) -> Optional[Dict[str, Any]]:
        """
        Call the LLM API with the given term
        
        Args:
            term: The term to evaluate
            
        Returns:
            JSON response from LLM or None if error
        """
        try:
            # Create the user prompt with the term
            prompt_voc = self.user_prompt_template.format(term=term)
            
            completion = self.client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": prompt_voc}
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "soil_vocab_review",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "term": {
                                    "type": "string",
                                    "description": "The specific term being evaluated, exactly as input."
                                },
                                "is_vocab_term": {
                                    "type": "boolean",
                                    "description": "Whether the term should be included in the controlled vocabulary for soil science."
                                },
                                "confidence_score": {
                                    "type": "number",
                                    "description": "Confidence score of the judgement, from 0 to 1 (inclusive).",
                                    "minimum": 0,
                                    "maximum": 1
                                }
                            },
                            "required": [
                                "term",
                                "is_vocab_term",
                                "confidence_score"
                            ],
                            "additionalProperties": False
                        },
                        "strict": True
                    }
                }
            )
            
            # Parse the JSON response
            response_content = completion.choices[0].message.content
            return json.loads(response_content)
            
        except Exception as e:
            logger.error(f"Error calling LLM API for term '{term}': {str(e)}")
            return None
    
    def process_csv(self, input_csv_path: str, output_csv_path: str, 
                   delay_seconds: float = 1.0, resume_from_row: int = 0):
        """
        Process the CSV file and generate results
        
        Args:
            input_csv_path: Path to input CSV file
            output_csv_path: Path to output CSV file
            delay_seconds: Delay between API calls to respect rate limits
            resume_from_row: Row number to resume from (0-indexed)
        """
        # Read the input CSV
        try:
            df = pd.read_csv(input_csv_path)
            logger.info(f"Loaded CSV with {len(df)} rows")
        except Exception as e:
            logger.error(f"Error reading CSV file: {str(e)}")
            return
        
        # Prepare results list
        results = []
        
        # Load existing results if resuming
        if resume_from_row > 0:
            try:
                existing_df = pd.read_csv(output_csv_path)
                results = existing_df.to_dict('records')
                logger.info(f"Resuming from row {resume_from_row}, loaded {len(results)} existing results")
            except FileNotFoundError:
                logger.warning(f"Output file {output_csv_path} not found, starting fresh")
                resume_from_row = 0
        
        # Process each row starting from resume_from_row
        for idx, row in df.iloc[resume_from_row:].iterrows():
            actual_idx = idx if resume_from_row == 0 else resume_from_row + (idx - df.iloc[resume_from_row:].index[0])
            
            # Get the term from the row
            term = self.get_term_from_row(row)
            
            if not term:
                logger.warning(f"Row {actual_idx}: No valid term found, skipping")
                continue
            
            logger.info(f"Processing row {actual_idx}: '{term}'")
            
            # Call LLM API
            result = self.call_llm_api(term)
            
            if result:
                # Add original row data to the result
                result['original_url'] = row.iloc[0] if len(row) > 0 else ""
                result['preferred_label'] = row.iloc[1] if len(row) > 1 else ""
                result['alternative_label'] = row.iloc[2] if len(row) > 2 else ""
                result['row_index'] = actual_idx
                
                results.append(result)
                logger.info(f"Row {actual_idx}: Success - is_vocab_term: {result['is_vocab_term']}, confidence: {result['confidence_score']}")
            else:
                # Add error entry
                error_result = {
                    'term': term,
                    'is_vocab_term': None,
                    'confidence_score': None,
                    'original_url': row.iloc[0] if len(row) > 0 else "",
                    'preferred_label': row.iloc[1] if len(row) > 1 else "",
                    'alternative_label': row.iloc[2] if len(row) > 2 else "",
                    'row_index': actual_idx,
                    'error': 'API call failed'
                }
                results.append(error_result)
                logger.error(f"Row {actual_idx}: Failed to process term '{term}'")
            
            # Save results periodically (every 10 rows)
            if len(results) % 10 == 0:
                self.save_results(results, output_csv_path)
                logger.info(f"Saved intermediate results ({len(results)} rows)")
            
            # Delay between API calls
            if delay_seconds > 0:
                time.sleep(delay_seconds)
        
        # Save final results
        self.save_results(results, output_csv_path)
        logger.info(f"Processing complete. Results saved to {output_csv_path}")
        
        # Print summary
        successful_calls = sum(1 for r in results if r.get('is_vocab_term') is not None)
        failed_calls = len(results) - successful_calls
        vocab_terms = sum(1 for r in results if r.get('is_vocab_term') is True)
        
        logger.info(f"Summary: {successful_calls} successful, {failed_calls} failed, {vocab_terms} vocab terms identified")
    
    def save_results(self, results: list, output_csv_path: str):
        """Save results to CSV file"""
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(output_csv_path, index=False)
        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")

def main():
    # Configuration
    INPUT_CSV_PATH = "ontovocabs/soil_health_KG.csv"  # Change this to your input file path
    OUTPUT_CSV_PATH = "llm_results.csv"  # Change this to your desired output file path
    DELAY_SECONDS = 1.0  # Delay between API calls (adjust based on rate limits)
    RESUME_FROM_ROW = 0  # Set to row number if resuming from interruption
    
    # Define your prompts here
    SYSTEM_PROMPT = system_prompt_voc
    
    USER_PROMPT_TEMPLATE = """Now please determine if the following term is a vocabulary term or un-vocabulary term: {term}
    """
    
    # Initialize processor
    processor = TermProcessor(SYSTEM_PROMPT, USER_PROMPT_TEMPLATE)
    
    # Process the CSV
    processor.process_csv(
        input_csv_path=INPUT_CSV_PATH,
        output_csv_path=OUTPUT_CSV_PATH,
        delay_seconds=DELAY_SECONDS,
        resume_from_row=RESUME_FROM_ROW
    )

if __name__ == "__main__":
    main()

2025-07-14 17:11:37,022 - INFO - Loaded CSV with 1787 rows
2025-07-14 17:11:37,025 - INFO - Processing row 0: 'abiotic environment'
2025-07-14 17:11:37,940 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:37,957 - INFO - Row 0: Success - is_vocab_term: False, confidence: 0.8
2025-07-14 17:11:38,960 - INFO - Processing row 1: 'abundance'
2025-07-14 17:11:39,875 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:39,896 - INFO - Row 1: Success - is_vocab_term: False, confidence: 0.85
2025-07-14 17:11:40,899 - INFO - Processing row 2: 'abundance of species populations'
2025-07-14 17:11:42,676 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:42,690 - INFO - Row 2: Success - is_vocab_term: False, confidence: 0.85
2025-07-14 17:11:43,702 - INFO - Processing row 3: 'acceptable risk levels'
2025-07-14 17:11:44,241 - INFO - HTTP 

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class LLMResultsPostProcessor:
    def __init__(self, input_csv_path: str):
        """
        Initialize the post-processor with the LLM results CSV file
        
        Args:
            input_csv_path: Path to the CSV file containing LLM results
        """
        self.input_csv_path = input_csv_path
        self.df = None
        self.load_data()
    
    def load_data(self):
        """Load the CSV data and validate it"""
        try:
            self.df = pd.read_csv(self.input_csv_path)
            logger.info(f"Loaded {len(self.df)} rows from {self.input_csv_path}")
            
            # Validate required columns
            required_columns = ['term', 'is_vocab_term', 'confidence_score', 'original_url']
            missing_columns = [col for col in required_columns if col not in self.df.columns]
            
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            
            # Clean the data
            self.clean_data()
            
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            sys.exit(1)
    
    def clean_data(self):
        """Clean and validate the data"""
        original_count = len(self.df)
        
        # Remove rows with missing essential data
        self.df = self.df.dropna(subset=['term', 'is_vocab_term', 'confidence_score', 'original_url'])
        
        # Convert confidence_score to numeric, handling any string values
        self.df['confidence_score'] = pd.to_numeric(self.df['confidence_score'], errors='coerce')
        
        # Remove rows with invalid confidence scores
        self.df = self.df.dropna(subset=['confidence_score'])
        
        # Ensure confidence scores are within valid range [0, 1]
        self.df = self.df[
            (self.df['confidence_score'] >= 0) & 
            (self.df['confidence_score'] <= 1)
        ]
        
        cleaned_count = len(self.df)
        removed_count = original_count - cleaned_count
        
        if removed_count > 0:
            logger.warning(f"Removed {removed_count} rows with invalid data")
        
        logger.info(f"Data cleaned: {cleaned_count} valid rows remaining")
    
    def display_summary(self):
        """Display summary statistics of the data"""
        print("\n" + "="*50)
        print("DATA SUMMARY")
        print("="*50)
        
        total_rows = len(self.df)
        vocab_terms = len(self.df[self.df['is_vocab_term'] == True])
        non_vocab_terms = len(self.df[self.df['is_vocab_term'] == False])
        
        print(f"Total processed terms: {total_rows}")
        print(f"Vocab terms (True): {vocab_terms} ({vocab_terms/total_rows*100:.1f}%)")
        print(f"Non-vocab terms (False): {non_vocab_terms} ({non_vocab_terms/total_rows*100:.1f}%)")
        
        print(f"\nConfidence Score Statistics:")
        print(f"Mean: {self.df['confidence_score'].mean():.3f}")
        print(f"Median: {self.df['confidence_score'].median():.3f}")
        print(f"Min: {self.df['confidence_score'].min():.3f}")
        print(f"Max: {self.df['confidence_score'].max():.3f}")
        print(f"Std: {self.df['confidence_score'].std():.3f}")
        
        print(f"\nConfidence Score Distribution:")
        print(f"0.0-0.2: {len(self.df[self.df['confidence_score'] < 0.2])}")
        print(f"0.2-0.4: {len(self.df[(self.df['confidence_score'] >= 0.2) & (self.df['confidence_score'] < 0.4)])}")
        print(f"0.4-0.6: {len(self.df[(self.df['confidence_score'] >= 0.4) & (self.df['confidence_score'] < 0.6)])}")
        print(f"0.6-0.8: {len(self.df[(self.df['confidence_score'] >= 0.6) & (self.df['confidence_score'] < 0.8)])}")
        print(f"0.8-1.0: {len(self.df[self.df['confidence_score'] >= 0.8])}")
    
    def get_vocab_choice(self) -> Optional[bool]:
        """Get user's choice for vocab term filtering"""
        print("\n" + "="*50)
        print("VOCAB TERM FILTERING")
        print("="*50)
        print("Choose which terms to include:")
        print("1. Only vocab terms (is_vocab_term = True)")
        print("2. Only non-vocab terms (is_vocab_term = False)")
        print("3. Both vocab and non-vocab terms")
        
        while True:
            try:
                choice = input("\nEnter your choice (1/2/3): ").strip()
                if choice == '1':
                    return True
                elif choice == '2':
                    return False
                elif choice == '3':
                    return None
                else:
                    print("Invalid choice. Please enter 1, 2, or 3.")
            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                sys.exit(0)
    
    def get_confidence_range(self) -> Tuple[float, float]:
        """Get user's choice for confidence score range"""
        print("\n" + "="*50)
        print("CONFIDENCE SCORE FILTERING")
        print("="*50)
        print("Current confidence score range: {:.3f} - {:.3f}".format(
            self.df['confidence_score'].min(),
            self.df['confidence_score'].max()
        ))
        
        while True:
            try:
                print("\nEnter confidence score range (0.0 to 1.0):")
                min_score = float(input("Minimum confidence score: "))
                max_score = float(input("Maximum confidence score: "))
                
                if min_score < 0 or min_score > 1:
                    print("Minimum score must be between 0.0 and 1.0")
                    continue
                if max_score < 0 or max_score > 1:
                    print("Maximum score must be between 0.0 and 1.0")
                    continue
                if min_score > max_score:
                    print("Minimum score cannot be greater than maximum score")
                    continue
                
                return min_score, max_score
                
            except ValueError:
                print("Invalid input. Please enter numeric values.")
            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                sys.exit(0)
    
    def apply_filters(self, vocab_filter: Optional[bool], 
                     confidence_range: Tuple[float, float]) -> pd.DataFrame:
        """Apply the selected filters to the data"""
        filtered_df = self.df.copy()
        
        # Apply vocab term filter
        if vocab_filter is not None:
            filtered_df = filtered_df[filtered_df['is_vocab_term'] == vocab_filter]
        
        # Apply confidence score range filter
        min_conf, max_conf = confidence_range
        filtered_df = filtered_df[
            (filtered_df['confidence_score'] >= min_conf) & 
            (filtered_df['confidence_score'] <= max_conf)
        ]
        
        return filtered_df
    
    def save_filtered_results(self, filtered_df: pd.DataFrame, output_path: str):
        """Save the filtered results to a CSV file"""
        try:
            # Create output dataframe with selected columns
            output_df = filtered_df[['original_url', 'term', 'is_vocab_term', 'confidence_score']].copy()
            
            # Sort by confidence score (descending) for better organization
            output_df = output_df.sort_values('confidence_score', ascending=False)
            
            # Save to CSV
            output_df.to_csv(output_path, index=False)
            logger.info(f"Filtered results saved to {output_path}")
            
            return True
            
        except Exception as e:
            logger.error(f"Error saving filtered results: {str(e)}")
            return False
    
    def display_filter_summary(self, filtered_df: pd.DataFrame, 
                             vocab_filter: Optional[bool], 
                             confidence_range: Tuple[float, float]):
        """Display summary of filtered results"""
        print("\n" + "="*50)
        print("FILTER RESULTS SUMMARY")
        print("="*50)
        
        print(f"Applied filters:")
        if vocab_filter is None:
            print(f"  - Vocab terms: Both True and False")
        else:
            print(f"  - Vocab terms: {vocab_filter}")
        
        min_conf, max_conf = confidence_range
        print(f"  - Confidence range: {min_conf:.3f} - {max_conf:.3f}")
        
        print(f"\nResults:")
        print(f"  - Original dataset: {len(self.df)} terms")
        print(f"  - Filtered dataset: {len(filtered_df)} terms")
        print(f"  - Percentage retained: {len(filtered_df)/len(self.df)*100:.1f}%")
        
        if len(filtered_df) > 0:
            vocab_count = len(filtered_df[filtered_df['is_vocab_term'] == True])
            non_vocab_count = len(filtered_df[filtered_df['is_vocab_term'] == False])
            
            print(f"\nFiltered results breakdown:")
            print(f"  - Vocab terms: {vocab_count}")
            print(f"  - Non-vocab terms: {non_vocab_count}")
            print(f"  - Average confidence: {filtered_df['confidence_score'].mean():.3f}")
    
    def interactive_process(self):
        """Run the interactive post-processing workflow"""
        print("LLM Results Post-Processor")
        print("="*50)
        
        # Display summary
        self.display_summary()
        
        # Get user preferences
        vocab_filter = self.get_vocab_choice()
        confidence_range = self.get_confidence_range()
        
        # Apply filters
        filtered_df = self.apply_filters(vocab_filter, confidence_range)
        
        # Display filter summary
        self.display_filter_summary(filtered_df, vocab_filter, confidence_range)
        
        if len(filtered_df) == 0:
            print("\nNo terms match the specified criteria.")
            return
        
        # Get output filename
        print("\n" + "="*50)
        print("SAVE RESULTS")
        print("="*50)
        
        while True:
            try:
                output_filename = input("Enter output filename (e.g., 'filtered_terms.csv'): ").strip()
                if not output_filename:
                    print("Filename cannot be empty.")
                    continue
                
                if not output_filename.endswith('.csv'):
                    output_filename += '.csv'
                
                break
                
            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                return
        
        # Save results
        if self.save_filtered_results(filtered_df, output_filename):
            print(f"\nSuccess! {len(filtered_df)} terms saved to '{output_filename}'")
            print("The output file contains: original_url, term, is_vocab_term, confidence_score")
        else:
            print("Error: Failed to save results.")

def main():
    """Main function to run the post-processor"""
    # Configuration - Update these paths as needed
    INPUT_CSV_PATH = "llm_results_v2.csv"  # Change this to your LLM results file path
    
    print("LLM Results Post-Processor (IDE Version)")
    print("="*50)
    print(f"Input file: {INPUT_CSV_PATH}")
    
    # Check if file exists
    if not os.path.exists(INPUT_CSV_PATH):
        print(f"Error: File '{INPUT_CSV_PATH}' not found.")
        print("Please update the INPUT_CSV_PATH variable in the main() function.")
        return
    
    # Create and run the post-processor
    processor = LLMResultsPostProcessor(INPUT_CSV_PATH)
    processor.interactive_process()

if __name__ == "__main__":
    main()

2025-07-14 22:11:50,374 - INFO - Loaded 1786 rows from llm_results_v2.csv
2025-07-14 22:11:50,383 - INFO - Data cleaned: 1786 valid rows remaining


LLM Results Post-Processor (IDE Version)
Input file: llm_results_v2.csv
LLM Results Post-Processor

DATA SUMMARY
Total processed terms: 1786
Vocab terms (True): 714 (40.0%)
Non-vocab terms (False): 1072 (60.0%)

Confidence Score Statistics:
Mean: 0.901
Median: 0.920
Min: 0.150
Max: 1.000
Std: 0.069

Confidence Score Distribution:
0.0-0.2: 1
0.2-0.4: 0
0.4-0.6: 0
0.6-0.8: 67
0.8-1.0: 1718

VOCAB TERM FILTERING
Choose which terms to include:
1. Only vocab terms (is_vocab_term = True)
2. Only non-vocab terms (is_vocab_term = False)
3. Both vocab and non-vocab terms

CONFIDENCE SCORE FILTERING
Current confidence score range: 0.150 - 1.000

Enter confidence score range (0.0 to 1.0):

FILTER RESULTS SUMMARY
Applied filters:
  - Vocab terms: True
  - Confidence range: 0.000 - 1.000

Results:
  - Original dataset: 1786 terms
  - Filtered dataset: 714 terms
  - Percentage retained: 40.0%

Filtered results breakdown:
  - Vocab terms: 714
  - Non-vocab terms: 0
  - Average confidence: 0.915

SAVE

2025-07-14 22:12:14,675 - INFO - Filtered results saved to filtered_terms_2.csv



Success! 714 terms saved to 'filtered_terms_2.csv'
The output file contains: original_url, term, is_vocab_term, confidence_score


### Soil Property Process ontology

In [23]:
g = Graph()
g.parse("ontovocabs/soil_property_process.ttl", format='turtle')

<Graph identifier=N22edf6a00b28422bbec508194b81a6a8 (<class 'rdflib.graph.Graph'>)>

In [24]:
# Enhanced version to properly parse owl:propertyChainAxiom from blank nodes
def parse_property_chain(g, blank_node):
    """Parse a property chain from a blank node (RDF list)"""
    try:
        # Convert the blank node to a Collection (RDF list)
        collection = Collection(g, blank_node)
        chain = []
        for item in collection:
            # Get the short name of each property in the chain
            prop_name = str(item).split('#')[-1] if '#' in str(item) else str(item).split('/')[-1]
            chain.append(prop_name)
        return " → ".join(chain)
    except:
        return str(blank_node)

# Query for all object properties with enhanced property chain parsing
object_properties = []

for subject in g.subjects(predicate=RDF.type, object=OWL.ObjectProperty):
    property_info = {
        'uri': str(subject),
        'label': str(subject).split('#')[-1] if '#' in str(subject) else str(subject).split('/')[-1],
        'properties': [],
        'property_chains': []
    }
    
    # Collect all properties of this object property
    for pred, obj in g.predicate_objects(subject):
        if pred != RDF.type or obj != OWL.ObjectProperty:
            pred_label = str(pred).split('#')[-1] if '#' in str(pred) else str(pred).split('/')[-1]
            
            # Special handling for propertyChainAxiom
            if pred_label == "propertyChainAxiom":
                chain_description = parse_property_chain(g, obj)
                property_info['property_chains'].append(chain_description)
            else:
                obj_label = str(obj).split('#')[-1] if '#' in str(obj) else str(obj).split('/')[-1]
                property_info['properties'].append((pred_label, obj_label))
    
    object_properties.append(property_info)

# Print results with enhanced property chain display
print(f"Found {len(object_properties)} owl:ObjectProperty entries in the TTL file:\n")
print("=" * 80)

for i, prop in enumerate(object_properties, 1):
    print(f"{i:2d}. {prop['label']}")
    print(f"    URI: {prop['uri']}")
    
    if prop['properties']:
        print("    Additional properties:")
        for pred_label, obj_label in prop['properties']:
            print(f"      - {pred_label}: {obj_label}")
    
    if prop['property_chains']:
        print("    Property chain axioms:")
        for chain in prop['property_chains']:
            print(f"      - {chain}")
    
    print()

print("=" * 80)
print(f"Total: {len(object_properties)} owl:ObjectProperty entries")

# Summary of properties with property chains
properties_with_chains = [p for p in object_properties if p['property_chains']]
if properties_with_chains:
    print(f"\nProperties with property chain axioms ({len(properties_with_chains)}):")
    for prop in properties_with_chains:
        print(f"  - {prop['label']}:")
        for chain in prop['property_chains']:
            print(f"    • {chain}")

Found 16 owl:ObjectProperty entries in the TTL file:

 1. by
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#by

 2. dependsOn
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#dependsOn
    Additional properties:
      - subPropertyOf: influencedBy

 3. from
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#from

 4. hasComponent
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#hasComponent
    Additional properties:
      - subPropertyOf: hasPart

 5. hasImpactOn
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#hasImpactOn
    Additional properties:
      - type: TransitiveProperty
      - inverseOf: influencedBy
    Property chain axioms:
      - hasImpactOn → partOf
      - inverseProcessOf → hasImpactOn
      - measuredBy → hasImpactOn

 6. hasPart
    URI: http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#hasPart
    Additional properties:
      - type: TransitiveProperty
      - inverseOf: partO

#### Convert OWL Class Hierarchy to SKOS Vocabulary

In [9]:
def convert_owl_to_skos(owl_file_path, skos_output_path):
    """
    Extract OWL class hierarchy and convert to SKOS vocabulary.
    - owl:Class -> skos:Concept
    - rdfs:subClassOf -> skos:broader
    
    Args:
        owl_file_path: Path to the OWL ontology file
        skos_output_path: Path to save the SKOS vocabulary
    """
    
    def split_camel_case(text):
        """
        Split camelCase string into lowercase words.
        Example: 'SoilPhysicalProcess' -> 'soil physical process'
        """
        # Insert space before uppercase letters
        result = re.sub(r'([A-Z])', r' \1', text)
        # Clean up and convert to lowercase
        return result.strip().lower()
    
    # Load the OWL ontology
    g = Graph()
    g.parse(owl_file_path, format='turtle')
    print(f"Loaded OWL ontology with {len(g)} triples")
    
    # Create a new graph for SKOS vocabulary
    skos_g = Graph()
    
    # Define namespaces
    SPP = Namespace("http://imash.leeds.ac.uk/ontologies/atu/SoilPhysics.owl#")
    SHE = Namespace("https://soilwise-he.github.io/soil-health#")
    
    # Bind namespaces
    skos_g.bind("skos", SKOS)
    skos_g.bind("dcterms", DCTERMS)
    skos_g.bind("rdfs", RDFS)
    skos_g.bind("spp", SPP)
    skos_g.bind("she", SHE)
    skos_g.bind("owl", OWL)
    
    # Create a ConceptScheme for the vocabulary
    scheme_uri = URIRef("https://soilwise-he.github.io/soil-health/ConceptScheme")
    skos_g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    skos_g.add((scheme_uri, SKOS.prefLabel, Literal("Soil Property and Process Vocabulary", lang="en")))
    skos_g.add((scheme_uri, DCTERMS.title, Literal("Soil Property and Process Vocabulary", lang="en")))
    skos_g.add((scheme_uri, DCTERMS.description, Literal(
        "SKOS vocabulary derived from the Soil Property and Process OWL ontology. "
        "Describes soil physical properties and processes, as well as how they affect each other.", 
        lang="en")))
    skos_g.add((scheme_uri, DCTERMS.creator, Literal("Heshan Du, University of Leeds")))
    skos_g.add((scheme_uri, DCTERMS.created, Literal("April, 2016")))
    skos_g.add((scheme_uri, DCTERMS.license, Literal("Creative Commons Attribution 4.0 International (CC BY 4.0)")))
    
    # Extract all owl:Class instances (only URIRefs, not blank nodes)
    all_classes = list(g.subjects(RDF.type, OWL.Class))
    classes = [c for c in all_classes if isinstance(c, URIRef)]
    print(f"Found {len(all_classes)} OWL classes total, {len(classes)} named classes (excluding blank nodes)")
    
    # Create mapping from SPP URIs to SHE URIs
    uri_mapping = {}
    
    # Convert each class to a SKOS concept
    for spp_class_uri in classes:
        # Extract class label from URI
        class_label = str(spp_class_uri).split('#')[-1] if '#' in str(spp_class_uri) else str(spp_class_uri).split('/')[-1]
        
        # Create new SHE URI for this concept
        she_concept_uri = SHE[class_label]
        uri_mapping[spp_class_uri] = she_concept_uri
        
        # Add as skos:Concept
        skos_g.add((she_concept_uri, RDF.type, SKOS.Concept))
        skos_g.add((she_concept_uri, SKOS.inScheme, scheme_uri))
        
        # Add exactMatch to original OWL class
        skos_g.add((she_concept_uri, SKOS.exactMatch, spp_class_uri))
        
        # Split camelCase and add as prefLabel
        readable_label = split_camel_case(class_label)
        skos_g.add((she_concept_uri, SKOS.prefLabel, Literal(readable_label, lang="en")))
        
        # Check for rdfs:comment and convert to skos:definition
        for comment in g.objects(spp_class_uri, RDFS.comment):
            skos_g.add((she_concept_uri, SKOS.definition, comment))
    
    # Now add broader/narrower relationships using the new SHE URIs
    for spp_class_uri in classes:
        she_concept_uri = uri_mapping[spp_class_uri]
        
        # Extract rdfs:subClassOf relationships and convert to skos:broader
        for parent_class in g.objects(spp_class_uri, RDFS.subClassOf):
            # Only include if parent is a named class (not a blank node) and is in our mapping
            if isinstance(parent_class, URIRef) and parent_class in uri_mapping:
                she_parent_uri = uri_mapping[parent_class]
                skos_g.add((she_concept_uri, SKOS.broader, she_parent_uri))
                # Add inverse relationship
                skos_g.add((she_parent_uri, SKOS.narrower, she_concept_uri))
    
    # Identify top concepts (classes without rdfs:subClassOf to another named class)
    for spp_class_uri in classes:
        she_concept_uri = uri_mapping[spp_class_uri]
        has_parent = False
        
        for parent_class in g.objects(spp_class_uri, RDFS.subClassOf):
            if isinstance(parent_class, URIRef) and parent_class in uri_mapping:
                has_parent = True
                break
        
        if not has_parent:
            skos_g.add((scheme_uri, SKOS.hasTopConcept, she_concept_uri))
            skos_g.add((she_concept_uri, SKOS.topConceptOf, scheme_uri))
    
    # Save the SKOS vocabulary
    skos_g.serialize(destination=skos_output_path, format='turtle')
    print(f"SKOS vocabulary saved to {skos_output_path}")
    print(f"Total triples in SKOS vocabulary: {len(skos_g)}")
    
    # Print statistics
    concepts = list(skos_g.subjects(RDF.type, SKOS.Concept))
    top_concepts = list(skos_g.objects(scheme_uri, SKOS.hasTopConcept))
    broader_relations = list(skos_g.subject_objects(SKOS.broader))
    
    print(f"\nStatistics:")
    print(f"  - Total concepts: {len(concepts)}")
    print(f"  - Top concepts: {len(top_concepts)}")
    print(f"  - Broader relationships: {len(broader_relations)}")
    
    return skos_g

# Run the conversion
owl_file = "ontovocabs/spp/soil_property_process.ttl"
skos_output = "SoilVoc.ttl"

skos_graph = convert_owl_to_skos(owl_file, skos_output)

Loaded OWL ontology with 8606 triples
Found 606 OWL classes total, 592 named classes (excluding blank nodes)
SKOS vocabulary saved to SoilVoc.ttl
Total triples in SKOS vocabulary: 3677

Statistics:
  - Total concepts: 592
  - Top concepts: 16
  - Broader relationships: 635
SKOS vocabulary saved to SoilVoc.ttl
Total triples in SKOS vocabulary: 3677

Statistics:
  - Total concepts: 592
  - Top concepts: 16
  - Broader relationships: 635


In [13]:
# Display all top concepts with their one-layer narrower concepts
def display_skos_hierarchy(skos_graph):
    """
    Display all 16 top concepts with their direct narrower concepts (one layer only)
    """
    print("="*80)
    print("TOP CONCEPTS AND THEIR DIRECT NARROWER CONCEPTS")
    print("="*80)
    
    scheme_uri = URIRef("https://soilwise-he.github.io/soil-health/ConceptScheme")
    
    # Get all top concepts
    top_concepts = list(skos_graph.objects(scheme_uri, SKOS.hasTopConcept))
    
    # Sort top concepts by label for consistent display
    top_concepts_with_labels = []
    for concept in top_concepts:
        label = list(skos_graph.objects(concept, SKOS.prefLabel))
        label_str = str(label[0]) if label else str(concept).split('#')[-1]
        top_concepts_with_labels.append((concept, label_str))
    
    top_concepts_with_labels.sort(key=lambda x: x[1])
    
    print(f"\nTotal Top Concepts: {len(top_concepts_with_labels)}")
    print("="*80)
    
    for i, (concept, label_str) in enumerate(top_concepts_with_labels, 1):
        print(f"\n{i}. {label_str.upper()}")
        print(f"   URI: {concept}")
        
        # Get all direct narrower concepts
        narrower = list(skos_graph.objects(concept, SKOS.narrower))
        
        if narrower:
            # Sort narrower concepts by label
            narrower_with_labels = []
            for narrow in narrower:
                narrow_label = list(skos_graph.objects(narrow, SKOS.prefLabel))
                narrow_label_str = str(narrow_label[0]) if narrow_label else str(narrow).split('#')[-1]
                narrower_with_labels.append(narrow_label_str)
            
            narrower_with_labels.sort()
            
            print(f"   Narrower concepts ({len(narrower_with_labels)}):")
            for narrow_label in narrower_with_labels:
                print(f"      • {narrow_label}")
        else:
            print(f"   Narrower concepts: None")
    
    print("\n" + "="*80)
    print(f"Summary: {len(top_concepts_with_labels)} top-level concepts displayed")
    print("="*80)

# Display the hierarchy
display_skos_hierarchy(skos_graph)

TOP CONCEPTS AND THEIR DIRECT NARROWER CONCEPTS

Total Top Concepts: 16

1. HUMAN ACTIVITY
   URI: https://soilwise-he.github.io/soil-health#HumanActivity
   Narrower concepts (3):
      • land use
      • soil management
      • traffic

2. PHENOMENA
   URI: https://soilwise-he.github.io/soil-health#Phenomena
   Narrower concepts (2):
      • planetary phenomena
      • soil phenomena

3. PROCESS
   URI: https://soilwise-he.github.io/soil-health#Process
   Narrower concepts (3):
      • force
      • plant process
      • soil process

4. PROPERTY
   URI: https://soilwise-he.github.io/soil-health#Property
   Narrower concepts (5):
      • air property
      • general property
      • meteorological property
      • soil property
      • water property

5. SOIL FLUID RETENTION
   URI: https://soilwise-he.github.io/soil-health#SoilFluidRetention
   Narrower concepts: None

6. SOIL FLUID TRANSPORT
   URI: https://soilwise-he.github.io/soil-health#SoilFluidTransport
   Narrower concepts: 

### Visualize SoilVoc in HTML page

In [11]:
"""
Merge GloSIS SKOS hierarchical files with SoilVoc.ttl into a new file.
- Does NOT modify the original SoilVoc.ttl
- Keeps ALL information from both original SoilVoc and GloSIS modules
- Add skos:inScheme to all concepts
- Add top concepts to ConceptScheme skos:hasTopConcept
- Ensure all skos:narrower/skos:broader relationships are bidirectional
"""

def load_glosis_skos_files():
    """Load all GloSIS SKOS hierarchical files from the folder"""
    print("Loading GloSIS SKOS files...")

    folder_path = 'ontovocabs/glosis/glosis_skos_hier/'
    
    # Get all .ttl files from the folder
    files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.ttl')]
    
    print(f"  Found {len(files)} TTL files in {folder_path}")

    # Merge all files into one graph
    merged_graph = Graph()
    merged_graph.bind("she", she)
    merged_graph.bind("skos", SKOS)
    merged_graph.bind("glosis_lh", glosis_lh)
    merged_graph.bind("glosis_cl", glosis_cl)
    merged_graph.bind("glosis_proc", glosis_proc)
    merged_graph.bind("glosis_cm", glosis_cm)
    merged_graph.bind("glosis_pr", glosis_pr)
    merged_graph.bind("glosis_su", glosis_su)
    merged_graph.bind("glosis_sp", glosis_sp)

    total_concepts = 0
    for file_path in files:
        if not os.path.exists(file_path):
            print(f"  Warning: {file_path} not found, skipping")
            continue

        print(f"  Loading {os.path.basename(file_path)}...")
        g = Graph()
        g.parse(file_path, format='turtle')

        # Count concepts
        concept_count = len(list(g.subjects(RDF.type, SKOS.Concept)))
        total_concepts += concept_count
        print(f"    {concept_count} concepts")

        # Merge into main graph
        for s, p, o in g:
            merged_graph.add((s, p, o))

    print(f"\nTotal: {total_concepts} concepts loaded from GloSIS\n")
    return merged_graph

def add_in_scheme_to_concepts(graph):
    """Add skos:inScheme to all concepts"""
    print("Adding skos:inScheme to all concepts...")

    count = 0
    for concept in graph.subjects(RDF.type, SKOS.Concept):
        # Only add if not already present
        if (concept, SKOS.inScheme, None) not in graph:
            graph.add((concept, SKOS.inScheme, scheme_uri))
            count += 1

    print(f"  Added skos:inScheme to {count} concepts\n")
    return graph

def ensure_bidirectional_relations(graph):
    """
    Ensure all skos:narrower/skos:broader relationships are bidirectional.
    - If A skos:narrower B, then add B skos:broader A if not exists
    - If A skos:broader B, then add B skos:narrower A if not exists
    """
    print("Ensuring bidirectional skos:narrower/skos:broader relationships...")

    narrower_added = 0
    broader_added = 0

    # Process narrower relationships: A narrower B -> B broader A
    narrower_triples = list(graph.triples((None, SKOS.narrower, None)))
    for s, p, o in narrower_triples:
        if (o, SKOS.broader, s) not in graph:
            graph.add((o, SKOS.broader, s))
            broader_added += 1

    # Process broader relationships: A broader B -> B narrower A
    broader_triples = list(graph.triples((None, SKOS.broader, None)))
    for s, p, o in broader_triples:
        if (o, SKOS.narrower, s) not in graph:
            graph.add((o, SKOS.narrower, s))
            narrower_added += 1

    print(f"  Added {broader_added} skos:broader relationships (inverse of narrower)")
    print(f"  Added {narrower_added} skos:narrower relationships (inverse of broader)")
    print()
    return graph

def identify_top_concepts(graph):
    """Identify top concepts (have narrower but no broader)"""
    print("Identifying top concepts...")

    top_concepts = []

    for concept in graph.subjects(RDF.type, SKOS.Concept):
        # Check if concept has narrower concepts
        has_narrower = (concept, SKOS.narrower, None) in graph
        # Check if concept has broader concepts
        has_broader = (concept, SKOS.broader, None) in graph

        # Top concept: has narrower but no broader
        if has_narrower and not has_broader:
            top_concepts.append(concept)
            concept_name = str(concept).replace(str(she), 'she:')
            print(f"  Found top concept: {concept_name}")

    print(f"\nTotal: {len(top_concepts)} top concepts\n")
    return top_concepts

def merge_with_soilvoc(glosis_graph, top_concepts, soilvoc_path, output_path):
    """Merge GloSIS concepts with SoilVoc.ttl and save to a new file.
    
    IMPORTANT: This function MERGES triples, keeping all information from both
    SoilVoc and GloSIS. It does NOT replace existing triples.
    """
    print(f"Merging with {soilvoc_path} (original file will NOT be modified)...")

    # Load existing SoilVoc
    print("  Loading existing SoilVoc.ttl...")
    soilvoc = Graph()
    soilvoc.parse(soilvoc_path, format='turtle')

    # Preserve namespace bindings
    for prefix, namespace in soilvoc.namespace_manager.namespaces():
        soilvoc.bind(prefix, namespace)

    # Add GloSIS namespace bindings
    soilvoc.bind("glosis_lh", glosis_lh)
    soilvoc.bind("glosis_cl", glosis_cl)
    soilvoc.bind("glosis_proc", glosis_proc)
    soilvoc.bind("glosis_cm", glosis_cm)
    soilvoc.bind("glosis_pr", glosis_pr)
    soilvoc.bind("glosis_su", glosis_su)
    soilvoc.bind("glosis_sp", glosis_sp)

    # Count existing concepts in SoilVoc
    existing_concepts = set(soilvoc.subjects(RDF.type, SKOS.Concept))
    print(f"  Existing concepts in SoilVoc: {len(existing_concepts)}")

    # Merge GloSIS concepts - ADD triples instead of replacing
    print("  Merging GloSIS concepts (keeping all original SoilVoc triples)...")
    new_concept_count = 0
    enriched_concept_count = 0
    new_triple_count = 0

    glosis_concepts = set(glosis_graph.subjects(RDF.type, SKOS.Concept))
    
    for concept in glosis_concepts:
        if concept in existing_concepts:
            enriched_concept_count += 1
        else:
            new_concept_count += 1
        
        # Add all triples for this concept from GloSIS
        # This ADDS to existing triples, not replaces them
        for p, o in glosis_graph.predicate_objects(concept):
            if (concept, p, o) not in soilvoc:
                soilvoc.add((concept, p, o))
                new_triple_count += 1

    print(f"    New concepts from GloSIS: {new_concept_count}")
    print(f"    Existing concepts enriched with GloSIS data: {enriched_concept_count}")
    print(f"    New triples added: {new_triple_count}")

    # Add top concepts to ConceptScheme
    print("  Adding top concepts to ConceptScheme...")

    # Get existing top concepts
    existing_top_concepts = set(soilvoc.objects(scheme_uri, SKOS.hasTopConcept))
    print(f"    Existing top concepts: {len(existing_top_concepts)}")

    new_top_concepts = 0
    for concept in top_concepts:
        if concept not in existing_top_concepts:
            soilvoc.add((scheme_uri, SKOS.hasTopConcept, concept))
            new_top_concepts += 1

    print(f"    Added {new_top_concepts} new top concepts")

    # Ensure bidirectional relationships in the merged graph
    print("\n  Ensuring bidirectional relationships in merged graph...")
    soilvoc = ensure_bidirectional_relations(soilvoc)

    # Count total concepts after merge
    total_concepts = len(set(soilvoc.subjects(RDF.type, SKOS.Concept)))
    print(f"  Total concepts in merged graph: {total_concepts}")

    # Save to new file
    print(f"\n  Saving merged result to {output_path}...")
    soilvoc.serialize(destination=output_path, format='turtle')
    print(f"  Done! Original SoilVoc.ttl is unchanged.\n")

def main():
    print("=" * 80)
    print("Merging GloSIS SKOS vocabularies with SoilVoc.ttl")
    print("(Original SoilVoc.ttl will NOT be modified)")
    print("(All information from both sources will be preserved)")
    print("=" * 80 + "\n")

    # Load all GloSIS SKOS files from the folder
    glosis_graph = load_glosis_skos_files()

    # Add skos:inScheme to all concepts
    glosis_graph = add_in_scheme_to_concepts(glosis_graph)

    # Identify top concepts
    top_concepts = identify_top_concepts(glosis_graph)

    # Merge with SoilVoc.ttl and save to new file
    soilvoc_path = 'ontovocabs/spp/spp_skos.ttl'
    output_path = 'SoilVoc.ttl'
    merge_with_soilvoc(glosis_graph, top_concepts, soilvoc_path, output_path)

    print("=" * 80)
    print(f"MERGE COMPLETE - Result saved to: {output_path}")
    print("=" * 80)

if __name__ == '__main__':
    main()

Merging GloSIS SKOS vocabularies with SoilVoc.ttl
(Original SoilVoc.ttl will NOT be modified)
(All information from both sources will be preserved)

Loading GloSIS SKOS files...
  Found 5 TTL files in ontovocabs/glosis/glosis_skos_hier/
  Loading glosis_common_skos_hierarchical.ttl...
    23 concepts
  Loading glosis_layer_horizon_skos_hierarchical_with_procedures.ttl...
    406 concepts
  Loading glosis_profile_skos_hierarchical.ttl...
    6 concepts
  Loading glosis_siteplot_skos_hierarchical.ttl...
    50 concepts
  Loading glosis_surface_skos_hierarchical.ttl...
    7 concepts

Total: 492 concepts loaded from GloSIS

Adding skos:inScheme to all concepts...
  Added skos:inScheme to 490 concepts

Identifying top concepts...
  Found top concept: she:Fragment
  Found top concept: she:SoilDepth
  Found top concept: she:Molybdenum
  Found top concept: she:pHProcedure-pHKCl
  Found top concept: she:Voids
  Found top concept: she:Copper
  Found top concept: she:pHProcedure-pHH2O
  Found to

In [12]:
"""
Fix top concepts in SoilVoc.ttl:
1. Include concepts with narrower but no broader (hierarchy roots)
2. Include orphan concepts (no broader, no narrower)
3. Exclude any concepts with exactMatch to glosis_proc
"""

def identify_correct_top_concepts(graph):
    """
    Identify top concepts:
    1. Concepts with narrower but no broader (hierarchy roots)
    2. Orphan concepts (no broader, no narrower)
    3. Exclude concepts with exactMatch to glosis_proc
    """
    print("Identifying correct top concepts...")

    top_concepts = []

    for concept in graph.subjects(RDF.type, SKOS.Concept):
        # Check if concept has exact match to glosis_proc - if so, skip
        exact_matches = list(graph.objects(concept, SKOS.exactMatch))
        is_procedure = any('glosis/model/procedure/' in str(match) for match in exact_matches)

        if is_procedure:
            continue

        # Check if concept has narrower concepts
        has_narrower = (concept, SKOS.narrower, None) in graph
        # Check if concept has broader concepts
        has_broader = (concept, SKOS.broader, None) in graph

        # Top concept conditions:
        # 1. Has narrower but no broader (hierarchy root)
        # 2. No narrower and no broader (orphan)
        if (has_narrower and not has_broader) or (not has_narrower and not has_broader):
            top_concepts.append(concept)
            concept_name = str(concept).replace(str(she), 'she:')
            category = "hierarchy root" if has_narrower else "orphan"
            print(f"  {concept_name} ({category})")

    print(f"\nTotal: {len(top_concepts)} top concepts\n")
    return top_concepts

def update_soilvoc_top_concepts(soilvoc_path):
    """Update top concepts in SoilVoc.ttl"""
    print(f"Loading {soilvoc_path}...")

    # Load SoilVoc
    soilvoc = Graph()
    soilvoc.parse(soilvoc_path, format='turtle')

    # Preserve namespace bindings
    for prefix, namespace in soilvoc.namespace_manager.namespaces():
        soilvoc.bind(prefix, namespace)

    print(f"Loaded {len(list(soilvoc.subjects(RDF.type, SKOS.Concept)))} concepts\n")

    # Identify correct top concepts
    correct_top_concepts = identify_correct_top_concepts(soilvoc)

    # Remove existing hasTopConcept triples
    print("Removing existing top concept declarations...")
    existing_top_concepts = list(soilvoc.objects(scheme_uri, SKOS.hasTopConcept))
    print(f"  Removing {len(existing_top_concepts)} existing top concepts")

    for top_concept in existing_top_concepts:
        soilvoc.remove((scheme_uri, SKOS.hasTopConcept, top_concept))

    # Add new top concepts
    print(f"\nAdding {len(correct_top_concepts)} new top concepts...")
    for concept in sorted(correct_top_concepts, key=lambda c: str(c)):
        soilvoc.add((scheme_uri, SKOS.hasTopConcept, concept))

    # Save updated SoilVoc
    print(f"\nSaving updated {soilvoc_path}...")
    soilvoc.serialize(destination=soilvoc_path, format='turtle')
    print("Done!\n")

    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total concepts: {len(list(soilvoc.subjects(RDF.type, SKOS.Concept)))}")
    print(f"Top concepts: {len(correct_top_concepts)}")

    # Count hierarchy roots vs orphans
    hierarchy_roots = 0
    orphans = 0
    for concept in correct_top_concepts:
        has_narrower = (concept, SKOS.narrower, None) in soilvoc
        if has_narrower:
            hierarchy_roots += 1
        else:
            orphans += 1

    print(f"  - Hierarchy roots: {hierarchy_roots}")
    print(f"  - Orphan concepts: {orphans}")
    print("=" * 80)

def main():
    print("=" * 80)
    print("Fixing top concepts in SoilVoc.ttl")
    print("=" * 80 + "\n")

    soilvoc_path = 'SoilVoc.ttl'
    update_soilvoc_top_concepts(soilvoc_path)

if __name__ == '__main__':
    main()

Fixing top concepts in SoilVoc.ttl

Loading SoilVoc.ttl...
Loaded 1067 concepts

Identifying correct top concepts...
  she:BaseSaturation (orphan)
  she:CropClass (orphan)
  she:FragmentClass (orphan)
  she:Geology (orphan)
  she:GroundwaterDepth (orphan)
  she:HumanInfluenceClass (orphan)
  she:HydrogenExchangeableBases (orphan)
  she:KoeppenClass (orphan)
  she:LandUseClass (orphan)
  she:LandformComplex (orphan)
  she:Lithology (orphan)
  she:MajorLandForm (orphan)
  she:MoistureContent (orphan)
  she:NitrogenTotal (orphan)
  she:OxalateExtractableOpticalDensity (orphan)
  she:Physiography (orphan)
  she:ProfileDescriptionStatus (orphan)
  she:SlopeGradientClass (orphan)
  she:SlopeOrientationClass (orphan)
  she:SoilDepthRootableClass (orphan)
  she:SoilOrganicMatterClass (orphan)
  she:SoilWaterInfiltrationRateClass (orphan)
  she:Stickiness (orphan)
  she:TotalCarbonateEquivalent (orphan)
  she:VegetationClass (orphan)
  she:SoilFluidRetention (orphan)
  she:SoilFluidTransport (o

In [13]:
#!/usr/bin/env python3
"""
Generate enhanced interactive HTML mind map from SoilVoc.ttl with:
- skos:definition display
- skos:exactMatch with clickable links
- Copy URI button for concepts
- she:hasProcedure relationships
- Visual differentiation for procedures
"""


def parse_skos_vocabulary_enhanced(ttl_file_path):
    """
    Parse a SKOS vocabulary from a Turtle file and extract the hierarchy with procedures.

    Args:
        ttl_file_path: Path to the .ttl file

    Returns:
        dict: Dictionary containing the vocabulary structure
    """
    # Load the graph
    g = Graph()
    g.parse(ttl_file_path, format='turtle')

    # Find the ConceptScheme
    concept_schemes = list(g.subjects(RDF.type, SKOS.ConceptScheme))

    if not concept_schemes:
        raise ValueError("No SKOS ConceptScheme found in the file")

    # Use the first ConceptScheme
    scheme = concept_schemes[0]

    # Get scheme information
    scheme_label = str(g.value(scheme, SKOS.prefLabel) or
                      g.value(scheme, RDFS.label) or
                      scheme.split('/')[-1].split('#')[-1])

    # Find top concepts
    top_concepts = []

    # Try hasTopConcept property
    for top_concept in g.objects(scheme, SKOS.hasTopConcept):
        top_concepts.append(top_concept)

    # Try topConceptOf property (inverse)
    for top_concept in g.subjects(SKOS.topConceptOf, scheme):
        if top_concept not in top_concepts:
            top_concepts.append(top_concept)

    # If no top concepts found, find concepts with no broader concepts
    if not top_concepts:
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        concepts_with_broader = set(g.subjects(SKOS.broader, None))
        top_concepts = list(all_concepts - concepts_with_broader)

    # Build the hierarchy
    def get_concept_info(concept_uri):
        """Extract information about a concept."""
        pref_label = g.value(concept_uri, SKOS.prefLabel)
        alt_label = g.value(concept_uri, SKOS.altLabel)
        definition = g.value(concept_uri, SKOS.definition)
        notation = g.value(concept_uri, SKOS.notation)

        label = str(pref_label or concept_uri.split('/')[-1].split('#')[-1])
        alt_label_str = str(alt_label) if alt_label else None

        # Get exactMatch links
        exact_matches = []
        for match_uri in g.objects(concept_uri, SKOS.exactMatch):
            match_str = str(match_uri)
            # Extract a readable label from the URI
            match_label = match_str.split('/')[-1].split('#')[-1]
            exact_matches.append({
                'uri': match_str,
                'label': match_label
            })

        # Check if this is a procedure (exactMatch to glosis_proc)
        is_procedure = any('glosis/model/procedure/' in m['uri'] for m in exact_matches)

        # Get narrower concepts
        narrower = list(g.objects(concept_uri, SKOS.narrower))

        # Also check for concepts that have this as broader (inverse)
        for concept in g.subjects(SKOS.broader, concept_uri):
            if concept not in narrower:
                narrower.append(concept)

        # Get procedures linked via she:hasProcedure
        procedures = []
        for proc_uri in g.objects(concept_uri, she.hasProcedure):
            proc_info = get_concept_info(proc_uri)
            procedures.append(proc_info)

        concept_info = {
            'uri': str(concept_uri),
            'label': label,
            'altLabel': alt_label_str,
            'notation': str(notation) if notation else None,
            'definition': str(definition) if definition else None,
            'exactMatch': exact_matches,
            'isProcedure': is_procedure,
            'procedures': procedures,
            'narrower': [get_concept_info(n) for n in narrower] if narrower else []
        }

        return concept_info

    # Build the structure
    vocabulary = {
        'scheme_uri': str(scheme),
        'scheme_label': scheme_label,
        'top_concepts': [get_concept_info(tc) for tc in top_concepts]
    }

    return vocabulary


def generate_html_mindmap_enhanced(vocabulary_data, output_file='soilvoc_mindmap.html'):
    """
    Generate an enhanced interactive HTML mind map from the vocabulary data.

    Args:
        vocabulary_data: Dictionary containing the vocabulary structure
        output_file: Output HTML file path
    """
    html_content = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{vocabulary_data['scheme_label']} - Interactive Mind Map</title>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }}

        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            background: linear-gradient(135deg, #5a8e6b 0%, #8b6f47 100%);
            min-height: 100vh;
            padding: 20px;
        }}

        .container {{
            max-width: 1400px;
            margin: 0 auto;
            background: #faf8f3;
            border-radius: 12px;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
            overflow: hidden;
        }}

        .header {{
            background: linear-gradient(135deg, #4a7c59 0%, #5a8e6b 100%);
            color: white;
            padding: 40px;
            text-align: center;
            border-bottom: 4px solid #8b6f47;
        }}

        .header h1 {{
            font-size: 2.5em;
            margin-bottom: 12px;
            font-weight: 700;
            text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
        }}

        .header p {{
            opacity: 0.95;
            font-size: 1.15em;
            font-weight: 300;
        }}

        .mindmap {{
            padding: 40px;
            overflow-x: auto;
        }}

        .concept {{
            margin: 10px 0;
            animation: fadeIn 0.3s ease-in;
        }}

        @keyframes fadeIn {{
            from {{ opacity: 0; transform: translateX(-10px); }}
            to {{ opacity: 1; transform: translateX(0); }}
        }}

        .concept-header {{
            display: flex;
            align-items: center;
            padding: 12px 16px;
            background: #ffffff;
            border-left: 4px solid #5a8e6b;
            border-radius: 6px;
            cursor: pointer;
            transition: all 0.2s ease;
            margin-bottom: 5px;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.06);
        }}

        .concept-header.procedure {{
            background: #fff8e8;
            border-left-color: #d4a259;
        }}

        .concept-header.procedure:hover {{
            background: #ffedc9;
            border-left-color: #c4923f;
        }}

        .concept-header:hover {{
            background: #f0f5f1;
            border-left-color: #4a7c59;
            transform: translateX(5px);
            box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1);
        }}

        .concept-header.active {{
            background: #5a8e6b;
            color: white;
            border-left-color: #4a7c59;
        }}

        .concept-header.procedure.active {{
            background: #d4a259;
            color: #2c1810;
            border-left-color: #c4923f;
        }}

        .concept-header.highlighted {{
            background: #d4a259;
            border-left-color: #c4923f;
            animation: pulse 1s ease-in-out;
        }}

        @keyframes pulse {{
            0%, 100% {{ transform: scale(1); }}
            50% {{ transform: scale(1.02); }}
        }}

        .concept-header.no-children {{
            cursor: default;
            border-left-color: #a8b5a3;
        }}

        .concept-header.no-children:hover {{
            background: #ffffff;
            transform: none;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.06);
        }}

        .concept-header.procedure.no-children:hover {{
            background: #fff8e8;
            transform: none;
        }}

        .toggle-icon {{
            width: 24px;
            height: 24px;
            margin-right: 12px;
            display: flex;
            align-items: center;
            justify-content: center;
            font-size: 18px;
            font-weight: bold;
            transition: transform 0.2s ease;
        }}

        .toggle-icon.expanded {{
            transform: rotate(90deg);
        }}

        .concept-label {{
            flex: 1;
            font-weight: 500;
            font-size: 1.05em;
        }}

        .concept-alt-label {{
            display: block;
            font-size: 0.85em;
            font-weight: 400;
            color: #6c757d;
            font-style: italic;
            margin-top: 2px;
        }}

        .copy-uri-btn {{
            background: rgba(90, 142, 107, 0.1);
            color: #4a7c59;
            padding: 4px 8px;
            border-radius: 4px;
            font-size: 0.8em;
            margin-left: 8px;
            cursor: pointer;
            transition: all 0.2s ease;
            border: 1px solid transparent;
        }}

        .copy-uri-btn:hover {{
            background: #5a8e6b;
            color: white;
            border-color: #5a8e6b;
        }}

        .concept-header.active .copy-uri-btn {{
            background: rgba(255, 255, 255, 0.2);
            color: white;
        }}

        .concept-header.active .copy-uri-btn:hover {{
            background: rgba(255, 255, 255, 0.3);
        }}

        .concept-notation {{
            background: rgba(90, 142, 107, 0.15);
            color: #3d6b4d;
            padding: 4px 10px;
            border-radius: 4px;
            font-size: 0.9em;
            font-weight: 600;
            margin-right: 10px;
        }}

        .concept-header.active .concept-notation {{
            background: rgba(255, 255, 255, 0.2);
            color: white;
        }}

        .concept-count {{
            background: rgba(90, 142, 107, 0.15);
            color: #3d6b4d;
            padding: 4px 10px;
            border-radius: 12px;
            font-size: 0.85em;
            font-weight: 600;
        }}

        .concept-header.active .concept-count {{
            background: rgba(255, 255, 255, 0.2);
            color: white;
        }}

        .concept-children {{
            margin-left: 30px;
            border-left: 2px solid #d4e0cf;
            padding-left: 20px;
            display: none;
        }}

        .concept-children.expanded {{
            display: block;
        }}

        .concept-definition {{
            margin: 5px 0 10px 54px;
            padding: 10px 15px;
            background: #f0f5f1;
            border-radius: 4px;
            font-size: 0.9em;
            color: #3d5a3f;
            font-style: italic;
            display: none;
            border-left: 3px solid #a8b5a3;
        }}

        .concept-definition.show {{
            display: block;
        }}

        .exact-match-info {{
            margin: 5px 0 10px 54px;
            padding: 8px 12px;
            background: #e8f4f0;
            border-left: 3px solid #4a7c59;
            border-radius: 4px;
            font-size: 0.85em;
            color: #2c4a35;
            display: none;
        }}

        .exact-match-info.show {{
            display: block;
        }}

        .exact-match-link {{
            color: #4a7c59;
            text-decoration: underline;
            cursor: pointer;
            font-weight: 500;
        }}

        .exact-match-link:hover {{
            color: #3d6b4d;
        }}

        .procedure-badge {{
            background: #d4a259;
            color: #2c1810;
            padding: 2px 8px;
            border-radius: 4px;
            font-size: 0.75em;
            font-weight: 700;
            margin-left: 8px;
            text-transform: uppercase;
        }}

        .procedures-section {{
            margin: 5px 0 10px 54px;
            padding: 10px 15px;
            background: #fff8e8;
            border-left: 3px solid #d4a259;
            border-radius: 4px;
            font-size: 0.9em;
            display: none;
        }}

        .procedures-section.show {{
            display: block;
        }}

        .procedures-title {{
            font-weight: 600;
            color: #8b6f47;
            margin-bottom: 8px;
        }}

        .top-level {{
            margin-left: 0;
            padding-left: 0;
            border-left: none;
        }}

        .stats {{
            padding: 20px 40px;
            background: linear-gradient(135deg, #f0f5f1 0%, #f5f1e8 100%);
            border-top: 3px solid #8b6f47;
            display: flex;
            justify-content: space-around;
            flex-wrap: wrap;
        }}

        .stat-item {{
            text-align: center;
            padding: 10px;
        }}

        .stat-value {{
            font-size: 2em;
            font-weight: bold;
            color: #5a8e6b;
            text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.05);
        }}

        .stat-label {{
            color: #6b5840;
            font-size: 0.9em;
            margin-top: 5px;
            font-weight: 500;
        }}

        .search-box {{
            padding: 20px 40px;
            background: linear-gradient(135deg, #f5f1e8 0%, #f0f5f1 100%);
            border-bottom: 2px solid #d4e0cf;
        }}

        .search-input {{
            width: 100%;
            padding: 12px 20px;
            font-size: 1em;
            border: 2px solid #d4e0cf;
            border-radius: 6px;
            transition: all 0.2s ease;
            background: white;
        }}

        .search-input:focus {{
            outline: none;
            border-color: #5a8e6b;
            box-shadow: 0 0 0 3px rgba(90, 142, 107, 0.15);
        }}

        .search-results {{
            margin-top: 15px;
            display: none;
        }}

        .search-results.show {{
            display: block;
        }}

        .search-result-item {{
            padding: 10px 15px;
            background: white;
            border: 1px solid #d4e0cf;
            border-radius: 6px;
            margin-bottom: 8px;
            cursor: pointer;
            transition: all 0.2s ease;
        }}

        .search-result-item:hover {{
            background: #f0f5f1;
            border-color: #5a8e6b;
            transform: translateX(5px);
            box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1);
        }}

        .search-result-label {{
            font-weight: 500;
            color: #2c3e2d;
        }}

        .search-result-notation {{
            display: inline-block;
            background: rgba(90, 142, 107, 0.15);
            color: #3d6b4d;
            padding: 2px 8px;
            border-radius: 3px;
            font-size: 0.85em;
            font-weight: 600;
            margin-right: 8px;
        }}

        .search-result-path {{
            font-size: 0.85em;
            color: #6b5840;
            margin-top: 5px;
        }}

        .search-info {{
            padding: 10px 15px;
            background: #e8f4f0;
            border: 1px solid #a8cbba;
            border-radius: 6px;
            color: #2c4a35;
            font-size: 0.9em;
            margin-bottom: 10px;
        }}

        .no-results {{
            text-align: center;
            padding: 40px;
            color: #6b5840;
            font-style: italic;
        }}

        .clear-search {{
            display: inline-block;
            margin-top: 10px;
            padding: 8px 16px;
            background: #5a8e6b;
            color: white;
            border-radius: 6px;
            cursor: pointer;
            font-size: 0.9em;
            transition: all 0.2s ease;
        }}

        .clear-search:hover {{
            background: #4a7c59;
            box-shadow: 0 2px 6px rgba(0, 0, 0, 0.15);
        }}

        .toast {{
            position: fixed;
            bottom: 30px;
            right: 30px;
            background: #5a8e6b;
            color: white;
            padding: 12px 24px;
            border-radius: 6px;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
            opacity: 0;
            transition: opacity 0.3s ease;
            pointer-events: none;
            z-index: 1000;
        }}

        .toast.show {{
            opacity: 1;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>🌱 SoilVoc</h1>
            <p>Interactive Soil Vocabulary Thesaurus · SoilWise-HE Project</p>
        </div>

        <div class="search-box">
            <input type="text" class="search-input" id="searchInput" placeholder="Search all concepts by label or notation...">
            <div class="search-results" id="searchResults"></div>
        </div>

        <div class="mindmap" id="mindmap">
            <!-- Mind map will be generated here -->
        </div>

        <div class="stats" id="stats">
            <!-- Statistics will be generated here -->
        </div>
    </div>

    <div class="toast" id="toast">URI copied to clipboard!</div>

    <script>
        const vocabularyData = {json.dumps(vocabulary_data, indent=2)};

        let allConcepts = [];
        let uniqueConceptUris = new Set();
        let conceptMap = new Map(); // Maps URI to concept object with path info

        function buildConceptMap(concepts, path = []) {{
            concepts.forEach(concept => {{
                const currentPath = [...path, concept];

                // Store concept with its path
                if (!conceptMap.has(concept.uri)) {{
                    conceptMap.set(concept.uri, {{
                        concept: concept,
                        path: currentPath
                    }});
                }}

                uniqueConceptUris.add(concept.uri);
                allConcepts.push(concept);

                if (concept.narrower && concept.narrower.length > 0) {{
                    buildConceptMap(concept.narrower, currentPath);
                }}

                if (concept.procedures && concept.procedures.length > 0) {{
                    buildConceptMap(concept.procedures, currentPath);
                }}
            }});
        }}

        function countConcepts(concepts) {{
            concepts.forEach(concept => {{
                uniqueConceptUris.add(concept.uri);
                allConcepts.push(concept);
                if (concept.narrower && concept.narrower.length > 0) {{
                    countConcepts(concept.narrower);
                }}
                if (concept.procedures && concept.procedures.length > 0) {{
                    countConcepts(concept.procedures);
                }}
            }});
            return uniqueConceptUris.size;
        }}

        function getMaxDepth(concepts, depth = 1) {{
            let maxDepth = depth;
            concepts.forEach(concept => {{
                if (concept.narrower && concept.narrower.length > 0) {{
                    maxDepth = Math.max(maxDepth, getMaxDepth(concept.narrower, depth + 1));
                }}
                if (concept.procedures && concept.procedures.length > 0) {{
                    maxDepth = Math.max(maxDepth, getMaxDepth(concept.procedures, depth + 1));
                }}
            }});
            return maxDepth;
        }}

        function copyToClipboard(text) {{
            navigator.clipboard.writeText(text).then(() => {{
                showToast();
            }}).catch(err => {{
                console.error('Failed to copy:', err);
            }});
        }}

        function showToast() {{
            const toast = document.getElementById('toast');
            toast.classList.add('show');
            setTimeout(() => {{
                toast.classList.remove('show');
            }}, 2000);
        }}

        function renderConcept(concept, level = 0) {{
            const hasNarrower = concept.narrower && concept.narrower.length > 0;
            const hasProcedures = concept.procedures && concept.procedures.length > 0;
            const hasChildren = hasNarrower || hasProcedures;
            const hasDefinition = concept.definition !== null && concept.definition !== undefined;
            const hasExactMatch = concept.exactMatch && concept.exactMatch.length > 0;

            // Concept is clickable if it has children OR has definition/exactMatch
            const isClickable = hasChildren || hasDefinition || hasExactMatch;

            const notation = concept.notation ? `<span class="concept-notation">${{concept.notation}}</span>` : '';
            const procedureBadge = concept.isProcedure ? '<span class="procedure-badge">Procedure</span>' : '';
            const altLabelHtml = concept.altLabel ? `<span class="concept-alt-label">${{concept.altLabel}}</span>` : '';
            const childrenCount = hasNarrower ? concept.narrower.length + (hasProcedures ? concept.procedures.length : 0) : (hasProcedures ? concept.procedures.length : 0);
            const count = hasChildren ? `<span class="concept-count">${{childrenCount}}</span>` : '';
            const noChildClass = !isClickable ? 'no-children' : '';
            const procedureClass = concept.isProcedure ? 'procedure' : '';
            const toggleIcon = hasChildren ? '▶' : '●';

            let html = `
                <div class="concept" data-uri="${{concept.uri}}">
                    <div class="concept-header ${{noChildClass}} ${{procedureClass}}" onclick="toggleConcept(this)">
                        <span class="toggle-icon">${{toggleIcon}}</span>
                        ${{notation}}
                        <span class="concept-label">${{concept.label}}${{procedureBadge}}${{altLabelHtml}}</span>
                        <button class="copy-uri-btn" onclick="event.stopPropagation(); copyToClipboard('${{concept.uri}}')">📋 Copy URI</button>
                        ${{count}}
                    </div>
            `;

            if (concept.definition) {{
                html += `<div class="concept-definition">${{concept.definition}}</div>`;
            }}

            if (concept.exactMatch && concept.exactMatch.length > 0) {{
                const exactMatchLinks = concept.exactMatch.map(m =>
                    `<a href="${{m.uri}}" target="_blank" class="exact-match-link">${{m.label}}</a>`
                ).join(', ');
                html += `<div class="exact-match-info">See also: ${{exactMatchLinks}}</div>`;
            }}

            if (hasProcedures) {{
                html += `<div class="procedures-section">
                    <div class="procedures-title">📋 Procedures:</div>
                    <div class="concept-children">`;
                concept.procedures.forEach(proc => {{
                    html += renderConcept(proc, level + 1);
                }});
                html += `</div></div>`;
            }}

            if (hasNarrower) {{
                html += `<div class="concept-children">`;
                concept.narrower.forEach(narrower => {{
                    html += renderConcept(narrower, level + 1);
                }});
                html += `</div>`;
            }}

            html += `</div>`;
            return html;
        }}

        function toggleConcept(header) {{
            const concept = header.parentElement;
            const children = concept.querySelectorAll(':scope > .concept-children');
            const definition = concept.querySelector(':scope > .concept-definition');
            const exactMatch = concept.querySelector(':scope > .exact-match-info');
            const procedures = concept.querySelector(':scope > .procedures-section');
            const icon = header.querySelector('.toggle-icon');

            // Check if there's anything to show
            const hasChildren = children.length > 0;
            const hasDefinition = definition !== null;
            const hasExactMatch = exactMatch !== null;
            const hasProcedures = procedures !== null;

            // If no children and no info to display, do nothing
            if (!hasChildren && !hasDefinition && !hasExactMatch && !hasProcedures) {{
                return;
            }}

            const isExpanding = !header.classList.contains('active');

            children.forEach(childDiv => {{
                childDiv.classList.toggle('expanded');
            }});

            header.classList.toggle('active');
            if (hasChildren || hasProcedures) {{
                icon.classList.toggle('expanded');
            }}

            if (definition) {{
                definition.classList.toggle('show');
            }}

            if (exactMatch) {{
                exactMatch.classList.toggle('show');
            }}

            if (procedures) {{
                procedures.classList.toggle('show');
                const procChildren = procedures.querySelector('.concept-children');
                if (procChildren && isExpanding) {{
                    procChildren.classList.add('expanded');
                }}
            }}
        }}

        function renderMindmap() {{
            const mindmapDiv = document.getElementById('mindmap');
            let html = '<div class="top-level">';

            vocabularyData.top_concepts.forEach(concept => {{
                html += renderConcept(concept, 0);
            }});

            html += '</div>';
            mindmapDiv.innerHTML = html;
        }}

        function renderStats() {{
            const totalConcepts = countConcepts(vocabularyData.top_concepts);
            const maxDepth = getMaxDepth(vocabularyData.top_concepts);
            const topConceptsCount = vocabularyData.top_concepts.length;

            const statsDiv = document.getElementById('stats');
            statsDiv.innerHTML = `
                <div class="stat-item">
                    <div class="stat-value">${{topConceptsCount}}</div>
                    <div class="stat-label">Top Concepts</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">${{totalConcepts}}</div>
                    <div class="stat-label">Total Concepts</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">${{maxDepth}}</div>
                    <div class="stat-label">Max Depth</div>
                </div>
            `;
        }}

        function searchConcepts() {{
            const searchTerm = document.getElementById('searchInput').value.toLowerCase().trim();
            const searchResultsDiv = document.getElementById('searchResults');

            if (searchTerm === '') {{
                searchResultsDiv.classList.remove('show');
                searchResultsDiv.innerHTML = '';
                clearHighlights();
                return;
            }}

            // Search in all concepts
            const matches = [];
            conceptMap.forEach((data, uri) => {{
                const concept = data.concept;
                const label = concept.label.toLowerCase();
                const altLabel = concept.altLabel ? concept.altLabel.toLowerCase() : '';
                const notation = concept.notation ? concept.notation.toLowerCase() : '';

                if (label.includes(searchTerm) || altLabel.includes(searchTerm) || notation.includes(searchTerm)) {{
                    matches.push({{
                        uri: uri,
                        concept: concept,
                        path: data.path
                    }});
                }}
            }});

            // Display results
            if (matches.length > 0) {{
                let html = `<div class="search-info">Found ${{matches.length}} matching concept(s). Click to navigate.</div>`;

                matches.forEach(match => {{
                    const pathLabels = match.path.map(c => c.notation ? `${{c.notation}} ${{c.label}}` : c.label).join(' → ');
                    const notation = match.concept.notation ? `<span class="search-result-notation">${{match.concept.notation}}</span>` : '';

                    html += `
                        <div class="search-result-item" onclick="navigateToConcept('${{match.uri}}')">
                            <div class="search-result-label">
                                ${{notation}}${{match.concept.label}}
                            </div>
                            <div class="search-result-path">${{pathLabels}}</div>
                        </div>
                    `;
                }});

                html += `<div class="clear-search" onclick="clearSearch()">Clear Search</div>`;
                searchResultsDiv.innerHTML = html;
                searchResultsDiv.classList.add('show');
            }} else {{
                searchResultsDiv.innerHTML = `
                    <div class="search-info">No concepts found matching "${{searchTerm}}".</div>
                    <div class="clear-search" onclick="clearSearch()">Clear Search</div>
                `;
                searchResultsDiv.classList.add('show');
            }}
        }}

        function navigateToConcept(targetUri) {{
            // Get the path to this concept
            const conceptData = conceptMap.get(targetUri);
            if (!conceptData) return;

            // First, collapse everything
            document.querySelectorAll('.concept-children.expanded').forEach(el => {{
                el.classList.remove('expanded');
            }});
            document.querySelectorAll('.concept-header.active').forEach(el => {{
                el.classList.remove('active');
            }});
            document.querySelectorAll('.toggle-icon.expanded').forEach(el => {{
                el.classList.remove('expanded');
            }});
            document.querySelectorAll('.concept-definition.show').forEach(el => {{
                el.classList.remove('show');
            }});
            document.querySelectorAll('.exact-match-info.show').forEach(el => {{
                el.classList.remove('show');
            }});
            document.querySelectorAll('.procedures-section.show').forEach(el => {{
                el.classList.remove('show');
            }});

            // Clear previous highlights
            clearHighlights();

            // Expand the path to the target concept
            const path = conceptData.path;
            for (let i = 0; i < path.length - 1; i++) {{
                const conceptUri = path[i].uri;
                const conceptElement = document.querySelector(`.concept[data-uri="${{conceptUri}}"]`);

                if (conceptElement) {{
                    const header = conceptElement.querySelector('.concept-header');
                    const children = conceptElement.querySelectorAll(':scope > .concept-children');
                    const icon = header.querySelector('.toggle-icon');
                    const procedures = conceptElement.querySelector(':scope > .procedures-section');

                    children.forEach(childDiv => {{
                        if (!childDiv.classList.contains('expanded')) {{
                            childDiv.classList.add('expanded');
                        }}
                    }});

                    if (!header.classList.contains('active')) {{
                        header.classList.add('active');
                        icon.classList.add('expanded');
                    }}

                    if (procedures) {{
                        procedures.classList.add('show');
                        const procChildren = procedures.querySelector('.concept-children');
                        if (procChildren) {{
                            procChildren.classList.add('expanded');
                        }}
                    }}
                }}
            }}

            // Highlight and scroll to the target concept
            const targetElement = document.querySelector(`.concept[data-uri="${{targetUri}}"]`);
            if (targetElement) {{
                const targetHeader = targetElement.querySelector('.concept-header');
                targetHeader.classList.add('highlighted');

                // Show definition, exact match, procedures if exists
                const definition = targetElement.querySelector(':scope > .concept-definition');
                if (definition) {{
                    definition.classList.add('show');
                }}

                const exactMatch = targetElement.querySelector(':scope > .exact-match-info');
                if (exactMatch) {{
                    exactMatch.classList.add('show');
                }}

                const procedures = targetElement.querySelector(':scope > .procedures-section');
                if (procedures) {{
                    procedures.classList.add('show');
                    const procChildren = procedures.querySelector('.concept-children');
                    if (procChildren) {{
                        procChildren.classList.add('expanded');
                    }}
                }}

                // Scroll to the target
                targetElement.scrollIntoView({{ behavior: 'smooth', block: 'center' }});

                // Remove highlight after animation
                setTimeout(() => {{
                    targetHeader.classList.remove('highlighted');
                }}, 2000);
            }}
        }}

        function clearHighlights() {{
            document.querySelectorAll('.concept-header.highlighted').forEach(el => {{
                el.classList.remove('highlighted');
            }});
        }}

        function clearSearch() {{
            document.getElementById('searchInput').value = '';
            searchConcepts();
        }}

        // Initialize
        buildConceptMap(vocabularyData.top_concepts);
        renderMindmap();
        renderStats();

        // Search functionality with debounce
        let searchTimeout;
        document.getElementById('searchInput').addEventListener('input', () => {{
            clearTimeout(searchTimeout);
            searchTimeout = setTimeout(searchConcepts, 300);
        }});

        // Keyboard navigation
        document.addEventListener('keydown', (e) => {{
            if (e.key === 'Escape') {{
                clearSearch();
            }}
        }});
    </script>
</body>
</html>'''

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"Enhanced interactive mind map generated: {output_file}")


# Main execution
if __name__ == '__main__':
    ttl_file = 'SoilVoc.ttl'

    try:
        print(f"Parsing SKOS vocabulary from: {ttl_file}")
        vocabulary = parse_skos_vocabulary_enhanced(ttl_file)

        print(f"Found ConceptScheme: {vocabulary['scheme_label']}")
        print(f"Number of top concepts: {len(vocabulary['top_concepts'])}")

        output_file = 'index.html'
        generate_html_mindmap_enhanced(vocabulary, output_file)

        print(f"\nSuccess! Open {output_file} in your web browser to view the enhanced interactive mind map.")
        print("\nNew features:")
        print("- ✓ Definitions displayed for all concepts")
        print("- ✓ Exact matches shown with clickable links")
        print("- ✓ Copy URI button for each concept")
        print("- ✓ Procedures displayed in hierarchy")
        print("- ✓ Visual differentiation for procedures (yellow background)")

    except FileNotFoundError:
        print(f"Error: File '{ttl_file}' not found.")
    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()

Parsing SKOS vocabulary from: SoilVoc.ttl
Found ConceptScheme: soil-health
Number of top concepts: 81
Enhanced interactive mind map generated: index.html

Success! Open index.html in your web browser to view the enhanced interactive mind map.

New features:
- ✓ Definitions displayed for all concepts
- ✓ Exact matches shown with clickable links
- ✓ Copy URI button for each concept
- ✓ Procedures displayed in hierarchy
- ✓ Visual differentiation for procedures (yellow background)
Found ConceptScheme: soil-health
Number of top concepts: 81
Enhanced interactive mind map generated: index.html

Success! Open index.html in your web browser to view the enhanced interactive mind map.

New features:
- ✓ Definitions displayed for all concepts
- ✓ Exact matches shown with clickable links
- ✓ Copy URI button for each concept
- ✓ Procedures displayed in hierarchy
- ✓ Visual differentiation for procedures (yellow background)


In [17]:
# Load the SoilVoc.ttl file
g = Graph()
g.parse("SoilVoc.ttl", format="turtle")

# Define namespaces
SHE = Namespace("https://soilwise-he.github.io/soil-health#")
SCHEME_URI = URIRef("https://soilwise-he.github.io/soil-health")
HAS_PROCEDURE = SHE.hasProcedure

# Get all top concepts from the scheme
top_concepts = set()
for tc in g.objects(SCHEME_URI, SKOS.hasTopConcept):
    top_concepts.add(tc)
for concept in g.subjects(SKOS.topConceptOf, SCHEME_URI):
    top_concepts.add(concept)

# Build broader/narrower hierarchy maps
broader_map = defaultdict(set)
narrower_map = defaultdict(set)
for s, p, o in g.triples((None, SKOS.broader, None)):
    broader_map[s].add(o)
    narrower_map[o].add(s)

# Build the reverse relationship map for she:hasProcedure
# procedure_of: procedure_concept -> list of concepts that have hasProcedure pointing to it
procedure_of = defaultdict(list)
for s, p, o in g.triples((None, HAS_PROCEDURE, None)):
    procedure_of[o].append(s)

# Helper functions
def get_pref_label(concept):
    """Get the prefLabel of a concept, preferring English."""
    for label in g.objects(concept, SKOS.prefLabel):
        if hasattr(label, 'language') and label.language == 'en':
            return str(label)
    for label in g.objects(concept, SKOS.prefLabel):
        return str(label)
    return str(concept).split('#')[-1] if '#' in str(concept) else str(concept).split('/')[-1]

def get_definition(concept):
    """Get the definition of a concept, preferring English."""
    for defn in g.objects(concept, SKOS.definition):
        if hasattr(defn, 'language') and defn.language == 'en':
            return str(defn)
    for defn in g.objects(concept, SKOS.definition):
        return str(defn)
    return ""

def find_all_paths_to_top(concept, visited=None):
    """Find ALL paths from a concept up to top concepts.
    Returns a list of paths, where each path is a list of concepts from top to current."""
    if visited is None:
        visited = set()
    if concept in visited:
        return []
    visited.add(concept)
    
    # Check if it's a valid top concept (has narrower concepts)
    if concept in top_concepts and len(narrower_map.get(concept, set())) > 0:
        return [[concept]]
    
    broader_concepts = broader_map.get(concept, set())
    if not broader_concepts:
        return []
    
    all_paths = []
    for broader in broader_concepts:
        paths = find_all_paths_to_top(broader, visited.copy())
        for path in paths:
            all_paths.append(path + [concept])
    
    return all_paths

def get_hierarchy_path(concept):
    """Get the hierarchy path description for a concept.
    Returns all possible paths if there are multiple."""
    
    # Check if it's a top concept with narrower concepts (valid top concept)
    if concept in top_concepts:
        if len(narrower_map.get(concept, set())) > 0:
            return "Top Concept"
        else:
            # Top concept without narrower = orphan
            return "Orphan Concept"
    
    # Check if it's a procedure concept (linked by others via she:hasProcedure)
    if concept in procedure_of:
        return "Procedure"
    
    # Check if it has any broader or narrower relationships
    has_broader = len(broader_map.get(concept, set())) > 0
    has_narrower = len(narrower_map.get(concept, set())) > 0
    
    if not has_broader and not has_narrower:
        return "Orphan Concept"
    
    # Find all paths to top
    all_paths = find_all_paths_to_top(concept)
    
    if not all_paths:
        if has_broader:
            # Has broader but can't reach a valid top concept
            chain = []
            current = concept
            visited = set()
            while current and current not in visited:
                visited.add(current)
                chain.insert(0, current)
                broaders = broader_map.get(current, set())
                current = next(iter(broaders), None) if broaders else None
            return " > ".join(get_pref_label(c) for c in chain) + " (no top concept)"
        return "Orphan Concept"
    
    # Convert all paths to label strings
    path_strings = []
    for path in all_paths:
        path_str = " > ".join(get_pref_label(c) for c in path)
        if path_str not in path_strings:  # Avoid duplicates
            path_strings.append(path_str)
    
    # Join multiple paths with " || "
    return " || ".join(path_strings)

def get_procedure_linked_concepts(concept):
    """Get concepts that have she:hasProcedure relationship with the current concept."""
    linked = procedure_of.get(concept, [])
    if not linked:
        return ""
    return "; ".join(get_pref_label(c) for c in linked)

# Collect all concepts in the scheme
all_concepts = set()
for s in g.subjects(RDF.type, SKOS.Concept):
    if (s, SKOS.inScheme, SCHEME_URI) in g:
        all_concepts.add(s)

# Build the CSV data
csv_data = []
for concept in all_concepts:
    csv_data.append({
        'prefLabel': get_pref_label(concept),
        'definition': get_definition(concept),
        'hierarchy_path': get_hierarchy_path(concept),
        'procedure_linked_by': get_procedure_linked_concepts(concept)
    })

# Create DataFrame and sort
df = pd.DataFrame(csv_data)
df = df.sort_values('prefLabel', key=lambda x: x.str.lower()).reset_index(drop=True)

# Save to CSV
output_filename = "SoilVoc_concepts.csv"
df.to_csv(output_filename, index=False, encoding='utf-8-sig')

# Print statistics
print(f"CSV saved to: {output_filename}")
print(f"\nTotal concepts: {len(df)}")
print(f"- Top concepts (with narrower): {len(df[df['hierarchy_path'] == 'Top Concept'])}")
print(f"- Procedure concepts: {len(df[df['hierarchy_path'] == 'Procedure'])}")
print(f"- Orphan concepts: {len(df[df['hierarchy_path'] == 'Orphan Concept'])}")
print(f"- Concepts with hierarchy path: {len(df[~df['hierarchy_path'].isin(['Top Concept', 'Procedure', 'Orphan Concept'])])}")
print(f"- Concepts with definitions: {len(df[df['definition'] != ''])}")

# Find concepts with multiple paths
multi_path_concepts = df[df['hierarchy_path'].str.contains(' \\|\\| ', regex=True, na=False)]
print(f"\n- Concepts with MULTIPLE hierarchy paths: {len(multi_path_concepts)}")

if len(multi_path_concepts) > 0:
    print("\nExamples of concepts with multiple paths:")
    for idx, row in multi_path_concepts.head(5).iterrows():
        print(f"\n  {row['prefLabel']}:")
        paths = row['hierarchy_path'].split(' || ')
        for i, p in enumerate(paths, 1):
            print(f"    Path {i}: {p}")

CSV saved to: SoilVoc_concepts.csv

Total concepts: 1067
- Top concepts (with narrower): 48
- Procedure concepts: 222
- Orphan concepts: 33
- Concepts with hierarchy path: 764
- Concepts with definitions: 441

- Concepts with MULTIPLE hierarchy paths: 71

Examples of concepts with multiple paths:

  air humidity:
    Path 1: property > meteorological property > air humidity
    Path 2: property > air property > air humidity

  air temperature:
    Path 1: property > meteorological property > air temperature
    Path 2: property > air property > air temperature

  available water holding capacity:
    Path 1: property > soil property > soil physical property > soil water physical property > soil water holding capacity > available water holding capacity
    Path 2: property > soil property > soil physical property > soil physical capacity > soil water holding capacity > available water holding capacity

  controlled drainage:
    Path 1: soil structure index > soil water transmission ind