In [2]:
import json
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode, Dataset
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, split_uri
from uk2us import uk_us

import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse
import re
from typing import Dict, List, Optional
import logging
import pandas as pd

In [3]:
# Namespaces
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc/")
iso11074 = Namespace("https://data.geoscience.earth/ncl/ISO11074v2025/")
gemet = Namespace("http://www.eionet.europa.eu/gemet/concept/")
inrae = Namespace("http://opendata.inrae.fr/thesaurusINRAE/")
she = Namespace("https://soilwise-he.github.io/soil-health#")

### Soil Health Benchmarks glossary crawler

In [None]:
#!/usr/bin/env python3
"""
Soil Health Benchmarks Glossary Crawler

This script crawls the soilhealthbenchmarks.eu glossary page to extract
all terms and their definitions.
"""

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class SoilGlossaryCrawler:
    def __init__(self, base_url: str = "https://soilhealthbenchmarks.eu", delay: float = 1.0):
        """
        Initialize the crawler
        
        Args:
            base_url: Base URL of the website
            delay: Delay between requests in seconds
        """
        self.base_url = base_url
        self.glossary_url = urljoin(base_url, "/glossary/")
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.terms_data = []
        
    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """
        Fetch a web page and return BeautifulSoup object
        
        Args:
            url: URL to fetch
            
        Returns:
            BeautifulSoup object or None if failed
        """
        try:
            logger.info(f"Fetching: {url}")
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching {url}: {e}")
            return None
    
    def extract_glossary_terms(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """
        Extract glossary terms and their URLs from the main glossary page
        
        Args:
            soup: BeautifulSoup object of the glossary page
            
        Returns:
            List of dictionaries containing term info
        """
        terms = []
        
        # Common patterns for glossary links
        selectors = [
            'a[href*="/glossary/"]',  # Links containing /glossary/
            '.glossary-term a',       # Links with glossary-term class
            '.glossary-list a',       # Links in glossary-list
            '.term-link',             # Direct term links
            'article a',              # Article links
            '.entry-title a',         # Entry title links
            'h2 a, h3 a, h4 a',      # Header links
        ]
        
        for selector in selectors:
            links = soup.select(selector)
            for link in links:
                href = link.get('href')
                if href and '/glossary/' in href and href != '/glossary/':
                    # Extract term name
                    term_name = link.get_text(strip=True)
                    if not term_name:
                        # Try to extract from URL
                        term_name = href.split('/')[-1].replace('-', ' ').title()
                    
                    full_url = urljoin(self.base_url, href)
                    
                    # Avoid duplicates
                    if not any(t['url'] == full_url for t in terms):
                        terms.append({
                            'term': term_name,
                            'url': full_url,
                            'slug': href.split('/')[-1] if href.split('/')[-1] else href.split('/')[-2]
                        })
        
        # If no terms found with selectors, try to find any links in the page
        if not terms:
            all_links = soup.find_all('a', href=True)
            for link in all_links:
                href = link.get('href')
                if href and '/glossary/' in href and href != '/glossary/':
                    term_name = link.get_text(strip=True)
                    if term_name and len(term_name) > 1:
                        full_url = urljoin(self.base_url, href)
                        if not any(t['url'] == full_url for t in terms):
                            terms.append({
                                'term': term_name,
                                'url': full_url,
                                'slug': href.split('/')[-1] if href.split('/')[-1] else href.split('/')[-2]
                            })
        
        logger.info(f"Found {len(terms)} glossary terms")
        return terms
    
    def extract_definition(self, soup: BeautifulSoup, term: str) -> str:
        """
        Extract definition from a term's individual page
        
        Args:
            soup: BeautifulSoup object of the term page
            term: Term name
            
        Returns:
            Definition text
        """
        definition = ""
        
        # Common selectors for definition content
        selectors = [
            '.definition',
            '.term-definition',
            '.glossary-definition',
            '.entry-content',
            '.post-content',
            '.content',
            'article .text',
            'article p',
            '.description',
            'main p',
            '.single-content p'
        ]
        
        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                # Get text from all matching elements
                texts = []
                for elem in elements:
                    text = elem.get_text(strip=True)
                    if text and len(text) > 20:  # Avoid very short snippets
                        texts.append(text)
                
                if texts:
                    definition = ' '.join(texts)
                    break
        
        # If no definition found, try to get the main content
        if not definition:
            main_content = soup.find('main') or soup.find('article') or soup.find('.content')
            if main_content:
                paragraphs = main_content.find_all('p')
                texts = []
                for p in paragraphs:
                    text = p.get_text(strip=True)
                    if text and len(text) > 20:
                        texts.append(text)
                definition = ' '.join(texts)
        
        # Clean up the definition
        if definition:
            # Remove extra whitespace
            definition = re.sub(r'\s+', ' ', definition).strip()
            # Remove common navigation text
            definition = re.sub(r'(Home|Glossary|Back to glossary|Navigation|Menu)', '', definition, flags=re.IGNORECASE)
            definition = definition.strip()
        
        return definition
    
    def crawl_glossary(self) -> List[Dict[str, str]]:
        """
        Main crawling method
        
        Returns:
            List of dictionaries containing term and definition data
        """
        logger.info("Starting glossary crawl...")
        
        # Get the main glossary page
        soup = self.get_page(self.glossary_url)
        if not soup:
            logger.error("Failed to fetch glossary page")
            return []
        
        # Extract all term links
        terms = self.extract_glossary_terms(soup)
        if not terms:
            logger.warning("No terms found on glossary page")
            return []
        
        # Get definitions for each term
        results = []
        for i, term_info in enumerate(terms, 1):
            logger.info(f"Processing term {i}/{len(terms)}: {term_info['term']}")
            
            # Get the term's individual page
            term_soup = self.get_page(term_info['url'])
            if term_soup:
                definition = self.extract_definition(term_soup, term_info['term'])
                
                results.append({
                    'term': term_info['term'],
                    'definition': definition,
                    'url': term_info['url'],
                    'slug': term_info['slug']
                })
                
                logger.info(f"Extracted definition for '{term_info['term']}' ({len(definition)} chars)")
            else:
                logger.warning(f"Failed to fetch definition for '{term_info['term']}'")
                results.append({
                    'term': term_info['term'],
                    'definition': "Definition not available",
                    'url': term_info['url'],
                    'slug': term_info['slug']
                })
            
            # Rate limiting
            time.sleep(self.delay)
        
        self.terms_data = results
        logger.info(f"Crawling completed. Found {len(results)} terms.")
        return results
    
    def save_to_json(self, filename: str = "soil_glossary.json"):
        """Save results to JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.terms_data, f, indent=2, ensure_ascii=False)
        logger.info(f"Results saved to {filename}")
    
    def save_to_csv(self, filename: str = "soil_glossary.csv"):
        """Save results to CSV file"""
        if not self.terms_data:
            logger.warning("No data to save")
            return
        
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['term', 'definition', 'url', 'slug']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.terms_data)
        logger.info(f"Results saved to {filename}")
    
    def print_summary(self):
        """Print a summary of the crawled data"""
        if not self.terms_data:
            print("No data available")
            return
        
        print(f"\n=== Soil Health Benchmarks Glossary Summary ===")
        print(f"Total terms: {len(self.terms_data)}")
        print(f"Terms with definitions: {len([t for t in self.terms_data if t['definition'] and t['definition'] != 'Definition not available'])}")
        print(f"Average definition length: {sum(len(t['definition']) for t in self.terms_data) / len(self.terms_data):.0f} characters")
        
        print(f"\nFirst 5 terms:")
        for i, term in enumerate(self.terms_data[:5]):
            print(f"{i+1}. {term['term']}")
            print(f"   Definition: {term['definition'][:100]}...")
            print(f"   URL: {term['url']}")
            print()

def main():
    """Main execution function"""
    # Create crawler instance
    crawler = SoilGlossaryCrawler(delay=1.5)  # 1.5 second delay between requests
    
    # Crawl the glossary
    results = crawler.crawl_glossary()
    
    if results:
        # Save results
        crawler.save_to_json()
        crawler.save_to_csv()
        
        # Print summary
        crawler.print_summary()
        
        print("\nCrawling completed successfully!")
        print("Files saved:")
        print("- soil_glossary.json")
        print("- soil_glossary.csv")
    else:
        print("No results found. Please check the website structure.")

if __name__ == "__main__":
    main()


2025-07-08 21:56:23,536 - INFO - Starting glossary crawl...
2025-07-08 21:56:23,537 - INFO - Fetching: https://soilhealthbenchmarks.eu/glossary/
2025-07-08 21:56:24,050 - INFO - Found 141 glossary terms
2025-07-08 21:56:24,050 - INFO - Processing term 1/141: Glossary
2025-07-08 21:56:24,050 - INFO - Fetching: https://soilhealthbenchmarks.eu/glossary/
2025-07-08 21:56:24,199 - INFO - Extracted definition for 'Glossary' (3254 chars)
2025-07-08 21:56:25,709 - INFO - Processing term 2/141: Soil Health
2025-07-08 21:56:25,709 - INFO - Fetching: https://soilhealthbenchmarks.eu/glossary/4036/
2025-07-08 21:56:26,905 - INFO - Extracted definition for 'Soil Health' (1005 chars)
2025-07-08 21:56:28,409 - INFO - Processing term 3/141: ecosystem services
2025-07-08 21:56:28,409 - INFO - Fetching: https://soilhealthbenchmarks.eu/glossary/ecosystem-services/
2025-07-08 21:56:29,416 - INFO - Extracted definition for 'ecosystem services' (751 chars)
2025-07-08 21:56:30,924 - INFO - Processing term 4/1


=== Soil Health Benchmarks Glossary Summary ===
Total terms: 141
Terms with definitions: 141
Average definition length: 431 characters

First 5 terms:
1. Glossary
   Definition: Sharing knowledge and removing barriers is crucial for the success of theSoil HealthBENCHMARKS proje...
   URL: https://soilhealthbenchmarks.eu/glossary/

2. Soil Health
   Definition: «  IndexScientific definition:Soil healthis the ability of asoil, at a specified point in time, to f...
   URL: https://soilhealthbenchmarks.eu/glossary/4036/

3. ecosystem services
   Definition: «  IndexSoil HealthBenchmarks definition:The contributions of ecosystems (i.e. living systems) to hu...
   URL: https://soilhealthbenchmarks.eu/glossary/ecosystem-services/

4. soil functions
   Definition: «  IndexAre one or a combination ofsoil processesthat underpin the dynamics of theecosystemstructure...
   URL: https://soilhealthbenchmarks.eu/glossary/soil-functions/

5. soil
   Definition: «  IndexSoil HealthBenchmarks definitio

### Convert glossary to SKOS vocabs

In [8]:
def term_to_uri_fragment(term):
    """Convert a term to URI fragment by making it lowercase and replacing spaces with hyphens"""
    return term.lower().replace(' ', '-').replace('_', '-')

def extract_uri_fragment_from_url(url):
    """Extract the URI fragment from the URL, removing trailing slash"""
    if pd.isna(url) or not url:
        return None
    
    # Remove trailing slash
    url = url.rstrip('/')
    
    # Extract the last part after the last slash
    return url.split('/')[-1]

def csv_to_skos_rdf(csv_file_path, output_file_path=None):
    """Convert CSV file to SKOS RDF format using rdflib"""
    
    # Create RDF graph
    g = Graph()
    
    # Define namespaces
    BENCHMARKS = Namespace("https://soilhealthbenchmarks.eu/glossary/")
    
    # Bind namespaces to prefixes
    g.bind("benchmarks", BENCHMARKS)
    g.bind("skos", SKOS)
    
    # Read the CSV file
    df = pd.read_csv(csv_file_path)
    
    # Create mapping of lowercase terms to their URI fragments
    term_to_fragment_map = {}
    
    # First pass: create the mapping using URLs where available, otherwise generate from terms
    for _, row in df.iterrows():
        term = row['term']
        url = row['url']
        
        # Try to extract fragment from URL first
        fragment = extract_uri_fragment_from_url(url)
        if not fragment:
            # If no valid fragment from URL, generate from term
            fragment = term_to_uri_fragment(term)
        
        term_to_fragment_map[term.lower()] = fragment
    
    # Second pass: create the RDF triples
    for _, row in df.iterrows():
        term = row['term']
        definition = row['definition']
        url = row['url']
        related = row['related']
        
        # Get URI fragment
        fragment = term_to_fragment_map[term.lower()]
        concept_uri = BENCHMARKS[fragment]
        
        # Add concept type
        g.add((concept_uri, RDF.type, SKOS.Concept))
        
        # Add prefLabel (lowercase)
        g.add((concept_uri, SKOS.prefLabel, Literal(term.lower(), lang="en")))
        
        # Add definition(s) - split by "|" if multiple definitions exist
        if pd.notna(definition) and definition:
            # Split by "|" and clean up each definition
            definitions = [d.strip() for d in definition.split('|') if d.strip()]
            
            # Add each definition as a separate skos:definition triple
            for def_text in definitions:
                g.add((concept_uri, SKOS.definition, Literal(def_text, lang="en")))
        
        # Add related terms
        if pd.notna(related) and related:
            # Split by semicolon and clean up
            related_terms = [t.strip() for t in related.split(';') if t.strip()]
            
            for related_term in related_terms:
                # Try to find matching term in our mapping (case-insensitive)
                related_term_lower = related_term.lower()
                if related_term_lower in term_to_fragment_map:
                    related_fragment = term_to_fragment_map[related_term_lower]
                else:
                    # If not found, create fragment directly from the term
                    related_fragment = term_to_uri_fragment(related_term)
                
                related_uri = BENCHMARKS[related_fragment]
                g.add((concept_uri, SKOS.related, related_uri))
    
    # Serialize to Turtle format
    turtle_content = g.serialize(format='turtle')
    
    # Write to file if output path provided
    if output_file_path:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(turtle_content)
        print(f"RDF Turtle file saved to: {output_file_path}")
    
    return turtle_content, g

def print_sample_concepts(graph, namespace, num_concepts=3):
    """Print sample concepts from the graph"""
    print(f"\nSample concepts (first {num_concepts}):")
    print("=" * 50)
    
    # Get all concepts
    concepts = list(graph.subjects(RDF.type, SKOS.Concept))
    
    for i, concept in enumerate(concepts[:num_concepts]):
        print(f"\nConcept {i+1}: {concept}")
        
        # Get prefLabel
        pref_labels = list(graph.objects(concept, SKOS.prefLabel))
        if pref_labels:
            print(f"  skos:prefLabel: {pref_labels[0]}")
        
        # Get definitions (may be multiple)
        definitions = list(graph.objects(concept, SKOS.definition))
        if definitions:
            if len(definitions) == 1:
                definition = str(definitions[0])
                # Truncate long definitions for display
                if len(definition) > 100:
                    definition = definition[:100] + "..."
                print(f"  skos:definition: {definition}")
            else:
                print(f"  skos:definition ({len(definitions)} definitions):")
                for j, def_text in enumerate(definitions, 1):
                    definition = str(def_text)
                    if len(definition) > 80:
                        definition = definition[:80] + "..."
                    print(f"    {j}. {definition}")
        
        # Get related terms
        related_terms = list(graph.objects(concept, SKOS.related))
        if related_terms:
            print(f"  skos:related: {', '.join(str(rt) for rt in related_terms)}")
        
        print("-" * 40)

def validate_ecosystem_services_example(graph, namespace):
    """Validate the ecosystem services example matches the expected format"""
    print("\nValidating 'ecosystem services' example:")
    print("=" * 50)
    
    # Find the ecosystem services concept
    ecosystem_services_uri = namespace['ecosystem-services']
    
    # Check if it exists
    if (ecosystem_services_uri, RDF.type, SKOS.Concept) in graph:
        print(f"✓ Found concept: {ecosystem_services_uri}")
        
        # Check prefLabel
        pref_labels = list(graph.objects(ecosystem_services_uri, SKOS.prefLabel))
        if pref_labels and str(pref_labels[0]) == "ecosystem services":
            print(f"✓ Correct prefLabel: {pref_labels[0]}")
        else:
            print(f"✗ Incorrect prefLabel: {pref_labels}")
        
        # Check definitions (may be multiple)
        definitions = list(graph.objects(ecosystem_services_uri, SKOS.definition))
        if definitions:
            print(f"✓ Has {len(definitions)} definition(s):")
            for i, def_text in enumerate(definitions, 1):
                def_str = str(def_text)
                if len(def_str) > 80:
                    def_str = def_str[:80] + "..."
                print(f"    {i}. {def_str}")
        else:
            print("✗ Missing definitions")
        
        # Check related terms
        related_terms = list(graph.objects(ecosystem_services_uri, SKOS.related))
        expected_related = ['provisioning', 'regulation-and-maintenance', 'cultural']
        
        found_related = [str(rt).split('/')[-1] for rt in related_terms]
        if all(expected in found_related for expected in expected_related):
            print(f"✓ Correct related terms: {found_related}")
        else:
            print(f"✗ Incorrect related terms. Expected: {expected_related}, Found: {found_related}")
        
    else:
        print("✗ Ecosystem services concept not found")

# Example usage
if __name__ == "__main__":
    # Convert the CSV file to SKOS RDF
    print("Converting CSV to SKOS RDF using rdflib...")
    turtle_content, graph = csv_to_skos_rdf('soil_glossary.csv', 'soil_glossary.ttl')
    
    # Get the namespace for validation
    BENCHMARKS = Namespace("https://soilhealthbenchmarks.eu/glossary/")
    
    # Print statistics
    print(f"\nConversion complete!")
    print(f"Total triples: {len(graph)}")
    print(f"Total concepts: {len(list(graph.subjects(RDF.type, SKOS.Concept)))}")
    
    # Print sample concepts
    print_sample_concepts(graph, BENCHMARKS)
    
    # Validate the ecosystem services example
    validate_ecosystem_services_example(graph, BENCHMARKS)
    
    # Print the first part of the Turtle serialization
    print("\nFirst 50 lines of Turtle output:")
    print("=" * 50)
    lines = turtle_content.split('\n')
    for i, line in enumerate(lines[:50]):
        print(f"{i+1:2d}: {line}")
    
    if len(lines) > 50:
        print(f"... and {len(lines) - 50} more lines")

Converting CSV to SKOS RDF using rdflib...
RDF Turtle file saved to: soil_glossary.ttl

Conversion complete!
Total triples: 439
Total concepts: 140

Sample concepts (first 3):

Concept 1: https://soilhealthbenchmarks.eu/glossary/4036
  skos:prefLabel: soil health
  skos:definition (2 definitions):
    1. Soil health is the ability of a soil, at a specified point in time, to function ...
    2. The physical, chemical and biological condition of the soil determining its capa...
----------------------------------------

Concept 2: https://soilhealthbenchmarks.eu/glossary/ecosystem-services
  skos:prefLabel: ecosystem services
  skos:definition (2 definitions):
    1. The contributions of ecosystems (i.e. living systems) to human well-being.
    2. Indirect contributions of ecosystems to the economic, social,culturaland other b...
  skos:related: https://soilhealthbenchmarks.eu/glossary/provisioning, https://soilhealthbenchmarks.eu/glossary/regulation-and-maintenance, https://soilhealthben

### Interlink SKOS vocabs with other thesauri

In [5]:
# Function to normalize British English to American English
def normalize_uk_to_us(label):
    for uk_spelling, us_spelling in uk_us:
        # Use regular expressions to replace only full words
        label = re.sub(rf'\b{uk_spelling}\b', us_spelling, label)
    return label

def concept2vocabs(g, file_path):
    # Load the CSV file
    pref_label_map = {}
    alt_label_map = {}
    with open(file_path, encoding="utf8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            new_uri = row['concept']
            pref_label = row['prefLabel'].lower()
            alt_labels = row['altLabels'].split(';') if row['altLabels'] else []
            pref_label_map[pref_label] = new_uri
            for alt_label in alt_labels:
                alt_label = alt_label.strip().lower()  # Clean and normalize altLabel
                if alt_label:
                    alt_label_map[alt_label] = new_uri

    # Iterate over the graph and process matching prefLabel and altLabel
    for s, p, o in g:
        label = str(o).lower()
        normalized_label = normalize_uk_to_us(label)  # Apply normalization

        if p == SKOS.prefLabel and normalized_label in pref_label_map:
            # Add skos:exactMatch triples
            exact_match_uri = rdflib.URIRef(pref_label_map[normalized_label])
            g.add((s, SKOS.exactMatch, exact_match_uri))
        elif p == SKOS.prefLabel and normalized_label in alt_label_map:
            # Add skos:closeMatch triples
            close_match_uri = rdflib.URIRef(alt_label_map[normalized_label])
            g.add((s, SKOS.closeMatch, close_match_uri))

    return g

In [None]:
# Link to AGROVOC vocabs
graph = concept2vocabs(graph, "ontovocabs/agrovoc.csv")
graph.bind("agrovoc", agrovoc)

In [None]:
# Link to ISO11074
graph = concept2vocabs(graph, "ontovocabs/ISO11074v2025.csv")
graph.bind("iso11074", iso11074)

In [None]:
# Link to INRAE Thesaurus
graph = concept2vocabs(graph, "ontovocabs/inrae.csv")
graph.bind("inrae", inrae)

In [None]:
# Link to GEMET Thesaurus
graph = concept2vocabs(graph, "ontovocabs/gemet.csv")
graph.bind("gemet", gemet)

In [14]:
# Serialize the final graph to Turtle format
graph.serialize(destination="soil_health_benchmarks.ttl", format="turtle")

<Graph identifier=Nd12457f50a4b4f518c2850bd248ebff0 (<class 'rdflib.graph.Graph'>)>

### Interlink SKOS vocabs with soil health KG

In [4]:
def extract_skos_concepts_to_csv(ttl_file_path, output_csv_path):
    """
    Extract SKOS concepts from a TTL file and save to CSV.
    
    Args:
        ttl_file_path (str): Path to the input TTL file
        output_csv_path (str): Path to the output CSV file
    """
    
    # Create a graph and parse the TTL file
    g = Graph()
    g.parse(ttl_file_path, format='turtle')
    
    # Define namespaces
    AGRONTOLOGY = Namespace("http://aims.fao.org/aos/agrontology#")
    
    # Bind namespaces for cleaner output (optional)
    g.bind("skos", SKOS)
    g.bind("agrontology", AGRONTOLOGY)
    
    # Query for all SKOS concepts
    concepts_data = []
    
    # Find all subjects that are of type skos:Concept
    for concept_uri in g.subjects(predicate=URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), 
                                 object=SKOS.Concept):
        
        # Get prefLabel (should be unique)
        pref_label = ""
        for pref in g.objects(concept_uri, SKOS.prefLabel):
            pref_label = str(pref)
            break  # Take the first one if multiple exist
        
        # Get altLabels and abbreviations
        alt_labels = []
        
        # Collect skos:altLabel
        for alt in g.objects(concept_uri, SKOS.altLabel):
            alt_labels.append(str(alt))
        
        # Collect agrontology:hasAbbreviation
        for abbrev in g.objects(concept_uri, AGRONTOLOGY.hasAbbreviation):
            alt_labels.append(str(abbrev))
        
        # Join alternative labels with semicolon
        alt_labels_str = ";".join(alt_labels) if alt_labels else ""
        
        # Add to results
        concepts_data.append([
            str(concept_uri),  # Full URI
            pref_label,        # Preferred label
            alt_labels_str     # Alternative labels/abbreviations
        ])
    
    # Sort by URI for consistent output
    concepts_data.sort(key=lambda x: x[0])
    
    # Write to CSV
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        
        # Write header
        writer.writerow(['URI', 'prefLabel', 'altLabel_abbreviation'])
        
        # Write data
        writer.writerows(concepts_data)
    
    print(f"Extracted {len(concepts_data)} SKOS concepts to {output_csv_path}")
    return len(concepts_data)

def extract_with_sparql(ttl_file_path, output_csv_path):
    """
    Alternative implementation using SPARQL query.
    This might be more efficient for large files.
    """
    
    # Create a graph and parse the TTL file
    g = Graph()
    g.parse(ttl_file_path, format='turtle')
    
    # Define namespaces
    g.bind("skos", SKOS)
    g.bind("agrontology", "http://aims.fao.org/aos/agrontology#")
    
    # SPARQL query to get all concepts with their labels
    query = """
    SELECT ?concept ?prefLabel 
           (GROUP_CONCAT(DISTINCT ?altLabel; separator=";") AS ?altLabels)
           (GROUP_CONCAT(DISTINCT ?abbreviation; separator=";") AS ?abbreviations)
    WHERE {
        ?concept a skos:Concept .
        
        OPTIONAL { ?concept skos:prefLabel ?prefLabel }
        OPTIONAL { ?concept skos:altLabel ?altLabel }
        OPTIONAL { ?concept agrontology:hasAbbreviation ?abbreviation }
    }
    GROUP BY ?concept ?prefLabel
    ORDER BY ?concept
    """
    
    # Execute query
    results = g.query(query)
    
    # Process results
    concepts_data = []
    for row in results:
        concept_uri = str(row.concept)
        pref_label = str(row.prefLabel) if row.prefLabel else ""
        
        # Combine altLabels and abbreviations
        alt_parts = []
        if row.altLabels and str(row.altLabels) != "None":
            alt_parts.extend(str(row.altLabels).split(";"))
        if row.abbreviations and str(row.abbreviations) != "None":
            alt_parts.extend(str(row.abbreviations).split(";"))
        
        # Remove empty strings and join
        alt_parts = [part.strip() for part in alt_parts if part.strip()]
        alt_labels_str = ";".join(alt_parts)
        
        concepts_data.append([concept_uri, pref_label, alt_labels_str])
    
    # Write to CSV
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URI', 'prefLabel', 'altLabel_abbreviation'])
        writer.writerows(concepts_data)
    
    print(f"Extracted {len(concepts_data)} SKOS concepts to {output_csv_path}")
    return len(concepts_data)

# Example usage
if __name__ == "__main__":
    # Replace with your actual file paths
    ttl_file = "soil_health_KG.ttl"
    csv_output = "soil_health_KG.csv"
    
    try:
        # Use the first method (iterative approach)
        count = extract_skos_concepts_to_csv(ttl_file, csv_output)
        
        # Alternatively, you can use the SPARQL approach:
        # count = extract_with_sparql(ttl_file, csv_output)
        
        print(f"Successfully processed {count} concepts")
        
    except Exception as e:
        print(f"Error processing file: {e}")
        print("Make sure you have rdflib installed: pip install rdflib")

Extracted 1789 SKOS concepts to soil_health_KG.csv
Successfully processed 1789 concepts


In [7]:
# Link to soil health KG
graph = concept2vocabs(graph, "ontovocabs/soil_health_KG.csv")
graph.bind("she", she)

In [8]:
# Serialize the final graph to Turtle format
graph.serialize(destination="soil_health_benchmarks.ttl", format="turtle")

<Graph identifier=N1105f5023ca44a649ed017f67f020386 (<class 'rdflib.graph.Graph'>)>