In [1]:
import json
import os
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode, Dataset
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, split_uri

from openai import OpenAI
import re

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import pandas as pd

import time
from typing import Dict, Any, Optional, Tuple, List
import logging
import sys

In [2]:
# Load API credentials from the local JSON file named 'config'.
# Every key read below must be present, so the expected structure is:
# {"openai_api_key": "...", "gemini_api_key": "...", "xai_api_key": "...",
#  "nvidia_api_key": "...", "deepseek_api_key": "...", "claude_api_key": "...",
#  "dashscope_api_key": "..."}

# A context manager closes the file handle as soon as the JSON is parsed.
with open('config', 'r') as config_file:
    config = json.load(config_file)

# Export each credential as an environment variable.
os.environ['OPENAI_API_KEY'] = config['openai_api_key']
os.environ['GEMINI_API_KEY'] = config['gemini_api_key']
os.environ['XAI_API_KEY'] = config['xai_api_key']
os.environ['NVIDIA_API_KEY'] = config['nvidia_api_key']
os.environ['DEEPSEEK_API_KEY'] = config['deepseek_api_key']
os.environ['ANTHROPIC_API_KEY'] = config['claude_api_key']
os.environ['DASHSCOPE_API_KEY'] = config['dashscope_api_key']

In [3]:
def load_graph(data):
    """Parse Turtle-serialised RDF text into a fresh rdflib graph.

    Args:
        data: RDF content in Turtle syntax, handed to ``Graph.parse`` as ``data``.

    Returns:
        The ``rdflib.Graph`` that received the parsed triples.
    """
    graph = rdflib.Graph()
    graph.parse(format="turtle", data=data)
    return graph

In [4]:
def print_rdf(rdf):
    """Print every triple contained in Turtle-serialised RDF text.

    Args:
        rdf: RDF content in Turtle syntax.

    Each triple is printed on its own line as ``subject predicate object``.
    """
    graph = rdflib.Graph()
    graph.parse(format="turtle", data=rdf)

    for triple in graph:
        subject, predicate, obj = triple
        print(subject, predicate, obj)

In [None]:
# Namespaces
# Each name below is an rdflib Namespace rooted at the given IRI; terms are
# built from it by attribute or item access (e.g. agrovoc["c_7156"]).
she = Namespace("https://soilwise-he.github.io/soil-health#")
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc/")
agrontology = Namespace("http://aims.fao.org/aos/agrontology#")
sio = Namespace("http://semanticscience.org/resource/")
glosis_lh = Namespace("http://w3id.org/glosis/model/layerhorizon/")
glosis_sp = Namespace("http://w3id.org/glosis/model/siteplot/")
qudt = Namespace("http://qudt.org/schema/qudt/")
unit = Namespace("http://qudt.org/vocab/unit/")
iso11074 = Namespace("https://data.geoscience.earth/ncl/ISO11074v2025/")
obo = Namespace("http://purl.obolibrary.org/obo/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
biolink = Namespace("https://w3id.org/biolink/vocab/")
afox = Namespace("http://purl.allotrope.org/ontologies/property#")
afor = Namespace("http://purl.allotrope.org/ontologies/result#")
sorelsc = Namespace("http://sweetontology.net/relaSci/")
sorelpr = Namespace("http://sweetontology.net/relaProvenance/")
sohuj = Namespace("http://sweetontology.net/humanJurisdiction/")
sorelph = Namespace("http://sweetontology.net/relaPhysical/")
sorelm = Namespace("http://sweetontology.net/relaMath/")
sorepsg = Namespace("http://sweetontology.net/reprSpaceGeometry/")
bao = Namespace("http://www.bioassayontology.org/bao#")
# NOTE(review): this rebinds the builtin `repr` for the rest of the notebook
# session; any later call to repr(...) will hit this Namespace instead.
# Consider renaming once all usages of this prefix are known.
repr = Namespace("https://w3id.org/reproduceme#")
sorelch = Namespace("http://sweetontology.net/relaChemical/")
sorelsp = Namespace("http://sweetontology.net/relaSpace/")
om = Namespace("http://www.ontology-of-units-of-measure.org/resource/om-2/")
gemet = Namespace("http://www.eionet.europa.eu/gemet/concept/")
inrae = Namespace("http://opendata.inrae.fr/thesaurusINRAE/")

### Vocabulary vs. non-vocabulary terms

In [None]:
def extract_skos_concepts_to_csv(ttl_file_path, output_csv_path):
    """
    Extract SKOS concepts from a TTL file and save to CSV.

    One CSV row is written per subject typed ``skos:Concept``, with the columns
    ``URI``, ``prefLabel`` and ``altLabel_abbreviation``. The last column holds
    all ``skos:altLabel`` values followed by all ``hasAbbreviation`` values,
    joined with ``;``. Rows are sorted by URI.

    Args:
        ttl_file_path (str): Path to the input TTL file
        output_csv_path (str): Path to the output CSV file

    Returns:
        int: Number of concepts written to the CSV file.
    """

    # Create a graph and parse the TTL file
    g = Graph()
    g.parse(ttl_file_path, format='turtle')

    # Same IRI as the notebook-level `agrontology` namespace. The previous
    # revision queried a different IRI, so abbreviations stored under this one
    # were never collected.
    AGRONTOLOGY = Namespace("http://aims.fao.org/aos/agrontology#")
    # IRI used by the previous revision; still consulted so that graphs which
    # do use it keep producing the same rows.
    LEGACY_AGRONTOLOGY = Namespace("http://www.semanticweb.org/agrontology/")
    abbreviation_predicates = (
        AGRONTOLOGY.hasAbbreviation,
        LEGACY_AGRONTOLOGY.hasAbbreviation,
    )

    # Bind namespaces for cleaner output (optional; only affects serialisation)
    g.bind("skos", SKOS)
    g.bind("agrontology", AGRONTOLOGY)

    concepts_data = []

    # Find all subjects that are of type skos:Concept
    for concept_uri in g.subjects(predicate=RDF.type, object=SKOS.Concept):

        # Take the first prefLabel the graph yields; empty string if none exists
        pref_label = next(
            (str(label) for label in g.objects(concept_uri, SKOS.prefLabel)),
            "",
        )

        # skos:altLabel values first, abbreviations afterwards
        alt_labels = [str(alt) for alt in g.objects(concept_uri, SKOS.altLabel)]
        for predicate in abbreviation_predicates:
            alt_labels.extend(str(abbrev) for abbrev in g.objects(concept_uri, predicate))

        concepts_data.append([
            str(concept_uri),      # Full URI
            pref_label,            # Preferred label
            ";".join(alt_labels),  # Alternative labels/abbreviations ("" when none)
        ])

    # Sort by URI for consistent output
    concepts_data.sort(key=lambda row: row[0])

    # Write header and data rows
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URI', 'prefLabel', 'altLabel_abbreviation'])
        writer.writerows(concepts_data)

    print(f"Extracted {len(concepts_data)} SKOS concepts to {output_csv_path}")
    return len(concepts_data)

# Cell driver: export every SKOS concept of the knowledge graph to a CSV file.
if __name__ == "__main__":
    # Replace with your actual file paths
    ttl_file = "soil_health_KG.ttl"
    csv_output = "skos_concepts.csv"
    
    try:
        # Returns the number of concepts that were written
        count = extract_skos_concepts_to_csv(ttl_file, csv_output)
        
        print(f"Successfully processed {count} concepts")
        
    # NOTE(review): this catches every exception, so the rdflib hint below is
    # also printed for unrelated failures (e.g. a missing or malformed TTL file).
    except Exception as e:
        print(f"Error processing file: {e}")
        print("Make sure you have rdflib installed: pip install rdflib")

Extracted 1786 SKOS concepts to skos_concepts.csv
Successfully processed 1786 concepts


In [None]:
def extract_skos_concepts_with_matches(ttl_file_path, output_csv_path):
    """
    Collect the URIs of SKOS concepts that carry a skos:exactMatch or
    skos:closeMatch statement and write them to a one-column CSV file.

    Args:
        ttl_file_path (str): Path to the TTL file containing the RDF knowledge graph
        output_csv_path (str): Path where the CSV file will be saved

    Returns:
        The sorted list of matching concept URIs (as strings), or None when the
        TTL file cannot be parsed or the CSV file cannot be written.
    """

    # Load the knowledge graph; give up early if it cannot be parsed
    graph = Graph()
    try:
        graph.parse(ttl_file_path, format='turtle')
        print(f"Successfully loaded {len(graph)} triples from {ttl_file_path}")
    except Exception as e:
        print(f"Error loading TTL file: {e}")
        return

    skos = Namespace("http://www.w3.org/2004/02/skos/core#")

    # A subject qualifies when it is an IRI (not a blank node) that is typed
    # skos:Concept; the set removes subjects found through both predicates.
    matched_uris = {
        str(subject)
        for predicate in (skos.exactMatch, skos.closeMatch)
        for subject in graph.subjects(predicate, None)
        if isinstance(subject, URIRef)
        and (subject, rdflib.RDF.type, skos.Concept) in graph
    }

    # Sorted for consistent output
    concepts_list = sorted(matched_uris)

    print(f"Found {len(concepts_list)} unique concepts with exactMatch or closeMatch properties")

    # Header row followed by one URI per row
    try:
        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['concept_uri'])
            writer.writerows([uri] for uri in concepts_list)

        print(f"Successfully saved concept URIs to {output_csv_path}")

    except Exception as e:
        print(f"Error saving CSV file: {e}")
        return

    return concepts_list

# Cell driver: list the concepts that already have an exactMatch/closeMatch link.
if __name__ == "__main__":
    # Basic version - just concept URIs
    ttl_file = "soil_health_KG.ttl"  # Replace with your TTL file path
    output_csv = "skos_concepts_with_matches.csv"
    
    # `concepts` is the sorted URI list, or None if loading/saving failed
    concepts = extract_skos_concepts_with_matches(ttl_file, output_csv)

Successfully loaded 11703 triples from soil_health_KG.ttl
Found 462 unique concepts with exactMatch or closeMatch properties
Successfully saved concept URIs to skos_concepts_with_matches.csv


In [6]:
def analyze_uri_sets(input_csv_path, output_dir="./"):
    """
    Analyze URIs from two columns and create separate CSV files for each set operation result.

    The input CSV must contain the columns ``matching`` and ``llm``. Missing
    values are dropped, the remaining values are compared as strings, and four
    files are written to ``output_dir``: ``uri_union.csv``,
    ``uri_intersection.csv``, ``uri_matching_only.csv`` and ``uri_llm_only.csv``,
    each with a single sorted ``URI`` column.

    Args:
        input_csv_path (str): Path to the input CSV file
        output_dir (str): Directory to save output files. A trailing path
            separator is not required, and the directory is created if missing.

    Returns:
        None. Progress and errors are reported on stdout.
    """

    # Read the CSV file
    try:
        df = pd.read_csv(input_csv_path)
        print(f"Successfully loaded CSV with {len(df)} rows")
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # Check if required columns exist
    if 'matching' not in df.columns or 'llm' not in df.columns:
        print("Error: Required columns 'matching' and 'llm' not found in CSV")
        return

    # Remove NaN values and convert to sets
    matching_set = set(df['matching'].dropna().astype(str))
    llm_set = set(df['llm'].dropna().astype(str))

    print(f"Number of unique URIs in matching column: {len(matching_set)}")
    print(f"Number of unique URIs in llm column: {len(llm_set)}")

    # Perform set operations (keys double as the output file name suffixes)
    sets_data = {
        'union': matching_set | llm_set,
        'intersection': matching_set & llm_set,
        'matching_only': matching_set - llm_set,
        'llm_only': llm_set - matching_set,
    }

    print(f"\nSet operation results:")
    print(f"Union (all unique URIs): {len(sets_data['union'])}")
    print(f"Intersection (URIs in both columns): {len(sets_data['intersection'])}")
    print(f"matching only: {len(sets_data['matching_only'])}")
    print(f"llm only: {len(sets_data['llm_only'])}")

    # Make sure the target directory exists ("" means the current directory)
    if output_dir:
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"Error creating output directory {output_dir}: {e}")
            return

    # Save each set to a separate CSV file
    failed_files = []
    for set_name, uri_set in sets_data.items():
        # os.path.join inserts a separator only when output_dir lacks one, so
        # "out" and "out/" both resolve to files inside the directory.
        filename = os.path.join(output_dir, f"uri_{set_name}.csv")
        try:
            pd.DataFrame({'URI': sorted(uri_set)}).to_csv(filename, index=False)
            print(f"Saved {len(uri_set)} URIs to {filename}")
        except Exception as e:
            failed_files.append(filename)
            print(f"Error saving {filename}: {e}")

    # Only claim success when every file was actually written
    if failed_files:
        print(f"\n{len(failed_files)} file(s) could not be saved: {', '.join(failed_files)}")
    else:
        print(f"\nAll files saved successfully to directory: {output_dir}")

# Example usage
# Cell driver: compare the 'matching' and 'llm' URI columns of union.csv and
# write the union / intersection / difference sets as separate CSV files.
if __name__ == "__main__":
    # Replace with your actual file paths
    input_file = "union.csv"
    output_directory = "./"  # Current directory, change as needed
    
    analyze_uri_sets(input_file, output_directory)

Successfully loaded CSV with 758 rows
Number of unique URIs in matching column: 758
Number of unique URIs in llm column: 738

Set operation results:
Union (all unique URIs): 1093
Intersection (URIs in both columns): 403
matching only: 355
llm only: 335
Saved 1093 URIs to ./uri_union.csv
Saved 403 URIs to ./uri_intersection.csv
Saved 355 URIs to ./uri_matching_only.csv
Saved 335 URIs to ./uri_llm_only.csv

All files saved successfully to directory: ./


#### LLM-as-a-judge

In [6]:
# System prompt for the LLM-as-a-judge step: it defines the "Vocabulary" /
# "Un-vocabulary" classification rules and asks for a confidence score.
# The text is sent verbatim to the model, so edits here change model behaviour.
system_prompt_voc = """You are an expert AI assistant specializing in soil science and controlled vocabulary development. Your task is to analyze a given term and determine if it is suitable for inclusion in a formal, standardized soil science vocabulary.

**Your Goal:**
Classify each term you receive into one of two categories: "Vocabulary" or "Un-vocabulary".

**Definitions and Rules:**

1. **"Vocabulary" Term:**
   
   * Represents a standardized, reusable, and generic concept *within the domain of soil science*.
   * It is often a general concept that can have specific instances, values, or measurements.
   * It can be singular or plural.
   * Abbreviations or standard acronyms that refer directly to those concepts (e.g. `SOC`, `DDT`).
   * It should be a noun or a noun phrase that is broadly recognized and used in soil science literature, without evaluative or descriptive adjectives (avoid “high”, “moderate”, “low”, etc.).
   * *Examples of Vocabulary Terms:* `soil organic carbon`, `cation exchange capacity`, `soil texture`, `bulk density`, `soil horizon`, `parent material`, `silt loam`.
2. **"Un-vocabulary" Term:**
   A term is classified as "Un-vocabulary" if it meets **any** of the following criteria:
   
   * **Evaluative/descriptive instances:** It represents a specific *measurement*, *qualitative state*, or *quantitative description* of a vocabulary term (e.g. “moderate soil organic carbon content”, “high bulk density”, “poor CEC”).
   * **Too broad or out of scope:** The term is a generic concept that is not specific to soil science and lacks a direct, unique meaning within the domain (e.g. “time”, “location” when unqualified).
   * **Context-specific phrases:** The term is phrased as a statement or sentence fragment rather than a standardized standalone noun concept (e.g. “agricultural area under severe erosion”).
3. **Confidence:**
   
   * Provide a confidence score between 0 and 1 reflecting how certain you are in your Vocabulary/Un-vocabulary decision.

**Output Format:**
For every term you are given, you MUST respond in the strict JSON format. Do not add any extra conversation or pleasantries."""

In [8]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class TermProcessor:
    """
    Ask an LLM, term by term, whether each row of a CSV file names a
    controlled-vocabulary term, and store the verdicts in an output CSV.

    Input rows are read positionally: column 0 is the URL, column 1 the
    preferred label and column 2 the alternative label.
    """

    # Structured-output schema sent with every request; the reply is a JSON
    # object with exactly the keys term / is_vocab_term / confidence_score.
    RESPONSE_FORMAT = {
        "type": "json_schema",
        "json_schema": {
            "name": "soil_vocab_review",
            "schema": {
                "type": "object",
                "properties": {
                    "term": {
                        "type": "string",
                        "description": "The specific term being evaluated, exactly as input."
                    },
                    "is_vocab_term": {
                        "type": "boolean",
                        "description": "Whether the term should be included in the controlled vocabulary for soil science."
                    },
                    "confidence_score": {
                        "type": "number",
                        "description": "Confidence score of the judgement, from 0 to 1 (inclusive).",
                        "minimum": 0,
                        "maximum": 1
                    }
                },
                "required": [
                    "term",
                    "is_vocab_term",
                    "confidence_score"
                ],
                "additionalProperties": False
            },
            "strict": True
        }
    }

    def __init__(self, system_prompt: str, user_prompt_template: str, model: str = "gpt-4.1"):
        """
        Initialize the processor with prompts

        Args:
            system_prompt: The system prompt (unchanged for all calls)
            user_prompt_template: Template for user prompt with {term} placeholder
            model: Name of the chat-completion model to call
        """
        self.client = OpenAI()
        self.system_prompt = system_prompt
        self.user_prompt_template = user_prompt_template
        self.model = model

    def get_term_from_row(self, row: pd.Series) -> str:
        """
        Extract the term from a CSV row (preferred label, fallback to alternative label)

        Args:
            row: Pandas Series representing a CSV row

        Returns:
            The stripped term to process, or "" when both labels are missing/blank
        """
        # Assuming columns are: URL, preferred_label, alternative_label
        preferred_label = row.iloc[1] if len(row) > 1 else ""
        alternative_label = row.iloc[2] if len(row) > 2 else ""

        # First label that is neither NaN nor blank wins
        for label in (preferred_label, alternative_label):
            if pd.notna(label) and str(label).strip():
                return str(label).strip()
        return ""

    def call_llm_api(self, term: str) -> Optional[Dict[str, Any]]:
        """
        Call the LLM API with the given term

        Args:
            term: The term to evaluate

        Returns:
            Parsed JSON response from the LLM, or None if the request or the
            parsing failed (the error is logged)
        """
        try:
            # Create the user prompt with the term
            prompt_voc = self.user_prompt_template.format(term=term)

            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": prompt_voc}
                ],
                response_format=self.RESPONSE_FORMAT
            )

            # Parse the JSON response
            response_content = completion.choices[0].message.content
            return json.loads(response_content)

        except Exception as e:
            logger.error(f"Error calling LLM API for term '{term}': {str(e)}")
            return None

    @staticmethod
    def _row_metadata(row: pd.Series, row_index: int) -> Dict[str, Any]:
        """Return the original-row columns that are attached to every result record."""
        return {
            'original_url': row.iloc[0] if len(row) > 0 else "",
            'preferred_label': row.iloc[1] if len(row) > 1 else "",
            'alternative_label': row.iloc[2] if len(row) > 2 else "",
            'row_index': row_index,
        }

    def process_csv(self, input_csv_path: str, output_csv_path: str,
                   delay_seconds: float = 1.0, resume_from_row: int = 0):
        """
        Process the CSV file and generate results

        Args:
            input_csv_path: Path to input CSV file
            output_csv_path: Path to output CSV file
            delay_seconds: Delay between API calls to respect rate limits
            resume_from_row: Row number to resume from (0-indexed). Results
                already stored for earlier rows are kept; stored results for
                this row and later ones are discarded and recomputed.
        """
        # Read the input CSV
        try:
            df = pd.read_csv(input_csv_path)
            logger.info(f"Loaded CSV with {len(df)} rows")
        except Exception as e:
            logger.error(f"Error reading CSV file: {str(e)}")
            return

        # Prepare results list
        results = []

        # Load existing results if resuming
        if resume_from_row > 0:
            try:
                existing_df = pd.read_csv(output_csv_path)
                # Keep only rows before the resume point; otherwise rows that
                # are processed again below would appear twice in the output.
                if 'row_index' in existing_df.columns:
                    existing_df = existing_df[existing_df['row_index'] < resume_from_row]
                results = existing_df.to_dict('records')
                logger.info(f"Resuming from row {resume_from_row}, loaded {len(results)} existing results")
            except (FileNotFoundError, pd.errors.EmptyDataError):
                logger.warning(f"Output file {output_csv_path} not found, starting fresh")
                resume_from_row = 0

        # Process each row starting from resume_from_row. read_csv assigns a
        # 0-based RangeIndex, so the index label is the row position.
        for actual_idx, row in df.iloc[resume_from_row:].iterrows():
            # Get the term from the row
            term = self.get_term_from_row(row)

            if not term:
                logger.warning(f"Row {actual_idx}: No valid term found, skipping")
                continue

            logger.info(f"Processing row {actual_idx}: '{term}'")

            # Call LLM API
            result = self.call_llm_api(term)
            metadata = self._row_metadata(row, actual_idx)

            if result:
                # Add original row data to the result
                result.update(metadata)
                results.append(result)
                logger.info(f"Row {actual_idx}: Success - is_vocab_term: {result['is_vocab_term']}, confidence: {result['confidence_score']}")
            else:
                # Add error entry so the failed row stays visible in the output
                results.append({
                    'term': term,
                    'is_vocab_term': None,
                    'confidence_score': None,
                    **metadata,
                    'error': 'API call failed'
                })
                logger.error(f"Row {actual_idx}: Failed to process term '{term}'")

            # Save results periodically (every 10 rows)
            if len(results) % 10 == 0:
                self.save_results(results, output_csv_path)
                logger.info(f"Saved intermediate results ({len(results)} rows)")

            # Delay between API calls
            if delay_seconds > 0:
                time.sleep(delay_seconds)

        # Save final results
        self.save_results(results, output_csv_path)
        logger.info(f"Processing complete. Results saved to {output_csv_path}")

        # Print summary. pd.notna is used because records reloaded from CSV
        # carry NaN (not None) for failed rows.
        verdicts = [r.get('is_vocab_term') for r in results]
        successful_calls = sum(1 for v in verdicts if pd.notna(v))
        failed_calls = len(results) - successful_calls
        vocab_terms = sum(1 for v in verdicts if pd.notna(v) and bool(v))

        logger.info(f"Summary: {successful_calls} successful, {failed_calls} failed, {vocab_terms} vocab terms identified")

    def save_results(self, results: list, output_csv_path: str):
        """Save results to CSV file (errors are logged, not raised)"""
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(output_csv_path, index=False)
        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")

def main():
    """Run the LLM vocabulary check over the configured input CSV.

    Builds a TermProcessor from the notebook's system prompt and a one-line
    user prompt template, then processes every row of the input file.
    """
    # Run settings, passed to process_csv as keyword arguments
    run_settings = {
        "input_csv_path": "ontovocabs/soil_health_KG.csv",  # Change this to your input file path
        "output_csv_path": "llm_results.csv",  # Change this to your desired output file path
        "delay_seconds": 1.0,  # Delay between API calls (adjust based on rate limits)
        "resume_from_row": 0,  # Set to row number if resuming from interruption
    }

    # {term} is filled in per row by the processor
    user_prompt_template = """Now please determine if the following term is a vocabulary term or un-vocabulary term: {term}
    """

    term_processor = TermProcessor(system_prompt_voc, user_prompt_template)
    term_processor.process_csv(**run_settings)

if __name__ == "__main__":
    main()

2025-07-14 17:11:37,022 - INFO - Loaded CSV with 1787 rows
2025-07-14 17:11:37,025 - INFO - Processing row 0: 'abiotic environment'
2025-07-14 17:11:37,940 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:37,957 - INFO - Row 0: Success - is_vocab_term: False, confidence: 0.8
2025-07-14 17:11:38,960 - INFO - Processing row 1: 'abundance'
2025-07-14 17:11:39,875 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:39,896 - INFO - Row 1: Success - is_vocab_term: False, confidence: 0.85
2025-07-14 17:11:40,899 - INFO - Processing row 2: 'abundance of species populations'
2025-07-14 17:11:42,676 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-14 17:11:42,690 - INFO - Row 2: Success - is_vocab_term: False, confidence: 0.85
2025-07-14 17:11:43,702 - INFO - Processing row 3: 'acceptable risk levels'
2025-07-14 17:11:44,241 - INFO - HTTP 

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class LLMResultsPostProcessor:
    """
    Interactive filter for the LLM results CSV.

    Loads the results, drops invalid rows, prints summary statistics, asks the
    user for a verdict filter and a confidence range, and writes the selected
    rows to a new CSV file.
    """

    def __init__(self, input_csv_path: str):
        """
        Initialize the post-processor with the LLM results CSV file

        Args:
            input_csv_path: Path to the CSV file containing LLM results
        """
        self.input_csv_path = input_csv_path
        self.df = None
        self.load_data()

    def load_data(self):
        """Load the CSV data and validate it.

        Any failure (unreadable file, missing columns) is logged and ends the
        run through ``sys.exit(1)``.
        """
        try:
            self.df = pd.read_csv(self.input_csv_path)
            logger.info(f"Loaded {len(self.df)} rows from {self.input_csv_path}")

            # Validate required columns
            required_columns = ['term', 'is_vocab_term', 'confidence_score', 'original_url']
            missing_columns = [col for col in required_columns if col not in self.df.columns]

            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")

            # Clean the data
            self.clean_data()

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            sys.exit(1)

    def clean_data(self):
        """Clean and validate the data.

        Drops rows with missing essential fields, rows whose confidence score
        is not numeric, and rows whose score lies outside [0, 1].
        """
        original_count = len(self.df)

        # Remove rows with missing essential data
        self.df = self.df.dropna(subset=['term', 'is_vocab_term', 'confidence_score', 'original_url'])

        # Convert confidence_score to numeric, handling any string values
        self.df['confidence_score'] = pd.to_numeric(self.df['confidence_score'], errors='coerce')

        # Remove rows with invalid confidence scores
        self.df = self.df.dropna(subset=['confidence_score'])

        # Ensure confidence scores are within valid range [0, 1]
        self.df = self.df[
            (self.df['confidence_score'] >= 0) &
            (self.df['confidence_score'] <= 1)
        ]

        cleaned_count = len(self.df)
        removed_count = original_count - cleaned_count

        if removed_count > 0:
            logger.warning(f"Removed {removed_count} rows with invalid data")

        logger.info(f"Data cleaned: {cleaned_count} valid rows remaining")

    def display_summary(self):
        """Display summary statistics of the data"""
        print("\n" + "="*50)
        print("DATA SUMMARY")
        print("="*50)

        total_rows = len(self.df)
        print(f"Total processed terms: {total_rows}")

        # With no rows the percentages below would divide by zero and the
        # statistics would be meaningless, so stop here.
        if total_rows == 0:
            print("No valid rows available; nothing to summarise.")
            return

        vocab_terms = len(self.df[self.df['is_vocab_term'] == True])
        non_vocab_terms = len(self.df[self.df['is_vocab_term'] == False])

        print(f"Vocab terms (True): {vocab_terms} ({vocab_terms/total_rows*100:.1f}%)")
        print(f"Non-vocab terms (False): {non_vocab_terms} ({non_vocab_terms/total_rows*100:.1f}%)")

        scores = self.df['confidence_score']

        print(f"\nConfidence Score Statistics:")
        print(f"Mean: {scores.mean():.3f}")
        print(f"Median: {scores.median():.3f}")
        print(f"Min: {scores.min():.3f}")
        print(f"Max: {scores.max():.3f}")
        print(f"Std: {scores.std():.3f}")

        # Half-open bins; the last one also includes 1.0
        distribution = [
            ("0.0-0.2", scores < 0.2),
            ("0.2-0.4", (scores >= 0.2) & (scores < 0.4)),
            ("0.4-0.6", (scores >= 0.4) & (scores < 0.6)),
            ("0.6-0.8", (scores >= 0.6) & (scores < 0.8)),
            ("0.8-1.0", scores >= 0.8),
        ]

        print(f"\nConfidence Score Distribution:")
        for label, in_bin in distribution:
            print(f"{label}: {len(self.df[in_bin])}")

    def get_vocab_choice(self) -> Optional[bool]:
        """Get user's choice for vocab term filtering.

        Returns:
            True for vocab terms only, False for non-vocab terms only,
            None for both.
        """
        print("\n" + "="*50)
        print("VOCAB TERM FILTERING")
        print("="*50)
        print("Choose which terms to include:")
        print("1. Only vocab terms (is_vocab_term = True)")
        print("2. Only non-vocab terms (is_vocab_term = False)")
        print("3. Both vocab and non-vocab terms")

        choices = {'1': True, '2': False, '3': None}

        while True:
            try:
                choice = input("\nEnter your choice (1/2/3): ").strip()
                if choice in choices:
                    return choices[choice]
                print("Invalid choice. Please enter 1, 2, or 3.")
            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                sys.exit(0)

    def get_confidence_range(self) -> Tuple[float, float]:
        """Get user's choice for confidence score range.

        Returns:
            ``(min_score, max_score)`` with 0 <= min_score <= max_score <= 1.
        """
        print("\n" + "="*50)
        print("CONFIDENCE SCORE FILTERING")
        print("="*50)
        print("Current confidence score range: {:.3f} - {:.3f}".format(
            self.df['confidence_score'].min(),
            self.df['confidence_score'].max()
        ))

        while True:
            try:
                print("\nEnter confidence score range (0.0 to 1.0):")
                min_score = float(input("Minimum confidence score: "))
                max_score = float(input("Maximum confidence score: "))

                # Written as "not (0 <= x <= 1)" so that NaN, which float()
                # accepts, is rejected as well.
                if not (0 <= min_score <= 1):
                    print("Minimum score must be between 0.0 and 1.0")
                    continue
                if not (0 <= max_score <= 1):
                    print("Maximum score must be between 0.0 and 1.0")
                    continue
                if min_score > max_score:
                    print("Minimum score cannot be greater than maximum score")
                    continue

                return min_score, max_score

            except ValueError:
                print("Invalid input. Please enter numeric values.")
            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                sys.exit(0)

    def apply_filters(self, vocab_filter: Optional[bool],
                     confidence_range: Tuple[float, float]) -> pd.DataFrame:
        """Apply the selected filters to the data.

        Args:
            vocab_filter: True/False to keep only that verdict, None to keep both
            confidence_range: Inclusive ``(min, max)`` confidence bounds

        Returns:
            A filtered copy; ``self.df`` is left untouched.
        """
        filtered_df = self.df.copy()

        # Apply vocab term filter
        if vocab_filter is not None:
            filtered_df = filtered_df[filtered_df['is_vocab_term'] == vocab_filter]

        # Apply confidence score range filter
        min_conf, max_conf = confidence_range
        filtered_df = filtered_df[
            (filtered_df['confidence_score'] >= min_conf) &
            (filtered_df['confidence_score'] <= max_conf)
        ]

        return filtered_df

    def save_filtered_results(self, filtered_df: pd.DataFrame, output_path: str):
        """Save the filtered results to a CSV file.

        Returns:
            True on success, False if writing failed (the error is logged).
        """
        try:
            # Create output dataframe with selected columns
            output_df = filtered_df[['original_url', 'term', 'is_vocab_term', 'confidence_score']].copy()

            # Sort by confidence score (descending) for better organization
            output_df = output_df.sort_values('confidence_score', ascending=False)

            # Save to CSV
            output_df.to_csv(output_path, index=False)
            logger.info(f"Filtered results saved to {output_path}")

            return True

        except Exception as e:
            logger.error(f"Error saving filtered results: {str(e)}")
            return False

    def display_filter_summary(self, filtered_df: pd.DataFrame,
                             vocab_filter: Optional[bool],
                             confidence_range: Tuple[float, float]):
        """Display summary of filtered results"""
        print("\n" + "="*50)
        print("FILTER RESULTS SUMMARY")
        print("="*50)

        print(f"Applied filters:")
        if vocab_filter is None:
            print(f"  - Vocab terms: Both True and False")
        else:
            print(f"  - Vocab terms: {vocab_filter}")

        min_conf, max_conf = confidence_range
        print(f"  - Confidence range: {min_conf:.3f} - {max_conf:.3f}")

        # Guard the percentage against an empty source dataset
        total_rows = len(self.df)
        retained_pct = len(filtered_df) / total_rows * 100 if total_rows else 0.0

        print(f"\nResults:")
        print(f"  - Original dataset: {total_rows} terms")
        print(f"  - Filtered dataset: {len(filtered_df)} terms")
        print(f"  - Percentage retained: {retained_pct:.1f}%")

        if len(filtered_df) > 0:
            vocab_count = len(filtered_df[filtered_df['is_vocab_term'] == True])
            non_vocab_count = len(filtered_df[filtered_df['is_vocab_term'] == False])

            print(f"\nFiltered results breakdown:")
            print(f"  - Vocab terms: {vocab_count}")
            print(f"  - Non-vocab terms: {non_vocab_count}")
            print(f"  - Average confidence: {filtered_df['confidence_score'].mean():.3f}")

    def interactive_process(self):
        """Run the interactive post-processing workflow"""
        print("LLM Results Post-Processor")
        print("="*50)

        # Display summary
        self.display_summary()

        # Get user preferences
        vocab_filter = self.get_vocab_choice()
        confidence_range = self.get_confidence_range()

        # Apply filters
        filtered_df = self.apply_filters(vocab_filter, confidence_range)

        # Display filter summary
        self.display_filter_summary(filtered_df, vocab_filter, confidence_range)

        if len(filtered_df) == 0:
            print("\nNo terms match the specified criteria.")
            return

        # Get output filename
        print("\n" + "="*50)
        print("SAVE RESULTS")
        print("="*50)

        while True:
            try:
                output_filename = input("Enter output filename (e.g., 'filtered_terms.csv'): ").strip()
                if not output_filename:
                    print("Filename cannot be empty.")
                    continue

                # Append the extension when the user left it out
                if not output_filename.endswith('.csv'):
                    output_filename += '.csv'

                break

            except KeyboardInterrupt:
                print("\nOperation cancelled.")
                return

        # Save results
        if self.save_filtered_results(filtered_df, output_filename):
            print(f"\nSuccess! {len(filtered_df)} terms saved to '{output_filename}'")
            print("The output file contains: original_url, term, is_vocab_term, confidence_score")
        else:
            print("Error: Failed to save results.")

def main(input_csv_path: Optional[str] = None):
    """Main function to run the post-processor.

    Args:
        input_csv_path: Path to the LLM results CSV file. When omitted, the
            default ``INPUT_CSV_PATH`` defined below is used, so calling
            ``main()`` with no arguments behaves as before.

    Returns:
        None. Prints an error and returns early when the path does not point
        to an existing regular file; otherwise builds an
        ``LLMResultsPostProcessor`` for the file and runs its interactive
        workflow.
    """
    # Local import keeps this function runnable on its own.
    import os

    # Configuration - Update these paths as needed
    INPUT_CSV_PATH = "llm_results_v2.csv"  # Change this to your LLM results file path
    csv_path = INPUT_CSV_PATH if input_csv_path is None else input_csv_path

    print("LLM Results Post-Processor (IDE Version)")
    print("="*50)
    print(f"Input file: {csv_path}")

    # Check that the path is an existing file; a directory would pass a plain
    # existence check and only fail later when the processor tries to read it.
    if not os.path.isfile(csv_path):
        print(f"Error: File '{csv_path}' not found.")
        print("Please update the INPUT_CSV_PATH variable in the main() function.")
        return

    # Create and run the post-processor
    processor = LLMResultsPostProcessor(csv_path)
    processor.interactive_process()

# Start the interactive post-processor only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-07-14 22:11:50,374 - INFO - Loaded 1786 rows from llm_results_v2.csv
2025-07-14 22:11:50,383 - INFO - Data cleaned: 1786 valid rows remaining


LLM Results Post-Processor (IDE Version)
Input file: llm_results_v2.csv
LLM Results Post-Processor

DATA SUMMARY
Total processed terms: 1786
Vocab terms (True): 714 (40.0%)
Non-vocab terms (False): 1072 (60.0%)

Confidence Score Statistics:
Mean: 0.901
Median: 0.920
Min: 0.150
Max: 1.000
Std: 0.069

Confidence Score Distribution:
0.0-0.2: 1
0.2-0.4: 0
0.4-0.6: 0
0.6-0.8: 67
0.8-1.0: 1718

VOCAB TERM FILTERING
Choose which terms to include:
1. Only vocab terms (is_vocab_term = True)
2. Only non-vocab terms (is_vocab_term = False)
3. Both vocab and non-vocab terms

CONFIDENCE SCORE FILTERING
Current confidence score range: 0.150 - 1.000

Enter confidence score range (0.0 to 1.0):

FILTER RESULTS SUMMARY
Applied filters:
  - Vocab terms: True
  - Confidence range: 0.000 - 1.000

Results:
  - Original dataset: 1786 terms
  - Filtered dataset: 714 terms
  - Percentage retained: 40.0%

Filtered results breakdown:
  - Vocab terms: 714
  - Non-vocab terms: 0
  - Average confidence: 0.915

SAVE

2025-07-14 22:12:14,675 - INFO - Filtered results saved to filtered_terms_2.csv



Success! 714 terms saved to 'filtered_terms_2.csv'
The output file contains: original_url, term, is_vocab_term, confidence_score
