# Arena

Notebook for comparing different models and strategies for augmenting dataset.

In [94]:
import os
import json
import pandas as pd
import numpy as np

## Import labeled dataset

In [95]:
def get_all_json_files(directory, key_term):
    # List to store paths of JSON files that contain the key term
    json_files = []
    for filename in os.listdir(directory):
        if key_term in filename and filename.endswith('.json'):
            json_files.append(os.path.join(directory, filename))
    return json_files

def merge_jsonfiles(json_files):
    merged_data = {}
    for file_path in json_files:
        with open(file_path, 'r') as file:
            data_file = json.load(file)
            merged_data.update(data_file)

    return merged_data


In [96]:
directory = 'data'
key_term = 'manual-labels'
json_files = get_all_json_files(directory, key_term)
labeled_dataset = merge_jsonfiles(json_files)
print(list(labeled_dataset.items())[:5])

[('https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg', {'labels': ['Scotland'], 'text': 'Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6 || Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6'}), ('c01d7dcc-ff05-4773-9c0b-6de920bb2434', {'labels': ['Forst'], 'text': "Web Map Service of the dataset 'Piché evaporation at agroforestry site in Forst, 2019 and 2020' || This Web Map Service includes spatial information used by datasets 'AGIS Map Service of the dataset 'Piché evaporation at agroforestry site in Forst, 2019 and 2020''"}), ('https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg', {'labels': ['East Africa'], 'text': '"""Maikona. Sheet """"43"""". Metre Grid (East Africa). Zone H. Series Y633.""" || """Maikona. Sheet """"43"""". Metre Grid (East Africa). Zone H. Series Y633."""'}), ('09adf1ad-a388-40e0-a61f-ba30d300cef9', {'labels': ['Germany'], 'text': "Tree species

## Evaluation

In [97]:
Global_eval = {}

In [98]:
def eval_test(dict_truth,dict_test,return_table = False):
    match_counts = 0
    total_truth = 0
    total_given = 0


    matching_labels_list = []
    non_matching_labels_list = []
    not_found_labels_list = []
    recall_list = []
    text_list = []
    manual_labeled_list = []

    for key, value in dict_truth.items():

        if not key in list(dict_test.keys()):
            continue

        truth_set = set(value['labels'])
        given_set = set(dict_test[key])

        total_truth += len(truth_set)
        total_given += len(given_set)
        match_counts += len(truth_set & given_set)

        # Also appreciate the non-geo entries
        if len(truth_set) == 0: total_truth +=1
        if len(truth_set) == 0 & len(given_set)==0: match_counts +=1
        if len(given_set) == 0: total_given +=1

        matching_labels = truth_set & given_set
        not_found_labels = truth_set.difference(given_set)
        non_matching_labels = given_set.difference(truth_set)
        recall_item = len(truth_set & given_set)/len(truth_set) if len(truth_set) !=0 else 0

        matching_labels_list.append(matching_labels)
        non_matching_labels_list.append(non_matching_labels)
        not_found_labels_list.append(not_found_labels)
        recall_list.append(recall_item)
        text_list.append(value['text'])
        manual_labeled_list.append(truth_set)


    precision = match_counts / total_given if total_given !=0 else 0
    recall = match_counts / total_truth if total_truth !=0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) !=0 else 0

    eval_export = {'precision':precision,
                  'recall':recall,
                  'f1 score': f1_score}
    
    if return_table:
        results_df = pd.DataFrame({
            'Text':text_list,
            'Manual Labeled':manual_labeled_list,
            'Matching Labels': matching_labels_list,
            'Non-Matching Labels': non_matching_labels_list,
            'Not Found Labels':not_found_labels_list,
            'Recall': recall_list
        }, index=dict_truth.keys())

        eval_export['evaluation table']=results_df

    return eval_export

In [99]:
from geopy.geocoders import Nominatim
from shapely import wkt,unary_union,intersection
from shapely.geometry import Point, Polygon, MultiPolygon, box

In [186]:
geolocator = Nominatim(user_agent="SWR", timeout=2)
def text_to_geom(label:str):
    location_obj = geolocator.geocode(label, geometry='wkt')

    if not location_obj:
        return

    wkt_string=location_obj.raw["geotext"]
    geometry = wkt.loads(wkt_string)

    # Check if the geometry is a point
    if isinstance(geometry, Point):
        # Get the bounding box from the raw data to have some weight in the evaluation
        bbox = location_obj.raw["boundingbox"]
        if bbox:
            min_lat, max_lat, min_lon, max_lon = map(float, bbox)
            bounding_box_geom = box(min_lon, min_lat, max_lon, max_lat)
            return bounding_box_geom
    elif isinstance(geometry, Polygon) or isinstance(geometry, MultiPolygon):
        return geometry


def eval_test_geo_polygone(dict_truth,dict_test,return_table = False):
    # Evaluation based on percentage common area between 2 dictionaries with common keys
    # based on querying OSM with a geolocation string
    # Remark: still very slow
    
    matching_area_list = []
    precision_area_list = []

    for key, value in dict_truth.items():

        # matching empty datasets returning 100% overlap 
        if len(value['labels']) == 0 & len(dict_test[key])==0:
            matching_area_list.append(1)
            precision_area_list.append(1)
            continue
        
        # making 1 merged polygone of the desired output
        geometries_truth = []
        for location_truth in value['labels']:
            geom = text_to_geom(location_truth)
            if geom: geometries_truth.append(geom)
        merged_geometries_truth = unary_union(geometries_truth)

        # making 1 merged polygone of the found output
        geometries_test = []
        for location_test in dict_test[key]:
            geom = text_to_geom(location_test)
            if geom: geometries_test.append(geom)
        merged_geometries_test = unary_union(geometries_test)

        # calculating percentage intersection
        intersection = merged_geometries_truth.intersection(merged_geometries_test)
        recall_overlap = intersection.area/merged_geometries_truth.area if merged_geometries_truth.area !=0 else 0
        precision_overlap = intersection.area/merged_geometries_test.area if merged_geometries_test.area !=0 else 0
        matching_area_list.append(recall_overlap)
        precision_area_list.append(precision_overlap)

    eval_export = {
                    'precision area':np.mean(precision_area_list),
                    'recall area':np.mean(matching_area_list),
                    }
    
    if return_table:
        results_df = pd.DataFrame({
            'percentage area match':matching_area_list,
            'precision area ':precision_area_list,
        }, index=dict_truth.keys())

        eval_export['evaluation table']=results_df

    return eval_export

## GliNER

In [101]:
import spacy

In [102]:
labellist = ["geo","location","countries","sample_location"]

In [103]:
#different SpaCy models that can be downloaded
#!python -m spacy download en_core_web_lg
#!python -m spacy download en_core_web_trf
#!python -m spacy download en_core_web_sm

🌟 Available Models GliNER on Hugging Face

🇬🇧 For English
* GLiNER Base: urchade/gliner_base (CC BY NC 4.0)
* GLiNER Small: urchade/gliner_small (CC BY NC 4.0)
* GLiNER Small v2: urchade/gliner_small-v2 (Apache 2.0)
* GLiNER Small v2.1: urchade/gliner_small-v2.1 (Apache 2.0)
* GLiNER Medium: urchade/gliner_medium (CC BY NC 4.0)
* GLiNER Medium v2: urchade/gliner_medium-v2 (Apache 2.0)
* GLiNER Medium v2.1: urchade/gliner_medium-v2.1 (Apache 2.0)
* GLiNER Large: urchade/gliner_large (CC BY NC 4.0)
* GLiNER Large v2: urchade/gliner_large-v2 (Apache 2.0)
* GLiNER Large v2.1: urchade/gliner_large-v2.1 (Apache 2.0)
* GLiNER NuNerZero span: numind/NuNER_Zero-span (MIT) - +4.5% more powerful GLiNER Large v2.1
* GLiNER News: EmergentMethods/gliner_medium_news-v2.1 (Apache 2.0) 9.5% improvement over GLiNER Large v2.1 on 18 benchmark datasets

🌍 For Other Languages
* Korean: 🇰🇷 taeminlee/gliner_ko
* Italian: 🇮🇹 DeepMount00/universal_ner_ita
* Multilingual: 🌐 urchade/gliner_multi (CC BY NC 4.0) and urchade/gliner_multi-v2.1 (Apache 2.0)

In [104]:
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("gliner_spacy",
    config={
        "gliner_model": "urchade/gliner_multi",
        "labels":labellist,
    }, last=True)



Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  state_dict = torch.load(model_file, map_location=torch.device(map_location))


<gliner_spacy.pipeline.GlinerSpacy at 0x21646632b10>

In [105]:
GliNER_results={}
for key, value in labeled_dataset.items():
    doc = nlp(value['text'])
    locations = []
    for ent in doc.ents:
        locations.append(ent.text)
    GliNER_results[key]= set(locations)

{k: GliNER_results[k] for k in list(GliNER_results)[:10]}

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'2020',
  'Forst',
  'agroforestry site'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': {'soil'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Kuching. Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': set()}

In [106]:
eval_GliNER = eval_test(labeled_dataset,GliNER_results,return_table=True)
{print(f"{k}: {v}") for k, v in eval_GliNER.items() if k != 'evaluation table'}

precision: 0.42118863049095606
recall: 0.6245210727969349
f1 score: 0.5030864197530863


{None}

In [107]:
eval_GliNER_geo = eval_test_geo_polygone(labeled_dataset,GliNER_results,return_table=True)
{print(f"{k}: {v}") for k, v in eval_GliNER_geo.items() if k != 'evaluation table'}

recall area: 0.7091429479492686
precision area: 0.6734661870519892


{None}

In [108]:
merged_dict = {**eval_GliNER, **eval_GliNER_geo}
merged_evaluation_table = pd.concat([eval_GliNER['evaluation table'], eval_GliNER_geo['evaluation table']], axis=1)
merged_dict['evaluation table'] = merged_evaluation_table
Global_eval['GliNER']=merged_dict
Global_eval['GliNER']['evaluation table']

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall,percentage area match,precision area
https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg,Soil Survey of Scotland 1:250 000 scale soil m...,{Scotland},{Scotland},{},{},1.0,1.0,1.000000
c01d7dcc-ff05-4773-9c0b-6de920bb2434,Web Map Service of the dataset 'Piché evaporat...,{Forst},{Forst},"{2020, agroforestry site}",{},1.0,1.0,0.510598
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg,"""""""Maikona. Sheet """"""""43"""""""". Metre Grid (East...",{East Africa},{East Africa},{},{},1.0,1.0,1.000000
09adf1ad-a388-40e0-a61f-ba30d300cef9,Tree species composition of a landscape in nor...,{Germany},{Germany},{},{},1.0,1.0,1.000000
6b664e1e-15ff-4bcb-8cd6-fef048a653a6,Validation and field application of a low-cost...,{},{},{},{},0.0,1.0,1.000000
...,...,...,...,...,...,...,...,...
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/PK4000_2S0.jpg,"Soil Maps of Baradi Area, Baga Area, Jagla Are...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...",{},"{Baradi Area, Jagla Area, Baga Area, Nabharan ...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...",0.0,0.0,0.000000
7f667e39-7f6a-40a9-8e1e-4efda1ecbfe3,Long-term fertilization field experiment Darms...,{Darmstadt},{Darmstadt},"{Germany, farmyard, Berlin, soil}",{},1.0,1.0,0.002081
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/XA1004_1.jpg,Carte 1. Indice Xerothermique de l || Carte 1....,{},{},{},{},0.0,1.0,1.000000
10.1016/j.agwat.2020.106496,Effects of rainwater harvesting system on soil...,"{Loess Plateau, China}","{Loess Plateau, China}","{rainwater harvesting area, deep soil layers, ...",{},1.0,1.0,1.000000


## LLM

### Ollama


In [109]:
#!pip install ollama

In [110]:
import ollama

In [111]:
def keyword_generator(p):
    prompt = "No Bullshit. You are the ultimate human labeler in the world and understand all the soilprojects. You are European and very efficient. Your task is to extract all locations in the text you are given. These locations have to include all the geographic references that this text makes. don't return anything else then the list of locations. Give an export in the form of [location1, location2,...]. Your text is: "+p
    res = ollama.generate(model="llama3", prompt=prompt)["response"]
    print(res)
    return res.replace("\n"," ").strip()

In [112]:
test = 'Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6 || Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6'

In [113]:
# print(keyword_generator(test))

^ ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [114]:
# from ollama import Client
# 
# response = client.chat(model='llama3.1', messages=[
#   {
#     'role': 'user',
#     'content': 'Why is the sky blue?',
#   },
# ])

In [115]:
# import psutil
# for proc in psutil.process_iter():
#     try:
#         if proc.name() == "ollydbg.exe":
#             print("OllyDbg is running.")
#             break  # Stop searching once we find it
#     except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
#         pass

# # If we reach this point without finding OllyDbg, it's likely not running
# print("OllyDbg is not running.")

### OpenAI

In [116]:
import openai
import os
from tenacity import retry, wait_random_exponential, stop_after_attempt
import ast
import re

In [117]:
OPENAI_MODEL = 'gpt-3.5-turbo-0125'

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

labels = ['Location']

#### prompt 1

In [118]:
system_message = f"""
You are an expert in Natural Language Processing. Your task is to identify common Named Entities (NER) in a given text.
The possible common Named Entities (NER) types are exclusively: ({", ".join(labels)})."""

In [119]:
# One-shot approach
def assisstant_message():
    return f"""
EXAMPLE:
    Text: 'Soil Maps of Baradi Area, Baga Area, Jagla Area, Garabaria Area, Nabharan Area, Kaliganj Area and Arpara Area. || Soil Maps of Baradi Area, Baga Area, Jagla Area, Garabaria Area, Nabharan Area, Kaliganj Area and Arpara Area.'
    {{
        "Location": ["Baradi","Baga","Jagla","Garabaria","Nabharan","Kaliganj","Arpara"]
    }}
--"""

In [120]:
def user_message(text):
    return f"""
TASK:
    Text: {text}
"""

In [121]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def run_openai_task(labels, text):
    messages = [
          {"role": "system", "content": system_message},
          {"role": "assistant", "content": assisstant_message()},
          {"role": "user", "content": user_message(text=text)}
      ]

    response = openai.chat.completions.create(
        temperature=0,
        model=OPENAI_MODEL,
        messages=messages,
    )

    return {"model_response": response}

In [122]:
OpenAI_results_1={}
for key, value in labeled_dataset.items():
    result = run_openai_task(labels, value['text'])
    message = result['model_response'].choices[0].message.content
    matches = re.findall(r'\{[^}]*\}', message)
    dict_message = ast.literal_eval(matches[0])
    OpenAI_results_1[key]= set(dict_message['Location'])


{k: OpenAI_results_1[k] for k in list(OpenAI_results_1)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland',
  'South West Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'Forst'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa',
  'Maikona'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': {'Field'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': {'Laguna Merin'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': {'Kilifi'}}

In [123]:
eval_OpenAI_1 = eval_test(labeled_dataset,OpenAI_results_1,return_table=True)
{print(f"{k}: {v}") for k, v in eval_OpenAI_1.items() if k != 'evaluation table'}

precision: 0.7303370786516854
recall: 0.7471264367816092
f1 score: 0.7386363636363638


{None}

In [124]:
eval_OpenAI1_geo = eval_test_geo_polygone(labeled_dataset,OpenAI_results_1,return_table=True)
{print(f"{k}: {v}") for k, v in eval_OpenAI1_geo.items() if k != 'evaluation table'}

recall area: 0.8398818664682381
precision area: 0.7868782326092603


{None}

In [125]:
merged_dict = {**eval_OpenAI_1, **eval_OpenAI1_geo}
merged_evaluation_table = pd.concat([eval_OpenAI_1['evaluation table'], eval_OpenAI1_geo['evaluation table']], axis=1)
merged_dict['evaluation table'] = merged_evaluation_table
Global_eval['OpenAI_1']=merged_dict
Global_eval['OpenAI_1']['evaluation table']

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall,percentage area match,precision area
https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg,Soil Survey of Scotland 1:250 000 scale soil m...,{Scotland},{Scotland},{South West Scotland},{},1.0,1.0,1.000000
c01d7dcc-ff05-4773-9c0b-6de920bb2434,Web Map Service of the dataset 'Piché evaporat...,{Forst},{Forst},{},{},1.0,1.0,1.000000
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg,"""""""Maikona. Sheet """"""""43"""""""". Metre Grid (East...",{East Africa},{East Africa},{Maikona},{},1.0,1.0,0.000006
09adf1ad-a388-40e0-a61f-ba30d300cef9,Tree species composition of a landscape in nor...,{Germany},{Germany},{},{},1.0,1.0,1.000000
6b664e1e-15ff-4bcb-8cd6-fef048a653a6,Validation and field application of a low-cost...,{},{},{Field},{},0.0,1.0,1.000000
...,...,...,...,...,...,...,...,...
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/PK4000_2S0.jpg,"Soil Maps of Baradi Area, Baga Area, Jagla Are...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...",{},{},1.0,1.0,1.000000
7f667e39-7f6a-40a9-8e1e-4efda1ecbfe3,Long-term fertilization field experiment Darms...,{Darmstadt},{Darmstadt},"{Germany, Berlin}",{},1.0,1.0,0.002081
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/XA1004_1.jpg,Carte 1. Indice Xerothermique de l || Carte 1....,{},{},{},{},0.0,1.0,1.000000
10.1016/j.agwat.2020.106496,Effects of rainwater harvesting system on soil...,"{Loess Plateau, China}","{Loess Plateau, China}",{Chinese Loess Plateau},{},1.0,1.0,1.000000


#### prompt 2

In [126]:
system_message = """
You are an expert in Natural Language Processing. Your task is to identify common Named Entities (NER) in a given text.
The possible common Named Entities (NER) type is exclusively: "Location". However, only locations applicable to the research for which you receive the abstract and title.
If no location is found, an empty dict is returned. You also ensure that all entities you return can be mapped unambiguously."""

In [127]:
OpenAI_results_2={}
for key, value in labeled_dataset.items():
    result = run_openai_task(labels, value['text'])
    message = result['model_response'].choices[0].message.content
    matches = re.findall(r'\{[^}]*\}', message)
    dict_message = ast.literal_eval(matches[0])
    OpenAI_results_2[key]= set(dict_message['Location']) if dict_message else set()


{k: OpenAI_results_2[k] for k in list(OpenAI_results_2)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland',
  'South West Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'Forst'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa',
  'Maikona'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': {'Laguna Merin'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': {'Kilifi'}}

In [128]:
eval_OpenAI_2 = eval_test(labeled_dataset,OpenAI_results_2,return_table=True)
{print(f"{k}: {v}") for k, v in eval_OpenAI_2.items() if k != 'evaluation table'}

precision: 0.7450980392156863
recall: 0.7279693486590039
f1 score: 0.7364341085271319


{None}

In [129]:
eval_OpenAI2_geo = eval_test_geo_polygone(labeled_dataset,OpenAI_results_2,return_table=True)
{print(f"{k}: {v}") for k, v in eval_OpenAI2_geo.items() if k != 'evaluation table'}

recall area: 0.8225546253394379
precision area: 0.7745094507844615


{None}

In [151]:
merged_dict = {**eval_OpenAI_2, **eval_OpenAI2_geo}
merged_evaluation_table = pd.concat([eval_OpenAI_2['evaluation table'], eval_OpenAI2_geo['evaluation table']], axis=1)
merged_dict['evaluation table'] = merged_evaluation_table
Global_eval['OpenAI_2']=merged_dict
Global_eval['OpenAI_2']['evaluation table'].sort_values(by=['Recall'])[:20]

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall,percentage area match,precision area
https://esdac.jrc.ec.europa.eu//images/Eudasm/US/us21_3.jpg,General Soil Map Kauai Island Hawaii || Genera...,{Kauai Island Hawaii},{},"{Kauai Island, Hawaii}",{Kauai Island Hawaii},0.0,1.0,0.051037
8ef45947-3c2a-4bbf-896f-796cb2593af8,Web Map Service of the dataset 'Spectra LUCAS ...,{},{},{BonaRes},{},0.0,1.0,1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_ngsms1.jpg,Soils map of Nigeria. || Soils map of Nigeria.,{Nigeria},{},{},{Nigeria},0.0,0.0,0.0
3fcd463d-3d28-401e-9fc3-d4a8ad2a9df4,Soil and rhizosphere properties related to car...,{Hohenschulen},{},{},{Hohenschulen},0.0,0.0,0.0
10.1016/j.geoderma.2020.114851,Fusion of Vis-NIR and XRF spectra for estimati...,{},{},{},{},0.0,1.0,1.0
4d3c311c-4272-4562-92b9-0e723462f4a8,Earthworm Biodiversity Data Lietzen I - Earthw...,{},{},{Lietzen},{},0.0,1.0,1.0
fbdd1c72-5681-4b93-9ce3-39889667d055,WMS Service of the dataset 'Cropland agrofores...,{},{},{},{},0.0,1.0,1.0
https://esdac.jrc.ec.europa.eu/resource-type/documents?page=19#19-13,Advances in soil erosion modelling through rem...,"{Europa, EU, Europe}",{},{European},"{Europa, EU, Europe}",0.0,4e-12,1.0
https://esdac.jrc.ec.europa.eu/resource-type/documents?page=3#3-1,Agricultural Adaptation to Climate Change || A...,{},{},{},{},0.0,1.0,1.0
https://esdac.jrc.ec.europa.eu/resource-type/documents?page=16#16-10,Biochar application to soils || Biochar applic...,{},{},{},{},0.0,1.0,1.0


## Hybrid

Trying to finetune the zero-shot SpaCy models with 1 query to a LLM

In [167]:
sLM_label = GliNER_results
{k: sLM_label[k] for k in list(sLM_label)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'2020',
  'Forst',
  'agroforestry site'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': {'soil'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Kuching. Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': set()}

In [168]:
All_response = set()
for _,set_response in sLM_label.items():
    All_response.update(set_response)
print(len(All_response))
print(All_response)

300
{'site', 'Lany Farm', 'North Africa', 'Swabian', 'Tunisie', 'Jaguaribe/Natal', 'Afrika', 'Soilscores', 'Macapá', 'European agricultural soils', 'compost', 'Garabaria Area', 'Kazakh SSR', 'USSR', 'Timor', 'Africa', 'Buyuk Konya Havzasinin Toprak Haritasi.', 'Western Balkans', 'Tensift-basin', 'topsoil', 'Ceylon', 'longitude', 'research sites', 'loam silt', 'Pakistan', 'temperate forest subsoils', 'soil layers', 'Rhodesia', 'North Hessian Hill Country', 'Dornburg', 'Changwat Rayong', 'Nakhonnayok Province. Legend.', 'European', 'Nigeria', 'Crete', 'Southern Europe', 'Marsabit district', 'Bulgaria', 'construction sites', 'humusarmen', 'experimental sites', 'Pusté Jakartice', 'F5 field', 'Light Source', 'region', 'Arpara Area', 'West Africa', 'Formations Végétales et Domaine Forestier National de Madagascar', 'European Vineyard Soils', 'subduction‐zone regions', 'plot A12_12', 'Guadalajara', 'volcanic ash', 'Rostock', 'New Dykesite', 'Kulbacksliden', 'Saskatoon', 'China', 'land uses', 

In [181]:
system_message_1 = """You are an expert in geographical entities with specialized knowledge of OpenStreetMap and Nominatim. Your task is to filter a list of geographical entities, retaining only those that have clearly delineatable locations, such as continents, countries, states, regions, counties, cities, towns, neighborhoods, and street names.

Be cautious when filtering: if a label could plausibly represent a specific, mappable location, it should be retained. Only exclude terms that are highly ambiguous or are clearly more likely to refer to general, non-delineatable locations. For example, "site" should only be excluded if it is clearly intended to represent a general area rather than a specific town or place."""

system_message_2 = """You are an expert in geographical entities with specialized knowledge of OpenStreetMap and Nominatim. Your task is to filter a list of geographical entities, retaining only those that have clearly delineatable locations, such as continents, countries, states, regions, counties, cities, towns, neighborhoods, and street names.

Be cautious when filtering: if a label could plausibly represent a specific, mappable location, it should be retained. Only exclude terms that are highly ambiguous or are clearly more likely to refer to general, non-delineatable locations. For example, "site" should only be excluded if it is clearly intended to represent a general area rather than a specific town or place.

Additionally, if a label is not directly mappable but can be slightly modified to match a specific location (e.g., "West Germany" to "Germany" or "London Area" to "London"), return a dictionary of these changes, where the original label is the key and the modified, mappable location is the value."""


In [180]:
assisstant_message_1= """
EXAMPLE:
    Text: {'soil,Belgium,creek,street,West-Vlaanderen,river,Wetland,Lucerne,Pays de la Loire,F4,Rijnland-Palts'} 
    {
        "delineatable": ["Belgium","West-Vlaanderen","Lucerne","Pays de la Loire","Rijnland-Palts"],
        "Non-delineatable": ["soil","creek","river","Wetland","F4"]
    }
"""
assisstant_message_2= """
EXAMPLE:
    Text: {'soil,Belgium,creek,street,West-Vlaanderen,river,Wetland,Lucerne,Pays de la Loire,F4,Rijnland-Palts,London Area'} 
    {
        "delineatable": ["Belgium","West-Vlaanderen","Lucerne","Pays de la Loire","Rijnland-Palts","London"],
        "Non-delineatable": ["soil","creek","river","Wetland","F4"],
        "changes":{"London Area":"London"}
    }
"""

In [171]:
def user_message(text):
    return f"""
TASK:
    Text: {text}
"""

In [189]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def run_openai_task_filter(text, system_message,assisstant_message):
    messages = [
          {"role": "system", "content": system_message},
          {"role": "assistant", "content": assisstant_message},
          {"role": "user", "content": user_message(text=text)}
      ]

    response = openai.chat.completions.create(
        temperature=0,
        model=OPENAI_MODEL,
        messages=messages,
    )

    return {"model_response": response}

In [173]:
result_hybrid = run_openai_task_filter(All_response, system_message_1, assisstant_message_1)

In [175]:
message_hybrid = result_hybrid['model_response'].choices[0].message.content
try:
    dict_message_hybrid = ast.literal_eval(message_hybrid)
except (ValueError, SyntaxError):
    matches_hybrid = re.findall(r'\{[^}]*\}', message_hybrid)
    dict_message_hybrid = ast.literal_eval(matches_hybrid[0])

if "changes" in list(dict_message_hybrid.keys()):
    sLM_label_1 = {key: {dict_message_hybrid["changes"].get(label, label) for label in value} for key, value in sLM_label.items()}
else:
    sLM_label_1=sLM_label

ok_entities = set(dict_message_hybrid['delineatable'])
cleaned_hybrid_dict = {key: value.intersection(ok_entities) for key, value in sLM_label_1.items()}
{k: cleaned_hybrid_dict[k] for k in list(cleaned_hybrid_dict)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': set(),
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': set()}

In [176]:
eval_Hybrid = eval_test(labeled_dataset,cleaned_hybrid_dict,return_table=True)
{print(f"{k}: {v}") for k, v in eval_Hybrid.items() if k != 'evaluation table'}

precision: 0.4835164835164835
recall: 0.3371647509578544
f1 score: 0.3972911963882619


{None}

In [177]:
eval_Hybrid_geo = eval_test_geo_polygone(labeled_dataset,cleaned_hybrid_dict,return_table=True)
{print(f"{k}: {v}") for k, v in eval_Hybrid_geo.items() if k != 'evaluation table'}

recall area: 0.518784194897132
precision area: 0.4986771262387808


{None}

In [182]:
merged_dict = {**eval_Hybrid, **eval_Hybrid_geo}
merged_evaluation_table = pd.concat([eval_Hybrid['evaluation table'], eval_Hybrid_geo['evaluation table']], axis=1)
merged_dict['evaluation table'] = merged_evaluation_table
Global_eval['Hybrid_1']=merged_dict
Global_eval['Hybrid_1']['evaluation table'].sort_values(by=['Recall'])[:20]

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall,percentage area match,precision area
https://esdac.jrc.ec.europa.eu//images/Eudasm/US/us21_3.jpg,General Soil Map Kauai Island Hawaii || Genera...,{Kauai Island Hawaii},{},{},{Kauai Island Hawaii},0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3005_1SO.jpg,Present Land Use. West Malaysia. Sheet 1. || P...,"{Malaysia, West Malaysia}",{},{},"{Malaysia, West Malaysia}",0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/LK3018_LU.jpg,Vavuniya District. Land Use. || Vavuniya Distr...,{Vavuniya District},{},{},{Vavuniya District},0.0,0.0,0.0
f5bcf938-44cc-41fd-8d03-5543933d1d79,Web Map Service of the dataset 'Permafrost deg...,{Alaska},{},{},{Alaska},0.0,0.0,0.0
5187f8c5-38ef-4b07-bc26-a5e257a8ef59,| || |,{},{},{},{},0.0,1.0,1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/LK3022_LU.jpg,Colombo and Kalutara Districts. Land Use. || C...,"{Colombo District, Kalutara District, Kalutara...",{},{},"{Colombo District, Kalutara District, Kalutara...",0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/SK/cze_x1back.jpg,Soil Map Slovakia - pody || Soil Map Slovakia ...,{Slovakia},{},{},{Slovakia},0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/IR4002_3LU.jpg,land Use Map of Diful II Project. Ostan VII Ir...,{Iran},{},{},{Iran},0.0,0.0,0.0
2f32fbe1-30fd-4472-8b97-e02267c34ca6,WMS Service of datasets from the 'ZALF Focus A...,{Paulinenaue},{},{},{Paulinenaue},0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/AF3000_1SO.jpg,Soil Map. The Jilga Valley. Vol. IV-13. || Soi...,{Jilga},{},{},{Jilga},0.0,0.0,0.0


In [190]:
result_hybrid = run_openai_task_filter(All_response, system_message_2, assisstant_message_2)

message_hybrid = result_hybrid['model_response'].choices[0].message.content
try:
    dict_message_hybrid = ast.literal_eval(message_hybrid)
except (ValueError, SyntaxError):
    matches_hybrid = re.findall(r'\{[^}]*\}', message_hybrid)
    dict_message_hybrid = ast.literal_eval(matches_hybrid[0])

if "changes" in list(dict_message_hybrid.keys()):
    sLM_label_1 = {key: {dict_message_hybrid["changes"].get(label, label) for label in value} for key, value in sLM_label.items()}
else:
    sLM_label_1=sLM_label

ok_entities = set(dict_message_hybrid['delineatable'])
cleaned_hybrid_dict = {key: value.intersection(ok_entities) for key, value in sLM_label_1.items()}
{k: cleaned_hybrid_dict[k] for k in list(cleaned_hybrid_dict)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': set(),
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': set()}

In [191]:
eval_Hybrid = eval_test(labeled_dataset,cleaned_hybrid_dict,return_table=True)
{print(f"{k}: {v}") for k, v in eval_Hybrid.items() if k != 'evaluation table'}

precision: 0.4444444444444444
recall: 0.3065134099616858
f1 score: 0.36281179138321995


{None}

In [192]:
eval_Hybrid_geo = eval_test_geo_polygone(labeled_dataset,cleaned_hybrid_dict,return_table=True)
{print(f"{k}: {v}") for k, v in eval_Hybrid_geo.items() if k != 'evaluation table'}

precision area: 0.4639944646861015
recall area: 0.4871166943051461


{None}

In [193]:
merged_dict = {**eval_Hybrid, **eval_Hybrid_geo}
merged_evaluation_table = pd.concat([eval_Hybrid['evaluation table'], eval_Hybrid_geo['evaluation table']], axis=1)
merged_dict['evaluation table'] = merged_evaluation_table
Global_eval['Hybrid_2']=merged_dict
Global_eval['Hybrid_2']['evaluation table'].sort_values(by=['Recall'])[:20]

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall,percentage area match,precision area
https://esdac.jrc.ec.europa.eu//images/Eudasm/US/us21_3.jpg,General Soil Map Kauai Island Hawaii || Genera...,{Kauai Island Hawaii},{},{},{Kauai Island Hawaii},0.0,0.0,0.0
a45c7a1f-3dc5-478f-9d25-50dacd607d02,Soil values from the agricultural soil valuati...,{Brandenburg},{},{Germany},{Brandenburg},0.0,1.0,0.079697
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekija2.jpg,"""""""Kijabe. Sheet """"""""134/3"""""""". Y731 (D.O.S. 4...",{},{},{},{},0.0,1.0,1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_za4000_7to.jpg,Noodsberg. 2930BD. South Africa. [Noodsberg. S...,"{Suid-Afrika, Noodsberg, South Africa}",{},{},"{Suid-Afrika, Noodsberg, South Africa}",0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3005_1SO.jpg,Present Land Use. West Malaysia. Sheet 1. || P...,"{Malaysia, West Malaysia}",{},{},"{Malaysia, West Malaysia}",0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/LK3018_LU.jpg,Vavuniya District. Land Use. || Vavuniya Distr...,{Vavuniya District},{},{},{Vavuniya District},0.0,0.0,0.0
f5bcf938-44cc-41fd-8d03-5543933d1d79,Web Map Service of the dataset 'Permafrost deg...,{Alaska},{},{},{Alaska},0.0,0.0,0.0
5187f8c5-38ef-4b07-bc26-a5e257a8ef59,| || |,{},{},{},{},0.0,1.0,1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/LK3022_LU.jpg,Colombo and Kalutara Districts. Land Use. || C...,"{Colombo District, Kalutara District, Kalutara...",{},{},"{Colombo District, Kalutara District, Kalutara...",0.0,0.0,0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/IR4002_3LU.jpg,land Use Map of Diful II Project. Ostan VII Ir...,{Iran},{},{},{Iran},0.0,0.0,0.0


## Summary Evaluation Table

In [194]:
Global_eval
for k_glob, v_glob in Global_eval.items():
    print('-----------------------')
    print('Summary: ',k_glob)
    for k, v in v_glob.items():
        if k == 'evaluation table':
            continue
        print(f"--> {k}: {v:.3f}")


-----------------------
Summary:  GliNER
--> precision: 0.421
--> recall: 0.625
--> f1 score: 0.503
--> recall area: 0.709
--> precision area: 0.673
-----------------------
Summary:  OpenAI_1
--> precision: 0.730
--> recall: 0.747
--> f1 score: 0.739
--> recall area: 0.840
--> precision area: 0.787
-----------------------
Summary:  OpenAI_2
--> precision: 0.745
--> recall: 0.728
--> f1 score: 0.736
--> recall area: 0.823
--> precision area: 0.775
-----------------------
Summary:  Hybrid_1
--> precision: 0.484
--> recall: 0.337
--> f1 score: 0.397
--> recall area: 0.519
--> precision area: 0.499
-----------------------
Summary:  Hybrid_2
--> precision: 0.444
--> recall: 0.307
--> f1 score: 0.363
--> precision area: 0.464
--> recall area: 0.487
