# Arena

In [1]:
import pickle
import os
import json
import pandas as pd

Notebook for comparing different models and strategies for augmenting dataset.

## Import labeled dataset

In [2]:
def get_all_json_files(directory, key_term):
    # List to store paths of JSON files that contain the key term
    json_files = []
    for filename in os.listdir(directory):
        if key_term in filename and filename.endswith('.json'):
            json_files.append(os.path.join(directory, filename))
    return json_files

def merge_jsonfiles(json_files):
    merged_data = {}
    for file_path in json_files:
        with open(file_path, 'r') as file:
            data_file = json.load(file)
            merged_data.update(data_file)

    return merged_data


In [3]:
directory = 'data'
key_term = 'manual-labels'
json_files = get_all_json_files(directory, key_term)
labeled_dataset = merge_jsonfiles(json_files)
print(list(labeled_dataset.items())[:5])

[('https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg', {'labels': ['Scotland'], 'text': 'Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6 || Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6'}), ('c01d7dcc-ff05-4773-9c0b-6de920bb2434', {'labels': ['Forst'], 'text': "Web Map Service of the dataset 'Piché evaporation at agroforestry site in Forst, 2019 and 2020' || This Web Map Service includes spatial information used by datasets 'AGIS Map Service of the dataset 'Piché evaporation at agroforestry site in Forst, 2019 and 2020''"}), ('https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg', {'labels': ['East Africa'], 'text': '"""Maikona. Sheet """"43"""". Metre Grid (East Africa). Zone H. Series Y633.""" || """Maikona. Sheet """"43"""". Metre Grid (East Africa). Zone H. Series Y633."""'}), ('09adf1ad-a388-40e0-a61f-ba30d300cef9', {'labels': ['Germany'], 'text': "Tree species

## Evaluation

In [None]:
Global_eval = {}

In [4]:
def eval_test(dict_truth,dict_test,return_table = False):
    match_counts = 0
    total_truth = 0
    total_given = 0


    matching_labels_list = []
    non_matching_labels_list = []
    not_found_labels_list = []
    recall_list = []
    text_list = []
    manual_labeled_list = []

    for key, value in dict_truth.items():

        if not key in list(dict_test.keys()):
            continue

        truth_set = set(value['labels'])
        given_set = set(dict_test[key])

        total_truth += len(truth_set)
        total_given += len(given_set)
        match_counts += len(truth_set & given_set)

        # Also appreciate the non-geo entries
        if len(truth_set) == 0: total_truth +=1
        if len(truth_set) == 0 & len(given_set)==0: match_counts +=1
        if len(given_set) == 0: total_given +=1

        matching_labels = truth_set & given_set
        not_found_labels = truth_set.difference(given_set)
        non_matching_labels = given_set.difference(truth_set)
        recall_item = len(truth_set & given_set)/len(truth_set) if len(truth_set) !=0 else 0

        matching_labels_list.append(matching_labels)
        non_matching_labels_list.append(non_matching_labels)
        not_found_labels_list.append(not_found_labels)
        recall_list.append(recall_item)
        text_list.append(value['text'])
        manual_labeled_list.append(truth_set)


    precision = match_counts / total_given if total_given !=0 else 0
    recall = match_counts / total_truth if total_truth !=0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) !=0 else 0

    eval_export = {'precision':precision,
                  'recall':recall,
                  'f1 score': f1_score}
    
    if return_table:
        results_df = pd.DataFrame({
            'Text':text_list,
            'Manual Labeled':manual_labeled_list,
            'Matching Labels': matching_labels_list,
            'Non-Matching Labels': non_matching_labels_list,
            'Not Found Labels':not_found_labels_list,
            'Recall': recall_list
        }, index=dict_truth.keys())

        eval_export['evaluation table']=results_df

    return eval_export

In [None]:
def eval_test_geo_polygone(dict_truth,dict_test,return_table = False):
    

## GliNER

In [5]:
import spacy

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [6]:
from transformers import AutoTokenizer

In [7]:
labellist = ["geo","location","countries","sample_location"]

In [8]:
#!python -m spacy download en_core_web_lg
#!python -m spacy download en_core_web_trf
#!python -m spacy download en_core_web_sm

🌟 Available Models on Hugging Face
🇬🇧 For English
* GLiNER Base: urchade/gliner_base (CC BY NC 4.0)
* GLiNER Small: urchade/gliner_small (CC BY NC 4.0)
* GLiNER Small v2: urchade/gliner_small-v2 (Apache 2.0)
* GLiNER Small v2.1: urchade/gliner_small-v2.1 (Apache 2.0)
* GLiNER Medium: urchade/gliner_medium (CC BY NC 4.0)
* GLiNER Medium v2: urchade/gliner_medium-v2 (Apache 2.0)
* GLiNER Medium v2.1: urchade/gliner_medium-v2.1 (Apache 2.0)
* GLiNER Large: urchade/gliner_large (CC BY NC 4.0)
* GLiNER Large v2: urchade/gliner_large-v2 (Apache 2.0)
* GLiNER Large v2.1: urchade/gliner_large-v2.1 (Apache 2.0)
* GLiNER NuNerZero span: numind/NuNER_Zero-span (MIT) - +4.5% more powerful GLiNER Large v2.1
* GLiNER News: EmergentMethods/gliner_medium_news-v2.1 (Apache 2.0) 9.5% improvement over GLiNER Large v2.1 on 18 benchmark datasets

🌍 For Other Languages
* Korean: 🇰🇷 taeminlee/gliner_ko
* Italian: 🇮🇹 DeepMount00/universal_ner_ita
* Multilingual: 🌐 urchade/gliner_multi (CC BY NC 4.0) and urchade/gliner_multi-v2.1 (Apache 2.0)

In [9]:
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("gliner_spacy",
    config={
        "gliner_model": "urchade/gliner_multi",
        "labels":labellist,
    }, last=True)



Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  state_dict = torch.load(model_file, map_location=torch.device(map_location))


<gliner_spacy.pipeline.GlinerSpacy at 0x216694f5710>

In [10]:
GliNER_results={}
for key, value in labeled_dataset.items():
    doc = nlp(value['text'])
    locations = []
    for ent in doc.ents:
        locations.append(ent.text)
    GliNER_results[key]= set(locations)

{k: GliNER_results[k] for k in list(GliNER_results)[:10]}

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'2020',
  'Forst',
  'agroforestry site'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': {'soil'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Kuching. Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': set()}

In [11]:
eval_GliNER = eval_test(labeled_dataset,GliNER_results,return_table=True)
Global_eval['GliNER']=eval_GliNER
{print(f"{k}: {v}") for k, v in eval_GliNER.items() if k != 'evaluation table'}

precision: 0.42118863049095606
recall: 0.6245210727969349
f1 score: 0.5030864197530863


{None}

In [12]:
eval_GliNER['evaluation table']

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall
https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg,Soil Survey of Scotland 1:250 000 scale soil m...,{Scotland},{Scotland},{},{},1.0
c01d7dcc-ff05-4773-9c0b-6de920bb2434,Web Map Service of the dataset 'Piché evaporat...,{Forst},{Forst},"{2020, agroforestry site}",{},1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg,"""""""Maikona. Sheet """"""""43"""""""". Metre Grid (East...",{East Africa},{East Africa},{},{},1.0
09adf1ad-a388-40e0-a61f-ba30d300cef9,Tree species composition of a landscape in nor...,{Germany},{Germany},{},{},1.0
6b664e1e-15ff-4bcb-8cd6-fef048a653a6,Validation and field application of a low-cost...,{},{},{},{},0.0
...,...,...,...,...,...,...
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/PK4000_2S0.jpg,"Soil Maps of Baradi Area, Baga Area, Jagla Are...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...",{},"{Baradi Area, Jagla Area, Baga Area, Nabharan ...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...",0.0
7f667e39-7f6a-40a9-8e1e-4efda1ecbfe3,Long-term fertilization field experiment Darms...,{Darmstadt},{Darmstadt},"{Germany, farmyard, Berlin, soil}",{},1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/XA1004_1.jpg,Carte 1. Indice Xerothermique de l || Carte 1....,{},{},{},{},0.0
10.1016/j.agwat.2020.106496,Effects of rainwater harvesting system on soil...,"{Loess Plateau, China}","{Loess Plateau, China}","{rainwater harvesting area, deep soil layers, ...",{},1.0


## LLM

### Ollama


In [13]:
#!pip install ollama

In [14]:
import ollama

In [15]:
def keyword_generator(p):
    prompt = "No Bullshit. You are the ultimate human labeler in the world and understand all the soilprojects. You are European and very efficient. Your task is to extract all locations in the text you are given. These locations have to include all the geographic references that this text makes. don't return anything else then the list of locations. Give an export in the form of [location1, location2,...]. Your text is: "+p
    res = ollama.generate(model="llama3", prompt=prompt)["response"]
    print(res)
    return res.replace("\n"," ").strip()

In [16]:
test = 'Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6 || Soil Survey of Scotland 1:250 000 scale soil map of South West Scotland; Sheet 6'

In [17]:
# print(keyword_generator(test))

^ ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [18]:
# from ollama import Client
# 
# response = client.chat(model='llama3.1', messages=[
#   {
#     'role': 'user',
#     'content': 'Why is the sky blue?',
#   },
# ])

In [19]:
# import psutil
# for proc in psutil.process_iter():
#     try:
#         if proc.name() == "ollydbg.exe":
#             print("OllyDbg is running.")
#             break  # Stop searching once we find it
#     except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
#         pass

# # If we reach this point without finding OllyDbg, it's likely not running
# print("OllyDbg is not running.")

### OpenAI

In [23]:
import openai
import os
from tenacity import retry, wait_random_exponential, stop_after_attempt
import ast
import re

In [24]:
OPENAI_MODEL = 'gpt-3.5-turbo-0125'

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

labels = ['Location']

#### prompt 1

In [25]:
system_message = f"""
You are an expert in Natural Language Processing. Your task is to identify common Named Entities (NER) in a given text.
The possible common Named Entities (NER) types are exclusively: ({", ".join(labels)})."""

In [26]:
# One-shot approach
def assisstant_message():
    return f"""
EXAMPLE:
    Text: 'Soil Maps of Baradi Area, Baga Area, Jagla Area, Garabaria Area, Nabharan Area, Kaliganj Area and Arpara Area. || Soil Maps of Baradi Area, Baga Area, Jagla Area, Garabaria Area, Nabharan Area, Kaliganj Area and Arpara Area.'
    {{
        "Location": ["Baradi","Baga","Jagla","Garabaria","Nabharan","Kaliganj","Arpara"]
    }}
--"""

In [27]:
def user_message(text):
    return f"""
TASK:
    Text: {text}
"""

In [28]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def run_openai_task(labels, text):
    messages = [
          {"role": "system", "content": system_message},
          {"role": "assistant", "content": assisstant_message()},
          {"role": "user", "content": user_message(text=text)}
      ]

    response = openai.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
    )

    return {"model_response": response}

In [29]:
OpenAI_results_1={}
for key, value in labeled_dataset.items():
    result = run_openai_task(labels, value['text'])
    message = result['model_response'].choices[0].message.content
    matches = re.findall(r'\{[^}]*\}', message)
    dict_message = ast.literal_eval(matches[0])
    OpenAI_results_1[key]= set(dict_message['Location'])


{k: OpenAI_results_1[k] for k in list(OpenAI_results_1)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland',
  'South West Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'Forst'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa',
  'Maikona'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': {'Field'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': {'Influence of crop rotational position'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': {'Laguna Merin'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': {'Kilifi'}}

In [30]:
eval_OpenAI_1 = eval_test(labeled_dataset,OpenAI_results_1,return_table=True)
Global_eval['OpenAI - Prompt1']=eval_OpenAI_1
{print(f"{k}: {v}") for k, v in eval_OpenAI_1.items() if k != 'evaluation table'}

precision: 0.7153558052434457
recall: 0.7318007662835249
f1 score: 0.7234848484848484


{None}

In [31]:
eval_OpenAI_table = eval_OpenAI_1['evaluation table']
eval_OpenAI_table

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall
https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg,Soil Survey of Scotland 1:250 000 scale soil m...,{Scotland},{Scotland},{South West Scotland},{},1.0
c01d7dcc-ff05-4773-9c0b-6de920bb2434,Web Map Service of the dataset 'Piché evaporat...,{Forst},{Forst},{},{},1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg,"""""""Maikona. Sheet """"""""43"""""""". Metre Grid (East...",{East Africa},{East Africa},{Maikona},{},1.0
09adf1ad-a388-40e0-a61f-ba30d300cef9,Tree species composition of a landscape in nor...,{Germany},{Germany},{},{},1.0
6b664e1e-15ff-4bcb-8cd6-fef048a653a6,Validation and field application of a low-cost...,{},{},{Field},{},0.0
...,...,...,...,...,...,...
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/PK4000_2S0.jpg,"Soil Maps of Baradi Area, Baga Area, Jagla Are...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...","{Jagla, Nabharan, Baradi, Baga, Garabaria, Arp...",{},{},1.0
7f667e39-7f6a-40a9-8e1e-4efda1ecbfe3,Long-term fertilization field experiment Darms...,{Darmstadt},{Darmstadt},{Germany},{},1.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/XA1004_1.jpg,Carte 1. Indice Xerothermique de l || Carte 1....,{},{},{},{},0.0
10.1016/j.agwat.2020.106496,Effects of rainwater harvesting system on soil...,"{Loess Plateau, China}","{Loess Plateau, China}",{},{},1.0


#### prompt 2

In [32]:
system_message = """
You are an expert in Natural Language Processing. Your task is to identify common Named Entities (NER) in a given text.
The possible common Named Entities (NER) type is exclusively: "Location". However, only locations applicable to the research for which you receive the abstract and title.
If no location is found, an empty dict is returned. You also ensure that all entities you return can be mapped unambiguously."""

In [33]:
OpenAI_results_2={}
for key, value in labeled_dataset.items():
    result = run_openai_task(labels, value['text'])
    message = result['model_response'].choices[0].message.content
    matches = re.findall(r'\{[^}]*\}', message)
    dict_message = ast.literal_eval(matches[0])
    OpenAI_results_2[key]= set(dict_message['Location']) if dict_message else set()


{k: OpenAI_results_2[k] for k in list(OpenAI_results_2)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'Forst'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa',
  'Maikona'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': {'Laguna Merin'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': {'Kilifi'}}

In [34]:
eval_OpenAI_2 = eval_test(labeled_dataset,OpenAI_results_2,return_table=True)
Global_eval['OpenAI - Prompt2']=eval_OpenAI_2
{print(f"{k}: {v}") for k, v in eval_OpenAI_2.items() if k != 'evaluation table'}

precision: 0.7380952380952381
recall: 0.7126436781609196
f1 score: 0.7251461988304094


{None}

In [35]:
eval_OpenAI_table = eval_OpenAI_2['evaluation table']
eval_OpenAI_table.sort_values(by=['Recall'])[:20]

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall
https://esdac.jrc.ec.europa.eu//images/Eudasm/US/us21_3.jpg,General Soil Map Kauai Island Hawaii || Genera...,{Kauai Island Hawaii},{},"{Kauai Island, Hawaii}",{Kauai Island Hawaii},0.0
5392b39d-fd8c-4f6e-9522-c77f76a81941,"Effect of N fertilizers on yield, N-uptake, an...",{},{},{},{},0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/US/us_x19back.jpg,Soil Conservation Service Activities (sheet no...,{},{},{},{},0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/AF3000_1SO.jpg,Soil Map. The Jilga Valley. Vol. IV-13. || Soi...,{Jilga},{},{Jilga Valley},{Jilga},0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_sd4000_1.jpg,"Plantations, Species & Age Classes. Map 1. D.O...",{},{},{},{},0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_s5_goc.jpg,Afrika Kartenwerk Serie S || Afrika Kartenwerk...,{Afrika},{},{},{Afrika},0.0
2a3d1b59-350c-4db6-a28c-ed31407c1fa4,Kinetics of P-desorption from Fe- and Al-hydro...,{},{},{},{},0.0
500e7063-e828-4e82-b533-91c80a010817,Půdní mapa 1 : 50 000 || Soil map at a scale o...,{Czech},{},{Czech Geological Survey},{Czech},0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/XA2009_2SU.jpg,Lower Mekong Basin. Crop Suitability Map. Sout...,"{Lower Mekong Basin, Bassin Inferieur du Mekon...",{},{Mekong Basin},"{Lower Mekong Basin, Bassin Inferieur du Mekon...",0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/IR2002_1SO.jpg,Generalized Map of the Great Soil Groups of th...,{Khuzistan},{},{Khuzistan Plains},{Khuzistan},0.0


## Hybrid

Trying to finetune the zero-shot SpaCy models with 1 query to a LLM

In [36]:
sLM_label = GliNER_results
{k: sLM_label[k] for k in list(sLM_label)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': {'2020',
  'Forst',
  'agroforestry site'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': {'soil'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching',
  'Kuching. Sarawak'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': set()}

In [37]:
All_response = set()
for _,set_response in sLM_label.items():
    All_response.update(set_response)
print(len(All_response))
print(All_response)

300
{'site', 'Lany Farm', 'North Africa', 'Swabian', 'Tunisie', 'Jaguaribe/Natal', 'Afrika', 'Soilscores', 'Macapá', 'European agricultural soils', 'compost', 'Garabaria Area', 'Kazakh SSR', 'USSR', 'Timor', 'Africa', 'Buyuk Konya Havzasinin Toprak Haritasi.', 'Western Balkans', 'Tensift-basin', 'topsoil', 'Ceylon', 'longitude', 'research sites', 'loam silt', 'Pakistan', 'temperate forest subsoils', 'soil layers', 'Rhodesia', 'North Hessian Hill Country', 'Dornburg', 'Changwat Rayong', 'Nakhonnayok Province. Legend.', 'European', 'Nigeria', 'Crete', 'Southern Europe', 'Marsabit district', 'Bulgaria', 'construction sites', 'humusarmen', 'experimental sites', 'Pusté Jakartice', 'F5 field', 'Light Source', 'region', 'Arpara Area', 'West Africa', 'Formations Végétales et Domaine Forestier National de Madagascar', 'European Vineyard Soils', 'subduction‐zone regions', 'plot A12_12', 'Guadalajara', 'volcanic ash', 'Rostock', 'New Dykesite', 'Kulbacksliden', 'Saskatoon', 'China', 'land uses', 

In [38]:
system_message = """You are an expert in geographical entities with specialized knowledge of OpenStreetMap and Nominatim. Your task is to filter a list of geographical entities, retaining only those that have clearly delineatable locations, such as continents, countries, states, regions, counties, cities, towns, neighborhoods, and street names.

Be cautious when filtering: if a label could plausibly represent a specific, mappable location, it should be retained. Only exclude terms that are highly ambiguous or are clearly more likely to refer to general, non-delineatable locations. For example, "site" should only be excluded if it is clearly intended to represent a general area rather than a specific town or place.

Additionally, if a label is not directly mappable but can be slightly modified to match a specific location (e.g., "West Germany" to "Germany" or "London Area" to "London"), return a dictionary of these changes, where the original label is the key and the modified, mappable location is the value."""

In [39]:
assisstant_message= """
EXAMPLE:
    Text: {'soil,Belgium,creek,street,West-Vlaanderen,river,Wetland,Lucerne,Pays de la Loire,F4,Rijnland-Palts,London Area'} 
    {
        "delineatable": ["Belgium","West-Vlaanderen","Lucerne","Pays de la Loire","Rijnland-Palts"],
        "Non-delineatable": ["soil","creek","river","Wetland","F4"]
    }
"""
# assisstant_message= """
# EXAMPLE:
#     Text: {'soil,Belgium,creek,street,West-Vlaanderen,river,Wetland,Lucerne,Pays de la Loire,F4,Rijnland-Palts'} 
#     {
#         "delineatable": ["Belgium","West-Vlaanderen","Lucerne","Pays de la Loire","Rijnland-Palts","London"],
#         "Non-delineatable": ["soil","creek","river","Wetland","F4"],
#         "changes":{"London Area":"London"}
#     }
# """

In [40]:
def user_message(text):
    return f"""
TASK:
    Text: {text}
"""

In [41]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def run_openai_task_filter(text):
    messages = [
          {"role": "system", "content": system_message},
          {"role": "assistant", "content": assisstant_message},
          {"role": "user", "content": user_message(text=text)}
      ]

    response = openai.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
    )

    return {"model_response": response}

In [42]:
result_hybrid = run_openai_task_filter(All_response)

In [43]:
message_hybrid = result_hybrid['model_response'].choices[0].message.content
try:
    dict_message_hybrid = ast.literal_eval(message_hybrid)
except (ValueError, SyntaxError):
    matches_hybrid = re.findall(r'\{[^}]*\}', message_hybrid)
    dict_message_hybrid = ast.literal_eval(matches_hybrid[0])


In [None]:
if "changes" in list(dict_message_hybrid.keys()):
    sLM_label_1 = {key: {dict_message_hybrid["changes"].get(label, label) for label in value} for key, value in sLM_label.items()}
else:
    sLM_label_1=sLM_label

ok_entities = set(dict_message_hybrid['delineatable'])
cleaned_hybrid_dict = {key: value.intersection(ok_entities) for key, value in sLM_label_1.items()}
{k: cleaned_hybrid_dict[k] for k in list(cleaned_hybrid_dict)[:10]}

{'https://esdac.jrc.ec.europa.eu//Eudasm/UK/South-West_Scotland6.jpg': {'Scotland'},
 'c01d7dcc-ff05-4773-9c0b-6de920bb2434': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kemaiko.jpg': {'East Africa'},
 '09adf1ad-a388-40e0-a61f-ba30d300cef9': {'Germany'},
 '6b664e1e-15ff-4bcb-8cd6-fef048a653a6': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_shlu5c.jpg': {'St. Helena'},
 '70304e68-c369-4c1b-8d54-79f48bc182c4': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/uy13003_su.jpg': set(),
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/MY3006_4LU.jpg': {'Kuching'},
 'https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_kekili1.jpg': set()}

In [None]:
eval_Hybrid = eval_test(labeled_dataset,cleaned_hybrid_dict,return_table=True)
Global_eval['Hybrid']=eval_Hybrid
{print(f"{k}: {v}") for k, v in eval_Hybrid.items() if k != 'evaluation table'}

precision: 0.520618556701031
recall: 0.38697318007662834
f1 score: 0.443956043956044


{None}

In [None]:
eval_Hybrid_table = eval_Hybrid['evaluation table']
eval_Hybrid_table.sort_values(by=['Recall'])[:20]

Unnamed: 0,Text,Manual Labeled,Matching Labels,Non-Matching Labels,Not Found Labels,Recall
https://esdac.jrc.ec.europa.eu/resource-type/documents?page=4#4-21,The Relevance of Black Soils for Sustainable D...,{},{},{},{},0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_mgfvdfn.jpg,Formations Végétales et Domaine Forestier Nati...,{Madagascar},{},{},{Madagascar},0.0
10.1016/j.agwat.2020.106496,Effects of rainwater harvesting system on soil...,"{Loess Plateau, China}",{},{},"{Loess Plateau, China}",0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Africa/images/maps/download/afr_mmps.jpg,Província de Maputo. Carta de Solos. || Provín...,"{Província de Maputo, Maputo}",{},{},"{Província de Maputo, Maputo}",0.0
https://esdac.jrc.ec.europa.eu/resource-type/documents?page=4#4-1,Mercury in European topsoils: Anthropogenic so...,"{European, Europe}",{},{},"{European, Europe}",0.0
https://esdac.jrc.ec.europa.eu//Eudasm/NL/neth_x176.jpg,Bodemkaart van Nederland Rhenen || Bodemkaart ...,"{Rhenen, Nederland Rhenen}",{},{Netherlands},"{Rhenen, Nederland Rhenen}",0.0
b89da85c-84e8-4664-b138-86faeb5ae2f8,Long-Term Fertilization Trial Dikopshof Descri...,{Dikopshof},{},{},{Dikopshof},0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/Asia/images/maps/download/TH3004_1SO.jpg,Major Soil Characteristics Nakhonnayok Provinc...,"{Nakhonnayok Province, Nakhonnayok}",{},{},"{Nakhonnayok Province, Nakhonnayok}",0.0
https://esdac.jrc.ec.europa.eu//images/Eudasm/latinamerica/images/maps/download/mex_x1.jpg,Carta Edapologica - Estardos Unidos Mexicanos ...,"{Huatabampo, Mexicanos}",{},{},"{Huatabampo, Mexicanos}",0.0
10.1016/j.scitotenv.2020.144026,Treated wastewater reuse for irrigation: Pros ...,{},{},{},{},0.0


## Summary Evaluation Table

In [None]:
Global_eval