# Validation Dataset

We manually annotated the sample with the annotator tool Prodigy, and saved the annotations in a file called 'prodigy_manual_annotations' which is stored inside prodigy.db 

In [4]:
# Load libraries and set working directory to where files are stored
!pip install pandas
import pandas as pd


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.5 MB 5.5 MB/s eta 0:00:03
   -- ------------------------------------- 0.6/11.5 MB 7.9 MB/s eta 0:00:02
   ---- ----------------------------------- 1.2/11.5 MB 10.6 MB/s eta 0:00:01
   ------- -------------------------------- 2.0/11.5 MB 13.0 MB/s eta 0:00:01
   ------------ --------------------------- 3.5/11.5 MB 17.3 MB/s eta 0:00:01
   ------------------- -------------------- 5.6/11.5 MB 22.2 MB/s eta 0:00:01
   ---------------------------- ----------- 8.3/11.5 MB 27.8 MB/s eta 0:00:01
   ------------------------

In [None]:
import spacy
import spacy_transformers
from spacy.tokens import DocBin
import srsly

nlp = spacy.load("en_core_web_trf")  # Load the spaCy model that corresponds to your use

In [None]:

doc_bin = DocBin()  # This will store the Doc objects
data = srsly.read_jsonl('gold_standard.jsonl')  # Load your Prodigy annotations

for example in data:
    text = example['text']
    annotations = example['spans']
    doc = nlp.make_doc(text)  # Create a Doc object from text
    ents = []
    for span in annotations:
        start, end, label = span['start'], span['end'], span['label']
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents  # Assign the entity annotations
    doc_bin.add(doc)

# doc_bin.to_disk("./manual.spacy")  # Save the DocBin to disk

# Load the DocBin object
doc_bin = DocBin().from_disk("manual.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

### Model Entities

In [8]:
# These instead are the entities that were annotated by the model
ner_entities = pd.read_csv("ner_entities_only_relevant_domains.csv")

# Filter the 'ner_entities' DataFrame to keep only the rows where 'doc' matches the gold standard sample
ner_entities_sample = ner_entities[ner_entities['doc'].isin(sample['text'])]

from spacy.training import Example
docs = [nlp(text) for text in validation_ner]
doc_bin = DocBin(docs=docs)
doc_bin.to_disk("bert.spacy")

### Merge Datasets

In [6]:
docs_ner = DocBin().from_disk("bert.spacy")
docs_ner = list(docs_ner.get_docs(nlp.vocab))

def process_doc(doc):
    entities = {"GPE": [], "LOC": [], "FAC": [], "ORG": []}
    for ent in doc.ents:
        if ent.label_ in entities:  # Check if the entity type is one we want to collect
            entities[ent.label_].append(ent.text)
    return {"doc": doc.text, "entities": entities}

processed_docs_gold = [process_doc(doc) for doc in docs]
processed_docs_ner = [process_doc(doc) for doc in docs_ner]

# Convert the lists to DataFrames
df_gold = pd.DataFrame(processed_docs_gold)
df_ner = pd.DataFrame(processed_docs_ner)

# Merge the DataFrames on the 'doc' column
merged_df = pd.merge(df_gold, df_ner, on='doc', suffixes=('_gold', '_ner'))
# merged_df.to_csv("gold_standard_entities.csv")

## Toponym Recognition Evaluation

In [5]:
def calculate_metrics(df):
    df = df.copy()  # Avoid working with a slice of the original DataFrame

    def compare_entities(row):
        # Convert both arrays to sets for easier comparison
        gold_set = set(row['gold_entities_array'])
        machine_set = set(row['machine_entities_array'])
        
        # Calculate True Positives (TP), False Positives (FP), False Negatives (FN)
        tp = len(gold_set & machine_set)
        fp = len(machine_set - gold_set)
        fn = len(gold_set - machine_set)
        
        return pd.Series([tp, fp, fn])

    # Apply the function across the dataframe and create new columns
    df[['True Positives', 'False Positives', 'False Negatives']] = df.apply(compare_entities, axis=1)

    # Optionally, calculate precision, recall, and F1-score
    df['Precision'] = df['True Positives'] / (df['True Positives'] + df['False Positives'])
    df['Recall'] = df['True Positives'] / (df['True Positives'] + df['False Negatives'])
    
    # Replace NaN (which results from division by zero) with 1
    df['Precision'].replace(np.nan, 1, inplace=True)
    df['Recall'].replace(np.nan, 1, inplace=True)

    df['F1-Score'] = 2 * (df['Precision'] * df['Recall']) / (df['Precision'] + df['Recall'])

    return df

In [6]:
merged_df = pd.read_csv("hand_annotation_vs_spacy.csv")

In [7]:
import ast

# Convert JSON-like strings to dictionaries
def safe_literal_eval(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return {}  # or np.nan, or handle as needed

# Convert JSON strings in 'entities_gold' to dictionaries
merged_df['entities_gold'] = merged_df['entities_gold'].apply(safe_literal_eval)

# Convert JSON strings in 'entities_gold' to dictionaries
merged_df['entities_ner'] = merged_df['entities_ner'].apply(safe_literal_eval)

In [8]:
# Initialize empty DataFrame to store all results
articles_entities = pd.DataFrame()

def preprocess_entities(df, gold_col='entities_gold', machine_col='entities_ner'):

    # Extract and create GPE, LOC, FAC columns from 'entities_gold'
    for etype in ['GPE', 'LOC', 'FAC']:
        df[etype] = df[gold_col].apply(lambda x: x.get(etype, []))

    # Extract and create machine_GPE, machine_LOC, machine_FAC columns from 'entities_ner'
    for etype in ['GPE', 'LOC', 'FAC']:
        df[f'machine_{etype}'] = df[machine_col].apply(lambda x: x.get(etype, []))
    
    # Convert empty lists to NaN for both sets of columns
    df[['GPE', 'LOC', 'FAC']] = df[['GPE', 'LOC', 'FAC']].map(lambda x: np.nan if not x else x)
    df[[f"machine_{etype}" for etype in ['GPE', 'LOC', 'FAC']]] = df[[f"machine_{etype}" for etype in ['GPE', 'LOC', 'FAC']]].map(lambda x: np.nan if not x else x)

    # Create arrays (lists) for the gold and machine entities
    df['gold_entities_array'] = df.apply(lambda row: [row['GPE'], row['LOC'], row['FAC']], axis=1)
    df['machine_entities_array'] = df.apply(lambda row: [row['machine_GPE'], row['machine_LOC'], row['machine_FAC']], axis=1)
    
    # Flatten the arrays and remove NaNs
    df['gold_entities_array'] = df['gold_entities_array'].apply(lambda x: [item for sublist in x if isinstance(sublist, list) for item in sublist])
    df['machine_entities_array'] = df['machine_entities_array'].apply(lambda x: [item for sublist in x if isinstance(sublist, list) for item in sublist])
    
    # Select only the relevant columns
    df = df[['doc', 'gold_entities_array', 'machine_entities_array']]

    return df

def calculate_metrics(df):
    df = df.copy()  # Avoid working with a slice of the original DataFrame

    def compare_entities(row):
        # Convert both arrays to sets for easier comparison
        gold_set = set(row['gold_entities_array'])
        machine_set = set(row['machine_entities_array'])
        
        # Calculate True Positives (TP), False Positives (FP), False Negatives (FN)
        tp = len(gold_set & machine_set)
        fp = len(machine_set - gold_set)
        fn = len(gold_set - machine_set)
        
        return pd.Series([tp, fp, fn])

    # Apply the function across the dataframe and create new columns
    df[['True Positives', 'False Positives', 'False Negatives']] = df.apply(compare_entities, axis=1)

    # Optionally, calculate precision, recall, and F1-score
    df['Precision'] = df['True Positives'] / (df['True Positives'] + df['False Positives'])
    df['Recall'] = df['True Positives'] / (df['True Positives'] + df['False Negatives'])
    
    # Replace NaN (which results from division by zero) with 1
    df['Precision'].replace(np.nan, 1, inplace=True)
    df['Recall'].replace(np.nan, 1, inplace=True)

    df['F1-Score'] = 2 * (df['Precision'] * df['Recall']) / (df['Precision'] + df['Recall'])

    return df

In [9]:
import numpy as np
processed_df = preprocess_entities(merged_df)
metrics = calculate_metrics(processed_df)

# Sum TP, FP, FN across all documents
total_tp = metrics['True Positives'].sum()
total_fp = metrics['False Positives'].sum()
total_fn = metrics['False Negatives'].sum()

# Micro-averaged (overall) precision, recall, and F1-score
overall_precision = total_tp / (total_tp + total_fp)
overall_recall = total_tp / (total_tp + total_fn)
overall_f1_score = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall)

print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1-Score: {overall_f1_score}")


Overall Precision: 0.9659735349716446
Overall Recall: 0.9092526690391459
Overall F1-Score: 0.9367552703941339


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Precision'].replace(np.nan, 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Recall'].replace(np.nan, 1, inplace=True)


In [30]:
processed_df

Unnamed: 0,doc,gold_entities_array,machine_entities_array
0,Rishi Sunak departs 10 Downing Street ahead of...,[],[]
1,KNOWLE and Dorridge CC suffered a heavy 160-ru...,[],[]
2,now Wheldon Infant School and Nursery on Franc...,"[Castleford, Francis Street]","[Castleford, Francis Street]"
3,Council leaders in West Norfolk are set to exa...,"[West Norfolk, West Norfolk]",[West Norfolk]
4,Our 10 photographs from the ChronicleLive arch...,"[Newcastle, Newcastle, Newcastle, Newcastle, N...","[Newcastle, Newcastle, Newcastle, Newcastle, N..."
...,...,...,...
95,Antrim and Newtownabbey Borough Council has al...,"[Antrim, Newtownabbey, Newtownabbey, Antrim, N...","[Antrim, Newtownabbey, Newtownabbey, Antrim, N..."
96,Young Charlton Athletic forward Ryan Viggars h...,"[Wales, Bloomfields]","[Wales, Bloomfields]"
97,The following planning applications have been ...,"[Newark, Newark, Balderton, Bilsthorpe, Morton...","[Newark, Newark, Balderton, Bilsthorpe, Morton..."
98,"Easier access to your trusted, local news. Sub...","[Rothes, Wick, Mosset Park]","[Rothes, Wick, Mosset Park]"


## Toponym Disambiguation Evaluation

In [11]:
import numpy as np
import pandas as pd
import ast  # For safely evaluating string representations of dictionaries

# Load CSV file
merged_df = pd.read_csv("gold_standard_entities.csv")

def preprocess_entities(df, gold_col='entities_gold', machine_col='entities_ner'):
    # Ensure that 'entities_gold' and 'entities_ner' columns are dictionaries
    df[gold_col] = df[gold_col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df[machine_col] = df[machine_col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Extract and create GPE, LOC, FAC, ORG columns from 'entities_gold'
    for etype in ['GPE', 'LOC', 'FAC', 'ORG']:
        df[etype] = df[gold_col].apply(lambda x: x.get(etype, []))

    # Extract and create machine_GPE, machine_LOC, machine_FAC, machine_ORG columns from 'entities_ner'
    for etype in ['GPE', 'LOC', 'FAC', 'ORG']:
        df[f'machine_{etype}'] = df[machine_col].apply(lambda x: x.get(etype, []))

    # Convert empty lists to NaN for both sets of columns
    for etype in ['GPE', 'LOC', 'FAC', 'ORG']:
        df[etype] = df[etype].apply(lambda x: np.nan if not x else x)
        df[f'machine_{etype}'] = df[f'machine_{etype}'].apply(lambda x: np.nan if not x else x)

    # Set 'ORG' to NaN where any of 'FAC', 'LOC', or 'GPE' is not NaN (for both gold and machine entities)
    df['ORG'] = df.apply(lambda row: np.nan if row[['FAC', 'LOC', 'GPE']].notna().any() else row['ORG'], axis=1)
    df['machine_ORG'] = df.apply(lambda row: np.nan if row[[f'machine_{etype}' for etype in ['FAC', 'LOC', 'GPE']]].notna().any() else row['machine_ORG'], axis=1)

    # Create arrays (lists) for the gold and machine entities
    df['gold_entities_array'] = df.apply(lambda row: [row['GPE'], row['LOC'], row['FAC'], row['ORG']], axis=1)
    df['machine_entities_array'] = df.apply(lambda row: [row['machine_GPE'], row['machine_LOC'], row['machine_FAC'], row['machine_ORG']], axis=1)

    # Flatten the arrays and remove NaNs
    df['gold_entities_array'] = df['gold_entities_array'].apply(lambda x: [item for sublist in x if isinstance(sublist, list) for item in sublist])
    df['machine_entities_array'] = df['machine_entities_array'].apply(lambda x: [item for sublist in x if isinstance(sublist, list) for item in sublist])

    # Select only the relevant columns
    df = df[['doc', 'gold_entities_array', 'machine_entities_array']]

    return df

# Process the DataFrame
processed_df = preprocess_entities(merged_df)

In [12]:
# now it's on the whole 100 article sample
processed_df_long = processed_df.explode('machine_entities_array')
processed_df_long = processed_df_long.drop_duplicates(subset=['doc', 'machine_entities_array'], keep='first')
print(len(processed_df_long))

# List of entities to remove (while above we did that manually, since then I created this csv file with a longer list of entities to remove)
entities_to_remove = pd.read_csv("temp/entities_to_remove.csv")
entities_to_remove = entities_to_remove['value'].tolist()

# Filter the DataFrame
processed_df_long = processed_df_long[~processed_df_long['machine_entities_array'].isin(entities_to_remove)]
print(len(processed_df_long))

processed_df_long['toponym_lowercased'] = processed_df_long['machine_entities_array'].str.lower()
# Convert list-like columns to strings (if applicable)
processed_df_long['machine_entities_array'] = processed_df_long['machine_entities_array'].apply(lambda x: ','.join(x) if isinstance(x, list) else x)
processed_df_long['gold_entities_array'] = processed_df_long['gold_entities_array'].apply(lambda x: ','.join(x) if isinstance(x, list) else x)

566
543


In [111]:
# check if liverpool is in the list to_remove
print('birmingham' in entities_to_remove)

True


In [13]:
# Merge with candidates
toponym_candidates = pd.read_csv("toponym_candidates_with_lads.csv")
candidates_pool = processed_df_long.merge(toponym_candidates, left_on='toponym_lowercased', right_on='values', how='left')
# group by machine_entity_array and doc and list the unique LAD24NM (do not drop other columns)
grouped_df = candidates_pool.groupby(['machine_entities_array', 'doc', 'toponym_lowercased', 'clean_name', 'gold_entities_array']).agg({
    'LAD24NM': list,       # Create list of LAD24NM
    'POSTCODE_DISTRICT': list,      # Create list of geometry
}).reset_index()

# for each doc, assign a unique ID
grouped_df['doc_id'] = grouped_df.groupby('doc').ngroup()
# grouped_df.to_csv("candidates_classification_test_on_100_articles.csv")

  toponym_candidates = pd.read_csv("temp/toponym_candidates_with_lads.csv")


Prepare data for LabelStudio / Prodigy

In [19]:
candidates_pool_simple = candidates_pool[['doc', 'machine_entities_array', 'LAD24NM', 'POSTCODE_DISTRICT', 'Latitude', 'Longitude']]
candidates_pool_simple
# group by doc and machine_entities_array and remove duplicates
grouped_candidates_pool = candidates_pool_simple.drop_duplicates(subset=['doc', 'machine_entities_array', 'LAD24NM'], keep='first')
grouped_candidates_pool

Unnamed: 0,doc,machine_entities_array,LAD24NM,POSTCODE_DISTRICT,Latitude,Longitude
0,Rishi Sunak departs 10 Downing Street ahead of...,10 Downing Street,Westminster,SW1A,51.504607,-0.132177
1,Rishi Sunak departs 10 Downing Street ahead of...,Labour,North Yorkshire,BD24,54.079897,-2.284160
2,Rishi Sunak departs 10 Downing Street ahead of...,Labour,Thanet,CT9,51.382937,1.389221
3,Rishi Sunak departs 10 Downing Street ahead of...,Labour,Dorset,DT2,50.749692,-2.450836
4,Rishi Sunak departs 10 Downing Street ahead of...,Labour,Boston,PE20,52.931083,-0.103562
...,...,...,...,...,...,...
6097,A group of around 20 women and men protested a...,the Town Hall,Gwynedd,LL48,52.936185,-4.069524
6098,A group of around 20 women and men protested a...,the Town Hall,West Northamptonshire,NN12,52.126196,-0.999142
6099,A group of around 20 women and men protested a...,the Town Hall,Cherwell,OX16,52.062293,-1.342842
6100,A group of around 20 women and men protested a...,the Town Hall,West Oxfordshire,OX18,51.762965,-1.591496


In [None]:
grouped_candidates_pool.to_csv("candidates_pool.csv")

In [7]:
import pandas as pd
# in R, in Toponym_disambiguation.R I added metadata to this file, and will now load it back
grouped_candidates_pool = pd.read_csv("annotators_data.csv")

In [8]:
grouped_candidates_pool

Unnamed: 0,...1,doc,machine_entities_array,LAD24NM,POSTCODE_DISTRICT,Latitude,Longitude,group_articles,domain,date,Main_LAD
0,0,Rishi Sunak departs 10 Downing Street ahead of...,10 Downing Street,Westminster,SW1A,51.504607,-0.132177,article-1506227,scotsman.com,2022-11-15,City of Edinburgh
1,1,Rishi Sunak departs 10 Downing Street ahead of...,Labour,North Yorkshire,BD24,54.079897,-2.284160,article-1506227,scotsman.com,2022-11-15,City of Edinburgh
2,2,Rishi Sunak departs 10 Downing Street ahead of...,Labour,Thanet,CT9,51.382937,1.389221,article-1506227,scotsman.com,2022-11-15,City of Edinburgh
3,3,Rishi Sunak departs 10 Downing Street ahead of...,Labour,Dorset,DT2,50.749692,-2.450836,article-1506227,scotsman.com,2022-11-15,City of Edinburgh
4,4,Rishi Sunak departs 10 Downing Street ahead of...,Labour,Boston,PE20,52.931083,-0.103562,article-1506227,scotsman.com,2022-11-15,City of Edinburgh
...,...,...,...,...,...,...,...,...,...,...,...
3862,6097,A group of around 20 women and men protested a...,the Town Hall,Gwynedd,LL48,52.936185,-4.069524,article-151086,redditchadvertiser.co.uk,2021-10-08,Redditch
3863,6098,A group of around 20 women and men protested a...,the Town Hall,West Northamptonshire,NN12,52.126196,-0.999142,article-151086,redditchadvertiser.co.uk,2021-10-08,Redditch
3864,6099,A group of around 20 women and men protested a...,the Town Hall,Cherwell,OX16,52.062293,-1.342842,article-151086,redditchadvertiser.co.uk,2021-10-08,Redditch
3865,6100,A group of around 20 women and men protested a...,the Town Hall,West Oxfordshire,OX18,51.762965,-1.591496,article-151086,redditchadvertiser.co.uk,2021-10-08,Redditch


In [99]:
import pandas as pd
import json

def create_labelstudio_json(df):
    tasks = []

    # Group by 'doc' and 'machine_entities_array'
    grouped = df.groupby(['doc', 'machine_entities_array'])

    for (doc, machine_entity), group in grouped:
        # Collect all LAD24NM options for this doc-entity pair
        options = [
            {"value": row['LAD24NM']} for _, row in group.iterrows() if pd.notna(row['LAD24NM'])
        ]
        
        # Add the additional fixed options, formatted correctly
        additional_options = [
            {"value": "LAD not in options"},
            {"value": "Entity is not a location"},
            {"value": "Entity is a location outside the UK"},
            {"value": "Entity spans across several districts (e.g., a region)"},
            {"value": "Unsure"}
        ]
        
        # Combine the options
        options += additional_options
        
        # Create the task in Label Studio format
        task = {
            "data": {
                "id": len(tasks),  # Assign a unique id based on the number of tasks
                "doc": doc,
                "machine_entities_array": machine_entity,
                "main_LAD": group['Main_LAD'].iloc[0],  # Take the first instance for metadata
                "domain": group['domain'].iloc[0],
                "options": options  # This is a list of dictionaries
            }
        }
        tasks.append(task)
    
    # Convert list of tasks to JSON
    return json.dumps(tasks, indent=4)

# Example usage
grouped_candidates_pool.fillna('Not linked to anything', inplace=True)  # Replace NaN values
labelstudio_json = create_labelstudio_json(grouped_candidates_pool)

# Save the JSON to a file for Label Studio
with open('labelstudio_tasks.json', 'w') as f:
    f.write(labelstudio_json)

Enhancing with entity position

In [100]:
import random 

def extract_filtered_entities_with_positions(doc):
    """
    Extract entity positions from the filtered machine entities array.
    
    Args:
    doc (spacy Doc): The original spacy document.
    
    Returns:
    list: A list of entity data, including text, start, end, label, and other metadata.
    """
    filtered_entities = []
    entities = {"GPE": [], "LOC": [], "FAC": [], "ORG": []}
    for ent in doc.ents:
        if ent.label_ in entities:  # Check if the entity type is one we want to collect
            entity_data = {
                "id": ent.text,  # unique identifier can be ent.text or generate one dynamically
                "doc": doc.text,
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "value": {
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "text": ent.text,
                    "labels": [ent.label_]  # GPE, LOC, FAC, ORG, etc.
                }
            }
            filtered_entities.append(entity_data)

    # Filter out ORG if FAC, LOC, or GPE is present
    has_fac_loc_gpe = any(ent["value"]["labels"][0] in ["FAC", "LOC", "GPE"] for ent in filtered_entities)
    if has_fac_loc_gpe:
        filtered_entities = [ent for ent in filtered_entities if ent["value"]["labels"][0] != "ORG"]

    return filtered_entities


def process_docs_for_labelstudio(docs_ner, model_version="en_core_web_trf"):
    """
    Convert spacy docs to LabelStudio JSON format.
    
    Args:
    docs_ner (list): List of spacy Docs.
    model_version (str): The version of the model used.
    score (float): The model confidence score (if available, otherwise default 0.0).
    
    Returns:
    list: A list of tasks ready to be exported to LabelStudio format.
    """
    tasks = []
    
    for doc in docs_ner:
        # Extract only entities that passed filtering
        entity_data = extract_filtered_entities_with_positions(doc)
        
        task = {
            "data": {
                "doc": doc.text
            },
            "predictions": [
                {
                    "model_version": model_version,
                    "result": entity_data
                }
            ]
        }
        tasks.append(task)
    
    return tasks

# Example usage with spacy Docs:
# Load your spacy docs
docs_ner = DocBin().from_disk("bert.spacy")
docs_ner = list(docs_ner.get_docs(nlp.vocab))

labelstudio_tasks = process_docs_for_labelstudio(docs_ner)

# Save the JSON to a file for Label Studio
with open('labelstudio_predictions.json', 'w') as f:
    json.dump(labelstudio_tasks, f, indent=4)

In [101]:
import json

# Load the two JSON files
with open('labelstudio_predictions.json', 'r') as f1:  # length 96
    predictions_data = json.load(f1)

with open('labelstudio_tasks.json', 'r') as f2:  # length 543
    options_metadata_data = json.load(f2)

# Create a lookup dictionary from the predictions based on 'doc'
predictions_lookup = {entry['data']['doc']: entry for entry in predictions_data}

# Merge options/metadata with predictions based on the common 'doc'
merged_data = []

for task_entry in options_metadata_data:
    doc_text = task_entry['data']['doc']
    
    # Start with the task entry
    merged_entry = task_entry.copy()

    # Check if the doc exists in the predictions JSON
    if doc_text in predictions_lookup:
        # Merge the corresponding prediction entry with the task entry
        merged_entry['predictions'] = predictions_lookup[doc_text]['predictions']  # Copy the predictions
    else:
        merged_entry['predictions'] = []  # If no prediction found, initialize as empty list

    merged_data.append(merged_entry)

# Remove 'doc' from 'predictions' results in merged_data
for prediction in merged_data:
    if 'predictions' in prediction:  # Ensure 'predictions' key exists
        for result in prediction['predictions']:
            if 'result' in result:  # Ensure 'result' key exists
                for res in result['result']:
                    if "doc" in res:
                        del res["doc"]

# Replace with _ any blank space from the entry id
for entry in merged_data:
    for prediction in entry['predictions']:
        for result in prediction['result']:
            result['id'] = result['id'].replace(' ', '_')

# Save the merged data to a new JSON file
with open('merged_data.json', 'w') as outfile:
    json.dump(merged_data, outfile, indent=4)

print("Merge completed successfully!")

Merge completed successfully!


In [102]:
import json

# Load the merged data
with open('merged_data.json', 'r') as infile:
    merged_data = json.load(infile)

# Function to update labels based on machine_entities_array
def update_labels(merged_data):
    for item in merged_data:
        machine_entities_array = item['data']['machine_entities_array']
        for prediction in item['predictions']:
            for result in prediction['result']:
                if result['value']['text'] == machine_entities_array:
                    result['value']['labels'] = ["Entity"]
                else:
                    result['value']['labels'] = ["Other"]
    return merged_data

# Update the labels in merged_data
updated_data = update_labels(merged_data)

# Save the updated data to a new JSON file
with open('updated_merged_data.json', 'w') as outfile:
    json.dump(updated_data, outfile, indent=4)

Two annotators have annotated the above JSON file, 543 tasks, in Label Studio. Now we import the results and calculate Cohen's Kappa.

In [147]:
with open('annotated_disamb.json', 'r') as infile:
    annotation_results = json.load(infile)

In [148]:
annotation_results

def extract_annotations(annotation_results):
    annotations = {}
    for item in annotation_results:
        annotator = item['annotator']
        if annotator not in annotations:
            annotations[annotator] = []
        annotations[annotator].append(item['options'])
    return annotations

# Extract the annotations
extracted_annotations = extract_annotations(annotation_results)

# Split in two arrays
y1 = extracted_annotations['oscarbovin@hotmail.se']
y2 = extracted_annotations['s.bisiani@surrey.ac.uk']

import sklearn
from sklearn.metrics import cohen_kappa_score

# Convert the lists to strings
y1 = [str(item) for item in y1]
y2 = [str(item) for item in y2]

cohen_kappa_score(y1, y2)

In [155]:
# filter for elements in y1 and y2 that are not the same, comparing them by position
diffs = [(i, y1[i], y2[i]) for i in range(len(y1)) if y1[i] != y2[i]]
diffs_csv = pd.DataFrame(diffs, columns=['index', 'y1', 'y2'])
diffs_csv.head(27)

# Create a dictionary with the correct values
correct_values = {
    42: 'LAD not in options',
    43: 'Entity spans across several districts (e.g., a region)',
    57: 'South Lanarkshire',
    67: 'Gloucester',
    69: 'LAD not in options',
    77: 'Entity spans across several districts (e.g., a region)',
    95: 'Wandsworth',
    102: 'Entity spans across several districts (e.g., a region)',
    113: 'Entity is a location outside the UK',
    159: 'Entity spans across several districts (e.g., a region)',
    183: 'Entity is a location outside the UK',
    193: 'Entity is not a location',
    218: 'Entity spans across several districts (e.g., a region)',
    222: 'Entity spans across several districts (e.g., a region)',
    230: 'Entity spans across several districts (e.g., a region)',
    232: 'Armagh City, Banbridge and Craigavon',
    279: 'LAD not in options',
    309: "King's Lynn and West Norfolk",
    313: 'Rugby',
    413: 'Entity is not a location',
    414: 'Newark and Sherwood',
    454: 'Entity is not a location',
    455: 'Entity is not a location',
    457: 'Entity spans across several districts (e.g., a region)',
    505: 'Entity is not a location',
    507: 'North Yorkshire',
    528: 'LAD not in options'
}

# make a new array with the correct values
corrected_y2 = y2.copy()
for index, value in correct_values.items():
    corrected_y2[index] = value

# create df from original annotated data json
annotated_data = pd.DataFrame(annotation_results)
annotated_data = annotated_data.groupby('id').sample(n=1, random_state=1).reset_index(drop=True)
annotated_data['annotators_choice'] = corrected_y2
annotated_data = annotated_data.drop(columns=['options', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time', 'agreement'])
annotated_data.to_csv("annotators_disambiguated_data.csv")

In [None]:
import json

# Function to extract entity details
def extract_entities(row):
    entities = []
    for entity_type, entity_list in row['entities'].items():
        for entity in entity_list:
            start_pos = row['doc'].find(entity)
            if start_pos != -1:
                end_pos = start_pos + len(entity)
                entities.append({
                    "start": start_pos,
                    "end": end_pos,
                    "entity_type": entity_type
                })
    return entities

# Create a dictionary to store the JSON structure
json_data = {}

# Iterate through the DataFrame and populate the dictionary
for index, row in ner_entities_sample.iterrows():
    json_data[row['doc']] = extract_entities(row)

# Save the dictionary as a JSON file
with open('entities.json', 'w') as json_file:
    json.dump(json_data, json_file, indent=4)

print("JSON file created successfully!")