In [1]:
import json
import pickle
import numpy as np
import pandas as pd

# Paragraphs

In [2]:
# open the paragraphs file if you don't want to run the above again

filename = r"C:\Users\conix\Documents\Corpus\methods_corpus\methods_paragraph_corpus.pickle"
with open(filename, "rb") as f:
    methods_paras = pickle.load(f)


filename = r"C:\Users\conix\Documents\Corpus\methods_sentencesrawtext.pickle"        
with open(filename, "rb") as f:
    methods_sentences = pickle.load(f)

print(f'total number of methods paragraphs: {len(methods_paras)}')
print(f'total number of methods sentences: {len(methods_sentences)}')

total number of methods paragraphs: 44853
total number of methods sentences: 641892


# Helper functions

In [3]:
# Function to create a list of the columns from the dictionary with the classification


def extract_strings(d):
    strings = []
    if isinstance(d, dict):
        for key, value in d.items():
            strings.append(key)
            strings.extend(extract_strings(value))
    elif isinstance(d, list):
        for item in d:
            strings.extend(extract_strings(item))
    elif isinstance(d, str):
        strings.append(d)
    return strings

# read json file with the annotations


def read_jsonfile(filepath):
    data = []
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Error decoding line: {line}")
    except FileNotFoundError:
        print(f"The file {filepath} was not found.")
    return data

# Function to get the labels from the jsonl


def extract_terms_after_colons(data):
    terms = []
    
    def traverse_dict(d):
        for key, value in d.items():
            if isinstance(value, dict):
                traverse_dict(value)  # Recursively traverse if value is a dict
            else:
                # Extract the part after ':::'
                if isinstance(value, str) and ':::' in value:
                    terms.append(value.split(':::')[-1])
    
    traverse_dict(data)
    return terms

# Function to update the df for higher categories (in place)


def recursive_update(df, classif):
    # Go through each item in the classification
    for key, value in classif.items():
        if isinstance(value, list):
            # First process subcategories (go deeper into the hierarchy)
            for v in value:
                if isinstance(v, dict):
                    # Recursively update subcategories first
                    recursive_update(df, v)
            
            # After processing subcategories, update the current category
            sub_categories = [v if isinstance(v, str) else list(v.keys())[0] for v in value]
            df.loc[df[sub_categories].eq(1).any(axis=1), key] = 1
    
    # Return the DataFrame to allow for chaining if needed
    return df

# Function to turn the jsonl into a complete df


def json_to_df(data, classif):
    # Get the columns from the classification
    all_strings = extract_strings(classif)
    extra_columns = ["id", "displayed_text"]
    columns = all_strings + extra_columns

    # Create a list to hold all rows
    rows = []

    # Loop over the data and create a new row for each entry
    for i in data:
        row = {}  # Dictionary for the current row

        # Extract labels
        labels = extract_terms_after_colons(i['label_annotations'])

        # Set values for all columns in all_strings
        for cat in all_strings:
            row[cat] = 1 if cat in labels else 0

        # Add extra columns
        row["displayed_text"] = i["displayed_text"]
        row['id'] = i['id']

        # Append the row to the list of rows
        rows.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(rows, columns=columns)

    # Update higher categories using recursive update (ensure in-place modification)
    df = recursive_update(df, classif)

    return df

# Our classification

## Old

In [4]:
# terms and classification used for the first batch of annotations
# for the updated terms, see classifbelow
classif_old = {
    "Phenotype": [
        {
            "Phen datatypes": [
                {"MORPH": 
               [ "quant morph","Interbreeding_morph",
                {
                    "qual morph": [
                        "color_pattern", 
                        "Shape", 
                        "Texture", 
                        "Ultrastructural", 
                        
                    ]
                }]},
                {
                    "BEHAV": [
                        "Acoustic data", 
                        "feeding", 
                        "Mating behavior"
                    ]
                },
                "ECOLOGY"
                            ]
        },
        {
            "Phen processing": [
                "IMAGING",
                "SAMPLING",
                "STORAGE"
            ]
        },
        {
            "Phen analysis": [
                "phen_regression",
                "phen_pylo"
            ]
        }
    ],
    "Genotype": [
        {
            "genot datatypes": [
                "Nuclear DNA",
                "Organellar DNA",
                "Transcriptomic data",
                "Proteomic data",
                "Microsatellites",
                "Whole genomes",
                "Exomes",
                "Genome-wide studies/SNPs",
                "Epigenetic data",
                "eDNA",
                {
                    "BIOCHEM": [
                        "Chemotax", 
                        "Cytotax"
                    ]
                }
            ]
        },
        {
            "gen processing": [
                {"SEQUENCING": ["gen1", 
                "gen2", 
                "gen3",]},
                "other"
                
                
            ]
        },
        {
            "gen analysis": [
                {
                    "GEN_NON_PHYLO": [
                        "Distance", 
                        "haplowebs", 
                        "Fixed alt character states", 
                        "Clustering", 
                        "fen_Interbreeding"
                    ]
                },
                 "PHYLO_SD",
                    
                {
                    "PHYLO_TREE": [
                        "Distance_based", 
                        "Character_based",
                         "Consensus_supertree",
                    ]
                },
               
                "Other",
                "ML_methods"
            ]
        }
    ],
    "Singletons": [
        "Interbreeding", 
        "spec justification", 
        "Phylogenetic", 
        "Specimen storage location", 
        "sampling location", 
        "abbreviations & terms", 
        "nomenclature & history",
        "BIOGEO"
    ]
}

## New

In [5]:
classif = {
    "PHENOTYPE": [
        {
            "Phen_data": [
                {"MORPH": 
               [ "quant_morph","interbr_morph",
                {
                    "qual_morph": [
                        "color_pattern", 
                        "shape", 
                        "texture", 
                        "ultrastruct", 
                        
                    ]
                }]},
                {
                    "BEHAV": [
                        "acoustic", 
                        "feeding", 
                        "mating"
                    ]
                },
                "ECOLOGY"
                            ]
        },
        {
            "Phen_proc": [
                "IMAGING",
                "SAMPLING",
                "STORAGE"
            ]
        },
        {
            "Phen_analysis": [
                "phen_nonphylo",
                "phen_pylo"
            ]
        }
    ],
    "GENOTYPE": [
        {
            "Gen_data": [
                "nuclear",
                "organellar",
                "transcriptomic",
                "proteomic",
                "tandem_repeats",
                "whole_genomes",
                "exomes",
                "genome_wide",
                "epigenetic",
                "eDNA",
                {
                    "BIOCHEM": [
                        "chemotax", 
                        "cytotax"
                    ]
                }
            ]
        },
        {
            "Gen_proc": [
                {"SEQUENCING": ["gen1", 
                "gen2", 
                "gen3"]},
                "genproc_other"
                
                
            ]
        },
        {
            "Gen_analysis": [
                {
                    "GEN_NON_PHYLO": [
                        "distance", 
                        "haplowebs", 
                        "fixed_alt_states", 
                        "clustering", 
                        "gen_interbr"
                    ]
                },
                 "PHYLO_SD",
                    
                {
                    "PHYLO_TREE": [
                        "distance_based", 
                        "character_based",
                         "consensus_supertree",
                    ]
                },
                "MACHINE_LEARNING"
            ]
        }
    ],
    "Singletons": [
        "interbreeding", 
        "rank_just", 
        "phylogenetic", 
        "specimen_storage_loc", 
        "sampling_loc", 
        "abbrev_terms", 
        "nomenclat_history",
        "biogeo"
    ]
}

# Annotate Data

We annotate in potato, using the immigration framing template with an adapted config file (implementing our classification)

Category retention:
- keep only those that after batch 1 have at least 10 instances (i.e. 2.5%). Dropped: 'gen_interbr', 'tandem_repeats', 'haplowebs', 'fixed_alt_states', 'ultrastruct', 'machine_learning', 'chemotax', 'gen2','texture', 'cytotax', 'clustering', 'biochem', 'proteomic', 'feeding','mating', 'transcriptomic', 'whole_genomes', 'exomes', 'genome_wide', 'epigenetic', 'edna', 'gen3', 'genproc_other', 'consensus_supertree'

## load newly annotated data from jsonl

This process includes reading the data and making sure that higher level categories are "1" if their leaf categories are "1"

In [4]:
## first batch, 403 done by me of which 100 also by Marlies
batch1 = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\corrected_dfx.csv", sep = ";")
batch1_double = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\checked_samples.csv", sep = ";")

# add a column to show whether it has been double-checked by Marlies
batch1['checked'] = np.where(batch1['id'].isin(batch1_double['id'].values), 1, 0)
batch1 = batch1.drop(columns = ['Unnamed: 0','Phylo_singlelocus', 'Phylo_multilocus','Other', 'Revisions', 'bad sample', 'gen alaysis' ])
# add categroy that was added
batch1['SEQUENCING'] = np.where(((batch1['gen1'] == 1) |(batch1['gen2'] == 1) |(batch1['gen3'] == 1)),1,0)

# Marlies batch 2 (300)
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\annotated_instances_marlies2_stijn.jsonl"
filepath2 = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\annotated_instances_marlies2_marlies.jsonl"
data = read_jsonfile(filepath)
batch2_m = json_to_df(data, classif)
# batch2_m['checked'] = 0
data2 = read_jsonfile(filepath2)
batch2_m2 = json_to_df(data2, classif)

## Laura batch 2 (300)
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\laura2\annotated_instances_laura2_stijn.jsonl"
filepath_laura =  r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\laura2\annotated_instances_laura2_laura.jsonl"
data = read_jsonfile(filepath_laura)
batch2_l = json_to_df(data, classif)
batch2_l['checked'] = 0
data = read_jsonfile(filepath)
batch2_ls = json_to_df(data, classif)
batch2_ls['checked'] = 0

## Stijn batch 2 (100)
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\stijn2\annotated_instances_STIJN2.tsv"
df = pd.read_csv(filepath, sep = '\t')

terms = extract_strings(classif)
df.columns = ['user', 'id', 'displayed_text'] + [i.split(":::")[1] for i in df.columns[3:]]


missing = [j for j in extract_strings(classif) if j not in df.columns]
df[missing] = np.nan
df[terms] = df[terms].notna().astype(int)
batch2_s = recursive_update(df, classif)
batch2_s = batch2_s.drop(columns = 'user')
batch2_s['checked'] = 0

# active learning

filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fourthTry\active_learning_annotated1.jsonl"
data = read_jsonfile(filepath)
batch4_1 = json_to_df(data, classif)
batch4_1['checked'] = 0

filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fourthTry\active_learning_annotated2.jsonl"
data = read_jsonfile(filepath)
batch4_2 = json_to_df(data, classif)
batch4_2['checked'] = 0

# laura batch 3

filepath_laura =  r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\thirdTry\laura\annotated_instances_l3.jsonl"
data = read_jsonfile(filepath_laura)
batch3_l = json_to_df(data, classif)
batch3_l['checked'] = 0

# targeted, domain knowledge based sampling
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fifthTry\targeted_samples.jsonl"
data = read_jsonfile(filepath)
batch5 = json_to_df(data, classif)
batch5['checked'] = 0

# jhoe
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\jhoe2\annotated_instances_jhoe.jsonl"
data = read_jsonfile(filepath)
batch2_j = json_to_df(data, classif)
batch2_j['checked'] = 0

## Check Brennan Prediger for batch 1

In [69]:
#dont forget to map column names

batch1 = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\corrected_dfx.csv", sep = ";")


filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\annotated_instances_marlies.jsonl"
data = read_jsonfile(filepath)
m = json_to_df(data, classif)

In [71]:
m_id = m.id.values

overlap_s = batch1.loc[batch1.id.isin(m_id)]
overlap_m = m.loc[m.id.isin(overlap_s.id.values)]
cols = ['PHENOTYPE', 'Phen_data', 'MORPH', 'quant_morph', 'interbr_morph',
       'qual_morph', 'color_pattern', 'shape', 'texture', 'ultrastruct',
       'BEHAV', 'acoustic', 'feeding', 'mating', 'ECOLOGY', 'Phen_proc',
       'IMAGING', 'SAMPLING', 'STORAGE', 'Phen_analysis', 'phen_nonphylo',
       'phen_pylo', 'GENOTYPE', 'Gen_data', 'nuclear', 'organellar',
       'transcriptomic', 'proteomic', 'tandem_repeats', 'whole_genomes',
       'exomes', 'genome_wide', 'epigenetic', 'eDNA', 'BIOCHEM', 'chemotax',
       'cytotax', 'Gen_proc',  'gen1', 'gen2', 'gen3',
       'genproc_other', 'Gen_analysis', 'GEN_NON_PHYLO', 'distance',
       'haplowebs', 'fixed_alt_states', 'clustering', 'gen_interbr',
       'PHYLO_SD', 'PHYLO_TREE', 'distance_based', 'character_based',
       'consensus_supertree', 'MACHINE_LEARNING', 'Singletons',
       'interbreeding', 'rank_just', 'phylogenetic', 'specimen_storage_loc',
       'sampling_loc', 'abbrev_terms', 'nomenclat_history', 'biogeo', 'id']
      
overlap_s = overlap_s[cols].set_index('id').sort_index()
overlap_m = overlap_m[cols].set_index('id').sort_index()

In [95]:
A = overlap_s.values
B = overlap_m.values

# 1) Compute p_o
matches = np.sum(A == B)
total_cells = A.size  # or A.shape[0] * A.shape[1]
p_o = matches / total_cells

# 2) For 2 categories, p_c = 1/2
#    So Brennan–Prediger’s kappa = 2 * p_o - 1
kappa_bp = 2 * p_o - 1

print("Observed agreement (p_o) =", p_o)
print("Brennan–Prediger’s Kappa =", kappa_bp)
print(len(A))

Observed agreement (p_o) = 0.8828125
Brennan–Prediger’s Kappa = 0.765625
100


In [96]:
# Marlies batch 2 (300)
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\annotated_instances_marlies2_stijn.jsonl"
filepath2 = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\annotated_instances_marlies2_marlies.jsonl"
data = read_jsonfile(filepath)
batch2_m = json_to_df(data, classif).set_index('id')
# batch2_m['checked'] = 0
data2 = read_jsonfile(filepath2)
batch2_m2 = json_to_df(data2, classif).set_index('id')

m_id = batch2_m2.index
s = batch2_m.loc[batch2_m.index.isin(m_id)]
m = batch2_m2.loc[batch2_m2.index.isin(s.index)]
cols = [i for i in s.columns if i != 'displayed_text']
s = s[cols].sort_index()
m = m[cols].sort_index()


A =s.values
B = m.values

# 1) Compute p_o
matches = np.sum(A == B)
total_cells = A.size  # or A.shape[0] * A.shape[1]
p_o = matches / total_cells

# 2) For 2 categories, p_c = 1/2
#    So Brennan–Prediger’s kappa = 2 * p_o - 1
kappa_bp = 2 * p_o - 1

print("Observed agreement (p_o) =", p_o)
print("Brennan–Prediger’s Kappa =", kappa_bp)
print(len(A))

Observed agreement (p_o) = 0.9714285714285714
Brennan–Prediger’s Kappa = 0.9428571428571428
98


## map diffferences

We made small changes to the classification (i.e. drop categories) after initial annotations. As a part of that, we changed the names in the annotation interface. We map these here.

In [70]:
# map classification 1 and 2, and straighten differences

mapping_dct = {
    'Phenotype': 'PHENOTYPE',
    'Phen datatypes': 'Phen_data',
    'MORPH': 'MORPH',
    'quant morph': 'quant_morph',
    'qual morph': 'qual_morph',
    'color_pattern': 'color_pattern',
    'Shape': 'shape',
    'Texture': 'texture',
    'Ultrastructural': 'ultrastruct',
    'Interbreeding_morph': 'interbr_morph',
    'BEHAV': 'BEHAV',
    'Acoustic data': 'acoustic',
    'feeding': 'feeding',
    'Mating behavior': 'mating',
    'ECOLOGY': 'ECOLOGY',
    'Phen processing': 'Phen_proc',
    'IMAGING': 'IMAGING',
    'SAMPLING': 'SAMPLING',
    'STORAGE': 'STORAGE',
    'Phen analysis': 'Phen_analysis',
    'phen_regression': 'phen_nonphylo',
    'Phen_pylo': 'phen_pylo',
    'Genotype': 'GENOTYPE',
    'genot datatypes': 'Gen_data',
    'Nuclear DNA': 'nuclear',
    'Organellar DNA': 'organellar',
    'Transcriptomic data': 'transcriptomic',
    'Proteomic data': 'proteomic',
    'Microsatellites': 'tandem_repeats',
    'Whole genomes': 'whole_genomes',
    'Exomes': 'exomes',
    'Genome-wide studies/SNPs': 'genome_wide',
    'Epigenetic data': 'epigenetic',
    'eDNA': 'eDNA',
    'BIOCHEM': 'BIOCHEM',
    'Chemotax': 'chemotax',
    'Cytotax': 'cytotax',
    'gen processing': 'Gen_proc',
    'SEQUENCING':'SEQUENCING',
    'gen1': 'gen1',
    'gen2': 'gen2',
    'gen3': 'gen3',
    'other': 'genproc_other',
    'gen analysis': 'Gen_analysis',
    'GEN_NON_PHYLO': 'GEN_NON_PHYLO',
    'Distance': 'distance',
    'haplowebs': 'haplowebs',
    'Fixed alt character states': 'fixed_alt_states',
    'Clustering': 'clustering',
    'fen_Interbreeding': 'gen_interbr',
    'PHYLO_SD': 'PHYLO_SD',
    'PHYLO_TREE': 'PHYLO_TREE',
    'Distance_based': 'distance_based',
    'Character_based': 'character_based',
    'Consensus_supertree': 'consensus_supertree',
    
    'ML_methods': 'MACHINE_LEARNING',
    'Singletons': 'Singletons',
    'Interbreeding': 'interbreeding',
    'spec justification': 'rank_just',
    'Phylogenetic': 'phylogenetic',
    'Specimen storage location': 'specimen_storage_loc',
    'sampling location': 'sampling_loc',
    'abbreviations & terms': 'abbrev_terms',
    'nomenclature & history': 'nomenclat_history',
    'BIOGEO': 'biogeo',
    'id': 'id',
    'displayed_text': 'displayed_text',
    'checked': 'checked'
}

batch1 = batch1.rename(columns = mapping_dct)
batch1 = batch1.loc[:, ~batch1.columns.duplicated()]

In [6]:
batch5.to_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fifthTry\batch5.csv", sep = ";")

In [7]:
batch2_j.to_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\jhoe2\batch2_j.csv", sep = ";")

## make 1 file out of all the separate annotations

In [6]:
# concatenate them

batch1 =  pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\batch1_FullFinal.csv", sep = ";")
batch2_m = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\marlies_batch2_checked.csv", sep = ";")
batch2_l = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\laura2\corrected_df_laura2_DONE.csv", sep = ";")
batch3_l = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\thirdTry\laura\batch3_l.csv", sep = ";")
batch2_s = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\stijn2\stijn2.csv", sep = ";")
batch2_j = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\jhoe2\batch2_j.csv", sep = ";")
batch4_1 = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fourthTry\batch4.csv", sep = ";")
batch4_2 = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fourthTry\batch4_2.csv", sep = ";")
batch5 = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fifthTry\batch5.csv", sep = ";")


batch2_l['checked'] = 1

batch1['batch'] = 'batch1'
batch2_m['batch'] = 'batch2_m'
batch2_s['batch'] = 'batch2_s'
batch2_l['batch'] = 'batch2_l'
batch2_j['batch'] = 'batch2_j'
batch3_l['batch'] = 'batch3_l'
batch4_1['batch'] = "batch4_1"
batch4_2['batch'] = "batch4_2"
batch5['batch'] = "batch5"

dfs = [batch1, batch2_m, batch2_s, batch2_l, batch2_j, batch3_l, batch4_1, batch4_2,batch5]

# for df in dfs:
#     df = df.astype('object')  # Specify type conversion
for idx, i in enumerate(dfs):
    i.columns = [j.lower() for j in i.columns]
    try:
        dfs[idx] = i.drop(columns = ['unnamed: 0', 'terms_abbrev'], errors = 'ignore')
    except KeyError as e:
        print('column not there')
        
combined_df = pd.concat(dfs, axis=0, ignore_index=True)    

In [8]:
# mistakes that came up in diagnosing gpt


combined_df.loc[combined_df.ecology == 1].iloc[0]['ecology'] = 0

mistakes_dct = {
    "./Corpus/Zootaxa/2/zootaxa_2496_1_3.json_0": ["rank_just", 1],
    "./Corpus/Zootaxa/4/6/zootaxa_4619_2_6.json_0": ["rank_just", 1],
    "./Corpus/Zootaxa/4/7/zootaxa_4751_2_11.json_1": ["rank_just", 1],
    "./Corpus/Zootaxa/4/6/zootaxa_4651_2_6.json_0": ["rank_just", 1],
    "./Corpus/Zootaxa/4/7/zootaxa_4731_4_1.json_2": ["rank_just", 1],
    "./Corpus/Zootaxa/3/6/zootaxa_3681_4_1.json_3": ["rank_just", 1],
    "./Corpus/Zootaxa/3/1/zootaxa_3162_1_3.json_1": ["rank_just", 1],
    "./Corpus/Pensoft/journal_of_hymenoptera_research-25-035.json_0": ["rank_just", 1],
    "./Corpus/Pensoft/zookeys-835-087.json_0": ["rank_just", 1],
    "./Corpus/Zootaxa/4/2/zootaxa_4263_1_3.json_0": ["rank_just", 1],
    "./Corpus/Pensoft/phytokeys-47-059.json_0": ["biogeo", 1],
    "./Corpus/Zootaxa/4/0/zootaxa_4061_4_1.json_1": ["biogeo", 1],
    "./Corpus/Zootaxa/4/5/zootaxa_4554_2_8.json_0": ["biogeo", 1],
    "./Corpus/Zootaxa/4/1/zootaxa_4132_2_2.json_1": ["biogeo", 0],
    "./Corpus/Zootaxa/4/4/zootaxa_4433_1_4.json_4": ["biogeo", 0],
    "./Corpus/Zootaxa/1/zootaxa_1278_1_1.json_1": ["biogeo", 0],
    "./Corpus/EJT/10_5852_ejt_2012_13.json_21": ["biogeo", 0],
    "./Corpus/Pensoft/phytokeys-47-059.json_0": ["sampling", 0],
    "./Corpus/Zootaxa/4/8/zootaxa_4809_3_2.json_1": ["color_pattern", 0],
    "./Corpus/Pensoft/zookeys-913-089.json_0": ["color_pattern", 1],
    "./Corpus/Pensoft/zookeys-315-055.json_0": ["sequencing", 1],
    "./Corpus/Zootaxa/4/3/zootaxa_4303_4_8.json_1": ["sequencing", 1],
    "./Corpus/Zootaxa/4/3/zootaxa_4312_3_3.json_0": ["sequencing", 1],
    "./Corpus/Zootaxa/4/3/zootaxa_4374_4_5.json_2": ["sequencing", 1],
    "./Corpus/Zootaxa/4/7/zootaxa_4751_2_11.json_1": ["sequencing", 1],
    "./Corpus/Zootaxa/4/5/zootaxa_4543_2_8.json_0": ["sequencing", 1],
    "./Corpus/Zootaxa/4/6/zootaxa_4656_3_11.json_1": ["sequencing", 1],
    "./Corpus/Zootaxa/4/7/zootaxa_4766_3_2.json_2": ["sequencing", 1],
    "./Corpus/Zootaxa/4/1/zootaxa_4147_4_9.json_0": ["phen_nonphylo", 0],
    "./Corpus/Zootaxa/4/8/zootaxa_4822_4_4.json_1": ["phen_nonphylo", 0],
    "./Corpus/Zootaxa/4/9/zootaxa_4949_3_1.json_6": ["phen_nonphylo", 0],
}

for k, v in mistakes_dct.items():
    combined_df.loc[(combined_df.id == k), v[0]] = v[1]

- all phylogenetic methods should have 'phylogenetic'
- all haplotype should have 'genetic distance'
- all interbr_morph should be interbeeding


In [9]:
# have to complete labels

combined_df.loc[(combined_df.interbr_morph == 1) | (combined_df.mating == 1) | (combined_df.gen_interbr == 1), 'interbreeding'] = 1
combined_df.loc[(combined_df.haplowebs == 1) | (combined_df.distance_based == 1), 'distance' ] = 1
combined_df.loc[(combined_df.phen_pylo == 1) | (combined_df.phylo_sd == 1) | (combined_df.phylo_tree == 1), 'phylogenetic'] = 1

In [10]:
# recursive updating using the hierarchy

def lowercase_nested(data):
    """Recursively converts all strings in a nested structure (dict, list) to lowercase."""
    if isinstance(data, dict):
        # Recursively handle dictionaries
        return {k.lower(): lowercase_nested(v) if isinstance(k, str) else k for k, v in data.items()}
    elif isinstance(data, list):
        # Recursively handle lists
        return [lowercase_nested(item) for item in data]
    elif isinstance(data, str):
        # Convert strings to lowercase
        return data.lower()
    else:
        # Leave other data types unchanged
        return data


lc_classif = lowercase_nested(classif)
recursive_update(combined_df, lc_classif).head()

Unnamed: 0,phenotype,phen_data,morph,quant_morph,qual_morph,color_pattern,shape,texture,ultrastruct,interbr_morph,...,specimen_storage_loc,sampling_loc,abbrev_terms,nomenclat_history,biogeo,id,displayed_text,checked,sequencing,batch
0,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,./Corpus/EJT/10_5852_ejt_2021_735_1243.json_0,The material examined was collected in fragmen...,0,0,batch1
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,./Corpus/Zootaxa/1/zootaxa_1920_1_5.json_0,Invertebrate samples\nwere collected using a h...,0,0,batch1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Zootaxa/4/7/zootaxa_4729_2_8.json_0,The nymphs were collected in the stream by han...,0,0,batch1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Pensoft/phytokeys-47-059.json_0,We verified both the endemic status and the di...,0,0,batch1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Pensoft/zookeys-315-055.json_0,"During each cruise, specimens were sorted onbo...",1,1,batch1


In [12]:
# mistakes selected by comparing regression and gpt, corrected
# see TM_IdentifyAnnotationMistakes

corrections = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\data\overlapping_mistakes_correctedLauraStijn_csv_TEST_SET.csv", sep = ";")

# check how many mistakes were due to annotator mistakes
corrections['annotation_mistake'] = np.where(corrections['correct'] == corrections['predicted'], 1, 0)
print(corrections['annotation_mistake'].value_counts() / len(corrections))


corrections.head()

annotation_mistake
1    0.702469
0    0.297531
Name: count, dtype: float64


Unnamed: 0,id,ground truth,predicted,correct,label,old_label,text,annotation_mistake
0,./Corpus/EJT/10_5852_ejt_2016_194.json_1,0,1,0,genetic and molecular features,genotype,European Journal of Taxonomy 194: 1â€“16 (2016...,0
1,./Corpus/EJT/10_5852_ejt_2016_194.json_1,0,1,0,phylogenetic tree reconstruction methods,phylo_tree,European Journal of Taxonomy 194: 1â€“16 (2016...,0
2,./Corpus/EJT/10_5852_ejt_2016_194.json_1,0,1,0,analysis of molecular data,gen_analysis,European Journal of Taxonomy 194: 1â€“16 (2016...,0
3,./Corpus/EJT/10_5852_ejt_2017_271.json_1,0,1,1,sequencing,sequencing,"The BCM criterion is similar to BM, but the qu...",1
4,./Corpus/EJT/10_5852_ejt_2017_271.json_1,1,0,1,biogeographical methods,biogeo,"The BCM criterion is similar to BM, but the qu...",0


In [13]:
# now fix the mistakes in combined

for _, row in corrections.iterrows():
    _id        = row['id']
    col_to_fix = row['old_label']
    new_value  = row['correct']
    combined_df.loc[combined_df['id'] == _id, col_to_fix] = new_value

In [14]:
len(combined_df)

1845

In [15]:
# save the file
# this is the data we use for training the models

combined_df.to_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\testData13052025_after_corrections.csv")
combined_df = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\testData13052025_after_corrections.csv", index_col = 'Unnamed: 0')

# Data from active learning

Get training sampls for sparse categories

In [12]:
desired = ['PHENOTYPE',
 'Phen_data',
 'MORPH',
 'biogeo',
 'color_pattern',
 'Phen_proc',
 'IMAGING',
 'quant_morph',
 'STORAGE',
 'SAMPLING',
 'GENOTYPE',
 'interbr_morph',
 'Gen_data',
 'Gen_analysis',
 'Gen_proc',
 'SEQUENCING',
 'gen1',
 'organellar',
 'PHYLO_TREE',
 'Phen_analysis',
 'character_based',
 'phen_nonphylo',
 'GEN_NON_PHYLO',
 'ECOLOGY',
 'distance',
 'nuclear',
 'BEHAV',
 'phylogenetic',
 'rank_just',
 'phen_pylo',
 'distance_based',
 'acoustic',
 'PHYLO_SD',
 'interbreeding',
 ]

desired = [i.lower() for i in desired]

In [13]:
len(desired)

34

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# original data
df = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\testData20012025.csv")
df = df.drop(columns = 'Unnamed: 0')
df.head(2)


# preprocess

# Extract features and labels
X = df['displayed_text']  # Features (text field)
y = df[desired]# df.iloc[:, :-4]  # Assuming last 3 columns are metadata, adjust as needed

# Remove columns with only one class (all 0s or all 1s)
non_constant_columns = [col for col in y.columns if y[col].nunique() > 1]
y = y[non_constant_columns]

# vectorize text
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_vectorized = vectorizer.fit_transform(X)


# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=2808)

In [50]:
mask = combined_df[desired].sum() < 58
combined_df[desired].sum()[mask]

rank_just         47
phen_pylo         42
distance_based    27
acoustic          29
phylo_sd          23
dtype: int64

In [51]:
combined_df[desired].sum()[mask].index

Index(['rank_just', 'phen_pylo', 'distance_based', 'acoustic', 'phylo_sd'], dtype='object')

In [52]:
sparse = combined_df[desired].sum()[mask].index
sparse

Index(['rank_just', 'phen_pylo', 'distance_based', 'acoustic', 'phylo_sd'], dtype='object')

In [53]:
import os

# load a classifier
import joblib

base_filepath = r"C:\Users\conix\Documents\Corpus\classifier_models"

# Load the vectorizer
vectorizer_filepath = os.path.join(base_filepath, "vectorizer.pkl")
vectorizer = joblib.load(vectorizer_filepath)

# Load each model dynamically based on label names
loaded_models = {}
for label in y.columns:  # Ensure this matches the original label set
    model_filepath = os.path.join(base_filepath, f"model_{label}.pkl")
    loaded_models[label] = joblib.load(model_filepath)

In [54]:
new_samples_dct = {key:value for key,value in methods_paras.items() if key not in combined_df.id.values}
new_models_dict = {key:value for key,value in loaded_models.items() if key in sparse}

In [55]:
# New samples
import random

new_sample_keys = random.sample(list(new_samples_dct.keys()), 5000)
new_samples = [new_samples_dct[i] for i in new_sample_keys]


# Preprocess using the vectorizer
new_samples_vectorized = vectorizer.transform(new_samples)
predictions = {}
probabilities = {}

for label, model in new_models_dict.items():
    predictions[label] = model.predict(new_samples_vectorized)
    probabilities[label] = model.predict_proba(new_samples_vectorized) 

# Convert probabilities to a DataFrame
import pandas as pd

probabilities_df = pd.DataFrame({label: prob[:, 1] for label, prob in probabilities.items()})

In [56]:
# check where the model is most uncertain by looking at the difference in pron between the two classes
# smaller is more uncertain

uncertainty = probabilities_df.apply(lambda x: abs(x - 0.5), axis=1)
most_uncertain_indices = uncertainty.sum(axis=1).nsmallest(n=10).index

In [65]:
sample_indices = {}
for i in uncertainty.columns:

    #sample the most informative for each
    n = 8
    # get indices of n most uncertain samples
    indices = uncertainty.loc[:,i].sort_values()[:n].index.values
    sample_indices[i] = indices

indices = list(set([item for sublist in sample_indices.values() for item in sublist]))

for i in [(0.25,4), (0.17,3), (0.12,2), (0.1,2)]:
    extra = list(uncertainty[(uncertainty <i[0]).sum(axis=1) >= i[1]].index.values)
    indices.extend(extra)  

indices = list(set(indices))
print(len(indices))

52


In [66]:
# make a dataset to annotate

samples_to_annotate = {new_sample_keys[i]:new_samples[i] for i in indices}


output_file_path = r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fourthTry\active_learning_samples2.json'


# Write the random key-value pairs to the output file
with open(output_file_path, 'w') as f:
    for key, text in samples_to_annotate.items():
        entry = {
            "id": key,
            "text": text,
            "annotations": []
        }
        f.write(json.dumps(entry) + '\n')

# Data from domain knowledge searches (for diversity)

Sparse categories: phylogenetic species delimitation (25),  distance based tree methods (33), acoustic data (38),  phylogenetic analysis methods for phenotypic data (54), rank_justification (56), behav (mating, feeding), nuclear genes (78), genetic distance (96).


Categories that the classifier doesn't perform well for: 

- phen_nonphylo: bad recall because somewhat diverse -- it gets the common things (like PCA), but misses rarer things. This is a very heterogeneous category after all.
- ecology: extremely heterogeneous, hard to solve by extra sampling
- nuclear: **heterogeneous, but probably not so much, and sample is small. SO SAMPLE MORE.**
- behav: heterogeneous because all behaviour apart from acoustic is rare. Sample some more feeding and mating because sample is low. SO SAMPLE MORE.
- rank_just: very heterogeneous and very low sample. Try to sample more, but likely not very effective.
- phen_pylo: **low sample, but terms often repeat so try SAMPLING MORE.**
- distance_based: **sample is small but terms repeat so SAMPLE MORE.**
- acoustic: **sample is small but terms repeat so SAMPLE MORE.**
- phylo_sd: **sample is small but terms repeat so SAMPLE MORE.**
- biogeo: very diverse, hard to solve by extra sampling.
- color_pattern: very diverse, hard to solve by extra sampling.
- sampling: very diverse, hard to solve by extra sampling.
- phen_analysis: **sample is small but terms repeat so SAMPLE MORE.**

We try targeted sampling for those, to improve the classifier. We don't use the active learning approach because we are scared that this will narrow the sample too much.

## search terms

In [4]:
# search terms

search_dct = {
    "phylo_sd": [            
        r"\bGMYC\b",                             
        r"\bGeneralized Mixed Yule Coalescent\b",
        r"\bPTP\b",                              
        r"\bPoisson Tree Processes\b",   
        r"\bBPP\b",                             
        r"\bBayesian Phylogenetics and Phylogeography\b",
        r"\bcoalescent-based delimitation\b",                            
        r"\bspecies tree(?:s)\?b",             
        r"\bmultispecies coalescent\b",              
    ],
    "distance_based": [
        r"\bdistance-based (?:phylogenetic )?tree inference\b", 
        r"\bdistance-based tree\b",                      
        r"\bneighbor-joining\b",                       
        r"\bUPGMA\b",                                   
        r"\bgenetic distance tree\b",                     
        r"\bFitch-Margoliash\b",   
        r"\bNJ\b",             
    ],
    "acoustic": [
        r"\bacoustic\b",
        r"\bcall\b",
        r"\bvocal(?:ization|ize|s|izing)?\b",
        r"\bmicrophone\b",
        r"\boscillogram(?:s)?\b",
        r"\bspectral analys(?:is|es)?\b",
        r"\bnote duration(?:s)?\b",
        r"\bspectrogram(?:s)?\b",
        r"\bfrequency analys(?:is|es)?\b",
        r"\btimbre\b",
        r"\bpitch\b",
        r"\bamplitude\b",
        r"\bwavelength\b",
        r"\bdB(?: SPL)?\b",
        r"\bsignal-to-noise ratio\b",
        r"\bacoustic spectrum\b",
        r"\brecorder(?:s)?\b",
        r"\bsonograph(?:y|s)?\b",
        r"\brhythmic\b",
        r"\bchirp(?:s)?\b",
        r"\btrill(?:s)?\b",
        r"\bwhistle(?:s)?\b",
        r"\bpulse(?:s)?\b",
        r"\bclick(?:s)?\b",
        r"\becolocation\b",
        r"\bsonar\b"
    ],
    "phen_pylo": [
        r"\bmatrix\b",                
        r"\bcladistic(?:s)?\b",       
        r"\bhomoplasy\b",             
        r"\bautapomorph(?:y|ies)\b",  
        r"\bplesiomorph(?:y|ies)\b",  
        r"\bbremer\b",
        r"\bretention\b",
        r"\bmultistate\b",
    ],
    "rank_just": [
        r"\bspecies concept\b",
        r"\belevated to\b",
        r"\new ranked\b",
        r"\bcriteria\b",
        r"\bsufficiently different\b",
        r"\bspecies status\b",
        r"\bshould be recognized as\b",
        r"\bshould be recognised as\b",
        r"\bphylogenetically distinct\b",
        r"\bcombined evidence\b",
        r"\bshould be ranked as\b",
    
    ],
    "nuclear": [
        r"\bnuclear\b",             
        r"\b18s\b",         
        r"\b28s\b",         
        r"\brag1\b",                      
        r"\brag2\b",                     
        r"\bits(?:[ _]region)?\b",       
        r"\bef1(?:-alpha)?\b",            
        r"\bh3\b",                        
        r"\bh4\b",                       
        r"\btub(?:ulin)?\b",             
        r"\bgapdh\b",                     
        r"\brpb1\b",                      
        r"\brpb2\b",                      
        r"\brna polymerase\b",            
    ],
    "phen_nonphylo": [
        r"\blandmark\b",
        r"\bregression\b",
        r"\bANOVA\b", 
        r"\bMANOVA\b",  
        r"\bANCOVA\b",  
        r"\bt-test\b",
        r"\bpaired t-test\b",
        r"\bwhitney\b",
        r"\bwilcoxon\b",
        r"\bchi-square\b",
        r"\bKruskal-Wallis\b",
        r"\bSpearman(?:'s)?\b",
        r"\bPearson(?:'s)?\b",
        r"\bmorphometric analys(?:is|es)\b",
        r"\bgeometric morphometrics\b",
        r"\bmorphospace\b"
    ],
    "behav": [
        r"\bmating\b",
        r"\bcourtship\b",
        r"\bbreeding\b",
        r"\bseasonal breeding\b",
        r"\bmonogam(?:y|ous)\b",
        r"\bmate guarding\b",
        r"\bfeeding\b",
        r"\bdiet\b",
        r"\bforaging\b",
        r"\bprey\b",
    ]
}

selection_dct = {'phylo_sd': 30, 'distance_based': 15, 'acoustic': 15, 'phen_pylo': 30, 'rank_just': 30, 'nuclear': 15, 'phen_nonphylo': 30, 'behav':20}

## search

In [2]:
import random
import re

def select_matching_texts(search_dct, text_dct, selection_dct):
    """
    Selects a specified number of randomly chosen text pieces per category from text_dct
    based on regex patterns in search_dct.

    Args:
    - search_dct (dict): Dictionary mapping categories to lists of regex patterns.
    - text_dct (dict): Dictionary mapping unique IDs to text pieces.
    - selection_dct (dict): Dictionary mapping categories to the number of texts to select.

    Returns:
    - dict: A dictionary with unique text IDs as keys and text pieces as values.
    """
    matched_texts = {key: [] for key in search_dct}  # Store matches per category
    
    # Iterate through text pieces and categorize them based on regex matches
    for text_id, text in text_dct.items():
        for category, patterns in search_dct.items():
            if any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns):
                matched_texts[category].append((text_id, text))

    # Select the requested number of random texts per category
    selected_texts = {}
    total_selected = 0

    for category, num_to_select in selection_dct.items():
        if category in matched_texts:
            available_texts = matched_texts[category]
            random.shuffle(available_texts)  # Shuffle for randomness
            
            # Select the required number of texts (or all if fewer than requested)
            selected_pairs = available_texts[:num_to_select]
            
            # Store in final dictionary
            for text_id, text in selected_pairs:
                selected_texts[text_id] = text
                total_selected += 1

    return selected_texts

In [10]:
new_samples_dct = {key:value for key,value in methods_paras.items() if key not in combined_df.id.values}
texts = select_matching_texts(search_dct, new_samples_dct, selection_dct)

In [13]:
output_file_path = r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fourthTry\diversity_sampling1.json'


# Write the random key-value pairs to the output file
with open(output_file_path, 'w') as f:
    for key, text in texts.items():
        entry = {
            "id": key,
            "text": text,
            "annotations": []
        }
        f.write(json.dumps(entry) + '\n')