In [1]:
import json
import pickle

import numpy as np
import pandas as pd

# Paragraphs

In [2]:
# Load the previously pickled methods corpus from disk so the steps that
# produced these files do not have to be run again.
# NOTE(review): pickle.load can execute arbitrary code while loading; only open
# pickles you created yourself. The absolute Windows paths tie this cell to one machine.

# Methods paragraphs. `.items()` is called on this object further down, so it is
# dict-like (presumably paragraph id -> paragraph text -- TODO confirm).
filename = r"C:\Users\conix\Documents\Corpus\methods_corpus\methods_paragraph_corpus.pickle"
with open(filename, "rb") as f:
    methods_paras = pickle.load(f)


# Methods sentences (raw text); only its length is used in this cell.
filename = r"C:\Users\conix\Documents\Corpus\methods_sentencesrawtext.pickle"
with open(filename, "rb") as f:
    methods_sentences = pickle.load(f)

# Report the size of both collections as a quick sanity check.
print(f'total number of methods paragraphs: {len(methods_paras)}')
print(f'total number of methods sentences: {len(methods_sentences)}')

total number of methods paragraphs: 44853
total number of methods sentences: 641892


# Convenience functions

In [3]:
# Function to create a list of the columns from the dictionary with the classification


def extract_strings(d):
    """Flatten a nested classification into a list of category names.

    Dict keys are emitted before the names found in their values, list items
    are visited in order, and a bare string yields itself. Any other type
    contributes nothing.
    """
    if isinstance(d, str):
        return [d]
    if isinstance(d, list):
        return [name for item in d for name in extract_strings(item)]
    if isinstance(d, dict):
        collected = []
        for parent, children in d.items():
            collected.append(parent)
            collected += extract_strings(children)
        return collected
    return []

# read json file with the annotations


def read_jsonfile(filepath):
    """Read a JSON Lines file and return the decoded records as a list.

    Parameters
    ----------
    filepath : str or path-like
        Location of a UTF-8 encoded file with one JSON document per line.

    Returns
    -------
    list
        One decoded object per valid line, in file order. The list is empty
        when the file does not exist (a message is printed instead of raising).

    Blank or whitespace-only lines (e.g. a trailing newline at the end of the
    export) are skipped silently; lines that contain text but are not valid
    JSON are reported with a printed message and skipped.
    """
    data = []
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            for line in file:
                # An empty line is not a malformed record, so do not report it.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Error decoding line: {line}")
    except FileNotFoundError:
        print(f"The file {filepath} was not found.")
    return data

# Function to get the labels from the jsonl


def extract_terms_after_colons(data):
    """Collect the label names stored in a nested annotation dict.

    Every string value containing ':::' contributes the text after its last
    ':::' separator. Nested dicts are searched depth-first in insertion order;
    values of any other type are ignored.
    """
    def iter_terms(node):
        for value in node.values():
            if isinstance(value, dict):
                # descend into nested annotation groups
                yield from iter_terms(value)
            elif isinstance(value, str) and ':::' in value:
                yield value.split(':::')[-1]

    return list(iter_terms(data))

# Function to update the df for higher categories (in place)


def recursive_update(df, classif):
    """Propagate child labels up the classification hierarchy, in place.

    For every key in `classif` whose value is a list, nested sub-hierarchies
    are resolved first; afterwards the key's column is set to 1 on each row
    where at least one direct child column equals 1. Keys whose value is not a
    list are left alone.

    Returns the same DataFrame object that was passed in, so calls can be chained.
    """
    for parent, children in classif.items():
        if not isinstance(children, list):
            continue

        # Bottom-up: nested groups must be filled in before this level reads them.
        for child in children:
            if isinstance(child, dict):
                recursive_update(df, child)

        # A child is either a leaf name or a single-key dict naming a sub-group.
        child_columns = []
        for child in children:
            if isinstance(child, str):
                child_columns.append(child)
            else:
                child_columns.append(list(child.keys())[0])

        has_child_label = (df[child_columns] == 1).any(axis=1)
        df.loc[has_child_label, parent] = 1

    return df

# Function to turn the jsonl into a complete df


def json_to_df(data, classif):
    """Build a one-row-per-sample indicator DataFrame from annotation records.

    Each record must provide 'label_annotations', 'displayed_text' and 'id'.
    The result has one 0/1 column per name in the classification, followed by
    'id' and 'displayed_text'. Parent categories are switched on wherever one
    of their children is set (via recursive_update).
    """
    label_columns = extract_strings(classif)
    column_order = label_columns + ["id", "displayed_text"]

    records = []
    for entry in data:
        assigned = extract_terms_after_colons(entry['label_annotations'])

        # 1 when the annotator selected the category, 0 otherwise
        record = {name: (1 if name in assigned else 0) for name in label_columns}
        record["displayed_text"] = entry["displayed_text"]
        record['id'] = entry['id']
        records.append(record)

    frame = pd.DataFrame(records, columns=column_order)

    # recursive_update works in place and hands back the same frame
    return recursive_update(frame, classif)

# Our classification

## Old

In [4]:
# Terms and classification used for the first batch of annotations.
# The updated terms are defined in `classif` further down.
# Structure: each key is a category; its list holds leaf names (strings) and
# single-key dicts for nested sub-categories.
classif_old = {
    "Phenotype": [
        {
            "Phen datatypes": [
                {
                    "MORPH": [
                        "quant morph",
                        "Interbreeding_morph",
                        {"qual morph": ["color_pattern", "Shape", "Texture", "Ultrastructural"]},
                    ]
                },
                {"BEHAV": ["Acoustic data", "feeding", "Mating behavior"]},
                "ECOLOGY",
            ]
        },
        {"Phen processing": ["IMAGING", "SAMPLING", "STORAGE"]},
        {"Phen analysis": ["phen_regression", "phen_pylo"]},
    ],
    "Genotype": [
        {
            "genot datatypes": [
                "Nuclear DNA",
                "Organellar DNA",
                "Transcriptomic data",
                "Proteomic data",
                "Microsatellites",
                "Whole genomes",
                "Exomes",
                "Genome-wide studies/SNPs",
                "Epigenetic data",
                "eDNA",
                {"BIOCHEM": ["Chemotax", "Cytotax"]},
            ]
        },
        {
            "gen processing": [
                {"SEQUENCING": ["gen1", "gen2", "gen3"]},
                "other",
            ]
        },
        {
            "gen analysis": [
                {
                    "GEN_NON_PHYLO": [
                        "Distance",
                        "haplowebs",
                        "Fixed alt character states",
                        "Clustering",
                        "fen_Interbreeding",
                    ]
                },
                "PHYLO_SD",
                {"PHYLO_TREE": ["Distance_based", "Character_based", "Consensus_supertree"]},
                "Other",
                "ML_methods",
            ]
        },
    ],
    "Singletons": [
        "Interbreeding",
        "spec justification",
        "Phylogenetic",
        "Specimen storage location",
        "sampling location",
        "abbreviations & terms",
        "nomenclature & history",
        "BIOGEO",
    ],
}

## New

In [5]:
# Current classification. Each key is a category; its list holds leaf names
# (strings) and single-key dicts for nested sub-categories.
classif = {
    "PHENOTYPE": [
        {
            "Phen_data": [
                {
                    "MORPH": [
                        "quant_morph",
                        "interbr_morph",
                        {"qual_morph": ["color_pattern", "shape", "texture", "ultrastruct"]},
                    ]
                },
                {"BEHAV": ["acoustic", "feeding", "mating"]},
                "ECOLOGY",
            ]
        },
        {"Phen_proc": ["IMAGING", "SAMPLING", "STORAGE"]},
        {"Phen_analysis": ["phen_nonphylo", "phen_pylo"]},
    ],
    "GENOTYPE": [
        {
            "Gen_data": [
                "nuclear",
                "organellar",
                "transcriptomic",
                "proteomic",
                "tandem_repeats",
                "whole_genomes",
                "exomes",
                "genome_wide",
                "epigenetic",
                "eDNA",
                {"BIOCHEM": ["chemotax", "cytotax"]},
            ]
        },
        {
            "Gen_proc": [
                {"SEQUENCING": ["gen1", "gen2", "gen3"]},
                "genproc_other",
            ]
        },
        {
            "Gen_analysis": [
                {
                    "GEN_NON_PHYLO": [
                        "distance",
                        "haplowebs",
                        "fixed_alt_states",
                        "clustering",
                        "gen_interbr",
                    ]
                },
                "PHYLO_SD",
                {"PHYLO_TREE": ["distance_based", "character_based", "consensus_supertree"]},
                "MACHINE_LEARNING",
            ]
        },
    ],
    "Singletons": [
        "interbreeding",
        "rank_just",
        "phylogenetic",
        "specimen_storage_loc",
        "sampling_loc",
        "abbrev_terms",
        "nomenclat_history",
        "biogeo",
    ],
}

# Annotate Data

We annotate in potato, using the immigration framing template with an adapted config file (implementing our classification)

To Do:

- check whether I did not misinterpret wing venation as color and pattern
- check ultrastructural data: SEM is not enough
- check habitus: is not enough for qualitative morph

What do we have now (17/01)

- 403 annotated by stijn (corrected_dfx), of which 100 also rated by Marlies and compared with Stijn's (checked_samples)
- 300 annotated by Stijn (annotated_instances_marlies2_stijn.jsonl), of which 100 were also annotated by Marlies (annotated_instances_marlies2_marlies). Note: the 7 samples without any categories still need to be added somehow (samples with no categories are missing from the output file). The comparison of the 100 shared ones is done (corrected_df_marlies2_DONE), and the entire batch 2 for Marlies can be found in marlies_batch2_checked.
- 300 annotated instances by Laura and Stijn (annotated_instances_laura2_stijn.jsonl and annotated_instances_laura2_laura.jsonl). Samples without codes still need to be dealt with. The annotations have been compared and finalized (corrected_df_laura2_DONE).
- 100 samples annotated by Stijn (annotated_instances_STIJN2.tsv)

TO DO:
- Joe samples
- Compare Joe samples
- Laura extra + compare
- compare those 100 by Marlies

Still need to do for all:

- all phylogenetic methods should have 'phylogenetic'
- all haplotype should have 'genetic distance'
- all interbr_morph should also be labelled 'interbreeding'
- Make sure I do the hierarchy updating again at the end for all the ones that have been corrected
- Does acoustic data always assume nonphylo analysis?
- Does gen_analysis imply gen_data?
- does SPEC_DELIMITATION always involve phylogenetic tree inference? (or at least, if there is GMYC)?
  

## load newly annotated data from jsonl

This process includes reading the data and making sure that higher level categories are "1" if their leaf categories are "1"

In [5]:
# Load every annotation batch into its own DataFrame. The names defined here
# (batch1, batch1_double, batch2_m, batch2_m2, batch2_l, batch2_ls, batch2_s)
# are read by later cells.

## first batch, 403 done by me of which 100 also by Marlies
batch1 = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\corrected_dfx.csv", sep = ";")
batch1_double = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\checked_samples.csv", sep = ";")

# add a column to show whether it has been double-checked by Marlies
# (1 when the sample id also occurs in batch1_double, else 0)
batch1['checked'] = np.where(batch1['id'].isin(batch1_double['id'].values), 1, 0)
# drop columns that are not carried over into the combined data set
batch1 = batch1.drop(columns = ['Unnamed: 0','Phylo_singlelocus', 'Phylo_multilocus','Other', 'Revisions', 'bad sample', 'gen alaysis' ])
# add the category that was introduced later: SEQUENCING is 1 when any of gen1/gen2/gen3 is 1
batch1['SEQUENCING'] = np.where(((batch1['gen1'] == 1) |(batch1['gen2'] == 1) |(batch1['gen3'] == 1)),1,0)

# Marlies batch 2 (300): Stijn's annotations (batch2_m) and Marlies' annotations (batch2_m2)
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\annotated_instances_marlies2_stijn.jsonl"
filepath2 = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\annotated_instances_marlies2_marlies.jsonl"
data = read_jsonfile(filepath)
batch2_m = json_to_df(data, classif)
# batch2_m['checked'] = 0
data2 = read_jsonfile(filepath2)
batch2_m2 = json_to_df(data2, classif)

## Laura batch 2 (300): Laura's annotations (batch2_l) and Stijn's annotations (batch2_ls)
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\laura2\annotated_instances_laura2_stijn.jsonl"
filepath_laura =  r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\laura2\annotated_instances_laura2_laura.jsonl"
data = read_jsonfile(filepath_laura)
batch2_l = json_to_df(data, classif)
batch2_l['checked'] = 0
data = read_jsonfile(filepath)
batch2_ls = json_to_df(data, classif)
batch2_ls['checked'] = 0
## Stijn batch 2 (100), exported as a tab-separated file instead of jsonl
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\stijn2\annotated_instances_STIJN2.tsv"
df = pd.read_csv(filepath, sep = '\t')

terms = extract_strings(classif)
# the first three columns are renamed to user/id/displayed_text; every other
# header keeps only the part after its first ':::' separator
df.columns = ['user', 'id', 'displayed_text'] + [i.split(":::")[1] for i in df.columns[3:]]


# categories without a column in the export are added as empty columns
missing = [j for j in extract_strings(classif) if j not in df.columns]
df[missing] = np.nan
# any non-empty cell counts as "label present" (1), empty cells as 0
df[terms] = df[terms].notna().astype(int)
# propagate leaf labels to their parent categories (modifies df in place)
batch2_s = recursive_update(df, classif)
batch2_s = batch2_s.drop(columns = 'user')
batch2_s['checked'] = 0

In [71]:
# Stijn batch 2 (100 samples) from the tab-separated export.
filepath = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\stijn2\annotated_instances_STIJN2.tsv"
df = pd.read_csv(filepath, sep='\t')

terms = extract_strings(classif)

# Keep the three leading metadata columns and reduce each remaining header to
# the part after its first ':::' separator.
df.columns = ['user', 'id', 'displayed_text'] + [header.split(":::")[1] for header in df.columns[3:]]

# Categories that have no column in the export are added as empty columns.
missing = [term for term in terms if term not in df.columns]
df[missing] = np.nan

# Non-empty cell -> 1, empty cell -> 0.
df[terms] = df[terms].notna().astype(int)

# recursive_update fills in parent categories on df itself and returns it.
batch2_s = recursive_update(df, classif).drop(columns='user')
batch2_s['checked'] = 0

In [122]:
# sanity check: (rows, columns) of the double-annotated subset of batch 1
batch1_double.shape

(100, 74)

## map differences

In [152]:
# Column-name mapping from the first classification (batch 1) to the current
# one, used to bring batch 1 in line with the later batches.
mapping_dct = {
    # --- phenotype ---
    "Phenotype": "PHENOTYPE",
    "Phen datatypes": "Phen_data",
    "MORPH": "MORPH",
    "quant morph": "quant_morph",
    "qual morph": "qual_morph",
    "color_pattern": "color_pattern",
    "Shape": "shape",
    "Texture": "texture",
    "Ultrastructural": "ultrastruct",
    "Interbreeding_morph": "interbr_morph",
    "BEHAV": "BEHAV",
    "Acoustic data": "acoustic",
    "feeding": "feeding",
    "Mating behavior": "mating",
    "ECOLOGY": "ECOLOGY",
    "Phen processing": "Phen_proc",
    "IMAGING": "IMAGING",
    "SAMPLING": "SAMPLING",
    "STORAGE": "STORAGE",
    "Phen analysis": "Phen_analysis",
    "phen_regression": "phen_nonphylo",
    "Phen_pylo": "phen_pylo",
    # --- genotype ---
    "Genotype": "GENOTYPE",
    "genot datatypes": "Gen_data",
    "Nuclear DNA": "nuclear",
    "Organellar DNA": "organellar",
    "Transcriptomic data": "transcriptomic",
    "Proteomic data": "proteomic",
    "Microsatellites": "tandem_repeats",
    "Whole genomes": "whole_genomes",
    "Exomes": "exomes",
    "Genome-wide studies/SNPs": "genome_wide",
    "Epigenetic data": "epigenetic",
    "eDNA": "eDNA",
    "BIOCHEM": "BIOCHEM",
    "Chemotax": "chemotax",
    "Cytotax": "cytotax",
    "gen processing": "Gen_proc",
    "SEQUENCING": "SEQUENCING",
    "gen1": "gen1",
    "gen2": "gen2",
    "gen3": "gen3",
    "other": "genproc_other",
    "gen analysis": "Gen_analysis",
    "GEN_NON_PHYLO": "GEN_NON_PHYLO",
    "Distance": "distance",
    "haplowebs": "haplowebs",
    "Fixed alt character states": "fixed_alt_states",
    "Clustering": "clustering",
    "fen_Interbreeding": "gen_interbr",
    "PHYLO_SD": "PHYLO_SD",
    "PHYLO_TREE": "PHYLO_TREE",
    "Distance_based": "distance_based",
    "Character_based": "character_based",
    "Consensus_supertree": "consensus_supertree",
    "ML_methods": "MACHINE_LEARNING",
    # --- singletons ---
    "Singletons": "Singletons",
    "Interbreeding": "interbreeding",
    "spec justification": "rank_just",
    "Phylogenetic": "phylogenetic",
    "Specimen storage location": "specimen_storage_loc",
    "sampling location": "sampling_loc",
    "abbreviations & terms": "abbrev_terms",
    "nomenclature & history": "nomenclat_history",
    "BIOGEO": "biogeo",
    # --- metadata columns (unchanged) ---
    "id": "id",
    "displayed_text": "displayed_text",
    "checked": "checked",
}

# Apply the new column names, then keep only the first occurrence of any
# column name that appears more than once after renaming.
batch1 = (
    batch1
    .rename(columns=mapping_dct)
    .loc[:, lambda frame: ~frame.columns.duplicated()]
)

In [153]:
# Persist the renamed batch 1 frame as a ';'-separated file. The index is
# written as well (to_csv default), so it comes back as an unnamed first column
# when the file is re-read without index_col.
batch1.to_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\batch1_FullFinal.csv", sep = ";")

## make 1 file

In [186]:
# Concatenate the finalized annotation batches into one DataFrame (combined_df).

batch1 =  pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\batch1_FullFinal.csv", sep = ";")
batch2_m = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\marlies_batch2_checked.csv", sep = ";")
batch2_l = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\laura2\corrected_df_laura2_DONE.csv", sep = ";")
batch2_s = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\stijn2\stijn2.csv", sep = ";")

# every sample in Laura's batch is marked as checked
batch2_l['checked'] = 1

# remember which batch each row came from
batch1['batch'] = 'batch1'
batch2_m['batch'] = 'batch2_m'
batch2_s['batch'] = 'batch2_s'
batch2_l['batch'] = 'batch2_l'

dfs = [batch1, batch2_m, batch2_s, batch2_l]

for idx, i in enumerate(dfs):
    # lower-case the headers in place so all batches share the same column names
    i.columns = [j.lower() for j in i.columns]
    # errors='ignore' already skips columns that are absent, so no KeyError can
    # be raised here and no try/except is needed
    dfs[idx] = i.drop(columns = ['unnamed: 0', 'terms_abbrev'], errors = 'ignore')

combined_df = pd.concat(dfs, axis=0, ignore_index=True)

- all phylogenetic methods should have 'phylogenetic'
- all haplotype should have 'genetic distance'
- all interbr_morph should also be labelled 'interbreeding'
- Make sure I do the hierarchy updating again at the end for all the ones that have been corrected
- Does acoustic data always assume nonphylo analysis?
- Does gen_analysis imply gen_data?
- does SPEC_DELIMITATION always involve phylogenetic tree inference? (or at least, if there is GMYC)?

In [190]:
# Complete labels that are implied by other labels.

# interbreeding: implied by morphological, mating or genetic interbreeding evidence
combined_df.loc[combined_df[['interbr_morph', 'mating', 'gen_interbr']].eq(1).any(axis=1), 'interbreeding'] = 1
# distance: implied by haplowebs or distance-based trees
combined_df.loc[combined_df[['haplowebs', 'distance_based']].eq(1).any(axis=1), 'distance'] = 1
# phylogenetic: implied by any of the phylogenetic method categories
combined_df.loc[combined_df[['phen_pylo', 'phylo_sd', 'phylo_tree']].eq(1).any(axis=1), 'phylogenetic'] = 1

In [199]:
# recursive updating using the hierarchy

def lowercase_nested(data):
    """Return a copy of a nested dict/list structure with all strings lower-cased.

    Dict keys and values, list items and bare strings are handled recursively.
    String keys are lower-cased; keys of any other type are kept as they are,
    and their values are still processed. Non-string leaves are returned unchanged.
    """
    if isinstance(data, dict):
        # The type check belongs to the key: only strings have .lower().
        return {
            (k.lower() if isinstance(k, str) else k): lowercase_nested(v)
            for k, v in data.items()
        }
    elif isinstance(data, list):
        return [lowercase_nested(item) for item in data]
    elif isinstance(data, str):
        return data.lower()
    else:
        # Leave other data types unchanged
        return data


# The column names of combined_df were lower-cased when the batches were
# concatenated, so the hierarchy has to be lower-cased before it can be applied.
lc_classif = lowercase_nested(classif)
# recursive_update modifies combined_df in place; .head() only previews the result.
recursive_update(combined_df, lc_classif).head()

Unnamed: 0,phenotype,phen_data,morph,quant_morph,qual_morph,color_pattern,shape,texture,ultrastruct,interbr_morph,...,specimen_storage_loc,sampling_loc,abbrev_terms,nomenclat_history,biogeo,id,displayed_text,checked,sequencing,batch
0,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,./Corpus/EJT/10_5852_ejt_2021_735_1243.json_0,The material examined was collected in fragmen...,0,0,batch1
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,./Corpus/Zootaxa/1/zootaxa_1920_1_5.json_0,Invertebrate samples\nwere collected using a h...,0,0,batch1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Zootaxa/4/7/zootaxa_4729_2_8.json_0,The nymphs were collected in the stream by han...,0,0,batch1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Pensoft/phytokeys-47-059.json_0,We verified both the endemic status and the di...,0,0,batch1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Pensoft/zookeys-315-055.json_0,"During each cruise, specimens were sorted onbo...",1,0,batch1


In [9]:
# Uncomment to (re)write the combined data set:
# combined_df.to_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\testData20012025.csv")
# Reload the saved combined data set, using the stored 'Unnamed: 0' column as index.
combined_df = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\testData20012025.csv", index_col = 'Unnamed: 0')

## load newly annotated data from tsv

This process includes reading the data and making sure that higher level categories are "1" if their leaf categories are "1"

In [61]:
# Load a tab-separated annotation export and turn it into 0/1 label columns.
# NOTE(review): `filepath1` is not assigned anywhere in the visible notebook;
# set it to the path of the .tsv export before running this cell.
df1 = pd.read_csv(filepath1, sep = '\t')

terms = extract_strings(classif)

# The first three columns become user/id/displayed_text; every other header
# keeps only the part after its first ':::' separator. (The original cell read
# the headers from an undefined name `a`; they come from df1 itself.)
df1.columns = ['user', 'id', 'displayed_text'] + [i.split(":::")[1] for i in df1.columns[3:]]

# Categories without a column in the export are added as empty columns.
missing = [j for j in terms if j not in df1.columns]
df1[missing] = np.nan
# Non-empty cell -> 1, empty cell -> 0.
df1[terms] = df1[terms].notna().astype(int)
# Propagate leaf labels to their parent categories.
df1 = recursive_update(df1, classif)

NameError: name 'filepath1' is not defined

In [335]:
# Look up the displayed text of one specific sample by its id.
batch2_l.loc[batch2_l['id'] == './Corpus/Zootaxa/2/zootaxa_2620_1_1.json_1', 'displayed_text'].values

array(['Morphological terminology follows Goulet & Huber (1993), Viitasaari (2002), and Vikberg (2003). Body parts are measured in millimetres. The annuli of the\nlancet are counted from the base towards the tip of the lancet starting with annulus 1. One reason is that they were\nmissing in previous revisions (Kopelke 2007c: not even length of body is given). If possible (these are often more or less damaged), type\nspecimens were also measured. The specimens chosen for measurement were well-developed specimens, and\ntheir body size was near or above the mean. Reared specimens are often small and sometimes deformed due to\nrearing conditions or because larvae were collected for rearing when too small. From the set of measurements\nof one individual many different ratios or indices can be calculated. In small females the ovipositor and its\nsheath is relatively longer when compared to the length of the hind femur or width of the head. Therefore the\nvariation of ovipositor / head width 

## Compare and fix annotations by multiple annotators

In [6]:
# Ids annotated by Marlies that do not occur in Stijn's frame, and the ones
# that both annotators share.
missing_stijn = [
    sample_id for sample_id in batch2_m2['id'].values
    if sample_id not in batch2_m['id'].values
]
remaining_marlies = [
    sample_id for sample_id in batch2_m2['id'].values
    if sample_id not in missing_stijn
]

# df1: Stijn's rows for the shared ids; df2: Marlies' rows for shared + missing ids.
df1 = batch2_m.loc[batch2_m['id'].isin(remaining_marlies)]
df2 = batch2_m2.loc[batch2_m2['id'].isin(remaining_marlies + missing_stijn)]

In [9]:
# Both frames must have the same shape before they can be compared
# (only the shapes are checked here, not the column names).
if df1.shape != df2.shape:
    print('shape mismatch')
    print(df1.shape)
    print(df2.shape)
else:
    print('all good')

all good


In [7]:
def add_rows(ids, df):
    """Append one all-zero row per id and return the extended DataFrame.

    Every column of `df` is set to 0 in the new rows, except 'id', which holds
    the given id. The input frame is not modified; the result gets a fresh
    0..n-1 index.
    """
    zero_row = dict.fromkeys(df.columns, 0)
    filler = pd.DataFrame([{**zero_row, 'id': sample_id} for sample_id in ids])
    return pd.concat([df, filler], ignore_index=True)

In [8]:
# Give Stijn's frame an all-zero row for every id only Marlies annotated, then
# sort both frames by id and use the id as index so the rows line up.
df1 = add_rows(missing_stijn, df1).sort_values(by='id').set_index('id')
df2 = df2.sort_values(by='id').set_index('id')

In [7]:
# it should be rows that are empty for one, but not for the other
# you don't always need this
# so add them

# missing_s = [i for i in batch2_l.id.values if i not in batch2_ls.id.values]
# missing_l = [i for i in batch2_ls.id.values if i not in batch2_l.id.values]


# df1 = add_rows(missing_s, batch2_ls).sort_values(by='id').reset_index(drop=True).set_index('id')
# df2 = add_rows(missing_l, batch2_l).sort_values(by='id').reset_index(drop=True).set_index('id')

In [10]:
# Get an idea of how much disagreement there is: DataFrame.compare keeps only
# the rows and columns where df1 and df2 differ, with the two versions of each
# differing column labelled 'stijn' (df1) and 'marlies' (df2).
shared = df1.compare(df2, result_names=('stijn','marlies'))

In [11]:
# Number of samples to correct: the first value is the number of rows with at
# least one disagreement; the second counts the stijn/marlies sub-columns.

shared.shape

(77, 72)

In [227]:
# Only run this when resuming the correction work: it replaces df1 with the
# corrections saved so far.

df1 = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\corrected_df_marlies2.csv", sep = ";")
# Uncomment to index the reloaded frame by sample id again:
# df1 = df1.set_index('id')

In [192]:
# Inspect one disagreement between the two annotators.

# show all rows and columns so nothing is truncated in the output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# the row you want to fix (position within `shared`)
r = 76

# get the id (the index label of that row)
idx = shared.index[r]

# show only the columns on which the annotators differ for this sample
pd.DataFrame(shared.iloc[r,:]).dropna(axis =0)

Unnamed: 0,Unnamed: 1,./Corpus/Zootaxa/9/zootaxa_960_1_1.json_0
Phen_data,stijn,1.0
Phen_data,marlies,0.0
MORPH,stijn,1.0
MORPH,marlies,0.0


In [190]:
# The text of the sample currently under review.
df2.loc[idx, 'displayed_text']

'The material of Lygistorrhina sanctaecatharinae was\noriginally pinned, but was mounted on slides after treatment with KOH and dehydration. The data matrix (Appendix 1) was based on the one given by Grimaldi\nand Blagoderov (2001), although some characters were excluded because we could not\ncode the states, and some new characters were added. The character states for Matileola\nyangi were taken from the original description (Papp 2003). The new taxa and Lygistorrhina sanctaecatharinae were coded from actual specimens. We did not try to find characters to resolve the phylogeny among the species of Lygistorrhina and Probolaeus. The data\nmatrix for analysis was constructed and manipulated with the computer programme Winclada version 1.00.08 (Nixon 2002). Phylogenetic relationships were studied by parsimony\nanalysis, using the computer programme NONA, version 2.0 (Goloboff 1999), together\nwith Winclada, to search for the most parsimonious cladograms. The search parameters\nused with N

In [194]:
# choose one df (e.g. dfx) to fix in place
# change various column row values in place

# df1.loc[idx, 'shape'] = 1
# df1.loc[idx, 'sampling_loc'] = 1
# df1.loc[idx, 'MORPH'] = 1
# df1.loc[idx, 'STORAGE'] = 0
# df1.loc[idx, 'displayed_text'] ='The Azores Autonomous Region is a Portuguese archipelago formed by nine volcanic islands, fortytwo identified seamounts, narrow shelves, steep island slopes, bathyal and abyssal plains and oceanic ridges, located\nin the central North Atlantic Ocean (Morato et al. The archipelago is composed of three main groups of\nislands intersected by the Mid-Atlantic Ridge (MAR), which divides the western group of islands (Flores and Corvo)\nfrom the central (Terceira, Graciosa, S. Jorge, Pico and Faial) and eastern (S. Miguel and S. Maria) groups, over the\nAzores plateau that rises from the adjacent abyssal plains from ca. '


# use this to save the corrected df each time you make changes
# df1.to_csv(r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\corrected_df_marlies2_DONE.csv', sep = ';')

In [236]:
# final.to_csv(r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\secondTry\marlies2\marlies_batch2_checked.csv', sep = ';')

# Data from active learning

Get training samples for sparse categories

In [6]:
# Label columns of interest, lower-cased to match the column names of the
# combined data set.
desired = [
    name.lower()
    for name in (
        'PHENOTYPE', 'Phen_data', 'MORPH', 'Phen_proc',
        'IMAGING', 'quant_morph', 'STORAGE', 'SAMPLING',
        'GENOTYPE', 'interbr_morph', 'Gen_data', 'Gen_analysis',
        'Gen_proc', 'SEQUENCING', 'organellar', 'PHYLO_TREE',
        'Phen_analysis', 'character_based', 'color_pattern', 'phen_nonphylo',
        'GEN_NON_PHYLO', 'ECOLOGY', 'distance', 'nuclear',
        'BEHAV', 'phylogenetic', 'rank_just', 'phen_pylo',
        'distance_based', 'acoustic', 'PHYLO_SD', 'interbreeding',
    )
]

In [7]:
# NOTE(review): these imports sit in the middle of the notebook; consider moving
# them to the import cell at the top.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# original data: the combined annotation file (the same csv that is read into
# combined_df above; here the stored index column is dropped instead of reused)
df = pd.read_csv(r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\testData20012025.csv")
df = df.drop(columns = 'Unnamed: 0')
# NOTE(review): not the last expression of the cell, so this preview is never displayed
df.head(2)


# preprocess

# Extract features and labels
X = df['displayed_text']  # Features (text field)
y = df[desired]  # labels: one column per category listed in `desired`

# Remove columns with only one class (all 0s or all 1s)
non_constant_columns = [col for col in y.columns if y[col].nunique() > 1]
y = y[non_constant_columns]

# vectorize text (TF-IDF over at most 1000 features)
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_vectorized = vectorizer.fit_transform(X)


# Split into train/test sets (80/20, fixed random_state so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=2808)

In [10]:
# Labels with fewer than 60 positive samples in the combined data set.
mask = combined_df[desired].sum().lt(60)
combined_df[desired].sum().loc[mask]

nuclear           54
behav             47
rank_just         40
phen_pylo         30
distance_based    23
acoustic          20
phylo_sd          18
dtype: int64

In [11]:
# Names of the sparse labels selected by `mask`.
combined_df[desired].sum().loc[mask].index

Index(['nuclear', 'behav', 'rank_just', 'phen_pylo', 'distance_based',
       'acoustic', 'phylo_sd'],
      dtype='object')

In [12]:
# Keep the sparse label names for the active-learning steps below.
sparse = combined_df[desired].sum().loc[mask].index
sparse

Index(['nuclear', 'behav', 'rank_just', 'phen_pylo', 'distance_based',
       'acoustic', 'phylo_sd'],
      dtype='object')

In [13]:
# NOTE(review): imports in the middle of the notebook; consider moving them to
# the import cell at the top.
import os

# load a classifier
import joblib

# directory that holds the saved vectorizer and one model file per label
base_filepath = r"C:\Users\conix\Documents\Corpus\classifier_models"

# Load the vectorizer (this replaces the `vectorizer` fitted in the earlier cell)
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
vectorizer_filepath = os.path.join(base_filepath, "vectorizer.pkl")
vectorizer = joblib.load(vectorizer_filepath)

# Load each model dynamically based on label names: one file "model_<label>.pkl"
# per column of y
loaded_models = {}
for label in y.columns:  # Ensure this matches the original label set
    model_filepath = os.path.join(base_filepath, f"model_{label}.pkl")
    loaded_models[label] = joblib.load(model_filepath)

In [14]:
# Paragraphs that have not been annotated yet. The annotated ids are put in a
# set first so each membership test is a hash lookup instead of a scan over the
# whole id array for every paragraph in the corpus.
annotated_ids = set(combined_df.id.values)
new_samples_dct = {key: value for key, value in methods_paras.items() if key not in annotated_ids}

# Only the models for the sparse labels are needed for active learning.
new_models_dict = {key: value for key, value in loaded_models.items() if key in sparse}

In [15]:
# New samples
import random

# Draw the candidate paragraphs with a dedicated, seeded generator so the same
# candidates are selected on every run (the unseeded draw could not be
# reproduced). The sample size is capped at the number of available paragraphs
# so the draw cannot fail on a small corpus.
rng = random.Random(2808)
candidate_keys = list(new_samples_dct.keys())
new_sample_keys = rng.sample(candidate_keys, min(5000, len(candidate_keys)))
new_samples = [new_samples_dct[i] for i in new_sample_keys]


# Preprocess using the vectorizer
new_samples_vectorized = vectorizer.transform(new_samples)
predictions = {}
probabilities = {}

# One prediction array and one class-probability array per sparse label
for label, model in new_models_dict.items():
    predictions[label] = model.predict(new_samples_vectorized)
    probabilities[label] = model.predict_proba(new_samples_vectorized)

# Convert probabilities to a DataFrame: one column per label, holding the
# second column of predict_proba (presumably the positive class -- TODO confirm
# against each model's classes_). Row i corresponds to new_sample_keys[i].
probabilities_df = pd.DataFrame({label: prob[:, 1] for label, prob in probabilities.items()})

In [16]:
# Measure how uncertain each model is about each sample: the distance of the
# predicted probability from 0.5. Smaller means more uncertain.
uncertainty = (probabilities_df - 0.5).abs()

# The 10 samples with the lowest total distance across all labels.
most_uncertain_indices = uncertainty.sum(axis=1).nsmallest(10).index

In [35]:
# Select the samples to annotate, in two passes.

# Pass 1: per label, the n samples with the smallest uncertainty value
# (i.e. predicted probability closest to 0.5).
sample_indices = {}
for i in uncertainty.columns:

    #sample the 5 most informative for each
    n = 5
    # get indices of n most uncertain samples
    indices = uncertainty.loc[:,i].sort_values()[:n].index.values
    sample_indices[i] = indices

# merge the per-label selections and remove duplicates
indices = list(set([item for sublist in sample_indices.values() for item in sublist]))

# Pass 2: add samples that are uncertain for several labels at once. Each pair
# is (threshold, minimum count): a sample is added when at least `count` labels
# have an uncertainty value below `threshold`.
for i in [(0.3,6), (0.25,4), (0.18,3), (0.02,2)]:
    extra = list(uncertainty[(uncertainty <i[0]).sum(axis=1) >= i[1]].index.values)
    indices.extend(extra)  

# remove duplicates again and report how many samples were selected
indices = list(set(indices))
print(len(indices))

111


In [36]:
# make a dataset to annotate: map each selected position back to its paragraph
# id and text (position i in `indices` refers to new_sample_keys[i] / new_samples[i])

samples_to_annotate = {new_sample_keys[i]:new_samples[i] for i in indices}


# NOTE(review): the file has a .json extension but is written as one JSON
# object per line (JSON Lines).
output_file_path = r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\fourthTry\active_learning_samples1.json'


# Write the selected id/text pairs to the output file, one record per line,
# each with an empty "annotations" list
with open(output_file_path, 'w') as f:
    for key, text in samples_to_annotate.items():
        entry = {
            "id": key,
            "text": text,
            "annotations": []
        }
        f.write(json.dumps(entry) + '\n')