In [2]:
import pandas as pd
import numpy as np
import json
import pickle

# Paragraphs

In [2]:
# Load the cached methods paragraphs and sentences from pickle files
# (use this cell if you don't want to re-run the step that created them;
# that step is not shown in this part of the notebook).
# NOTE(review): both paths are absolute and machine-specific; adjust them
# before running on another machine.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load pickles that come from a trusted source.

filename = r"C:\Users\conix\Documents\Corpus\methods_corpus\methods_paragraph_corpus.pickle"
with open(filename, "rb") as f:
    methods_paras = pickle.load(f)


# `filename` is reused for the second file (sentence-level corpus).
filename = r"C:\Users\conix\Documents\Corpus\methods_sentencesrawtext.pickle"        
with open(filename, "rb") as f:
    methods_sentences = pickle.load(f)

# Report the size of both loaded collections as a quick sanity check.
print(f'total number of methods paragraphs: {len(methods_paras)}')
print(f'total number of methods sentences: {len(methods_sentences)}')

total number of methods paragraphs: 44853
total number of methods sentences: 641892


# Convenience functions

In [5]:
def extract_strings(d):
    """Flatten a nested classification into a flat list of category names.

    The classification is a nesting of dicts, lists and strings. Names are
    collected depth-first in definition order: for a dict, each key is
    emitted followed by everything found in its value; for a list, each
    item is flattened in turn; a string is emitted as-is. Any other type
    contributes nothing.

    Parameters
    ----------
    d : dict | list | str | any
        The (sub)structure to flatten.

    Returns
    -------
    list of str
        Every dict key and every string found, in traversal order.
    """
    if isinstance(d, str):
        return [d]

    if isinstance(d, list):
        return [name for item in d for name in extract_strings(item)]

    if isinstance(d, dict):
        collected = []
        for key, value in d.items():
            # The category itself comes first, then all of its descendants.
            collected.append(key)
            collected += extract_strings(value)
        return collected

    # Anything else (numbers, None, ...) carries no category names.
    return []

def read_jsonfile(filepath):
    """Read a JSON Lines file (one JSON document per line) into a list.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped silently instead of being reported as
    decoding errors. Lines that contain text but are not valid JSON are
    reported on stdout and skipped, so one corrupt record does not abort
    the whole load.

    Parameters
    ----------
    filepath : str or path-like
        Location of the ``.jsonl`` file, read as UTF-8.

    Returns
    -------
    list
        The decoded objects in file order. An empty list is returned (and a
        message printed) when the file does not exist.
    """
    data = []
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            for line in file:
                # Empty lines carry no record; don't flag them as errors.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Error decoding line: {line}")
    except FileNotFoundError:
        print(f"The file {filepath} was not found.")
    return data

def extract_terms_after_colons(data):
    """Collect the label names from a (nested) annotation dictionary.

    The dictionary is walked depth-first. Nested dicts are descended into;
    every string value containing ``':::'`` contributes the text after its
    last ``':::'`` separator. Values of any other kind (including lists)
    are ignored.

    Parameters
    ----------
    data : dict
        Annotation mapping, possibly containing further dicts as values.

    Returns
    -------
    list of str
        The extracted terms, in traversal order.
    """
    def _iter_terms(node):
        for _key, value in node.items():
            if isinstance(value, dict):
                # Descend into nested annotation groups.
                yield from _iter_terms(value)
            elif isinstance(value, str) and ':::' in value:
                # Keep only the part after the separator.
                yield value.split(':::')[-1]

    return list(_iter_terms(data))

def recursive_update(df, classif):
    """Propagate positive labels from subcategories up to their parents.

    For every key in ``classif`` whose value is a list, the column named
    after the key is set to 1 on each row where at least one of the direct
    child columns equals 1. Nested categories are resolved first, so a
    positive leaf propagates through every level above it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one column per category; modified in place.
    classif : dict
        Classification hierarchy: keys map to lists whose items are either
        child names (str) or single-key dicts opening a nested category.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned to allow chaining.
    """
    for parent, children in classif.items():
        if not isinstance(children, list):
            continue

        # Depth first: nested categories must be up to date before the
        # parent looks at their columns.
        for child in children:
            if isinstance(child, dict):
                recursive_update(df, child)

        # Column name of each direct child: the string itself, or the
        # (first) key of a nested category dict.
        child_columns = []
        for child in children:
            if isinstance(child, str):
                child_columns.append(child)
            else:
                child_columns.append(list(child.keys())[0])

        has_positive_child = (df[child_columns] == 1).any(axis=1)
        df.loc[has_positive_child, parent] = 1

    return df

def json_to_df(data, classif):
    """Turn the parsed annotation records into a complete 0/1 DataFrame.

    Each record becomes one row with a 0/1 column for every name in the
    classification (1 when that name is among the record's labels), plus
    the ``id`` and ``displayed_text`` of the record. Afterwards the
    higher-level categories are switched on wherever one of their
    subcategories is 1.

    Parameters
    ----------
    data : iterable of dict
        Parsed records; each must provide ``'label_annotations'``,
        ``'displayed_text'`` and ``'id'``.
    classif : dict
        Classification hierarchy defining the category columns.

    Returns
    -------
    pandas.DataFrame
        Category columns in classification order, followed by ``id`` and
        ``displayed_text``.
    """
    category_cols = extract_strings(classif)
    columns = category_cols + ["id", "displayed_text"]

    def _to_row(entry):
        # Labels this annotator selected for the current sample.
        labels = extract_terms_after_colons(entry['label_annotations'])
        row = {cat: (1 if cat in labels else 0) for cat in category_cols}
        row["displayed_text"] = entry["displayed_text"]
        row['id'] = entry['id']
        return row

    # Build all rows first and create the frame once.
    df = pd.DataFrame([_to_row(entry) for entry in data], columns=columns)

    # Switch on parent categories whose children are set (in place).
    return recursive_update(df, classif)



# Our classification

## Old

In [3]:
# Terms and classification used for the first batch of annotations.
# For the updated terms, see `classif` below.
#
# Structure: every dict key is a higher-level category mapping to a list of
# children; a child is either a label (str) or a single-key dict that opens
# a nested category.
classif_old = {
    "Phenotype": [
        {
            "Phen datatypes": [
                {
                    "MORPH": [
                        "quant morph",
                        "Interbreeding_morph",
                        {
                            "qual morph": [
                                "color_pattern",
                                "Shape",
                                "Texture",
                                "Ultrastructural",
                            ],
                        },
                    ],
                },
                {"BEHAV": ["Acoustic data", "feeding", "Mating behavior"]},
                "ECOLOGY",
            ],
        },
        {"Phen processing": ["IMAGING", "SAMPLING", "STORAGE"]},
        {"Phen analysis": ["phen_regression", "phen_pylo"]},
    ],
    "Genotype": [
        {
            "genot datatypes": [
                "Nuclear DNA",
                "Organellar DNA",
                "Transcriptomic data",
                "Proteomic data",
                "Microsatellites",
                "Whole genomes",
                "Exomes",
                "Genome-wide studies/SNPs",
                "Epigenetic data",
                "eDNA",
                {"BIOCHEM": ["Chemotax", "Cytotax"]},
            ],
        },
        {
            "gen processing": [
                {"SEQUENCING": ["gen1", "gen2", "gen3"]},
                "other",
            ],
        },
        {
            "gen analysis": [
                {
                    "GEN_NON_PHYLO": [
                        "Distance",
                        "haplowebs",
                        "Fixed alt character states",
                        "Clustering",
                        "fen_Interbreeding",
                    ],
                },
                "PHYLO_SD",
                {
                    "PHYLO_TREE": [
                        "Distance_based",
                        "Character_based",
                        "Consensus_supertree",
                    ],
                },
                "Other",
                "ML_methods",
            ],
        },
    ],
    "Singletons": [
        "Interbreeding",
        "spec justification",
        "Phylogenetic",
        "Specimen storage location",
        "sampling location",
        "abbreviations & terms",
        "nomenclature & history",
        "BIOGEO",
    ],
}


## New

In [4]:
# Current classification used for annotation.
#
# Structure: every dict key is a higher-level category mapping to a list of
# children; a child is either a label (str) or a single-key dict that opens
# a nested category.
classif = {
    "PHENOTYPE": [
        {
            "Phen_data": [
                {
                    "MORPH": [
                        "quant_morph",
                        "interbr_morph",
                        {
                            "qual_morph": [
                                "color_pattern",
                                "shape",
                                "texture",
                                "ultrastruct",
                            ],
                        },
                    ],
                },
                {"BEHAV": ["acoustic", "feeding", "mating"]},
                "ECOLOGY",
            ],
        },
        {"Phen_proc": ["IMAGING", "SAMPLING", "STORAGE"]},
        {"Phen_analysis": ["phen_nonphylo", "phen_pylo"]},
    ],
    "GENOTYPE": [
        {
            "Gen_data": [
                "nuclear",
                "organellar",
                "transcriptomic",
                "proteomic",
                "tandem_repeats",
                "whole_genomes",
                "exomes",
                "genome_wide",
                "epigenetic",
                "eDNA",
                {"BIOCHEM": ["chemotax", "cytotax"]},
            ],
        },
        {
            "Gen_proc": [
                {"SEQUENCING": ["gen1", "gen2", "gen3"]},
                "genproc_other",
            ],
        },
        {
            "Gen_analysis": [
                {
                    "GEN_NON_PHYLO": [
                        "distance",
                        "haplowebs",
                        "fixed_alt_states",
                        "clustering",
                        "gen_interbr",
                    ],
                },
                "PHYLO_SD",
                {
                    "PHYLO_TREE": [
                        "distance_based",
                        "character_based",
                        "consensus_supertree",
                    ],
                },
                "MACHINE_LEARNING",
            ],
        },
    ],
    "Singletons": [
        "interbreeding",
        "rank_just",
        "phylogenetic",
        "specimen_storage_loc",
        "sampling_loc",
        "abbrev_terms",
        "nomenclat_history",
        "biogeo",
    ],
}

# Annotate Data

We annotate in Potato, using its immigration-framing template with an adapted config file that implements our classification.

## load newly annotated data from jsonl

This process includes reading the data and making sure that higher level categories are "1" if their leaf categories are "1"

In [None]:
# Location of the annotation output file (JSON Lines).
# NOTE(review): absolute, machine-specific path that also embeds an e-mail
# address; consider moving it to a configurable data directory.
filepath2 = r"C:\Users\conix\immigration_framing\annotation_output\stijn_conix@hotmail.com\annotated_instances.jsonl"
# Parse the file into a list of records (one per line).
data2 = read_jsonfile(filepath2)
# Build the 0/1 category DataFrame for the current classification; parent
# categories are set to 1 wherever one of their subcategories is 1.
df2 = json_to_df(data2, classif)

## load newly annotated data from tsv

This process includes reading the data and making sure that higher level categories are "1" if their leaf categories are "1"

In [None]:
# Load the annotations exported as TSV and bring them into the same shape as
# the jsonl-based frame: one 0/1 column per category of `classif`.
# NOTE(review): `filepath1` is not defined anywhere in the visible notebook;
# it must be set to the TSV export before this cell can run.
df1 = pd.read_csv(filepath1, sep = '\t')

# Original column headers of the TSV. The cell previously read these from a
# variable `a` that was never defined (NameError on a fresh kernel); the
# column assignment below only fits if `a` holds exactly these headers.
# NOTE(review): confirm that `a` was indeed meant to be the TSV header.
a = list(df1.columns)

# The first three columns are user / id / text; every remaining header is
# expected to contain ':::' with the label name as its second part.
labels = [i.split(":::")[1] for i in a[3:]]

terms = extract_strings(classif)
# Categories of the classification that have no column in the TSV.
missing = [j for j in terms if j not in labels]

df1.columns = ['user', 'id', 'displayed_text'] + labels
# Add the absent categories as empty columns so every term has a column.
df1[missing] = np.nan
# Any non-empty cell counts as "label selected" (1), empty cells as 0.
df1[terms] = df1[terms].notna().astype(int)
# Switch on parent categories whose subcategories are set.
df1 = recursive_update(df1, classif)

## Compare and fix annotations by multiple annotators

In [97]:
# First make sure that the column names and shapes of both frames are
# identical: DataFrame.compare only accepts identically-labelled frames.
# Get an idea of how much disagreement there is: compare() keeps only the
# rows that differ, so its length is the number of rows with a disagreement.
# (The original line had an unbalanced ')' and could not run; the recorded
# output of this cell is a single number, which matches the len(...) form.)
# NOTE(review): here df1 is labelled 'stijn' and dfx 'marlies', while the
# row-level comparison further down labels dfx as 'stijn' -- check which
# assignment is correct.
len(df1.compare(dfx, result_names=('stijn', 'marlies')))

86

In [None]:
# Inspect a single row on which the two annotated frames may differ.
# First make sure that the column names and shapes are identical.

# Lift pandas' display limits so the full comparison is shown.
# Note: these options stay in effect for the rest of the session.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# the row you want to fix (positional index, used with .iloc below)
r = 86

# get the id of that sample
# NOTE(review): `shared` is not defined in the visible part of the notebook;
# presumably it holds the samples coded by both annotators -- confirm.
idx = shared.iloc[r,:]['id']

# Show only the columns in which the two frames differ for this row.
# NOTE(review): dfx is labelled 'stijn' here but 'marlies' in the overall
# comparison above -- check which labelling is correct.
dfx.iloc[r,:].compare(df1.iloc[r,:], result_names=('stijn','marlies'))

In [None]:
# the text of the sample at position r, to re-read it while resolving the disagreement
shared.iloc[r,:]['displayed_text']

In [332]:
# choose one df (e.g. dfx) to fix in place
# change various column row values in place
# The rows are selected by sample id (`idx`, set in the inspection cell),
# and the column names are those of the old classification (`classif_old`).

dfx.loc[dfx.id == idx, 'gen processing'] = 0
dfx.loc[dfx.id == idx, 'Specimen storage location'] = 1
dfx.loc[dfx.id == idx, 'Genotype'] = 0


# use this to save the corrected df each time you make changes
# NOTE(review): the index is written too (no index=False), so reading this
# file back with read_csv adds an extra 'Unnamed: 0' column each round trip;
# confirm whether the saved index is needed.
dfx.to_csv(r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\corrected_df2.csv', sep = ';')

In [254]:
# Look up the current 'gen analysis' value of the sample being fixed.
dfx.loc[dfx.id == idx, 'gen analysis']

178    0
Name: gen analysis, dtype: int64

# Annotated Data

We want a DataFrame with one column for every category, including all higher levels in the hierarchy, and, for each sample, a 1 or 0 in every column.

## Load first batch

The first 100 coded samples (corrected) were done using slightly different terms (see classif_old). Update these terms here, and load the df.
Note: 'checked_samples' are the ones that both Marlies and I did. 'corrected_dfx' are my 403 coded samples, of which Marlies checked 100.

In [81]:
# first batch

# 100 checked by two annotators (Marlies and Stijn)
dfc = pd.read_csv(r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\checked_samples.csv', sep = ';')

# fix typos in the stored column names
dfc = dfc.rename(columns = {'gen alaysis': "gen analysis", "Phen_pylo": 'phen_pylo'})

# add new high-level category SEQUENCING
# this combines the three generations of sequencing:
# 1 if any of gen1 / gen2 / gen3 is 1, otherwise 0

dfc["SEQUENCING"] = np.where(((dfc['gen1'] == 1) | (dfc['gen2'] == 1) | (dfc['gen3'] == 1)), 1,0)

# remove Other from the old classif
# it was not used, and has been removed after those 403

old_terms = extract_strings(classif_old)
old_terms.remove('Other')

# replace old terms by new ones
# Old and new names are paired purely by their position in the flattened
# classifications, so both lists must have the same length; otherwise the
# renaming would silently shift onto the wrong columns (or fail midway).
# The new term list is computed once instead of once per lookup.

new_terms = extract_strings(classif)
if len(old_terms) != len(new_terms):
    raise ValueError(
        f"Cannot map old to new terms by position: {len(old_terms)} old terms "
        f"vs {len(new_terms)} new terms."
    )

replace_dict = dict(zip(old_terms, new_terms))
dfc = dfc.rename(columns=replace_dict)

dfc.head()

Unnamed: 0.1,Unnamed: 0,PHENOTYPE,Phen_data,MORPH,quant_morph,qual_morph,color_pattern,shape,texture,ultrastruct,...,bad sample,sampling_loc,abbrev_terms,nomenclat_history,biogeo,id,displayed_text,Gen_analysis,phen_pylo,SEQUENCING
0,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Pensoft/zookeys-315-055.json_0,"During each cruise, specimens were sorted onbo...",,,0
1,92,1,1,1,0,1,0,1,0,0,...,0,0,0,0,0,./Corpus/Zootaxa/4/1/zootaxa_4181_1_1.json_1,The discovered mines were documented by photog...,,,0
2,93,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,./Corpus/Zootaxa/4/7/zootaxa_4779_4_1.json_0,Imagines were reared from larvae in containers...,,,0
3,94,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,./Corpus/Zootaxa/2/zootaxa_2676_1_2.json_1,"Pristina jenkinae, Rhyacodrilus subterraneus, ...",,,0
4,226,1,1,1,1,0,0,0,0,0,...,0,1,0,0,1,./Corpus/Zootaxa/3/6/zootaxa_3637_4_1.json_0,A total of 20 sites was sampled throughout the...,,,0


In [4]:
# 400, of which the same 100 checked by two annotators (Marlies and Stijn)
# The file is read as stored: no column renaming is applied in this cell.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
dfc2 = pd.read_csv(r'C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\firstTry\corrected_dfx.csv', sep = ';')

# Preview the first rows.
dfc2.head()

Unnamed: 0.1,Unnamed: 0,Phenotype,Phen datatypes,MORPH,quant morph,qual morph,color_pattern,Shape,Texture,Ultrastructural,...,Specimen storage location,bad sample,sampling location,abbreviations & terms,nomenclature & history,BIOGEO,id,displayed_text,gen alaysis,Phen_pylo
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,./Corpus/EJT/10_5852_ejt_2021_735_1243.json_0,The material examined was collected in fragmen...,,
1,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,./Corpus/Zootaxa/1/zootaxa_1920_1_5.json_0,Invertebrate samples\nwere collected using a h...,,
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,./Corpus/Zootaxa/4/7/zootaxa_4729_2_8.json_0,The nymphs were collected in the stream by han...,,
3,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,./Corpus/Pensoft/phytokeys-47-059.json_0,We verified both the endemic status and the di...,,
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,./Corpus/Pensoft/zookeys-315-055.json_0,"During each cruise, specimens were sorted onbo...",,
