In [1]:
# Core libraries
import os
import re
import ast
import json
import pickle
from math import ceil

# Data handling
import numpy as np
import pandas as pd

# Progress bar
from tqdm import tqdm

# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.stats import t

# Scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import (
    GridSearchCV,
    ParameterGrid,
    RandomizedSearchCV,
    train_test_split,
)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    hamming_loss,
    make_scorer,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier

# Other ML libraries
from lightgbm import LGBMClassifier
from skmultilearn.problem_transform import LabelPowerset
from iterstrat.ml_stratifiers import (
    MultilabelStratifiedKFold,
    MultilabelStratifiedShuffleSplit,
)

# Sentence transformers (e.g., SciBERT)
from sentence_transformers import SentenceTransformer

# Joblib
import joblib


# Data

In [2]:
# Load the hand-coded text passages and discard the leftover CSV index column.
df = pd.read_csv(
    r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\coding_trial\testData13052025.csv"
).drop(columns='Unnamed: 0')

# Flag the rows that come from truly random sampling (1) versus every other
# batch (0). Evaluating only on the random rows avoids testing on the
# oversampled sparse classes.
random_batches = [
    'batch1',
    'batch2_m',
    'batch2_s',
    'batch2_l',
    'batch2_j',
    'batch3_l',
]
df['is_random'] = np.where(df['batch'].isin(random_batches), 1, 0)

df.head(2)

Unnamed: 0,phenotype,phen_data,morph,quant_morph,qual_morph,color_pattern,shape,texture,ultrastruct,interbr_morph,...,abbrev_terms,nomenclat_history,biogeo,id,displayed_text,checked,sequencing,batch,not important,is_random
0,1,1,1,1,0,0,0,0,0,0,...,1,0,0,./Corpus/EJT/10_5852_ejt_2021_735_1243.json_0,The material examined was collected in fragmen...,0,0,batch1,,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,./Corpus/Zootaxa/1/zootaxa_1920_1_5.json_0,Invertebrate samples\nwere collected using a h...,0,0,batch1,,1


In [3]:
def _load_json(path):
    """Open `path` in text mode and return the parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# list with the categories of interest
cats = _load_json(
    r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\data\categories_of_interest.txt"
)

# dictionary mapping category column names to gpt names
category_names = _load_json(
    r'C:\\Users\\conix\\Dropbox\\FNRS project taxonomy\\methods in taxonomy\\data\\category_names_dict.txt'
)

# category descriptions used in the first try with gpt4omini
methods_old = _load_json(
    r'C:\\Users\\conix\\Dropbox\\FNRS project taxonomy\\methods in taxonomy\\data\\methods_description_gpt4omini_first_try.txt'
)

# category descriptions used in the second try with gpt4omini
methods = _load_json(
    r'C:\\Users\\conix\\Dropbox\\FNRS project taxonomy\\methods in taxonomy\\data\\methods_description_gpt4omini_second_try.txt'
)

# hierarchical classification of the categories
classif = _load_json(
    r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\data\classification_categories.txt"
)

# Training and evaluation functions

In [4]:
# calculate metrics for reporting and validation
def fold_metrics(y_true, y_pred):
    """Return overall + per-label metrics in a flat dict.

    Parameters
    ----------
    y_true : pd.DataFrame
        Ground-truth 0/1 label matrix; its column names are used to build the
        per-label keys, so a DataFrame (not a bare array) is required.
    y_pred : array-like
        Predicted 0/1 label matrix, column-aligned with `y_true`.

    Returns
    -------
    dict
        Keys, in insertion order: 'macro_f1', 'micro_f1', 'macro_prec',
        'micro_prec', 'macro_rec', 'micro_rec', 'hamming', 'subset_acc',
        followed by '<label>_prec', '<label>_rec', '<label>_f1' and
        '<label>_sup' for every column of `y_true`.
    """
    # zero_division=0 is passed everywhere (it used to be set only on the
    # per-label call): a label without positives or without predictions still
    # scores 0, but no longer triggers an UndefinedMetricWarning per fold.
    zd = {'zero_division': 0}
    out = {
        'macro_f1'  : f1_score(y_true, y_pred, average='macro', **zd),
        'micro_f1'  : f1_score(y_true, y_pred, average='micro', **zd),
        'macro_prec': precision_score(y_true, y_pred, average='macro', **zd),
        'micro_prec': precision_score(y_true, y_pred, average='micro', **zd),
        'macro_rec' : recall_score(y_true, y_pred, average='macro', **zd),
        'micro_rec' : recall_score(y_true, y_pred, average='micro', **zd),
        'hamming'   : hamming_loss(y_true, y_pred),
        'subset_acc': accuracy_score(y_true, y_pred),
    }
    # average=None returns one value per label, in column order
    p, r, f, s = precision_recall_fscore_support(
        y_true, y_pred, average=None, **zd)
    for i, lab in enumerate(y_true.columns):
        out[f'{lab}_prec'] = p[i]
        out[f'{lab}_rec']  = r[i]
        out[f'{lab}_f1']   = f[i]
        out[f'{lab}_sup']  = s[i]
    return out
    

# function to propagate the hierarchy through the classifier results
# this at least partially captures label relations in this binary-relevance strategy
def propagate_hierarchy(pred_df, classif):
    """Return a copy where every parent/cross-link label is made 1
       if any of its children are 1.

    Parameters
    ----------
    pred_df : pd.DataFrame
        0/1 predictions, one column per label. It is not modified.
    classif : dict | list
        Nested hierarchy: a dict maps a parent name to its children, and the
        children are strings (leaves) and/or dicts (sub-trees).

    Returns
    -------
    pd.DataFrame
        Copy of `pred_df` with parents switched on bottom-up. Children named
        in the hierarchy but absent from `pred_df` are ignored, in the same
        way the cross-link rules below are skipped when a column is missing.
        A parent key that is not a column (e.g. a pure grouping key) is still
        written through `.loc` and therefore shows up as an extra column;
        callers that need the original layout reindex afterwards.
    """
    out = pred_df.copy()

    def recurse(node):
        if isinstance(node, dict):
            for parent, children in node.items():
                # children first, so grandchildren reach the parent as well
                recurse(children)
                child_keys = []
                for c in children:
                    if isinstance(c, str):
                        child_keys.append(c)
                    elif isinstance(c, dict):
                        child_keys.extend(c.keys())
                # ignore hierarchy entries without a prediction column
                # instead of failing with a KeyError
                child_keys = [k for k in child_keys if k in out.columns]
                if child_keys:
                    out.loc[out[child_keys].eq(1).any(axis=1), parent] = 1
        elif isinstance(node, list):
            for item in node:
                recurse(item)

    recurse(classif)

    # cross-links outside the hierarchical classification
    if {'interbr_morph','gen_interbr','interbreeding'}.issubset(out):
        mask = (out.interbr_morph==1)|(out.gen_interbr==1)
        out.loc[mask, 'interbreeding'] = 1

    if {'distance_based','distance'}.issubset(out):
        out.loc[out.distance_based==1, 'distance'] = 1

    if {'phen_pylo','phylo_sd','phylo_tree','phylogenetic'}.issubset(out):
        mask = (out.phen_pylo==1)|(out.phylo_sd==1)|(out.phylo_tree==1)
        out.loc[mask, 'phylogenetic'] = 1

    return out





# Processing text and feature configuration

In [5]:
# Separate the rows by sampling scheme: `is_random == 1` marks the unbiased
# sample, `is_random == 0` the rows added to boost sparse categories.
rand_df   = df.loc[df['is_random'].eq(1)]
biased_df = df.loc[df['is_random'].eq(0)]

X_rand = rand_df['displayed_text']
y_rand = rand_df[cats]
X_bias = biased_df['displayed_text']
y_bias = biased_df[cats]

# Stratified 80/20 split of the unbiased rows only: the 20% becomes the
# held-out test set, the 80% is what cross-validation and the final fit use.
# The biased rows never enter validation or test data; they are appended to
# the training part of each fold and to the final training set.
msss = MultilabelStratifiedShuffleSplit(test_size=0.20, random_state=42)
train_idx, test_idx = next(msss.split(X_rand, y_rand))

X_train_raw = X_rand.iloc[train_idx]
y_train_raw = y_rand.iloc[train_idx]
X_test      = X_rand.iloc[test_idx]
y_test      = y_rand.iloc[test_idx]

# TF-IDF over unigrams and bigrams
tfidf = TfidfVectorizer(
    lowercase=True,
    strip_accents='unicode',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    norm='l2',
    dtype=np.float32,
)

# sparse -> dense step, for estimators that don't accept sparse matrices (rf, knn, gb)
densify = FunctionTransformer(lambda sparse_x: sparse_x.toarray(), accept_sparse=True)

# macro-F1 scorer: the classes are very imbalanced and many are very sparse
score = make_scorer(f1_score, average='macro')

# Multilabel-stratified folds keep the label proportions roughly equal:
# 5 outer folds for model evaluation, 3 inner folds for hyperparameter tuning.
outer = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner = MultilabelStratifiedKFold(n_splits=3, shuffle=True, random_state=1)

In [6]:
# vectorizer
# the one that came out as the best one in the binary relevance approach
plain_tfidf = TfidfVectorizer(
    analyzer='word',
    norm='l2',
    sublinear_tf=True,
    max_df=0.95,
    min_df=5,
    ngram_range=(1, 1),
    stop_words=None
)

# The pipelines further down are built with a step named `vect`, but no cell
# visible here defines that name, so a fresh "Restart & Run All" stops with a
# NameError. Binding it to the vectorizer above makes the notebook runnable.
# NOTE(review): assumes `vect` was meant to be this plain TF-IDF vectorizer
# (the one selected in the binary-relevance runs) - confirm before re-using
# the reported scores.
vect = plain_tfidf

# Classifier chain orders

We know how the labels relate to one another (the hierarchy) and how well logistic regression (LR) classifies each label on its own. A classifier chain that exploits this information might perform better overall. The general strategy is to place children before their parents and, among the children, to place the labels that are easiest to classify first. In addition, we try ordering the labels by correlation: if two labels correlate, the one that is easier to classify comes first.

In [8]:
def build_hier_corr_f1_order(hierarchy, y_df, mean_f1, alpha=0.7):
    """
    Build a chain order that puts children before their parent and ranks the
    children of each parent by a blend of F1 and correlation with the parent.

    hierarchy : nested dict/list structure (your `classif`)
    y_df      : DataFrame of shape (n, L) with 0/1
    mean_f1   : pd.Series  index = label_name,  value = mean F1 from BR CV
    alpha     : weight for F1 in composite score  (0..1)

    returns   idx_order  (list[int]), name_order (list[str])

    Every column of `y_df` appears exactly once in the result: labels that the
    hierarchy does not reach are appended by descending F1, and labels missing
    from `mean_f1` as well come last in column order. Names that are not
    columns of `y_df` (grouping keys, stale entries in `mean_f1`) are skipped.
    """
    labels    = list(y_df.columns)
    label_set = set(labels)

    # 1. correlation matrix (phi-coefficient), absolute values
    corr = y_df.corr().abs()

    # 2. helper: composite score for child w.r.t. parent
    def score(child, parent):
        f1  = mean_f1.get(child, 0.0)
        rho = corr.loc[parent, child] if (parent in corr and child in corr) else 0.0
        # a constant column has no defined correlation; count it as "no
        # correlation" so a NaN cannot scramble the sort below
        if pd.isna(rho):
            rho = 0.0
        return alpha * f1 + (1 - alpha) * rho

    order, seen = [], set()

    def emit(name):
        # only real label columns are emitted, and each one only once
        if name in label_set and name not in seen:
            order.append(name)
            seen.add(name)

    def dfs(node):
        if isinstance(node, dict):
            for parent, children in node.items():
                # collect direct children names
                direct = []
                if isinstance(children, dict):
                    direct.extend(children.keys())
                elif isinstance(children, list):
                    for c in children:
                        direct.extend(c.keys() if isinstance(c, dict) else [c])
                # best-scoring child first
                ranked = sorted(direct, key=lambda c: score(c, parent), reverse=True)
                # walk each child's whole sub-tree in that order
                for c in ranked:
                    if isinstance(children, dict) and c in children:
                        sub = {c: children[c]}
                    else:
                        sub = next(
                            (d for d in children if isinstance(d, dict) and c in d),
                            c,
                        )
                    dfs(sub)
                # the parent comes after all of its descendants
                emit(parent)
                # The loop above already visited every sub-tree, so the former
                # second pass over `children` is gone; only a bare-string child
                # was not covered by it and is still emitted here.
                if isinstance(children, str):
                    dfs(children)
        elif isinstance(node, list):
            for item in node:
                dfs(item)
        elif isinstance(node, str):
            emit(node)

    dfs(hierarchy)

    # 3. append labels outside the hierarchy by descending F1 ...
    for lab in mean_f1.sort_values(ascending=False).index:
        emit(lab)
    # ... and finally any column without an F1 entry, so that the result is
    # always a full permutation of the columns
    for lab in labels:
        emit(lab)

    idx_order = [labels.index(l) for l in order]
    return idx_order, order

def compute_label_correlations(y_df):
    """
    Given y_df: a DataFrame of shape (n_samples, n_labels), with 0/1 entries,
    compute the LxL Pearson correlation matrix between columns. Then return:
      - corr_matrix: a DataFrame of shape (L, L) where corr_matrix.loc[i, j]
                     is Pearson( y_df[i], y_df[j] ).
      - total_abs_corr: a Series of length L where total_abs_corr[i] =
                        sum over j != i of | corr_matrix[i, j] |
                        (i.e. how "connected" label i is).
    """
    # 1) .corr() on a binary DataFrame gives Pearson (equivalent to the
    #    phi-coefficient for 0/1 variables): symmetric, ones on the diagonal.
    corr_matrix = y_df.corr()

    # 2) Work on an explicit, writable NumPy copy. Writing through
    #    `DataFrame.values` fails once pandas Copy-on-Write is active, because
    #    that array is then read-only.
    abs_vals = corr_matrix.abs().to_numpy(dtype=float, copy=True)
    # drop the self-correlation so it is not counted in the total
    np.fill_diagonal(abs_vals, 0.0)

    # nansum skips undefined correlations (NaN), as DataFrame.sum does by default
    total_abs_corr = pd.Series(np.nansum(abs_vals, axis=1), index=corr_matrix.index)
    return corr_matrix, total_abs_corr
    

def correlation_greedy_order(y_df):
    """
    Order the labels by a greedy walk over the correlation graph.

    The walk starts at the label with the largest total absolute correlation
    and then keeps moving to the not-yet-visited label with the highest
    absolute Pearson correlation to the label visited last. Hierarchical
    constraints play no role here.

    Returns (idx_order, order_names): the positions in y_df.columns and the
    matching label names, both in walk order.
    """
    corr_df, total_abs_corr = compute_label_correlations(y_df)
    labels = list(y_df.columns)

    # entry point: the most "connected" label
    current = total_abs_corr.idxmax()
    order_names = [current]

    # labels still to be visited, kept in column order so that ties are
    # resolved in favour of the earlier column
    remaining = [lbl for lbl in labels if lbl != current]

    while remaining:
        def strength(candidate, anchor=current):
            return abs(corr_df.loc[anchor, candidate])

        current = max(remaining, key=strength)
        order_names.append(current)
        remaining.remove(current)

    # translate the names into positions within y_df.columns
    idx_order = [labels.index(name) for name in order_names]
    return idx_order, order_names

def build_parent_first_order(classif, cats):
    """Return the labels in parent-before-child (pre-order) sequence.

    Parameters
    ----------
    classif : dict
        Hierarchy mapping each top-level key to a list of children. A child is
        either a string (leaf) or a dict holding a nested sub-tree. The special
        top-level key "singletons" is a flat bucket of leaf names and is not a
        label itself.
    cats : sequence of str
        All label names in column order (list, tuple, pandas Index, ...).

    Returns
    -------
    dict
        {'idx': positions of the labels within `cats`, 'name': label names},
        both in pre-order.

    Raises
    ------
    TypeError
        If a child is neither str nor dict, or a singleton is not a str.
    KeyError
        If the hierarchy names a label that is not in `cats`.
    """
    # Materialise once: `.index()` below needs a list, and callers may pass a
    # pandas Index, which has no such method.
    cols = list(cats)

    # STEP 1: build a pre-order list of all labels in parent -> child order
    name_ord = []

    def _recurse_node(parent_name, children_list):
        # Emit the parent itself first
        name_ord.append(parent_name)

        # Then process its children (if any)
        for child in children_list:
            if isinstance(child, str):
                # plain string -> leaf, just emit it
                name_ord.append(child)

            elif isinstance(child, dict):
                # dict -> nested subtree; every key is a parent with its own list
                for k, v in child.items():
                    _recurse_node(k, v)
            else:
                raise TypeError(
                    f"Expected str or dict, but got {type(child)} inside children of {parent_name}"
                )

    # Walk every top-level group in `classif`:
    for top_key, top_children in classif.items():
        if top_key == "singletons":
            # "singletons" is just a bucket of leaf names -> emit each leaf directly
            for leaf in top_children:
                if not isinstance(leaf, str):
                    raise TypeError(
                        "Inside 'singletons' we expect only plain strings, got: "
                        f"{type(leaf)}"
                    )
                name_ord.append(leaf)

        else:
            # a "real" parent node -> recurse in pre-order
            _recurse_node(top_key, top_children)

    # STEP 2: every name must be a known label before it can be indexed
    missing = [n for n in name_ord if n not in cols]
    if missing:
        raise KeyError(
            "Some labels in the hierarchy were not found among the label names in `cats`:\n"
            + ", ".join(missing)
        )

    idx_ord = [cols.index(n) for n in name_ord]
    return {'idx':idx_ord, 'name':name_ord}

In [20]:
# get the results from LR

# Load the per-metric statistics stored by the LR (binary relevance) run.
# NOTE: pickle.load can execute arbitrary code - only open files you created yourself.
with open("methods_paper_files/best_lr_stats.pkl", "rb") as f:
    lr_results = pickle.load(f)

# one row per metric: column 'index' holds the metric name
lr_results = pd.DataFrame(lr_results['mean']).reset_index()

# Keep the F1 rows and skip the first two of them.
# NOTE(review): the two skipped rows are presumably the aggregate macro_f1 and
# micro_f1 entries, leaving one row per label in the order of `cats` - confirm
# against the notebook that wrote the pickle.
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of `lr_results` (SettingWithCopyWarning).
lr_f1 = lr_results.loc[lr_results['index'].str.contains('_f1')].iloc[2:].copy()
lr_f1['cats'] = cats
lr_f1.columns = ['name','F1','labels']

In [22]:
alpha = 0.7

# Per-label mean F1 of the LR run, re-indexed by the plain label names so it
# can be looked up by column name.
mean_f1 = lr_f1['F1']
mean_f1.index = y_train_raw.columns

# ------------------------------------------------------------
# Candidate chain orders to compare:
#   - hierarchy-aware orders, one per F1-vs-correlation weight below
#   - a pure correlation walk that ignores the hierarchy
#   - a parents-first walk of the hierarchy
# ------------------------------------------------------------
alphas = [0.0, 0.3, 0.7, 0.9]        # weight of F1 in the composite score; 0 = correlation only

# order key -> {'idx': column positions, 'name': label names}
orders_info = {}

for a in alphas:
    idx_ord, name_ord = build_hier_corr_f1_order(
        classif, y_train_raw, mean_f1, alpha=a
    )
    key = 'alpha_%.1f' % a
    orders_info[key] = dict(idx=idx_ord, name=name_ord)

# pure correlation greedy walk (ignores hierarchy)
idx_corr, name_corr = correlation_greedy_order(y_train_raw)
orders_info['pure_corr'] = dict(idx=idx_corr, name=name_corr)

# parents before children, as a contrast to the children-first orders
orders_info['parents_first'] = build_parent_first_order(classif, cats)


# Classifier chain Training

In [24]:
# function to evaluate a chain with a particular order

def evaluate_chain_with_order(idx_order):
    """
    Cross-validate a ClassifierChain whose label order is fixed to `idx_order`.

    The function works on notebook-level objects: `X_train_raw`/`y_train_raw`
    (unbiased training part), `X_bias`/`y_bias` (biased rows, training only),
    the `outer` splitter, the `vect` and `densify` pipeline steps, `classif`,
    and the helpers `propagate_hierarchy` and `fold_metrics`.

    Parameters
    ----------
    idx_order : list[int]
        A permutation of [0, 1, ..., L-1] where L = number of labels.
        Each value is the column-index (in y_train_raw.columns) that should
        be predicted at that position in the chain.

    Returns
    -------
    results : dict
        {
          'mean'    : pd.Series     (mean of every metric across the outer folds)
          'std'     : pd.Series     (sample std-dev, ddof=1)
          'ci_95'   : pd.DataFrame  (rows: 0.025 & 0.975 quantiles across folds)
          'fold_df' : pd.DataFrame  (one row per outer fold with all metrics)
        }

    Raises
    ------
    ValueError
        If `idx_order` is not a permutation of 0..L-1.
    """
    # the order must name every label column exactly once
    label_count = y_train_raw.shape[1]
    if sorted(idx_order) != list(range(label_count)):
        raise ValueError("idx_order must be a permutation of 0..L-1")

    # logistic regression with the parameters from the binary relevance approach
    lr_settings = dict(
        C=10,
        class_weight='balanced',
        max_iter=2000,
        solver='saga',
        penalty='elasticnet',
        l1_ratio=0.9,
    )
    pipe = Pipeline([
        ('vect',  vect),
        ('dense', densify),
        ('clf',   ClassifierChain(
            base_estimator=LogisticRegression(**lr_settings),
            order=idx_order,
            random_state=42,
        )),
    ])

    def _score_fold(fit_rows, val_rows):
        """Fit on one outer-fold training part (+ biased rows), score its held-out part."""
        # training = unbiased subset + biased rows
        X_fit = pd.concat([X_train_raw.iloc[fit_rows], X_bias], axis=0)
        y_fit = pd.concat([y_train_raw.iloc[fit_rows], y_bias], axis=0)

        # validation = held-out slice of the unbiased part only
        X_val = X_train_raw.iloc[val_rows]
        y_val = y_train_raw.iloc[val_rows]

        pipe.fit(X_fit, y_fit)

        # wrap the predictions so the hierarchy code can address columns by name
        predicted = pd.DataFrame(
            pipe.predict(X_val),
            columns=y_val.columns,
            index=X_val.index,
        )

        # enforce hierarchy / cross-links, then restore a clean integer frame
        # with exactly the label columns
        predicted = propagate_hierarchy(predicted, classif)
        predicted = predicted.fillna(0).astype(int)
        predicted = predicted.reindex(columns=y_val.columns, fill_value=0)

        return fold_metrics(y_val, predicted.values)

    folds = tqdm(
        outer.split(X_train_raw, y_train_raw),
        total=outer.get_n_splits(),
        desc="Outer CV (fixed chain order)",
    )
    df_scores = pd.DataFrame([_score_fold(fit_rows, val_rows)
                              for fit_rows, val_rows in folds])

    # aggregate over the folds
    summary = {}
    summary['mean'] = df_scores.mean()
    summary['std'] = df_scores.std(ddof=1)
    summary['ci_95'] = df_scores.quantile([0.025, 0.975])
    summary['fold_df'] = df_scores
    return summary

In [13]:
# Evaluate every candidate chain order with the outer cross-validation.
cv_results = {}
for key, info in orders_info.items():
    print(f'\nEvaluating order: {key}')
    res = evaluate_chain_with_order(info['idx'])
    cv_results[key] = res
    f1_mean = res['mean']['macro_f1']
    f1_std = res['std']['macro_f1']
    print(f"   macro-F1 mean ± std : {f1_mean:.4f} ± {f1_std:.4f}")


def _mean_macro_f1(order_key):
    """Mean macro-F1 over the outer folds for one evaluated order."""
    return cv_results[order_key]['mean']['macro_f1']


# The winner is the order with the highest mean macro-F1.
best_key = max(cv_results, key=_mean_macro_f1)
print('\n*** Best chain order (by mean macro-F1 across 5 folds) ***')
print('order key :', best_key)
print('macro-F1  :', cv_results[best_key]['mean']['macro_f1'])
print('name order:', orders_info[best_key]['name'])


▶ Evaluating order: alpha_0.0


Outer CV (fixed chain order): 100%|█████████████████████████████████████████████████████| 5/5 [18:37<00:00, 223.53s/it]


   macro-F1 mean ± std : 0.6490 ± 0.0213

▶ Evaluating order: alpha_0.3


Outer CV (fixed chain order): 100%|█████████████████████████████████████████████████████| 5/5 [17:29<00:00, 209.85s/it]


   macro-F1 mean ± std : 0.6486 ± 0.0213

▶ Evaluating order: alpha_0.7


Outer CV (fixed chain order): 100%|█████████████████████████████████████████████████████| 5/5 [14:58<00:00, 179.79s/it]


   macro-F1 mean ± std : 0.6525 ± 0.0193

▶ Evaluating order: alpha_0.9


Outer CV (fixed chain order): 100%|█████████████████████████████████████████████████████| 5/5 [09:44<00:00, 116.98s/it]


   macro-F1 mean ± std : 0.6542 ± 0.0172

▶ Evaluating order: pure_corr


Outer CV (fixed chain order): 100%|█████████████████████████████████████████████████████| 5/5 [13:48<00:00, 165.78s/it]


   macro-F1 mean ± std : 0.6307 ± 0.0161

▶ Evaluating order: parents_first


Outer CV (fixed chain order): 100%|█████████████████████████████████████████████████████| 5/5 [10:06<00:00, 121.35s/it]

   macro-F1 mean ± std : 0.6201 ± 0.0132

*** Best chain order (by mean macro-F1 across 5 folds) ***
order key : alpha_0.9
macro-F1  : 0.654244045126967
name order: ['quant_morph', 'interbr_morph', 'color_pattern', 'shape', 'qual_morph', 'morph', 'ecology', 'acoustic', 'behav', 'phen_data', 'imaging', 'storage', 'sampling', 'phen_proc', 'phen_nonphylo', 'phen_pylo', 'phen_analysis', 'phenotype', 'organellar', 'nuclear', 'gen_data', 'gen1', 'sequencing', 'gen_proc', 'character_based', 'distance_based', 'phylo_tree', 'phylo_sd', 'distance', 'gen_non_phylo', 'gen_analysis', 'genotype', 'phylogenetic', 'specimen_storage_loc', 'interbreeding', 'abbrev_terms', 'rank_just', 'biogeo']





# Run the chosen model on the test set

In [14]:
# ─── 1) Build the “final” base‐estimator using best_params ─────────────────
# Final base estimator: same settings as in the cross-validation, with a
# larger iteration budget for the single final fit.
base_lr_final = LogisticRegression(
            C=10,
            class_weight='balanced',
            # must be an int: the float literal 2e4 is rejected by
            # scikit-learn's parameter validation
            max_iter=20_000,
            solver='saga',
            penalty = 'elasticnet',
            l1_ratio = 0.9
        )

# Chain order: the 'alpha_0.9' entry, i.e. the order with the best mean
# macro-F1 in the cross-validation output recorded in this notebook.
final_chain = ClassifierChain(
    base_estimator=base_lr_final,
    order=orders_info['alpha_0.9']['idx'],
    random_state=42
)

final_pipe = Pipeline([
    ('vect', vect),
    ('dense', densify),
    ('clf',  final_chain)
])

In [15]:
# Final training set: the unbiased training part plus all biased rows.
X_full_train = pd.concat([X_train_raw, X_bias])
y_full_train = pd.concat([y_train_raw, y_bias])

final_pipe.fit(X_full_train, y_full_train)

# Predict the held-out test rows; the result has shape (n_test, L).
y_pred_test = final_pipe.predict(X_test)

# Wrap the predictions in a DataFrame so that propagate_hierarchy can address
# the labels by name, enforce the hierarchy, and then restore a clean integer
# frame holding exactly the label columns.
pred_df_test = propagate_hierarchy(
    pd.DataFrame(y_pred_test, columns=y_test.columns, index=X_test.index),
    classif,
)
pred_df_test = pred_df_test.fillna(0).astype(int)
pred_df_test = pred_df_test.reindex(columns=y_test.columns, fill_value=0)

# Metrics on the held-out test set (overall and per label).
final_metrics = fold_metrics(y_test, pred_df_test.values)



Final test‐set metrics:
  macro_f1: 0.6967
  micro_f1: 0.7910
  macro_prec: 0.7643
  micro_prec: 0.8591
  macro_rec: 0.6651
  micro_rec: 0.7329
  hamming: 0.0799
  subset_acc: 0.1683
  phenotype_prec: 0.9912
  phenotype_rec: 0.8396
  phenotype_f1: 0.9091
  phenotype_sup: 268.0000
  phen_data_prec: 0.9665
  phen_data_rec: 0.7457
  phen_data_f1: 0.8418
  phen_data_sup: 232.0000
  morph_prec: 0.9756
  morph_rec: 0.7583
  morph_f1: 0.8533
  morph_sup: 211.0000
  biogeo_prec: 0.6364
  biogeo_rec: 0.2917
  biogeo_f1: 0.4000
  biogeo_sup: 24.0000
  color_pattern_prec: 0.6923
  color_pattern_rec: 0.3913
  color_pattern_f1: 0.5000
  color_pattern_sup: 23.0000
  phen_proc_prec: 0.9286
  phen_proc_rec: 0.8535
  phen_proc_f1: 0.8895
  phen_proc_sup: 198.0000
  imaging_prec: 0.9008
  imaging_rec: 0.9291
  imaging_f1: 0.9147
  imaging_sup: 127.0000
  quant_morph_prec: 0.8403
  quant_morph_rec: 0.8264
  quant_morph_f1: 0.8333
  quant_morph_sup: 121.0000
  storage_prec: 0.8158
  storage_rec: 0.7686
  

In [None]:
# save results
# Persist the cross-validation statistics and the test-set predictions.
# The target folders are created first, so that a missing directory cannot
# make the write fail after the long cross-validation run.
for out_path, payload in (
    ("methods_paper_files/classifier_chain_tune_stats.pkl", cv_results),
    ("methods_paper_files/results/classifier_chain_results.pkl", pred_df_test),
):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "wb") as f:
        pickle.dump(payload, f)