In [31]:
import json

# we look at path "./eval_scores", in which there are json files with scores
import os
from pprint import pprint
import numpy as np
from qq import LanguageData
import math

ld = LanguageData.from_db()

In [32]:
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1", "sib": "eval_accuracy"}
tasks = f1.keys()
scores = {task: {} for task in tasks}
inf = math.inf


def best_scores(scores):
    best_scores = {}
    for lang, types in scores.items():
        highest = (-inf, "None")
        for type, value in types.items():
            if isinstance(value, float):
                if value > highest[0]:
                    highest = (value, type)
            else:
                for reconstructed, score in value.items():
                    if score > highest[0]:
                        highest = (score, reconstructed)

        # print(lang, highest)
        best_scores[lang] = highest
    pprint(best_scores)
    # we count how many time each type was the best
    best_types = {}
    for lang, (score, type) in best_scores.items():
        if type not in best_types.keys():
            best_types[type] = 0
        best_types[type] += 1
    pprint(best_types)


for file in os.listdir("../eval_scores/selected"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/selected", file), "r") as f:
                data = json.load(f)
                task_name = file.split(".")[0]

                scores[task_name] = data

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

# Comparison with other papers

## EMEA
EMEA check NER and POS on quite a few languages.

In [33]:
# we print the highest 3 key-value pairs in a combination
def get_highest(task, language):
    result = {"baseline_en": -inf, "Best": (-inf, None)}
    for type, value in scores[task][language].items():
        # value = value*100
        # we get the baseline of english
        if type == "baseline_en":
            result["baseline_en"] = value
        if "baseline" not in type:
            if value > result["Best"][0]:
                result["Best"] = (value, type)

    return result


task = "ner"
to_check = ["mr", "bn", "ta", "fo", "no", "da", "be", "uk", "bg"]

for lang in to_check:
    if lang in scores[task].keys():
        print(lang, get_highest(task, lang))
    else:
        print(f"{lang} not in scores")

mr {'baseline_en': 0.37207165824529165, 'Best': (0.5046728971962616, 'reconstructed_morphological_threshold')}
bn {'baseline_en': 0.3659942363112392, 'Best': (0.5985275010827199, 'reconstructed_syntactic_threshold')}
ta {'baseline_en': 0.33454252317613864, 'Best': (0.4292682926829268, 'reconstructed_featural_limit')}
fo {'baseline_en': -inf, 'Best': (0.587360594795539, 'reconstructed_morphological_limit')}
no {'baseline_en': 0.7269464204137571, 'Best': (0.7522368421052632, 'reconstructed_syntactic_threshold')}
da {'baseline_en': 0.784997910572503, 'Best': (0.7937480419117904, 'reconstructed_featural_threshold')}
be {'baseline_en': 0.5907769007062734, 'Best': (0.7273440564927423, 'reconstructed_featural_limit')}
uk {'baseline_en': 0.5676052810476224, 'Best': (0.6025231397608616, 'reconstructed_featural')}
bg {'baseline_en': 0.6946546253356114, 'Best': (0.7437472722999966, 'reconstructed_featural_base')}


In [34]:
import pandas as pd

task2lang = {
    "ner": ["mr", "bn", "ta", "fo", "no", "da", "be", "uk", "bg"],
    "pos": ["mr", "bho", "ta", "fo", "no", "da", "be", "uk", "bg"],
}


# Initialize a dictionary to store results
data = {"baseline_en": [], "Best": []}

# Populate the dictionary with values for each language
task = "pos"
for lang in task2lang[task]:
    if lang in scores[task].keys():
        result = get_highest(task, lang)
        data["baseline_en"].append(result["baseline_en"])
        data["Best"].append(result["Best"][0])  # Append only the score from the tuple
    else:
        data["baseline_en"].append(None)
        data["Best"].append(None)

# Create a DataFrame
df = pd.DataFrame.from_dict(data, orient="index", columns=to_check)
# we add a row "relative improvement" which is the difference between the best and the baseline
df.loc["relative improvement"] = 1 - df.loc["baseline_en"] / df.loc["Best"]
df.loc["absolute improvement"] = df.loc["Best"] - df.loc["baseline_en"]
# we multiply all by 100
df = df * 100

Scores for EMEA:
Method mr bn ta avg. fo no da avg. be uk bg avg. avg.
En 48.0 54.4 29.6 44.0 57.5 73.3 80.5 70.4 67.1 67.6 71.1 68.6 61.0
EMEA-s10 57.5 63.2 38.3 53.0 61.6 74.9 82.0 72.8 72.9 72.9 75.1 73.6 66.5

In [35]:
# we make a dataframe with EMEA scores
emea = {
    "baseline_en": [
        48.0,
        54.4,
        29.6,
        57.5,
        73.3,
        80.5,
        67.1,
        67.6,
        71.1,
    ],
    "EMEA-s10": [57.5, 63.2, 38.3, 61.6, 74.9, 82.0, 72.9, 72.9, 75.1],
}
emea_df = pd.DataFrame.from_dict(emea, orient="index", columns=to_check)
# we have to divide by 100
emea_df.loc["relative improvement"] = 1 - emea_df.loc["baseline_en"] / emea_df.loc["EMEA-s10"]
emea_df.loc["absolute improvement"] = emea_df.loc["EMEA-s10"] - emea_df.loc["baseline_en"]
emea_df

Unnamed: 0,mr,bn,ta,fo,no,da,be,uk,bg
baseline_en,48.0,54.4,29.6,57.5,73.3,80.5,67.1,67.6,71.1
EMEA-s10,57.5,63.2,38.3,61.6,74.9,82.0,72.9,72.9,75.1
relative improvement,0.165217,0.139241,0.227154,0.066558,0.021362,0.018293,0.079561,0.072702,0.053262
absolute improvement,9.5,8.8,8.7,4.1,1.6,1.5,5.8,5.3,4.0


In [36]:
# we rename baseline_en index in df to "our_baseline_en"
df.rename(index={"baseline_en": "our_baseline_en"}, inplace=True)
df.rename(index={"Best": "Approximation_method"}, inplace=True)

# we rename the baseline_en index in emea_df to "emea_baseline_en"
emea_df.rename(index={"baseline_en": "emea_baseline_en"}, inplace=True)
# we only take the first two columns
emea_df = emea_df.iloc[:2, :]
df = df.iloc[:2, :]
# we concatenate the two dataframes
merged_df = pd.concat([emea_df, df])
merged_df

Unnamed: 0,mr,bn,ta,fo,no,da,be,uk,bg
emea_baseline_en,48.0,54.4,29.6,57.5,73.3,80.5,67.1,67.6,71.1
EMEA-s10,57.5,63.2,38.3,61.6,74.9,82.0,72.9,72.9,75.1
our_baseline_en,42.489162,33.508854,39.149474,54.998776,63.696679,77.977372,66.259707,61.913028,63.136868
Approximation_method,43.336831,33.885334,40.074814,57.738947,64.388078,82.669088,67.447689,62.776158,63.356007


# No train but gain
ner:
ar bg de el es fr hi ru sw tr ur vi zh

In [37]:
# we look at the languages from no train but gain paper
to_test = ["ar", "bg", "de", "el", "es", "fr", "hi", "ru", "sw", "tr", "ur", "vi", "zh"]
# we get the scores for these languages
for lang in to_test:
    if lang in scores["ner"].keys():
        print(lang, get_highest("ner", lang))
    else:
        print(f"{lang} not in scores")

ar {'baseline_en': 0.2433960213066638, 'Best': (0.3701895128473433, 'reconstructed_morphological_limit')}
bg {'baseline_en': 0.6946546253356114, 'Best': (0.7437472722999966, 'reconstructed_featural_base')}
de {'baseline_en': 0.7022521008403362, 'Best': (0.7159136884693189, 'reconstructed_syntactic_threshold')}
el {'baseline_en': 0.6577599815192701, 'Best': (0.72478919455149, 'reconstructed_morphological_limit')}
es {'baseline_en': 0.7115317751593586, 'Best': (0.7245094267025779, 'no_train_gain')}
fr {'baseline_en': 0.7141884385191557, 'Best': (0.7355297017143272, 'reconstructed_syntactic_limit')}
hi {'baseline_en': 0.5677308024158757, 'Best': (0.6572411157814291, 'reconstructed_morphological_threshold')}
ru {'baseline_en': 0.5094573519414565, 'Best': (0.634243480258875, 'reconstructed_morphological_limit')}
sw {'baseline_en': 0.6110886280857952, 'Best': (0.6800986842105263, 'reconstructed_morphological_threshold')}
tr {'baseline_en': 0.5816221413364467, 'Best': (0.6042429686960127, 're

# QXUAD
F1 scores:
Model 	en 	ar 	de 	el 	es 	hi 	ru 	th 	tr 	vi 	zh 	ro 	avg
mBERT 	83.5 	61.5 	70.6 	62.6 	75.5 	59.2 	71.3 	42.7 	55.4 	69.5 	58.0 	72.7 	65.2
XLM-R Large 	86.5 	68.6 	80.4 	79.8 	82.0 	76.7 	80.1 	74.2 	75.9 	79.1 	59.3 	83.6 	77.2
Translate-train mBERT 	83.5 	68.0 	75.6 	70.0 	80.2 	69.6 	75.0 	36.9 	68.9 	75.6 	66.2 	- 	70.0
Translate-test BERT-L 	87.9 	73.7 	79.8 	79.4 	82.0 	74.9 	79.9 	64.6 	67.4 	76.3 	73.7 	- 	76.3

In [38]:
# we make a dataframe out of this
to_test = ["en", "ar", "de", "el", "es", "hi", "ru", "th", "tr", "vi", "zh", "ro"]
qx = {
    "mBERT": [83.5, 61.5, 70.6, 62.6, 75.5, 59.2, 71.3, 42.7, 55.4, 69.5, 58.0, 72.7],
    "XLM-R Large": [86.5, 68.6, 80.4, 79.8, 82.0, 76.7, 80.1, 74.2, 75.9, 79.1, 59.3, 83.6],
    # "Translate-train mBERT": [83.5, 68.0, 75.6, 70.0, 80.2, 69.6, 75.0, 36.9, 68.9, 75.6],
    # "Translate-test BERT-L": [87.9, 73.7, 79.8, 79.4, 82.0, 74.9, 79.9, 64.6, 67.4, 76.3, 73.7],
}
qx_df = pd.DataFrame.from_dict(qx, orient="index", columns=to_test)
qx_df

Unnamed: 0,en,ar,de,el,es,hi,ru,th,tr,vi,zh,ro
mBERT,83.5,61.5,70.6,62.6,75.5,59.2,71.3,42.7,55.4,69.5,58.0,72.7
XLM-R Large,86.5,68.6,80.4,79.8,82.0,76.7,80.1,74.2,75.9,79.1,59.3,83.6


In [39]:
# we add our scores to the dataframe
task = "qa"
for lang in to_test:
    if lang in scores["qa"].keys():
        result = get_highest("qa", lang)
        # qx_df.loc["XLM-R Base", lang] = round(scores[task][lang]["finetune"]*100, 1)
        qx_df.loc["MAD-X", lang] = round(scores[task][lang]["baseline_closest_featural"] * 100, 1)
        # qx_df.loc["Approximation_method", lang] = round(scores[task][lang]["reconstructed_featural"]*100, 1)
        qx_df.loc["No Train but Gain", lang] = round(scores[task][lang]["no_train_gain"] * 100, 1)
        qx_df.loc["TIPA", lang] = round(result["Best"][0] * 100, 1)

qx_df

Unnamed: 0,en,ar,de,el,es,hi,ru,th,tr,vi,zh,ro
mBERT,83.5,61.5,70.6,62.6,75.5,59.2,71.3,42.7,55.4,69.5,58.0,72.7
XLM-R Large,86.5,68.6,80.4,79.8,82.0,76.7,80.1,74.2,75.9,79.1,59.3,83.6
MAD-X,83.3,66.8,74.0,71.8,75.0,68.6,74.0,68.4,67.8,73.2,65.9,76.6
No Train but Gain,83.3,66.6,75.5,72.9,75.5,68.5,74.5,68.9,68.4,73.7,64.4,78.1
TIPA,83.6,67.9,76.1,73.1,75.9,69.2,75.0,69.2,69.0,73.8,66.7,78.9


# Table to be included in the paper! qa results
- Our method is better than finetuning mBERT, and very efficient, extendable to all languages.
- Here we take the best approximation method, as discussed in _distance_comparison

In [40]:
# we transform to Latex with the formatters etc.

# 1) compute global [min, median, max] over every numeric cell
all_vals = pd.to_numeric(qx_df.values.ravel(), errors="coerce")
mn, md, mx = np.nanmin(all_vals), float(np.nanmedian(all_vals)), np.nanmax(all_vals)

# 2) record each column’s maximum (for bolding)
col_max = qx_df.max(axis=0)

# 3) build a string‐typed DataFrame, applying global gradient + bold on column‐max
qx_str = qx_df.astype(object).copy()

for idx, row in qx_df.iterrows():
    for col in qx_df.columns:
        x = row[col]
        if pd.isna(x):
            cell = ""
        else:
            # global gradient
            grad = f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.1f}}}"
            # bold if it’s the max in its column
            cell = f"\\textbf{{{grad}}}" if x == col_max[col] else grad
        qx_str.at[idx, col] = cell

# 4) export to LaTeX (letting \tgrad and \textbf pass through)
latex = qx_str.to_latex(
    escape=False,
    multirow=True,  # keep multirow on the index if you like
)
print(latex)

\begin{tabular}{lllllllllllll}
\toprule
 & en & ar & de & el & es & hi & ru & th & tr & vi & zh & ro \\
\midrule
mBERT & \tgrad[42.700][73.150][86.500]{83.5} & \tgrad[42.700][73.150][86.500]{61.5} & \tgrad[42.700][73.150][86.500]{70.6} & \tgrad[42.700][73.150][86.500]{62.6} & \tgrad[42.700][73.150][86.500]{75.5} & \tgrad[42.700][73.150][86.500]{59.2} & \tgrad[42.700][73.150][86.500]{71.3} & \tgrad[42.700][73.150][86.500]{42.7} & \tgrad[42.700][73.150][86.500]{55.4} & \tgrad[42.700][73.150][86.500]{69.5} & \tgrad[42.700][73.150][86.500]{58.0} & \tgrad[42.700][73.150][86.500]{72.7} \\
XLM-R Large & \textbf{\tgrad[42.700][73.150][86.500]{86.5}} & \textbf{\tgrad[42.700][73.150][86.500]{68.6}} & \textbf{\tgrad[42.700][73.150][86.500]{80.4}} & \textbf{\tgrad[42.700][73.150][86.500]{79.8}} & \textbf{\tgrad[42.700][73.150][86.500]{82.0}} & \textbf{\tgrad[42.700][73.150][86.500]{76.7}} & \textbf{\tgrad[42.700][73.150][86.500]{80.1}} & \textbf{\tgrad[42.700][73.150][86.500]{74.2}} & \textbf{\tgr

## Exact match
Model 	en 	ar 	de 	el 	es 	hi 	ru 	th 	tr 	vi 	zh 	ro 	avg
mBERT 	72.2 	45.1 	54.0 	44.9 	56.9 	46.0 	53.3 	33.5 	40.1 	49.6 	48.3 	59.9 	50.3
XLM-R Large 	75.7 	49.0 	63.4 	61.7 	63.9 	59.7 	64.3 	62.8 	59.3 	59.0 	50.0 	69.7 	61.5
Translate-train mBERT 	72.2 	51.1 	60.7 	53.0 	63.1 	55.4 	59.7 	33.5 	54.8 	56.2 	56.6 	- 	56.0
Translate-test BERT-L 	77.1 	58.8 	66.7 	65.5 	68.4 	60.1 	66.7 	50.0 	49.6 	61.5 	59.1 	- 	62.1

In [41]:
xq_em = {
    "mBERT": [72.2, 45.1, 54.0, 44.9, 56.9, 46.0, 53.3, 33.5, 40.1, 49.6, 48.3, 59.9],
    "XLM-R Large": [75.7, 49.0, 63.4, 61.7, 63.9, 59.7, 64.3, 62.8, 59.3, 59.0, 50.0],
    "Translate-train mBERT": [72.2, 51.1, 60.7, 53.0, 63.1, 55.4, 59.7, 33.5, 54.8],
    "Translate-test BERT-L": [77.1, 58.8, 66.7, 65.5, 68.4, 60.1, 66.7, 50.0, 49.6, 61.5, 59.1],
}

xq_em_df = pd.DataFrame.from_dict(xq_em, orient="index", columns=to_test)
# we add our scores to the dataframe
# we add our scores to the dataframe
task = "qa"
for lang in to_test:
    if lang in scores["qa"].keys():
        result = get_highest("qa", lang)
        xq_em_df.loc["our_baseline_en", lang] = scores[task][lang]["baseline_en"]
        xq_em_df.loc["Approximation_method", lang] = scores[task][lang]["improved_reconstructed_featural_all"]
        xq_em_df.loc["Target language adapter", lang] = scores[task][lang]["baseline_closest_featural"]
    else:
        xq_em_df.loc["our_baseline_en", lang] = None
        xq_em_df.loc["Approximation_method", lang] = None
xq_em_df

KeyError: 'improved_reconstructed_featural_all'

# Kunz & Holstrom

In [None]:
# Data for XLM-R results on COPA
data = {
    "Target adapter": [55.2, 55.3, 53.1, 55.7, 54.1, 54.0, 51.2, 51.4, 53.8],
    "English adapter": [55.0, 54.9, 51.9, 53.6, 50.7, 49.7, 48.6, 51.2, 52.0],
    "None": [54.3, 55.1, 51.2, 53.4, 52.3, 52.0, 50.6, 49.6, 52.3],
    "Nonetr": [49.4, 52.8, 49.3, 49.8, 51.4, 49.7, 49.6, 50.2, 50.3],
}

index = ["zh", "vi", "tr", "id", "et", "sw", "ht", "qu", "Average"]

# Create DataFrame
df_xlmr = pd.DataFrame(data, index=index)
# we drop "None" and "Nonetr"
df_xlmr.drop(columns=["None", "Nonetr"], inplace=True)
# we add a column for our COPA scores, for each of the languages
task = "copa"
for lang in index:
    if lang in scores[task].keys():
        result = get_highest(task, lang)
        df_xlmr.loc[lang, "our_target"] = round(scores[task][lang]["baseline_closest_featural"], 3) * 100
        df_xlmr.loc[lang, "our_baseline_en"] = round(scores[task][lang]["baseline_en"], 3) * 100
        df_xlmr.loc[lang, "Approximation_method"] = round(result["Best"][0], 3) * 100

    else:
        df_xlmr.loc[lang, "our_baseline_en"] = None
        df_xlmr.loc[lang, "Approximation_method"] = None
# we add the "Average" row for our scores
df_xlmr.loc["Average", "our_baseline_en"] = round(df_xlmr["our_baseline_en"].mean(), 1)
df_xlmr.loc["Average", "Approximation_method"] = round(df_xlmr["Approximation_method"].mean(), 1)
df_xlmr.loc["Average", "our_target"] = round(df_xlmr["our_target"].mean(), 1)
# define new MultiIndex for the columns
df_xlmr.columns = pd.MultiIndex.from_tuples(
    [
        ("Kunz", "Target adapter"),
        ("Kunz", "English adapter"),
        ("Ours", "our_target"),
        ("Ours", "our_baseline_en"),
        ("Ours", "Approximation_method"),
    ],
    names=["Source", "Method"],
)
df_xlmr.rename(
    columns={"our_baseline_en": "English adapter", "our_target": "Target adapter", "Approximation_method": "TIPA"},
    level="Method",
    inplace=True,
)
# we rotate the dataframe
df_xlmr = df_xlmr.T
df_xlmr

In [None]:
# 0) (Optional) escape underscores in your XLM-R columns if present
#    — only needed if any metric name contains '_' and you want it literal in LaTeX
import numpy as np

df_for_latex = df_xlmr.copy()
""" We want row-wise stats instead!
# 1) Compute per-column stats on df_for_latex 
col_stats = {}
for col in df_for_latex.columns:
    vals = df_for_latex[col].dropna().astype(float)
    mn, md, mx = vals.min(), float(np.median(vals)), vals.max()
    col_stats[col] = (mn, md, mx)

# 2) Build your formatters dict using exactly the same MultiIndex column keys
formatters = {}
for col, (mn, md, mx) in col_stats.items():
    # default-argument trick to bind mn, md, mx at definition time
    fmt = lambda x, mn=mn, md=md, mx=mx: (
        f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.3f}}}"
        if not pd.isna(x) else ""
    )
    formatters[col] = fmt
# 3) Export to LaTeX
latex_table = df_for_latex.to_latex(
    escape=False,        # let \tgrad[...] pass through
    formatters=formatters,
    multirow=True
)
"""
# 1) Compute per-row (min, med, max) stats
row_stats = {
    idx: (row.min(skipna=True), float(row.median(skipna=True)), row.max(skipna=True))
    for idx, row in df_for_latex.astype(float).iterrows()
}

# 2) Build a new DataFrame of formatted strings
formatted = pd.DataFrame(index=df_for_latex.index, columns=df_for_latex.columns, dtype=object)

for idx in df_for_latex.index:
    mn, md, mx = row_stats[idx]
    for col in df_for_latex.columns:
        x = df_for_latex.at[idx, col]
        if pd.isna(x):
            formatted.at[idx, col] = ""
        else:
            formatted.at[idx, col] = f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.3f}}}"
# 3) Export the already-formatted table to LaTeX
latex_table = formatted.to_latex(
    escape=False,  # our macros must pass through
    multirow=True,  # if you still want multirow on the first index level
)


print(latex_table)

In [None]:
import numpy as np
import pandas as pd

df_for_latex = df_xlmr.copy()

# 1) Compute global stats over all numeric cells
all_vals = pd.to_numeric(df_for_latex.values.ravel(), errors="coerce")
mn, md, mx = np.nanmin(all_vals), np.nanmedian(all_vals), np.nanmax(all_vals)


# 2) Build one formatter that uses the global [mn, md, mx]
def global_fmt(x, mn=mn, md=md, mx=mx):
    return f"\\tgrad[{mn:.1f}][{md:.1f}][{mx:.1f}]{{{x:.1f}}}" if not pd.isna(x) else ""


# assign the same formatter to every column
formatters = {col: global_fmt for col in df_for_latex.columns}

# 3) Export to LaTeX
latex_table = df_for_latex.to_latex(
    escape=False,  # let \tgrad[...] pass through
    formatters=formatters,
    multirow=True,
)
print(latex_table)

In [None]:
df_for_latex = df_xlmr.copy()

# 1) Compute global stats over all numeric cells
all_vals = pd.to_numeric(df_for_latex.values.ravel(), errors="coerce")
mn, md, mx = np.nanmin(all_vals), np.nanmedian(all_vals), np.nanmax(all_vals)

# 2) Precompute the max score in each column
col_max = df_for_latex.max(axis=0)

# 3) Build a new “string” DataFrame, applying gradient + bold on the column‐max
df_str = df_for_latex.astype(object).copy()

for idx, row in df_for_latex.iterrows():
    for col in df_for_latex.columns:
        x = row[col]
        if pd.isna(x):
            cell = ""
        else:
            # always wrap in the global gradient
            grad = f"\\tgrad[{mn:.1f}][{md:.1f}][{mx:.1f}]{{{x:.1f}}}"
            # bold if it’s the max in its column
            if x == col_max[col]:
                cell = f"\\textbf{{{grad}}}"
            else:
                cell = grad
        df_str.at[idx, col] = cell

# 4) Export to LaTeX
latex_table = df_str.to_latex(
    escape=False,  # allow \tgrad and \textbf through
    multirow=True,
)
print(latex_table)

SIB
-> They make a table for XLM-R in function of family

In [57]:
# we load in the language family to the dataframe
import pickle

with open("../pickles/families.pkl", "rb") as f:
    families = pickle.load(f)
# SIB scores, extracted from the paper (p230 aka 5)
sib_og = {
    "Indo-European": 82.4,
    "Atlantic-Congo": 41.4,
    "Afro-Asiatic": 67.4,
    "Austronesian": 64.0,
    "Turkic": 80.2,
    "Sino-Tibetan": 57.9,
    "Nilotic": 34.8,
    "Dravidian": 87.8,
    "Tai-Kadai": 68.4,
    "Uralic": 89.1,
    "Austroasiatic": 67.5,
    "Mande": 32.5,
    "Japonic": 89.3,
    "Koreanic": 88.7,
    "Mongolic-Khitan": 86.1,
    "Constructed": 88.5,
    "Quechuan": 46.3,
    "Basque": 89.2,
    "Aymaran": 39.1,
    "Tupian": 61.3,
    "Kartvelian": 89.1,
}

# we check if each of these families is present in our dataframe
for family in sib_og.keys():
    if family not in families.values():
        print(f"\t!!!{family} not in dataframe")
        if family == "Basque":
            print("\tIn our system, Basque is 'unknown', so we can add it manually.")
        elif family == "Constructed":
            print("\In our system, Constructed is 'Artificial Language', so we can add it manually.")
    else:
        print(f"{family} is in dataframe")
# our later analysis show that Basque is Unknown in our calculation, but "basque" in SIB -> we fix this
families["eu"] = "Basque"
families["eo"] = "Constructed"  # Esperanto

Indo-European is in dataframe
Atlantic-Congo is in dataframe
Afro-Asiatic is in dataframe
Austronesian is in dataframe
Turkic is in dataframe
Sino-Tibetan is in dataframe
Nilotic is in dataframe
Dravidian is in dataframe
Tai-Kadai is in dataframe
Uralic is in dataframe
Austroasiatic is in dataframe
Mande is in dataframe
Japonic is in dataframe
Koreanic is in dataframe
Mongolic-Khitan is in dataframe
	!!!Constructed not in dataframe
\In our system, Constructed is 'Artificial Language', so we can add it manually.
Quechuan is in dataframe
	!!!Basque not in dataframe
	In our system, Basque is 'unknown', so we can add it manually.
Aymaran is in dataframe
Tupian is in dataframe
Kartvelian is in dataframe


In [58]:
# we make a dataframe with sib scores for all languages
sib = scores["sib"]
sib_df = pd.DataFrame.from_dict(sib, orient="index")
# we multiply all our values by 100
sib_df = sib_df * 100

# families is structured as {iso:family} so we can map it to the langs of the dataframe
sib_df["Family"] = sib_df.index.map(families)
# we filter the categories: we know, from previous analysis, that "reconstructed_syntactic" is the best
# we also keep the "closest featural", "no train but gain" and "finetune"
sib_df = sib_df[["reconstructed_syntactic_limit", "baseline_closest_featural", "no_train_gain", "finetune", "Family"]]
rename_dict = {
    "reconstructed_syntactic_limit": "TIPA",
    "baseline_closest_featural": "Closest Featural",
    "no_train_gain": "No Train but Gain",
    "finetune": "XLM-R base",
    "Family": "Language Family",
}
sib_df.rename(columns=rename_dict, inplace=True)
# we drop languages that have "NaN" values -> 20-ish from 200-ish
sib_df = sib_df.dropna(axis=0, how="any")
sib_df

Unnamed: 0,TIPA,Closest Featural,No Train but Gain,XLM-R base,Language Family
ace,42.647059,40.931373,42.156863,35.049020,Austronesian
acm,85.294118,74.019608,84.803922,83.823529,Afro-Asiatic
aeb,80.882353,53.921569,78.431373,80.392157,Afro-Asiatic
af,85.294118,83.823529,85.784314,87.254902,Indo-European
als,86.764706,82.352941,88.235294,87.745098,Indo-European
...,...,...,...,...,...
yo,25.980392,18.137255,21.078431,24.509804,Atlantic-Congo
yue,87.254902,86.764706,88.235294,87.254902,Sino-Tibetan
zh,88.235294,88.235294,87.990196,88.235294,Sino-Tibetan
zsm,90.686275,89.705882,88.725490,87.254902,Austronesian


In [59]:
# we make a dataframe "sib_averages" in which we average the scores for each family
sib_averages = sib_df.groupby("Language Family").mean().reset_index()
# we add the original SIB scores to the dataframe
sib_averages["XLM-R large"] = sib_averages["Language Family"].map(sib_og)
# we add a column "average" for each method, loaded in from the "sib_average" dict
sib_averages

Unnamed: 0,Language Family,TIPA,Closest Featural,No Train but Gain,XLM-R base,XLM-R large
0,Afro-Asiatic,60.625721,50.259516,59.155133,58.621684,67.4
1,Atlantic-Congo,32.978364,29.276538,31.135903,30.054091,41.4
2,Austroasiatic,65.522876,65.03268,61.111111,66.503268,67.5
3,Austronesian,62.312572,59.602076,61.317762,58.693772,64.0
4,Aymaran,36.764706,41.666667,36.27451,21.078431,39.1
5,Basque,83.333333,80.882353,82.843137,80.392157,89.2
6,Constructed,83.333333,56.862745,82.352941,84.803922,88.5
7,Dravidian,82.47549,71.20098,81.004902,81.25,87.8
8,Indo-European,77.62552,68.872549,75.902406,75.542187,82.4
9,Japonic,87.254902,88.235294,88.72549,87.745098,89.3


In [60]:
# we drop languages with NaN again
sib_averages = sib_averages.dropna(axis=0, how="any")
sib_averages = sib_averages.set_index("Language Family")

new_columns = [
    ("Adapter‐based methods", "TIPA"),
    ("Adapter‐based methods", "Closest Featural"),
    ("Adapter‐based methods", "No Train but Gain"),
    ("Fine‐tuning methods", "XLM‐R base"),
    ("Fine‐tuning methods", "XLM‐R large"),
]
sib_averages.columns = pd.MultiIndex.from_tuples(new_columns)

sib_averages

Unnamed: 0_level_0,Adapter‐based methods,Adapter‐based methods,Adapter‐based methods,Fine‐tuning methods,Fine‐tuning methods
Unnamed: 0_level_1,TIPA,Closest Featural,No Train but Gain,XLM‐R base,XLM‐R large
Language Family,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Afro-Asiatic,60.625721,50.259516,59.155133,58.621684,67.4
Atlantic-Congo,32.978364,29.276538,31.135903,30.054091,41.4
Austroasiatic,65.522876,65.03268,61.111111,66.503268,67.5
Austronesian,62.312572,59.602076,61.317762,58.693772,64.0
Aymaran,36.764706,41.666667,36.27451,21.078431,39.1
Basque,83.333333,80.882353,82.843137,80.392157,89.2
Constructed,83.333333,56.862745,82.352941,84.803922,88.5
Dravidian,82.47549,71.20098,81.004902,81.25,87.8
Indo-European,77.62552,68.872549,75.902406,75.542187,82.4
Japonic,87.254902,88.235294,88.72549,87.745098,89.3


In [61]:
# we export the dataframe to latex
latex_sib = sib_averages.to_latex(escape=False, multirow=True, float_format="%.1f")
print(latex_sib)

\begin{tabular}{lrrrrr}
\toprule
 & \multicolumn{3}{r}{Adapter‐based methods} & \multicolumn{2}{r}{Fine‐tuning methods} \\
 & TIPA & Closest Featural & No Train but Gain & XLM‐R base & XLM‐R large \\
Language Family &  &  &  &  &  \\
\midrule
Afro-Asiatic & 60.6 & 50.3 & 59.2 & 58.6 & 67.4 \\
Atlantic-Congo & 33.0 & 29.3 & 31.1 & 30.1 & 41.4 \\
Austroasiatic & 65.5 & 65.0 & 61.1 & 66.5 & 67.5 \\
Austronesian & 62.3 & 59.6 & 61.3 & 58.7 & 64.0 \\
Aymaran & 36.8 & 41.7 & 36.3 & 21.1 & 39.1 \\
Basque & 83.3 & 80.9 & 82.8 & 80.4 & 89.2 \\
Constructed & 83.3 & 56.9 & 82.4 & 84.8 & 88.5 \\
Dravidian & 82.5 & 71.2 & 81.0 & 81.2 & 87.8 \\
Indo-European & 77.6 & 68.9 & 75.9 & 75.5 & 82.4 \\
Japonic & 87.3 & 88.2 & 88.7 & 87.7 & 89.3 \\
Kartvelian & 83.8 & 77.9 & 82.8 & 77.0 & 89.1 \\
Koreanic & 86.3 & 64.7 & 85.3 & 83.8 & 88.7 \\
Mande & 26.7 & 22.5 & 26.2 & 27.0 & 32.5 \\
Mongolic-Khitan & 81.4 & 31.4 & 76.5 & 79.9 & 86.1 \\
Nilotic & 24.5 & 19.4 & 22.5 & 24.0 & 34.8 \\
Quechuan & 47.5 & 57.4 

In [62]:
# 1) compute global [min, median, max] over every numeric cell
all_vals = pd.to_numeric(sib_averages.values.ravel(), errors="coerce")
mn, md, mx = np.nanmin(all_vals), float(np.nanmedian(all_vals)), np.nanmax(all_vals)

# 2) build a string‐typed DataFrame, applying global gradient + bold on row‐max
sib_str = sib_averages.astype(object).copy()

for idx, row in sib_averages.iterrows():
    # compute the maximum value in this row (ignoring NaNs)
    row_max = row.max()
    for col in sib_averages.columns:
        x = row[col]
        if pd.isna(x):
            cell = ""
        else:
            # global gradient
            grad = f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.1f}}}"
            # bold if it’s the max in its row
            cell = f"\\textbf{{{grad}}}" if x == row_max else grad
        sib_str.at[idx, col] = cell

# 3) export to LaTeX (letting \tgrad and \textbf pass through)
latex = sib_str.to_latex(
    escape=False,
    multirow=True,  # keep multirow on the index if you like
)
print(latex)

\begin{tabular}{llllll}
\toprule
 & \multicolumn{3}{r}{Adapter‐based methods} & \multicolumn{2}{r}{Fine‐tuning methods} \\
 & TIPA & Closest Featural & No Train but Gain & XLM‐R base & XLM‐R large \\
Language Family &  &  &  &  &  \\
\midrule
Afro-Asiatic & \tgrad[19.363][65.033][89.300]{60.6} & \tgrad[19.363][65.033][89.300]{50.3} & \tgrad[19.363][65.033][89.300]{59.2} & \tgrad[19.363][65.033][89.300]{58.6} & \textbf{\tgrad[19.363][65.033][89.300]{67.4}} \\
Atlantic-Congo & \tgrad[19.363][65.033][89.300]{33.0} & \tgrad[19.363][65.033][89.300]{29.3} & \tgrad[19.363][65.033][89.300]{31.1} & \tgrad[19.363][65.033][89.300]{30.1} & \textbf{\tgrad[19.363][65.033][89.300]{41.4}} \\
Austroasiatic & \tgrad[19.363][65.033][89.300]{65.5} & \tgrad[19.363][65.033][89.300]{65.0} & \tgrad[19.363][65.033][89.300]{61.1} & \tgrad[19.363][65.033][89.300]{66.5} & \textbf{\tgrad[19.363][65.033][89.300]{67.5}} \\
Austronesian & \tgrad[19.363][65.033][89.300]{62.3} & \tgrad[19.363][65.033][89.300]{59.6} & \

Or, only bolding adapter-based methods

In [63]:
import numpy as np
import pandas as pd

# 1) compute global [min, median, max] over every numeric cell
all_vals = pd.to_numeric(sib_averages.values.ravel(), errors="coerce")
mn, md, mx = np.nanmin(all_vals), float(np.nanmedian(all_vals)), np.nanmax(all_vals)

# 2) build a string‐typed DataFrame, applying global gradient + bold on adapter‐based row‐max
sib_str = sib_averages.astype(object).copy()

for idx, row in sib_averages.iterrows():
    # compute the maximum over only the "Adapter-based methods" columns
    adapter_row = row["Adapter‐based methods"]
    finetune_row = row["Fine‐tuning methods"]
    row_max_adapter = adapter_row.max()
    row_max_finetune = finetune_row.max()
    for col in sib_averages.columns:
        x = row[col]
        if pd.isna(x):
            cell = ""
        else:
            # global gradient
            grad = f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.1f}}}"
            # bold if it’s under "Adapter-based methods" and equals that row’s adapter‐max
            if col[0] == "Adapter‐based methods" and x == row_max_adapter:
                cell = f"\\textbf{{{grad}}}"
            elif col[0] == "Fine‐tuning methods" and x == row_max_finetune:
                cell = f"\\textbf{{{grad}}}"
            else:
                cell = grad
        sib_str.at[idx, col] = cell

# 3) export to LaTeX (letting \tgrad and \textbf pass through)
latex = sib_str.to_latex(escape=False, multirow=True)
print(latex)

\begin{tabular}{llllll}
\toprule
 & \multicolumn{3}{r}{Adapter‐based methods} & \multicolumn{2}{r}{Fine‐tuning methods} \\
 & TIPA & Closest Featural & No Train but Gain & XLM‐R base & XLM‐R large \\
Language Family &  &  &  &  &  \\
\midrule
Afro-Asiatic & \textbf{\tgrad[19.363][65.033][89.300]{60.6}} & \tgrad[19.363][65.033][89.300]{50.3} & \tgrad[19.363][65.033][89.300]{59.2} & \tgrad[19.363][65.033][89.300]{58.6} & \textbf{\tgrad[19.363][65.033][89.300]{67.4}} \\
Atlantic-Congo & \textbf{\tgrad[19.363][65.033][89.300]{33.0}} & \tgrad[19.363][65.033][89.300]{29.3} & \tgrad[19.363][65.033][89.300]{31.1} & \tgrad[19.363][65.033][89.300]{30.1} & \textbf{\tgrad[19.363][65.033][89.300]{41.4}} \\
Austroasiatic & \textbf{\tgrad[19.363][65.033][89.300]{65.5}} & \tgrad[19.363][65.033][89.300]{65.0} & \tgrad[19.363][65.033][89.300]{61.1} & \tgrad[19.363][65.033][89.300]{66.5} & \textbf{\tgrad[19.363][65.033][89.300]{67.5}} \\
Austronesian & \textbf{\tgrad[19.363][65.033][89.300]{62.3}} & \tgr

In [77]:
existing_adapters = [
    "th",
    "my",
    "hi",
    "ilo",
    "ht",
    "tr",
    "mi",
    "vi",
    "is",
    "it",
    "ta",
    "jv",
    "ja",
    "sw",
    "qu",
    "de",
    "el",
    "et",
    "ru",
    "gn",
    "id",
    "en",
    "ar",
    "es",
    "tk",
    "zh",
    "mhr",
    "cdo",
    "xmf",
    "eu",
    "sr",
]
# we print the languages in the following families
families_of_interest = ["Aymaran", "Japonic", "Tupian", "Quechuan"]
for family in families_of_interest:
    print(f"\n{family}:")
    langs = sib_df[sib_df["Language Family"] == family].index.tolist()
    for lang in langs:
        print(f"\t{lang} -> {sib_df.loc[lang, 'TIPA']:.1f}")
        if lang in existing_adapters:
            print(f"\t\tAdapter already exists for {lang}")
        else:
            if lang == "quy":
                print("\t\tAdapter exists for the macro 'qu' which is Quechua!")
            print(f"\t\tNo adapter for {lang}")


Aymaran:
	ayr -> 36.8
		No adapter for ayr

Japonic:
	ja -> 87.3
		Adapter already exists for ja

Tupian:
	gn -> 62.3
		Adapter already exists for gn

Quechuan:
	quy -> 47.5
		Adapter exists for the macro 'qu' which is Quechua!
		No adapter for quy
