In [1]:
import json

# we look at path "./eval_scores", in which there are json files with scores
import os
from pprint import pprint
import numpy as np
from qq import LanguageData
import math

ld = LanguageData.from_db()

In [17]:
scores = {"ner": {}, "pos": {}, "copa": {}, "qa": {}}
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1"}
inf = math.inf


def best_scores(scores):
    best_scores = {}
    for lang, types in scores.items():
        highest = (-inf, "None")
        for type, value in types.items():
            if isinstance(value, float):
                if value > highest[0]:
                    highest = (value, type)
            else:
                for reconstructed, score in value.items():
                    if score > highest[0]:
                        highest = (score, reconstructed)

        # print(lang, highest)
        best_scores[lang] = highest
    pprint(best_scores)
    # we count how many time each type was the best
    best_types = {}
    for lang, (score, type) in best_scores.items():
        if type not in best_types.keys():
            best_types[type] = 0
        best_types[type] += 1
    pprint(best_types)


for file in os.listdir("../eval_scores/selected"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/selected", file), "r") as f:
                data = json.load(f)
                task_name = file.split(".")[0]

                scores[task_name] = data

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

# Comparison with other papers

## EMEA
EMEA check NER and POS on quite a few languages.

In [3]:
# we print the highest 3 key-value pairs in a combination
def get_highest(task, language):
    result = {"baseline_en": -inf, "Best": (-inf, None)}
    for type, value in scores[task][language].items():
        # value = value*100
        # we get the baseline of english
        if type == "baseline_en":
            result["baseline_en"] = value
        if "baseline" not in type:
            if value > result["Best"][0]:
                result["Best"] = (value, type)

    return result


task = "ner"
to_check = ["mr", "bn", "ta", "fo", "no", "da", "be", "uk", "bg"]

for lang in to_check:
    if lang in scores[task].keys():
        print(lang, get_highest(task, lang))
    else:
        print(f"{lang} not in scores")

mr {'baseline_en': 0.37207165824529165, 'Best': (0.5046728971962616, 'reconstructed_morphological_threshold')}
bn {'baseline_en': 0.3659942363112392, 'Best': (0.5985275010827199, 'reconstructed_syntactic_threshold')}
ta {'baseline_en': 0.33454252317613864, 'Best': (0.4292682926829268, 'reconstructed_featural_limit')}
fo {'baseline_en': -inf, 'Best': (0.587360594795539, 'reconstructed_morphological_limit')}
no {'baseline_en': 0.7269464204137571, 'Best': (0.7522368421052632, 'reconstructed_syntactic_threshold')}
da {'baseline_en': 0.784997910572503, 'Best': (0.7937480419117904, 'reconstructed_featural_threshold')}
be {'baseline_en': 0.5907769007062734, 'Best': (0.7273440564927423, 'reconstructed_featural_limit')}
uk {'baseline_en': 0.5676052810476224, 'Best': (0.6025231397608616, 'reconstructed_featural')}
bg {'baseline_en': 0.6946546253356114, 'Best': (0.7437472722999966, 'reconstructed_featural_base')}


In [4]:
import pandas as pd

task2lang = {
    "ner": ["mr", "bn", "ta", "fo", "no", "da", "be", "uk", "bg"],
    "pos": ["mr", "bho", "ta", "fo", "no", "da", "be", "uk", "bg"],
}


# Initialize a dictionary to store results
data = {"baseline_en": [], "Best": []}

# Populate the dictionary with values for each language
task = "pos"
for lang in task2lang[task]:
    if lang in scores[task].keys():
        result = get_highest(task, lang)
        data["baseline_en"].append(result["baseline_en"])
        data["Best"].append(result["Best"][0])  # Append only the score from the tuple
    else:
        data["baseline_en"].append(None)
        data["Best"].append(None)

# Create a DataFrame
df = pd.DataFrame.from_dict(data, orient="index", columns=to_check)
# we add a row "relative improvement" which is the difference between the best and the baseline
df.loc["relative improvement"] = 1 - df.loc["baseline_en"] / df.loc["Best"]
df.loc["absolute improvement"] = df.loc["Best"] - df.loc["baseline_en"]
# we multiply all by 100
df = df * 100

Scores for EMEA:
Method mr bn ta avg. fo no da avg. be uk bg avg. avg.
En 48.0 54.4 29.6 44.0 57.5 73.3 80.5 70.4 67.1 67.6 71.1 68.6 61.0
EMEA-s10 57.5 63.2 38.3 53.0 61.6 74.9 82.0 72.8 72.9 72.9 75.1 73.6 66.5

In [5]:
# we make a dataframe with EMEA scores
emea = {
    "baseline_en": [
        48.0,
        54.4,
        29.6,
        57.5,
        73.3,
        80.5,
        67.1,
        67.6,
        71.1,
    ],
    "EMEA-s10": [57.5, 63.2, 38.3, 61.6, 74.9, 82.0, 72.9, 72.9, 75.1],
}
emea_df = pd.DataFrame.from_dict(emea, orient="index", columns=to_check)
# we have to divide by 100
emea_df.loc["relative improvement"] = 1 - emea_df.loc["baseline_en"] / emea_df.loc["EMEA-s10"]
emea_df.loc["absolute improvement"] = emea_df.loc["EMEA-s10"] - emea_df.loc["baseline_en"]
emea_df

Unnamed: 0,mr,bn,ta,fo,no,da,be,uk,bg
baseline_en,48.0,54.4,29.6,57.5,73.3,80.5,67.1,67.6,71.1
EMEA-s10,57.5,63.2,38.3,61.6,74.9,82.0,72.9,72.9,75.1
relative improvement,0.165217,0.139241,0.227154,0.066558,0.021362,0.018293,0.079561,0.072702,0.053262
absolute improvement,9.5,8.8,8.7,4.1,1.6,1.5,5.8,5.3,4.0


In [6]:
# we rename baseline_en index in df to "our_baseline_en"
df.rename(index={"baseline_en": "our_baseline_en"}, inplace=True)
df.rename(index={"Best": "Approximation_method"}, inplace=True)

# we rename the baseline_en index in emea_df to "emea_baseline_en"
emea_df.rename(index={"baseline_en": "emea_baseline_en"}, inplace=True)
# we only take the first two columns
emea_df = emea_df.iloc[:2, :]
df = df.iloc[:2, :]
# we concatenate the two dataframes
merged_df = pd.concat([emea_df, df])
merged_df

Unnamed: 0,mr,bn,ta,fo,no,da,be,uk,bg
emea_baseline_en,48.0,54.4,29.6,57.5,73.3,80.5,67.1,67.6,71.1
EMEA-s10,57.5,63.2,38.3,61.6,74.9,82.0,72.9,72.9,75.1
our_baseline_en,42.489162,33.508854,39.149474,54.998776,63.696679,77.977372,66.259707,61.913028,63.136868
Approximation_method,43.336831,33.885334,40.074814,57.738947,64.388078,82.669088,67.447689,62.776158,63.356007


# No train but gain
ner:
ar bg de el es fr hi ru sw tr ur vi zh

In [7]:
# we look at the languages from no train but gain paper
to_test = ["ar", "bg", "de", "el", "es", "fr", "hi", "ru", "sw", "tr", "ur", "vi", "zh"]
# we get the scores for these languages
for lang in to_test:
    if lang in scores["ner"].keys():
        print(lang, get_highest("ner", lang))
    else:
        print(f"{lang} not in scores")

ar {'baseline_en': 0.2433960213066638, 'Best': (0.3701895128473433, 'reconstructed_morphological_limit')}
bg {'baseline_en': 0.6946546253356114, 'Best': (0.7437472722999966, 'reconstructed_featural_base')}
de {'baseline_en': 0.7022521008403362, 'Best': (0.7159136884693189, 'reconstructed_syntactic_threshold')}
el {'baseline_en': 0.6577599815192701, 'Best': (0.72478919455149, 'reconstructed_morphological_limit')}
es {'baseline_en': 0.7115317751593586, 'Best': (0.7245094267025779, 'no_train_gain')}
fr {'baseline_en': 0.7141884385191557, 'Best': (0.7355297017143272, 'reconstructed_syntactic_limit')}
hi {'baseline_en': 0.5677308024158757, 'Best': (0.6572411157814291, 'reconstructed_morphological_threshold')}
ru {'baseline_en': 0.5094573519414565, 'Best': (0.634243480258875, 'reconstructed_morphological_limit')}
sw {'baseline_en': 0.6110886280857952, 'Best': (0.6800986842105263, 'reconstructed_morphological_threshold')}
tr {'baseline_en': 0.5816221413364467, 'Best': (0.6042429686960127, 're

# QXUAD
F1 scores:
Model 	en 	ar 	de 	el 	es 	hi 	ru 	th 	tr 	vi 	zh 	ro 	avg
mBERT 	83.5 	61.5 	70.6 	62.6 	75.5 	59.2 	71.3 	42.7 	55.4 	69.5 	58.0 	72.7 	65.2
XLM-R Large 	86.5 	68.6 	80.4 	79.8 	82.0 	76.7 	80.1 	74.2 	75.9 	79.1 	59.3 	83.6 	77.2
Translate-train mBERT 	83.5 	68.0 	75.6 	70.0 	80.2 	69.6 	75.0 	36.9 	68.9 	75.6 	66.2 	- 	70.0
Translate-test BERT-L 	87.9 	73.7 	79.8 	79.4 	82.0 	74.9 	79.9 	64.6 	67.4 	76.3 	73.7 	- 	76.3

In [63]:
# we make a dataframe out of this
to_test = ["en", "ar", "de", "el", "es", "hi", "ru", "th", "tr", "vi", "zh", "ro"]
qx = {
    "mBERT": [83.5, 61.5, 70.6, 62.6, 75.5, 59.2, 71.3, 42.7, 55.4, 69.5, 58.0, 72.7],
    "XLM-R Large": [86.5, 68.6, 80.4, 79.8, 82.0, 76.7, 80.1, 74.2, 75.9, 79.1, 59.3, 83.6],
    # "Translate-train mBERT": [83.5, 68.0, 75.6, 70.0, 80.2, 69.6, 75.0, 36.9, 68.9, 75.6],
    # "Translate-test BERT-L": [87.9, 73.7, 79.8, 79.4, 82.0, 74.9, 79.9, 64.6, 67.4, 76.3, 73.7],
}
qx_df = pd.DataFrame.from_dict(qx, orient="index", columns=to_test)
qx_df

Unnamed: 0,en,ar,de,el,es,hi,ru,th,tr,vi,zh,ro
mBERT,83.5,61.5,70.6,62.6,75.5,59.2,71.3,42.7,55.4,69.5,58.0,72.7
XLM-R Large,86.5,68.6,80.4,79.8,82.0,76.7,80.1,74.2,75.9,79.1,59.3,83.6


In [64]:
# we add our scores to the dataframe
task = "qa"
for lang in to_test:
    if lang in scores["qa"].keys():
        result = get_highest("qa", lang)
        # qx_df.loc["XLM-R Base", lang] = round(scores[task][lang]["finetune"]*100, 1)
        qx_df.loc["MAD-X", lang] = round(scores[task][lang]["baseline_closest_featural"] * 100, 1)
        # qx_df.loc["Approximation_method", lang] = round(scores[task][lang]["reconstructed_featural"]*100, 1)
        qx_df.loc["Our Method", lang] = round(result["Best"][0] * 100, 1)
qx_df

Unnamed: 0,en,ar,de,el,es,hi,ru,th,tr,vi,zh,ro
mBERT,83.5,61.5,70.6,62.6,75.5,59.2,71.3,42.7,55.4,69.5,58.0,72.7
XLM-R Large,86.5,68.6,80.4,79.8,82.0,76.7,80.1,74.2,75.9,79.1,59.3,83.6
XLM-R Base,81.2,23.4,61.1,52.8,61.1,33.8,58.2,45.7,46.9,61.8,52.1,62.4
MAD-X,83.3,66.8,74.0,71.8,75.0,68.6,74.0,68.4,67.8,73.2,65.9,76.6
Our Method,83.6,67.9,76.1,73.1,75.9,69.2,75.0,69.2,69.0,73.8,66.7,78.9


# Table to be included in the paper! qa results
- Our method is better than finetuning mBERT, and very efficient, extendable to all languages.
- Here we take the best approximation method, as discussed in _distance_comparison

In [65]:
# we transform to Latex with the formatters etc.

# 1) compute min/median/max per language‐column
col_stats = {}
for col in qx_df.columns:
    vals = qx_df[col].dropna().astype(float)
    mn, md, mx = vals.min(), float(np.median(vals)), vals.max()
    col_stats[col] = (mn, md, mx)

# 2) build a formatter for each column
formatters = {}
for col, (mn, md, mx) in col_stats.items():
    # bind mn,md,mx into the lambda default args
    formatters[col] = (
        lambda mn, md, mx: lambda x: (f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.1f}}}" if not pd.isna(x) else "")
    )(mn, md, mx)

# 3) export with column‐wise formatting
latex = qx_df.to_latex(
    escape=False,  # so our \tgrad[...] macros are passed through
    formatters=formatters,
)
print(latex)

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
 & en & ar & de & el & es & hi & ru & th & tr & vi & zh & ro \\
\midrule
mBERT & \tgrad[81.200][83.500][86.500]{83.5} & \tgrad[23.400][66.800][68.600]{61.5} & \tgrad[61.100][74.000][80.400]{70.6} & \tgrad[52.800][71.800][79.800]{62.6} & \tgrad[61.100][75.500][82.000]{75.5} & \tgrad[33.800][68.600][76.700]{59.2} & \tgrad[58.200][74.000][80.100]{71.3} & \tgrad[42.700][68.400][74.200]{42.7} & \tgrad[46.900][67.800][75.900]{55.4} & \tgrad[61.800][73.200][79.100]{69.5} & \tgrad[52.100][59.300][66.700]{58.0} & \tgrad[62.400][76.600][83.600]{72.7} \\
XLM-R Large & \tgrad[81.200][83.500][86.500]{86.5} & \tgrad[23.400][66.800][68.600]{68.6} & \tgrad[61.100][74.000][80.400]{80.4} & \tgrad[52.800][71.800][79.800]{79.8} & \tgrad[61.100][75.500][82.000]{82.0} & \tgrad[33.800][68.600][76.700]{76.7} & \tgrad[58.200][74.000][80.100]{80.1} & \tgrad[42.700][68.400][74.200]{74.2} & \tgrad[46.900][67.800][75.900]{75.9} & \tgrad[61.800][73.200][79.100]{79.1} & \tgrad

## Exact match
Model 	en 	ar 	de 	el 	es 	hi 	ru 	th 	tr 	vi 	zh 	ro 	avg
mBERT 	72.2 	45.1 	54.0 	44.9 	56.9 	46.0 	53.3 	33.5 	40.1 	49.6 	48.3 	59.9 	50.3
XLM-R Large 	75.7 	49.0 	63.4 	61.7 	63.9 	59.7 	64.3 	62.8 	59.3 	59.0 	50.0 	69.7 	61.5
Translate-train mBERT 	72.2 	51.1 	60.7 	53.0 	63.1 	55.4 	59.7 	33.5 	54.8 	56.2 	56.6 	- 	56.0
Translate-test BERT-L 	77.1 	58.8 	66.7 	65.5 	68.4 	60.1 	66.7 	50.0 	49.6 	61.5 	59.1 	- 	62.1

In [11]:
xq_em = {
    "mBERT": [72.2, 45.1, 54.0, 44.9, 56.9, 46.0, 53.3, 33.5, 40.1, 49.6, 48.3, 59.9],
    "XLM-R Large": [75.7, 49.0, 63.4, 61.7, 63.9, 59.7, 64.3, 62.8, 59.3, 59.0, 50.0],
    "Translate-train mBERT": [72.2, 51.1, 60.7, 53.0, 63.1, 55.4, 59.7, 33.5, 54.8],
    "Translate-test BERT-L": [77.1, 58.8, 66.7, 65.5, 68.4, 60.1, 66.7, 50.0, 49.6, 61.5, 59.1],
}

xq_em_df = pd.DataFrame.from_dict(xq_em, orient="index", columns=to_test)
# we add our scores to the dataframe
# we add our scores to the dataframe
task = "qa"
for lang in to_test:
    if lang in scores["qa"].keys():
        result = get_highest("qa", lang)
        xq_em_df.loc["our_baseline_en", lang] = scores[task][lang]["baseline_en"]
        xq_em_df.loc["Approximation_method", lang] = scores[task][lang]["improved_reconstructed_featural_all"]
        xq_em_df.loc["Target language adapter", lang] = scores[task][lang]["baseline_closest_featural"]
    else:
        xq_em_df.loc["our_baseline_en", lang] = None
        xq_em_df.loc["Approximation_method", lang] = None
xq_em_df

Unnamed: 0,en,ar,de,el,es,hi,ru,th,tr,vi,zh,ro
mBERT,72.2,45.1,54.0,44.9,56.9,46.0,53.3,33.5,40.1,49.6,48.3,59.9
XLM-R Large,75.7,49.0,63.4,61.7,63.9,59.7,64.3,62.8,59.3,59.0,50.0,
Translate-train mBERT,72.2,51.1,60.7,53.0,63.1,55.4,59.7,33.5,54.8,,,
Translate-test BERT-L,77.1,58.8,66.7,65.5,68.4,60.1,66.7,50.0,49.6,61.5,59.1,
our_baseline_en,,,,,,,,,,,,
Approximation_method,,,,,,,,,,,,


# Kunz & Holstrom

In [55]:
# Data for XLM-R results on COPA
data = {
    "Target": [55.2, 55.3, 53.1, 55.7, 54.1, 54.0, 51.2, 51.4, 53.8],
    "English": [55.0, 54.9, 51.9, 53.6, 50.7, 49.7, 48.6, 51.2, 52.0],
    "None": [54.3, 55.1, 51.2, 53.4, 52.3, 52.0, 50.6, 49.6, 52.3],
    "Nonetr": [49.4, 52.8, 49.3, 49.8, 51.4, 49.7, 49.6, 50.2, 50.3],
}

index = ["zh", "vi", "tr", "id", "et", "sw", "ht", "qu", "Average"]

# Create DataFrame
df_xlmr = pd.DataFrame(data, index=index)
# we drop "None" and "Nonetr"
df_xlmr.drop(columns=["None", "Nonetr"], inplace=True)
# we add a column for our COPA scores, for each of the languages
task = "copa"
for lang in index:
    if lang in scores[task].keys():
        result = get_highest(task, lang)
        df_xlmr.loc[lang, "our_target"] = round(scores[task][lang]["baseline_closest_featural"], 3) * 100
        df_xlmr.loc[lang, "our_baseline_en"] = round(scores[task][lang]["baseline_en"], 3) * 100
        df_xlmr.loc[lang, "Approximation_method"] = round(result["Best"][0], 3) * 100

    else:
        df_xlmr.loc[lang, "our_baseline_en"] = None
        df_xlmr.loc[lang, "Approximation_method"] = None
# we add the "Average" row for our scores
df_xlmr.loc["Average", "our_baseline_en"] = round(df_xlmr["our_baseline_en"].mean(), 1)
df_xlmr.loc["Average", "Approximation_method"] = round(df_xlmr["Approximation_method"].mean(), 1)
df_xlmr.loc["Average", "our_target"] = round(df_xlmr["our_target"].mean(), 1)
# define new MultiIndex for the columns
df_xlmr.columns = pd.MultiIndex.from_tuples(
    [
        ("Kunz", "Target"),
        ("Kunz", "English"),
        ("Ours", "our_target"),
        ("Ours", "our_baseline_en"),
        ("Ours", "Approximation_method"),
    ],
    names=["Source", "Metric"],
)
df_xlmr.rename(
    columns={"our_baseline_en": "English", "our_target": "Target", "Approximation_method": "Approximation method"},
    level="Metric",
    inplace=True,
)
df_xlmr

Source,Kunz,Kunz,Ours,Ours,Ours
Metric,Target,English,Target,English,Approximation method
zh,55.2,55.0,59.2,58.8,62.0
vi,55.3,54.9,57.4,59.6,59.8
tr,53.1,51.9,54.6,53.6,59.0
id,55.7,53.6,60.8,57.4,59.6
et,54.1,50.7,55.6,52.4,58.4
sw,54.0,49.7,56.8,49.0,53.2
ht,51.2,48.6,52.4,42.0,48.6
qu,51.4,51.2,50.8,46.8,53.6
Average,53.8,52.0,56.0,52.4,56.8


In [56]:
# 0) (Optional) escape underscores in your XLM-R columns if present
#    — only needed if any metric name contains '_' and you want it literal in LaTeX
import numpy as np

df_for_latex = df_xlmr.copy()
""" We want row-wise stats instead!
# 1) Compute per-column stats on df_for_latex 
col_stats = {}
for col in df_for_latex.columns:
    vals = df_for_latex[col].dropna().astype(float)
    mn, md, mx = vals.min(), float(np.median(vals)), vals.max()
    col_stats[col] = (mn, md, mx)

# 2) Build your formatters dict using exactly the same MultiIndex column keys
formatters = {}
for col, (mn, md, mx) in col_stats.items():
    # default-argument trick to bind mn, md, mx at definition time
    fmt = lambda x, mn=mn, md=md, mx=mx: (
        f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.3f}}}"
        if not pd.isna(x) else ""
    )
    formatters[col] = fmt
# 3) Export to LaTeX
latex_table = df_for_latex.to_latex(
    escape=False,        # let \tgrad[...] pass through
    formatters=formatters,
    multirow=True
)
"""
# 1) Compute per-row (min, med, max) stats
row_stats = {
    idx: (row.min(skipna=True), float(row.median(skipna=True)), row.max(skipna=True))
    for idx, row in df_for_latex.astype(float).iterrows()
}

# 2) Build a new DataFrame of formatted strings
formatted = pd.DataFrame(index=df_for_latex.index, columns=df_for_latex.columns, dtype=object)

for idx in df_for_latex.index:
    mn, md, mx = row_stats[idx]
    for col in df_for_latex.columns:
        x = df_for_latex.at[idx, col]
        if pd.isna(x):
            formatted.at[idx, col] = ""
        else:
            formatted.at[idx, col] = f"\\tgrad[{mn:.3f}][{md:.3f}][{mx:.3f}]{{{x:.3f}}}"
# 3) Export the already-formatted table to LaTeX
latex_table = formatted.to_latex(
    escape=False,  # our macros must pass through
    multirow=True,  # if you still want multirow on the first index level
)


print(latex_table)

\begin{tabular}{llllll}
\toprule
Source & \multicolumn{2}{r}{Kunz} & \multicolumn{3}{r}{Ours} \\
Metric & Target & English & Target & English & Approximation method \\
\midrule
zh & \tgrad[55.000][58.800][62.000]{55.200} & \tgrad[55.000][58.800][62.000]{55.000} & \tgrad[55.000][58.800][62.000]{59.200} & \tgrad[55.000][58.800][62.000]{58.800} & \tgrad[55.000][58.800][62.000]{62.000} \\
vi & \tgrad[54.900][57.400][59.800]{55.300} & \tgrad[54.900][57.400][59.800]{54.900} & \tgrad[54.900][57.400][59.800]{57.400} & \tgrad[54.900][57.400][59.800]{59.600} & \tgrad[54.900][57.400][59.800]{59.800} \\
tr & \tgrad[51.900][53.600][59.000]{53.100} & \tgrad[51.900][53.600][59.000]{51.900} & \tgrad[51.900][53.600][59.000]{54.600} & \tgrad[51.900][53.600][59.000]{53.600} & \tgrad[51.900][53.600][59.000]{59.000} \\
id & \tgrad[53.600][57.400][60.800]{55.700} & \tgrad[53.600][57.400][60.800]{53.600} & \tgrad[53.600][57.400][60.800]{60.800} & \tgrad[53.600][57.400][60.800]{57.400} & \tgrad[53.600][57.400