In [2]:
import os
import json
import math
import numpy as np
import scipy.stats as stats

# Language codes for which a language adapter exists (used below to split the
# scores into languages with and without an adapter).
existing_adapters = [
    "th", "my", "hi", "ilo", "ht", "tr", "mi", "vi",
    "is", "it", "ta", "jv", "ja", "sw", "qu", "de",
    "el", "et", "ru", "gn", "id", "en", "ar", "es",
    "tk", "zh", "mhr", "cdo", "xmf", "eu", "sr",
]
# Per-task score tables: scores[task][language][column] -> score.
# The four known tasks are pre-seeded so they exist even if a file is missing.
scores = {"ner": {}, "pos": {}, "copa": {}, "qa": {}}
# Metric key per task (not used in the cells visible here; presumably the key
# used in the raw evaluation files - TODO confirm).
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1"}
inf = math.inf

# Directory holding one "<task>.json" file per task.
scores_dir = "../eval_scores/selected"

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and thus the dict insertion order for extra tasks) reproducible.
for file in sorted(os.listdir(scores_dir)):
    if not file.endswith(".json"):
        continue
    # The task name is everything before the first dot of the file name.
    task_name = file.split(".")[0]
    try:
        # Only the parsing step can fail with JSONDecodeError, so keep the
        # try body limited to it.
        with open(os.path.join(scores_dir, file), "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        print(f"Error decoding JSON for file: {file}")
        continue
    scores[task_name] = data

# Split every task's scores in two: languages listed in existing_adapters go
# to scores_subset, all remaining languages go to no_adapter.
scores_subset = {}
no_adapter = {}
for task_name, task_scores in scores.items():
    scores_subset[task_name] = {
        lang_name: lang_scores
        for lang_name, lang_scores in task_scores.items()
        if lang_name in existing_adapters
    }
    no_adapter[task_name] = {
        lang_name: lang_scores
        for lang_name, lang_scores in task_scores.items()
        if lang_name not in existing_adapters
    }
# Build scores_dist: per task, only the languages whose entry contains every
# column named in to_include, reduced to exactly those columns.
scores_dist = {}
to_include = [
    "reconstructed_featural",
    "reconstructed_morphological_distVar",
    "reconstructed_syntactic_distVar",
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
]
for task_name, task_scores in scores.items():
    complete_langs = {}
    for lang_name, lang_scores in task_scores.items():
        # A language missing even one required column is left out entirely.
        if any(col not in lang_scores for col in to_include):
            continue
        complete_langs[lang_name] = {col: lang_scores[col] for col in to_include}
    scores_dist[task_name] = complete_langs

# Report how many POS languages survive the filter versus the original count.
print("len(scores_dist)", len(scores_dist["pos"]))
print("og len(scores)", len(scores["pos"]))

ModuleNotFoundError: No module named 'numpy'

In [20]:
def get_significance(task_name, option1, option2, alternative="two-sided", data=None):
    """Run a paired t-test between two score columns across languages.

    For every language of the selected task(s) that has a value for both
    ``option1`` and ``option2``, the two scores form one pair. The mean of
    each column is printed, followed by the t-statistic and p-value of
    ``scipy.stats.ttest_rel`` on the paired lists.

    Args:
        task_name: A single task name, the string ``"all"`` (every task in
            ``data``), or an iterable of task names.
        option1: Column name of the first sample.
        option2: Column name of the second sample.
        alternative: Passed through to ``scipy.stats.ttest_rel``
            (``"two-sided"``, ``"less"`` or ``"greater"``; the direction
            refers to ``option1`` relative to ``option2``).
        data: Mapping ``task -> language -> column -> score``. When omitted,
            the module-level ``scores`` is looked up at call time.

    Returns:
        Tuple ``(t_stat, p_val)`` as computed by ``scipy.stats.ttest_rel``.

    Raises:
        ValueError: If fewer than two languages provide both columns, in
            which case the paired t-test is undefined.
    """
    if data is None:
        # Resolve the default lazily: a default of `scores` in the signature
        # would keep pointing at the old dict after the loading cell is re-run.
        data = scores

    if isinstance(task_name, str):
        task_names = list(data) if task_name == "all" else [task_name]
    else:
        task_names = list(task_name)

    all_scores1 = []
    all_scores2 = []
    for name in task_names:
        for lang_scores in data[name].values():
            # Only languages that report both columns contribute a pair.
            if option1 in lang_scores and option2 in lang_scores:
                all_scores1.append(lang_scores[option1])
                all_scores2.append(lang_scores[option2])

    if len(all_scores1) < 2:
        # Without this guard the test yields NaN, which callers comparing
        # `p_val < 0.05` would silently report as "not significant".
        raise ValueError(
            f"Need at least two languages with both '{option1}' and '{option2}' "
            f"for tasks {task_names}, found {len(all_scores1)}"
        )

    print("average scores")
    print(f"{option1}: {np.mean(all_scores1)}")
    print(f"{option2}: {np.mean(all_scores2)}")
    t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)
    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    return t_stat, p_val

## POS comparison

In [21]:
# POS: test each distance-based reconstruction against each baseline with a
# one-sided paired t-test (alternative="greater") on the scores_dist subset.
task = "pos"
test_columns = [
    "reconstructed_featural",
    "reconstructed_morphological_distVar",
    "reconstructed_syntactic_distVar",
]
baselines = [
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.4485338790223098
baseline_en: 0.4443287769237761
t-statistic: 1.471682968258695
p-value: 0.07312353162482992
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.4495814021381374
baseline_en: 0.4443287769237761
t-statistic: 1.745247538164272
p-value: 0.042988729413020815
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.44801410251010987
baseline_en: 0.4443287769237761
t-statistic: 1.3395865924575125
p-value: 0.09267468908885493
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores

In [22]:
# POS: check whether the morphological distance variant scores higher than
# the featural and syntactic variants (one-sided paired t-test).
task = "pos"
test_columns = ["reconstructed_morphological_distVar"]
baselines = [
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

test_column: reconstructed_morphological_distVar, baseline reconstructed_featural
average scores
reconstructed_morphological_distVar: 0.4495814021381374
reconstructed_featural: 0.4485338790223098
t-statistic: 1.2566139823269367
p-value: 0.10684215706420998
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline reconstructed_syntactic_distVar
average scores
reconstructed_morphological_distVar: 0.4495814021381374
reconstructed_syntactic_distVar: 0.44801410251010987
t-statistic: 1.7033909635405693
p-value: 0.04679257582016358
The difference is statistically significant
-----------------------------------


## Results POS distance comparison
As different languages lack some distance types, the evaluation is slightly different.
We retain only the languages which have full coverage of all distance type evaluations.
Our inspection of the dropped languages shows that they are mostly low-resource languages, which is not surprising but has some implications:
Our method has been shown in section [SECTION ANALYSIS RESOURCEDNESS] to have a higher impact on lower-resourced languages.

-> We see that morphological distance is the most beneficial for POS: it is the only one that outperforms all baselines (significantly so for all but no_train_gain). It is followed by featural distance and syntactic distance, which are slightly less performant. These two still outperform all baselines significantly except for the English source adapter setting, and they slightly underperform compared to no_train_gain.

Morphological distance significantly outperforms syntactic distance on POS (p ≈ 0.047); its advantage over featural distance is not significant (p ≈ 0.107).

# NER comparison

In [23]:
# NER: test each distance-based reconstruction against each baseline with a
# one-sided paired t-test (alternative="greater") on the scores_dist subset.
task = "ner"
test_columns = [
    "reconstructed_featural",
    "reconstructed_morphological_distVar",
    "reconstructed_syntactic_distVar",
]
baselines = [
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.49947934356251256
baseline_en: 0.4647875590055643
t-statistic: 5.319830048696973
p-value: 4.2321140086576374e-07
The difference is statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.5001237474726075
baseline_en: 0.4647875590055643
t-statistic: 5.329260470386214
p-value: 4.0712127039246156e-07
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.49944428243953554
baseline_en: 0.4647875590055643
t-statistic: 5.299914249860668
p-value: 4.5926651828571e-07
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores
re

In [24]:
# NER: check whether the morphological distance variant scores higher than
# the featural and syntactic variants (one-sided paired t-test).
task = "ner"
test_columns = ["reconstructed_morphological_distVar"]
baselines = [
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

test_column: reconstructed_morphological_distVar, baseline reconstructed_featural
average scores
reconstructed_morphological_distVar: 0.5001237474726075
reconstructed_featural: 0.49947934356251256
t-statistic: 1.3028120039229474
p-value: 0.09809962112247249
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline reconstructed_syntactic_distVar
average scores
reconstructed_morphological_distVar: 0.5001237474726075
reconstructed_syntactic_distVar: 0.49944428243953554
t-statistic: 1.3288125640435824
p-value: 0.09375411005382189
The difference is not statistically significant
-----------------------------------


## Results NER distance comparison
Again, morphological distance seems to be the most informative: it is the only one to significantly outperform all baselines. The other two methods are almost identical in score; they outperform all baselines, significantly so except against no_train_gain (which they still outperform).

# QA comparison

In [25]:
# QA: test each distance-based reconstruction against each baseline with a
# one-sided paired t-test (alternative="greater") on the scores_dist subset.
task = "qa"
test_columns = [
    "reconstructed_featural",
    "reconstructed_morphological_distVar",
    "reconstructed_syntactic_distVar",
]
baselines = [
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.7271252762822759
baseline_en: 0.7210740878836793
t-statistic: 2.4633369914457828
p-value: 0.015745852365037824
The difference is statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.7272652915477485
baseline_en: 0.7210740878836793
t-statistic: 2.609570550253358
p-value: 0.012138048056661924
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.7270498698545946
baseline_en: 0.7210740878836793
t-statistic: 2.4029050957563642
p-value: 0.017525950911712523
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores
recon

## Results QA distance comparison
All distance types perform very similarly. All of them outperform all baselines, significantly so in every case except no_train_gain (where the difference is smaller: less than 0.25% relative improvement, compared to 36% relative to the finetune baseline).

In [27]:
# COPA: test each distance-based reconstruction against each baseline with a
# one-sided paired t-test (alternative="greater") on the scores_dist subset.
task = "copa"
test_columns = [
    "reconstructed_featural",
    "reconstructed_morphological_distVar",
    "reconstructed_syntactic_distVar",
]
baselines = [
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.5498181818181818
baseline_en: 0.532909090909091
t-statistic: 3.0259307025372006
p-value: 0.006382762458236786
The difference is statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.5496363636363636
baseline_en: 0.532909090909091
t-statistic: 3.4130301957327007
p-value: 0.0033122445108849075
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.5496363636363637
baseline_en: 0.532909090909091
t-statistic: 2.878656867580582
p-value: 0.008211639734344816
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores
reconst

## Results COPA distance comparison
All distance types outperform baseline_en. Featural is best, but the difference is small.
--> Does not have to be analysed as this is done in main experiment
# Limit analysis

In [46]:
# Next question: do the limit and threshold variants improve on the plain
# distance-based reconstructions? These are the columns needed to answer it.
scores_limit = [
    "reconstructed_featural_limit",
    "reconstructed_syntactic_limit",
    "reconstructed_morphological_limit",
    "reconstructed_featural_threshold",
    "reconstructed_syntactic_threshold",
    "reconstructed_morphological_threshold",
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
    "reconstructed_morphological_distVar",
]
# Per task, keep only the languages whose entry contains every column in
# scores_limit, reduced to exactly those columns.
scores_limit_subset = {}
for task_name, task_scores in scores.items():
    scores_limit_subset[task_name] = {
        lang_name: {col: lang_scores[col] for col in scores_limit}
        for lang_name, lang_scores in task_scores.items()
        if all(col in lang_scores for col in scores_limit)
    }

In [51]:
# NER: test the threshold and limit variant of each distance type against its
# unrestricted counterpart (one-sided paired t-test on scores_limit_subset).
task = "ner"
baselines = [
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
    "reconstructed_morphological_distVar",
]
methods = ["threshold", "limit"]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for method in methods:
        # The featural column has no "distVar" part, so the method name is
        # appended; for the other columns "distVar" is swapped for the method.
        test_column = (
            f"{baseline}_{method}"
            if baseline == "reconstructed_featural"
            else baseline.replace("distVar", method)
        )

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.5052744017684326
reconstructed_featural: 0.49947934356251256
t-statistic: 3.866540224747911
p-value: 0.00010845627027128958
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.5140822727352715
reconstructed_featural: 0.49947934356251256
t-statistic: 5.099558384560621
p-value: 1.0373958885215319e-06
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.5062823157986861
reconstructed_syntactic_distVar: 0.49944428243953554
t-statistic: 3.704583025826268
p-value: 0.00018907176248542088
The difference is statistically significant
----------------------

## Results NER distance comparison with limits
Both the limit and the threshold variants significantly outperform their respective counterparts (no limitation).
Best performing = limit, especially featural (0.514 vs 0.499 for overall NER).
==> This shows that our method has more potential and can be further refined by searching for better limit and threshold values.

In [52]:
# POS: test the threshold and limit variant of each distance type against its
# unrestricted counterpart (one-sided paired t-test on scores_limit_subset).
task = "pos"
baselines = [
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
    "reconstructed_morphological_distVar",
]
methods = ["threshold", "limit"]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for method in methods:
        # The featural column has no "distVar" part, so the method name is
        # appended; for the other columns "distVar" is swapped for the method.
        test_column = (
            f"{baseline}_{method}"
            if baseline == "reconstructed_featural"
            else baseline.replace("distVar", method)
        )

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.4483538634919548
reconstructed_featural: 0.4485338790223098
t-statistic: -0.12885942222239558
p-value: 0.5510535925349446
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.4484558278153162
reconstructed_featural: 0.4485338790223098
t-statistic: -0.05020410696226338
p-value: 0.5199381152963118
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.44867461361811517
reconstructed_syntactic_distVar: 0.44801410251010987
t-statistic: 0.44166442528108035
p-value: 0.33014690506025063
The difference is not statistically significant
--------------

## Results POS distance comparison with limits
For POS, limiting or putting a threshold has no demonstrable positive effect on the results

In [53]:
# QA: test the threshold and limit variant of each distance type against its
# unrestricted counterpart (one-sided paired t-test on scores_limit_subset).
task = "qa"
baselines = [
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
    "reconstructed_morphological_distVar",
]
methods = ["threshold", "limit"]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for method in methods:
        # The featural column has no "distVar" part, so the method name is
        # appended; for the other columns "distVar" is swapped for the method.
        test_column = (
            f"{baseline}_{method}"
            if baseline == "reconstructed_featural"
            else baseline.replace("distVar", method)
        )

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.7266866467655285
reconstructed_featural: 0.7271252762822759
t-statistic: -1.2504480934463686
p-value: 0.8814591421280972
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.7281344543568334
reconstructed_featural: 0.7271252762822759
t-statistic: 1.0506349044006917
p-value: 0.15798021009887603
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.7271196088629276
reconstructed_syntactic_distVar: 0.7270498698545946
t-statistic: 0.19561002530779972
p-value: 0.42424096051585775
The difference is not statistically significant
------------------

## Results QA distance comparison with limits
For QA, using a threshold that filters out more distant adapters yields a significant improvement only for the morphological distance, which results in the highest overall score for the method.


In [54]:
# COPA: test the threshold and limit variant of each distance type against its
# unrestricted counterpart (one-sided paired t-test on scores_limit_subset).
task = "copa"
baselines = [
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
    "reconstructed_morphological_distVar",
]
methods = ["threshold", "limit"]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for method in methods:
        # The featural column has no "distVar" part, so the method name is
        # appended; for the other columns "distVar" is swapped for the method.
        test_column = (
            f"{baseline}_{method}"
            if baseline == "reconstructed_featural"
            else baseline.replace("distVar", method)
        )

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.5509090909090909
reconstructed_featural: 0.5498181818181818
t-statistic: 0.7003307898835022
p-value: 0.2498448603526056
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.5590909090909091
reconstructed_featural: 0.5498181818181818
t-statistic: 2.0953793092650645
p-value: 0.031280577092468564
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.5529090909090908
reconstructed_syntactic_distVar: 0.5496363636363637
t-statistic: 1.4687150284812953
p-value: 0.08632558476901153
The difference is not statistically significant
-----------------------

In [55]:
# COPA: test the featural reconstruction with a limit against each baseline.
# Unlike the cells above, this runs on the full `scores` table, so every
# language that has both columns of a pair takes part in that comparison.
task = "copa"
test_columns = ["reconstructed_featural_limit"]
baselines = [
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
]

# Labels of the (test_column, baseline) pairs, split by the 0.05 threshold.
sign, not_sign = [], []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores)
        pair_label = f"{test_column}_{baseline}"
        if p_val < 0.05:
            verdict, bucket = "The difference is statistically significant", sign
        else:
            verdict, bucket = "The difference is not statistically significant", not_sign
        print(verdict)
        bucket.append(pair_label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_limit, baseline baseline_en
average scores
reconstructed_featural_limit: 0.5590909090909091
baseline_en: 0.532909090909091
t-statistic: 3.3791274321403106
p-value: 0.003506447137809433
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline baseline_avg_adapter
average scores
reconstructed_featural_limit: 0.5590909090909091
baseline_avg_adapter: 0.5534545454545454
t-statistic: 0.7957617039728352
p-value: 0.22232468284532297
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline baseline_closest_featural
average scores
reconstructed_featural_limit: 0.5590909090909091
baseline_closest_featural: 0.5645454545454546
t-statistic: -0.5600876294531378
p-value: 0.7061269492191149
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, 

## Results COPA distance comparison with limits
Reconstructed featural with a limit is the highest performing method!
It outperforms all other methods significantly. It results in scores that outperform most of the baselines that could not be matched with the regular method.
However, the highest score remains the MAD-X baseline, which still slightly outperforms our method: as our method approximates adapters and is mainly targeted at creating adapters for languages that do not have existing language adapters, the COPA task is not the best representation of the method's performance. This is because all languages in this task setup have an existing adapter, which makes the method less relevant.