In [1]:
import os
import json
import math
import numpy as np
import scipy.stats as stats

# From huggingface api, with our trained adapters (eu, sr)
existing_adapters = [
    "th",
    "my",
    "hi",
    "ilo",
    "ht",
    "tr",
    "mi",
    "vi",
    "is",
    "it",
    "ta",
    "jv",
    "ja",
    "sw",
    "qu",
    "de",
    "el",
    "et",
    "ru",
    "gn",
    "id",
    "en",
    "ar",
    "es",
    "tk",
    "zh",
    "mhr",
    "cdo",
    "xmf",
    "eu",
    "sr",
]
# scraped from cc-100 website
xlm_included_langs = [
    "af",
    "am",
    "ar",
    "as",
    "az",
    "be",
    "bg",
    "bn",
    "br",
    "bs",
    "ca",
    "cs",
    "cy",
    "da",
    "de",
    "el",
    "en",
    "eo",
    "es",
    "et",
    "eu",
    "fa",
    "ff",
    "fi",
    "fr",
    "fy",
    "ga",
    "gd",
    "gl",
    "gn",
    "gu",
    "ha",
    "he",
    "hi",
    "hr",
    "ht",
    "hu",
    "hy",
    "id",
    "ig",
    "is",
    "it",
    "ja",
    "jv",
    "ka",
    "kk",
    "km",
    "kn",
    "ko",
    "ku",
    "ky",
    "la",
    "lg",
    "li",
    "ln",
    "lo",
    "lt",
    "lv",
    "mg",
    "mk",
    "ml",
    "mn",
    "mr",
    "ms",
    "my",
    "ne",
    "nl",
    "no",
    "ns",
    "om",
    "or",
    "pa",
    "pl",
    "ps",
    "pt",
    "qu",
    "rm",
    "ro",
    "ru",
    "sa",
    "si",
    "sc",
    "sd",
    "sk",
    "sl",
    "so",
    "sq",
    "sr",
    "ss",
    "su",
    "sv",
    "sw",
    "ta",
    "te",
    "th",
    "tl",
    "tn",
    "tr",
    "ug",
    "uk",
    "ur",
    "uz",
    "vi",
    "wo",
    "xh",
    "yi",
    "yo",
    "zu",
    "zh",
]
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1", "sib": "eval_accuracy"}
tasks = f1.keys()
scores = {task: {} for task in tasks}

inf = math.inf
for file in os.listdir("../eval_scores/selected"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/selected", file), "r") as f:
                data = json.load(f)
                task_name = file.split(".")[0]

                scores[task_name] = data

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

# we make a subset consisting of only the languages for which an adapter exists
scores_subset = {}
no_adapter = {}
for task_name in scores:
    scores_subset[task_name] = {}
    no_adapter[task_name] = {}
    for lang_name in scores[task_name]:
        if lang_name in existing_adapters:
            scores_subset[task_name][lang_name] = scores[task_name][lang_name]
        else:
            no_adapter[task_name][lang_name] = scores[task_name][lang_name]
# We prepare a subset of the scores where all distance types are included
scores_dist = {}
to_include = [
    "reconstructed_featural",
    "reconstructed_morphological_distVar",
    "reconstructed_syntactic_distVar",
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
]
# for each key, if not ALL of the to_include keys are in the list, we skip the language
langs = set()
for task_name in scores:
    scores_dist[task_name] = {}
    for lang_name in scores[task_name]:
        if all(col in scores[task_name][lang_name] for col in to_include):
            langs.add(lang_name)
            scores_dist[task_name][lang_name] = {col: scores[task_name][lang_name][col] for col in to_include}
        else:
            # print(f"skipping {lang_name} for {task_name}")
            continue
print("len(scores_dist)", len(scores_dist["sib"]))
print("og len(scores)", len(scores["sib"]))

len(scores_dist) 176
og len(scores) 195


In [2]:
def get_significance(task_name, option1, option2, alternative="two-sided", data=scores):
    all_scores1 = []
    all_scores2 = []
    if task_name == "all":
        task_names = data.keys()
    elif type(task_name) is str:
        task_names = [task_name]
    else:
        task_names = task_name
    for task_name in task_names:
        for lang_name in data[task_name]:
            if option1 in data[task_name][lang_name] and option2 in data[task_name][lang_name]:
                score1 = data[task_name][lang_name][option1]
                score2 = data[task_name][lang_name][option2]
                all_scores1.append(score1)
                all_scores2.append(score2)
    print("average scores")
    print(f"{option1}: {np.mean(all_scores1)}")
    print(f"{option2}: {np.mean(all_scores2)}")
    t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)
    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    return t_stat, p_val

In [9]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "pos"
test_columns = ["reconstructed_featural", "reconstructed_morphological_distVar", "reconstructed_syntactic_distVar"]
baselines = ["baseline_en", "baseline_closest_featural", "no_train_gain"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.5676548287326185
baseline_en: 0.5350230613232024
t-statistic: 11.52057296931363
p-value: 2.1940815528850423e-26
The difference is statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.5677830961241888
baseline_en: 0.5350230613232024
t-statistic: 11.536021706946196
p-value: 1.9298096666946395e-26
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.5678031066352571
baseline_en: 0.5350230613232024
t-statistic: 11.564743838984809
p-value: 1.5198614712944096e-26
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_closest_featural
average s

## POS comparison

In [3]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "pos"
test_columns = ["reconstructed_featural", "reconstructed_morphological_distVar", "reconstructed_syntactic_distVar"]
baselines = ["baseline_en", "baseline_closest_featural", "no_train_gain"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.4485338790223098
baseline_en: 0.4443287769237761
t-statistic: 1.471682968258695
p-value: 0.07312353162482992
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.4495814021381374
baseline_en: 0.4443287769237761
t-statistic: 1.745247538164272
p-value: 0.042988729413020815
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.44801410251010987
baseline_en: 0.4443287769237761
t-statistic: 1.3395865924575125
p-value: 0.09267468908885493
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_closest_featural
average s

In [4]:
# we check if
task = "pos"
test_columns = [
    "reconstructed_morphological_distVar",
]
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")
print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_morphological_distVar, baseline reconstructed_featural
average scores
reconstructed_morphological_distVar: 0.4495814021381374
reconstructed_featural: 0.4485338790223098
t-statistic: 1.2566139823269367
p-value: 0.10684215706420998
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline reconstructed_syntactic_distVar
average scores
reconstructed_morphological_distVar: 0.4495814021381374
reconstructed_syntactic_distVar: 0.44801410251010987
t-statistic: 1.7033909635405693
p-value: 0.04679257582016358
The difference is statistically significant
-----------------------------------
sign ['reconstructed_morphological_distVar_reconstructed_syntactic_distVar']

notsign ['reconstructed_morphological_distVar_reconstructed_featural']


## Results POS distance comparison
As different languages lack some distance types, the evaluation is slightly different.
We retain only the languages which have full coverage of all distance type evaluations.
Our inspection of the dropped languages shows that they are mostly low-resource languages, which is not surprising but has some implications:
Our method has been shown in section [SECTION ANALYSIS RESOURCEDNESS] to have a higher impact on lower-resourced languages.

-> We see that morphological distance is most beneficial for POS, only one that outperforms all baselines. (Significantly for all but no_train_gain), followed by featural distance and syntactic distance (slightly less performant). These two still outperform all baselines significantly BUT the english source adapter setting, and slightly underperform compared to no_train_gain

Morphological distance significantly outperforms syntactic distance on POS

# NER comparison

In [5]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "ner"
test_columns = ["reconstructed_featural", "reconstructed_morphological_distVar", "reconstructed_syntactic_distVar"]
baselines = ["baseline_en", "baseline_avg_adapter", "baseline_closest_featural", "no_train_gain"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.49947934356251256
baseline_en: 0.4647875590055643
t-statistic: 5.319830048696973
p-value: 4.2321140086576374e-07
The difference is statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.5001237474726075
baseline_en: 0.4647875590055643
t-statistic: 5.329260470386214
p-value: 4.0712127039246156e-07
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.49944428243953554
baseline_en: 0.4647875590055643
t-statistic: 5.299914249860668
p-value: 4.5926651828571e-07
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores
re

In [6]:
# we check if
task = "ner"
test_columns = [
    "reconstructed_morphological_distVar",
]
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

test_column: reconstructed_morphological_distVar, baseline reconstructed_featural
average scores
reconstructed_morphological_distVar: 0.5001237474726075
reconstructed_featural: 0.49947934356251256
t-statistic: 1.3028120039229474
p-value: 0.09809962112247249
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline reconstructed_syntactic_distVar
average scores
reconstructed_morphological_distVar: 0.5001237474726075
reconstructed_syntactic_distVar: 0.49944428243953554
t-statistic: 1.3288125640435824
p-value: 0.09375411005382189
The difference is not statistically significant
-----------------------------------


## Results NER distance comparison
Again, morphological distance seems to be most informative, only one to significantly outperform all baselines. Other two methods are almost identical in score, outperforming all baselines significantly except for no_train_gain (which they still outperform)

# QA comparison

In [7]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "qa"
test_columns = ["reconstructed_featural", "reconstructed_morphological_distVar", "reconstructed_syntactic_distVar"]
baselines = ["baseline_en", "baseline_avg_adapter", "baseline_closest_featural", "no_train_gain"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.7271252762822759
baseline_en: 0.7210740878836793
t-statistic: 2.4633369914457828
p-value: 0.015745852365037824
The difference is statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.7272652915477485
baseline_en: 0.7210740878836793
t-statistic: 2.609570550253358
p-value: 0.012138048056661924
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.7270498698545946
baseline_en: 0.7210740878836793
t-statistic: 2.4029050957563642
p-value: 0.017525950911712523
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores
recon

## Results QA distance comparison
All distance types are very, very similar. All outperform all baselines, significantly in all cases but no_train_gain (smaller difference, less than 0,25% relative improvement compared to 36% relative to finetune baseline).
# SIB comparison

In [10]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "sib"
test_columns = ["reconstructed_featural", "reconstructed_morphological_distVar", "reconstructed_syntactic_distVar"]
baselines = [
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
    "reconstructed_featural",
]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: 0.6316705659536542
baseline_en: 0.5882074420677362
t-statistic: 10.977894928915193
p-value: 5.7580224876135305e-22
The difference is statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: 0.6312249331550802
baseline_en: 0.5882074420677362
t-statistic: 10.908649054027412
p-value: 9.062557834898598e-22
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: 0.6321579768270945
baseline_en: 0.5882074420677362
t-statistic: 11.123398927090108
p-value: 2.2166568576901843e-22
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores

## SIB results
All distance types outperform all baselines.
Morphological gets slightly lower results than featural ==> LOGIC in a HIHG-level task!
Syntactic outperforms featural, not a large margin but something!

In [30]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "copa"
test_columns = ["reconstructed_featural", "reconstructed_morphological_distVar", "reconstructed_syntactic_distVar"]
baselines = ["baseline_en", "baseline_avg_adapter", "baseline_closest_featural", "no_train_gain"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_dist)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural, baseline baseline_en
average scores
reconstructed_featural: nan
baseline_en: nan
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_morphological_distVar, baseline baseline_en
average scores
reconstructed_morphological_distVar: nan
baseline_en: nan
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_distVar, baseline baseline_en
average scores
reconstructed_syntactic_distVar: nan
baseline_en: nan
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural, baseline baseline_avg_adapter
average scores
reconstructed_featural: nan
baseline_avg_adapter: nan
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
test_colum

  t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)


## Results COPA distance comparison
Outperforms baseline_en in all cases. Featural is best, but small difference.
--> Does not have to be analysed as this is done in main experiment
# Limit analysis

In [11]:
# we will now investigate if using limits and thresholds can further improve results
scores_limit = [
    "reconstructed_featural_limit",
    "reconstructed_syntactic_limit",
    "reconstructed_morphological_limit",
    "reconstructed_featural_threshold",
    "reconstructed_syntactic_threshold",
    "reconstructed_morphological_threshold",
    "reconstructed_featural",
    "reconstructed_syntactic_distVar",
    "reconstructed_morphological_distVar",
]
# we refine the subset to only include the languages that have all relevant columns
scores_limit_subset = {}
for task_name in scores:
    scores_limit_subset[task_name] = {}
    for lang_name in scores[task_name]:
        if all(col in scores[task_name][lang_name] for col in scores_limit):
            scores_limit_subset[task_name][lang_name] = {col: scores[task_name][lang_name][col] for col in scores_limit}
        else:
            print(f"skipping {lang_name} for {task_name}")
            missing = [col for col in scores_limit if col not in scores[task_name][lang_name].keys()]
            print(missing)
            continue
with_adapters_limit_subset = {}
no_adapters_limit_subset = {}
xlm_included_limit_subset = {}
xlm_excluded_limit_subset = {}
for task_name in scores_limit_subset:
    with_adapters_limit_subset[task_name] = {}
    no_adapters_limit_subset[task_name] = {}
    xlm_included_limit_subset[task_name] = {}
    xlm_excluded_limit_subset[task_name] = {}

    for lang_name in scores_limit_subset[task_name]:
        if lang_name in existing_adapters:
            with_adapters_limit_subset[task_name][lang_name] = scores_limit_subset[task_name][lang_name]
        else:
            no_adapters_limit_subset[task_name][lang_name] = scores_limit_subset[task_name][lang_name]
        if lang_name in xlm_included_langs:
            xlm_included_limit_subset[task_name][lang_name] = scores_limit_subset[task_name][lang_name]
        else:
            xlm_excluded_limit_subset[task_name][lang_name] = scores_limit_subset[task_name][lang_name]

skipping af for ner
['reconstructed_featural_limit', 'reconstructed_syntactic_limit', 'reconstructed_morphological_limit', 'reconstructed_syntactic_distVar', 'reconstructed_morphological_distVar']
skipping an for ner
['reconstructed_featural_limit', 'reconstructed_syntactic_limit', 'reconstructed_morphological_limit', 'reconstructed_syntactic_distVar', 'reconstructed_morphological_distVar']
skipping ast for ner
['reconstructed_featural_limit', 'reconstructed_syntactic_limit', 'reconstructed_morphological_limit', 'reconstructed_featural', 'reconstructed_syntactic_distVar', 'reconstructed_morphological_distVar']
skipping as for ner
['reconstructed_featural_limit', 'reconstructed_syntactic_limit', 'reconstructed_morphological_limit', 'reconstructed_syntactic_distVar', 'reconstructed_morphological_distVar']
skipping ay for ner
['reconstructed_featural_limit', 'reconstructed_syntactic_limit', 'reconstructed_morphological_limit', 'reconstructed_syntactic_distVar', 'reconstructed_morphologica

In [12]:
task = "all"
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
methods = ["threshold", "limit"]
sign = []
not_sign = []
for baseline in baselines:
    for method in methods:
        if baseline == "reconstructed_featural":
            test_column = baseline + f"_{method}"
        else:
            # we replace "distVar" with "threshold"
            test_column = baseline.replace("distVar", method)

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.5696541806173965
reconstructed_featural: 0.5676548287326185
t-statistic: 3.8568643941809024
p-value: 6.890033187957171e-05
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.571571979275728
reconstructed_featural: 0.5676548287326185
t-statistic: 3.6108327183199282
p-value: 0.00017601762110295747
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.570094537188873
reconstructed_syntactic_distVar: 0.5678031066352571
t-statistic: 3.5412154712955104
p-value: 0.00022743182215600902
The difference is statistically significant
-------------------------

In [34]:
task = "ner"
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
methods = ["threshold", "limit"]
sign = []
not_sign = []
for baseline in baselines:
    for method in methods:
        if baseline == "reconstructed_featural":
            test_column = baseline + f"_{method}"
        else:
            # we replace "distVar" with "threshold"
            test_column = baseline.replace("distVar", method)

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.5052744017684326
reconstructed_featural: 0.49947934356251256
t-statistic: 3.866540224747911
p-value: 0.00010845627027128958
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.5140822727352715
reconstructed_featural: 0.49947934356251256
t-statistic: 5.099558384560621
p-value: 1.0373958885215319e-06
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.5062823157986861
reconstructed_syntactic_distVar: 0.49944428243953554
t-statistic: 3.704583025826268
p-value: 0.00018907176248542088
The difference is statistically significant
----------------------

In [35]:
task = "ner"
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
method = "limit"
subsets = {
    "with_adapters": with_adapters_limit_subset,
    "no_adapters": no_adapters_limit_subset,
    "xlm_included": xlm_included_limit_subset,
    "xlm_excluded": xlm_excluded_limit_subset,
}
sign = []
not_sign = []
for baseline in baselines:
    for subset_name, scores_subset in subsets.items():
        if baseline == "reconstructed_featural":
            test_column = baseline + f"_{method}"
        else:
            # we replace "distVar" with "threshold"
            test_column = baseline.replace("distVar", method)

        print(f"test_column: {test_column}, baseline {baseline}, subset {subset_name}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_subset)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_limit, baseline reconstructed_featural, subset with_adapters
average scores
reconstructed_featural_limit: 0.5093842699323033
reconstructed_featural: 0.4885533570818925
t-statistic: 5.073764356273462
p-value: 2.2037111564045107e-05
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural, subset no_adapters
average scores
reconstructed_featural_limit: 0.5158250802266954
reconstructed_featural: 0.5035325320956457
t-statistic: 3.422244374811933
p-value: 0.0005571769981067983
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural, subset xlm_included
average scores
reconstructed_featural_limit: 0.564280790165681
reconstructed_featural: 0.5523855837939105
t-statistic: 4.921674298394184
p-value: 3.841385892524253e-06
The difference is statistically significant
-

## Results NER distance comparison with limits
Both Limit and Thresholds significantly outperform their relative counterparts (no limitation).
best performing = limit, especially featural (0.514 vs 0.499 for overall NER)
==> Shows that our method has more potential, and can be further refined by looking for better limit and threshold values

In [36]:
task = "pos"
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
methods = ["threshold", "limit"]
sign = []
not_sign = []
for baseline in baselines:
    for method in methods:
        if baseline == "reconstructed_featural":
            test_column = baseline + f"_{method}"
        else:
            # we replace "distVar" with "threshold"
            test_column = baseline.replace("distVar", method)

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.4483538634919548
reconstructed_featural: 0.4485338790223098
t-statistic: -0.12885942222239558
p-value: 0.5510535925349446
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.4484558278153162
reconstructed_featural: 0.4485338790223098
t-statistic: -0.05020410696226338
p-value: 0.5199381152963118
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.44867461361811517
reconstructed_syntactic_distVar: 0.44801410251010987
t-statistic: 0.44166442528108035
p-value: 0.33014690506025063
The difference is not statistically significant
--------------

In [37]:
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
methods = ["limit", "threshold"]
subsets = {
    "with_adapters": with_adapters_limit_subset,
    "no_adapters": no_adapters_limit_subset,
    "xlm_included": xlm_included_limit_subset,
    "xlm_excluded": xlm_excluded_limit_subset,
}
sign = []
not_sign = []
for subset_name, scores_subset in subsets.items():
    for baseline in baselines:
        for method in methods:
            if baseline == "reconstructed_featural":
                test_column = baseline + f"_{method}"
            else:
                # we replace "distVar" with "threshold"
                test_column = baseline.replace("distVar", method)

            print(f"test_column: {test_column}, baseline {baseline}, subset {subset_name}")
            t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_subset)
            if p_val < 0.05:
                print("The difference is statistically significant")
                sign.append(test_column + "_" + baseline)
            else:
                print("The difference is not statistically significant")
                not_sign.append(test_column + "_" + baseline)
            print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_limit, baseline reconstructed_featural, subset with_adapters
average scores
reconstructed_featural_limit: 0.5343047506476772
reconstructed_featural: 0.5303411521074455
t-statistic: 1.0319280309615306
p-value: 0.1592290797516526
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_threshold, baseline reconstructed_featural, subset with_adapters
average scores
reconstructed_featural_threshold: 0.5363598307162523
reconstructed_featural: 0.5303411521074455
t-statistic: 1.7671601981997402
p-value: 0.048767187580653654
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_limit, baseline reconstructed_syntactic_distVar, subset with_adapters
average scores
reconstructed_syntactic_limit: 0.5326257072297398
reconstructed_syntactic_distVar: 0.5305344422155532
t-statistic: 0.5112029403063311
p-value: 0.3083253702864398
The difference

F## Results POS distance comparison with limits
For POS, limiting or putting a threshold has no demonstrable positive effect on the results

In [38]:
task = "qa"
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
methods = ["threshold", "limit"]
sign = []
not_sign = []
for baseline in baselines:
    for method in methods:
        if baseline == "reconstructed_featural":
            test_column = baseline + f"_{method}"
        else:
            # we replace "distVar" with "threshold"
            test_column = baseline.replace("distVar", method)

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.7266866467655285
reconstructed_featural: 0.7271252762822759
t-statistic: -1.2504480934463686
p-value: 0.8814591421280972
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.7281344543568334
reconstructed_featural: 0.7271252762822759
t-statistic: 1.0506349044006917
p-value: 0.15798021009887603
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.7271196088629276
reconstructed_syntactic_distVar: 0.7270498698545946
t-statistic: 0.19561002530779972
p-value: 0.42424096051585775
The difference is not statistically significant
------------------

## Results QA distance comparison with limits
For QA, using a threshold that filters out more distant adapters has a significant improvement upon the results only for the morphological distance, which results in the highest overall score for the method.


In [39]:
task = "copa"
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
methods = ["threshold", "limit"]
sign = []
not_sign = []
for baseline in baselines:
    for method in methods:
        if baseline == "reconstructed_featural":
            test_column = baseline + f"_{method}"
        else:
            # we replace "distVar" with "threshold"
            test_column = baseline.replace("distVar", method)

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: nan
reconstructed_featural: nan
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: nan
reconstructed_featural: nan
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: nan
reconstructed_syntactic_distVar: nan
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_limit, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_limit: nan
reconstructed_syntactic_distVar

  t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)


In [40]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "copa"
test_columns = ["reconstructed_featural_limit"]
baselines = ["baseline_en", "baseline_avg_adapter", "baseline_closest_featural", "no_train_gain"]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_limit, baseline baseline_en
average scores
reconstructed_featural_limit: 0.5181818181818182
baseline_en: 0.5083636363636364
t-statistic: 1.4604198759809905
p-value: 0.08743404052954881
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline baseline_avg_adapter
average scores
reconstructed_featural_limit: 0.5181818181818182
baseline_avg_adapter: 0.5150909090909092
t-statistic: 0.5544198843936056
p-value: 0.2957370025331385
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline baseline_closest_featural
average scores
reconstructed_featural_limit: 0.5181818181818182
baseline_closest_featural: 0.5198181818181818
t-statistic: -0.2559960557948576
p-value: 0.5984285550587479
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_featural_limi

## Results COPA distance comparison with limits
reconstructed featural with a limit is the highest performing method!
Outperforms all other methods significantly. It results in scores that outperform most of the baselines that could not be matched with the regular method.
However, the highest score remains the MAD-X baseline, which still slightly outperforms our method: as our method approximates adapters, and is mainly targeted to create adapters for language that do not have existing language adapters, the COPA task is not the best representation of the method's performance. This is because all languages in this task setup have an existing adapter, which makes the method less relevant. 

# SIB analysis with limits

In [41]:
task = "sib"
baselines = ["reconstructed_featural", "reconstructed_syntactic_distVar", "reconstructed_morphological_distVar"]
methods = ["threshold", "limit"]
sign = []
not_sign = []
for baseline in baselines:
    for method in methods:
        if baseline == "reconstructed_featural":
            test_column = baseline + f"_{method}"
        else:
            # we replace "distVar" with "threshold"
            test_column = baseline.replace("distVar", method)

        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores_limit_subset)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_featural_threshold, baseline reconstructed_featural
average scores
reconstructed_featural_threshold: 0.6327707219251336
reconstructed_featural: 0.6316705659536542
t-statistic: 2.758463290837115
p-value: 0.0032123259076249798
The difference is statistically significant
-----------------------------------
test_column: reconstructed_featural_limit, baseline reconstructed_featural
average scores
reconstructed_featural_limit: 0.6320326426024955
reconstructed_featural: 0.6316705659536542
t-statistic: 0.28180203796483927
p-value: 0.38921413622879614
The difference is not statistically significant
-----------------------------------
test_column: reconstructed_syntactic_threshold, baseline reconstructed_syntactic_distVar
average scores
reconstructed_syntactic_threshold: 0.6329796122994652
reconstructed_syntactic_distVar: 0.6321579768270945
t-statistic: 1.3339211587003226
p-value: 0.09198154087812509
The difference is not statistically significant
---------------------

## Results SIB distance comparison with limits
For SIB, best results are obtained by using syntactic with limit. As syntactic is already the highest scoring, this does not statistically outperform the other settings within syntactic distance.
-> Also no significant improvement over others, except for 

For morphological, values go down but not significantly when using a threshold or limit. 

Only statistically significant improvement is threshold for featural distance

In [44]:
# we compare with the variant distances and limits in pos
# we know we do better over finetune all over the line
task = "sib"
test_columns = ["reconstructed_syntactic_limit"]
baselines = [
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_closest_featural",
    "no_train_gain",
    "reconstructed_featural",
    "reconstructed_morphological_distVar",
    "reconstructed_featural_limit",
]

sign = []
not_sign = []
for baseline in baselines:
    for test_column in test_columns:
        print(f"test_column: {test_column}, baseline {baseline}")
        t_stat, p_val = get_significance(task, test_column, baseline, alternative="greater", data=scores)
        if p_val < 0.05:
            print("The difference is statistically significant")
            sign.append(test_column + "_" + baseline)
        else:
            print("The difference is not statistically significant")
            not_sign.append(test_column + "_" + baseline)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

test_column: reconstructed_syntactic_limit, baseline baseline_en
average scores
reconstructed_syntactic_limit: 0.6336898395721925
baseline_en: 0.5882074420677362
t-statistic: 11.097837950657471
p-value: 2.6217498595481545e-22
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_limit, baseline baseline_avg_adapter
average scores
reconstructed_syntactic_limit: 0.6336898395721925
baseline_avg_adapter: 0.5429060828877006
t-statistic: 20.34865138699926
p-value: 2.695662941050963e-48
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_limit, baseline baseline_closest_featural
average scores
reconstructed_syntactic_limit: 0.6336898395721925
baseline_closest_featural: 0.5652016488413547
t-statistic: 8.345711065913747
p-value: 1.0318153447336321e-14
The difference is statistically significant
-----------------------------------
test_column: reconstructed_syntactic_li