In [1]:
import os
import json
import math
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Language codes for which a language adapter exists: the ones returned by the
# Hugging Face API, plus the two adapters we trained ourselves ("eu", "sr").
existing_adapters = [
    "th", "my", "hi", "ilo", "ht", "tr", "mi", "vi",
    "is", "it", "ta", "jv", "ja", "sw", "qu", "de",
    "el", "et", "ru", "gn", "id", "en", "ar", "es",
    "tk", "zh", "mhr", "cdo", "xmf",
    # our own trained adapters
    "eu", "sr",
]
# Language codes scraped from the CC-100 website, kept in the scraped order.
# NOTE(review): the variable name suggests these are the languages covered by
# the XLM model's training data - confirm against the source page.
xlm_included_langs = [
    "af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs",
    "ca", "cs", "cy", "da", "de", "el", "en", "eo", "es", "et",
    "eu", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gn",
    "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig",
    "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "ku",
    "ky", "la", "lg", "li", "ln", "lo", "lt", "lv", "mg", "mk",
    "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "om",
    "or", "pa", "pl", "ps", "pt", "qu", "rm", "ro", "ru", "sa",
    "si", "sc", "sd", "sk", "sl", "so", "sq", "sr", "ss", "su",
    "sv", "sw", "ta", "te", "th", "tl", "tn", "tr", "ug", "uk",
    "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zu", "zh",
]

# Evaluation results, indexed as scores[task][language_code][method_name].
# The empty per-task placeholders are replaced by the JSON contents loaded below.
scores = {"ner": {}, "pos": {}, "copa": {}, "qa": {}}
tasks = scores.keys()
# Metric key per task (presumably the key used in the raw evaluation output -
# TODO confirm; it is not used in this cell).
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1"}
inf = math.inf

# Directory containing one "<task>.json" score file per task.
SCORES_DIR = "../eval_scores/selected"

# os.listdir returns entries in arbitrary order; sorting keeps the load order
# (and therefore the key order of `scores`) reproducible across runs.
for file in sorted(os.listdir(SCORES_DIR)):
    if not file.endswith(".json"):
        continue
    try:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(os.path.join(SCORES_DIR, file), "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # A malformed file is reported and skipped; the task keeps whatever
        # value it had before (the empty placeholder for the known tasks).
        print(f"Error decoding JSON for file: {file}")
        continue

    # The task name is the part of the file name before the first dot.
    task_name = file.split(".")[0]
    scores[task_name] = data
# Split every task's results in two: languages for which an adapter exists
# (scores_subset) and languages without one (no_adapter).
# Note: the per-language dicts are shared with `scores`, not copied.
scores_subset = {}
no_adapter = {}
for task_name, task_scores in scores.items():
    with_adapter = {}
    without_adapter = {}
    for lang_name, lang_scores in task_scores.items():
        bucket = with_adapter if lang_name in existing_adapters else without_adapter
        bucket[lang_name] = lang_scores
    scores_subset[task_name] = with_adapter
    no_adapter[task_name] = without_adapter

len(scores_dist) 62
og len(scores) 90


In [2]:
def get_significance(task_name, option1, option2, alternative="two-sided", data=None):
    """Run a paired t-test between two methods over (task, language) pairs.

    Only languages that have a score for BOTH ``option1`` and ``option2`` are
    used, so the two samples are always aligned pair by pair.

    Args:
        task_name: ``"all"`` to pool every task in ``data``, a single task name,
            or an iterable of task names.
        option1: key of the first method in the per-language score dict.
        option2: key of the second method in the per-language score dict.
        alternative: passed through to ``scipy.stats.ttest_rel``
            (``"two-sided"``, ``"greater"`` or ``"less"``; ``"greater"`` tests
            whether ``option1`` scores higher than ``option2``).
        data: nested dict ``data[task][language][method] -> score``. Defaults to
            the module-level ``scores``, looked up at call time so that a
            re-run of the loading cell is picked up.

    Returns:
        ``(t_statistic, p_value)``. Both are NaN when fewer than two paired
        scores are available, because the test is undefined in that case.
    """
    if data is None:
        data = scores

    if isinstance(task_name, str):
        task_names = list(data.keys()) if task_name == "all" else [task_name]
    else:
        task_names = task_name

    all_scores1 = []
    all_scores2 = []
    # `task` (not `task_name`) so the parameter is not shadowed by the loop.
    for task in task_names:
        for lang_scores in data[task].values():
            if option1 in lang_scores and option2 in lang_scores:
                all_scores1.append(lang_scores[option1])
                all_scores2.append(lang_scores[option2])

    if len(all_scores1) < 2:
        # A paired t-test needs at least two pairs; report instead of letting
        # numpy/scipy emit warnings about empty or degenerate input.
        print(
            f"Not enough paired scores for {option1} vs {option2} "
            f"(found {len(all_scores1)}); skipping t-test"
        )
        return float("nan"), float("nan")

    print("average scores")
    print(f"{option1}: {np.mean(all_scores1)}")
    print(f"{option2}: {np.mean(all_scores2)}")
    t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)
    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    return t_stat, p_val


def make_boxplot(tasks, columns, data=None):
    """Draw a box plot comparing the score distributions of several methods.

    Each box covers the scores of one method over all (task, language) pairs
    that have a score for EVERY method in ``columns``.

    Args:
        tasks: ``"all"`` for every task in ``data``, a single task name, or an
            iterable of task names.
        columns: method keys to compare (one box per key).
        data: nested dict ``data[task][language][method] -> score``. Defaults to
            the module-level ``scores``.
    """
    if data is None:
        data = scores
    columns = list(columns)

    # A bare string is one task name; checking len(tasks) == 1 instead would
    # iterate "ner" character by character and wrap ["ner"] into [["ner"]].
    if isinstance(tasks, str):
        task_names = list(data.keys()) if tasks == "all" else [tasks]
    else:
        task_names = list(tasks)

    rows = [
        [task, lang_name] + [lang_scores[col] for col in columns]
        for task in task_names
        for lang_name, lang_scores in data[task].items()
        if all(col in lang_scores for col in columns)
    ]
    df = pd.DataFrame(rows, columns=["task", "lang_name"] + columns)

    # QA scores are divided by 100 (presumably they are stored on a 0-100
    # scale while the other tasks use 0-1 - TODO confirm).
    is_qa = df["task"] == "qa"
    df.loc[is_qa, columns] = df.loc[is_qa, columns] / 100

    # Long format: one row per (task, language, method), as seaborn expects.
    df_melted = df.melt(id_vars=["task", "lang_name"], value_vars=columns, var_name="method", value_name="score")

    plt.figure(figsize=(10, 6))
    sns.boxplot(x="method", y="score", data=df_melted)
    plt.title(f"Comparison of methods for {tasks}")
    plt.xlabel("Method")
    plt.ylabel("Score")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [5]:
# One-sided paired t-tests over all tasks and all languages.
# NOTE: despite its name, `extended` holds the key of the base-set variant. It
# is passed as the first sample, so alternative="greater" asks whether the base
# set scores HIGHER than each of the other variants.
extended = "reconstructed_featural_base"
baselines = ["reconstructed_featural", "reconstructed_featural_eu", "reconstructed_featural_sr"]

sign, not_sign = [], []
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", extended, baseline, alternative="greater")
    if p_val < 0.05:
        message, bucket = "The difference is statistically significant", sign
    else:
        message, bucket = "The difference is not statistically significant", not_sign
    print(message)
    bucket.append("all_" + baseline)
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.5058487615875525
reconstructed_featural: 0.5037595015953552
t-statistic: 2.9138025357622714
p-value: 0.0019575295560539715
The difference is statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_eu
average scores
reconstructed_featural_base: 0.5042205974641217
reconstructed_featural_eu: 0.5035794215130618
t-statistic: 1.16891359238375
p-value: 0.12180149514837858
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_base: 0.5042205974641217
reconstructed_featural_sr: 0.5035835770013597
t-statistic: 1.1832859546938366
p-value: 0.11893574887789471
The difference is not statistically significant
-----------------------------------
sign ['all_reconstructed_featural']

notsign ['all_reconstructed_featural_eu', 'all_reconstructed_featural_sr'

# Result
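Extending with additional adapters is WORSE than using only the base set: the base set scores higher on average in all three comparisons, although only the difference to `reconstructed_featural` is statistically significant (p ≈ 0.002); the differences to the `_eu` and `_sr` variants are not (p ≈ 0.12).
This is consistent with the result that limiting the number of languages taken into account is useful.
However, the differences are very small, and the analysis takes all languages across all tasks into account.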

In [9]:
# Same one-sided paired t-tests as for the full data, but restricted to the
# languages for which an adapter exists (scores_subset).
# NOTE: `extended` holds the key of the base-set variant; alternative="greater"
# therefore tests whether the base set scores higher than each other variant.
extended = "reconstructed_featural_base"
baselines = ["reconstructed_featural", "reconstructed_featural_eu", "reconstructed_featural_sr"]

sign = []
not_sign = []
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", extended, baseline, alternative="greater", data=scores_subset)
    is_significant = p_val < 0.05
    if is_significant:
        print("The difference is statistically significant")
    else:
        print("The difference is not statistically significant")
    (sign if is_significant else not_sign).append(f"all_{baseline}")
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.5552949616207513
reconstructed_featural: 0.5527708276115005
t-statistic: 2.116519452597469
p-value: 0.018954836285634828
The difference is statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_eu
average scores
reconstructed_featural_base: 0.5529363382122867
reconstructed_featural_eu: 0.5521205430836231
t-statistic: 0.9358906262255084
p-value: 0.1762735480783661
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_base: 0.5529363382122867
reconstructed_featural_sr: 0.5521876081608454
t-statistic: 0.8901195800448133
p-value: 0.18822536738765577
The difference is not statistically significant
-----------------------------------
sign ['all_reconstructed_featural']

notsign ['all_reconstructed_featural_eu', 'all_reconstructed_featural_sr']

# Checking whether languages with adapters that are typologically close to sr and eu benefit from the method

In [10]:
from huggingface_hub import HfApi
from qq import LanguageData, TagType
from urielplus import urielplus

# Language database used to resolve BCP-47 codes to English names and glottocodes.
ld = LanguageData.from_db()

# Fetch all AdapterHub xlm-roberta-base adapters from the Hugging Face Hub.
api = HfApi()
models = api.list_models(author="AdapterHub", library="adapter-transformers", search="xlm-roberta-base-")

# Keep only the "-wiki_pfeiffer" adapters ...
wiki_adapter_ids = [
    m.modelId
    for m in models
    if m.modelId.startswith("AdapterHub/xlm-roberta-base-") and m.modelId.endswith("-wiki_pfeiffer")
]
# ... and map each model id to the language code embedded between the common
# prefix and that suffix.
to_load = {
    model_id: model_id.split("xlm-roberta-base-")[-1].rsplit("-wiki_pfeiffer", 1)[0]
    for model_id in wiki_adapter_ids
}


def get_glots(iso_list):
    """Resolve language codes to glottocodes.

    Each code is looked up in the language database as a BCP-47 code. When the
    database entry has no glottocode, a manual fallback keyed by the entry's
    English name is tried.

    Args:
        iso_list: iterable of language codes.

    Returns:
        dict mapping the English language name to ``(code, glottocode)``. Codes
        for which no English name or no glottocode could be found are left out
        and reported via a printed message. If two codes resolve to the same
        English name, the later one overwrites the earlier one.
    """
    # Manual glottocode fallbacks, keyed by English name, for entries whose
    # database record carries no glottocode.
    manuals = {
        "Arabic": "arab1267",
        "Swahili": "swah1253",
        "Bengali": "beng1282",
        "Chinese": "mand1415",
        "Persian": "west2369",
        "Yoruba": "ilaa1246",
        "Nepali": "nepa1254",
        "Quechua": "cusc1236",
        "Estonian": "esto1258",
        "Guarani": "east2555",
    }

    glots = {}
    probs = []

    for lang in iso_list:
        # One database lookup per language (name and glottocode come from the
        # same entry).
        entry = ld.get(lang, tag_type=TagType.BCP_47_CODE)
        eng = entry.english_name
        glot = entry.glottocode
        if not glot:
            # Fall back to the manual table; keeps the empty value if absent.
            glot = manuals.get(eng, glot)
        if eng and glot:
            glots[eng] = (lang, glot)
        else:
            probs.append(lang)

    print("no glottocodes found for these languages: ", probs)

    return glots


# Glottocodes for the languages of the adapters found on the Hub.
glots = get_glots(to_load.values())

# Language codes of every task's score table, flattened into one list
# (a code appears once per task it occurs in).
iso_s = [task_scores.keys() for task_scores in scores.values()]
iso_list = [iso for task_isos in iso_s for iso in task_isos]
eval_glots = get_glots(iso_list)

# URIEL+ instance used for the distance queries in typological_distance.
u = urielplus.URIELPlus()


def typological_distance(target, glots):
    """Compute the "featural" distance of every language in ``glots`` to ``target``.

    Args:
        target: glottocode of the target language.
        glots: dict mapping a language name to ``(code, glottocode)``. Entries
            whose distance lookup raises are REMOVED from this dict in place,
            so pass a copy if the original must stay intact.

    Returns:
        dict mapping each language code to its distance from ``target``;
        languages whose lookup failed are absent.
    """
    distances = {}
    failed = []
    for lang, (iso, glot) in glots.items():
        try:
            distances[iso] = u.new_distance("featural", [glot, target])
        except Exception:
            # Report the failing language and remember it for removal below.
            print(f"Error: {lang} - {glot} - {target}")
            failed.append(lang)

    # Removal happens after the loop: the dict must not change size while it
    # is being iterated.
    for lang in failed:
        del glots[lang]

    return distances

no glottocodes found for these languages:  []
no glottocodes found for these languages:  ['az', 'mg', 'ms', 'or']


In [None]:
# typological_distance removes failing entries from the dict it is given, so
# each target language works on its own shallow copy of eval_glots.
eu_glots = dict(eval_glots)
sr_glots = dict(eval_glots)

# Distances of every evaluated language to Basque (eu) and to Serbian (sr).
eu_target = ld.get("eu", TagType.BCP_47_CODE).glottocode
eu_dists = typological_distance(eu_target, eu_glots)
sr_target = ld.get("sr", TagType.BCP_47_CODE).glottocode
sr_dists = typological_distance(sr_target, sr_glots)

In [12]:
# we add this to the data
for task_name in scores_subset:
    for lang_name in scores_subset[task_name]:
        if lang_name in eu_dists:
            scores_subset[task_name][lang_name]["eu_dist"] = eu_dists[lang_name]
        if lang_name in sr_dists:
            scores_subset[task_name][lang_name]["sr_dist"] = sr_dists[lang_name]

In [14]:
# we run the significance test again, but only taking into account the closest languages
median_dist_eu = np.median(list(eu_dists.values()))
median_dist_sr = np.median(list(sr_dists.values()))
print("median distance eu", median_dist_eu)
print("median distance sr", median_dist_sr)
subset_eu = {}
for task_name in scores_subset:
    subset_eu[task_name] = {}
    for lang_name in scores_subset[task_name]:
        if lang_name in eu_dists and eu_dists[lang_name] < median_dist_eu:
            subset_eu[task_name][lang_name] = scores_subset[task_name][lang_name]
subset_sr = {}
for task_name in scores_subset:
    subset_sr[task_name] = {}
    for lang_name in scores_subset[task_name]:
        if lang_name in sr_dists and sr_dists[lang_name] < median_dist_sr:
            subset_sr[task_name][lang_name] = scores_subset[task_name][lang_name]

median distance eu 0.5867
median distance sr 0.4646


In [16]:
# One-sided paired t-tests on the languages close to eu (subset_eu).
# NOTE: `base` holds the key of the eu variant; it is the first sample, so
# alternative="greater" tests whether the eu variant scores higher than the
# base-set variant and than the sr variant.
base = "reconstructed_featural_eu"
baselines = ["reconstructed_featural_base", "reconstructed_featural_sr"]

sign = []
not_sign = []
# Verdict message and result list for each outcome of the 5% significance check.
outcomes = {
    True: ("The difference is statistically significant", sign),
    False: ("The difference is not statistically significant", not_sign),
}
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", base, baseline, alternative="greater", data=subset_eu)
    message, bucket = outcomes[bool(p_val < 0.05)]
    print(message)
    bucket.append("all" + "_" + baseline)
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural_base
average scores
reconstructed_featural_eu: 0.6159393202082123
reconstructed_featural_base: 0.6164917575087449
t-statistic: -0.7808833941300625
p-value: 0.7796950256740595
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_eu: 0.6159393202082123
reconstructed_featural_sr: 0.615646222253811
t-statistic: 0.5068908152872996
p-value: 0.30785337274677144
The difference is not statistically significant
-----------------------------------
sign []

notsign ['all_reconstructed_featural_base', 'all_reconstructed_featural_sr']


In [18]:
# Per task and language: score of the eu variant minus score of the base
# variant (positive = the eu variant is better). Languages missing either
# score are skipped.
difference_eu_base = {
    task_name: {
        lang_name: lang_scores["reconstructed_featural_eu"] - lang_scores["reconstructed_featural_base"]
        for lang_name, lang_scores in task_scores.items()
        if "reconstructed_featural_base" in lang_scores and "reconstructed_featural_eu" in lang_scores
    }
    for task_name, task_scores in scores.items()
}
difference_eu_base

{'ner': {'ace': -0.011021847245055227,
  'af': 0.001175962574009648,
  'als': -0.004561523548865409,
  'am': 0.001425601425601497,
  'an': 0.005389832248066773,
  'arz': 0.022140221402214,
  'ar': -0.0011907621544379743,
  'as': 0.025723121702746987,
  'ay': -0.005677924620655861,
  'bar': 0.015622647191687933,
  'ba': -0.008732237834405049,
  'be': -0.0016130765290583993,
  'bg': -0.002204374925134922,
  'bn': -0.0024011779702780567,
  'bo': 0.0,
  'br': -0.007233019619595993,
  'bs': -0.0035284389977650044,
  'ca': -0.0038713937401643195,
  'cdo': -0.004014272970562005,
  'ceb': -0.014160068092130773,
  'ce': 0.0011741859729937631,
  'ckb': 0.0001462273347631049,
  'co': -0.008230452674897193,
  'crh': 0.010921940250562201,
  'cs': 0.0009501219578313957,
  'cv': -0.0018302447952412981,
  'cy': -0.007468860525911114,
  'da': -0.0008571672699058119,
  'de': -0.0009523809523810378,
  'diq': 0.0,
  'dv': 0.0,
  'el': -0.0015832153311616803,
  'en': -0.00012661675768255165,
  'eo': -0.001

In [20]:
# we check which languages are the most different
diffs = []
for task_name in difference_eu_base:
    for lang_name in difference_eu_base[task_name]:
        diffs.append((task_name, lang_name, difference_eu_base[task_name][lang_name]))
diffs = sorted(diffs, key=lambda x: x[2], reverse=True)
diffs[:10]

[('pos', 'hu', 0.041604498656944155),
 ('pos', 'et', 0.039321340863233356),
 ('ner', 'sah', 0.03072895135731013),
 ('ner', 'ku', 0.029540836364257195),
 ('ner', 'as', 0.025723121702746987),
 ('ner', 'arz', 0.022140221402214),
 ('ner', 'so', 0.018475064603320712),
 ('ner', 'ne', 0.015810276679841917),
 ('ner', 'bar', 0.015622647191687933),
 ('qa', 'th', 0.01462051487261562)]

In [21]:
# The ten (task, language) pairs with the largest decrease (eu minus base),
# i.e. the same list as above sorted in ascending order.
pairs = []
for task_name, task_diffs in difference_eu_base.items():
    pairs.extend((task_name, lang_name, delta) for lang_name, delta in task_diffs.items())
diffs = sorted(pairs, key=lambda entry: entry[-1])
diffs[:10]

[('ner', 'yo', -0.04074374123179664),
 ('ner', 'war', -0.039820904334705476),
 ('ner', 'vls', -0.0280930142521838),
 ('ner', 'mn', -0.027401129943502855),
 ('qa', 'zh', -0.020876145329926765),
 ('ner', 'ilo', -0.020256540513081123),
 ('pos', 'olo', -0.01989618111777658),
 ('ner', 'ceb', -0.014160068092130773),
 ('ner', 'fo', -0.013967896367220733),
 ('ner', 'eu', -0.012945043523727162)]

# Unexpected result
Adding more adapters does not seem to be beneficial.
We see an overall decrease in score, even though the difference is very small.
Surprisingly, Basque itself is among the languages with the GREATEST decrease in performance (NER, about -0.013) when the Basque adapter is taken into account!