In [11]:
import os
import json
import math
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Language codes for which a language adapter exists: the ones listed by the
# Hugging Face API plus our own trained adapters (eu, sr).
existing_adapters = (
    "th my hi ilo ht tr mi vi is it ta jv ja sw qu de "
    "el et ru gn id en ar es tk zh mhr cdo xmf eu sr"
).split()
# Language codes scraped from the cc-100 website (order kept as scraped).
xlm_included_langs = (
    "af am ar as az be bg bn br bs "
    "ca cs cy da de el en eo es et "
    "eu fa ff fi fr fy ga gd gl gn "
    "gu ha he hi hr ht hu hy id ig "
    "is it ja jv ka kk km kn ko ku "
    "ky la lg li ln lo lt lv mg mk "
    "ml mn mr ms my ne nl no ns om "
    "or pa pl ps pt qu rm ro ru sa "
    "si sc sd sk sl so sq sr ss su "
    "sv sw ta te th tl tn tr ug uk "
    "ur uz vi wo xh yi yo zu zh"
).split()

# Placeholder entry per task; each one is replaced by the content of the
# matching "<task>.json" file found in the selection directory.
scores = {"ner": {}, "pos": {}, "copa": {}, "qa": {}}
tasks = scores.keys()
# Per-task metric key; not referenced by the cells visible here.
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1"}
# Sentinel used later for languages without a computed distance.
inf = math.inf

for file in os.listdir("../eval_scores/selected"):
    if not file.endswith(".json"):
        continue
    try:
        with open(os.path.join("../eval_scores/selected", file), "r") as f:
            data = json.load(f)
        # the task name is everything before the first dot of the file name
        task_name = file.split(".")[0]
        scores[task_name] = data
    except json.JSONDecodeError:
        print(f"Error decoding JSON for file: {file}")
    except KeyError:
        print("KeyError:", file)
# Split every task's languages into those with an existing adapter and those
# without. The per-language dicts are NOT copied: both views share the very
# same objects that `scores` holds.
scores_subset = {}
no_adapter = {}
for task_name, per_lang in scores.items():
    scores_subset[task_name] = {
        lang_name: entry for lang_name, entry in per_lang.items() if lang_name in existing_adapters
    }
    no_adapter[task_name] = {
        lang_name: entry for lang_name, entry in per_lang.items() if lang_name not in existing_adapters
    }

In [12]:
def get_significance(task_name, option1, option2, alternative="two-sided", data=None):
    """Run a paired t-test between two score options over all languages that have both.

    Parameters
    ----------
    task_name : str or iterable of str
        A single task key, an iterable of task keys, or ``"all"`` for every
        task present in ``data``.
    option1, option2 : str
        Keys of the two scores to compare inside each language's score dict.
        Only languages that contain both keys contribute a pair.
    alternative : str
        Forwarded unchanged to ``scipy.stats.ttest_rel``.
    data : dict, optional
        Nested mapping ``task -> language -> {option: score}``. When omitted,
        the module-level ``scores`` is looked up at call time, so re-running
        the loading cell is picked up without redefining this function.

    Returns
    -------
    tuple
        ``(t_stat, p_val)``. Both are NaN when no language has both options.
    """
    if data is None:
        # Resolved on every call: a default of `data=scores` would freeze
        # whatever dict `scores` pointed to when the function was defined.
        data = scores

    if isinstance(task_name, str):
        task_names = list(data.keys()) if task_name == "all" else [task_name]
    else:
        task_names = list(task_name)

    all_scores1 = []
    all_scores2 = []
    for task in task_names:
        for lang_scores in data[task].values():
            if option1 in lang_scores and option2 in lang_scores:
                all_scores1.append(lang_scores[option1])
                all_scores2.append(lang_scores[option2])

    if not all_scores1:
        # Nothing to pair up: report it instead of feeding empty samples to
        # np.mean / ttest_rel.
        print(f"No language has scores for both {option1} and {option2}")
        return float("nan"), float("nan")

    print("average scores")
    print(f"{option1}: {np.mean(all_scores1)}")
    print(f"{option2}: {np.mean(all_scores2)}")
    t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)
    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    return t_stat, p_val


def make_boxplot(tasks, columns):
    """Draw one box per method in ``columns`` using the module-level ``scores``.

    Parameters
    ----------
    tasks : str or iterable of str
        ``"all"`` for every task in ``scores``, a single task name, or an
        iterable of task names (a one-element list is fine).
    columns : list of str
        Score keys (methods) to compare. A language is included only if its
        score dict contains every one of them.
    """
    columns = list(columns)
    # A string is a single task name (or the "all" shortcut). Checking
    # len(tasks) == 1 cannot tell "ner" from a list: "ner" has length 3 and
    # would be iterated character by character.
    if isinstance(tasks, str):
        task_names = list(scores.keys()) if tasks == "all" else [tasks]
    else:
        task_names = list(tasks)

    rows = [
        [task, lang_name] + [lang_scores[col] for col in columns]
        for task in task_names
        for lang_name, lang_scores in scores[task].items()
        if all(col in lang_scores for col in columns)
    ]
    df = pd.DataFrame(rows, columns=["task", "lang_name"] + columns)

    # QA scores are divided by 100 before plotting
    # (presumably they are stored on a 0-100 scale -- TODO confirm).
    is_qa = df["task"] == "qa"
    if is_qa.any():
        df.loc[is_qa, columns] = df.loc[is_qa, columns] / 100

    # long format: one row per (task, language, method) as seaborn expects
    df_melted = df.melt(id_vars=["task", "lang_name"], value_vars=columns, var_name="method", value_name="score")

    plt.figure(figsize=(10, 6))
    sns.boxplot(x="method", y="score", data=df_melted)
    plt.title(f"Comparison of methods for {tasks}")
    plt.xlabel("Method")
    plt.ylabel("Score")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [13]:
# One-sided paired tests over all tasks and all languages: is the first run
# scored higher than each of the other runs?
# NOTE(review): `extended` holds the "*_base" key while `baselines` holds the
# runs with additional adapters -- the variable names read the other way round.
extended = "reconstructed_featural_base"
baselines = ["reconstructed_featural", "reconstructed_featural_eu", "reconstructed_featural_sr"]

sign, not_sign = [], []
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", extended, baseline, alternative="greater")
    is_significant = p_val < 0.05
    if is_significant:
        verdict = "The difference is statistically significant"
    else:
        verdict = "The difference is not statistically significant"
    print(verdict)
    (sign if is_significant else not_sign).append(f"all_{baseline}")
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.5058487615875525
reconstructed_featural: 0.5037595015953552
t-statistic: 2.9138025357622714
p-value: 0.0019575295560539715
The difference is statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_eu
average scores
reconstructed_featural_base: 0.5042205974641217
reconstructed_featural_eu: 0.5035794215130618
t-statistic: 1.16891359238375
p-value: 0.12180149514837858
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_base: 0.5042205974641217
reconstructed_featural_sr: 0.5035835770013597
t-statistic: 1.1832859546938366
p-value: 0.11893574887789471
The difference is not statistically significant
-----------------------------------
sign ['all_reconstructed_featural']

notsign ['all_reconstructed_featural_eu', 'all_reconstructed_featural_sr']

# Result
Extending with additional adapters is WORSE than using only the base set!
This is consistent with the result that limiting the number of languages taken into account is useful.
However, the differences are very small (only the comparison against the fully extended set is significant at p < 0.05), and the analysis pools all languages across all tasks.

In [14]:
# Same one-sided paired tests as before, restricted to the languages that have
# an existing adapter (`scores_subset`).
# NOTE(review): `extended` holds the "*_base" key while `baselines` holds the
# runs with additional adapters -- the variable names read the other way round.
extended = "reconstructed_featural_base"
baselines = ["reconstructed_featural", "reconstructed_featural_eu", "reconstructed_featural_sr"]

sign, not_sign = [], []
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", extended, baseline, alternative="greater", data=scores_subset)
    is_significant = p_val < 0.05
    if is_significant:
        verdict = "The difference is statistically significant"
    else:
        verdict = "The difference is not statistically significant"
    print(verdict)
    (sign if is_significant else not_sign).append(f"all_{baseline}")
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.5552949616207513
reconstructed_featural: 0.5527708276115005
t-statistic: 2.116519452597469
p-value: 0.018954836285634828
The difference is statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_eu
average scores
reconstructed_featural_base: 0.5529363382122867
reconstructed_featural_eu: 0.5521205430836231
t-statistic: 0.9358906262255084
p-value: 0.1762735480783661
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_base: 0.5529363382122867
reconstructed_featural_sr: 0.5521876081608454
t-statistic: 0.8901195800448133
p-value: 0.18822536738765577
The difference is not statistically significant
-----------------------------------
sign ['all_reconstructed_featural']

notsign ['all_reconstructed_featural_eu', 'all_reconstructed_featural_sr']

# Checking whether the languages closest to sr and eu benefit from the added adapter

In [18]:
from huggingface_hub import HfApi
from qq import LanguageData, TagType
from urielplus import urielplus

ld = LanguageData.from_db()
api = HfApi()

# Ask the Hub for AdapterHub's xlm-roberta-base adapters.
models = api.list_models(author="AdapterHub", library="adapter-transformers", search="xlm-roberta-base-")

# Keep only the "...-wiki_pfeiffer" adapters and map each model id to the
# segment between the model prefix and that suffix (later looked up as a
# BCP-47 language code).
to_load = {}
for m in models:
    model_id = m.modelId
    if not (model_id.startswith("AdapterHub/xlm-roberta-base-") and model_id.endswith("-wiki_pfeiffer")):
        continue
    to_load[model_id] = model_id.split("xlm-roberta-base-")[-1].rsplit("-wiki_pfeiffer", 1)[0]


def get_glots(iso_list):
    """Resolve language codes to glottocodes, keyed by English language name.

    Each code in ``iso_list`` is looked up once in the module-level language
    database ``ld`` as a BCP-47 code. When the database entry has no
    glottocode, the hand-maintained ``manuals`` table is consulted by English
    name; entries that do have a glottocode are never overridden by it.

    Parameters
    ----------
    iso_list : iterable of str
        Language codes to resolve. Repeated codes are allowed and simply
        overwrite their own earlier result.

    Returns
    -------
    dict
        ``english_name -> (code, glottocode)`` for every code that has both an
        English name and a glottocode. The remaining codes are printed.
    """
    # Fallback glottocodes, used only when the database has none.
    manuals = {
        "Arabic": "arab1267",
        "Swahili": "swah1253",
        "Bengali": "beng1282",
        "Chinese": "mand1415",
        "Persian": "west2369",
        "Yoruba": "ilaa1246",
        "Nepali": "nepa1254",
        "Quechua": "cusc1236",
        "Estonian": "esto1258",
        "Guarani": "east2555",
    }

    glots = {}
    probs = []

    for lang in iso_list:
        # one lookup per code; both attributes come from the same entry
        entry = ld.get(lang, tag_type=TagType.BCP_47_CODE)
        eng = entry.english_name
        glot = entry.glottocode or manuals.get(eng)
        if eng and glot:
            glots[eng] = (lang, glot)
        else:
            probs.append(lang)

    print("no glottocodes found for these languages: ", probs)

    return glots


# glottocodes of the languages that have an AdapterHub adapter
glots = get_glots(to_load.values())

# glottocodes of every language that appears in any task's score table
# (a code shared by several tasks shows up once per task in iso_list)
iso_s = [scores[key].keys() for key in scores.keys()]
iso_list = [iso for el in iso_s for iso in el]
eval_glots = get_glots(iso_list)

u = urielplus.URIELPlus()

no glottocodes found for these languages:  []
no glottocodes found for these languages:  ['az', 'mg', 'ms', 'or']


In [20]:
def typological_distance(target, glots):
    """
    Compute the featural distance between ``target`` and every language in ``glots``.

    ``glots`` maps a language name to a ``(code, glottocode)`` pair. Each
    glottocode is compared with ``target`` through ``u.new_distance("featural", ...)``.
    Languages for which that call raises ``SystemExit`` are reported, left out
    of the result, and removed from ``glots`` IN PLACE (pass a copy to keep
    the original mapping intact).

    Returns a dict mapping ``code -> distance``.
    """
    distances = {}
    failed = []
    for name, pair in glots.items():
        print(name, pair)
        code, glottocode = pair
        try:
            distances[code] = u.new_distance("featural", [glottocode, target])
        except SystemExit:
            # the distance call exits instead of raising a regular error
            print(f"Error: {name} - {glottocode} - {target}")
            failed.append(name)

    # drop the languages that could not be compared
    for name in failed:
        glots.pop(name)

    return distances

In [22]:
# typological_distance() deletes failing languages from the mapping it is
# given, so each target works on its own shallow copy of eval_glots.
eu_glots = dict(eval_glots)
sr_glots = dict(eval_glots)

eu_target = ld.get("eu", TagType.BCP_47_CODE).glottocode
eu_dists = typological_distance(eu_target, eu_glots)

sr_target = ld.get("sr", TagType.BCP_47_CODE).glottocode
sr_dists = typological_distance(sr_target, sr_glots)

2025-05-27 10:36:56,021 - root - INFO - In new_distance, calculated angular distance for featural with achi1257 and serb1264: 0.004992246627807617 seconds
2025-05-27 10:36:56,027 - root - INFO - In new_distance, calculated angular distance for featural with afri1274 and serb1264: 0.004000663757324219 seconds
2025-05-27 10:36:56,034 - root - INFO - In new_distance, calculated angular distance for featural with tosk1239 and serb1264: 0.005098104476928711 seconds
2025-05-27 10:36:56,039 - root - INFO - In new_distance, calculated angular distance for featural with amha1245 and serb1264: 0.003988027572631836 seconds
2025-05-27 10:36:56,043 - root - ERROR - No shared featural features between arag1245 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-05-27 10:36:56,048 - root - INFO - In new_distance, calculated angular distance for featural with egyp1253 and serb1264: 0.005009174346923828 seconds
2025-05-27 10:36:56,053 - root - INFO - I

Achinese ('ace', 'achi1257')
Afrikaans ('af', 'afri1274')
Tosk Albanian ('als', 'tosk1239')
Amharic ('am', 'amha1245')
Aragonese ('an', 'arag1245')
Error: Aragonese - arag1245 - serb1264
Egyptian Arabic ('arz', 'egyp1253')
Arabic ('ar', 'arab1267')
Asturian ('ast', 'astu1245')
Error: Asturian - astu1245 - serb1264
Assamese ('as', 'assa1263')
Aymara ('ay', 'nucl1667')
Bavarian ('bar', 'bava1246')
Bashkir ('ba', 'bash1264')
Belarusian ('be', 'bela1254')
Bulgarian ('bg', 'bulg1262')
Bengali ('bn', 'beng1280')
Tibetan ('bo', 'tibe1272')
Breton ('br', 'bret1244')
Bosnian ('bs', 'bosn1245')
Catalan ('ca', 'stan1289')
Min Dong Chinese ('cdo', 'mind1253')
Cebuano ('ceb', 'cebu1242')
Chechen ('ce', 'chec1245')
Sorani Kurdish ('ckb', 'cent1972')
Corsican ('co', 'cors1241')
Crimean Tatar ('crh', 'crim1257')
Kashubian ('csb', 'kash1274')
Error: Kashubian - kash1274 - serb1264
Czech ('cs', 'czec1258')
Chuvash ('cv', 'chuv1255')
Welsh ('cy', 'wels1247')
Danish ('da', 'dani1285')
German ('de', 'stan1

2025-05-27 10:36:56,197 - root - INFO - In new_distance, calculated angular distance for featural with stan1293 and serb1264: 0.0050051212310791016 seconds
2025-05-27 10:36:56,203 - root - INFO - In new_distance, calculated angular distance for featural with espe1235 and serb1264: 0.0050051212310791016 seconds
2025-05-27 10:36:56,208 - root - INFO - In new_distance, calculated angular distance for featural with stan1288 and serb1264: 0.004001617431640625 seconds
2025-05-27 10:36:56,213 - root - INFO - In new_distance, calculated angular distance for featural with esto1258 and serb1264: 0.004262208938598633 seconds
2025-05-27 10:36:56,218 - root - INFO - In new_distance, calculated angular distance for featural with basq1248 and serb1264: 0.004002571105957031 seconds
2025-05-27 10:36:56,223 - root - INFO - In new_distance, calculated angular distance for featural with extr1243 and serb1264: 0.003999233245849609 seconds
2025-05-27 10:36:56,229 - root - INFO - In new_distance, calculated 

English ('en', 'stan1293')
Esperanto ('eo', 'espe1235')
Spanish ('es', 'stan1288')
Estonian ('et', 'esto1258')
Basque ('eu', 'basq1248')
Extremaduran ('ext', 'extr1243')
Persian ('fa', 'west2369')
Finnish ('fi', 'finn1318')
Faroese ('fo', 'faro1244')
Northern Frisian ('frr', 'nort2626')
Error: Northern Frisian - nort2626 - serb1264
French ('fr', 'stan1290')
Friulian ('fur', 'friu1240')
Error: Friulian - friu1240 - serb1264
Frisian ('fy', 'west2354')
Gan Chinese ('gan', 'ganc1239')
Irish ('ga', 'iris1253')
Scots Gaelic ('gd', 'scot1245')
Galician ('gl', 'gali1258')
Guarani ('gn', 'east2555')
Gujarati ('gu', 'guja1252')
Hakka Chinese ('hak', 'hakk1236')
Hebrew ('he', 'hebr1245')
Hindi ('hi', 'hind1269')
Croatian ('hr', 'croa1245')
Upper Sorbian ('hsb', 'uppe1395')
Error: Upper Sorbian - uppe1395 - serb1264
Hungarian ('hu', 'hung1274')
Armenian ('hy', 'nucl1235')
Indonesian ('id', 'indo1316')
Igbo ('ig', 'nucl1417')
Ilocano ('ilo', 'ilok1237')
Icelandic ('is', 'icel1247')
Italian ('it', '

2025-05-27 10:36:56,388 - root - INFO - In new_distance, calculated angular distance for featural with kore1280 and serb1264: 0.004500627517700195 seconds
2025-05-27 10:36:56,392 - root - ERROR - No shared featural features between kols1241 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-05-27 10:36:56,402 - root - INFO - In new_distance, calculated angular distance for featural with kurd1259 and serb1264: 0.007733821868896484 seconds
2025-05-27 10:36:56,409 - root - INFO - In new_distance, calculated angular distance for featural with kirg1245 and serb1264: 0.0062029361724853516 seconds
2025-05-27 10:36:56,417 - root - INFO - In new_distance, calculated angular distance for featural with luxe1241 and serb1264: 0.006638526916503906 seconds
2025-05-27 10:36:56,420 - root - ERROR - No shared featural features between ligu1248 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-05-27 

Korean ('ko', 'kore1280')
Kölsch ('ksh', 'kols1241')
Error: Kölsch - kols1241 - serb1264
Kurmanji Kurdish ('ku', 'kurd1259')
Kyrgyz ('ky', 'kirg1245')
Luxembourgish ('lb', 'luxe1241')
Ligurian ('lij', 'ligu1248')
Error: Ligurian - ligu1248 - serb1264
Lombard ('lmo', 'lomb1257')
Lingala ('ln', 'ling1263')
Lithuanian ('lt', 'lith1251')
Latvian ('lv', 'latv1249')
Eastern Mari ('mhr', 'east2328')
Minangkabau ('min', 'mina1268')
Maori ('mi', 'maor1246')
Macedonian ('mk', 'mace1250')
Malayalam ('ml', 'mala1464')
Mongolian ('mn', 'mong1331')
Marathi ('mr', 'mara1378')
Maltese ('mt', 'malt1254')
Mirandese ('mwl', 'mira1251')
Error: Mirandese - mira1251 - serb1264
Myanmar (Burmese) ('my', 'nucl1310')
Mazanderani ('mzn', 'maza1291')
Neapolitan ('nap', 'neap1235')
Low German ('nds', 'nort2627')
Error: Low German - nort2627 - serb1264
Nepali ('ne', 'east1436')
Dutch ('nl', 'dutc1256')
Norwegian Nynorsk ('nn', 'norw1262')
Error: Norwegian Nynorsk - norw1262 - serb1264
Norwegian ('no', 'norw1258')
O

2025-05-27 10:36:56,573 - root - INFO - In new_distance, calculated angular distance for featural with port1283 and serb1264: 0.004083395004272461 seconds
2025-05-27 10:36:56,578 - root - INFO - In new_distance, calculated angular distance for featural with cusc1236 and serb1264: 0.0039997100830078125 seconds
2025-05-27 10:36:56,584 - root - INFO - In new_distance, calculated angular distance for featural with roma1326 and serb1264: 0.0050776004791259766 seconds
2025-05-27 10:36:56,589 - root - INFO - In new_distance, calculated angular distance for featural with roma1327 and serb1264: 0.004004240036010742 seconds
2025-05-27 10:36:56,594 - root - INFO - In new_distance, calculated angular distance for featural with russ1263 and serb1264: 0.004347801208496094 seconds
2025-05-27 10:36:56,599 - root - INFO - In new_distance, calculated angular distance for featural with kiny1244 and serb1264: 0.00400853157043457 seconds
2025-05-27 10:36:56,604 - root - INFO - In new_distance, calculated a

Portuguese ('pt', 'port1283')
Quechua ('qu', 'cusc1236')
Romansh ('rm', 'roma1326')
Romanian ('ro', 'roma1327')
Russian ('ru', 'russ1263')
Kinyarwanda ('rw', 'kiny1244')
Yakut ('sah', 'yaku1245')
Sicilian ('scn', 'sici1248')
Scots ('sco', 'scot1243')
Sindhi ('sd', 'sind1272')
Serbo-Croatian ('sh', 'sout1528')
Sinhala ('si', 'sinh1246')
Slovak ('sk', 'slov1269')
Error: Slovak - slov1269 - serb1264
Slovenian ('sl', 'slov1268')
Somali ('so', 'soma1255')
Albanian ('sq', 'alba1267')
Serbian ('sr', 'serb1264')
Sundanese ('su', 'sund1252')
Swedish ('sv', 'swed1254')
Swahili ('sw', 'swah1253')
Silesian ('szl', 'sile1253')
Error: Silesian - sile1253 - serb1264
Tamil ('ta', 'tami1289')
Telugu ('te', 'telu1262')
Tajik ('tg', 'taji1245')
Thai ('th', 'thai1261')
Turkmen ('tk', 'turk1304')
Tagalog ('tl', 'taga1270')
Turkish ('tr', 'nucl1301')
Tatar ('tt', 'tata1255')
Uyghur ('ug', 'uigh1240')
Ukrainian ('uk', 'ukra1253')
Urdu ('ur', 'urdu1245')
Uzbek ('uz', 'uzbe1247')
Venetian ('vec', 'vene1258')
E

2025-05-27 10:36:56,761 - root - INFO - In new_distance, calculated angular distance for featural with vlaa1240 and serb1264: 0.004000186920166016 seconds
2025-05-27 10:36:56,767 - root - INFO - In new_distance, calculated angular distance for featural with wara1300 and serb1264: 0.005017757415771484 seconds
2025-05-27 10:36:56,770 - root - ERROR - No shared featural features between wall1255 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-05-27 10:36:56,776 - root - INFO - In new_distance, calculated angular distance for featural with wuch1236 and serb1264: 0.005070686340332031 seconds
2025-05-27 10:36:56,779 - root - ERROR - No shared featural features between ming1252 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-05-27 10:36:56,780 - root - ERROR - Unknown languages: yidd1255.
2025-05-27 10:36:56,787 - root - INFO - In new_distance, calculated angular distance for featural

Vlaams ('vls', 'vlaa1240')
Waray ('war', 'wara1300')
Walloon ('wa', 'wall1255')
Error: Walloon - wall1255 - serb1264
Wu Chinese ('wuu', 'wuch1236')
Mingrelian ('xmf', 'ming1252')
Error: Mingrelian - ming1252 - serb1264
Yiddish ('yi', 'yidd1255')
Error: Yiddish - yidd1255 - serb1264
Yoruba ('yo', 'yoru1245')
Zeeuws ('zea', 'zeeu1238')
Chinese ('zh', 'mand1415')
Limburgan ('li', 'limb1263')
Error: Limburgan - limb1263 - serb1264
Assyrian Neo-Aramaic ('aii', 'assy1241')
Error: Assyrian Neo-Aramaic - assy1241 - serb1264
South Levantine Arabic ('ajp', 'sout3123')
Error: South Levantine Arabic - sout3123 - serb1264
Apurinã ('apu', 'apur1254')
Akuntsu ('aqz', 'akun1241')
Bhojpuri ('bho', 'bhoj1244')
Bambara ('bm', 'bamb1269')
Russia Buriat ('bxr', 'russ1264')
Chukot ('ckt', 'chuk1273')
Coptic ('cop', 'copt1239')
Swiss German ('gsw', 'swis1247')
Mbyá Guaraní ('gun', 'mbya1239')
Manx ('gv', 'manx1243')
Khunsari ('kfm', 'khun1255')
Error: Khunsari - khun1255 - serb1264
Northern Kurdish ('kmr', '

2025-05-27 10:36:56,947 - root - INFO - In new_distance, calculated angular distance for featural with hait1244 and serb1264: 0.004001617431640625 seconds


Haitian Creole ('ht', 'hait1244')


In [24]:
# Store the eu/sr distances next to the scores of every adapter language that
# has one. The inner dicts are the same objects `scores` holds, so the new
# keys become visible there as well.
for per_lang in scores_subset.values():
    for lang_name, entry in per_lang.items():
        if lang_name in eu_dists:
            entry["eu_dist"] = eu_dists[lang_name]
        if lang_name in sr_dists:
            entry["sr_dist"] = sr_dists[lang_name]

In [25]:
# Keep, per target language, only the adapter languages that are strictly
# closer than the median distance. The median is taken over every language
# with a computed distance, not just the adapter languages.
median_dist_eu = np.median(list(eu_dists.values()))
median_dist_sr = np.median(list(sr_dists.values()))
print("median distance eu", median_dist_eu)
print("median distance sr", median_dist_sr)

subset_eu = {
    task_name: {
        lang_name: entry
        for lang_name, entry in per_lang.items()
        if lang_name in eu_dists and eu_dists[lang_name] < median_dist_eu
    }
    for task_name, per_lang in scores_subset.items()
}
subset_sr = {
    task_name: {
        lang_name: entry
        for lang_name, entry in per_lang.items()
        if lang_name in sr_dists and sr_dists[lang_name] < median_dist_sr
    }
    for task_name, per_lang in scores_subset.items()
}

median distance eu 0.5867
median distance sr 0.4646


In [26]:
# One-sided paired tests on the languages closest to Basque (`subset_eu`):
# is the run that adds the eu adapter scored higher than the base run and
# than the run that adds the sr adapter?
base = "reconstructed_featural_eu"
baselines = ["reconstructed_featural_base", "reconstructed_featural_sr"]

sign, not_sign = [], []
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", base, baseline, alternative="greater", data=subset_eu)
    is_significant = p_val < 0.05
    if is_significant:
        verdict = "The difference is statistically significant"
    else:
        verdict = "The difference is not statistically significant"
    print(verdict)
    (sign if is_significant else not_sign).append(f"all_{baseline}")
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural_base
average scores
reconstructed_featural_eu: 0.6159393202082123
reconstructed_featural_base: 0.6164917575087449
t-statistic: -0.7808833941300625
p-value: 0.7796950256740595
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_eu: 0.6159393202082123
reconstructed_featural_sr: 0.615646222253811
t-statistic: 0.5068908152872996
p-value: 0.30785337274677144
The difference is not statistically significant
-----------------------------------
sign []

notsign ['all_reconstructed_featural_base', 'all_reconstructed_featural_sr']


In [27]:
# Per task and language: score with the eu adapter added minus the base score
# (positive = adding the eu adapter helped). Languages missing either score
# are left out.
difference_eu_base = {
    task_name: {
        lang_name: entry["reconstructed_featural_eu"] - entry["reconstructed_featural_base"]
        for lang_name, entry in per_lang.items()
        if "reconstructed_featural_base" in entry and "reconstructed_featural_eu" in entry
    }
    for task_name, per_lang in scores.items()
}
difference_eu_base

{'ner': {'ace': -0.011021847245055227,
  'af': 0.001175962574009648,
  'als': -0.004561523548865409,
  'am': 0.001425601425601497,
  'an': 0.005389832248066773,
  'arz': 0.022140221402214,
  'ar': -0.0011907621544379743,
  'as': 0.025723121702746987,
  'ay': -0.005677924620655861,
  'bar': 0.015622647191687933,
  'ba': -0.008732237834405049,
  'be': -0.0016130765290583993,
  'bg': -0.002204374925134922,
  'bn': -0.0024011779702780567,
  'bo': 0.0,
  'br': -0.007233019619595993,
  'bs': -0.0035284389977650044,
  'ca': -0.0038713937401643195,
  'cdo': -0.004014272970562005,
  'ceb': -0.014160068092130773,
  'ce': 0.0011741859729937631,
  'ckb': 0.0001462273347631049,
  'co': -0.008230452674897193,
  'crh': 0.010921940250562201,
  'cs': 0.0009501219578313957,
  'cv': -0.0018302447952412981,
  'cy': -0.007468860525911114,
  'da': -0.0008571672699058119,
  'de': -0.0009523809523810378,
  'diq': 0.0,
  'dv': 0.0,
  'el': -0.0015832153311616803,
  'en': -0.00012661675768255165,
  'eo': -0.001

In [28]:
# The ten (task, language) pairs that gain the most from adding the eu adapter.
diffs = [
    (task_name, lang_name, delta)
    for task_name, per_lang in difference_eu_base.items()
    for lang_name, delta in per_lang.items()
]
diffs.sort(key=lambda item: item[2], reverse=True)
diffs[:10]

[('pos', 'hu', 0.041604498656944155),
 ('pos', 'et', 0.039321340863233356),
 ('ner', 'sah', 0.03072895135731013),
 ('ner', 'ku', 0.029540836364257195),
 ('ner', 'as', 0.025723121702746987),
 ('ner', 'arz', 0.022140221402214),
 ('ner', 'so', 0.018475064603320712),
 ('ner', 'ne', 0.015810276679841917),
 ('ner', 'bar', 0.015622647191687933),
 ('qa', 'th', 0.01462051487261562)]

In [29]:
# The ten (task, language) pairs that lose the most from adding the eu adapter.
diffs = [
    (task_name, lang_name, delta)
    for task_name, per_lang in difference_eu_base.items()
    for lang_name, delta in per_lang.items()
]
diffs.sort(key=lambda item: item[2], reverse=False)
diffs[:10]

[('ner', 'yo', -0.04074374123179664),
 ('ner', 'war', -0.039820904334705476),
 ('ner', 'vls', -0.0280930142521838),
 ('ner', 'mn', -0.027401129943502855),
 ('qa', 'zh', -0.020876145329926765),
 ('ner', 'ilo', -0.020256540513081123),
 ('pos', 'olo', -0.01989618111777658),
 ('ner', 'ceb', -0.014160068092130773),
 ('ner', 'fo', -0.013967896367220733),
 ('ner', 'eu', -0.012945043523727162)]

# Unexpected result
Adding more adapters does not seem to be beneficial.
We see an overall decrease in score, although the difference is very small.
Surprisingly, Basque itself is among the languages with the GREATEST decrease in performance (NER: about -0.013) when the Basque adapter is taken into account!

In [35]:
# Build one row per (task, language) that has all four scores, then add
# 0/1 flags for XLM inclusion and adapter availability, the distances to
# eu and sr, and the score differences relative to the base run.
tasks = ["ner", "pos", "copa", "qa"]
scores_inclusion = [
    "reconstructed_featural",
    "reconstructed_featural_sr",
    "reconstructed_featural_eu",
    "reconstructed_featural_base",
]
data = [
    [task, lang_name] + [entry[col] for col in scores_inclusion]
    for task in tasks
    for lang_name, entry in scores[task].items()
    if all(col in entry for col in scores_inclusion)
]
df = pd.DataFrame(data, columns=["task", "lang_name"] + scores_inclusion)

# membership flags (1 = yes, 0 = no)
df["xlm_inclusion"] = df["lang_name"].apply(lambda code: int(code in xlm_included_langs))
df["adapter_availability"] = df["lang_name"].apply(lambda code: int(code in existing_adapters))

# distances to eu and sr; inf marks languages without a computed distance
df["eu_dist"] = df["lang_name"].apply(lambda code: eu_dists.get(code, inf))
df["sr_dist"] = df["lang_name"].apply(lambda code: sr_dists.get(code, inf))

# score differences with respect to the base run (positive = better than base)
df["base_sr_diff"] = df["reconstructed_featural_sr"] - df["reconstructed_featural_base"]
df["base_eu_diff"] = df["reconstructed_featural_eu"] - df["reconstructed_featural_base"]
df["base_extended_diff"] = df["reconstructed_featural"] - df["reconstructed_featural_base"]

# rows whose distance to eu or to sr is unknown are dropped
df = df[(df["eu_dist"] != inf) & (df["sr_dist"] != inf)]

In [37]:
from sklearn import linear_model
import statsmodels.api as sm

# Simple linear regression: does the distance to sr explain the score change
# obtained by adding the sr adapter (base_sr_diff)?
X = df.loc[:, ["sr_dist"]]
y = df.loc[:, "base_sr_diff"]  # target (the analysis is also tried with featural_reconstructed and the mad-x scores)
regr = linear_model.LinearRegression().fit(X, y)

# Column glossary for `df` (none of these is a predictor in this regression, which uses sr_dist only):
# - xlm_inclusion: 1 means the language was included in the xlm training, 0 means it was not
# - adapter_availability: 1 means the language has a trained adapter, 0 means it does not
# - reconstructed_featural: the score of the language with our method

# report slope, intercept and R^2 of the sklearn fit
for label, value in (
    ("Coefficients: ", regr.coef_),
    ("Intercept: ", regr.intercept_),
    ("R^2 score: ", regr.score(X, y)),
):
    print(label, value)

# refit with statsmodels (constant column added for the intercept) to obtain the p-values
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

Coefficients:  [-0.00519623]
Intercept:  0.0016712683966647569
R^2 score:  0.007104092669362072
                            OLS Regression Results                            
Dep. Variable:           base_sr_diff   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     1.631
Date:                Tue, 27 May 2025   Prob (F-statistic):              0.203
Time:                        10:41:02   Log-Likelihood:                 773.43
No. Observations:                 230   AIC:                            -1543.
Df Residuals:                     228   BIC:                            -1536.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------

In [38]:
# Simple linear regression: does the distance to eu explain the score change
# obtained by adding the eu adapter (base_eu_diff)?
X = df.loc[:, ["eu_dist"]]
y = df.loc[:, "base_eu_diff"]  # target (the analysis is also tried with featural_reconstructed and the mad-x scores)
regr = linear_model.LinearRegression().fit(X, y)

# Column glossary for `df` (none of these is a predictor in this regression, which uses eu_dist only):
# - xlm_inclusion: 1 means the language was included in the xlm training, 0 means it was not
# - adapter_availability: 1 means the language has a trained adapter, 0 means it does not
# - reconstructed_featural: the score of the language with our method

# report slope, intercept and R^2 of the sklearn fit
for label, value in (
    ("Coefficients: ", regr.coef_),
    ("Intercept: ", regr.intercept_),
    ("R^2 score: ", regr.score(X, y)),
):
    print(label, value)

# refit with statsmodels (constant column added for the intercept) to obtain the p-values
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

Coefficients:  [0.00525993]
Intercept:  -0.003492057219811984
R^2 score:  0.004103285618595076
                            OLS Regression Results                            
Dep. Variable:           base_eu_diff   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9394
Date:                Tue, 27 May 2025   Prob (F-statistic):              0.333
Time:                        10:41:46   Log-Likelihood:                 770.69
No. Observations:                 230   AIC:                            -1537.
Df Residuals:                     228   BIC:                            -1531.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

In [43]:
# Per-task spread of the four score columns: variance, mean and standard deviation.
for task in tasks:
    print(f"Task: {task}")
    # rows belonging to the current task, restricted to the score columns
    task_scores = df.loc[df["task"] == task, scores_inclusion]
    for heading, statistic in (
        ("Variance of scores:", task_scores.var),
        ("Mean of scores:", task_scores.mean),
        ("standard deviation of scores:", task_scores.std),
    ):
        print(heading)
        print(statistic())
    print("\n")

Task: ner
Variance of scores:
reconstructed_featural         0.033755
reconstructed_featural_sr      0.033880
reconstructed_featural_eu      0.033830
reconstructed_featural_base    0.033984
dtype: float64
Mean of scores:
reconstructed_featural         0.499669
reconstructed_featural_sr      0.501983
reconstructed_featural_eu      0.502130
reconstructed_featural_base    0.503562
dtype: float64
standard deviation of scores:
reconstructed_featural         0.183725
reconstructed_featural_sr      0.184066
reconstructed_featural_eu      0.183928
reconstructed_featural_base    0.184348
dtype: float64


Task: pos
Variance of scores:
reconstructed_featural         0.043465
reconstructed_featural_sr      0.043807
reconstructed_featural_eu      0.043770
reconstructed_featural_base    0.043223
dtype: float64
Mean of scores:
reconstructed_featural         0.468250
reconstructed_featural_sr      0.469423
reconstructed_featural_eu      0.469425
reconstructed_featural_base    0.468298
dtype: float64
s

# Results
## NER
SLIGHTLY lower overall score (<1%) with our extended method, but also lower variance and standard deviation.
For COPA and QA, the scores go up slightly.
For POS, the scores go up when only one additional adapter is considered, but go down when both are considered, compared to the base setting.


In [50]:
# NER rows only, restricted to languages flagged as having an adapter (adapter_availability == 1):
# regress the eu-vs-base score change on the distance to eu.
df_exp = df[(df["task"] == "ner") & (df["adapter_availability"] == 1)].copy()
X = df_exp.loc[:, ["eu_dist"]]
y = df_exp.loc[:, "base_eu_diff"]  # target (the analysis is also tried with featural_reconstructed and the mad-x scores)
# sklearn fit; its coefficients are not printed in this cell
regr = linear_model.LinearRegression().fit(X, y)

# Column glossary for `df` (eu_dist is the only predictor in this regression):
# - xlm_inclusion: 1 means the language was included in the xlm training, 0 means it was not
# - adapter_availability: 1 means the language has a trained adapter, 0 means it does not
# - reconstructed_featural: the score of the language with our method

# One-sided comparison (alternative="greater") of the base scores against the eu scores on NER.
# NOTE(review): this call receives `no_adapter`, which is defined outside this cell, while the
# regression here uses the adapter-available subset `df_exp` — confirm this is the intended data.
get_significance(
    "ner", "reconstructed_featural_base", "reconstructed_featural_eu", alternative="greater", data=no_adapter
)

# statsmodels OLS (constant column added for the intercept) to obtain the p-values
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

average scores
reconstructed_featural_base: 0.50859276228701
reconstructed_featural_eu: 0.507307851555658
t-statistic: 1.289859325036442
p-value: 0.0999666016738178
                            OLS Regression Results                            
Dep. Variable:           base_eu_diff   R-squared:                       0.112
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                     3.271
Date:                Tue, 27 May 2025   Prob (F-statistic):             0.0821
Time:                        11:06:22   Log-Likelihood:                 108.14
No. Observations:                  28   AIC:                            -212.3
Df Residuals:                      26   BIC:                            -209.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t  

# Result
The difference, while not statistically significant, becomes BIGGER as we move further away from Basque (eu).
Overall, the base setting outperforms the settings in which the new adapters are included.
==> We conclude that the new adapters do not help the results, but rather hurt them.