In [1]:
import os
import json
import math
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Language codes for which a language adapter exists: the codes obtained from the
# Hugging Face API plus our own trained adapters (eu, sr).
# Used further down in this cell to split the scores into languages with and
# without an adapter.
existing_adapters = [
    "th",
    "my",
    "hi",
    "ilo",
    "ht",
    "tr",
    "mi",
    "vi",
    "is",
    "it",
    "ta",
    "jv",
    "ja",
    "sw",
    "qu",
    "de",
    "el",
    "et",
    "ru",
    "gn",
    "id",
    "en",
    "ar",
    "es",
    "tk",
    "zh",
    "mhr",
    "cdo",
    "xmf",
    "eu",
    "sr",
]
# Language codes scraped from the CC-100 website.
# NOTE(review): presumably the languages covered by the XLM-R pre-training data,
# judging by the variable name -- confirm against the source page.
xlm_included_langs = [
    "af",
    "am",
    "ar",
    "as",
    "az",
    "be",
    "bg",
    "bn",
    "br",
    "bs",
    "ca",
    "cs",
    "cy",
    "da",
    "de",
    "el",
    "en",
    "eo",
    "es",
    "et",
    "eu",
    "fa",
    "ff",
    "fi",
    "fr",
    "fy",
    "ga",
    "gd",
    "gl",
    "gn",
    "gu",
    "ha",
    "he",
    "hi",
    "hr",
    "ht",
    "hu",
    "hy",
    "id",
    "ig",
    "is",
    "it",
    "ja",
    "jv",
    "ka",
    "kk",
    "km",
    "kn",
    "ko",
    "ku",
    "ky",
    "la",
    "lg",
    "li",
    "ln",
    "lo",
    "lt",
    "lv",
    "mg",
    "mk",
    "ml",
    "mn",
    "mr",
    "ms",
    "my",
    "ne",
    "nl",
    "no",
    "ns",
    "om",
    "or",
    "pa",
    "pl",
    "ps",
    "pt",
    "qu",
    "rm",
    "ro",
    "ru",
    "sa",
    "si",
    "sc",
    "sd",
    "sk",
    "sl",
    "so",
    "sq",
    "sr",
    "ss",
    "su",
    "sv",
    "sw",
    "ta",
    "te",
    "th",
    "tl",
    "tn",
    "tr",
    "ug",
    "uk",
    "ur",
    "uz",
    "vi",
    "wo",
    "xh",
    "yi",
    "yo",
    "zu",
    "zh",
]

# Task name -> key of a score entry.
# NOTE(review): presumably the headline metric reported for each task -- confirm.
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1", "sib": "eval_accuracy"}
tasks = f1.keys()
# Every known task starts out empty; a task without a score file simply stays empty.
scores = {task: {} for task in tasks}
inf = math.inf
# sorted(): os.listdir returns entries in arbitrary order, so sort to keep the
# load order (and the insertion order of `scores`) reproducible across machines.
for file in sorted(os.listdir("../eval_scores/selected")):
    if file.endswith(".json"):
        try:
            # Explicit UTF-8: without it the platform's locale encoding is used.
            with open(os.path.join("../eval_scores/selected", file), "r", encoding="utf-8") as f:
                data = json.load(f)
                # The part of the file name before the first dot is used as the task name.
                task_name = file.split(".")[0]

                scores[task_name] = data

        except json.JSONDecodeError:
            # Best effort: report the broken file and carry on with the others.
            print(f"Error decoding JSON for file: {file}")

# Partition the scores of every task by adapter availability:
#   scores_subset -> languages listed in `existing_adapters`
#   no_adapter    -> all remaining languages
scores_subset = {}
no_adapter = {}
for task_name in scores:
    scores_subset[task_name] = {}
    no_adapter[task_name] = {}
    for lang_name in scores[task_name]:
        # Pick the destination once, then copy the (shared, not duplicated) entry.
        destination = scores_subset if lang_name in existing_adapters else no_adapter
        destination[task_name][lang_name] = scores[task_name][lang_name]

In [2]:
def get_significance(task_name, option1, option2, alternative="two-sided", data=None):
    """Paired t-test between two score options over every language that has both.

    Parameters
    ----------
    task_name : str or iterable of str
        A single task name, several task names, or ``"all"`` for every task in ``data``.
    option1, option2 : str
        Keys of the two score entries to compare. Only (task, language) pairs that
        contain both keys enter the test.
    alternative : str
        Forwarded to ``scipy.stats.ttest_rel``.
    data : dict, optional
        Nested mapping ``task -> language -> option -> score``. When omitted, the
        notebook-level ``scores`` is looked up at call time, so re-running the
        loading cell is picked up without re-defining this function.

    Returns
    -------
    tuple
        ``(t_stat, p_val)``. Both are NaN when fewer than two paired scores exist.
    """
    if data is None:
        # Resolved here rather than in the signature: a default argument is
        # evaluated once at definition time and would keep pointing at a stale dict.
        data = scores

    if isinstance(task_name, str):
        task_names = list(data.keys()) if task_name == "all" else [task_name]
    else:
        task_names = list(task_name)

    all_scores1 = []
    all_scores2 = []
    for current_task in task_names:
        for lang_name, lang_scores in data[current_task].items():
            if option1 in lang_scores and option2 in lang_scores:
                all_scores1.append(lang_scores[option1])
                all_scores2.append(lang_scores[option2])

    if len(all_scores1) < 2:
        # A paired t-test is undefined for fewer than two pairs; report it instead
        # of letting numpy/scipy emit warnings and return NaN silently.
        print(f"Not enough paired scores for {option1} vs {option2}: {len(all_scores1)} found")
        return float("nan"), float("nan")

    print("average scores")
    print(f"{option1}: {np.mean(all_scores1)}")
    print(f"{option2}: {np.mean(all_scores2)}")
    t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)
    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    return t_stat, p_val


def make_boxplot(tasks, columns):
    """Draw one box per score option, pooling languages over the selected tasks.

    Parameters
    ----------
    tasks : str or iterable of str
        A single task name, several task names, or ``"all"`` for every task in
        the notebook-level ``scores``.
    columns : sequence of str
        Score options to compare. Only languages that have all of them are plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    columns = list(columns)
    # A task given as a plain string must be wrapped, not iterated character by
    # character; anything else is treated as a collection of task names.
    if isinstance(tasks, str):
        task_names = list(scores.keys()) if tasks == "all" else [tasks]
    else:
        task_names = list(tasks)

    rows = []
    for task in task_names:
        for lang_name, lang_scores in scores[task].items():
            if all(col in lang_scores for col in columns):
                rows.append([task, lang_name] + [lang_scores[col] for col in columns])
    df = pd.DataFrame(rows, columns=["task", "lang_name"] + columns)
    # we divide the scores of qa by 100
    # NOTE(review): presumably because qa is reported on a 0-100 scale -- confirm.
    is_qa = df["task"] == "qa"
    df.loc[is_qa, columns] = df.loc[is_qa, columns] / 100
    # long format (one row per task/language/method) is what seaborn expects
    df_melted = df.melt(id_vars=["task", "lang_name"], value_vars=columns, var_name="method", value_name="score")
    # we plot the data
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="method", y="score", data=df_melted)
    plt.title(f"Comparison of methods for {tasks}")
    plt.xlabel("Method")
    plt.ylabel("Score")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [3]:
# One-sided paired t-test over all tasks pooled together: is the score stored under
# `extended` greater than the score stored under each entry of `baselines`?
# NOTE(review): despite its name, `extended` holds the "reconstructed_featural_base" key.
extended = "reconstructed_featural_base"
baselines = ["reconstructed_featural", "reconstructed_featural_eu", "reconstructed_featural_sr"]

sign = []
not_sign = []
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", extended, baseline, alternative="greater")
    label = "all" + "_" + baseline
    # 0.05 significance level; a NaN p-value falls into the "not significant" branch.
    if p_val < 0.05:
        verdict, target_list = "The difference is statistically significant", sign
    else:
        verdict, target_list = "The difference is not statistically significant", not_sign
    print(verdict)
    target_list.append(label)
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.5583546399423083
reconstructed_featural: 0.5573914125833664
t-statistic: 1.980453198159148
p-value: 0.024159499954714683
The difference is statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_eu
average scores
reconstructed_featural_base: 0.5603690382222606
reconstructed_featural_eu: 0.5602105530457461
t-statistic: 0.4441120487736835
p-value: 0.3285914643936275
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_base: 0.5603690382222606
reconstructed_featural_sr: 0.5602831184357288
t-statistic: 0.23770530342269294
p-value: 0.4061109230008323
The difference is not statistically significant
-----------------------------------
sign ['all_reconstructed_featural']

notsign ['all_reconstructed_featural_eu', 'all_reconstructed_featural_sr']

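# Result
Extending with additional adapters is slightly WORSE than using only the base set: the base set scores significantly higher than `reconstructed_featural` (p ≈ 0.024, one-sided paired t-test), while the differences to `reconstructed_featural_eu` and `reconstructed_featural_sr` are not statistically significant.
This is consistent with the result that limiting the number of languages taken into account is useful.
However, the differences are very small, and the analysis pools all languages across all tasks.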

In [7]:
# Same one-sided paired t-test as the pooled comparison, but run separately for every
# task and restricted to `scores_subset` (the languages for which an adapter exists).
# NOTE(review): despite its name, `extended` holds the "reconstructed_featural_base" key.
extended = "reconstructed_featural_base"
baselines = ["reconstructed_featural", "reconstructed_featural_eu", "reconstructed_featural_sr"]

sign = []
not_sign = []
for baseline in baselines:
    for task in tasks:
        print(f"{task}, baseline {baseline}")
        t_stat, p_val = get_significance(task, extended, baseline, alternative="greater", data=scores_subset)
        label = task + "_" + baseline
        # 0.05 significance level; a NaN p-value falls into the "not significant" branch.
        if p_val < 0.05:
            verdict, target_list = "The difference is statistically significant", sign
        else:
            verdict, target_list = "The difference is not statistically significant", not_sign
        print(verdict)
        target_list.append(label)
        print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ner, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.492003487159023
reconstructed_featural: 0.4874836773042384
t-statistic: 2.300219501625802
p-value: 0.014548950619670712
The difference is statistically significant
-----------------------------------
copa, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.5165454545454545
reconstructed_featural: 0.512
t-statistic: 1.9851275525133876
p-value: 0.03761410295255921
The difference is statistically significant
-----------------------------------
pos, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.5598713433606982
reconstructed_featural: 0.5564477748389041
t-statistic: 1.317676208450147
p-value: 0.10206896037089692
The difference is not statistically significant
-----------------------------------
qa, baseline reconstructed_featural
average scores
reconstructed_featural_base: 0.7213627874534232
reconstructed_featural: 0.7214930517311764
t-statistic: -0.

# Checking whether the languages closer to sr and eu benefit from the method

In [8]:
from huggingface_hub import HfApi
from qq import LanguageData, TagType
from urielplus import urielplus

ld = LanguageData.from_db()

api = HfApi()
# Query the Hub for AdapterHub adapters whose name contains "xlm-roberta-base-".
models = api.list_models(author="AdapterHub", library="adapter-transformers", search="xlm-roberta-base-")

# Keep only the "-wiki_pfeiffer" adapters and map each model id to the language
# code embedded between the "xlm-roberta-base-" prefix and that suffix.
to_load = {}
for m in models:
    is_language_adapter = m.modelId.startswith("AdapterHub/xlm-roberta-base-") and m.modelId.endswith(
        "-wiki_pfeiffer"
    )
    if not is_language_adapter:
        continue
    to_load[m.modelId] = m.modelId.split("xlm-roberta-base-")[-1].rsplit("-wiki_pfeiffer", 1)[0]


def get_glots(iso_list):
    """Resolve language codes to their English name and glottocode.

    Each code is looked up as a BCP-47 code in the notebook-level ``ld``. When the
    entry has no glottocode, a hand-maintained table keyed by English name is
    consulted instead.

    Parameters
    ----------
    iso_list : iterable of str
        Language codes to resolve.

    Returns
    -------
    dict
        ``english_name -> (code, glottocode)`` for every code that resolved to both
        a name and a glottocode. If several codes share an English name, the last
        one wins. Codes that could not be resolved are printed, not returned.
    """
    # Manual glottocodes, used only when the lookup yields no glottocode.
    manuals = {
        "Arabic": "arab1267",
        "Swahili": "swah1253",
        "Bengali": "beng1282",
        "Chinese": "mand1415",
        "Persian": "west2369",
        "Yoruba": "ilaa1246",
        "Nepali": "nepa1254",
        "Quechua": "cusc1236",
        "Estonian": "esto1258",
        "Guarani": "east2555",
    }

    glots = {}
    probs = []

    for lang in iso_list:
        # Single lookup per code; name and glottocode come from the same entry.
        entry = ld.get(lang, tag_type=TagType.BCP_47_CODE)
        eng = entry.english_name
        glot = entry.glottocode
        if not glot:
            # Fall back to the manual table; stays falsy when the name is not listed.
            glot = manuals.get(eng, glot)
        if eng and glot:
            glots[eng] = (lang, glot)
        else:
            probs.append(lang)

    print("no glottocodes found for these languages: ", probs)

    return glots


# Glottocodes for the language codes of the adapters collected in `to_load`.
glots = get_glots(to_load.values())

# Gather the language codes of every task in `scores` into one flat list
# (a code that occurs in several tasks is listed once per task).
iso_s = [scores[key].keys() for key in scores.keys()]
iso_list = []
for el in iso_s:
    iso_list.extend(el)

eval_glots = get_glots(iso_list)
u = urielplus.URIELPlus()

no glottocodes found for these languages:  []
no glottocodes found for these languages:  ['az', 'mg', 'ms', 'or', 'kg', 'sc']


In [9]:
def typological_distance(target, glots):
    """Compute the featural distance between ``target`` and every language in ``glots``.

    Parameters
    ----------
    target : str
        Glottocode of the target language.
    glots : dict
        ``language name -> (code, glottocode)``. Modified in place: every language
        whose distance computation ends in ``SystemExit`` is removed.

    Returns
    -------
    dict
        ``code -> distance`` as returned by ``u.new_distance("featural", ...)`` for
        each language whose distance could be computed.
    """
    distances = {}
    unresolved = []
    for name in glots:
        entry = glots[name]
        print(name, entry)
        code, glottocode = entry
        try:
            distances[code] = u.new_distance("featural", [glottocode, target])
        except SystemExit:
            # The distance call can end in SystemExit; record the language and move on.
            print(f"Error: {name} - {glottocode} - {target}")
            unresolved.append(name)

    # Prune after the loop: a dict must not change size while it is being iterated.
    for name in unresolved:
        glots.pop(name)

    return distances

In [10]:
# typological_distance removes languages without a computable distance from the
# mapping it is given, so each target gets its own shallow copy of eval_glots.
eu_glots, sr_glots = eval_glots.copy(), eval_glots.copy()
eu_dists = typological_distance(
    ld.get("eu", TagType.BCP_47_CODE).glottocode,
    eu_glots,
)
sr_dists = typological_distance(
    ld.get("sr", TagType.BCP_47_CODE).glottocode,
    sr_glots,
)

2025-06-03 00:27:22,324 - root - INFO - In new_distance, calculated angular distance for featural with achi1257 and basq1248: 0.009134531021118164 seconds
2025-06-03 00:27:22,332 - root - INFO - In new_distance, calculated angular distance for featural with afri1274 and basq1248: 0.006988048553466797 seconds
2025-06-03 00:27:22,342 - root - INFO - In new_distance, calculated angular distance for featural with tosk1239 and basq1248: 0.00873422622680664 seconds
2025-06-03 00:27:22,350 - root - INFO - In new_distance, calculated angular distance for featural with amha1245 and basq1248: 0.006989479064941406 seconds
2025-06-03 00:27:22,357 - root - INFO - In new_distance, calculated angular distance for featural with arag1245 and basq1248: 0.0060727596282958984 seconds
2025-06-03 00:27:22,367 - root - INFO - In new_distance, calculated angular distance for featural with egyp1253 and basq1248: 0.008660316467285156 seconds
2025-06-03 00:27:22,373 - root - INFO - In new_distance, calculated an

Achinese ('ace', 'achi1257')
Afrikaans ('af', 'afri1274')
Tosk Albanian ('als', 'tosk1239')
Amharic ('am', 'amha1245')
Aragonese ('an', 'arag1245')
Egyptian Arabic ('arz', 'egyp1253')
Arabic ('ar', 'arab1267')
Asturian ('ast', 'astu1245')
Error: Asturian - astu1245 - basq1248
Assamese ('as', 'assa1263')
Aymara ('ay', 'nucl1667')
Bavarian ('bar', 'bava1246')
Bashkir ('ba', 'bash1264')
Belarusian ('be', 'bela1254')
Bulgarian ('bg', 'bulg1262')
Bengali ('bn', 'beng1280')
Tibetan ('bo', 'tibe1272')
Breton ('br', 'bret1244')
Bosnian ('bs', 'bosn1245')
Catalan ('ca', 'stan1289')
Min Dong Chinese ('cdo', 'mind1253')
Cebuano ('ceb', 'cebu1242')
Chechen ('ce', 'chec1245')
Sorani Kurdish ('ckb', 'cent1972')


2025-06-03 00:27:22,496 - root - INFO - In new_distance, calculated angular distance for featural with cors1241 and basq1248: 0.006161212921142578 seconds
2025-06-03 00:27:22,503 - root - INFO - In new_distance, calculated angular distance for featural with crim1257 and basq1248: 0.0060727596282958984 seconds
2025-06-03 00:27:22,507 - root - ERROR - No shared featural features between kash1274 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:22,516 - root - INFO - In new_distance, calculated angular distance for featural with czec1258 and basq1248: 0.00801849365234375 seconds
2025-06-03 00:27:22,524 - root - INFO - In new_distance, calculated angular distance for featural with chuv1255 and basq1248: 0.008144140243530273 seconds
2025-06-03 00:27:22,531 - root - INFO - In new_distance, calculated angular distance for featural with wels1247 and basq1248: 0.005860805511474609 seconds
2025-06-03 00:27:22,538 - root - INFO - I

Corsican ('co', 'cors1241')
Crimean Tatar ('crh', 'crim1257')
Kashubian ('csb', 'kash1274')
Error: Kashubian - kash1274 - basq1248
Czech ('cs', 'czec1258')
Chuvash ('cv', 'chuv1255')
Welsh ('cy', 'wels1247')
Danish ('da', 'dani1285')
German ('de', 'stan1295')
Dimli ('diq', 'diml1238')
Dhivehi ('dv', 'dhiv1236')
Greek ('el', 'mode1248')
English ('en', 'stan1293')
Esperanto ('eo', 'espe1235')
Spanish ('es', 'stan1288')
Estonian ('et', 'esto1258')
Basque ('eu', 'basq1248')
Extremaduran ('ext', 'extr1243')
Persian ('fa', 'west2369')
Finnish ('fi', 'finn1318')
Faroese ('fo', 'faro1244')
Northern Frisian ('frr', 'nort2626')
French ('fr', 'stan1290')
Friulian ('fur', 'friu1240')
Frisian ('fy', 'west2354')
Gan Chinese ('gan', 'ganc1239')
Irish ('ga', 'iris1253')


2025-06-03 00:27:22,685 - root - INFO - In new_distance, calculated angular distance for featural with scot1245 and basq1248: 0.005092144012451172 seconds
2025-06-03 00:27:22,693 - root - INFO - In new_distance, calculated angular distance for featural with gali1258 and basq1248: 0.0072557926177978516 seconds
2025-06-03 00:27:22,703 - root - INFO - In new_distance, calculated angular distance for featural with east2555 and basq1248: 0.009017467498779297 seconds
2025-06-03 00:27:22,711 - root - INFO - In new_distance, calculated angular distance for featural with guja1252 and basq1248: 0.0070035457611083984 seconds
2025-06-03 00:27:22,720 - root - INFO - In new_distance, calculated angular distance for featural with hakk1236 and basq1248: 0.008005380630493164 seconds
2025-06-03 00:27:22,729 - root - INFO - In new_distance, calculated angular distance for featural with hebr1245 and basq1248: 0.008100271224975586 seconds
2025-06-03 00:27:22,739 - root - INFO - In new_distance, calculated 

Scots Gaelic ('gd', 'scot1245')
Galician ('gl', 'gali1258')
Guarani ('gn', 'east2555')
Gujarati ('gu', 'guja1252')
Hakka Chinese ('hak', 'hakk1236')
Hebrew ('he', 'hebr1245')
Hindi ('hi', 'hind1269')
Croatian ('hr', 'croa1245')
Upper Sorbian ('hsb', 'uppe1395')
Error: Upper Sorbian - uppe1395 - basq1248
Hungarian ('hu', 'hung1274')
Armenian ('hy', 'nucl1235')
Indonesian ('id', 'indo1316')
Igbo ('ig', 'nucl1417')
Ilocano ('ilo', 'ilok1237')
Icelandic ('is', 'icel1247')
Italian ('it', 'ital1282')
Japanese ('ja', 'nucl1643')
Javanese ('jv', 'java1254')
Georgian ('ka', 'nucl1302')
Kazakh ('kk', 'kaza1248')
Khmer ('km', 'cent1989')
Kannada ('kn', 'nucl1305')


2025-06-03 00:27:22,874 - root - INFO - In new_distance, calculated angular distance for featural with kore1280 and basq1248: 0.007997274398803711 seconds
2025-06-03 00:27:22,877 - root - ERROR - No shared featural features between kols1241 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:22,885 - root - INFO - In new_distance, calculated angular distance for featural with kurd1259 and basq1248: 0.0060062408447265625 seconds
2025-06-03 00:27:22,892 - root - INFO - In new_distance, calculated angular distance for featural with kirg1245 and basq1248: 0.006993770599365234 seconds
2025-06-03 00:27:22,897 - root - INFO - In new_distance, calculated angular distance for featural with luxe1241 and basq1248: 0.004213809967041016 seconds
2025-06-03 00:27:22,900 - root - ERROR - No shared featural features between ligu1248 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 

Korean ('ko', 'kore1280')
Kölsch ('ksh', 'kols1241')
Error: Kölsch - kols1241 - basq1248
Kurmanji Kurdish ('ku', 'kurd1259')
Kyrgyz ('ky', 'kirg1245')
Luxembourgish ('lb', 'luxe1241')
Ligurian ('lij', 'ligu1248')
Error: Ligurian - ligu1248 - basq1248
Lombard ('lmo', 'lomb1257')
Lingala ('ln', 'ling1263')
Lithuanian ('lt', 'lith1251')
Latvian ('lv', 'latv1249')
Eastern Mari ('mhr', 'east2328')
Minangkabau ('min', 'mina1268')
Maori ('mi', 'maor1246')
Macedonian ('mk', 'mace1250')
Malayalam ('ml', 'mala1464')
Mongolian ('mn', 'mong1331')
Marathi ('mr', 'mara1378')
Maltese ('mt', 'malt1254')
Mirandese ('mwl', 'mira1251')
Error: Mirandese - mira1251 - basq1248
Myanmar (Burmese) ('my', 'nucl1310')
Mazanderani ('mzn', 'maza1291')
Neapolitan ('nap', 'neap1235')
Low German ('nds', 'nort2627')
Nepali ('npi', 'nepa1254')
Dutch ('nl', 'dutc1256')
Norwegian Nynorsk ('nn', 'norw1262')
Error: Norwegian Nynorsk - norw1262 - basq1248
Norwegian ('no', 'norw1258')


2025-06-03 00:27:23,063 - root - INFO - In new_distance, calculated angular distance for featural with occi1239 and basq1248: 0.0070037841796875 seconds
2025-06-03 00:27:23,072 - root - INFO - In new_distance, calculated angular distance for featural with iron1242 and basq1248: 0.007989168167114258 seconds
2025-06-03 00:27:23,080 - root - INFO - In new_distance, calculated angular distance for featural with panj1256 and basq1248: 0.006994724273681641 seconds
2025-06-03 00:27:23,084 - root - ERROR - No shared featural features between penn1240 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:23,093 - root - INFO - In new_distance, calculated angular distance for featural with poli1260 and basq1248: 0.007998943328857422 seconds
2025-06-03 00:27:23,097 - root - ERROR - No shared featural features between piem1238 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:

Occitan ('oc', 'occi1239')
Ossetian ('os', 'iron1242')
Punjabi ('pa', 'panj1256')
Pennsylvania German ('pdc', 'penn1240')
Error: Pennsylvania German - penn1240 - basq1248
Polish ('pl', 'poli1260')
Piemontese ('pms', 'piem1238')
Error: Piemontese - piem1238 - basq1248
Western Panjabi ('pnb', 'west2386')
Pashto ('ps', 'nucl1276')
Portuguese ('pt', 'port1283')
Quechua ('qu', 'cusc1236')
Romansh ('rm', 'roma1326')
Romanian ('ro', 'roma1327')
Russian ('ru', 'russ1263')
Kinyarwanda ('rw', 'kiny1244')
Yakut ('sah', 'yaku1245')
Sicilian ('scn', 'sici1248')
Scots ('sco', 'scot1243')
Sindhi ('sd', 'sind1272')
Serbo-Croatian ('sh', 'sout1528')
Sinhala ('si', 'sinh1246')
Slovak ('sk', 'slov1269')
Error: Slovak - slov1269 - basq1248
Slovenian ('sl', 'slov1268')
Somali ('so', 'soma1255')
Albanian ('sq', 'alba1267')
Serbian ('sr', 'serb1264')


2025-06-03 00:27:23,247 - root - INFO - In new_distance, calculated angular distance for featural with sund1252 and basq1248: 0.008090019226074219 seconds
2025-06-03 00:27:23,255 - root - INFO - In new_distance, calculated angular distance for featural with swed1254 and basq1248: 0.0069942474365234375 seconds
2025-06-03 00:27:23,263 - root - INFO - In new_distance, calculated angular distance for featural with swah1253 and basq1248: 0.007185697555541992 seconds
2025-06-03 00:27:23,266 - root - ERROR - No shared featural features between sile1253 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:23,275 - root - INFO - In new_distance, calculated angular distance for featural with tami1289 and basq1248: 0.008213281631469727 seconds
2025-06-03 00:27:23,284 - root - INFO - In new_distance, calculated angular distance for featural with telu1262 and basq1248: 0.008007049560546875 seconds
2025-06-03 00:27:23,290 - root - INFO - 

Sundanese ('su', 'sund1252')
Swedish ('sv', 'swed1254')
Swahili ('swh', 'swah1253')
Silesian ('szl', 'sile1253')
Error: Silesian - sile1253 - basq1248
Tamil ('ta', 'tami1289')
Telugu ('te', 'telu1262')
Tajik ('tg', 'taji1245')
Thai ('th', 'thai1261')
Turkmen ('tk', 'turk1304')
Tagalog ('tl', 'taga1270')
Turkish ('tr', 'nucl1301')
Tatar ('tt', 'tata1255')
Uyghur ('ug', 'uigh1240')
Ukrainian ('uk', 'ukra1253')
Urdu ('ur', 'urdu1245')
Uzbek ('uz', 'uzbe1247')
Venetian ('vec', 'vene1258')
Error: Venetian - vene1258 - basq1248
Veps ('vep', 'veps1250')
Vietnamese ('vi', 'viet1252')
Vlaams ('vls', 'vlaa1240')
Waray ('war', 'wara1300')
Walloon ('wa', 'wall1255')
Error: Walloon - wall1255 - basq1248
Wu Chinese ('wuu', 'wuch1236')
Mingrelian ('xmf', 'ming1252')
Yiddish ('yi', 'yidd1255')
Error: Yiddish - yidd1255 - basq1248
Yoruba ('yo', 'yoru1245')
Zeeuws ('zea', 'zeeu1238')
Chinese ('zh', 'mand1415')


2025-06-03 00:27:23,434 - root - ERROR - No shared featural features between limb1263 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:23,441 - root - INFO - In new_distance, calculated angular distance for featural with hait1244 and basq1248: 0.0069997310638427734 seconds
2025-06-03 00:27:23,445 - root - ERROR - No shared featural features between assy1241 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:23,447 - root - ERROR - No shared featural features between sout3123 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:23,456 - root - INFO - In new_distance, calculated angular distance for featural with apur1254 and basq1248: 0.008002042770385742 seconds
2025-06-03 00:27:23,464 - root - INFO - In new_distance, calculated angular distance for featural with akun1241 and basq1248: 0.006997108

Limburgan ('li', 'limb1263')
Error: Limburgan - limb1263 - basq1248
Haitian Creole ('ht', 'hait1244')
Assyrian Neo-Aramaic ('aii', 'assy1241')
Error: Assyrian Neo-Aramaic - assy1241 - basq1248
South Levantine Arabic ('ajp', 'sout3123')
Error: South Levantine Arabic - sout3123 - basq1248
Apurinã ('apu', 'apur1254')
Akuntsu ('aqz', 'akun1241')
Bhojpuri ('bho', 'bhoj1244')
Bambara ('bm', 'bamb1269')
Russia Buriat ('bxr', 'russ1264')
Chukot ('ckt', 'chuk1273')
Coptic ('cop', 'copt1239')
Swiss German ('gsw', 'swis1247')
Mbyá Guaraní ('gun', 'mbya1239')
Manx ('gv', 'manx1243')
Khunsari ('kfm', 'khun1255')
Error: Khunsari - khun1255 - basq1248
Northern Kurdish ('kmr', 'nort2641')
Komi-Permyak ('koi', 'komi1269')
Komi-Zyrian ('kpv', 'komi1268')
Karelian ('krl', 'kare1335')
Error: Karelian - kare1335 - basq1248
Moksha ('mdf', 'moks1248')
Mundurukú ('myu', 'mund1330')
Erzya ('myv', 'erzy1239')
Nayini ('nyq', 'nayi1242')
Error: Nayini - nayi1242 - basq1248
Livvi ('olo', 'livv1243')
Nigerian Pidgi

2025-06-03 00:27:23,618 - root - INFO - In new_distance, calculated angular distance for featural with warl1254 and basq1248: 0.0052678585052490234 seconds
2025-06-03 00:27:23,626 - root - INFO - In new_distance, calculated angular distance for featural with nucl1347 and basq1248: 0.007170438766479492 seconds
2025-06-03 00:27:23,635 - root - INFO - In new_distance, calculated angular distance for featural with yuec1235 and basq1248: 0.008247613906860352 seconds
2025-06-03 00:27:23,641 - root - INFO - In new_distance, calculated angular distance for featural with meso1252 and basq1248: 0.0049822330474853516 seconds
2025-06-03 00:27:23,644 - root - ERROR - No shared featural features between taiz1242 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:23,649 - root - INFO - In new_distance, calculated angular distance for featural with tuni1259 and basq1248: 0.002992391586303711 seconds
2025-06-03 00:27:23,653 - root - INFO -

Wolof ('wo', 'nucl1347')
Yue Chinese ('yue', 'yuec1235')
Mesopotamian Arabic ('acm', 'meso1252')
Ta'Izzi-Adeni Arabic ('acq', 'taiz1242')
Error: Ta'Izzi-Adeni Arabic - taiz1242 - basq1248
Tunisian Arabic ('aeb', 'tuni1259')
Twi ('tw', 'twii1234')
North Levantine Arabic ('apc', 'nort3139')
Standard Arabic ('arb', 'stan1318')
Najdi Arabic ('ars', 'najd1235')
Moroccan Arabic ('ary', 'moro1292')
Awadhi ('awa', 'awad1243')
Central Aymara ('ayr', 'cent2142')
South Azerbaijani ('azb', 'sout2697')
North Azerbaijani ('azj', 'nort2697')
Balinese ('ban', 'bali1278')
Bemba ('bem', 'bemb1257')
Banjar ('bjn', 'banj1239')
Error: Banjar - banj1239 - basq1248
Buginese ('bug', 'bugi1244')
Chokwe ('cjk', 'chok1245')
Error: Chokwe - chok1245 - basq1248
Southwestern Dinka ('dik', 'sout2832')
Error: Southwestern Dinka - sout2832 - basq1248
Dyula ('dyu', 'dyul1238')
Dzongkha ('dz', 'dzon1239')
Ewe ('ee', 'ewee1241')
Fijian ('fj', 'fiji1243')
Fon ('fon', 'fonn1241')
Nigerian Fulfulde ('fuv', 'nige1253')


2025-06-03 00:27:23,809 - root - INFO - In new_distance, calculated angular distance for featural with nige1253 and basq1248: 0.009146928787231445 seconds
2025-06-03 00:27:23,814 - root - INFO - In new_distance, calculated angular distance for featural with west2721 and basq1248: 0.0040760040283203125 seconds
2025-06-03 00:27:23,820 - root - INFO - In new_distance, calculated angular distance for featural with haus1257 and basq1248: 0.006545305252075195 seconds
2025-06-03 00:27:23,823 - root - ERROR - No shared featural features between chha1249 and basq1248 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:23,830 - root - INFO - In new_distance, calculated angular distance for featural with kaby1243 and basq1248: 0.006023406982421875 seconds
2025-06-03 00:27:23,840 - root - INFO - In new_distance, calculated angular distance for featural with kach1280 and basq1248: 0.008281230926513672 seconds
2025-06-03 00:27:23,850 - root - INFO - 

West Central Oromo ('gaz', 'west2721')
Hausa ('ha', 'haus1257')
Chhattisgarhi ('hne', 'chha1249')
Error: Chhattisgarhi - chha1249 - basq1248
Kabyle ('kab', 'kaby1243')
Kachin ('kac', 'kach1280')
Kamba ('kam', 'kamb1297')
Kabiyè ('kbp', 'kabi1261')
Kabuverdianu ('kea', 'kabu1256')
Halh Mongolian ('khk', 'halh1238')
Kikuyu ('ki', 'kiku1240')
Kimbundu ('kmb', 'kimb1241')
Error: Kimbundu - kimb1241 - basq1248
Central Kanuri ('knc', 'cent2050')
Kashmiri ('ks', 'kash1277')
Luganda ('lg', 'gand1255')
Lao ('lo', 'laoo1244')
Latgalian ('ltg', 'east2282')
Error: Latgalian - east2282 - basq1248
Luba-Lulua ('lua', 'luba1249')
Luo ('luo', 'luok1236')
Mizo ('lus', 'lush1249')
Standard Latvian ('lvs', 'stan1325')
Error: Standard Latvian - stan1325 - basq1248
Magahi ('mag', 'maga1260')
Maithili ('mai', 'mait1250')
Meiteilon (Manipuri) ('mni', 'mani1292')
Mossi ('mos', 'moss1236')
Norwegian Bokmål ('nb', 'norw1259')


2025-06-03 00:27:23,995 - root - INFO - In new_distance, calculated angular distance for featural with pedi1238 and basq1248: 0.00408625602722168 seconds
2025-06-03 00:27:24,004 - root - INFO - In new_distance, calculated angular distance for featural with nuer1246 and basq1248: 0.008052825927734375 seconds
2025-06-03 00:27:24,009 - root - INFO - In new_distance, calculated angular distance for featural with nyan1308 and basq1248: 0.003999471664428711 seconds
2025-06-03 00:27:24,016 - root - INFO - In new_distance, calculated angular distance for featural with oriy1255 and basq1248: 0.005995035171508789 seconds
2025-06-03 00:27:24,024 - root - INFO - In new_distance, calculated angular distance for featural with pang1290 and basq1248: 0.006991147994995117 seconds
2025-06-03 00:27:24,029 - root - INFO - In new_distance, calculated angular distance for featural with papi1253 and basq1248: 0.004007816314697266 seconds
2025-06-03 00:27:24,036 - root - INFO - In new_distance, calculated ang

Sepedi ('nso', 'pedi1238')
Nuer ('nus', 'nuer1246')
Chichewa ('ny', 'nyan1308')
Odia ('ory', 'oriy1255')
Pangasinan ('pag', 'pang1290')
Papiamento ('pap', 'papi1253')
Southern Pashto ('pbt', 'sout2649')
Iranian Persian ('pes', 'west2369')
Plateau Malagasy ('plt', 'plat1254')
Dari ('prs', 'dari1249')
Ayacucho Quechua ('quy', 'ayac1239')
Rundi ('rn', 'rund1242')
Santali ('sat', 'sant1410')
Sango ('sg', 'sang1328')
Shan ('shn', 'shan1277')
Samoan ('sm', 'samo1305')
Shona ('sn', 'shon1251')
Swati ('ss', 'swat1243')
Sesotho ('st', 'sout2807')
Tamasheq ('taq', 'tama1365')
Tigrinya ('ti', 'tigr1271')
Tswana ('tn', 'tswa1253')
Tok Pisin ('tpi', 'tokp1240')
Tsonga ('ts', 'tson1249')
Tumbuka ('tum', 'tumb1250')
Error: Tumbuka - tumb1250 - basq1248
Central Atlas Tamazight ('tzm', 'cent2194')
Umbundu ('umb', 'umbu1257')


2025-06-03 00:27:24,188 - root - INFO - In new_distance, calculated angular distance for featural with nort2690 and basq1248: 0.007111787796020508 seconds
2025-06-03 00:27:24,196 - root - INFO - In new_distance, calculated angular distance for featural with xhos1239 and basq1248: 0.006998777389526367 seconds
2025-06-03 00:27:24,203 - root - INFO - In new_distance, calculated angular distance for featural with east2295 and basq1248: 0.005988597869873047 seconds
2025-06-03 00:27:24,210 - root - INFO - In new_distance, calculated angular distance for featural with stan1306 and basq1248: 0.0061702728271484375 seconds
2025-06-03 00:27:24,218 - root - INFO - In new_distance, calculated angular distance for featural with zulu1248 and basq1248: 0.0071675777435302734 seconds
2025-06-03 00:27:24,223 - root - INFO - In new_distance, calculated angular distance for featural with achi1257 and serb1264: 0.003992557525634766 seconds
2025-06-03 00:27:24,228 - root - INFO - In new_distance, calculated 

Northern Uzbek ('uzn', 'nort2690')
Xhosa ('xh', 'xhos1239')
Eastern Yiddish ('ydd', 'east2295')
Standard Malay ('zsm', 'stan1306')
Zulu ('zu', 'zulu1248')
Achinese ('ace', 'achi1257')
Afrikaans ('af', 'afri1274')
Tosk Albanian ('als', 'tosk1239')
Amharic ('am', 'amha1245')
Aragonese ('an', 'arag1245')
Error: Aragonese - arag1245 - serb1264
Egyptian Arabic ('arz', 'egyp1253')
Arabic ('ar', 'arab1267')
Asturian ('ast', 'astu1245')
Error: Asturian - astu1245 - serb1264
Assamese ('as', 'assa1263')
Aymara ('ay', 'nucl1667')
Bavarian ('bar', 'bava1246')
Bashkir ('ba', 'bash1264')
Belarusian ('be', 'bela1254')
Bulgarian ('bg', 'bulg1262')
Bengali ('bn', 'beng1280')
Tibetan ('bo', 'tibe1272')
Breton ('br', 'bret1244')
Bosnian ('bs', 'bosn1245')
Catalan ('ca', 'stan1289')
Min Dong Chinese ('cdo', 'mind1253')
Cebuano ('ceb', 'cebu1242')
Chechen ('ce', 'chec1245')
Sorani Kurdish ('ckb', 'cent1972')
Corsican ('co', 'cors1241')
Crimean Tatar ('crh', 'crim1257')
Kashubian ('csb', 'kash1274')
Error: 

2025-06-03 00:27:24,370 - root - INFO - In new_distance, calculated angular distance for featural with dani1285 and serb1264: 0.003998756408691406 seconds
2025-06-03 00:27:24,377 - root - INFO - In new_distance, calculated angular distance for featural with stan1295 and serb1264: 0.005831003189086914 seconds
2025-06-03 00:27:24,382 - root - INFO - In new_distance, calculated angular distance for featural with diml1238 and serb1264: 0.003542184829711914 seconds
2025-06-03 00:27:24,387 - root - INFO - In new_distance, calculated angular distance for featural with dhiv1236 and serb1264: 0.0041348934173583984 seconds
2025-06-03 00:27:24,392 - root - INFO - In new_distance, calculated angular distance for featural with mode1248 and serb1264: 0.004009246826171875 seconds
2025-06-03 00:27:24,397 - root - INFO - In new_distance, calculated angular distance for featural with stan1293 and serb1264: 0.004120349884033203 seconds
2025-06-03 00:27:24,402 - root - INFO - In new_distance, calculated a

Danish ('da', 'dani1285')
German ('de', 'stan1295')
Dimli ('diq', 'diml1238')
Dhivehi ('dv', 'dhiv1236')
Greek ('el', 'mode1248')
English ('en', 'stan1293')
Esperanto ('eo', 'espe1235')
Spanish ('es', 'stan1288')
Estonian ('et', 'esto1258')
Basque ('eu', 'basq1248')
Extremaduran ('ext', 'extr1243')
Persian ('fa', 'west2369')
Finnish ('fi', 'finn1318')
Faroese ('fo', 'faro1244')
Northern Frisian ('frr', 'nort2626')
Error: Northern Frisian - nort2626 - serb1264
French ('fr', 'stan1290')
Friulian ('fur', 'friu1240')
Error: Friulian - friu1240 - serb1264
Frisian ('fy', 'west2354')
Gan Chinese ('gan', 'ganc1239')
Irish ('ga', 'iris1253')
Scots Gaelic ('gd', 'scot1245')
Galician ('gl', 'gali1258')
Guarani ('gn', 'east2555')
Gujarati ('gu', 'guja1252')
Hakka Chinese ('hak', 'hakk1236')
Hebrew ('he', 'hebr1245')
Hindi ('hi', 'hind1269')
Croatian ('hr', 'croa1245')
Upper Sorbian ('hsb', 'uppe1395')
Error: Upper Sorbian - uppe1395 - serb1264
Hungarian ('hu', 'hung1274')
Armenian ('hy', 'nucl1235

2025-06-03 00:27:24,559 - root - INFO - In new_distance, calculated angular distance for featural with nucl1302 and serb1264: 0.005003452301025391 seconds
2025-06-03 00:27:24,564 - root - INFO - In new_distance, calculated angular distance for featural with kaza1248 and serb1264: 0.004088401794433594 seconds
2025-06-03 00:27:24,570 - root - INFO - In new_distance, calculated angular distance for featural with cent1989 and serb1264: 0.0039827823638916016 seconds
2025-06-03 00:27:24,575 - root - INFO - In new_distance, calculated angular distance for featural with nucl1305 and serb1264: 0.0051441192626953125 seconds
2025-06-03 00:27:24,579 - root - INFO - In new_distance, calculated angular distance for featural with kore1280 and serb1264: 0.0029892921447753906 seconds
2025-06-03 00:27:24,582 - root - ERROR - No shared featural features between kols1241 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:24,587 - root - INFO 

Georgian ('ka', 'nucl1302')
Kazakh ('kk', 'kaza1248')
Khmer ('km', 'cent1989')
Kannada ('kn', 'nucl1305')
Korean ('ko', 'kore1280')
Kölsch ('ksh', 'kols1241')
Error: Kölsch - kols1241 - serb1264
Kurmanji Kurdish ('ku', 'kurd1259')
Kyrgyz ('ky', 'kirg1245')
Luxembourgish ('lb', 'luxe1241')
Ligurian ('lij', 'ligu1248')
Error: Ligurian - ligu1248 - serb1264
Lombard ('lmo', 'lomb1257')
Lingala ('ln', 'ling1263')
Lithuanian ('lt', 'lith1251')
Latvian ('lv', 'latv1249')
Eastern Mari ('mhr', 'east2328')
Minangkabau ('min', 'mina1268')
Maori ('mi', 'maor1246')
Macedonian ('mk', 'mace1250')
Malayalam ('ml', 'mala1464')
Mongolian ('mn', 'mong1331')
Marathi ('mr', 'mara1378')
Maltese ('mt', 'malt1254')
Mirandese ('mwl', 'mira1251')
Error: Mirandese - mira1251 - serb1264
Myanmar (Burmese) ('my', 'nucl1310')
Mazanderani ('mzn', 'maza1291')
Neapolitan ('nap', 'neap1235')
Low German ('nds', 'nort2627')
Error: Low German - nort2627 - serb1264
Nepali ('npi', 'nepa1254')
Dutch ('nl', 'dutc1256')
Norwegi

2025-06-03 00:27:24,746 - root - INFO - In new_distance, calculated angular distance for featural with nucl1276 and serb1264: 0.005009174346923828 seconds
2025-06-03 00:27:24,752 - root - INFO - In new_distance, calculated angular distance for featural with port1283 and serb1264: 0.00500178337097168 seconds
2025-06-03 00:27:24,757 - root - INFO - In new_distance, calculated angular distance for featural with cusc1236 and serb1264: 0.004119873046875 seconds
2025-06-03 00:27:24,762 - root - INFO - In new_distance, calculated angular distance for featural with roma1326 and serb1264: 0.0040018558502197266 seconds
2025-06-03 00:27:24,767 - root - INFO - In new_distance, calculated angular distance for featural with roma1327 and serb1264: 0.004134416580200195 seconds
2025-06-03 00:27:24,773 - root - INFO - In new_distance, calculated angular distance for featural with russ1263 and serb1264: 0.0048542022705078125 seconds
2025-06-03 00:27:24,779 - root - INFO - In new_distance, calculated angu

Pashto ('ps', 'nucl1276')
Portuguese ('pt', 'port1283')
Quechua ('qu', 'cusc1236')
Romansh ('rm', 'roma1326')
Romanian ('ro', 'roma1327')
Russian ('ru', 'russ1263')
Kinyarwanda ('rw', 'kiny1244')
Yakut ('sah', 'yaku1245')
Sicilian ('scn', 'sici1248')
Scots ('sco', 'scot1243')
Sindhi ('sd', 'sind1272')
Serbo-Croatian ('sh', 'sout1528')
Sinhala ('si', 'sinh1246')
Slovak ('sk', 'slov1269')
Error: Slovak - slov1269 - serb1264
Slovenian ('sl', 'slov1268')
Somali ('so', 'soma1255')
Albanian ('sq', 'alba1267')
Serbian ('sr', 'serb1264')
Sundanese ('su', 'sund1252')
Swedish ('sv', 'swed1254')
Swahili ('swh', 'swah1253')
Silesian ('szl', 'sile1253')
Error: Silesian - sile1253 - serb1264
Tamil ('ta', 'tami1289')
Telugu ('te', 'telu1262')
Tajik ('tg', 'taji1245')
Thai ('th', 'thai1261')
Turkmen ('tk', 'turk1304')
Tagalog ('tl', 'taga1270')
Turkish ('tr', 'nucl1301')
Tatar ('tt', 'tata1255')
Uyghur ('ug', 'uigh1240')
Ukrainian ('uk', 'ukra1253')
Urdu ('ur', 'urdu1245')
Uzbek ('uz', 'uzbe1247')
Ven

2025-06-03 00:27:24,932 - root - INFO - In new_distance, calculated angular distance for featural with veps1250 and serb1264: 0.005001068115234375 seconds
2025-06-03 00:27:24,937 - root - INFO - In new_distance, calculated angular distance for featural with viet1252 and serb1264: 0.003991365432739258 seconds
2025-06-03 00:27:24,943 - root - INFO - In new_distance, calculated angular distance for featural with vlaa1240 and serb1264: 0.004995822906494141 seconds
2025-06-03 00:27:24,950 - root - INFO - In new_distance, calculated angular distance for featural with wara1300 and serb1264: 0.00599980354309082 seconds
2025-06-03 00:27:24,953 - root - ERROR - No shared featural features between wall1255 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:24,958 - root - INFO - In new_distance, calculated angular distance for featural with wuch1236 and serb1264: 0.004009723663330078 seconds
2025-06-03 00:27:24,962 - root - ERROR - N

Error: Venetian - vene1258 - serb1264
Veps ('vep', 'veps1250')
Vietnamese ('vi', 'viet1252')
Vlaams ('vls', 'vlaa1240')
Waray ('war', 'wara1300')
Walloon ('wa', 'wall1255')
Error: Walloon - wall1255 - serb1264
Wu Chinese ('wuu', 'wuch1236')
Mingrelian ('xmf', 'ming1252')
Error: Mingrelian - ming1252 - serb1264
Yiddish ('yi', 'yidd1255')
Error: Yiddish - yidd1255 - serb1264
Yoruba ('yo', 'yoru1245')
Zeeuws ('zea', 'zeeu1238')
Chinese ('zh', 'mand1415')
Limburgan ('li', 'limb1263')
Error: Limburgan - limb1263 - serb1264
Haitian Creole ('ht', 'hait1244')
Assyrian Neo-Aramaic ('aii', 'assy1241')
Error: Assyrian Neo-Aramaic - assy1241 - serb1264
South Levantine Arabic ('ajp', 'sout3123')
Error: South Levantine Arabic - sout3123 - serb1264
Apurinã ('apu', 'apur1254')
Akuntsu ('aqz', 'akun1241')
Bhojpuri ('bho', 'bhoj1244')
Bambara ('bm', 'bamb1269')
Russia Buriat ('bxr', 'russ1264')
Chukot ('ckt', 'chuk1273')
Coptic ('cop', 'copt1239')
Swiss German ('gsw', 'swis1247')
Mbyá Guaraní ('gun', 'm

2025-06-03 00:27:25,121 - root - INFO - In new_distance, calculated angular distance for featural with yuec1235 and serb1264: 0.002989530563354492 seconds
2025-06-03 00:27:25,127 - root - INFO - In new_distance, calculated angular distance for featural with meso1252 and serb1264: 0.004183769226074219 seconds
2025-06-03 00:27:25,131 - root - ERROR - No shared featural features between taiz1242 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:25,134 - root - ERROR - No shared featural features between tuni1259 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:25,138 - root - INFO - In new_distance, calculated angular distance for featural with twii1234 and serb1264: 0.00405120849609375 seconds
2025-06-03 00:27:25,143 - root - INFO - In new_distance, calculated angular distance for featural with nort3139 and serb1264: 0.0040094852447509766 seconds
2025-06-03 0

Yue Chinese ('yue', 'yuec1235')
Mesopotamian Arabic ('acm', 'meso1252')
Ta'Izzi-Adeni Arabic ('acq', 'taiz1242')
Error: Ta'Izzi-Adeni Arabic - taiz1242 - serb1264
Tunisian Arabic ('aeb', 'tuni1259')
Error: Tunisian Arabic - tuni1259 - serb1264
Twi ('tw', 'twii1234')
North Levantine Arabic ('apc', 'nort3139')
Standard Arabic ('arb', 'stan1318')
Najdi Arabic ('ars', 'najd1235')
Moroccan Arabic ('ary', 'moro1292')
Awadhi ('awa', 'awad1243')
Central Aymara ('ayr', 'cent2142')
South Azerbaijani ('azb', 'sout2697')
North Azerbaijani ('azj', 'nort2697')
Balinese ('ban', 'bali1278')
Bemba ('bem', 'bemb1257')
Banjar ('bjn', 'banj1239')
Error: Banjar - banj1239 - serb1264
Buginese ('bug', 'bugi1244')
Chokwe ('cjk', 'chok1245')
Error: Chokwe - chok1245 - serb1264
Southwestern Dinka ('dik', 'sout2832')
Error: Southwestern Dinka - sout2832 - serb1264
Dyula ('dyu', 'dyul1238')
Dzongkha ('dz', 'dzon1239')
Ewe ('ee', 'ewee1241')
Fijian ('fj', 'fiji1243')
Fon ('fon', 'fonn1241')
Nigerian Fulfulde ('fuv

2025-06-03 00:27:25,305 - root - INFO - In new_distance, calculated angular distance for featural with gand1255 and serb1264: 0.005007266998291016 seconds
2025-06-03 00:27:25,310 - root - INFO - In new_distance, calculated angular distance for featural with laoo1244 and serb1264: 0.004005908966064453 seconds
2025-06-03 00:27:25,313 - root - ERROR - No shared featural features between east2282 and serb1264 for which the two languages have information.
Unable to calculate featural distance.
2025-06-03 00:27:25,318 - root - INFO - In new_distance, calculated angular distance for featural with luba1249 and serb1264: 0.004122257232666016 seconds
2025-06-03 00:27:25,323 - root - INFO - In new_distance, calculated angular distance for featural with luok1236 and serb1264: 0.003999948501586914 seconds
2025-06-03 00:27:25,329 - root - INFO - In new_distance, calculated angular distance for featural with lush1249 and serb1264: 0.004994392395019531 seconds
2025-06-03 00:27:25,332 - root - ERROR - 

Luganda ('lg', 'gand1255')
Lao ('lo', 'laoo1244')
Latgalian ('ltg', 'east2282')
Error: Latgalian - east2282 - serb1264
Luba-Lulua ('lua', 'luba1249')
Luo ('luo', 'luok1236')
Mizo ('lus', 'lush1249')
Standard Latvian ('lvs', 'stan1325')
Error: Standard Latvian - stan1325 - serb1264
Magahi ('mag', 'maga1260')
Maithili ('mai', 'mait1250')
Meiteilon (Manipuri) ('mni', 'mani1292')
Mossi ('mos', 'moss1236')
Norwegian Bokmål ('nb', 'norw1259')
Sepedi ('nso', 'pedi1238')
Nuer ('nus', 'nuer1246')
Chichewa ('ny', 'nyan1308')
Odia ('ory', 'oriy1255')
Pangasinan ('pag', 'pang1290')
Papiamento ('pap', 'papi1253')
Southern Pashto ('pbt', 'sout2649')
Iranian Persian ('pes', 'west2369')
Plateau Malagasy ('plt', 'plat1254')
Dari ('prs', 'dari1249')
Error: Dari - dari1249 - serb1264
Ayacucho Quechua ('quy', 'ayac1239')
Rundi ('rn', 'rund1242')
Santali ('sat', 'sant1410')
Sango ('sg', 'sang1328')
Shan ('shn', 'shan1277')
Samoan ('sm', 'samo1305')
Shona ('sn', 'shon1251')
Swati ('ss', 'swat1243')
Sesotho 

2025-06-03 00:27:25,491 - root - INFO - In new_distance, calculated angular distance for featural with cent2194 and serb1264: 0.00401616096496582 seconds
2025-06-03 00:27:25,496 - root - INFO - In new_distance, calculated angular distance for featural with umbu1257 and serb1264: 0.0039937496185302734 seconds
2025-06-03 00:27:25,501 - root - INFO - In new_distance, calculated angular distance for featural with nort2690 and serb1264: 0.00400996208190918 seconds
2025-06-03 00:27:25,506 - root - INFO - In new_distance, calculated angular distance for featural with xhos1239 and serb1264: 0.004003047943115234 seconds
2025-06-03 00:27:25,511 - root - INFO - In new_distance, calculated angular distance for featural with east2295 and serb1264: 0.003998517990112305 seconds
2025-06-03 00:27:25,516 - root - INFO - In new_distance, calculated angular distance for featural with stan1306 and serb1264: 0.004008054733276367 seconds
2025-06-03 00:27:25,521 - root - INFO - In new_distance, calculated ang

Error: Tumbuka - tumb1250 - serb1264
Central Atlas Tamazight ('tzm', 'cent2194')
Umbundu ('umb', 'umbu1257')
Northern Uzbek ('uzn', 'nort2690')
Xhosa ('xh', 'xhos1239')
Eastern Yiddish ('ydd', 'east2295')
Standard Malay ('zsm', 'stan1306')
Zulu ('zu', 'zulu1248')


In [11]:
# Attach the pre-computed distances to every language record in scores_subset.
# A record only receives "eu_dist" / "sr_dist" when the language has an entry
# in the corresponding distance table; other records are left unchanged.
for task_name in scores_subset:
    for lang_name, lang_record in scores_subset[task_name].items():
        for dist_key, dist_lookup in (("eu_dist", eu_dists), ("sr_dist", sr_dists)):
            if lang_name in dist_lookup:
                lang_record[dist_key] = dist_lookup[lang_name]

In [12]:
# Prepare the "closest languages" subsets used to re-run the significance test:
# for each reference language (eu, sr) keep only the score records of languages
# whose distance is strictly below the median of all computed distances.
median_dist_eu = np.median(list(eu_dists.values()))
median_dist_sr = np.median(list(sr_dists.values()))
print("median distance eu", median_dist_eu)
print("median distance sr", median_dist_sr)

# Same task -> language -> record layout as scores_subset, restricted to the
# languages closer to eu than the median.
subset_eu = {
    task_name: {
        lang_name: lang_record
        for lang_name, lang_record in task_records.items()
        if lang_name in eu_dists and eu_dists[lang_name] < median_dist_eu
    }
    for task_name, task_records in scores_subset.items()
}
# Likewise for the languages closer to sr than the median.
subset_sr = {
    task_name: {
        lang_name: lang_record
        for lang_name, lang_record in task_records.items()
        if lang_name in sr_dists and sr_dists[lang_name] < median_dist_sr
    }
    for task_name, task_records in scores_subset.items()
}

median distance eu 0.592
median distance sr 0.4697


In [13]:
# Significance tests on the languages closest to eu (subset_eu):
# the eu setting is tested against each of the other two settings with a
# one-sided ("greater") alternative, using a 0.05 threshold on the p-value.
# NOTE(review): subset_sr is built in the previous cell but is not used here;
# confirm whether an analogous test on subset_sr was intended.
base = "reconstructed_featural_eu"
baselines = ["reconstructed_featural_base", "reconstructed_featural_sr"]

sign, not_sign = [], []
for baseline in baselines:
    print(f"ALL tasks, baseline {baseline}")
    t_stat, p_val = get_significance("all", base, baseline, alternative="greater", data=subset_eu)
    # Pick the message and the list that records this comparison in one step.
    message, bucket = (
        ("The difference is statistically significant", sign)
        if p_val < 0.05
        else ("The difference is not statistically significant", not_sign)
    )
    print(message)
    bucket.append(f"all_{baseline}")
    print("-----------------------------------")

print("sign", sign)
print("\nnotsign", not_sign)

ALL tasks, baseline reconstructed_featural_base
average scores
reconstructed_featural_eu: 0.670566290884825
reconstructed_featural_base: 0.6708087678464284
t-statistic: -0.37081305756111205
p-value: 0.6436797124917144
The difference is not statistically significant
-----------------------------------
ALL tasks, baseline reconstructed_featural_sr
average scores
reconstructed_featural_eu: 0.670566290884825
reconstructed_featural_sr: 0.6700622854779589
t-statistic: 1.0489033958384446
p-value: 0.15011053912890152
The difference is not statistically significant
-----------------------------------
sign []

notsign ['all_reconstructed_featural_base', 'all_reconstructed_featural_sr']


In [14]:
# Per task and language: score of the eu setting minus score of the base
# setting. Languages missing either of the two scores are skipped.
difference_eu_base = {
    task_name: {
        lang_name: lang_scores["reconstructed_featural_eu"] - lang_scores["reconstructed_featural_base"]
        for lang_name, lang_scores in task_scores.items()
        if "reconstructed_featural_base" in lang_scores and "reconstructed_featural_eu" in lang_scores
    }
    for task_name, task_scores in scores.items()
}
difference_eu_base

{'ner': {'ace': -0.011021847245055227,
  'af': 0.001175962574009648,
  'als': -0.004561523548865409,
  'am': 0.001425601425601497,
  'an': 0.005389832248066773,
  'arz': 0.022140221402214,
  'ar': -0.0011907621544379743,
  'as': 0.025723121702746987,
  'ay': -0.005677924620655861,
  'bar': 0.015622647191687933,
  'ba': -0.008732237834405049,
  'be': -0.0016130765290583993,
  'bg': -0.002204374925134922,
  'bn': -0.0024011779702780567,
  'bo': 0.0,
  'br': -0.007233019619595993,
  'bs': -0.0035284389977650044,
  'ca': -0.0038713937401643195,
  'cdo': -0.004014272970562005,
  'ceb': -0.014160068092130773,
  'ce': 0.0011741859729937631,
  'ckb': 0.0001462273347631049,
  'co': -0.008230452674897193,
  'crh': 0.010921940250562201,
  'cs': 0.0009501219578313957,
  'cv': -0.0018302447952412981,
  'cy': -0.007468860525911114,
  'da': -0.0008571672699058119,
  'de': -0.0009523809523810378,
  'diq': 0.0,
  'dv': 0.0,
  'el': -0.0015832153311616803,
  'en': -0.00012661675768255165,
  'eo': -0.001

In [15]:
# Flatten difference_eu_base into (task, language, difference) triples and show
# the ten largest differences, i.e. where the eu setting gains most over base.
diffs = [
    (task_name, lang_name, delta)
    for task_name, per_lang in difference_eu_base.items()
    for lang_name, delta in per_lang.items()
]
diffs.sort(key=lambda entry: entry[2], reverse=True)
diffs[:10]

[('pos', 'hu', 0.041604498656944155),
 ('pos', 'et', 0.039321340863233356),
 ('ner', 'sah', 0.03072895135731013),
 ('ner', 'ku', 0.029540836364257195),
 ('ner', 'as', 0.025723121702746987),
 ('ner', 'arz', 0.022140221402214),
 ('sib', 'cy', 0.019607843137254943),
 ('ner', 'so', 0.018475064603320712),
 ('ner', 'ne', 0.015810276679841917),
 ('ner', 'bar', 0.015622647191687933)]

In [16]:
# Flatten difference_eu_base into (task, language, difference) triples and show
# the ten smallest differences, i.e. where the eu setting loses most against base.
diffs = [
    (task_name, lang_name, delta)
    for task_name, per_lang in difference_eu_base.items()
    for lang_name, delta in per_lang.items()
]
diffs.sort(key=lambda entry: entry[2])
diffs[:10]

[('ner', 'yo', -0.04074374123179664),
 ('ner', 'war', -0.039820904334705476),
 ('ner', 'vls', -0.0280930142521838),
 ('ner', 'mn', -0.027401129943502855),
 ('qa', 'zh', -0.020876145329926765),
 ('ner', 'ilo', -0.020256540513081123),
 ('pos', 'olo', -0.01989618111777658),
 ('sib', 'kg', -0.019607843137254888),
 ('sib', 'tn', -0.01470588235294118),
 ('sib', 'ht', -0.014705882352941124)]

# Unexpected result
Adding more adapters does not seem to be beneficial.
We see an overall decrease in score, although the difference is very small.

In [17]:
# Build the analysis dataframe: one row per (task, language) that has a score
# for every setting listed in scores_inclusion.
tasks = ["ner", "pos", "copa", "qa"]
scores_inclusion = [
    "reconstructed_featural",
    "reconstructed_featural_sr",
    "reconstructed_featural_eu",
    "reconstructed_featural_base",
]
data = [
    [task, lang_name, *(lang_scores[col] for col in scores_inclusion)]
    for task in tasks
    for lang_name, lang_scores in scores[task].items()
    if all(col in lang_scores for col in scores_inclusion)
]
df = pd.DataFrame(data, columns=["task", "lang_name", *scores_inclusion]).assign(
    # 1/0 flags: language is in xlm_included_langs / in existing_adapters
    xlm_inclusion=lambda d: d["lang_name"].isin(xlm_included_langs).astype("int64"),
    adapter_availability=lambda d: d["lang_name"].isin(existing_adapters).astype("int64"),
    # distances to eu and sr; languages without a computed distance get inf
    eu_dist=lambda d: d["lang_name"].map(lambda lang: eu_dists.get(lang, inf)),
    sr_dist=lambda d: d["lang_name"].map(lambda lang: sr_dists.get(lang, inf)),
    # score change of the sr, eu and extended settings relative to the base setting
    base_sr_diff=lambda d: d["reconstructed_featural_sr"] - d["reconstructed_featural_base"],
    base_eu_diff=lambda d: d["reconstructed_featural_eu"] - d["reconstructed_featural_base"],
    base_extended_diff=lambda d: d["reconstructed_featural"] - d["reconstructed_featural_base"],
)

# Drop the languages for which either distance is unavailable (inf).
df = df[df["eu_dist"].ne(inf) & df["sr_dist"].ne(inf)]

In [18]:
from sklearn import linear_model
import statsmodels.api as sm

# Simple linear regression: does the score change of the sr setting relative to
# the base setting (base_sr_diff) depend on the language's distance to sr (sr_dist)?
X = df[["sr_dist"]]
y = df["base_sr_diff"]
regr = linear_model.LinearRegression().fit(X, y)

# Slope, intercept and R^2 of the scikit-learn fit.
for label, value in (
    ("Coefficients: ", regr.coef_),
    ("Intercept: ", regr.intercept_),
    ("R^2 score: ", regr.score(X, y)),
):
    print(label, value)

# Refit with statsmodels to obtain p-values; add_constant supplies the
# intercept column.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

Coefficients:  [-0.00538413]
Intercept:  0.0015751703368789257
R^2 score:  0.007602209049513586
                            OLS Regression Results                            
Dep. Variable:           base_sr_diff   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     1.724
Date:                Tue, 03 Jun 2025   Prob (F-statistic):              0.191
Time:                        00:28:03   Log-Likelihood:                 761.79
No. Observations:                 227   AIC:                            -1520.
Df Residuals:                     225   BIC:                            -1513.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------

In [19]:
# Simple linear regression: does the score change of the eu setting relative to
# the base setting (base_eu_diff) depend on the language's distance to eu (eu_dist)?
X = df[["eu_dist"]]
y = df["base_eu_diff"]
regr = linear_model.LinearRegression().fit(X, y)

# Slope, intercept and R^2 of the scikit-learn fit.
for label, value in (
    ("Coefficients: ", regr.coef_),
    ("Intercept: ", regr.intercept_),
    ("R^2 score: ", regr.score(X, y)),
):
    print(label, value)

# Refit with statsmodels to obtain p-values; add_constant supplies the
# intercept column.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

Coefficients:  [0.0051436]
Intercept:  -0.0035297287010684847
R^2 score:  0.003972214136946484
                            OLS Regression Results                            
Dep. Variable:           base_eu_diff   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.8973
Date:                Tue, 03 Jun 2025   Prob (F-statistic):              0.345
Time:                        00:28:11   Log-Likelihood:                 760.71
No. Observations:                 227   AIC:                            -1517.
Df Residuals:                     225   BIC:                            -1511.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

In [20]:
# Per task, report the variance, mean and standard deviation of the scores of
# every setting in scores_inclusion.
for task in tasks:
    print(f"Task: {task}")
    # rows of this task, restricted to the score columns
    task_scores = df.loc[df["task"] == task, scores_inclusion]
    for heading, stat_name in (
        ("Variance of scores:", "var"),
        ("Mean of scores:", "mean"),
        ("standard deviation of scores:", "std"),
    ):
        print(heading)
        print(getattr(task_scores, stat_name)())
    print("\n")

Task: ner
Variance of scores:
reconstructed_featural         0.034103
reconstructed_featural_sr      0.034225
reconstructed_featural_eu      0.034172
reconstructed_featural_base    0.034335
dtype: float64
Mean of scores:
reconstructed_featural         0.498283
reconstructed_featural_sr      0.500544
reconstructed_featural_eu      0.500688
reconstructed_featural_base    0.502255
dtype: float64
standard deviation of scores:
reconstructed_featural         0.184669
reconstructed_featural_sr      0.185001
reconstructed_featural_eu      0.184858
reconstructed_featural_base    0.185296
dtype: float64


Task: pos
Variance of scores:
reconstructed_featural         0.043465
reconstructed_featural_sr      0.043807
reconstructed_featural_eu      0.043770
reconstructed_featural_base    0.043223
dtype: float64
Mean of scores:
reconstructed_featural         0.468250
reconstructed_featural_sr      0.469423
reconstructed_featural_eu      0.469425
reconstructed_featural_base    0.468298
dtype: float64
s

# Results
## NER
The overall score is SLIGHTLY lower (<1%) with our extended method, but the variance and standard deviation are lower as well.
For COPA and QA, the scores go up slightly.
For POS, the scores go up when only one additional adapter is considered, but go down compared to the base setting when both are considered.


In [21]:
# Restrict the analysis to NER rows of languages flagged with
# adapter_availability == 1, then regress the eu-vs-base score change
# (base_eu_diff) on the distance to eu (eu_dist).
df_exp = df.copy()
ner_with_adapter = (df_exp["task"] == "ner") & (df_exp["adapter_availability"] == 1)
df_exp = df_exp[ner_with_adapter]
X = df_exp[["eu_dist"]]
y = df_exp["base_eu_diff"]

# NOTE(review): this scikit-learn fit is never printed in this cell; only the
# statsmodels summary below is reported.
regr = linear_model.LinearRegression()
regr.fit(X, y)

# One-sided test of the base setting against the eu setting on NER.
# NOTE(review): the test data is `no_adapter`, while the regression frame above
# keeps adapter_availability == 1 -- confirm these refer to the same subset.
get_significance(
    "ner", "reconstructed_featural_base", "reconstructed_featural_eu", alternative="greater", data=no_adapter
)

# OLS with an explicit intercept column, to obtain p-values for the coefficients.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

average scores
reconstructed_featural_base: 0.50859276228701
reconstructed_featural_eu: 0.507307851555658
t-statistic: 1.289859325036442
p-value: 0.0999666016738178
                            OLS Regression Results                            
Dep. Variable:           base_eu_diff   R-squared:                       0.111
Model:                            OLS   Adj. R-squared:                  0.076
Method:                 Least Squares   F-statistic:                     3.129
Date:                Tue, 03 Jun 2025   Prob (F-statistic):             0.0891
Time:                        00:28:20   Log-Likelihood:                 103.78
No. Observations:                  27   AIC:                            -203.6
Df Residuals:                      25   BIC:                            -201.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t  

# Result
The difference, while not statistically significant, becomes BIGGER as we go further from eu.
Overall, the base setting outperforms the settings in which the new adapters are included.
==> We conclude that the new adapters do not help the results, but rather hurt them.