In [15]:
import os
import json
import math
import pandas as pd

# From huggingface api, with our trained adapters (eu, sr)
existing_adapters = [
    "th",
    "my",
    "hi",
    "ilo",
    "ht",
    "tr",
    "mi",
    "vi",
    "is",
    "it",
    "ta",
    "jv",
    "ja",
    "sw",
    "qu",
    "de",
    "el",
    "et",
    "ru",
    "gn",
    "id",
    "en",
    "ar",
    "es",
    "tk",
    "zh",
    "mhr",
    "cdo",
    "xmf",
    "eu",
    "sr",
]
# scraped from cc-100 website
xlm_included_langs = [
    "af",
    "am",
    "ar",
    "as",
    "az",
    "be",
    "bg",
    "bn",
    "br",
    "bs",
    "ca",
    "cs",
    "cy",
    "da",
    "de",
    "el",
    "en",
    "eo",
    "es",
    "et",
    "eu",
    "fa",
    "ff",
    "fi",
    "fr",
    "fy",
    "ga",
    "gd",
    "gl",
    "gn",
    "gu",
    "ha",
    "he",
    "hi",
    "hr",
    "ht",
    "hu",
    "hy",
    "id",
    "ig",
    "is",
    "it",
    "ja",
    "jv",
    "ka",
    "kk",
    "km",
    "kn",
    "ko",
    "ku",
    "ky",
    "la",
    "lg",
    "li",
    "ln",
    "lo",
    "lt",
    "lv",
    "mg",
    "mk",
    "ml",
    "mn",
    "mr",
    "ms",
    "my",
    "ne",
    "nl",
    "no",
    "ns",
    "om",
    "or",
    "pa",
    "pl",
    "ps",
    "pt",
    "qu",
    "rm",
    "ro",
    "ru",
    "sa",
    "si",
    "sc",
    "sd",
    "sk",
    "sl",
    "so",
    "sq",
    "sr",
    "ss",
    "su",
    "sv",
    "sw",
    "ta",
    "te",
    "th",
    "tl",
    "tn",
    "tr",
    "ug",
    "uk",
    "ur",
    "uz",
    "vi",
    "wo",
    "xh",
    "yi",
    "yo",
    "zu",
    "zh",
]
scores = {"ner": {}, "pos": {}, "copa": {}, "qa": {}}
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1"}
inf = math.inf
for file in os.listdir("../eval_scores/selected"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/selected", file), "r") as f:
                data = json.load(f)
                task_name = file.split(".")[0]

                scores[task_name] = data

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)
langs = set()
tasks = ["ner", "pos", "copa", "qa"]
for task in tasks:
    for lang_name in scores[task]:
        if "reconstructed_featural" not in scores[task][lang_name].keys():
            continue
        langs.add(lang_name)

# Analysing task - language pairs
We will make a dataframe with columns corresponding to tasks,
and rows corresponding to the languages in our data

In [20]:
# we make a code - english language mapping
from qq import LanguageData, TagType

ld = LanguageData.from_db()

lang_en = {}
for lang in langs:
    try:
        eng = ld.get(lang, tag_type=TagType.BCP_47_CODE).english_name
        lang_en[lang] = eng
    except KeyError:
        print(f"KeyError for language: {lang}")

In [23]:
# Define your columns
rows = []
for lang in langs:
    row = {"Language": f"{lang_en[lang]} ({lang})"}
    for task in tasks:
        row[task] = lang in scores.get(task, [])
    print(f"{lang} has data for {[row[task] for task in tasks]} tasks.")
    rows.append(row)

df = pd.DataFrame(rows, columns=["Language"] + tasks)
df

myv has data for [False, True, False, False] tasks.
oc has data for [True, False, False, False] tasks.
fr has data for [True, True, False, False] tasks.
ps has data for [True, False, False, False] tasks.
ckb has data for [True, False, False, False] tasks.
os has data for [True, False, False, False] tasks.
th has data for [True, True, True, True] tasks.
gun has data for [False, True, False, False] tasks.
cop has data for [False, True, False, False] tasks.
cs has data for [True, True, False, False] tasks.
cy has data for [True, True, False, False] tasks.
hak has data for [True, False, False, False] tasks.
uz has data for [True, False, False, False] tasks.
war has data for [True, False, False, False] tasks.
vls has data for [True, False, False, False] tasks.
nds has data for [True, False, False, False] tasks.
is has data for [True, True, False, False] tasks.
tt has data for [True, False, False, False] tasks.
ur has data for [True, True, False, False] tasks.
olo has data for [False, True, 

Unnamed: 0,Language,ner,pos,copa,qa
0,Erzya (myv),False,True,False,False
1,Occitan (oc),True,False,False,False
2,French (fr),True,True,False,False
3,Pashto (ps),True,False,False,False
4,Sorani Kurdish (ckb),True,False,False,False
...,...,...,...,...,...
152,Mingrelian (xmf),True,False,False,False
153,Mongolian (mn),True,False,False,False
154,Bavarian (bar),True,False,False,False
155,Telugu (te),True,True,False,False


In [24]:
# from this dataframe, we pop all the rows that are present in only 1 task, and store them in a task_exclusive dict
task_exclusive = {task: [] for task in tasks}
for index, row in df.iterrows():
    task_count = sum(row[task] for task in tasks)
    if task_count == 1:
        for task in tasks:
            if row[task]:
                task_exclusive[task].append(row["Language"])
                break
df_filtered = df[~df.apply(lambda row: sum(row[task] for task in tasks) == 1, axis=1)]
df_filtered

Unnamed: 0,Language,ner,pos,copa,qa
2,French (fr),True,True,False,False
6,Thai (th),True,True,True,True
9,Czech (cs),True,True,False,False
10,Welsh (cy),True,True,False,False
16,Icelandic (is),True,True,False,False
18,Urdu (ur),True,True,False,False
20,Slovenian (sl),True,True,False,False
21,Tagalog (tl),True,True,False,False
27,Irish (ga),True,True,False,False
28,Afrikaans (af),True,True,False,False


In [38]:
# we print this as latex with 1 and 0 for presence and absence of data
df_filtered = df_filtered.replace({True: 1, False: 0})
# we order the languages alphabetically
df_filtered = df_filtered.sort_values(by="Language").reset_index(drop=True)


# 1) define a little formatter that wraps 1→green, 0→red
def color_binary(x):
    if pd.isna(x):
        return ""
    return r"\textcolor{" + ("OliveGreen" if x == 1 else "red") + r"}{" + str(int(x)) + r"}"


# 2) build a dict mapping each task‐column to that formatter
formatters = {task: color_binary for task in tasks}

# 3) emit LaTeX, letting our raw commands through
latex = df_filtered.to_latex(
    index=False,
    escape=False,  # don’t escape backslashes
    formatters=formatters,
)
print(latex)

\begin{tabular}{lrrrr}
\toprule
Language & ner & pos & copa & qa \\
\midrule
Afrikaans (af) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Albanian (sq) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Amharic (am) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Arabic (ar) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{OliveGreen}{1} \\
Armenian (hy) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Basque (eu) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Belarusian (be) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Breton (br) & \textcolor{OliveGreen}{1} & \textcolor{OliveGreen}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\


In [41]:
len(task_exclusive["ner"]), len(task_exclusive["pos"]), len(task_exclusive["copa"]), len(task_exclusive["qa"])

(74, 22, 1, 0)

In [51]:
import re

# we first sort the languages alphabetically
task_exclusive = {task: sorted(task_exclusive[task]) for task in tasks}
# we make a table with the languages that are exclusive to one task
task_exclusive_df = pd.DataFrame(
    {"Task": ["NER", "POS", "COPA", "QA"], "Languages": [", ".join(task_exclusive[task]) for task in tasks]}
)
task_exclusive_df = task_exclusive_df.sort_values(by="Task").reset_index(drop=True)


# we print this as latex,
# we want there to be a line break after every FOUR languages
def format_languages(languages, max_len=50):
    langs = [lang.strip() for lang in languages.split(",")]
    lines = []
    current = ""
    for lang in langs:
        if not current:
            # start a fresh line
            current = lang
        else:
            # would adding ", "+lang exceed max_len?
            sep = ", "
            if len(current) + len(sep) + len(lang) <= max_len:
                current = f"{current}{sep}{lang}"
            else:
                # commit the current line and start a new one
                lines.append(current)
                current = lang
    # append the last buffer
    if current:
        lines.append(current)

    # join with LaTeX linebreaks, wrap in makecell
    body = r" \\ ".join(lines)
    return r"\makecell[l]{" + body + r"}"


# Apply to your DataFrame just like before
task_exclusive_df["Languages"] = task_exclusive_df["Languages"].apply(lambda cell: format_languages(cell, max_len=35))
latex_exclusive = task_exclusive_df.to_latex(index=False, escape=False)
# 2) insert a \hline after every data row
lines = latex_exclusive.splitlines()
out = []
for line in lines:
    out.append(line)
    # detect a data‐row: has an '&' and ends with '\\'
    if re.match(r"\s*[^\\].*&.*\\\\$", line):
        out.append(r"\hline")
new_latex = "\n".join(out)

print(new_latex)

\begin{tabular}{ll}
\toprule
Task & Languages \\
\hline
\midrule
COPA & \makecell[l]{Haitian Creole (ht)} \\
\hline
NER & \makecell[l]{Achinese (ace), Aragonese (an) \\ Assamese (as), Aymara (ay) \\ Bashkir (ba), Bavarian (bar) \\ Bengali (bn), Bosnian (bs) \\ Cebuano (ceb), Chechen (ce) \\ Chuvash (cv), Corsican (co) \\ Crimean Tatar (crh), Dhivehi (dv) \\ Dimli (diq), Eastern Mari (mhr) \\ Egyptian Arabic (arz) \\ Esperanto (eo), Extremaduran (ext) \\ Frisian (fy), Friulian (fur) \\ Gan Chinese (gan), Georgian (ka) \\ Guarani (gn), Gujarati (gu) \\ Hakka Chinese (hak), Igbo (ig) \\ Ilocano (ilo), Javanese (jv) \\ Kannada (kn), Khmer (km) \\ Kinyarwanda (rw) \\ Kurmanji Kurdish (ku), Kyrgyz (ky) \\ Lingala (ln), Lombard (lmo) \\ Low German (nds) \\ Luxembourgish (lb), Macedonian (mk) \\ Maori (mi), Mazanderani (mzn) \\ Min Dong Chinese (cdo) \\ Minangkabau (min), Mingrelian (xmf) \\ Mongolian (mn) \\ Myanmar (Burmese) (my) \\ Neapolitan (nap), Nepali (ne) \\ Northern Frisian (frr) \\ 