In [7]:
import os
import json
import math
import pandas as pd

# From huggingface api, with our trained adapters (eu, sr)
existing_adapters = [
    "th",
    "my",
    "hi",
    "ilo",
    "ht",
    "tr",
    "mi",
    "vi",
    "is",
    "it",
    "ta",
    "jv",
    "ja",
    "sw",
    "qu",
    "de",
    "el",
    "et",
    "ru",
    "gn",
    "id",
    "en",
    "ar",
    "es",
    "tk",
    "zh",
    "mhr",
    "cdo",
    "xmf",
    "eu",
    "sr",
]
# scraped from cc-100 website
xlm_included_langs = [
    "af",
    "am",
    "ar",
    "as",
    "az",
    "be",
    "bg",
    "bn",
    "br",
    "bs",
    "ca",
    "cs",
    "cy",
    "da",
    "de",
    "el",
    "en",
    "eo",
    "es",
    "et",
    "eu",
    "fa",
    "ff",
    "fi",
    "fr",
    "fy",
    "ga",
    "gd",
    "gl",
    "gn",
    "gu",
    "ha",
    "he",
    "hi",
    "hr",
    "ht",
    "hu",
    "hy",
    "id",
    "ig",
    "is",
    "it",
    "ja",
    "jv",
    "ka",
    "kk",
    "km",
    "kn",
    "ko",
    "ku",
    "ky",
    "la",
    "lg",
    "li",
    "ln",
    "lo",
    "lt",
    "lv",
    "mg",
    "mk",
    "ml",
    "mn",
    "mr",
    "ms",
    "my",
    "ne",
    "nl",
    "no",
    "ns",
    "om",
    "or",
    "pa",
    "pl",
    "ps",
    "pt",
    "qu",
    "rm",
    "ro",
    "ru",
    "sa",
    "si",
    "sc",
    "sd",
    "sk",
    "sl",
    "so",
    "sq",
    "sr",
    "ss",
    "su",
    "sv",
    "sw",
    "ta",
    "te",
    "th",
    "tl",
    "tn",
    "tr",
    "ug",
    "uk",
    "ur",
    "uz",
    "vi",
    "wo",
    "xh",
    "yi",
    "yo",
    "zu",
    "zh",
]
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1", "sib": "eval_accuracy"}
tasks = list(f1.keys())
scores = {task: {} for task in tasks}
inf = math.inf
for file in os.listdir("../eval_scores/selected"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/selected", file), "r") as f:
                data = json.load(f)
                task_name = file.split(".")[0]

                scores[task_name] = data

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)
langs = set()
for task in tasks:
    for lang_name in scores[task]:
        if "reconstructed_featural" not in scores[task][lang_name].keys():
            continue
        langs.add(lang_name)

# Analysing task - language pairs
We will make a dataframe with columns corresponding to tasks,
and rows corresponding to the languages in our data

In [8]:
# we make a code - english language mapping
from qq import LanguageData, TagType

ld = LanguageData.from_db()

lang_en = {}
for lang in langs:
    try:
        eng = ld.get(lang, tag_type=TagType.BCP_47_CODE).english_name
        lang_en[lang] = eng
    except KeyError:
        print(f"KeyError for language: {lang}")

In [9]:
# Define your columns
rows = []
for lang in langs:
    row = {"Language": f"{lang_en[lang]} ({lang})"}
    for task in tasks:
        row[task] = lang in scores.get(task, [])
    print(f"{lang} has data for {[row[task] for task in tasks]} tasks.")
    rows.append(row)

df = pd.DataFrame(rows, columns=["Language"] + tasks)
df

yue has data for [False, False, True, False, True] tasks.
ta has data for [True, True, True, False, True] tasks.
taq has data for [False, False, False, False, True] tasks.
lmo has data for [True, False, False, False, True] tasks.
so has data for [True, False, False, False, True] tasks.
npi has data for [False, False, False, False, True] tasks.
ay has data for [True, False, False, False, False] tasks.
nds has data for [True, False, False, False, False] tasks.
ne has data for [True, False, False, False, False] tasks.
sq has data for [True, False, True, False, False] tasks.
sv has data for [True, False, True, False, True] tasks.
cy has data for [True, False, True, False, True] tasks.
nus has data for [False, False, False, False, True] tasks.
lb has data for [True, False, False, False, True] tasks.
ory has data for [False, False, False, False, True] tasks.
ja has data for [True, False, True, False, True] tasks.
si has data for [True, False, False, False, True] tasks.
cv has data for [True,

Unnamed: 0,Language,ner,copa,pos,qa,sib
0,Yue Chinese (yue),False,False,True,False,True
1,Tamil (ta),True,True,True,False,True
2,Tamasheq (taq),False,False,False,False,True
3,Lombard (lmo),True,False,False,False,True
4,Somali (so),True,False,False,False,True
...,...,...,...,...,...,...
229,Arabic (ar),True,False,True,True,False
230,Swiss German (gsw),False,False,True,False,False
231,Dhivehi (dv),True,False,False,False,False
232,Shona (sn),False,False,False,False,True


In [10]:
# from this dataframe, we pop all the rows that are present in only 1 task, and store them in a task_exclusive dict
task_exclusive = {task: [] for task in tasks}
for index, row in df.iterrows():
    task_count = sum(row[task] for task in tasks)
    if task_count == 1:
        for task in tasks:
            if row[task]:
                task_exclusive[task].append(row["Language"])
                break
df_filtered = df[~df.apply(lambda row: sum(row[task] for task in tasks) == 1, axis=1)]
df_filtered

Unnamed: 0,Language,ner,copa,pos,qa,sib
0,Yue Chinese (yue),False,False,True,False,True
1,Tamil (ta),True,True,True,False,True
3,Lombard (lmo),True,False,False,False,True
4,Somali (so),True,False,False,False,True
9,Albanian (sq),True,False,True,False,False
...,...,...,...,...,...,...
217,Norwegian (no),True,False,True,False,False
219,Punjabi (pa),True,False,False,False,True
221,Chinese (zh),True,True,True,True,True
226,Bulgarian (bg),True,False,True,False,True


In [12]:
# we print this as latex with 1 and 0 for presence and absence of data
df_filtered = df_filtered.replace({True: 1, False: 0})
# we order the languages alphabetically
df_filtered = df_filtered.sort_values(by="Language").reset_index(drop=True)


# 1) define a little formatter that wraps 1→green, 0→red
def color_binary(x):
    if pd.isna(x):
        return ""
    return r"\textcolor{" + ("green" if x == 1 else "red") + r"}{" + str(int(x)) + r"}"


# 2) build a dict mapping each task‐column to that formatter
formatters = {task: color_binary for task in tasks}

# 3) emit LaTeX, letting our raw commands through
latex = df_filtered.to_latex(
    index=False,
    escape=False,  # don’t escape backslashes
    formatters=formatters,
)
print(latex)

\begin{tabular}{lrrrrr}
\toprule
Language & ner & copa & pos & qa & sib \\
\midrule
Achinese (ace) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor{green}{1} \\
Afrikaans (af) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} \\
Albanian (sq) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Amharic (am) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} \\
Arabic (ar) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{green}{1} & \textcolor{red}{0} \\
Armenian (hy) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} \\
Assamese (as) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor{green}{1} \\
Bambara (bm) & \textcolor{r

In [14]:
import pandas as pd

# (Assuming df_filtered is already prepared, with True/False → 1/0 and sorted by "Language".)

# 1) Find the midpoint and split into two DataFrames
n = len(df_filtered)
mid = (n + 1) // 2  # put the extra row (if odd) into the first half
df1 = df_filtered.iloc[:mid].reset_index(drop=True)
df2 = df_filtered.iloc[mid:].reset_index(drop=True)


# 2) Re-use the same color formatter and formatters dict you already have
def color_binary(x):
    if pd.isna(x):
        return ""
    return r"\textcolor{" + ("green" if x == 1 else "red") + r"}{" + str(int(x)) + r"}"


formatters = {task: color_binary for task in tasks}

# 3) Generate two separate LaTeX tables (no surrounding "table" environment yet)
latex1 = df1.to_latex(index=False, escape=False, formatters=formatters)
latex2 = df2.to_latex(index=False, escape=False, formatters=formatters)

# 4) Combine them into one LaTeX snippet using two minipages
combined = r"""\begin{table}[ht]
\centering
\begin{minipage}{0.48\textwidth}
%s
\end{minipage}\hfill
\begin{minipage}{0.48\textwidth}
%s
\end{minipage}
\end{table}
""" % (latex1, latex2)

print(combined)

\begin{table}[ht]
\centering
\begin{minipage}{0.48\textwidth}
\begin{tabular}{lrrrrr}
\toprule
Language & ner & copa & pos & qa & sib \\
\midrule
Achinese (ace) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor{green}{1} \\
Afrikaans (af) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} \\
Albanian (sq) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{red}{0} \\
Amharic (am) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} \\
Arabic (ar) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{green}{1} & \textcolor{red}{0} \\
Armenian (hy) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{green}{1} \\
Assamese (as) & \textcolor{green}{1} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor

In [16]:
import re

# we first sort the languages alphabetically
task_exclusive = {task: sorted(task_exclusive[task]) for task in tasks}
# we make a table with the languages that are exclusive to one task
task_exclusive_df = pd.DataFrame(
    {"Task": ["NER", "POS", "COPA", "QA", "SIB"], "Languages": [", ".join(task_exclusive[task]) for task in tasks]}
)
task_exclusive_df = task_exclusive_df.sort_values(by="Task").reset_index(drop=True)


# we print this as latex,
# we want there to be a line break after every FOUR languages
def format_languages(languages, max_len=50):
    langs = [lang.strip() for lang in languages.split(",")]
    lines = []
    current = ""
    for lang in langs:
        if not current:
            # start a fresh line
            current = lang
        else:
            # would adding ", "+lang exceed max_len?
            sep = ", "
            if len(current) + len(sep) + len(lang) <= max_len:
                current = f"{current}{sep}{lang}"
            else:
                # commit the current line and start a new one
                lines.append(current)
                current = lang
    # append the last buffer
    if current:
        lines.append(current)

    # join with LaTeX linebreaks, wrap in makecell
    body = r" \\ ".join(lines)
    return r"\makecell[l]{" + body + r"}"


# Apply to your DataFrame just like before
task_exclusive_df["Languages"] = task_exclusive_df["Languages"].apply(lambda cell: format_languages(cell, max_len=80))
latex_exclusive = task_exclusive_df.to_latex(index=False, escape=False)
# 2) insert a \hline after every data row
lines = latex_exclusive.splitlines()
out = []
for line in lines:
    out.append(line)
    # detect a data‐row: has an '&' and ends with '\\'
    if re.match(r"\s*[^\\].*&.*\\\\$", line):
        out.append(r"\hline")
new_latex = "\n".join(out)

print(new_latex)

\begin{tabular}{ll}
\toprule
Task & Languages \\
\hline
\midrule
COPA & \makecell[l]{Akuntsu (aqz), Apurinã (apu), Chukot (ckt), Coptic (cop), Erzya (myv) \\ Komi-Zyrian (kpv), Livvi (olo), Manx (gv), Mbyá Guaraní (gun), Moksha (mdf) \\ Mundurukú (myu), Nigerian Pidgin (pcm), Russia Buriat (bxr), Skolt Sami (sms) \\ Swiss German (gsw), Tupinambá (tpn), Warlpiri (wbp)} \\
\hline
NER & \makecell[l]{Aragonese (an), Aymara (ay), Bavarian (bar), Chechen (ce), Chuvash (cv) \\ Corsican (co), Dhivehi (dv), Dimli (diq), Eastern Mari (mhr), Extremaduran (ext) \\ Frisian (fy), Gan Chinese (gan), Hakka Chinese (hak), Kurmanji Kurdish (ku) \\ Low German (nds), Mazanderani (mzn), Min Dong Chinese (cdo), Mingrelian (xmf) \\ Mongolian (mn), Neapolitan (nap), Nepali (ne), Northern Frisian (frr) \\ Ossetian (os), Pashto (ps), Romansh (rm), Scots (sco), Serbo-Croatian (sh) \\ Uzbek (uz), Veps (vep), Vlaams (vls), Western Panjabi (pnb), Wu Chinese (wuu) \\ Yakut (sah), Zeeuws (zea)} \\
\hline
POS & \makec