# Tabulate results


In [1]:
import os
import sys

import pandas as pd
from tabulate import tabulate
from tqdm import tqdm
sys.path.append('../src')
from read_log_file import read_log_file

In [2]:
LOG_HOME_DIR = os.path.join('../logs/')
assert os.path.isdir(LOG_HOME_DIR)

In [3]:
MODEL_NAMES = ['logistic_regression', 'transformer_encoder', 'bert-base-uncased', 'bert-base-multilingual-cased']

In [4]:
SETUPS = ['zero', 'few50', 'few100', 'few150', 'few200', 'full', 'trg']

In [5]:
def get_best_score_from_dict(di: dict) -> dict:
    """Get max value from a dict"""
    keys_with_max_val = []
    # find max value
    max_val = -float('inf')
    for k, v in di.items():
        if v > max_val:
            max_val = v
    # find all keys with max value
    for k, v in di.items():
        if v == max_val:
            keys_with_max_val.append(k)
    return {
        'k': keys_with_max_val,
        'v': max_val,
    }

In [6]:
def create_best_results_df(langs: str) -> pd.DataFrame:
    results_dict = {}
    for model_name in MODEL_NAMES:
        results_dict[model_name] = {}
        log_dir = os.path.join(LOG_HOME_DIR, langs, model_name)
        log_filenames = os.listdir(log_dir)
        for fname in log_filenames:
            results_dict[model_name][fname] = read_log_file(
                log_file_path=os.path.join(log_dir, fname),
                plot=False,
                verbose=False,
            )['best_val_metrics']['f1']

    best_results_dict = {'Setup': SETUPS}
    best_results_dict.update({model_name: [] for model_name in MODEL_NAMES})
    for model_name in MODEL_NAMES:
        for setup in SETUPS:
            best_results_dict[model_name].append(
                get_best_score_from_dict(
                    {k: v for k, v in results_dict[model_name].items() if k.startswith(f'{setup}_')}
                )['v']
            )

    best_results_df = pd.DataFrame(best_results_dict)
    return best_results_df

In [7]:
def highlight_best_score(df: pd.DataFrame) -> pd.DataFrame:
    """Highlight best score in each row"""
    return df.style.apply(lambda x: ['background: red' if isinstance(v, float) and v == max(x.iloc[1:]) else '' for v in x], axis=1)

In [8]:
def tabulate_markdown(df: pd.DataFrame) -> str:
    """Tabulate in markdown format and bold best scores in each row"""
    df = df.round(4)
    for model_name in MODEL_NAMES:
        df[model_name] = df[model_name].astype(str)
    for idx in range(len(df)):
        max_val = max(float(df.iloc[idx][model_name]) for model_name in MODEL_NAMES)
        for model_name in MODEL_NAMES:
            cell_val = float(df.iloc[idx][model_name])
            if cell_val == max_val:
                df.at[idx, model_name] = f'**{cell_val}**'
            else:
                df.at[idx, model_name] = f'{cell_val}'

    return tabulate(df, headers='keys', showindex=False, tablefmt='github')


In [10]:
best_results_dfs_dict = {}
for langs in tqdm(['enbg', 'enar', 'bgen', 'bgar', 'aren', 'arbg'][:2]):
    best_results_dfs_dict[langs] = create_best_results_df(langs)

100%|██████████| 2/2 [00:00<00:00,  2.60it/s]


## en-bg

In [11]:
highlight_best_score(best_results_dfs_dict['enbg'])

Unnamed: 0,Setup,logistic_regression,transformer_encoder,bert-base-uncased,bert-base-multilingual-cased
0,zero,0.369656,0.803699,0.80272,0.810053
1,few50,0.775659,0.799576,0.810351,0.81859
2,few100,0.781497,0.810259,0.816115,0.822725
3,few150,0.795635,0.819153,0.818532,0.820666
4,few200,0.807414,0.819964,0.815694,0.824915
5,full,0.812499,0.8146,0.822458,0.833575
6,trg,0.813775,0.809617,0.821763,0.843117


In [12]:
print(tabulate_markdown(best_results_dfs_dict['enbg']))

| Setup   |   logistic_regression |   transformer_encoder |   bert-base-uncased | bert-base-multilingual-cased   |
|---------|-----------------------|-----------------------|---------------------|--------------------------------|
| zero    |                0.3697 |                0.8037 |              0.8027 | **0.8101**                     |
| few50   |                0.7757 |                0.7996 |              0.8104 | **0.8186**                     |
| few100  |                0.7815 |                0.8103 |              0.8161 | **0.8227**                     |
| few150  |                0.7956 |                0.8192 |              0.8185 | **0.8207**                     |
| few200  |                0.8074 |                0.82   |              0.8157 | **0.8249**                     |
| full    |                0.8125 |                0.8146 |              0.8225 | **0.8336**                     |
| trg     |                0.8138 |                0.8096 |              0.8218 

## en-ar

In [13]:
highlight_best_score(best_results_dfs_dict['enar'])

Unnamed: 0,Setup,logistic_regression,transformer_encoder,bert-base-uncased,bert-base-multilingual-cased
0,zero,0.422418,0.584852,0.598659,0.672059
1,few50,0.727199,0.689753,0.675045,0.774892
2,few100,0.743307,0.685501,0.691503,0.824014
3,few150,0.718456,0.688997,0.69793,0.791109
4,few200,0.747128,0.739566,0.711928,0.786594
5,full,0.747128,0.739566,0.711928,0.786594
6,trg,0.648866,0.68439,0.734822,0.738182


In [14]:
print(tabulate_markdown(best_results_dfs_dict['enar']))

| Setup   |   logistic_regression |   transformer_encoder |   bert-base-uncased | bert-base-multilingual-cased   |
|---------|-----------------------|-----------------------|---------------------|--------------------------------|
| zero    |                0.4224 |                0.5849 |              0.5987 | **0.6721**                     |
| few50   |                0.7272 |                0.6898 |              0.675  | **0.7749**                     |
| few100  |                0.7433 |                0.6855 |              0.6915 | **0.824**                      |
| few150  |                0.7185 |                0.689  |              0.6979 | **0.7911**                     |
| few200  |                0.7471 |                0.7396 |              0.7119 | **0.7866**                     |
| full    |                0.7471 |                0.7396 |              0.7119 | **0.7866**                     |
| trg     |                0.6489 |                0.6844 |              0.7348 

## bg-en

In [15]:
highlight_best_score(best_results_dfs_dict['bgen'])

KeyError: 'bgen'

In [16]:
print(tabulate_markdown(best_results_dfs_dict['bgen']))

KeyError: 'bgen'

## bg-ar

In [17]:
highlight_best_score(best_results_dfs_dict['bgar'])

KeyError: 'bgar'

In [18]:
print(tabulate_markdown(best_results_dfs_dict['bgar']))


KeyError: 'bgar'

## ar-en

In [19]:
highlight_best_score(best_results_dfs_dict['aren'])

KeyError: 'aren'

In [20]:
print(tabulate_markdown(best_results_dfs_dict['aren']))

KeyError: 'aren'

## ar-bg

In [21]:
highlight_best_score(best_results_dfs_dict['arbg'])

KeyError: 'arbg'

In [22]:
print(tabulate_markdown(best_results_dfs_dict['arbg']))

KeyError: 'arbg'