# Tabulate results


In [1]:
import os
import sys
from typing import Tuple
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import tqdm

sys.path.append('../src')
from read_log_file import read_log_file

In [2]:
# Root directory of the training logs, given as a relative path.
LOG_HOME_DIR = os.path.join('../logs/')
# Fail early if the log directory does not exist.
assert os.path.isdir(LOG_HOME_DIR)

In [3]:
# Model identifiers; used as log sub-directory names and as result-table columns.
MODEL_NAMES = [
    'bert-base-multilingual-cased',
    'bert-base-german-cased',
]
# Training setups; used to build the cross-validation log directory names.
SETUPS = [
    'monotask',
    'multitask',
]

In [4]:
def get_best_score_from_dict(di: dict) -> dict:
    """Find the largest value in ``di`` and every key that maps to it.

    Returns a dict with two entries:
      * ``'k'``: list of all keys whose value equals the maximum, in the
        iteration order of ``di``;
      * ``'v'``: the maximum value itself, or ``-inf`` when no value is
        greater than ``-inf`` (for example when ``di`` is empty).
    """
    best_val = -float('inf')
    best_keys = []
    for key, val in di.items():
        if val > best_val:
            # strictly better score: restart the list of winners
            best_val, best_keys = val, [key]
        elif val == best_val:
            # tie with the current best score
            best_keys.append(key)
    return {'k': best_keys, 'v': best_val}

In [5]:
def create_results_df(log_dir: str) -> pd.DataFrame:
    """Collect the best validation metrics of every log file in ``log_dir``.

    Every directory entry (in sorted filename order) is parsed with
    ``read_log_file``; the ``'best_val_metrics'`` entry of its result supplies
    the metric values.

    Returns a DataFrame with one row per log file and the columns
    ``log_filename``, ``f1``, ``acc``, ``precision`` and ``recall``.
    """
    metric_names = ('f1', 'acc', 'precision', 'recall')
    columns = {'log_filename': []}
    columns.update({metric: [] for metric in metric_names})

    for fname in sorted(os.listdir(log_dir)):
        parsed_log = read_log_file(
            log_file_path=os.path.join(log_dir, fname),
            plot=False,
            verbose=False,
        )
        best_val_metrics = parsed_log['best_val_metrics']
        columns['log_filename'].append(fname)
        for metric in metric_names:
            columns[metric].append(best_val_metrics[metric])

    return pd.DataFrame(columns)

In [6]:
def highlight_best_score(df: pd.DataFrame) -> "pd.io.formats.style.Styler":
    """Highlight the best score in each row.

    A cell gets a red background when it is a float and equals the maximum of
    the row with its first column left out (``row.iloc[1:]``).

    Args:
        df: Table to style.

    Returns:
        The ``Styler`` produced by ``df.style.apply`` (not a plain DataFrame).
    """

    def _style_row(row):
        # Rows without any float cell receive no styling, and their maximum
        # is deliberately not computed.
        if not any(isinstance(cell, float) for cell in row):
            return [''] * len(row)
        # Compute the row maximum once instead of once per cell.
        best = max(row.iloc[1:])
        return ['background: red' if isinstance(cell, float) and cell == best else '' for cell in row]

    return df.style.apply(_style_row, axis=1)

In [7]:
def tabulate_markdown(df: pd.DataFrame) -> str:
    """Tabulate in markdown format and bold best scores in each row.

    Args:
        df: Table that contains one column per entry of ``MODEL_NAMES``.
            The caller's frame is not modified.

    Returns:
        The table as a GitHub-flavoured markdown string (index not shown).
        In every row the largest value among the ``MODEL_NAMES`` columns is
        wrapped in ``**``.
    """
    # Rows are addressed with `df.at[idx, ...]` (label based) while idx counts
    # positions. Resetting the index makes labels and positions coincide, so a
    # filtered or re-ordered input no longer writes to the wrong row or grows
    # new rows. The index is never rendered (showindex=False), so dropping it
    # does not change the output.
    df = df.round(4).reset_index(drop=True)
    # The score columns must hold strings so that the bold markers fit in.
    for model_name in MODEL_NAMES:
        df[model_name] = df[model_name].astype(str)
    for idx in range(len(df)):
        # Read every score of the row before any cell is rewritten.
        row_scores = {model_name: float(df.at[idx, model_name]) for model_name in MODEL_NAMES}
        max_val = max(row_scores.values())
        for model_name, cell_val in row_scores.items():
            df.at[idx, model_name] = f'**{cell_val}**' if cell_val == max_val else f'{cell_val}'

    return tabulate(df, headers='keys', showindex=False, tablefmt='github')


## Monotask models

### (a) Multilingual BERT

In [8]:
# Best validation metrics of every monotask run with multilingual BERT;
# the maximum of each column is highlighted in blue.
create_results_df(
    os.path.join(LOG_HOME_DIR, 'monotask', 'bert-base-multilingual-cased')
).style.highlight_max(color='blue', axis=0)

Unnamed: 0,log_filename,f1,acc,precision,recall
0,trg_fc128_lr0.0005_frozen.txt,0.750252,0.773498,0.748096,0.75277
1,trg_fc128_lr0.0005_trainable.txt,0.254023,0.340524,0.170262,0.5
2,trg_fc128_lr0.005_frozen.txt,0.743129,0.761171,0.738117,0.753272
3,trg_fc128_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5
4,trg_fc128_lr0.05_frozen.txt,0.313335,0.375963,0.656282,0.525775
5,trg_fc128_lr0.05_trainable.txt,0.254023,0.340524,0.170262,0.5
6,trg_fc256_lr0.0005_frozen.txt,0.751464,0.767334,0.746158,0.76451
7,trg_fc256_lr0.0005_trainable.txt,0.254023,0.340524,0.170262,0.5
8,trg_fc256_lr0.005_frozen.txt,0.763356,0.784284,0.760083,0.767513
9,trg_fc256_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5


### (b) German BERT

In [9]:
# Best validation metrics of every monotask run with German BERT;
# the maximum of each column is highlighted in blue.
create_results_df(
    os.path.join(LOG_HOME_DIR, 'monotask', 'bert-base-german-cased')
).style.highlight_max(color='blue', axis=0)


Unnamed: 0,log_filename,f1,acc,precision,recall
0,trg_fc128_lr0.0005_frozen.txt,0.780714,0.8151,0.808749,0.766804
1,trg_fc128_lr0.0005_trainable.txt,0.3974,0.659476,0.329738,0.5
2,trg_fc128_lr0.005_frozen.txt,0.759478,0.779661,0.755455,0.765102
3,trg_fc128_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5
4,trg_fc128_lr0.05_frozen.txt,0.717957,0.773498,0.770159,0.70353
5,trg_fc128_lr0.05_trainable.txt,0.254023,0.340524,0.170262,0.5
6,trg_fc256_lr0.0005_frozen.txt,0.7777,0.812018,0.803778,0.764468
7,trg_fc256_lr0.0005_trainable.txt,0.254023,0.340524,0.170262,0.5
8,trg_fc256_lr0.005_frozen.txt,0.771482,0.804314,0.790256,0.760815
9,trg_fc256_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5


## Multitask models

### (a) Multilingual BERT

In [10]:
# Best validation metrics of every multitask run with multilingual BERT;
# the maximum of each column is highlighted in blue.
create_results_df(
    os.path.join(LOG_HOME_DIR, 'multitask', 'bert-base-multilingual-cased')
).style.highlight_max(color='blue', axis=0)

Unnamed: 0,log_filename,f1,acc,precision,recall
0,trg_fc128_lr0.0005_frozen.txt,0.767662,0.791988,0.768577,0.766789
1,trg_fc128_lr0.0005_trainable.txt,0.254023,0.340524,0.170262,0.5
2,trg_fc128_lr0.005_frozen.txt,0.741753,0.775039,0.75144,0.735336
3,trg_fc128_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5
4,trg_fc128_lr0.05_frozen.txt,0.386932,0.423729,0.65258,0.558707
5,trg_fc128_lr0.05_trainable.txt,0.254023,0.340524,0.170262,0.5
6,trg_fc256_lr0.0005_frozen.txt,0.750766,0.773498,0.748207,0.753864
7,trg_fc256_lr0.0005_trainable.txt,0.254023,0.340524,0.170262,0.5
8,trg_fc256_lr0.005_frozen.txt,0.752221,0.791988,0.780263,0.739433
9,trg_fc256_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5


### (b) German BERT

In [11]:
# Best validation metrics of every multitask run with German BERT;
# the maximum of each column is highlighted in blue.
create_results_df(
    os.path.join(LOG_HOME_DIR, 'multitask', 'bert-base-german-cased')
).style.highlight_max(color='blue', axis=0)

Unnamed: 0,log_filename,f1,acc,precision,recall
0,trg_fc128_lr0.0005_frozen.txt,0.769996,0.802773,0.787993,0.759647
1,trg_fc128_lr0.0005_trainable.txt,0.3974,0.659476,0.329738,0.5
2,trg_fc128_lr0.005_frozen.txt,0.774056,0.808937,0.799837,0.761037
3,trg_fc128_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5
4,trg_fc128_lr0.05_frozen.txt,0.254023,0.340524,0.170262,0.5
5,trg_fc128_lr0.05_trainable.txt,0.254023,0.340524,0.170262,0.5
6,trg_fc256_lr0.0005_frozen.txt,0.779537,0.808937,0.792902,0.770885
7,trg_fc256_lr0.0005_trainable.txt,0.254023,0.340524,0.170262,0.5
8,trg_fc256_lr0.005_frozen.txt,0.755406,0.784284,0.760943,0.7511
9,trg_fc256_lr0.005_trainable.txt,0.254023,0.340524,0.170262,0.5


## Cross validation results

In [12]:
# Root directory of the cross-validation logs, given as a relative path.
CV_LOG_HOME_DIR = os.path.join('../logs_cv/')
# Fold identifiers; each one is substituted into a `cv_{fold}_{setup}` sub-directory name.
FOLDS = ['fold_A', 'fold_B', 'fold_C', 'fold_D', 'fold_E']
# sanity check: fail early if the cross-validation log directory does not exist
assert os.path.isdir(CV_LOG_HOME_DIR)

In [13]:
# One results DataFrame per fold for every (setup, model) combination.
cv_results_dict = {
    (setup, model_name): [
        create_results_df(os.path.join(CV_LOG_HOME_DIR, f'cv_{fold}_{setup}', model_name)) for fold in FOLDS
    ]
    for setup in SETUPS
    for model_name in MODEL_NAMES
}

# The log filenames of the first fold of the first combination serve as the
# reference row order of the summary table.
log_filenames = cv_results_dict[SETUPS[0], MODEL_NAMES[0]][0]['log_filename'].to_list()
table_dict = {'log_filename': log_filenames}
table_dict.update({k: [] for k in cv_results_dict})
for (setup, model_name), fold_dfs in cv_results_dict.items():
    # sanity check: every fold must list the same log files in the same order
    assert all(log_filenames == fold_df['log_filename'].to_list() for fold_df in fold_dfs)
    summary_column = table_dict[setup, model_name]
    for row_idx, _ in enumerate(log_filenames):
        # F1 of this configuration in each fold, summarised as "mean ± std"
        fold_f1 = [fold_df['f1'][row_idx] for fold_df in fold_dfs]
        summary_column.append(f'{np.mean(fold_f1):0.3f} ± {np.std(fold_f1):0.2f}')

# Display the table; in every score column the cell(s) with the highest mean
# F1 get a blue background. Cells without "±" (the filenames) are left alone.
pd.DataFrame(table_dict).style.apply(
    lambda col: [
        'background: blue'
        if "±" in cell and float(cell.split()[0]) == max(float(other.split()[0]) for other in col)
        else ''
        for cell in col
    ],
    axis=0,
)

Unnamed: 0,log_filename,"('monotask', 'bert-base-multilingual-cased')","('monotask', 'bert-base-german-cased')","('multitask', 'bert-base-multilingual-cased')","('multitask', 'bert-base-german-cased')"
0,trg_fc128_lr0.0005_frozen.txt,0.733 ± 0.01,0.760 ± 0.01,0.737 ± 0.02,0.754 ± 0.01
1,trg_fc128_lr0.005_frozen.txt,0.728 ± 0.01,0.746 ± 0.01,0.722 ± 0.01,0.750 ± 0.01
2,trg_fc256_lr0.0005_frozen.txt,0.735 ± 0.01,0.759 ± 0.01,0.730 ± 0.01,0.758 ± 0.02
3,trg_fc256_lr0.005_frozen.txt,0.727 ± 0.02,0.750 ± 0.01,0.737 ± 0.01,0.747 ± 0.01
4,trg_fc512_lr0.0005_frozen.txt,0.737 ± 0.01,0.762 ± 0.01,0.737 ± 0.01,0.759 ± 0.01
5,trg_fc512_lr0.005_frozen.txt,0.727 ± 0.02,0.745 ± 0.01,0.729 ± 0.01,0.748 ± 0.00


In [14]:
# Copy the cross-validation summary table to the system clipboard.
pd.DataFrame(table_dict).to_clipboard()