In [1]:
import json
import numpy as np
import os

def print_tex_result(results_file, primary_metrics, percent_metrics):
    """Load one metrics JSON file and pick out the primary metric of each task.

    Despite its name, this function prints nothing; it only returns the
    selected values so the caller can format them.

    Args:
        results_file: Path to a JSON file expected to map
            task name -> {metric name -> numeric value}.
        primary_metrics: Mapping of task name -> name of the metric to report
            for that task. Its iteration order is the order of the result.
        percent_metrics: Collection of metric names whose values are
            multiplied by 100 before being returned.

    Returns:
        dict mapping every task in ``primary_metrics`` to its (possibly
        scaled) metric value. ``np.nan`` is used whenever the task or metric
        is absent, the task entry is not a JSON object, or the stored value
        is not a number (for example JSON ``null``).

    Raises:
        OSError: if ``results_file`` cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(results_file, 'r', encoding='utf-8') as f:
        results = json.load(f)

    task_metrics = {}
    for task, metric_name in primary_metrics.items():
        # Guard both lookup levels: a non-dict entry (e.g. a string) would
        # otherwise pass an `in` test by substring match and then fail on
        # indexing.
        entry = results.get(task) if isinstance(results, dict) else None
        value = entry.get(metric_name) if isinstance(entry, dict) else None

        if not isinstance(value, (int, float)):
            # Missing, null, or non-numeric: report as NaN so that the
            # nan-aware aggregations and "{:.2f}" formatting downstream work.
            task_metrics[task] = np.nan
        elif metric_name in percent_metrics:
            task_metrics[task] = value * 100
        else:
            task_metrics[task] = value
    return task_metrics

In [15]:
# --- Configuration ---------------------------------------------------------
# Directory holding the metric JSON files read below.
results_dir = "evaluations_new_new/none-3B-midway/20"
# True  -> emit one row from global_output_metrics.json.
# False -> emit one row per client, followed by "avg" and "best" rows.
is_global_model = False
# Number of client_<i>_output_metrics.json files to read (i = 0 .. num_clients - 1).
num_clients = 8

# Task -> metric reported for that task. The dict order is the column order.
primary_metrics = {
    "coreference": "accuracy",
    "entailment": "accuracy",
    "linguistic_acceptability": "accuracy",
    "paraphrase": "f1_score",
    "question_classification": "accuracy",
    "structure_to_text": "rougeL",
    "text_formatting": "rougeL",
    "word_disambiguation": "f1_score"
}
# Metric names handed to print_tex_result as the set to scale by 100.
percent_metrics = {"accuracy", "f1_score", "rougeL"}
latex_order = list(primary_metrics.keys())

if is_global_model:
    # Single-row table body for the global model.
    results_path = os.path.join(results_dir, "global_output_metrics.json")
    metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
    latex_row = " & ".join([f"{metrics[task]:.2f}" for task in latex_order]) + " \\\\"
    print(latex_row)

else:
    all_rows = []
    # 1) load every client's metrics, one list of column values per client
    for i in range(num_clients):
        results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        task_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        all_rows.append([task_metrics[task] for task in latex_order])

    # Column-wise statistics across clients; the nan-aware variants skip
    # entries that print_tex_result reported as missing.
    all_array   = np.array(all_rows)
    best_values = np.nanmax(all_array, axis=0)
    avg_values  = np.nanmean(all_array, axis=0)

    # 2) print per-client rows, bolding only the column-wise maxima
    for i, row_vals in enumerate(all_rows):
        cells = []
        for val, best in zip(row_vals, best_values):
            # np.isclose treats NaN as not close to anything (equal_nan is
            # False by default), so missing values are never bolded.
            if np.isclose(val, best):
                cells.append(f"\\textbf{{{val:.2f}}}")
            else:
                cells.append(f"{val:.2f}")
        print(f"& {i} & " + " & ".join(cells) + " \\\\")

    # 3) average row
    # The rule spans the label column (2) through the last task column, so its
    # end is derived from the number of tasks rather than hard-coded
    # (8 tasks -> \cline{2-10}).
    print(f"\\cline{{2-{len(latex_order) + 2}}}")
    avg_cells = [f"{v:.2f}" for v in avg_values]
    print("& avg & " + " & ".join(avg_cells) + " \\\\")

    # 4) best row (printed plain; the maxima are already bold in the client rows)
    best_cells = [f"{v:.2f}" for v in best_values]
    print("& best & " + " & ".join(best_cells) + " \\\\")

& 0 & 55.50 & 35.50 & 50.50 & 55.76 & 35.00 & 37.35 & 85.37 & 67.11 \\
& 1 & 56.00 & 37.50 & 41.50 & 67.94 & 36.50 & 36.87 & 86.73 & 67.11 \\
& 2 & \textbf{57.50} & 43.00 & \textbf{57.50} & \textbf{75.00} & 40.00 & 35.34 & 85.79 & \textbf{67.55} \\
& 3 & 55.00 & 36.50 & 49.50 & 47.52 & 37.00 & 35.55 & 85.72 & \textbf{67.55} \\
& 4 & 53.50 & \textbf{43.50} & 49.50 & 70.04 & \textbf{45.00} & 36.00 & 84.67 & \textbf{67.55} \\
& 5 & 34.50 & 0.00 & 0.00 & 0.00 & 0.50 & \textbf{42.89} & 85.26 & 18.18 \\
& 6 & 36.50 & 0.00 & 0.00 & 2.96 & 2.00 & 39.67 & \textbf{90.80} & 29.06 \\
& 7 & 54.00 & 39.50 & 53.50 & 72.08 & 39.00 & 36.31 & 85.23 & 67.11 \\
\cline{2-10}
& avg & 50.31 & 29.44 & 37.75 & 48.91 & 29.38 & 37.50 & 86.20 & 56.40 \\
& best & 57.50 & 43.50 & 57.50 & 75.00 & 45.00 & 42.89 & 90.80 & 67.55 \\
