## Get detailed experimental results

In [2]:
import pandas as pd 


df = pd.read_csv('./results_csv/greedy_full_results.csv')


In [3]:
import pandas as pd
import numpy as np

#  dataset orders
dataset_order = ['xsum','wmt14_fren','wmt19_deen','coqa','triviaqa','mmlu','gsm8k']

method_blocks = { 
    'general_baselines': [
        'MonteCarloSequenceEntropy',
        'MonteCarloNormalizedSequenceEntropy',
        'SemanticEntropy',
        'DegMat_NLI_score_entail',
        'EigValLaplacian_NLI_score_entail',
        'SAR_t0.001',
        'PTrue',
        'GreedyAveDissimilarity',
        'SupervisedCocoa'
    ],
    'msp': [
        'MaximumSequenceProbability',
        'GreedySemanticEnrichedMaxprobAveDissimilarity',
        'SupervisedCocoaMSP',
    ],
    'ppl': [
        'Perplexity',
        'GreedySemanticEnrichedPPLAveDissimilarity',
        'SupervisedCocoaPPL',
    ],
    'mte': [
        'MeanTokenEntropy',
        'GreedySemanticEnrichedMTEAveDissimilarity',
        'SupervisedCocoaMTE',
    ],
}


pretty_method_names = {
            'MonteCarloSequenceEntropy': 'MCSE',
        'MonteCarloNormalizedSequenceEntropy' : 'MCNSE',
        'SemanticEntropy':'Semantic Entropy',
        'DegMat_NLI_score_entail':'DegMat',
        'EigValLaplacian_NLI_score_entail':'EigValLaplacian',
        'SAR_t0.001':'SAR',
        'PTrue': 'P(True)',
        'GreedyAveDissimilarity': 'Consistency Light',
        'SupervisedCocoa': 'Consistency',
                'MaximumSequenceProbability': ' MSP',
        'GreedySemanticEnrichedMaxprobAveDissimilarity': '$\text{CoCoA}_{MSP}$',
        'SupervisedCocoaMSP': '$\text{CoCoA}_{MSP}$ Light',
            'Perplexity': ' PPL',
        'GreedySemanticEnrichedPPLAveDissimilarity': '$\text{CoCoA}_{PPL}$',
        'SupervisedCocoaPPL': '$\text{CoCoA}_{PPL}$ Light',
        'MeanTokenEntropy': ' MTE',
        'GreedySemanticEnrichedMTEAveDissimilarity': '$\text{CoCoA}_{MTE}$',
        'SupervisedCocoaMTE': '$\text{CoCoA}_{MTE}$ Light',

}

# flatten the blocks into one ordered list
method_order = [m for _, block in method_blocks.items() for m in block]

def style_best_second(col: pd.Series):
    s = pd.Series("", index=col.index, dtype="object")
    ranks = col.rank(ascending=False, method='min')
    s[ranks == 1] = "font-weight: bold;"
    s[ranks == 2] = "text-decoration: underline;"
    return s

def block_separator_positions(index_labels):
    """Return 0-based row positions where a block should start (except the first block)."""
    poses = []
    seen_any = False
    for block in method_blocks.values():
        for m in block:
            if m in index_labels:
                pos = list(index_labels).index(m)
                if seen_any:
                    poses.append(pos)
                seen_any = True
                break
    return poses

def style_block_separators_by_pos(df_like: pd.DataFrame, positions):
    """Add a top border to rows at given positions (by integer position)."""
    styles = pd.DataFrame("", index=df_like.index, columns=df_like.columns)
    for i, _ in enumerate(df_like.index):
        if i in positions:
            styles.iloc[i, :] = "border-top: 2px solid #666;"
    return styles

styled_tables = {}
for model, sub in df.groupby("model"):
    pv = sub.pivot_table(index="method", columns="dataset", values="score", aggfunc="max")

    #  enforce column order 
    cols = [c for c in dataset_order if c in pv.columns]
    pv = pv.reindex(columns=cols)

    #  enforce method order---
    idx_present = [m for m in method_order if m in pv.index]
    others = [m for m in pv.index if m not in idx_present]
    pv = pv.reindex(index=idx_present + others)

    sep_positions = block_separator_positions(pv.index)

    # Rename to pretty names (display-only) 
    pv.index = pv.index.map(lambda x: pretty_method_names.get(x, x))

    # Style
    styler = (pv.style
                .apply(style_best_second, axis=0)
                .apply(lambda df_: style_block_separators_by_pos(df_, sep_positions), axis=None)
                .format("{:.4g}")
                .set_caption(f"Model: {model}"))

    styled_tables[model] = styler

# display
for m, st in styled_tables.items():
    display(st)


dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.159,0.2967,0.3372,0.2584,0.5494,0.4204,0.4273
MCNSE,0.1082,0.3709,0.4737,0.2929,0.5862,0.4423,0.2986
Semantic Entropy,0.1641,0.3069,0.3892,0.2943,0.5815,0.463,0.4177
DegMat,0.201,0.2742,0.4311,0.4069,0.6509,0.4798,0.3953
EigValLaplacian,0.2009,0.2292,0.3939,0.3811,0.6447,0.4539,0.3578
SAR,0.144,0.3985,0.5174,0.3815,0.6492,0.5084,0.3869
P(True),-0.1592,0.1754,0.135,0.03632,0.2753,0.0266,0.133
Consistency Light,0.2256,0.3368,0.496,0.4085,0.6562,0.4849,0.4258
Consistency,0.2322,0.4827,0.4685,0.3266,0.6569,0.3625,0.3626
MSP,0.2011,0.3121,0.4,0.3209,0.6622,0.5392,0.3767


dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.03257,0.293,0.3537,0.2371,0.482,0.1712,0.3512
MCNSE,0.0215,0.3704,0.415,0.2188,0.5015,0.1695,0.3444
Semantic Entropy,0.03334,0.2968,0.3887,0.2717,0.5493,0.2285,0.3748
DegMat,0.0807,0.25,0.3546,0.3534,0.6218,0.3417,0.3087
EigValLaplacian,0.07869,0.1979,0.2779,0.3319,0.6037,0.2917,0.2732
SAR,0.07736,0.4275,0.4827,0.3108,0.5947,0.3516,0.398
P(True),0.05764,0.04748,0.03665,-0.03727,-0.06599,-0.1803,0.02565
Consistency Light,0.02413,0.3886,0.4533,0.3746,0.6135,0.3914,0.3678
Consistency,-0.02221,0.4753,0.4407,0.353,0.5519,0.2787,0.3782
MSP,0.3276,0.3416,0.4564,0.2771,0.5257,0.5077,0.3244


dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.006539,0.2575,0.3503,0.2475,0.4961,0.3371,0.4748
MCNSE,0.00863,0.3421,0.4778,0.2381,0.5396,0.3561,0.401
Semantic Entropy,0.007853,0.2715,0.3823,0.2711,0.5618,0.387,0.4722
DegMat,0.1367,0.2287,0.382,0.3358,0.646,0.4098,0.299
EigValLaplacian,0.1322,0.2065,0.3283,0.3011,0.6245,0.3984,0.2411
SAR,0.09437,0.3526,0.517,0.3127,0.6438,0.419,0.4714
P(True),0.1792,0.1179,0.03284,-0.06055,-0.1282,0.06787,0.004601
Consistency Light,0.0515,0.2846,0.5002,0.379,0.6473,0.423,0.4351
Consistency,-0.006206,0.3174,0.4568,0.3064,0.5966,0.4322,0.4408
MSP,0.2875,0.3149,0.4508,0.3262,0.6277,0.4736,0.4707


In [None]:
import pandas as pd
import numpy as np


method_order = [m for _, block in method_blocks.items() for m in block]

def pivot_one_model(df_model: pd.DataFrame) -> pd.DataFrame:
    pv = df_model.pivot_table(index="method", columns="dataset", values="score", aggfunc="max")

    cols = [c for c in dataset_order if c in pv.columns]
    pv = pv.reindex(columns=cols)

    idx_present = [m for m in method_order if m in pv.index]
    others = [m for m in pv.index if m not in idx_present]
    pv = pv.reindex(index=idx_present + others)

    return pv

def best_second_masks(pv: pd.DataFrame):
    best = pd.DataFrame(False, index=pv.index, columns=pv.columns)
    second = pd.DataFrame(False, index=pv.index, columns=pv.columns)
    for c in pv.columns:
        col = pv[c].dropna()
        if col.empty:
            continue
        uniq = np.sort(col.unique())[::-1]  # larger is better
        top = uniq[0]
        best.loc[pv[c] == top, c] = True
        if len(uniq) > 1:
            second_val = uniq[1]
            second.loc[pv[c] == second_val, c] = True
    return best, second

def latex_escape(s: str) -> str:
    rep = {'&': r'\&','%': r'\%','$': r'\$','#': r'\#','_': r'\_','{': r'\{','}': r'\}',
           '~': r'\textasciitilde{}','^': r'\textasciicircum{}','\\': r'\textbackslash{}'}
    return ''.join(rep.get(ch, ch) for ch in s)

def display_name(method_key: str) -> str:
    return pretty_method_names.get(method_key, latex_escape(method_key))

def format_cell(val, is_best, is_second):
    if pd.isna(val):
        txt = "--"
    else:
        txt = f"{val:.3f}"        # ‚Üê fixed three decimals
    if is_best:
        return r"\textbf{" + txt + "}"
    if is_second:
        return r"\underline{" + txt + "}"
    return txt

def block_lists_for(pv_index):
    lists = []
    for _, block in method_blocks.items():
        present = [m for m in block if m in pv_index]
        if present:
            lists.append(present)
    others = [m for m in pv_index if m not in method_order]
    if others:
        lists.append(others)
    return lists

def latex_one_big_table(df: pd.DataFrame, table_caption="All models", table_label="tab:all_models"):
    by_model = {model: pivot_one_model(sub) for model, sub in df.groupby("model")}
    all_cols = [c for c in dataset_order if any(c in pv.columns for pv in by_model.values())]

    colspec = "l" + "c" * len(all_cols)
    lines = []
    lines.append(r"\begin{table}[t]")
    lines.append(r"\centering")
    lines.append(r"\small")
    lines.append(r"\setlength{\tabcolsep}{6pt}")
    lines.append(r"\begin{tabular}{" + colspec + "}")
    lines.append(r"\toprule")
    header = ["Method"] + [latex_escape(c) for c in all_cols]
    lines.append(" & ".join(header) + r" \\")
    lines.append(r"\midrule")

    first_model = True
    for model_name, pv in by_model.items():
        # align columns to global set
        pv = pv.reindex(columns=all_cols)

        # model header row
        if not first_model:
            lines.append(r"\midrule")
        first_model = False
        # gray banner with model name across all dataset columns (plus empty first column)
        lines.append(r"\rowcolor[gray]{0.9} " + " & "
                     + r"\multicolumn{" + str(len(all_cols)) + r"}{c}{" + latex_escape(str(model_name)) + r"} \\")
        lines.append(r"\midrule")

        # best/second within this model
        best_mask, second_mask = best_second_masks(pv)

        # method blocks with midrules between them
        blocks = block_lists_for(pv.index)
        first_block = True
        for block in blocks:
            if not first_block:
                lines.append(r"\midrule")
            first_block = False
            for m in block:
                row = [display_name(m)]
                for c in all_cols:
                    v = pv.loc[m, c] if m in pv.index else np.nan
                    is_b = bool(best_mask.loc[m, c]) if (m in best_mask.index and c in best_mask.columns) else False
                    is_s = bool(second_mask.loc[m, c]) if (m in second_mask.index and c in second_mask.columns) else False
                    row.append(format_cell(v, is_b, is_s))
                lines.append(" & ".join(row) + r" \\")

    lines.append(r"\bottomrule")
    lines.append(r"\end{tabular}")
    lines.append(rf"\caption{{{latex_escape(table_caption)}}}")
    lines.append(rf"\label{{{latex_escape(table_label)}}}")
    lines.append(r"\end{table}")
    return "\n".join(lines)

# ---- build the LaTeX string ----
latex_all = latex_one_big_table(df, table_caption="Results across models", table_label="tab:results_all_models")

with open("tables/greedy_full_results.tex", "w") as f:
    f.write(latex_all)
print(latex_all)


In [4]:
import pandas as pd
import numpy as np
import pandas as pd 


df = pd.read_csv('./results_csv/sample_full_results.csv')

dataset_order = ['xsum','wmt14_fren','wmt19_deen','coqa','triviaqa','mmlu','gsm8k']

method_blocks = { 
    'general_baselines': [
        'MonteCarloSequenceEntropy',
        'MonteCarloNormalizedSequenceEntropy',
        'SemanticEntropy',
        'DegMat_NLI_score_entail',
        'EigValLaplacian_NLI_score_entail',
        'SAR_t0.001',
        'PTrueBestSample',
        'BestAveDissimilarity',
    ],
    'msp': [
        'BestSampledMaximumSequenceProbability',
        'BestSemanticEnrichedMaxprobAveDissimilarity',
    ],
    'ppl': [
        'BestSampledPerplexity',
        'BestSemanticEnrichedPPLAveDissimilarity',
    ],
    'mte': [
        'BestSampledMeanTokenEntropy',
        'BestSemanticEnrichedMTEAveDissimilarity',
    ],
}


pretty_method_names = {
            'MonteCarloSequenceEntropy': 'MCSE',
        'MonteCarloNormalizedSequenceEntropy' : 'MCNSE',
        'SemanticEntropy':'Semantic Entropy',
        'DegMat_NLI_score_entail':'DegMat',
        'EigValLaplacian_NLI_score_entail':'EigValLaplacian',
        'SAR_t0.001':'SAR',
        'PTrueBestSample': 'P(True)',
        'BestAveDissimilarity': 'Consistency Light',
        'SupervisedCocoa': 'Consistency',
                'BesSampledMaximumSequenceProbability': ' MSP',
        'BestSemanticEnrichedMaxprobAveDissimilarity': '$\text{CoCoA}_{MSP}$',
        'SupervisedCocoaMSP': '$\text{CoCoA}_{MSP}$ Light',
            'BesSampledPerplexity': ' PPL',
        'BestSemanticEnrichedPPLAveDissimilarity': '$\text{CoCoA}_{PPL}$',
        'SupervisedCocoaPPL': '$\text{CoCoA}_{PPL}$ Light',
        'BesSampledMeanTokenEntropy': ' MTE',
        'BestSemanticEnrichedMTEAveDissimilarity': '$\text{CoCoA}_{MTE}$',
        'SupervisedCocoaMTE': '$\text{CoCoA}_{MTE}$ Light',

}

method_order = [m for _, block in method_blocks.items() for m in block]

def style_best_second(col: pd.Series):
    s = pd.Series("", index=col.index, dtype="object")
    ranks = col.rank(ascending=False, method='min')
    s[ranks == 1] = "font-weight: bold;"
    s[ranks == 2] = "text-decoration: underline;"
    return s

def block_separator_positions(index_labels):
    """Return 0-based row positions where a block should start (except the first block)."""
    poses = []
    seen_any = False
    for block in method_blocks.values():
        # first method from this block that is actually present
        for m in block:
            if m in index_labels:
                pos = list(index_labels).index(m)
                if seen_any:
                    poses.append(pos)
                seen_any = True
                break
    return poses

def style_block_separators_by_pos(df_like: pd.DataFrame, positions):
    """Add a top border to rows at given positions (by integer position)."""
    styles = pd.DataFrame("", index=df_like.index, columns=df_like.columns)
    for i, _ in enumerate(df_like.index):
        if i in positions:
            styles.iloc[i, :] = "border-top: 2px solid #666;"
    return styles

styled_tables = {}
for model, sub in df.groupby("model"):
    pv = sub.pivot_table(index="method", columns="dataset", values="score", aggfunc="max")

    cols = [c for c in dataset_order if c in pv.columns]
    pv = pv.reindex(columns=cols)

    idx_present = [m for m in method_order if m in pv.index]
    others = [m for m in pv.index if m not in idx_present]
    pv = pv.reindex(index=idx_present + others)

    sep_positions = block_separator_positions(pv.index)

    pv.index = pv.index.map(lambda x: pretty_method_names.get(x, x))

    styler = (pv.style
                .apply(style_best_second, axis=0)
                .apply(lambda df_: style_block_separators_by_pos(df_, sep_positions), axis=None)
                .format("{:.4g}")
                .set_caption(f"Model: {model}"))

    styled_tables[model] = styler

# display
for m, st in styled_tables.items():
    display(st)

dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.1285,0.3989,0.4192,0.2854,0.5354,0.4208,0.5983
MCNSE,0.1527,0.3953,0.4522,0.3181,0.5875,0.4427,0.7709
Semantic Entropy,0.134,0.4198,0.4601,0.3187,0.5663,0.4633,0.5673
DegMat,0.1766,0.35,0.4217,0.4219,0.6367,0.4798,0.6329
EigValLaplacian,0.174,0.2885,0.3825,0.3929,0.6221,0.4537,0.5224
SAR,0.1932,0.4553,0.5209,0.3846,0.6421,0.509,0.8258
P(True),0.02159,0.2452,0.2448,0.03777,0.2435,0.0277,0.8151
Consistency Light,0.2821,0.4908,0.6515,0.4159,0.6266,0.4844,0.9787
BestSampledMaximumSequenceProbability,0.09901,0.3848,0.3778,0.3687,0.638,0.5399,-0.1754
$	ext{CoCoA}_{MSP}$,0.2101,0.5638,0.6977,0.4278,0.6588,0.5297,0.4983


dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.1924,0.3663,0.3945,0.2589,0.4647,0.1576,0.5463
MCNSE,0.1862,0.3774,0.48,0.2395,0.4841,0.1648,0.6335
Semantic Entropy,0.1939,0.3708,0.4512,0.2863,0.5282,0.2133,0.559
DegMat,0.1913,0.274,0.4095,0.3667,0.6063,0.3198,0.3966
EigValLaplacian,0.1901,0.2157,0.3326,0.3399,0.5874,0.2738,0.3513
SAR,0.1594,0.4412,0.5709,0.3271,0.5783,0.3405,0.6684
P(True),0.05813,0.07541,0.05632,-0.01068,-0.07065,-0.1197,-0.08369
Consistency Light,0.0304,0.473,0.5981,0.395,0.5996,0.3527,0.7949
BestSampledMaximumSequenceProbability,0.4641,0.339,0.4126,0.3039,0.5138,0.4827,0.2809
$	ext{CoCoA}_{MSP}$,0.4839,0.5288,0.6845,0.3841,0.5871,0.4522,0.5133


dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.1618,0.4065,0.4065,0.2887,0.4916,0.3387,0.6929
MCNSE,0.1961,0.4708,0.5069,0.2773,0.5292,0.358,0.7
Semantic Entropy,0.164,0.4335,0.4424,0.3119,0.5539,0.3887,0.6753
DegMat,0.2049,0.4394,0.4096,0.3761,0.6179,0.4103,0.4541
EigValLaplacian,0.1975,0.3878,0.3437,0.3419,0.6003,0.3993,0.3606
SAR,0.1746,0.5628,0.5902,0.3474,0.6197,0.421,0.7796
P(True),0.2065,0.472,0.2692,-0.05774,-0.08433,0.06773,0.2776
Consistency Light,0.07123,0.6705,0.7077,0.4046,0.6144,0.423,0.8463
BestSampledMaximumSequenceProbability,0.3302,0.2118,0.2914,0.3875,0.607,0.4755,0.3069
$	ext{CoCoA}_{MSP}$,0.3656,0.7121,0.7304,0.4296,0.6442,0.4665,0.5625


In [None]:
import pandas as pd
import numpy as np


method_order = [m for _, block in method_blocks.items() for m in block]

def pivot_one_model(df_model: pd.DataFrame) -> pd.DataFrame:
    pv = df_model.pivot_table(index="method", columns="dataset", values="score", aggfunc="max")

    cols = [c for c in dataset_order if c in pv.columns]
    pv = pv.reindex(columns=cols)

    idx_present = [m for m in method_order if m in pv.index]
    others = [m for m in pv.index if m not in idx_present]
    pv = pv.reindex(index=idx_present + others)

    return pv

def best_second_masks(pv: pd.DataFrame):
    best = pd.DataFrame(False, index=pv.index, columns=pv.columns)
    second = pd.DataFrame(False, index=pv.index, columns=pv.columns)
    for c in pv.columns:
        col = pv[c].dropna()
        if col.empty:
            continue
        uniq = np.sort(col.unique())[::-1]  # larger is better
        top = uniq[0]
        best.loc[pv[c] == top, c] = True
        if len(uniq) > 1:
            second_val = uniq[1]
            second.loc[pv[c] == second_val, c] = True
    return best, second

def latex_escape(s: str) -> str:
    rep = {'&': r'\&','%': r'\%','$': r'\$','#': r'\#','_': r'\_','{': r'\{','}': r'\}',
           '~': r'\textasciitilde{}','^': r'\textasciicircum{}','\\': r'\textbackslash{}'}
    return ''.join(rep.get(ch, ch) for ch in s)

def display_name(method_key: str) -> str:
    return pretty_method_names.get(method_key, latex_escape(method_key))

def format_cell(val, is_best, is_second):
    if pd.isna(val):
        txt = "--"
    else:
        txt = f"{val:.3f}"        # ‚Üê fixed three decimals
    if is_best:
        return r"\textbf{" + txt + "}"
    if is_second:
        return r"\underline{" + txt + "}"
    return txt

def block_lists_for(pv_index):
    lists = []
    for _, block in method_blocks.items():
        present = [m for m in block if m in pv_index]
        if present:
            lists.append(present)
    others = [m for m in pv_index if m not in method_order]
    if others:
        lists.append(others)
    return lists

def latex_one_big_table(df: pd.DataFrame, table_caption="All models", table_label="tab:all_models"):
    by_model = {model: pivot_one_model(sub) for model, sub in df.groupby("model")}
    all_cols = [c for c in dataset_order if any(c in pv.columns for pv in by_model.values())]

    colspec = "l" + "c" * len(all_cols)
    lines = []
    lines.append(r"\begin{table}[t]")
    lines.append(r"\centering")
    lines.append(r"\small")
    lines.append(r"\setlength{\tabcolsep}{6pt}")
    lines.append(r"\begin{tabular}{" + colspec + "}")
    lines.append(r"\toprule")
    header = ["Method"] + [latex_escape(c) for c in all_cols]
    lines.append(" & ".join(header) + r" \\")
    lines.append(r"\midrule")

    first_model = True
    for model_name, pv in by_model.items():
        pv = pv.reindex(columns=all_cols)

        if not first_model:
            lines.append(r"\midrule")
        first_model = False
        # gray banner with model name across all dataset columns (plus empty first column)
        lines.append(r"\rowcolor[gray]{0.9} " + " & "
                     + r"\multicolumn{" + str(len(all_cols)) + r"}{c}{" + latex_escape(str(model_name)) + r"} \\")
        lines.append(r"\midrule")

        # best/second within this model
        best_mask, second_mask = best_second_masks(pv)

        # method blocks with midrules between them
        blocks = block_lists_for(pv.index)
        first_block = True
        for block in blocks:
            if not first_block:
                lines.append(r"\midrule")
            first_block = False
            for m in block:
                row = [display_name(m)]
                for c in all_cols:
                    v = pv.loc[m, c] if m in pv.index else np.nan
                    is_b = bool(best_mask.loc[m, c]) if (m in best_mask.index and c in best_mask.columns) else False
                    is_s = bool(second_mask.loc[m, c]) if (m in second_mask.index and c in second_mask.columns) else False
                    row.append(format_cell(v, is_b, is_s))
                lines.append(" & ".join(row) + r" \\")

    lines.append(r"\bottomrule")
    lines.append(r"\end{tabular}")
    lines.append(rf"\caption{{{latex_escape(table_caption)}}}")
    lines.append(rf"\label{{{latex_escape(table_label)}}}")
    lines.append(r"\end{table}")
    return "\n".join(lines)

# ---- build the LaTeX string ----
latex_all = latex_one_big_table(df, table_caption="Results across models", table_label="tab:results_all_models")

with open("tables/sample_full_results.tex", "w") as f:
    f.write(latex_all)
print(latex_all)

In [5]:
import pandas as pd
import numpy as np
import pandas as pd 


df = pd.read_csv('./results_csv/mbr_full_results.csv')

dataset_order = ['xsum','wmt14_fren','wmt19_deen','coqa','triviaqa','mmlu','gsm8k']

method_blocks = { 
    'general_baselines': [
        'MonteCarloSequenceEntropy',
        'MonteCarloNormalizedSequenceEntropy',
        'SemanticEntropy',
        'DegMat_NLI_score_entail',
        'EigValLaplacian_NLI_score_entail',
        'SAR_t0.001',
        'PTrueMbrSample',
        'MbrAveDissimilarity',
    ],
    'msp': [
        'MbrSampledMaximumSequenceProbability',
        'MbrSemanticEnrichedMaxprobAveDissimilarity',
    ],
    'ppl': [
        'MbrSampledPerplexity',
        'MbrSemanticEnrichedPPLAveDissimilarity',
    ],
    'mte': [
        'MbrSampledMeanTokenEntropy',
        'MbrSemanticEnrichedMTEAveDissimilarity',
    ],
}


pretty_method_names = {
            'MonteCarloSequenceEntropy': 'MCSE',
        'MonteCarloNormalizedSequenceEntropy' : 'MCNSE',
        'SemanticEntropy':'Semantic Entropy',
        'DegMat_NLI_score_entail':'DegMat',
        'EigValLaplacian_NLI_score_entail':'EigValLaplacian',
        'SAR_t0.001':'SAR',
        'PTrueMbrSample': 'P(True)',
        'MbrAveDissimilarity': 'Consistency Light',
        'SupervisedCocoa': 'Consistency',
                'MbrSampledMaximumSequenceProbability': ' MSP',
        'MbrSemanticEnrichedMaxprobAveDissimilarity': '$\text{CoCoA}_{MSP}$',
        'SupervisedCocoaMSP': '$\text{CoCoA}_{MSP}$ Light',
            'MbrSampledPerplexity': ' PPL',
        'MbrSemanticEnrichedPPLAveDissimilarity': '$\text{CoCoA}_{PPL}$',
        'SupervisedCocoaPPL': '$\text{CoCoA}_{PPL}$ Light',
        'MbrSampledMeanTokenEntropy': ' MTE',
        'MbrSemanticEnrichedMTEAveDissimilarity': '$\text{CoCoA}_{MTE}$',
        'SupervisedCocoaMTE': '$\text{CoCoA}_{MTE}$ Light',

}

method_order = [m for _, block in method_blocks.items() for m in block]

def style_best_second(col: pd.Series):
    s = pd.Series("", index=col.index, dtype="object")
    ranks = col.rank(ascending=False, method='min')
    s[ranks == 1] = "font-weight: bold;"
    s[ranks == 2] = "text-decoration: underline;"
    return s

def block_separator_positions(index_labels):
    """Return 0-based row positions where a block should start (except the first block)."""
    poses = []
    seen_any = False
    for block in method_blocks.values():
        for m in block:
            if m in index_labels:
                pos = list(index_labels).index(m)
                if seen_any:
                    poses.append(pos)
                seen_any = True
                break
    return poses

def style_block_separators_by_pos(df_like: pd.DataFrame, positions):
    """Add a top border to rows at given positions (by integer position)."""
    styles = pd.DataFrame("", index=df_like.index, columns=df_like.columns)
    for i, _ in enumerate(df_like.index):
        if i in positions:
            styles.iloc[i, :] = "border-top: 2px solid #666;"
    return styles

styled_tables = {}
for model, sub in df.groupby("model"):
    pv = sub.pivot_table(index="method", columns="dataset", values="score", aggfunc="max")

    cols = [c for c in dataset_order if c in pv.columns]
    pv = pv.reindex(columns=cols)

    idx_present = [m for m in method_order if m in pv.index]
    others = [m for m in pv.index if m not in idx_present]
    pv = pv.reindex(index=idx_present + others)

    sep_positions = block_separator_positions(pv.index)

    pv.index = pv.index.map(lambda x: pretty_method_names.get(x, x))

    styler = (pv.style
                .apply(style_best_second, axis=0)
                .apply(lambda df_: style_block_separators_by_pos(df_, sep_positions), axis=None)
                .format("{:.4g}")
                .set_caption(f"Model: {model}"))

    styled_tables[model] = styler

# display
for m, st in styled_tables.items():
    display(st)

dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.1878,0.3306,0.3621,0.2738,0.5603,0.4553,0.4604
MCNSE,0.1264,0.3868,0.4937,0.3059,0.6048,0.4787,0.3956
Semantic Entropy,0.1919,0.332,0.4144,0.3035,0.5939,0.4971,0.4379
DegMat,0.2456,0.2712,0.4704,0.3923,0.6723,0.5162,0.4155
EigValLaplacian,0.2384,0.2201,0.4319,0.3646,0.66,0.4765,0.3386
SAR,0.1836,0.4125,0.5435,0.3705,0.6639,0.5525,0.5357
P(True),-0.1237,0.1941,0.1614,0.01601,0.2787,0.06149,0.2841
Consistency Light,0.235,0.3225,0.4976,0.3758,0.6645,0.5229,0.5882
MSP,0.1869,0.3549,0.398,0.2133,0.458,0.583,0.3668
$	ext{CoCoA}_{MSP}$,0.2479,0.4612,0.5902,0.3363,0.6166,0.5748,0.4715


dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.08865,0.2403,0.3542,0.2204,0.4444,0.296,0.6466
MCNSE,0.05454,0.3345,0.4427,0.2472,0.4698,0.3047,0.545
Semantic Entropy,0.08922,0.2496,0.3821,0.2451,0.5142,0.3605,0.6503
DegMat,0.2172,0.2182,0.3613,0.368,0.6113,0.3906,0.5395
EigValLaplacian,0.2166,0.1675,0.28,0.3467,0.5916,0.3041,0.4917
SAR,0.1251,0.3926,0.5198,0.3447,0.5782,0.4709,0.6418
P(True),-0.01411,0.0979,0.09052,-0.01378,-0.05395,-0.2102,0.08348
Consistency Light,0.1462,0.2896,0.4397,0.3872,0.6026,0.4275,0.5987
MSP,0.1967,0.3442,0.4635,0.1611,0.2472,0.5774,0.6751
$	ext{CoCoA}_{MSP}$,0.2253,0.4028,0.5899,0.304,0.4871,0.5862,0.7314


dataset,xsum,wmt14_fren,wmt19_deen,coqa,triviaqa,mmlu,gsm8k
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MCSE,0.1233,0.2967,0.363,0.2449,0.5147,0.3853,0.7506
MCNSE,0.1048,0.3854,0.4852,0.2561,0.5524,0.4061,0.5477
Semantic Entropy,0.127,0.3098,0.3951,0.2722,0.5803,0.4386,0.7354
DegMat,0.2441,0.2802,0.3645,0.3582,0.6728,0.4613,0.3915
EigValLaplacian,0.2426,0.2384,0.3187,0.3338,0.6544,0.4432,0.3178
SAR,0.2043,0.3918,0.5032,0.3339,0.6583,0.4783,0.6561
P(True),0.07944,0.1425,0.08366,-0.05572,-0.08377,0.04704,-0.08415
Consistency Light,0.2106,0.2967,0.4367,0.3545,0.6586,0.4579,0.6027
MSP,0.2587,0.3807,0.4857,0.2156,0.3463,0.5137,0.7111
$	ext{CoCoA}_{MSP}$,0.2949,0.4407,0.5889,0.3358,0.5974,0.5137,0.766


In [None]:
import pandas as pd
import numpy as np


method_order = [m for _, block in method_blocks.items() for m in block]

def pivot_one_model(df_model: pd.DataFrame) -> pd.DataFrame:
    pv = df_model.pivot_table(index="method", columns="dataset", values="score", aggfunc="max")

    cols = [c for c in dataset_order if c in pv.columns]
    pv = pv.reindex(columns=cols)

    idx_present = [m for m in method_order if m in pv.index]
    others = [m for m in pv.index if m not in idx_present]
    pv = pv.reindex(index=idx_present + others)

    return pv

def best_second_masks(pv: pd.DataFrame):
    best = pd.DataFrame(False, index=pv.index, columns=pv.columns)
    second = pd.DataFrame(False, index=pv.index, columns=pv.columns)
    for c in pv.columns:
        col = pv[c].dropna()
        if col.empty:
            continue
        uniq = np.sort(col.unique())[::-1]  # larger is better
        top = uniq[0]
        best.loc[pv[c] == top, c] = True
        if len(uniq) > 1:
            second_val = uniq[1]
            second.loc[pv[c] == second_val, c] = True
    return best, second

def latex_escape(s: str) -> str:
    rep = {'&': r'\&','%': r'\%','$': r'\$','#': r'\#','_': r'\_','{': r'\{','}': r'\}',
           '~': r'\textasciitilde{}','^': r'\textasciicircum{}','\\': r'\textbackslash{}'}
    return ''.join(rep.get(ch, ch) for ch in s)

def display_name(method_key: str) -> str:
    return pretty_method_names.get(method_key, latex_escape(method_key))

def format_cell(val, is_best, is_second):
    if pd.isna(val):
        txt = "--"
    else:
        txt = f"{val:.3f}"        # ‚Üê fixed three decimals
    if is_best:
        return r"\textbf{" + txt + "}"
    if is_second:
        return r"\underline{" + txt + "}"
    return txt

def block_lists_for(pv_index):
    lists = []
    for _, block in method_blocks.items():
        present = [m for m in block if m in pv_index]
        if present:
            lists.append(present)
    others = [m for m in pv_index if m not in method_order]
    if others:
        lists.append(others)
    return lists

def latex_one_big_table(df: pd.DataFrame, table_caption="All models", table_label="tab:all_models"):
    # collect pivots per model
    by_model = {model: pivot_one_model(sub) for model, sub in df.groupby("model")}
    # union of datasets present across models but keep desired order
    all_cols = [c for c in dataset_order if any(c in pv.columns for pv in by_model.values())]

    colspec = "l" + "c" * len(all_cols)
    lines = []
    lines.append(r"\begin{table}[t]")
    lines.append(r"\centering")
    lines.append(r"\small")
    lines.append(r"\setlength{\tabcolsep}{6pt}")
    lines.append(r"\begin{tabular}{" + colspec + "}")
    lines.append(r"\toprule")
    header = ["Method"] + [latex_escape(c) for c in all_cols]
    lines.append(" & ".join(header) + r" \\")
    lines.append(r"\midrule")

    first_model = True
    for model_name, pv in by_model.items():
        pv = pv.reindex(columns=all_cols)

        if not first_model:
            lines.append(r"\midrule")
        first_model = False
        # gray banner with model name across all dataset columns (plus empty first column)
        lines.append(r"\rowcolor[gray]{0.9} " + " & "
                     + r"\multicolumn{" + str(len(all_cols)) + r"}{c}{" + latex_escape(str(model_name)) + r"} \\")
        lines.append(r"\midrule")

        # best/second within this model
        best_mask, second_mask = best_second_masks(pv)

        # method blocks with midrules between them
        blocks = block_lists_for(pv.index)
        first_block = True
        for block in blocks:
            if not first_block:
                lines.append(r"\midrule")
            first_block = False
            for m in block:
                row = [display_name(m)]
                for c in all_cols:
                    v = pv.loc[m, c] if m in pv.index else np.nan
                    is_b = bool(best_mask.loc[m, c]) if (m in best_mask.index and c in best_mask.columns) else False
                    is_s = bool(second_mask.loc[m, c]) if (m in second_mask.index and c in second_mask.columns) else False
                    row.append(format_cell(v, is_b, is_s))
                lines.append(" & ".join(row) + r" \\")

    lines.append(r"\bottomrule")
    lines.append(r"\end{tabular}")
    lines.append(rf"\caption{{{latex_escape(table_caption)}}}")
    lines.append(rf"\label{{{latex_escape(table_label)}}}")
    lines.append(r"\end{table}")
    return "\n".join(lines)

latex_all = latex_one_big_table(df, table_caption="Results across models", table_label="tab:results_all_models")

with open("tables/mbr_full_results.tex", "w") as f:
    f.write(latex_all)
print(latex_all)

### Summary table for main part

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

df = pd.read_csv('./results_csv/greedy_full_results.csv')

task_map = {
    'xsum': 'summarization',
    'wmt14_fren': 'translation',
    'wmt19_deen': 'translation',
    'coqa': 'qa',
    'triviaqa': 'qa',
    'mmlu': 'qa',
    'gsm8k': 'qa',
}

pretty_method_names = {
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy' : 'MCNSE',
    'SemanticEntropy':'Semantic Entropy',
    'DegMat_NLI_score_entail':'DegMat',
    'EigValLaplacian_NLI_score_entail':'EigValLaplacian',
    'SAR_t0.001':'SAR',
    'PTrue': 'P(True)',
    'GreedyAveDissimilarity': 'Consistency Light',
    'SupervisedCocoa': 'Consistency',
    'MaximumSequenceProbability': ' MSP',
    'GreedySemanticEnrichedMaxprobAveDissimilarity': '$\\text{CoCoA}_{MSP}$',
    'SupervisedCocoaMSP': '$\\text{CoCoA}_{MSP}$ Light',
    'Perplexity': ' PPL',
    'GreedySemanticEnrichedPPLAveDissimilarity': '$\\text{CoCoA}_{PPL}$',
    'SupervisedCocoaPPL': '$\\text{CoCoA}_{PPL}$ Light',
    'MeanTokenEntropy': ' MTE',
    'GreedySemanticEnrichedMTEAveDissimilarity': '$\\text{CoCoA}_{MTE}$',
    'SupervisedCocoaMTE': '$\\text{CoCoA}_{MTE}$ Light',
}

# Method blocks for ordering + midrules
method_blocks = {
    'general_baselines': [
        'MonteCarloSequenceEntropy',
        'MonteCarloNormalizedSequenceEntropy',
        'SemanticEntropy',
        'DegMat_NLI_score_entail',
        'EigValLaplacian_NLI_score_entail',
        'SAR_t0.001',
        'PTrue',
        'GreedyAveDissimilarity',
        'SupervisedCocoa',
    ],
    'msp': [
        'MaximumSequenceProbability',
        'GreedySemanticEnrichedMaxprobAveDissimilarity',
        'SupervisedCocoaMSP',
    ],
    'ppl': [
        'Perplexity',
        'GreedySemanticEnrichedPPLAveDissimilarity',
        'SupervisedCocoaPPL',
    ],
    'mte': [
        'MeanTokenEntropy',
        'GreedySemanticEnrichedMTEAveDissimilarity',
        'SupervisedCocoaMTE',
    ],
}

# Column layout
model_order = ['llama', 'mistral', 'falcon']  
task_order  = ['qa', 'translation', 'summarization']
task_short  = {'qa':'QA', 'translation':'NMT', 'summarization':'SUM'}  # header labels
model_header = {'llama': 'Llama', 'mistral': 'Mistral', 'falcon': 'Falcon'}  # pretty names for header

#  AGGREGATE TO TASK-LEVEL MEANS 
df_sum = df.copy()
df_sum['task'] = df_sum['dataset'].map(task_map)

agg = (df_sum
       .dropna(subset=['task'])
       .groupby(['model', 'method', 'task'], as_index=False)['score']
       .mean())

# Wide table: rows = method, cols = (model, task)
wide = agg.pivot_table(index='method', columns=['model', 'task'], values='score')

# Ensure all desired columns exist (even if NaN), in the set order
cols = [(m, t) for m in model_order for t in task_order]
wide = wide.reindex(columns=pd.MultiIndex.from_tuples(cols, names=['model', 'task']))

method_order = [m for _, block in method_blocks.items() for m in block]
wanted_methods = [m for m in method_order if m in wide.index]
others = [m for m in wide.index if m not in method_order]  # keep any extras at end (or drop if not wanted)
wide = wide.reindex(wanted_methods + others)

best = pd.DataFrame(False, index=wide.index, columns=wide.columns)
second = pd.DataFrame(False, index=wide.index, columns=wide.columns)

for col in wide.columns:
    s = wide[col].dropna()
    if s.empty:
        continue
    uniq = np.sort(s.unique())[::-1]  # larger is better
    top = uniq[0]
    best.loc[s.index[s == top], col] = True
    if len(uniq) > 1:
        second_val = uniq[1]
        second.loc[s.index[s == second_val], col] = True

def fmt_cell(val, is_best=False, is_second=False):
    if pd.isna(val):
        txt = "--"
    else:
        txt = f"{val:.3f}"  # three decimals
    if is_best:
        return r"\textbf{" + txt + "}"
    if is_second:
        return r"\underline{" + txt + "}"
    return txt

header = r"""
\begin{table*}[th!]
\centering
\renewcommand{\arraystretch}{1.2}
\scalebox{0.85}{
\begin{tabular}{lccccccccc}
\bottomrule
\textbf{Metric} & \multicolumn{3}{c}{\textbf{Llama}} & \multicolumn{3}{c}{\textbf{Mistral}} & \multicolumn{3}{c}{\textbf{Falcon}} \\  
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}  \\
\midrule
""".strip("\n")

end_txt = r"""
\bottomrule
\end{tabular}}
\caption{Results for Evaluated Sequence - Greedy Sample: Mean PRR across datasets for each task. The best performing method is in bold, and the second-best is underscored. Arrows indicate improvement in CoCoA over the base version.}
\label{tab:best_sample_results}
\end{table*}
""".strip("\n")

lines = [header]

def method_display_name(key: str) -> str:
    return pretty_method_names.get(key, key)

# loop through blocks in declared order; insert \midrule between blocks
first_block_emitted = False
for block_name, block_methods in method_blocks.items():
    present = [m for m in block_methods if m in wide.index]
    if not present:
        continue

    if first_block_emitted:
        lines.append(r"\midrule")
    first_block_emitted = True

    for m in present:
        row = [method_display_name(m)]
        for model in model_order:
            for task in task_order:
                col = (model, task)
                v = wide.loc[m, col] if col in wide.columns else np.nan
                is_b = bool(best.loc[m, col]) if (m in best.index and col in best.columns) else False
                is_s = bool(second.loc[m, col]) if (m in second.index and col in second.columns) else False
                row.append(fmt_cell(v, is_b, is_s))
        lines.append(" & ".join(row) + r" \\")

other_methods_present = [m for m in wide.index if m not in method_order]
if other_methods_present:
    lines.append(r"\midrule")
    for m in other_methods_present:
        row = [method_display_name(m)]
        for model in model_order:
            for task in task_order:
                col = (model, task)
                v = wide.loc[m, col] if col in wide.columns else np.nan
                is_b = bool(best.loc[m, col]) if (m in best.index and col in best.columns) else False
                is_s = bool(second.loc[m, col]) if (m in second.index and col in second.columns) else False
                row.append(fmt_cell(v, is_b, is_s))
        lines.append(" & ".join(row) + r" \\")

lines.append(end_txt)

latex_summary_table = "\n".join(lines)

# ---- WRITE OUT ----
Path("tables").mkdir(parents=True, exist_ok=True)
with open("tables/greedy_summary_table.tex", "w") as f:
    f.write(latex_summary_table)

print(latex_summary_table)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
df = pd.read_csv('./results_csv/mbr_full_results.csv')

task_map = {
    'xsum': 'summarization',
    'wmt14_fren': 'translation',
    'wmt19_deen': 'translation',
    'coqa': 'qa',
    'triviaqa': 'qa',
    'mmlu': 'qa',
    'gsm8k': 'qa',
}

method_blocks = { 
    'general_baselines': [
        'MonteCarloSequenceEntropy',
        'MonteCarloNormalizedSequenceEntropy',
        'SemanticEntropy',
        'DegMat_NLI_score_entail',
        'EigValLaplacian_NLI_score_entail',
        'SAR_t0.001',
        'PTrueMbrSample',
        'MbrAveDissimilarity',
    ],
    'msp': [
        'MbrSampledMaximumSequenceProbability',
        'MbrSemanticEnrichedMaxprobAveDissimilarity',
    ],
    'ppl': [
        'MbrSampledPerplexity',
        'MbrSemanticEnrichedPPLAveDissimilarity',
    ],
    'mte': [
        'MbrSampledMeanTokenEntropy',
        'MbrSemanticEnrichedMTEAveDissimilarity',
    ],
}


pretty_method_names = {
            'MonteCarloSequenceEntropy': 'MCSE',
        'MonteCarloNormalizedSequenceEntropy' : 'MCNSE',
        'SemanticEntropy':'Semantic Entropy',
        'DegMat_NLI_score_entail':'DegMat',
        'EigValLaplacian_NLI_score_entail':'EigValLaplacian',
        'SAR_t0.001':'SAR',
        'PTrueMbrSample': 'P(True)',
        'MbrAveDissimilarity': 'Consistency Light',
        'SupervisedCocoa': 'Consistency',
                'MbrSampledMaximumSequenceProbability': ' MSP',
        'MbrSemanticEnrichedMaxprobAveDissimilarity': '$\text{CoCoA}_{MSP}$',
        'SupervisedCocoaMSP': '$\text{CoCoA}_{MSP}$ Light',
            'MbrSampledPerplexity': ' PPL',
        'MbrSemanticEnrichedPPLAveDissimilarity': '$\text{CoCoA}_{PPL}$',
        'SupervisedCocoaPPL': '$\text{CoCoA}_{PPL}$ Light',
        'MbrSampledMeanTokenEntropy': ' MTE',
        'MbrSemanticEnrichedMTEAveDissimilarity': '$\text{CoCoA}_{MTE}$',
        'SupervisedCocoaMTE': '$\text{CoCoA}_{MTE}$ Light',

}


# Column layout
model_order = ['llama', 'mistral', 'falcon']  # match values in df['model']
task_order  = ['qa', 'translation', 'summarization']
task_short  = {'qa':'QA', 'translation':'NMT', 'summarization':'SUM'}  # header labels
model_header = {'llama': 'Llama', 'mistral': 'Mistral', 'falcon': 'Falcon'}  # pretty names for header

# ---- AGGREGATE TO TASK-LEVEL MEANS ----
df_sum = df.copy()
df_sum['task'] = df_sum['dataset'].map(task_map)

agg = (df_sum
       .dropna(subset=['task'])
       .groupby(['model', 'method', 'task'], as_index=False)['score']
       .mean())

# Wide table: rows = method, cols = (model, task)
wide = agg.pivot_table(index='method', columns=['model', 'task'], values='score')

# Ensure all desired columns exist (even if NaN), in the set order
cols = [(m, t) for m in model_order for t in task_order]
wide = wide.reindex(columns=pd.MultiIndex.from_tuples(cols, names=['model', 'task']))

# ---- ENFORCE METHOD ORDER BY BLOCKS ----
method_order = [m for _, block in method_blocks.items() for m in block]
wanted_methods = [m for m in method_order if m in wide.index]
others = [m for m in wide.index if m not in method_order]  # keep any extras at end (or drop if not wanted)
wide = wide.reindex(wanted_methods + others)

# ---- BEST / SECOND MASKS PER COLUMN ----
best = pd.DataFrame(False, index=wide.index, columns=wide.columns)
second = pd.DataFrame(False, index=wide.index, columns=wide.columns)

for col in wide.columns:
    s = wide[col].dropna()
    if s.empty:
        continue
    uniq = np.sort(s.unique())[::-1]  # larger is better
    top = uniq[0]
    best.loc[s.index[s == top], col] = True
    if len(uniq) > 1:
        second_val = uniq[1]
        second.loc[s.index[s == second_val], col] = True

# ---- FORMATTER ----
def fmt_cell(val, is_best=False, is_second=False):
    if pd.isna(val):
        txt = "--"
    else:
        txt = f"{val:.3f}"  # three decimals
    if is_best:
        return r"\textbf{" + txt + "}"
    if is_second:
        return r"\underline{" + txt + "}"
    return txt

# ---- LATEX HEADER / FOOTER ----
header = r"""
\begin{table*}[th!]
\centering
\renewcommand{\arraystretch}{1.2}
\scalebox{0.85}{
\begin{tabular}{lccccccccc}
\bottomrule
\textbf{Metric} & \multicolumn{3}{c}{\textbf{Llama}} & \multicolumn{3}{c}{\textbf{Mistral}} & \multicolumn{3}{c}{\textbf{Falcon}} \\  
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}  \\
\midrule
""".strip("\n")

end_txt = r"""
\bottomrule
\end{tabular}}
\caption{Results for Evaluated Sequence - MBR Sample: Mean PRR across datasets for each task. The best performing method is in bold, and the second-best is underscored. Arrows indicate improvement in CoCoA over the base version.}
\label{tab:best_sample_results}
\end{table*}
""".strip("\n")

lines = [header]

def method_display_name(key: str) -> str:
    return pretty_method_names.get(key, key)

first_block_emitted = False
for block_name, block_methods in method_blocks.items():
    present = [m for m in block_methods if m in wide.index]
    if not present:
        continue

    if first_block_emitted:
        lines.append(r"\midrule")
    first_block_emitted = True

    for m in present:
        row = [method_display_name(m)]
        for model in model_order:
            for task in task_order:
                col = (model, task)
                v = wide.loc[m, col] if col in wide.columns else np.nan
                is_b = bool(best.loc[m, col]) if (m in best.index and col in best.columns) else False
                is_s = bool(second.loc[m, col]) if (m in second.index and col in second.columns) else False
                row.append(fmt_cell(v, is_b, is_s))
        lines.append(" & ".join(row) + r" \\")

other_methods_present = [m for m in wide.index if m not in method_order]
if other_methods_present:
    lines.append(r"\midrule")
    for m in other_methods_present:
        row = [method_display_name(m)]
        for model in model_order:
            for task in task_order:
                col = (model, task)
                v = wide.loc[m, col] if col in wide.columns else np.nan
                is_b = bool(best.loc[m, col]) if (m in best.index and col in best.columns) else False
                is_s = bool(second.loc[m, col]) if (m in second.index and col in second.columns) else False
                row.append(fmt_cell(v, is_b, is_s))
        lines.append(" & ".join(row) + r" \\")

lines.append(end_txt)

latex_summary_table = "\n".join(lines)

# ---- WRITE OUT ----
Path("tables").mkdir(parents=True, exist_ok=True)
with open("tables/mbr_summary_table.tex", "w") as f:
    f.write(latex_summary_table)

print(latex_summary_table)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

df = pd.read_csv('./results_csv/sample_full_results.csv')

task_map = {
    'xsum': 'summarization',
    'wmt14_fren': 'translation',
    'wmt19_deen': 'translation',
    'coqa': 'qa',
    'triviaqa': 'qa',
    'mmlu': 'qa',
    'gsm8k': 'qa',
}



# --- desired orders ---
dataset_order = ['xsum','wmt14_fren','wmt19_deen','coqa','triviaqa','mmlu','gsm8k']

method_blocks = { 
    'general_baselines': [
        'MonteCarloSequenceEntropy',
        'MonteCarloNormalizedSequenceEntropy',
        'SemanticEntropy',
        'DegMat_NLI_score_entail',
        'EigValLaplacian_NLI_score_entail',
        'SAR_t0.001',
        'PTrueBestSample',
        'BestAveDissimilarity',
    ],
    'msp': [
        'BestSampledMaximumSequenceProbability',
        'BestSemanticEnrichedMaxprobAveDissimilarity',
    ],
    'ppl': [
        'BestSampledPerplexity',
        'BestSemanticEnrichedPPLAveDissimilarity',
    ],
    'mte': [
        'BestSampledMeanTokenEntropy',
        'BestSemanticEnrichedMTEAveDissimilarity',
    ],
}

pretty_method_names = {
            'MonteCarloSequenceEntropy': 'MCSE',
        'MonteCarloNormalizedSequenceEntropy' : 'MCNSE',
        'SemanticEntropy':'Semantic Entropy',
        'DegMat_NLI_score_entail':'DegMat',
        'EigValLaplacian_NLI_score_entail':'EigValLaplacian',
        'SAR_t0.001':'SAR',
        'PTrueBestSample': 'P(True)',
        'BestAveDissimilarity': 'Consistency Light',
        'SupervisedCocoa': 'Consistency',
                'BestSampledMaximumSequenceProbability': ' MSP',
        'BestSemanticEnrichedMaxprobAveDissimilarity': '$\text{CoCoA}_{MSP}$',
        'SupervisedCocoaMSP': '$\text{CoCoA}_{MSP}$ Light',
            'BestSampledPerplexity': ' PPL',
        'BestSemanticEnrichedPPLAveDissimilarity': '$\text{CoCoA}_{PPL}$',
        'SupervisedCocoaPPL': '$\text{CoCoA}_{PPL}$ Light',
        'BestSampledMeanTokenEntropy': ' MTE',
        'BestSemanticEnrichedMTEAveDissimilarity': '$\text{CoCoA}_{MTE}$',
        'SupervisedCocoaMTE': '$\text{CoCoA}_{MTE}$ Light',

}




# Column layout
model_order = ['llama', 'mistral', 'falcon']  
task_order  = ['qa', 'translation', 'summarization']
task_short  = {'qa':'QA', 'translation':'NMT', 'summarization':'SUM'}  # header labels
model_header = {'llama': 'Llama', 'mistral': 'Mistral', 'falcon': 'Falcon'}  # pretty names for header

#  AGGREGATE TO TASK-LEVEL MEANS 
df_sum = df.copy()
df_sum['task'] = df_sum['dataset'].map(task_map)

agg = (df_sum
       .dropna(subset=['task'])
       .groupby(['model', 'method', 'task'], as_index=False)['score']
       .mean())

wide = agg.pivot_table(index='method', columns=['model', 'task'], values='score')

cols = [(m, t) for m in model_order for t in task_order]
wide = wide.reindex(columns=pd.MultiIndex.from_tuples(cols, names=['model', 'task']))

method_order = [m for _, block in method_blocks.items() for m in block]
wanted_methods = [m for m in method_order if m in wide.index]
others = [m for m in wide.index if m not in method_order]  # keep any extras at end (or drop if not wanted)
wide = wide.reindex(wanted_methods + others)

best = pd.DataFrame(False, index=wide.index, columns=wide.columns)
second = pd.DataFrame(False, index=wide.index, columns=wide.columns)

for col in wide.columns:
    s = wide[col].dropna()
    if s.empty:
        continue
    uniq = np.sort(s.unique())[::-1]  # larger is better
    top = uniq[0]
    best.loc[s.index[s == top], col] = True
    if len(uniq) > 1:
        second_val = uniq[1]
        second.loc[s.index[s == second_val], col] = True

def fmt_cell(val, is_best=False, is_second=False):
    if pd.isna(val):
        txt = "--"
    else:
        txt = f"{val:.3f}"  # three decimals
    if is_best:
        return r"\textbf{" + txt + "}"
    if is_second:
        return r"\underline{" + txt + "}"
    return txt

header = r"""
\begin{table*}[th!]
\centering
\renewcommand{\arraystretch}{1.2}
\scalebox{0.85}{
\begin{tabular}{lccccccccc}
\bottomrule
\textbf{Metric} & \multicolumn{3}{c}{\textbf{Llama}} & \multicolumn{3}{c}{\textbf{Mistral}} & \multicolumn{3}{c}{\textbf{Falcon}} \\  
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}
& \textbf{QA} & \textbf{NMT} & \textbf{SUM}  \\
\midrule
""".strip("\n")

end_txt = r"""
\bottomrule
\end{tabular}}
\caption{Results for Evaluated Sequence - Most Probable Sample: Mean PRR across datasets for each task. The best performing method is in bold, and the second-best is underscored. Arrows indicate improvement in CoCoA over the base version.}
\label{tab:best_sample_results}
\end{table*}
""".strip("\n")

lines = [header]

def method_display_name(key: str) -> str:
    return pretty_method_names.get(key, key)

first_block_emitted = False
for block_name, block_methods in method_blocks.items():
    present = [m for m in block_methods if m in wide.index]
    if not present:
        continue

    if first_block_emitted:
        lines.append(r"\midrule")
    first_block_emitted = True

    for m in present:
        row = [method_display_name(m)]
        for model in model_order:
            for task in task_order:
                col = (model, task)
                v = wide.loc[m, col] if col in wide.columns else np.nan
                is_b = bool(best.loc[m, col]) if (m in best.index and col in best.columns) else False
                is_s = bool(second.loc[m, col]) if (m in second.index and col in second.columns) else False
                row.append(fmt_cell(v, is_b, is_s))
        lines.append(" & ".join(row) + r" \\")

other_methods_present = [m for m in wide.index if m not in method_order]
if other_methods_present:
    lines.append(r"\midrule")
    for m in other_methods_present:
        row = [method_display_name(m)]
        for model in model_order:
            for task in task_order:
                col = (model, task)
                v = wide.loc[m, col] if col in wide.columns else np.nan
                is_b = bool(best.loc[m, col]) if (m in best.index and col in best.columns) else False
                is_s = bool(second.loc[m, col]) if (m in second.index and col in second.columns) else False
                row.append(fmt_cell(v, is_b, is_s))
        lines.append(" & ".join(row) + r" \\")

lines.append(end_txt)

latex_summary_table = "\n".join(lines)

Path("tables").mkdir(parents=True, exist_ok=True)
with open("tables/sample_summary_table.tex", "w") as f:
    f.write(latex_summary_table)

print(latex_summary_table)
