In [198]:
'''
We are also interested in how often our Thompson sampling algorithm chooses the best reference model!
    - Potential message of Thompson sampling: on its own, it is not game-changer in performance,
    BUT the sampling algorithm's outputed posterior hyperparameters give us information about best reference model.
    So instead of needing to run 7x reference models' DPO, we can do one Thompson sampling run instead.
    WHILE getting good enough results compared to baseline.
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy.stats import kendalltau

REF_MODELS = [
    "Yi-1.5-9B-Chat",
    "Meta-Llama-3.1-8B-Instruct",
    "Phi-3-medium-128k-instruct",
    "Mistral-7B-Instruct-v0.3",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen3-4B-Instruct-2507"
    ]

# Overall Performance Results

In [111]:
# create a dataframe to store performance results - look at final, argmax, and corr. highest val.
baseline_logs = pd.DataFrame(
    data=None, columns=["method", "dataset", "reference", "piv", "seed",
                        "final_test_acc", "best_test_acc", "amax_val_test_acc"])

# get our baseline DPO results
baseline_files = sorted([f for f in os.listdir("results") if "baseline-DPO" in f])

# go thru all of our baseline-DPO files
for file in baseline_files:
    
    # load in this set of results
    df = pd.read_csv(f"results/{file}")
    
    # to prevent accidents
    file = file.replace("ultrafeedback_binarized", "ultrafeedback-binarized")
    
    # unpack the filename
    method_str, dataset_str, _, reference, seed_str = file.split("_")
    dataset = dataset_str.split("=")[1]
    seed = int(seed_str.split("=")[1].split(".csv")[0])
    
    # get the metrics of interest
    final_test_acc = df.test_acc.values[-1]
    best_test_acc = df.test_acc.max()
    amax_val_test_acc = df.test_acc[df.val_acc.argmax()]
    
    # assemble our row and add to our dataframe
    row = ["dpo", dataset, reference, np.nan, seed, final_test_acc, best_test_acc, amax_val_test_acc]
    baseline_logs.loc[len(baseline_logs.index)] = row

In [116]:
# show our mean results
baseline_logs.groupby(["method", "dataset", "reference"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,piv,seed,final_test_acc,best_test_acc,amax_val_test_acc
method,dataset,reference,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dpo,PKU-SafeRLHF-30K-standard,Meta-Llama-3.1-8B-Instruct,,1.0,0.570333,0.570333,0.566667
dpo,PKU-SafeRLHF-30K-standard,Mistral-7B-Instruct-v0.3,,1.0,0.572333,0.572333,0.562
dpo,PKU-SafeRLHF-30K-standard,Phi-3-medium-128k-instruct,,1.0,0.528333,0.535,0.531333
dpo,PKU-SafeRLHF-30K-standard,Qwen2.5-0.5B-Instruct,,1.0,0.581667,0.583333,0.583333
dpo,PKU-SafeRLHF-30K-standard,Qwen2.5-1.5B-Instruct,,1.0,0.567667,0.568,0.566333
dpo,PKU-SafeRLHF-30K-standard,Qwen3-4B-Instruct-2507,,1.0,0.554,0.561,0.549333
dpo,PKU-SafeRLHF-30K-standard,Yi-1.5-9B-Chat,,1.0,0.561333,0.563333,0.554667
dpo,ultrafeedback-binarized,Meta-Llama-3.1-8B-Instruct,,1.0,0.681,0.681667,0.681
dpo,ultrafeedback-binarized,Mistral-7B-Instruct-v0.3,,1.0,0.685,0.685,0.668667
dpo,ultrafeedback-binarized,Phi-3-medium-128k-instruct,,1.0,0.657,0.658667,0.649333


In [170]:
# create a dataframe to store performance results - look at final, argmax, and corr. highest val.
thompson_logs = pd.DataFrame(
    data=None, columns=["method", "dataset", "reference", "piv", "seed",
                        "final_test_acc", "best_test_acc", "amax_val_test_acc", "model_ordering"])

# get our Thompson sampling files
thompson_files = sorted([f for f in os.listdir("results") if "Thompson" in f])

# go thru all of our baseline-DPO files
for file in thompson_files:
    
    # load in this set of results
    df = pd.read_csv(f"results/{file}")
    
    # to prevent accidents
    file = file.replace("ultrafeedback_binarized", "ultrafeedback-binarized")
    
    # unpack the filename
    dataset_str, piv_str, seed_str = file.split("_dataset")[1].split("_")
    dataset = dataset_str.split("=")[1]
    piv = float(piv_str.split("=")[1])
    seed = int(seed_str.split("=")[1].split(".csv")[0])
    
    # get the metrics of interest
    final_test_acc = df.test_acc.values[-1]
    best_test_acc = df.test_acc.max()
    amax_val_test_acc = df.test_acc[df.val_acc.argmax()]
    
    # also look at the posterior means of our bandit "arms"
    a = df[[f"a{i}" for i in range(7)]].iloc[-1]
    b = df[[f"b{i}" for i in range(7)]].iloc[-1]
    mus = a.values / (a.values + b.values)
    
    
    # assemble our row and add to our dataframe
    row = ["thompson", dataset, np.nan, piv, seed, final_test_acc, best_test_acc, amax_val_test_acc, np.argsort(mus)]
    thompson_logs.loc[len(thompson_logs.index)] = row

In [209]:
# what were the least to best performing DPO reference models?
saferlhf_baseline_orderings = baseline_logs.query("dataset == 'PKU-SafeRLHF-30K-standard'")\
.groupby(["method", "dataset", "reference"]).mean().sort_values(by="final_test_acc").reset_index().reference

# convert to numerical index
converter = {m : i for i, m in enumerate(REF_MODELS)}
saferlhf_baseline_orderings_idxs = np.array([converter[m] for m in saferlhf_baseline_orderings.values])

In [213]:
# what were the least to best performing DPO reference models?
ultra_baseline_orderings = baseline_logs.query("dataset == 'ultrafeedback-binarized'")\
.groupby(["method", "dataset", "reference"]).mean().sort_values(by="final_test_acc").reset_index().reference

# convert to numerical index
converter = {m : i for i, m in enumerate(REF_MODELS)}
ultra_baseline_orderings_idxs = np.array([converter[m] for m in ultra_baseline_orderings.values])

In [227]:
[kendalltau(
    np.tile(ultra_baseline_orderings_idxs, (6, 1))[i,:], 
    np.array(list(thompson_logs.query("dataset == 'ultrafeedback-binarized'").model_ordering.values))[i,:]
) for i in range(6)]

[SignificanceResult(statistic=0.4285714285714286, pvalue=0.2388888888888889),
 SignificanceResult(statistic=-0.04761904761904762, pvalue=1.0),
 SignificanceResult(statistic=0.04761904761904762, pvalue=1.0),
 SignificanceResult(statistic=0.14285714285714288, pvalue=0.7726190476190476),
 SignificanceResult(statistic=0.7142857142857143, pvalue=0.03015873015873016),
 SignificanceResult(statistic=-0.23809523809523814, pvalue=0.5619047619047619)]

In [228]:
[kendalltau(
    np.tile(saferlhf_baseline_orderings_idxs, (6, 1))[i,:], 
    np.array(list(thompson_logs.query("dataset == 'PKU-SafeRLHF-30K-standard'").model_ordering.values))[i,:]
) for i in range(6)]

[SignificanceResult(statistic=-0.33333333333333337, pvalue=0.3813492063492063),
 SignificanceResult(statistic=0.33333333333333337, pvalue=0.3813492063492063),
 SignificanceResult(statistic=-0.23809523809523814, pvalue=0.5619047619047619),
 SignificanceResult(statistic=-0.14285714285714288, pvalue=0.7726190476190476),
 SignificanceResult(statistic=0.33333333333333337, pvalue=0.3813492063492063),
 SignificanceResult(statistic=-0.14285714285714288, pvalue=0.7726190476190476)]

In [208]:
# get the raw metrics (note that )
thompson_logs

Unnamed: 0,method,dataset,reference,piv,seed,final_test_acc,best_test_acc,amax_val_test_acc,model_ordering
0,thompson,PKU-SafeRLHF-30K-standard,,10.0,0,0.587,0.587,0.587,"[2, 3, 6, 5, 4, 1, 0]"
1,thompson,PKU-SafeRLHF-30K-standard,,10.0,1,0.536,0.536,0.511,"[6, 2, 0, 5, 1, 3, 4]"
2,thompson,PKU-SafeRLHF-30K-standard,,10.0,2,0.547,0.55,0.547,"[2, 1, 6, 5, 3, 0, 4]"
3,thompson,PKU-SafeRLHF-30K-standard,,5.0,0,0.559,0.564,0.564,"[1, 0, 5, 6, 3, 2, 4]"
4,thompson,PKU-SafeRLHF-30K-standard,,5.0,1,0.514,0.536,0.512,"[2, 6, 1, 3, 5, 0, 4]"
5,thompson,PKU-SafeRLHF-30K-standard,,5.0,2,0.57,0.57,0.557,"[4, 5, 3, 1, 6, 0, 2]"
6,thompson,ultrafeedback-binarized,,10.0,0,0.654,0.66,0.66,"[4, 3, 2, 6, 1, 5, 0]"
7,thompson,ultrafeedback-binarized,,10.0,1,0.691,0.715,0.694,"[4, 1, 3, 5, 6, 0, 2]"
8,thompson,ultrafeedback-binarized,,10.0,2,0.679,0.683,0.679,"[4, 3, 6, 2, 1, 0, 5]"
9,thompson,ultrafeedback-binarized,,5.0,0,0.65,0.65,0.64,"[5, 3, 0, 6, 1, 2, 4]"
