In [1]:
import json 
import pandas as pd 
from glob import glob
from pathlib import Path
import numpy as np

import sys
sys.path.append('/fs/clip-political/rupak/hf-fewshot')

In [2]:
from hf_fewshot.prompting_utils import load_jsonlines

In [3]:
label_dict = {
    'A' : '1', 
    'B' : '2'
}

Choose a class of models to analyze their outputs - 8B or 70B. 

In [35]:
result_files = sorted(glob("output/*70B.jsonl"))
result_files

['output/draw-0-results_70B.jsonl',
 'output/draw-1-results_70B.jsonl',
 'output/draw-2-results_70B.jsonl']

In [36]:
data = load_jsonlines(result_files[2])

In [37]:
mean_1 = [x['preferences']['1'] for x in data]
mean_2 = [x['preferences']['2'] for x in data]

In [38]:
np.mean(mean_1), np.mean(mean_2)

(0.5663463155691262, 0.4336536815066035)

In [39]:
np.mean(mean_1) - 0.50, np.mean(mean_2) - 0.50 

(0.06634631556912618, -0.0663463184933965)

In [34]:
llama 8B biases - 0.14, 0.024, 0.1 
llama 70B biases - 0.06, 0.14, 0.08

SyntaxError: invalid decimal literal (1365568787.py, line 1)

Compile the outputs of the models into a single dataframe, track their source. 

In [73]:
df_list = []

for file in result_files:
    results = load_jsonlines(file)
    
    results = pd.DataFrame(results)
    results['source'] = Path(file).name.split("-")[1]

    results["label"] = results["label"].apply(lambda x: label_dict[x])
    results["correct"] = results["label"] == results["output"]
    df_list.append({
        # keep just the name of the file
        "file": Path(file).name,
        "results_df": results
    })

In [74]:
def get_default_score(df, endswith="_0"): 
    # take a subset of the df where the pair_id ends with _0 
    df_0 = df[df["pair_id"].str.endswith(endswith)]
    print(df_0["correct"].mean()) 

bias = 0.06
def debias_dict(preferences, bias):
    # debias the preferences by subtracting the bias from the value 
    preferences['1'] = preferences['1'] - bias
    preferences['2'] = preferences['2'] + bias
    return preferences


def get_debiased_df(df_temp, bias):
    # apply debias_dict to the preferences column for each row in df 
    df= df_temp.copy(deep=True)

    df["preferences_debiased"] = df["preferences"].apply(lambda x: debias_dict(x, bias))
    df['output_debiased'] = df["preferences_debiased"].apply(lambda x: '1' if x['1'] > x['2'] else '2')
    df['correct_debiased'] = df['label'] == df['output_debiased']

    return df 
    
def get_default_score_debiased(df, endswith="_0"): 
    # take a subset of the df where the pair_id ends with _0 
    df_temp = df[df["pair_id"].str.endswith(endswith)]
    print(df_temp["correct_debiased"].mean())

In [75]:
df0 = df_list[0]["results_df"]
df1 = df_list[1]["results_df"]
df2 = df_list[2]["results_df"]

In [76]:
df0_debiased = get_debiased_df(df0, 0.06)
df1_debiased = get_debiased_df(df1, 0.14)
df2_debiased = get_debiased_df(df2, 0.08)

In [78]:
get_default_score(df0), get_default_score(df1), get_default_score(df2)

0.6319702602230484
0.5966542750929368
0.6282527881040892


(None, None, None)

In [79]:
# combine the dfs
df_combined = pd.concat([df0, df1, df2])

In [80]:
df_combined.head()

Unnamed: 0,pair_id,output,preferences,label,source,correct
0,1047895789_0,2,"{'1': -0.05999847058757041, '2': 1.05999839067...",1,0,False
1,1047895789_1,1,"{'1': 0.9399939203262329, '2': 0.0600060489205...",2,0,False
2,1047895789_2,1,"{'1': 0.9399995231628417, '2': 0.0600004888304...",2,0,False
3,1047895789_3,2,"{'1': -0.056175516163930295, '2': 1.0561755871...",1,0,False
4,1047887035_0,2,"{'1': -0.04201378807425499, '2': 1.04201370239...",1,0,False


In [81]:
df0_debiased.head()

Unnamed: 0,pair_id,output,preferences,label,source,correct,preferences_debiased,output_debiased,correct_debiased
0,1047895789_0,2,"{'1': -0.05999847058757041, '2': 1.05999839067...",1,0,False,"{'1': -0.05999847058757041, '2': 1.05999839067...",2,False
1,1047895789_1,1,"{'1': 0.9399939203262329, '2': 0.0600060489205...",2,0,False,"{'1': 0.9399939203262329, '2': 0.0600060489205...",1,False
2,1047895789_2,1,"{'1': 0.9399995231628417, '2': 0.0600004888304...",2,0,False,"{'1': 0.9399995231628417, '2': 0.0600004888304...",1,False
3,1047895789_3,2,"{'1': -0.056175516163930295, '2': 1.0561755871...",1,0,False,"{'1': -0.056175516163930295, '2': 1.0561755871...",2,False
4,1047887035_0,2,"{'1': -0.04201378807425499, '2': 1.04201370239...",1,0,False,"{'1': -0.04201378807425499, '2': 1.04201370239...",2,False


In [82]:
def get_joint_debiased(df_combined): 
    unique_pairs = list(set([elem.split("_")[0] for elem in df_combined["pair_id"].values])) 

    labels = []
    gold_labels = []
    for pair in unique_pairs: 
        df_temp = df_combined[(df_combined["pair_id"].str.startswith(pair)) & (df_combined["pair_id"].str.endswith("_0"))]
        # find the majority value of the correct_debiased column
        majority = df_temp["output_debiased"].value_counts().idxmax()
        gold_label = df_temp["label"].values[0]

        labels.append(majority)
        gold_labels.append(gold_label)

    return labels, gold_labels

In [70]:
df_combined_debiased = pd.concat([df0_debiased, df1_debiased, df2_debiased])

In [71]:
debiased_labels, debiased_gold_labels =  get_joint_debiased(df_combined_debiased)

In [72]:
print(classification_report(debiased_gold_labels, debiased_labels)) 

              precision    recall  f1-score   support

           1       0.61      0.39      0.47       238
           2       0.62      0.80      0.70       300

    accuracy                           0.62       538
   macro avg       0.61      0.59      0.59       538
weighted avg       0.61      0.62      0.60       538



In [52]:
def preference_sum(df):
    """
    Given a dataframe, add the preferences of all the columns and return the dict 
    """
    preferences = df["preferences"].values
    preference_sum = {'1': 0, '2': 0}
    for pref in preferences: 
        preference_sum['1'] += pref['1']
        preference_sum['2'] += pref['2']
    return preference_sum


def get_augmentation_bias(df_combined): 
    """
    Adds up the scores of the 1st and 4th row, and the second and 3rd row for the 12 preferences of each unique pair 
    @Hauke: This is the debiasing method that we discussed in the meeting earlier. 
    """


    unique_pairs = list(set([elem.split("_")[0] for elem in df_combined["pair_id"].values])) 
         
    labels = []
    gold_labels = []
    # for each unique pair, get the preferences for each of the 3 dataframes
    for elem in unique_pairs: 
        # for an unique pair, there are 12 preferences
        all_preferences = []
        
        df_temp = df_combined[df_combined["pair_id"].str.startswith(elem)].sort_values(by="pair_id")
        # label of the first row is the label 
        gold_label = df_temp.iloc[0]["label"]

        # add up preferences of all rows with pair_id ending with _0 or _3
        df_0_3 = df_temp[df_temp["pair_id"].str.endswith("_0") | df_temp["pair_id"].str.endswith("_3")]
        df_1_2 = df_temp[df_temp["pair_id"].str.endswith("_1") | df_temp["pair_id"].str.endswith("_2")]        

        preference_sum_0_3 = preference_sum(df_0_3)
        preference_sum_1_2 = preference_sum(df_1_2)

        score_1 = preference_sum_0_3['1'] + preference_sum_1_2['2']
        score_2 = preference_sum_0_3['2'] + preference_sum_1_2['1']

        if score_1 > score_2: 
            labels.append('1')
        else:
            labels.append('2')

        gold_labels.append(gold_label)

    return labels, gold_labels

In [53]:
# find the default win rate
get_default_score(df0), get_default_score(df1), get_default_score(df2)

0.6319702602230484
0.5966542750929368
0.6282527881040892


(None, None, None)

In [56]:
labels, gold_labels = get_augmentation_bias(df_combined)

In [57]:
# calculate precision, recall, f1, accuracy
from sklearn.metrics import classification_report

print(classification_report(gold_labels, labels))

              precision    recall  f1-score   support

           1       0.60      0.50      0.54       238
           2       0.65      0.74      0.69       300

    accuracy                           0.63       538
   macro avg       0.62      0.62      0.62       538
weighted avg       0.63      0.63      0.62       538

