In [4]:
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import krippendorff
from sklearn.metrics import cohen_kappa_score
import itertools
import random

In [None]:
# Placeholder: replace with the location of the LLMJudge test judgments file.
# The file is read below with pandas as space-separated, header-less rows of
# qid / Q0 / docid / score.
llmjudge_test_path = "#path to the test set"

In [2]:
# Read the LLMJudge test judgments (space-separated, no header row) and
# clamp every score into the 0..3 range.
llmjudge_test = pd.read_csv(
    llmjudge_test_path,
    sep=" ",
    header=None,
    names=['qid', 'Q0', 'docid', 'score'],
)
llmjudge_test['score'] = llmjudge_test['score'].clip(lower=0, upper=3)

In [3]:
# Start the combined results table from the test judgments, minus the 'Q0' column.
all_results = llmjudge_test.drop(columns=['Q0'])

In [5]:
# Models: each name is both the judgments file stem (judgments/<name>.txt)
# and the column name its scores get in all_results.
MODELS = ['blender1', 'blender2', 'blender3']

# Remove model columns left over from an earlier run of this cell, so that
# re-running it does not merge duplicate 'blenderN' columns into all_results.
all_results = all_results.drop(columns=MODELS, errors='ignore')

for model in MODELS:
    model_df = pd.read_csv(f"judgments/{model}.txt", sep=" ", header=None, names=['qid', 'Q0', 'docid', 'score'])
    # Clamp scores into the 0..3 range.
    model_df['score'] = model_df['score'].clip(lower=0, upper=3)
    # Keep only the join keys plus the score, stored under the model's name,
    # so the merge needs no suffix/rename bookkeeping.
    model_df = model_df.drop(columns=['Q0']).rename(columns={'score': model})
    # Inner join (pandas default): (qid, docid) pairs absent from this model's
    # file are dropped from all_results.
    all_results = pd.merge(all_results, model_df, on=['qid', 'docid'])

In [None]:
# Display the merged table: qid, docid, the test score and one score column per model.
all_results

In [7]:
# Keep only the per-model score columns (drop the join keys and the test score).
all_models_results = all_results.drop(columns=['qid', 'docid', 'score'])

In [8]:
# Dedicated, seeded generator for random tie-breaking. Seeding once here (not
# on every call) keeps notebook runs reproducible while still letting
# successive ties pick different positions, and leaves the global `random`
# state untouched.
_MAJORITY_VOTE_RNG = random.Random(42)


def majority_vote(t, no_major="random"):
    """
    Return the majority element of a tuple of three integers.

    If at least two of the three values agree, that value is returned.
    Otherwise (all three differ) the tie is resolved according to `no_major`:

    - "random": a pseudo-random element of `t` (seeded, reproducible sequence)
    - "max":    the largest element
    - "min":    the smallest element
    - "avg":    the rounded mean of the three elements

    Raises:
        ValueError: if there is no majority and `no_major` is not one of the
            strategies listed above.
    """
    if t[0] == t[1] or t[0] == t[2]:
        return t[0]
    if t[1] == t[2]:
        return t[1]

    # No two values agree: fall back to the requested tie-break strategy.
    if no_major == "random":
        return _MAJORITY_VOTE_RNG.choice(t)
    if no_major == "max":
        return max(t)
    if no_major == "min":
        return min(t)
    if no_major == "avg":
        return round(np.mean(t))
    raise ValueError(
        f"Unknown no_major strategy {no_major!r}; expected 'random', 'max', 'min' or 'avg'"
    )

In [9]:
def avg_voting(t):
    """Return the mean of the scores in `t`, rounded with the built-in `round`."""
    return round(np.mean(t))

In [10]:
def get_cohen_kappa(test_scores, submission_scores):
    """Cohen's kappa between the two score sequences, rounded to 4 decimals."""
    kappa = cohen_kappa_score(test_scores, submission_scores)
    return round(kappa, 4)

In [11]:
def get_krippendorff_alpha(test_scores, submission_scores):
    """
    Krippendorff's alpha between the two score sequences, rounded to 4 decimals.

    The two sequences are passed as the two rows of the reliability data,
    using the ordinal level of measurement over the value domain 0..3.
    """
    alpha = krippendorff.alpha(
        reliability_data=[test_scores, submission_scores],
        value_domain=[0, 1, 2, 3],
        level_of_measurement='ordinal',
    )
    return round(alpha, 4)

In [12]:
def write_blender_scores(filename, scores):
    """
    Write blended scores to submissions/blenders/selected/<filename>.txt.

    One line is written per row as "<qid> 0 <docid> <score>", pairing the
    'qid' and 'docid' columns of the notebook-level `all_results` frame with
    `scores` in order. As with `zip`, output stops at the shorter of the two.

    Args:
        filename: output file name without the ".txt" extension.
        scores: iterable of scores aligned with the rows of `all_results`.
    """
    out_dir = "submissions/blenders/selected"
    # Create the output directory on first use instead of failing on open().
    os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the file even if a write raises.
    with open(f"{out_dir}/{filename}.txt", 'w') as blender_scores_fl:
        for qid, docid, score in zip(all_results['qid'], all_results['docid'], scores):
            blender_scores_fl.write(f"{qid} 0 {docid} {score}\n")

In [19]:
# Label for the kind of blender being evaluated; used as the prefix of the
# output file name written by the blending loop.
# NOTE(review): "prompt" looks like the alternative value — confirm.
blender_type = "llm" # prompt
# Label for the aggregation strategy; used as the suffix of the output file name.
# NOTE(review): this is only a label — the aggregation function itself is
# chosen inside the blending loop and has to be kept in sync by hand.
# MV: Majority Voting + (Avg, Rnd, Min, Max)
# Avg
aggregator_type = "Avg"

In [None]:
# Iterate over all combinations of 3 columns.
# The combinations are materialised so tqdm can show a total and so we know
# up front whether more than one blend is going to be written.
model_triples = list(itertools.combinations(all_models_results.columns, 3))

for cols in tqdm(model_triples):
    # Aggregate the three models' scores row by row.
    # Alternatives to avg_voting(scores):
    #   majority_vote(scores, no_major="random"), majority_vote(scores, no_major="avg")
    blender_scores = [
        avg_voting(scores)
        for scores in zip(all_models_results[cols[0]], all_models_results[cols[1]], all_models_results[cols[2]])
    ]

    # Agreement of the blended scores with the test judgments.
    blender_kappa = get_cohen_kappa(all_results['score'], blender_scores)
    blender_alpha = get_krippendorff_alpha(all_results['score'], blender_scores)

    # With a single triple the file name is unchanged; with several, the model
    # names are appended so later blends do not overwrite earlier ones.
    output_name = f"{blender_type}_{aggregator_type}"
    if len(model_triples) > 1:
        output_name += "_" + "-".join(map(str, cols))
    write_blender_scores(output_name, blender_scores)
    print(blender_kappa, blender_alpha)