In [1]:
import pandas as pd
import glob
import statsmodels.api as sm

In [21]:
# Analysis settings read by the loading cells below.
# `metric` selects which rows of each result file are kept.
metric = "ndcg_cut_10" # alternatives noted by the author: map, ndcg_cut_10
# `result_format` is the file extension matched when globbing result files.
result_format = "ndcgeval" # alternatives noted by the author: treceval, ndcgeval
# `results_from` is the sub-directory of ./results that is scanned.
results_from = "LLMJudge2024"

In [22]:
# Collect every result file with the configured extension from the configured folder.
result_glob = f'./results/{results_from}/*.{result_format}'
files = glob.glob(result_glob)

In [23]:
# Show the matched result-file paths as the cell output.
files

['./results/LLMJudge2024/all.pass.TREMA-rubric0.ndcgeval',
 './results/LLMJudge2024/all.pass.RMITIR-llama70B.ndcgeval',
 './results/LLMJudge2024/all.pass.llmjudge-thomas3.ndcgeval',
 './results/LLMJudge2024/all.pass.llmjudge-simple3.ndcgeval',
 './results/LLMJudge2024/all.pass.NISTRetrieval-instruct0.ndcgeval',
 './results/LLMJudge2024/all.pass.llmjudge-test.ndcgeval',
 './results/LLMJudge2024/all.pass.TREMA-direct.ndcgeval',
 './results/LLMJudge2024/all.pass.Olz-gpt4o.ndcgeval',
 './results/LLMJudge2024/all.pass.TREMA-4prompts.ndcgeval',
 './results/LLMJudge2024/all.pass.Olz-exp.ndcgeval']

In [24]:
# for removed_labler in ['RMITIR-llama70B', 'llmjudge-simple3', 'Olz-exp', 'TREMA-4prompts', 'TREMA-direct']:
#     removed_labler = f'./results/LLMJudge2024/all.pass.{removed_labler}.{result_format}'
#     files.remove(removed_labler)

# Judges left out of this run; only the remaining result files are analysed.
excluded_judges = ['TREMA-rubric0', 'RMITIR-llama70B', 'llmjudge-thomas3', 'Olz-exp',
                   'llmjudge-simple3', 'NISTRetrieval-instruct0', 'TREMA-4prompts']
# Build the paths from the configured folder rather than a hard-coded one, so the
# exclusion keeps matching the glob when `results_from` changes.
excluded_paths = {
    f'./results/{results_from}/all.pass.{judge}.{result_format}'
    for judge in excluded_judges
}
# Filter instead of list.remove(): re-running the cell is a no-op rather than a
# ValueError, and the relative order of the remaining files is unchanged.
files = [path for path in files if path not in excluded_paths]

In [25]:
# Load the per-query scores for `metric` from every remaining result file and tag
# each row with the judge that produced the underlying labels.
results_df_list = []

for infile in files:
    # File names look like 'all.pass.<judge>.<result_format>'. Taking the last path
    # component keeps the parse independent of how deep the results folder is.
    judger = infile.split('/')[-1].split('.')[2]
    raw_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    # Drop the aggregate 'all' rows. Copy so the column assignment below works on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    per_query = raw_df[raw_df['qid'] != 'all'].copy()
    # Metric names may carry trailing whitespace in the files.
    per_query['metric'] = per_query['metric'].str.rstrip()
    result_df = (
        per_query[per_query['metric'] == metric]
        .drop(columns=['metric'])
        .assign(judged_by=judger)
        .astype({'qid': int, 'score': float})
    )
    results_df_list.append(result_df)

results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,judged_by
2,naverloo_fs_RR_duo,2002168,0.6996,llmjudge-test
12,naverloo_fs_RR_duo,2004282,0.6562,llmjudge-test
22,naverloo_fs_RR_duo,2004980,0.3832,llmjudge-test
32,naverloo_fs_RR_duo,2005952,0.5039,llmjudge-test
42,naverloo_fs_RR_duo,2007816,0.9197,llmjudge-test
...,...,...,...,...
9042,naverloo_bm25_splades_RR,3100119,0.9558,Olz-gpt4o
9052,naverloo_bm25_splades_RR,3100235,0.6795,Olz-gpt4o
9062,naverloo_bm25_splades_RR,3100289,0.6726,Olz-gpt4o
9072,naverloo_bm25_splades_RR,3100399,0.4828,Olz-gpt4o


In [26]:
# queries_judged = set(results_dfs['qid'])
# real_queries_judged = [x for x in queries_judged if x < 3000000]
# t5_queries_judged = [x for x in queries_judged if x > 3000000 and x < 3100000]
# gpt4_queries_judged = [x for x in queries_judged if x > 3100000]

# print(len(real_queries_judged))
# print(len(t5_queries_judged))
# print(len(gpt4_queries_judged))

In [27]:
# Tab-separated lookup tables keyed (in the merges that follow) by qid / run_id.
info_paths = ("infos/query_to_info.txt", "infos/doc_to_info.txt", "infos/model_to_info.txt")
qid_to_info, doc_to_info, model_to_info = (pd.read_csv(path, sep='\t') for path in info_paths)

In [28]:
# Attach the query table (on qid) and the model table (on run_id) to every score row.
# The document table is not merged; the original call is kept for reference:
# data = pd.merge(data, doc_to_info, on='qid')
data = (
    results_dfs
    .merge(qid_to_info, on='qid')
    .merge(model_to_info, on='run_id')
)

In [29]:
# These are the features that do not seem to be very useful, relevant, or concrete.
# QL is a binary feature and we have QW, so it should be dropped.
# Remove the features that do not seem very useful, relevant, or concrete.
# QL is binary and QW is already present, so QL goes as well.
data.drop(columns=['QL', 'isGPT4', 'QDR', 'QDS', 'isLLM'], inplace=True)

In [30]:
# Label the query-type codes 0/1/2 while keeping their numeric category order, so
# 'Human' stays the first category (the baseline level in the fitted summaries).
# cat.rename_categories is used instead of Series.replace, whose behaviour on
# categorical data is deprecated in recent pandas releases.
data['QT'] = data['QT'].astype('category').cat.rename_categories({0: 'Human', 1: 'T5', 2: 'GPT4'})

In [31]:
# Labeler: {'TREMA-rubric0', 'RMITIR-llama70B', 'llmjudge-thomas3', 'llmjudge-simple3', 'NISTRetrieval-instruct0', 'llmjudge-test', 
# 'TREMA-direct', 'Olz-gpt4o', 'TREMA-4prompts', 'Olz-exp'}

# Display names for the judges; 'NIST' is the reference level named in the formulas.
judge_labels = {
    'TREMA-rubric0': 'Llama3Rubric',
    'RMITIR-llama70B': 'Llama3RMIT',
    'llmjudge-thomas3': 'GPT4Thomas',
    'llmjudge-simple3': 'GPT4Simple',
    'NISTRetrieval-instruct0': 'Llama3Inst',
    'llmjudge-test': 'NIST',
    'TREMA-direct': 'FlanT5Direct',
    'Olz-gpt4o': 'GPT4oSimple',
    'TREMA-4prompts': 'Llama3Prompts',
    'Olz-exp': 'GPT4oExp',
}
data['judged_by'] = data['judged_by'].replace(judge_labels)

In [32]:
# Formula for the main model, written one term per line. Adjacent string literals
# are concatenated, so the resulting formula text is unchanged.
mixed_model = (
    "score ~ QW + APL + MN"
    " + C(judged_by, Treatment(reference='NIST'))"
    " +  QT"
    " + C(ST, Treatment(reference='Other'))"
    " + QT * C(ST, Treatment(reference='Other'))"
    " + C(ST, Treatment(reference='Other')) * C(judged_by, Treatment(reference='NIST'))"
    " + QT * C(judged_by, Treatment(reference='NIST'))"
)

In [33]:
# Fit the main formula with observations grouped by query id, then show the summary.
model = sm.MixedLM.from_formula(
    formula=mixed_model,
    data=data,
    groups=data["qid"],
)
result = model.fit()
result.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,2625,Method:,REML
No. Groups:,25,Scale:,0.0360
Min. group size:,105,Log-Likelihood:,512.9598
Max. group size:,105,Converged:,Yes
Mean group size:,105.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.423,0.104,4.062,0.000,0.219,0.627
"C(judged_by, Treatment(reference='NIST'))[T.FlanT5Direct]",0.232,0.017,13.955,0.000,0.199,0.265
"C(judged_by, Treatment(reference='NIST'))[T.GPT4oSimple]",0.123,0.017,7.373,0.000,0.090,0.155
QT[T.T5],-0.072,0.080,-0.906,0.365,-0.229,0.084
QT[T.GPT4],0.061,0.105,0.587,0.557,-0.143,0.266
"C(ST, Treatment(reference='Other'))[T.GPT]",-0.062,0.021,-2.968,0.003,-0.102,-0.021
"C(ST, Treatment(reference='Other'))[T.T5]",0.013,0.021,0.641,0.522,-0.027,0.054
"C(ST, Treatment(reference='Other'))[T.T5+GPT]",0.164,0.019,8.657,0.000,0.127,0.201
"QT[T.T5]:C(ST, Treatment(reference='Other'))[T.GPT]",-0.066,0.028,-2.368,0.018,-0.121,-0.011


In [25]:
# Copy of `data` without the 'QD' column; `data` itself is left untouched.
data2 = data.drop(columns=['QD'])

In [None]:
# Reshape to one row per (run, query, covariates) with one score column per judge,
# then take the difference between two judges' scores.
# NOTE(review): the 'gpt4' / 'nist' judge columns and the 'pipeline' key are not
# produced by the visible cells under the current settings — confirm before running.
pivot_keys = ['run_id', 'qid', 'QW', 'APL', 'QT', 'ST', 'pipeline']
score_diff_data = data2.pivot(index=pivot_keys, columns=['judged_by'], values='score')
score_diff_data.columns.name = None  # remove the 'judged_by' label from the column axis
score_diff_data = score_diff_data.reset_index()
score_diff_data['score_diff'] = score_diff_data['gpt4'].sub(score_diff_data['nist'])
score_diff_data

In [35]:
# Formula for the score-difference model, one term per line. The concatenated text
# (including its original spacing) is unchanged.
score_diff_mixed_model = (
    "score_diff ~  QW + APL + pipeline"
    "  +  QT"
    " + C(ST, Treatment(reference='Other'))"
    " + QT * C(ST, Treatment(reference='Other')) "
)

In [None]:
# Fit the score-difference formula with observations grouped by query id.
model = sm.MixedLM.from_formula(
    formula=score_diff_mixed_model,
    data=score_diff_data,
    groups=score_diff_data["qid"],
)
result = model.fit()
result.summary()

In [11]:
# Rows whose 'Judge' is 'nist', split by which qid list the query belongs to.
# NOTE(review): `real_queries_judged`, `t5_queries_judged`, `gpt4_queries_judged`
# and a 'Judge' column are not created by any active code in the visible cells
# (the qid lists only appear in a commented-out cell) — confirm before running.
judged_by_nist = data['Judge'] == 'nist'
qid_is_real = data['qid'].isin(real_queries_judged)
qid_is_synthetic = data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)
real_queries_real_judgments = data[judged_by_nist & qid_is_real]
synthetic_queries_real_judgments = data[judged_by_nist & qid_is_synthetic]

In [12]:
# Rows whose 'Judge' is 'gpt4', split by which qid list the query belongs to.
# NOTE(review): `real_queries_judged`, `t5_queries_judged`, `gpt4_queries_judged`
# and a 'Judge' column are not created by any active code in the visible cells
# (the qid lists only appear in a commented-out cell) — confirm before running.
judged_by_gpt4 = data['Judge'] == 'gpt4'
in_real_qids = data['qid'].isin(real_queries_judged)
in_synthetic_qids = data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)
real_queries_synthetic_judgments = data[judged_by_gpt4 & in_real_qids]
synthetic_queries_synthetic_judgments = data[judged_by_gpt4 & in_synthetic_qids]

In [13]:
# Per-condition formulas; the two differ only in the QDR vs. QDS term.
mixed_model_condition_Qreal = (
    "score ~ QL + QDR + QW"
    " + C(LLM, Treatment(reference='Other'))"
    " + pipeline"
)
mixed_model_condition_Qsynthetic = (
    "score ~ QL + QDS + QW"
    " + C(LLM, Treatment(reference='Other'))"
    " + pipeline"
)

In [None]:
# Qreal formula on the real_queries_real_judgments subset, grouped by run_id.
model = sm.MixedLM.from_formula(
    formula=mixed_model_condition_Qreal,
    data=real_queries_real_judgments,
    groups=real_queries_real_judgments["run_id"],
)
result = model.fit()
result.summary()

In [None]:
# Qsynthetic formula on the synthetic_queries_real_judgments subset, grouped by run_id.
model = sm.MixedLM.from_formula(
    formula=mixed_model_condition_Qsynthetic,
    data=synthetic_queries_real_judgments,
    groups=synthetic_queries_real_judgments["run_id"],
)
result = model.fit()
result.summary()

In [None]:
# Qreal formula on the real_queries_synthetic_judgments subset, grouped by run_id.
model = sm.MixedLM.from_formula(
    formula=mixed_model_condition_Qreal,
    data=real_queries_synthetic_judgments,
    groups=real_queries_synthetic_judgments["run_id"],
)
result = model.fit()
result.summary()

In [None]:
# Qsynthetic formula on the synthetic_queries_synthetic_judgments subset, grouped by run_id.
model = sm.MixedLM.from_formula(
    formula=mixed_model_condition_Qsynthetic,
    data=synthetic_queries_synthetic_judgments,
    groups=synthetic_queries_synthetic_judgments["run_id"],
)
result = model.fit()
result.summary()

In [18]:
# Merging the dataframes on 'run_id' and 'qid'
real_queries_diff = pd.merge(real_queries_real_judgments, real_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
real_queries_diff['score_ab'] = real_queries_diff['score_a'] - real_queries_diff['score_b']
real_queries_diff['score_ba'] = real_queries_diff['score_b'] - real_queries_diff['score_a']

In [None]:
# Show the paired frame with its score_ab / score_ba columns as the cell output.
real_queries_diff

In [None]:
# Model the a-minus-b score difference on the real-query pairs, grouped by run_id.
model = sm.MixedLM.from_formula(
    formula="score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline",
    data=real_queries_diff,
    groups=real_queries_diff["run_id"],
)
result = model.fit()
result.summary()

In [None]:
# Model the b-minus-a score difference on the real-query pairs, grouped by run_id.
model = sm.MixedLM.from_formula(
    formula="score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline",
    data=real_queries_diff,
    groups=real_queries_diff["run_id"],
)
result = model.fit()
result.summary()

In [21]:
# Merging the dataframes on 'run_id' and 'qid'
synthetic_queries_diff = pd.merge(synthetic_queries_real_judgments, synthetic_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
synthetic_queries_diff['score_ab'] = synthetic_queries_diff['score_a'] - synthetic_queries_diff['score_b']
synthetic_queries_diff['score_ba'] = synthetic_queries_diff['score_b'] - synthetic_queries_diff['score_a']

In [None]:
# Show the paired frame with its score_ab / score_ba columns as the cell output.
synthetic_queries_diff

In [None]:
# Model the a-minus-b score difference on the synthetic-query pairs, grouped by run_id.
# NOTE(review): this formula uses QDR, whereas the synthetic-query condition formula
# defined earlier uses QDS — confirm which term is intended here.
model = sm.MixedLM.from_formula(
    formula="score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline",
    data=synthetic_queries_diff,
    groups=synthetic_queries_diff["run_id"],
)
result = model.fit()
result.summary()

In [None]:
# Model the b-minus-a score difference on the synthetic-query pairs, grouped by run_id.
# NOTE(review): this formula uses QDR, whereas the synthetic-query condition formula
# defined earlier uses QDS — confirm which term is intended here.
model = sm.MixedLM.from_formula(
    formula="score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline",
    data=synthetic_queries_diff,
    groups=synthetic_queries_diff["run_id"],
)
result = model.fit()
result.summary()

## Extra Experiments (refer to: "Extra Exp. 1")

In [6]:
def get_result(result_file, metric_name=None):
    """Load the per-query scores for one metric from a tab-separated result file.

    The file is read without a header as four columns: run_id, metric, qid, score.
    Rows whose qid is 'all' are discarded, metric names are stripped of trailing
    whitespace, and only rows for the requested metric are kept.

    Parameters
    ----------
    result_file : str or file-like
        Path (or open buffer) passed to ``pd.read_csv``.
    metric_name : str, optional
        Metric to keep. When omitted, the notebook-level ``metric`` setting is
        used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``run_id``, ``qid`` (int) and ``score`` (float).
    """
    if metric_name is None:
        # Fall back to the global configured in the settings cell.
        metric_name = metric
    raw_df = pd.read_csv(result_file, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    # Copy the filtered rows so the column assignment below does not write into a
    # slice of `raw_df` (avoids SettingWithCopyWarning).
    per_query = raw_df[raw_df['qid'] != 'all'].copy()
    per_query['metric'] = per_query['metric'].str.rstrip()
    return (
        per_query[per_query['metric'] == metric_name]
        .drop(columns=['metric'])
        .astype({'qid': int, 'score': float})
    )

In [7]:
# Per-query scores from the 'nist' and 'gpt4' result files, respectively.
real_judge_results = get_result("results/all.pass.nist.ndcgeval")
synthetic_judge_results = get_result("results/all.pass.gpt4.ndcgeval")

In [9]:
# Merging the dataframes on 'run_id' and 'qid'
results_diff = pd.merge(real_judge_results, synthetic_judge_results, on=['run_id', 'qid'], suffixes=('_real', '_synthetic'))
# Subtracting the 'Score' values
results_diff['score_RS'] = results_diff['score_real'] - results_diff['score_synthetic']
results_diff['score_SR'] = results_diff['score_synthetic'] - results_diff['score_real']

In [13]:
# Attach the query table (on qid) and the model table (on run_id) to the paired scores.
results_diff = (
    results_diff
    .merge(qid_to_info, on='qid')
    .merge(model_to_info, on='run_id')
)

In [None]:
# Show the merged score-difference frame as the cell output.
results_diff

Selected response variable: `score_SR` (the synthetic-judge score minus the real-judge score).

In [17]:
# QDR/QDS differences in both directions, mirroring score_RS / score_SR.
results_diff['qd_RS'] = results_diff['QDR'].sub(results_diff['QDS'])
results_diff['qd_SR'] = results_diff['QDS'].sub(results_diff['QDR'])

In [21]:
# Formula for the extra experiment, one group of terms per line. The concatenated
# text is unchanged.
mixed_model_1 = (
    "score_SR ~ Synthetic + QL + qd_SR + QW + DL + pipeline"
    " + C(LLM, Treatment(reference='Other'))"
    " + Synthetic * C(LLM, Treatment(reference='Other'))"
)

In [None]:
# Fit the extra-experiment formula on the score differences, grouped by run_id.
model = sm.MixedLM.from_formula(
    formula=mixed_model_1,
    data=results_diff,
    groups=results_diff["run_id"],
)
result = model.fit()
result.summary()