In [1]:
import pandas as pd
import glob
import statsmodels.api as sm

In [2]:
# nist_result_file = f'results/all.pass.ndcgeval'
# gpt4_result_file = f'results/all.pass.gpt4.ndcgeval'
metric = "ndcg_cut_10" # map # ndcg_cut_10
result_format = "ndcgeval" # treceval # ndcgeval

In [3]:
results_df_list = []

for infile in glob.glob(f'./results/*.{result_format}'):
    judger = infile.split('/')[2].split('.')[2]
    result_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    result_df['judged_by'] = judger
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['score'] = result_df['score'].astype(float)
    results_df_list.append(result_df)
 
results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,judged_by
2,cip_run_7,2001010,0.8563,nist
12,cip_run_7,2001459,0.6551,nist
22,cip_run_7,2001575,0.3321,nist
32,cip_run_7,2002075,0.8087,nist
42,cip_run_7,2002168,0.6691,nist
...,...,...,...,...
28992,naverloo_bm25_splades_RR,3100825,0.7797,gpt4
29002,naverloo_bm25_splades_RR,3100833,0.8526,gpt4
29012,naverloo_bm25_splades_RR,3100909,0.9373,gpt4
29022,naverloo_bm25_splades_RR,3100918,0.9402,gpt4


In [4]:
queries_judged = set(results_dfs['qid'])
real_queries_judged = [x for x in queries_judged if x < 3000000]
t5_queries_judged = [x for x in queries_judged if x > 3000000 and x < 3100000]
gpt4_queries_judged = [x for x in queries_judged if x > 3100000]

In [5]:
qid_to_info = pd.read_csv("infos/query_to_info.txt", sep='\t')
doc_to_info = pd.read_csv("infos/doc_to_info.txt", sep='\t')
model_to_info = pd.read_csv("infos/model_to_info.txt", sep='\t')

In [6]:
data = pd.merge(results_dfs, qid_to_info, on='qid')
# data = pd.merge(data, doc_to_info, on='qid')
data = pd.merge(data, model_to_info, on='run_id')

In [7]:
data.drop(['QL'], axis=1, inplace=True)
data.drop(['isGPT4'], axis=1, inplace=True)

In [8]:
def QD_value(row):
    if row['judged_by'] == 'nist':
        return row['QDR']
    elif row['judged_by'] == 'gpt4':
        return row['QDS']

In [9]:
data['QD'] = data.apply(QD_value, axis=1)

In [10]:
data.drop(['QDR'], axis=1, inplace=True)
data.drop(['QDS'], axis=1, inplace=True)
data.drop(['isLLM'], axis=1, inplace=True)

In [11]:
data['QT'] = data['QT'].astype('category') 
data['QT'] = data['QT'].replace({0: 'Human', 1: 'T5', 2: 'GPT4'}) 

  data['QT'] = data['QT'].replace({0: 'Human', 1: 'T5', 2: 'GPT4'})


In [12]:
data

Unnamed: 0,run_id,qid,score,judged_by,QW,APL,QT,ST,pipeline,QD
0,cip_run_7,2001010,0.8563,nist,6,72.5291,Human,GPT,2,0.7840
1,cip_run_7,2001459,0.6551,nist,4,57.2620,Human,GPT,2,2.6311
2,cip_run_7,2001575,0.3321,nist,4,440.2367,Human,GPT,2,0.1040
3,cip_run_7,2002075,0.8087,nist,7,1005.6332,Human,GPT,2,1.8125
4,cip_run_7,2002168,0.6691,nist,7,56.8986,Human,GPT,2,1.3110
...,...,...,...,...,...,...,...,...,...,...
5735,naverloo_bm25_splades_RR,3100825,0.7797,gpt4,11,760.7758,GPT4,T5,8,4.0882
5736,naverloo_bm25_splades_RR,3100833,0.8526,gpt4,13,702.7082,GPT4,T5,8,0.6350
5737,naverloo_bm25_splades_RR,3100909,0.9373,gpt4,10,995.8028,GPT4,T5,8,2.7444
5738,naverloo_bm25_splades_RR,3100918,0.9402,gpt4,10,143.9186,GPT4,T5,8,2.0563


In [33]:
mixed_model = "score ~ QD + QW + APL + pipeline + C(judged_by, Treatment(reference='nist')) +  QT + C(ST, Treatment(reference='Other')) + QT * C(ST, Treatment(reference='Other')) + C(ST, Treatment(reference='Other')) * C(judged_by, Treatment(reference='nist'))  "

In [34]:
model = sm.MixedLM.from_formula(mixed_model, data, groups=data["qid"])
result = model.fit()
result.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,82,Scale:,0.0309
Min. group size:,70,Log-Likelihood:,1612.7002
Max. group size:,70,Converged:,Yes
Mean group size:,70.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.458,0.047,9.650,0.000,0.365,0.551
"C(judged_by, Treatment(reference='nist'))[T.gpt4]",0.160,0.008,19.746,0.000,0.144,0.175
QT[T.T5],-0.090,0.041,-2.198,0.028,-0.171,-0.010
QT[T.GPT4],0.020,0.049,0.412,0.680,-0.076,0.117
"C(ST, Treatment(reference='Other'))[T.GPT]",-0.077,0.011,-6.973,0.000,-0.098,-0.055
"C(ST, Treatment(reference='Other'))[T.T5]",0.030,0.011,2.648,0.008,0.008,0.052
"C(ST, Treatment(reference='Other'))[T.T5+GPT]",0.161,0.010,16.026,0.000,0.141,0.181
"QT[T.T5]:C(ST, Treatment(reference='Other'))[T.GPT]",-0.048,0.019,-2.491,0.013,-0.085,-0.010
"QT[T.GPT4]:C(ST, Treatment(reference='Other'))[T.GPT]",0.039,0.017,2.311,0.021,0.006,0.072


In [25]:
data2 = data.drop(['QD'], axis=1)

In [26]:
score_diff_data = data2.pivot(index=['run_id', 'qid', 'QW', 'APL', 'QT', 'ST', 'pipeline'], columns=['judged_by'], values='score')
score_diff_data.columns.name = None
score_diff_data = score_diff_data.reset_index()
score_diff_data['score_diff'] = score_diff_data['gpt4'] - score_diff_data['nist']
score_diff_data

Unnamed: 0,run_id,qid,QW,APL,QT,ST,pipeline,gpt4,nist,score_diff
0,WatS-Augmented-BM25,2001010,6,72.5291,Human,GPT,1,0.7168,0.5107,0.2061
1,WatS-Augmented-BM25,2001459,4,57.2620,Human,GPT,1,0.7300,0.1947,0.5353
2,WatS-Augmented-BM25,2001575,4,440.2367,Human,GPT,1,0.5469,0.2201,0.3268
3,WatS-Augmented-BM25,2002075,7,1005.6332,Human,GPT,1,0.8979,0.5440,0.3539
4,WatS-Augmented-BM25,2002168,7,56.8986,Human,GPT,1,0.7172,0.2147,0.5025
...,...,...,...,...,...,...,...,...,...,...
2865,uot-yahoo_rankgpt4,3100825,11,760.7758,GPT4,GPT,1,0.4442,0.3300,0.1142
2866,uot-yahoo_rankgpt4,3100833,13,702.7082,GPT4,GPT,1,0.5544,0.0000,0.5544
2867,uot-yahoo_rankgpt4,3100909,10,995.8028,GPT4,GPT,1,0.7573,0.6239,0.1334
2868,uot-yahoo_rankgpt4,3100918,10,143.9186,GPT4,GPT,1,0.9524,0.7283,0.2241


In [35]:
score_diff_mixed_model = "score_diff ~  QW + APL + pipeline  +  QT + C(ST, Treatment(reference='Other')) + QT * C(ST, Treatment(reference='Other')) "

In [36]:
model = sm.MixedLM.from_formula(score_diff_mixed_model, score_diff_data, groups=score_diff_data["qid"])
result = model.fit()
result.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,score_diff
No. Observations:,2870,Method:,REML
No. Groups:,82,Scale:,0.0162
Min. group size:,35,Log-Likelihood:,1620.0439
Max. group size:,35,Converged:,Yes
Mean group size:,35.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.194,0.068,2.868,0.004,0.061,0.327
QT[T.T5],-0.011,0.058,-0.184,0.854,-0.125,0.104
QT[T.GPT4],-0.028,0.070,-0.398,0.690,-0.166,0.110
"C(ST, Treatment(reference='Other'))[T.GPT]",0.037,0.009,4.249,0.000,0.020,0.055
"C(ST, Treatment(reference='Other'))[T.T5]",0.023,0.010,2.405,0.016,0.004,0.042
"C(ST, Treatment(reference='Other'))[T.T5+GPT]",0.015,0.008,1.839,0.066,-0.001,0.031
"QT[T.T5]:C(ST, Treatment(reference='Other'))[T.GPT]",-0.055,0.020,-2.845,0.004,-0.094,-0.017
"QT[T.GPT4]:C(ST, Treatment(reference='Other'))[T.GPT]",-0.048,0.017,-2.803,0.005,-0.082,-0.015
"QT[T.T5]:C(ST, Treatment(reference='Other'))[T.T5]",-0.050,0.018,-2.835,0.005,-0.085,-0.016


In [11]:
real_queries_real_judgments = data[((data['Judge'] == 'nist') & (data['qid'].isin(real_queries_judged)))]
synthetic_queries_real_judgments = data[((data['Judge'] == 'nist') & (data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)))]

In [12]:
real_queries_synthetic_judgments = data[((data['Judge'] == 'gpt4') & (data['qid'].isin(real_queries_judged)))]
synthetic_queries_synthetic_judgments = data[((data['Judge'] == 'gpt4') & (data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)))]

In [13]:
mixed_model_condition_Qreal = "score ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline"
mixed_model_condition_Qsynthetic = "score ~ QL + QDS + QW + C(LLM, Treatment(reference='Other')) + pipeline"

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qreal, real_queries_real_judgments, groups=real_queries_real_judgments["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qsynthetic, synthetic_queries_real_judgments, groups=synthetic_queries_real_judgments["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qreal, real_queries_synthetic_judgments, groups=real_queries_synthetic_judgments["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qsynthetic, synthetic_queries_synthetic_judgments, groups=synthetic_queries_synthetic_judgments["run_id"])
result = model.fit()
result.summary()

In [18]:
# Merging the dataframes on 'run_id' and 'qid'
real_queries_diff = pd.merge(real_queries_real_judgments, real_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
real_queries_diff['score_ab'] = real_queries_diff['score_a'] - real_queries_diff['score_b']
real_queries_diff['score_ba'] = real_queries_diff['score_b'] - real_queries_diff['score_a']

In [None]:
real_queries_diff

In [None]:
model = sm.MixedLM.from_formula("score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", real_queries_diff, groups=real_queries_diff["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula("score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", real_queries_diff, groups=real_queries_diff["run_id"])
result = model.fit()
result.summary()

In [21]:
# Merging the dataframes on 'run_id' and 'qid'
synthetic_queries_diff = pd.merge(synthetic_queries_real_judgments, synthetic_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
synthetic_queries_diff['score_ab'] = synthetic_queries_diff['score_a'] - synthetic_queries_diff['score_b']
synthetic_queries_diff['score_ba'] = synthetic_queries_diff['score_b'] - synthetic_queries_diff['score_a']

In [None]:
synthetic_queries_diff

In [None]:
model = sm.MixedLM.from_formula("score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", synthetic_queries_diff, groups=synthetic_queries_diff["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula("score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", synthetic_queries_diff, groups=synthetic_queries_diff["run_id"])
result = model.fit()
result.summary()

## Extra Experiments (refer to: "Extra Exp. 1")

In [6]:
def get_result(result_file):
    result_df = pd.read_csv(result_file, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['score'] = result_df['score'].astype(float)
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    return result_df

In [7]:
real_judge_results = get_result(result_file="results/all.pass.nist.ndcgeval")
synthetic_judge_results = get_result(result_file="results/all.pass.gpt4.ndcgeval")

In [9]:
# Merging the dataframes on 'run_id' and 'qid'
results_diff = pd.merge(real_judge_results, synthetic_judge_results, on=['run_id', 'qid'], suffixes=('_real', '_synthetic'))
# Subtracting the 'Score' values
results_diff['score_RS'] = results_diff['score_real'] - results_diff['score_synthetic']
results_diff['score_SR'] = results_diff['score_synthetic'] - results_diff['score_real']

In [13]:
results_diff = pd.merge(results_diff, qid_to_info, on='qid')
results_diff = pd.merge(results_diff, model_to_info, on='run_id')

In [None]:
results_diff

Selected: score_SR

In [17]:
results_diff['qd_RS'] = results_diff['QDR'] - results_diff['QDS']
results_diff['qd_SR'] = results_diff['QDS'] - results_diff['QDR']

In [21]:
mixed_model_1 = "score_SR ~ Synthetic + QL + qd_SR + QW + DL + pipeline + C(LLM, Treatment(reference='Other')) + Synthetic * C(LLM, Treatment(reference='Other'))"

In [None]:
model = sm.MixedLM.from_formula(mixed_model_1, results_diff, groups=results_diff["run_id"])
result = model.fit()
result.summary()