In [1]:
import pandas as pd
import glob
import statsmodels.api as sm

In [2]:
# nist_result_file = f'results/all.pass.ndcgeval'
# gpt4_result_file = f'results/all.pass.gpt4.ndcgeval'
metric = "map" # map # ndcg_cut_10
result_format = "treceval" # treceval # ndcgeval

In [3]:
results_df_list = []

for infile in glob.glob(f'./results/*.{result_format}'):
    judger = infile.split('/')[2].split('.')[2]
    result_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    result_df['judged_by'] = judger
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['score'] = result_df['score'].astype(float)
    results_df_list.append(result_df)
 
results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,judged_by
3,naverloo_bm25_splades_RR,2001010,0.3698,gpt4
30,naverloo_bm25_splades_RR,2001459,0.2113,gpt4
57,naverloo_bm25_splades_RR,2001575,0.3611,gpt4
84,naverloo_bm25_splades_RR,2002075,0.2001,gpt4
111,naverloo_bm25_splades_RR,2002168,0.1848,gpt4
...,...,...,...,...
263914,uogtr_b_grf_e_gb,3100825,0.0958,nist
264005,uogtr_b_grf_e_gb,3100833,0.3054,nist
264096,uogtr_b_grf_e_gb,3100909,0.1652,nist
264187,uogtr_b_grf_e_gb,3100918,0.0807,nist


In [4]:
queries_judged = set(results_dfs['qid'])
real_queries_judged = [x for x in queries_judged if x < 3000000]
t5_queries_judged = [x for x in queries_judged if x > 3000000 and x < 3100000]
gpt4_queries_judged = [x for x in queries_judged if x > 3100000]

In [5]:
qid_to_info = pd.read_csv("infos/query_to_info.txt", sep='\t')
doc_to_info = pd.read_csv("infos/doc_to_info.txt", sep='\t')
model_to_info = pd.read_csv("infos/model_to_info.txt", sep='\t')

In [6]:
data = pd.merge(results_dfs, qid_to_info, on='qid')
# data = pd.merge(data, doc_to_info, on='qid')
data = pd.merge(data, model_to_info, on='run_id')

In [7]:
data

Unnamed: 0,run_id,qid,score,judged_by,QL,QDR,QDS,QW,DL,isSynthetic,isGPT4,LLM,isLLM,pipeline
0,naverloo_bm25_splades_RR,2001010,0.3698,gpt4,0,0.7840,0.7023,6,72.5291,0,0,T5,1,8
1,naverloo_bm25_splades_RR,2001010,0.9314,nist,0,0.7840,0.7023,6,72.5291,0,0,T5,1,8
2,naverloo_bm25_splades_RR,2001459,0.2113,gpt4,0,2.6311,6.7917,4,57.2620,0,0,T5,1,8
3,naverloo_bm25_splades_RR,2001459,0.1754,nist,0,2.6311,6.7917,4,57.2620,0,0,T5,1,8
4,naverloo_bm25_splades_RR,2001575,0.3611,gpt4,0,0.1040,0.7523,4,440.2367,0,0,T5,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5735,naverloo_fs_RR_duo,3100909,0.2306,nist,1,1.1883,2.7444,10,995.8028,1,1,T5,1,10
5736,naverloo_fs_RR_duo,3100918,0.1955,gpt4,1,0.4276,2.0563,10,143.9186,1,1,T5,1,10
5737,naverloo_fs_RR_duo,3100918,0.1076,nist,1,0.4276,2.0563,10,143.9186,1,1,T5,1,10
5738,naverloo_fs_RR_duo,3100922,0.5086,gpt4,1,0.0383,0.5447,10,446.6774,1,1,T5,1,10


In [8]:
mixed_model = "score ~ judged_by + QL + QDR + QW + DL + isSynthetic + C(LLM, Treatment(reference='Other')) + pipeline"

In [9]:
model = sm.MixedLM.from_formula(mixed_model, data, groups=data["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0226
Min. group size:,164,Log-Likelihood:,2645.5814
Max. group size:,164,Converged:,Yes
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.210,0.015,13.909,0.000,0.180,0.239
judged_by[T.nist],-0.027,0.004,-6.694,0.000,-0.034,-0.019
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.035,0.018,-1.913,0.056,-0.070,0.001
"C(LLM, Treatment(reference='Other'))[T.T5]",-0.002,0.022,-0.077,0.939,-0.044,0.041
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.079,0.017,4.737,0.000,0.046,0.112
QL,0.103,0.009,11.080,0.000,0.085,0.121
QDR,-0.015,0.002,-9.031,0.000,-0.018,-0.012
QW,-0.011,0.001,-9.281,0.000,-0.014,-0.009
DL,0.000,0.000,19.101,0.000,0.000,0.000


In [10]:
mixed_model_int = "score ~ isGPT4 + Judge + QL + QDR + QW + C(LLM, Treatment(reference='Other')) + C(Judge, Treatment(reference='nist')) * C(LLM, Treatment(reference='Other')) + isGPT4 * C(LLM, Treatment(reference='Other')) + pipeline"

In [36]:
model = sm.MixedLM.from_formula(mixed_model_int, data, groups=data["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0426
Min. group size:,164,Log-Likelihood:,797.0661
Max. group size:,164,Converged:,No
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.362,298374.509,0.000,1.000,-584802.929,584803.653
Judge[T.nist],0.034,298374.509,0.000,1.000,-584803.257,584803.325
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.086,0.103,-0.835,0.404,-0.288,0.116
"C(LLM, Treatment(reference='Other'))[T.T5]",0.034,0.124,0.274,0.784,-0.208,0.276
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.164,0.095,1.720,0.085,-0.023,0.351
"C(Judge, Treatment(reference='nist'))[T.gpt4]",0.229,298374.509,0.000,1.000,-584803.062,584803.520
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.GPT]",0.018,0.016,1.133,0.257,-0.013,0.049
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.T5]",0.008,0.014,0.581,0.561,-0.020,0.037
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.003,0.014,0.234,0.815,-0.025,0.032


## Extra Experiments (refer to: "Extra Exp. 1")

In [13]:
def get_result(result_file):
    result_df = pd.read_csv(result_file, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    result_df['score'] = result_df['score'].astype(float)
    result_df['qid'] = result_df['qid'].astype(int)
    return result_df

In [14]:
real_judge_results = get_result(result_file=f"results/all.pass.nist.{result_format}")
synthetic_judge_results = get_result(result_file=f"results/all.pass.gpt4.{result_format}")

In [16]:
# Merging the dataframes on 'run_id' and 'qid'
results_diff = pd.merge(real_judge_results, synthetic_judge_results, on=['run_id', 'qid'], suffixes=('_real', '_synthetic'))
# Subtracting the 'Score' values
results_diff['score_RS'] = results_diff['score_real'] - results_diff['score_synthetic']
results_diff['score_SR'] = results_diff['score_synthetic'] - results_diff['score_real']

In [17]:
results_diff = pd.merge(results_diff, qid_to_info, on='qid')
results_diff = pd.merge(results_diff, model_to_info, on='run_id')

In [18]:
results_diff

Unnamed: 0,run_id,qid,score_real,score_synthetic,score_RS,score_SR,QL,QDR,QDS,QW,DL,isSynthetic,isGPT4,LLM,isLLM,pipeline
0,cip_run_7,2001010,0.7243,0.3724,0.3519,-0.3519,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
1,cip_run_7,2001459,0.0773,0.1532,-0.0759,0.0759,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
2,cip_run_7,2001575,0.2305,0.2898,-0.0593,0.0593,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2
3,cip_run_7,2002075,0.1900,0.1817,0.0083,-0.0083,0,1.8125,5.2791,7,1005.6332,0,0,GPT,1,2
4,cip_run_7,2002168,0.0963,0.1492,-0.0529,0.0529,0,1.3110,4.3667,7,56.8986,0,0,GPT,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,uogtr_b_grf_e_gb,3100825,0.0958,0.1451,-0.0493,0.0493,1,0.7835,4.0882,11,760.7758,1,1,T5,1,4
2866,uogtr_b_grf_e_gb,3100833,0.3054,0.3453,-0.0399,0.0399,1,0.1144,0.6350,13,702.7082,1,1,T5,1,4
2867,uogtr_b_grf_e_gb,3100909,0.1652,0.1485,0.0167,-0.0167,1,1.1883,2.7444,10,995.8028,1,1,T5,1,4
2868,uogtr_b_grf_e_gb,3100918,0.0807,0.1860,-0.1053,0.1053,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4


Selected: score_SR

In [19]:
results_diff['qd_RS'] = results_diff['QDR'] - results_diff['QDS']
results_diff['qd_SR'] = results_diff['QDS'] - results_diff['QDR']

In [22]:
mixed_model_1 = "score_SR ~ isSynthetic + QL + qd_SR + QW + DL + pipeline + C(LLM, Treatment(reference='Other')) + isSynthetic * C(LLM, Treatment(reference='Other'))"

In [23]:
model = sm.MixedLM.from_formula(mixed_model_1, results_diff, groups=results_diff["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score_SR
No. Observations:,2870,Method:,REML
No. Groups:,35,Scale:,0.0255
Min. group size:,82,Log-Likelihood:,1132.6446
Max. group size:,82,Converged:,Yes
Mean group size:,82.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.060,0.013,4.461,0.000,0.034,0.087
"C(LLM, Treatment(reference='Other'))[T.GPT]",0.042,0.012,3.414,0.001,0.018,0.066
"C(LLM, Treatment(reference='Other'))[T.T5]",0.008,0.014,0.587,0.557,-0.019,0.035
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",-0.040,0.011,-3.505,0.000,-0.062,-0.018
isSynthetic,-0.006,0.011,-0.516,0.606,-0.026,0.015
"isSynthetic:C(LLM, Treatment(reference='Other'))[T.GPT]",-0.027,0.018,-1.518,0.129,-0.062,0.008
"isSynthetic:C(LLM, Treatment(reference='Other'))[T.T5]",-0.025,0.016,-1.524,0.127,-0.057,0.007
"isSynthetic:C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.035,0.016,2.162,0.031,0.003,0.067
QL,0.006,0.014,0.425,0.671,-0.021,0.033
