In [10]:
import pandas as pd
import glob
import statsmodels.api as sm

In [11]:
# nist_result_file = f'results/all.pass.ndcgeval'
# gpt4_result_file = f'results/all.pass.gpt4.ndcgeval'
metric = "ndcg_cut_10" # map # ndcg_cut_10
result_format = "ndcgeval" # treceval # ndcgeval

In [12]:
results_df_list = []

for infile in glob.glob(f'./results/*.{result_format}'):
    judger = infile.split('/')[2].split('.')[2]
    result_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    result_df['judged_by'] = judger
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['score'] = result_df['score'].astype(float)
    results_df_list.append(result_df)
 
results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,judged_by
2,cip_run_7,2001010,0.8563,nist
12,cip_run_7,2001459,0.6551,nist
22,cip_run_7,2001575,0.3321,nist
32,cip_run_7,2002075,0.8087,nist
42,cip_run_7,2002168,0.6691,nist
...,...,...,...,...
28992,naverloo_bm25_splades_RR,3100825,0.7797,gpt4
29002,naverloo_bm25_splades_RR,3100833,0.8526,gpt4
29012,naverloo_bm25_splades_RR,3100909,0.9373,gpt4
29022,naverloo_bm25_splades_RR,3100918,0.9402,gpt4


In [13]:
queries_judged = set(results_dfs['qid'])
real_queries_judged = [x for x in queries_judged if x < 3000000]
t5_queries_judged = [x for x in queries_judged if x > 3000000 and x < 3100000]
gpt4_queries_judged = [x for x in queries_judged if x > 3100000]

In [14]:
qid_to_info = pd.read_csv("infos/query_to_info.txt", sep='\t')
doc_to_info = pd.read_csv("infos/doc_to_info.txt", sep='\t')
model_to_info = pd.read_csv("infos/model_to_info.txt", sep='\t')

In [15]:
data = pd.merge(results_dfs, qid_to_info, on='qid')
# data = pd.merge(data, doc_to_info, on='qid')
data = pd.merge(data, model_to_info, on='run_id')

In [16]:
data

Unnamed: 0,run_id,qid,score,judged_by,QL,QDR,QDS,QW,DL,isSynthetic,isGPT4,LLM,isLLM,pipeline
0,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
1,cip_run_7,2001010,0.7281,gpt4,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
2,cip_run_7,2001459,0.6551,nist,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
3,cip_run_7,2001459,0.9303,gpt4,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
4,cip_run_7,2001575,0.3321,nist,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5735,uogtr_b_grf_e_gb,3100909,0.8671,gpt4,1,1.1883,2.7444,10,995.8028,1,1,T5,1,4
5736,uogtr_b_grf_e_gb,3100918,0.6336,nist,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4
5737,uogtr_b_grf_e_gb,3100918,0.9472,gpt4,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4
5738,uogtr_b_grf_e_gb,3100922,0.7062,nist,1,0.0383,0.5447,10,446.6774,1,1,T5,1,4


In [17]:
mixed_model = "score ~ judged_by + QL + QDR + QW + DL + isSynthetic + C(LLM, Treatment(reference='Other')) + pipeline"

In [9]:
model = sm.MixedLM.from_formula(mixed_model, data, groups=data["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0420
Min. group size:,164,Log-Likelihood:,854.0988
Max. group size:,164,Converged:,Yes
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.622,0.029,21.375,0.000,0.565,0.679
judged_by[T.nist],-0.201,0.005,-37.144,0.000,-0.212,-0.190
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.067,0.038,-1.729,0.084,-0.142,0.009
"C(LLM, Treatment(reference='Other'))[T.T5]",0.038,0.046,0.827,0.408,-0.052,0.129
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.168,0.036,4.720,0.000,0.098,0.237
QL,0.044,0.013,3.444,0.001,0.019,0.068
QDR,0.025,0.002,11.245,0.000,0.021,0.030
QW,0.001,0.002,0.793,0.428,-0.002,0.005
DL,-0.000,0.000,-8.177,0.000,-0.000,-0.000


In [35]:
mixed_model_int = "score ~ isGPT4 + Judge + QL + QDR + QW + C(LLM, Treatment(reference='Other')) + C(Judge, Treatment(reference='nist')) * C(LLM, Treatment(reference='Other')) + isGPT4 * C(LLM, Treatment(reference='Other')) + pipeline"

In [36]:
model = sm.MixedLM.from_formula(mixed_model_int, data, groups=data["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0426
Min. group size:,164,Log-Likelihood:,797.0661
Max. group size:,164,Converged:,No
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.362,298374.509,0.000,1.000,-584802.929,584803.653
Judge[T.nist],0.034,298374.509,0.000,1.000,-584803.257,584803.325
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.086,0.103,-0.835,0.404,-0.288,0.116
"C(LLM, Treatment(reference='Other'))[T.T5]",0.034,0.124,0.274,0.784,-0.208,0.276
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.164,0.095,1.720,0.085,-0.023,0.351
"C(Judge, Treatment(reference='nist'))[T.gpt4]",0.229,298374.509,0.000,1.000,-584803.062,584803.520
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.GPT]",0.018,0.016,1.133,0.257,-0.013,0.049
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.T5]",0.008,0.014,0.581,0.561,-0.020,0.037
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.003,0.014,0.234,0.815,-0.025,0.032


## Extra Experiments (refer to: "Extra Exp. 1")

In [6]:
def get_result(result_file):
    result_df = pd.read_csv(result_file, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['score'] = result_df['score'].astype(float)
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    return result_df

In [7]:
real_judge_results = get_result(result_file="results/all.pass.nist.ndcgeval")
synthetic_judge_results = get_result(result_file="results/all.pass.gpt4.ndcgeval")

In [9]:
# Merging the dataframes on 'run_id' and 'qid'
results_diff = pd.merge(real_judge_results, synthetic_judge_results, on=['run_id', 'qid'], suffixes=('_real', '_synthetic'))
# Subtracting the 'Score' values
results_diff['score_RS'] = results_diff['score_real'] - results_diff['score_synthetic']
results_diff['score_SR'] = results_diff['score_synthetic'] - results_diff['score_real']

In [13]:
results_diff = pd.merge(results_diff, qid_to_info, on='qid')
results_diff = pd.merge(results_diff, model_to_info, on='run_id')

In [18]:
results_diff

Unnamed: 0,run_id,qid,score_real,score_synthetic,score_RS,score_SR,QL,QDR,QDS,QW,DL,Synthetic,isGPT4,LLM,isLLM,pipeline,qd_SR,qd_RS
0,cip_run_7,2001010,0.8563,0.7281,0.1282,-0.1282,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2,-0.0817,0.0817
1,cip_run_7,2001459,0.6551,0.9303,-0.2752,0.2752,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2,4.1606,-4.1606
2,cip_run_7,2001575,0.3321,0.9299,-0.5978,0.5978,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2,0.6483,-0.6483
3,cip_run_7,2002075,0.8087,1.0000,-0.1913,0.1913,0,1.8125,5.2791,7,1005.6332,0,0,GPT,1,2,3.4666,-3.4666
4,cip_run_7,2002168,0.6691,0.9537,-0.2846,0.2846,0,1.3110,4.3667,7,56.8986,0,0,GPT,1,2,3.0557,-3.0557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,uogtr_b_grf_e_gb,3100825,0.5348,0.6676,-0.1328,0.1328,1,0.7835,4.0882,11,760.7758,1,1,T5,1,4,3.3047,-3.3047
2866,uogtr_b_grf_e_gb,3100833,0.4907,0.8526,-0.3619,0.3619,1,0.1144,0.6350,13,702.7082,1,1,T5,1,4,0.5206,-0.5206
2867,uogtr_b_grf_e_gb,3100909,0.9511,0.8671,0.0840,-0.0840,1,1.1883,2.7444,10,995.8028,1,1,T5,1,4,1.5561,-1.5561
2868,uogtr_b_grf_e_gb,3100918,0.6336,0.9472,-0.3136,0.3136,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4,1.6287,-1.6287


Selected: score_SR

In [17]:
results_diff['qd_RS'] = results_diff['QDR'] - results_diff['QDS']
results_diff['qd_SR'] = results_diff['QDS'] - results_diff['QDR']

In [21]:
mixed_model_1 = "score_SR ~ Synthetic + QL + qd_SR + QW + DL + pipeline + C(LLM, Treatment(reference='Other')) + Synthetic * C(LLM, Treatment(reference='Other'))"

In [22]:
model = sm.MixedLM.from_formula(mixed_model_1, results_diff, groups=results_diff["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score_SR
No. Observations:,2870,Method:,REML
No. Groups:,35,Scale:,0.0450
Min. group size:,82,Log-Likelihood:,325.0453
Max. group size:,82,Converged:,Yes
Mean group size:,82.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.158,0.017,9.191,0.000,0.124,0.192
"C(LLM, Treatment(reference='Other'))[T.GPT]",0.037,0.015,2.547,0.011,0.009,0.066
"C(LLM, Treatment(reference='Other'))[T.T5]",0.023,0.016,1.442,0.149,-0.008,0.055
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.015,0.014,1.102,0.270,-0.012,0.041
Synthetic,-0.010,0.014,-0.696,0.487,-0.038,0.018
"Synthetic:C(LLM, Treatment(reference='Other'))[T.GPT]",-0.051,0.024,-2.150,0.032,-0.098,-0.005
"Synthetic:C(LLM, Treatment(reference='Other'))[T.T5]",-0.041,0.022,-1.911,0.056,-0.084,0.001
"Synthetic:C(LLM, Treatment(reference='Other'))[T.T5+GPT]",-0.031,0.022,-1.424,0.154,-0.073,0.012
QL,-0.011,0.019,-0.592,0.554,-0.047,0.025
