In [1]:
import pandas as pd
import glob
import statsmodels.api as sm

In [2]:
# nist_result_file = f'results/all.pass.ndcgeval'
# gpt4_result_file = f'results/all.pass.gpt4.ndcgeval'
metric = "ndcg_cut_10"

In [5]:
results_df_list = []

for infile in glob.glob('./results/*.ndcgeval'):
    judger = infile.split('/')[2].split('.')[2]
    result_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    # run_df['runid'] = os.path.basename(infile)
    result_df = result_df[result_df['qid'] != 'all']
    result_df['score'] = result_df['score'].astype(float)
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    result_df['Judge'] = judger
    results_df_list.append(result_df)
 
results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,Judge
2,cip_run_7,2001010,0.8563,nist
12,cip_run_7,2001459,0.6551,nist
22,cip_run_7,2001575,0.3321,nist
32,cip_run_7,2002075,0.8087,nist
42,cip_run_7,2002168,0.6691,nist
...,...,...,...,...
28992,naverloo_bm25_splades_RR,3100825,0.7797,gpt4
29002,naverloo_bm25_splades_RR,3100833,0.8526,gpt4
29012,naverloo_bm25_splades_RR,3100909,0.9373,gpt4
29022,naverloo_bm25_splades_RR,3100918,0.9402,gpt4


In [6]:
len(result_df['run_id'].unique())

35

In [7]:
queries_judged = set(results_dfs['qid'])
real_queries_judged = [x for x in queries_judged if x < 3000000]
t5_queries_judged = [x for x in queries_judged if x > 3000000 and x < 3100000]
gpt4_queries_judged = [x for x in queries_judged if x > 3100000]

In [33]:
qid_to_info = pd.read_csv("infos/query_to_info.txt", sep='\t')
doc_to_info = pd.read_csv("infos/doc_to_info.txt", sep='\t')
model_to_info = pd.read_csv("infos/model_to_info.txt", sep='\t')

In [9]:
data = pd.merge(results_dfs, qid_to_info, on='qid')
# data = pd.merge(data, doc_to_info, on='qid')
data = pd.merge(data, model_to_info, on='run_id')

In [10]:
data

Unnamed: 0,run_id,qid,score,Judge,QL,QDR,QDS,QW,DL,Synthetic,isGPT4,LLM,isLLM,pipeline
0,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
1,cip_run_7,2001010,0.7281,gpt4,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
2,cip_run_7,2001459,0.6551,nist,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
3,cip_run_7,2001459,0.9303,gpt4,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
4,cip_run_7,2001575,0.3321,nist,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5735,uogtr_b_grf_e_gb,3100909,0.8671,gpt4,1,1.1883,2.7444,10,995.8028,1,1,T5,1,4
5736,uogtr_b_grf_e_gb,3100918,0.6336,nist,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4
5737,uogtr_b_grf_e_gb,3100918,0.9472,gpt4,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4
5738,uogtr_b_grf_e_gb,3100922,0.7062,nist,1,0.0383,0.5447,10,446.6774,1,1,T5,1,4


In [25]:
mixed_model = "score ~ isGPT4 + Judge + QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline"

In [26]:
model = sm.MixedLM.from_formula(mixed_model, data, groups=data["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0428
Min. group size:,164,Log-Likelihood:,815.0246
Max. group size:,164,Converged:,Yes
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.591,0.029,20.433,0.000,0.534,0.648
Judge[T.nist],-0.201,0.005,-36.817,0.000,-0.212,-0.190
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.067,0.038,-1.729,0.084,-0.142,0.009
"C(LLM, Treatment(reference='Other'))[T.T5]",0.038,0.046,0.827,0.408,-0.052,0.129
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.168,0.036,4.720,0.000,0.098,0.237
isGPT4,0.037,0.010,3.707,0.000,0.018,0.057
QL,-0.000,0.014,-0.024,0.980,-0.027,0.026
QDR,0.028,0.002,12.402,0.000,0.023,0.032
QW,-0.001,0.002,-0.347,0.729,-0.004,0.003


In [35]:
mixed_model_int = "score ~ isGPT4 + Judge + QL + QDR + QW + C(LLM, Treatment(reference='Other'))+ C(Judge, Treatment(reference='nist')) * C(LLM, Treatment(reference='Other')) + isGPT4*C(LLM, Treatment(reference='Other')) + pipeline"

In [36]:
model = sm.MixedLM.from_formula(mixed_model_int, data, groups=data["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0426
Min. group size:,164,Log-Likelihood:,797.0661
Max. group size:,164,Converged:,No
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.362,298374.509,0.000,1.000,-584802.929,584803.653
Judge[T.nist],0.034,298374.509,0.000,1.000,-584803.257,584803.325
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.086,0.103,-0.835,0.404,-0.288,0.116
"C(LLM, Treatment(reference='Other'))[T.T5]",0.034,0.124,0.274,0.784,-0.208,0.276
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.164,0.095,1.720,0.085,-0.023,0.351
"C(Judge, Treatment(reference='nist'))[T.gpt4]",0.229,298374.509,0.000,1.000,-584803.062,584803.520
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.GPT]",0.018,0.016,1.133,0.257,-0.013,0.049
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.T5]",0.008,0.014,0.581,0.561,-0.020,0.037
"C(Judge, Treatment(reference='nist'))[T.gpt4]:C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.003,0.014,0.234,0.815,-0.025,0.032


In [11]:
real_queries_real_judgments = data[((data['Judge'] == 'nist') & (data['qid'].isin(real_queries_judged)))]
synthetic_queries_real_judgments = data[((data['Judge'] == 'nist') & (data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)))]

In [12]:
real_queries_synthetic_judgments = data[((data['Judge'] == 'gpt4') & (data['qid'].isin(real_queries_judged)))]
synthetic_queries_synthetic_judgments = data[((data['Judge'] == 'gpt4') & (data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)))]

In [13]:
mixed_model_condition_Qreal = "score ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline"
mixed_model_condition_Qsynthetic = "score ~ QL + QDS + QW + C(LLM, Treatment(reference='Other')) + pipeline"

In [14]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qreal, real_queries_real_judgments, groups=real_queries_real_judgments["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,1785,Method:,REML
No. Groups:,35,Scale:,0.0468
Min. group size:,51,Log-Likelihood:,143.8759
Max. group size:,51,Converged:,Yes
Mean group size:,51.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.384,0.033,11.612,0.000,0.319,0.449
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.086,0.040,-2.178,0.029,-0.164,-0.009
"C(LLM, Treatment(reference='Other'))[T.T5]",0.024,0.048,0.498,0.618,-0.070,0.117
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.156,0.037,4.244,0.000,0.084,0.227
QL,0.040,0.033,1.240,0.215,-0.023,0.104
QDR,0.033,0.003,9.556,0.000,0.026,0.040
QW,0.003,0.003,1.064,0.287,-0.003,0.009
pipeline,0.019,0.008,2.338,0.019,0.003,0.034
Group Var,0.005,0.008,,,,


In [15]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qsynthetic, synthetic_queries_real_judgments, groups=synthetic_queries_real_judgments["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,1085,Method:,REML
No. Groups:,35,Scale:,0.0477
Min. group size:,31,Log-Likelihood:,65.3165
Max. group size:,31,Converged:,Yes
Mean group size:,31.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.503,0.038,13.171,0.000,0.428,0.578
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.058,0.036,-1.605,0.109,-0.128,0.013
"C(LLM, Treatment(reference='Other'))[T.T5]",0.052,0.043,1.202,0.229,-0.033,0.136
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.183,0.033,5.524,0.000,0.118,0.248
QL,0.135,0.029,4.674,0.000,0.079,0.192
QDS,-0.012,0.003,-4.346,0.000,-0.018,-0.007
QW,-0.017,0.004,-3.754,0.000,-0.026,-0.008
pipeline,0.019,0.007,2.694,0.007,0.005,0.034
Group Var,0.004,0.006,,,,


In [16]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qreal, real_queries_synthetic_judgments, groups=real_queries_synthetic_judgments["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,1785,Method:,REML
No. Groups:,35,Scale:,0.0385
Min. group size:,51,Log-Likelihood:,312.7592
Max. group size:,51,Converged:,Yes
Mean group size:,51.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.538,0.033,16.183,0.000,0.473,0.603
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.049,0.041,-1.180,0.238,-0.130,0.032
"C(LLM, Treatment(reference='Other'))[T.T5]",0.047,0.050,0.941,0.347,-0.051,0.144
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.170,0.038,4.454,0.000,0.095,0.245
QL,-0.030,0.030,-1.014,0.311,-0.088,0.028
QDR,0.021,0.003,6.526,0.000,0.014,0.027
QW,0.013,0.003,4.598,0.000,0.007,0.018
pipeline,0.019,0.008,2.273,0.023,0.003,0.035
Group Var,0.006,0.009,,,,


In [17]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qsynthetic, synthetic_queries_synthetic_judgments, groups=synthetic_queries_synthetic_judgments["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,1085,Method:,REML
No. Groups:,35,Scale:,0.0329
Min. group size:,31,Log-Likelihood:,259.4124
Max. group size:,31,Converged:,Yes
Mean group size:,31.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.533,0.035,15.165,0.000,0.464,0.602
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.072,0.037,-1.937,0.053,-0.144,0.001
"C(LLM, Treatment(reference='Other'))[T.T5]",0.034,0.044,0.773,0.440,-0.053,0.121
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.167,0.034,4.912,0.000,0.101,0.234
QL,0.104,0.024,4.319,0.000,0.057,0.151
QDS,0.028,0.002,11.967,0.000,0.024,0.033
QW,-0.006,0.004,-1.525,0.127,-0.013,0.002
pipeline,0.020,0.007,2.630,0.009,0.005,0.034
Group Var,0.004,0.008,,,,


In [18]:
# Merging the dataframes on 'run_id' and 'qid'
real_queries_diff = pd.merge(real_queries_real_judgments, real_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
real_queries_diff['score_ab'] = real_queries_diff['score_a'] - real_queries_diff['score_b']
real_queries_diff['score_ba'] = real_queries_diff['score_b'] - real_queries_diff['score_a']

In [19]:
real_queries_diff

Unnamed: 0,run_id,qid,score_a,Judge,QL,QDR,QDS,QW,DL,Synthetic,isGPT4,LLM,isLLM,pipeline,score_b,score_ab,score_ba
0,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2,0.7281,0.1282,-0.1282
1,cip_run_7,2001459,0.6551,nist,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2,0.9303,-0.2752,0.2752
2,cip_run_7,2001575,0.3321,nist,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2,0.9299,-0.5978,0.5978
3,cip_run_7,2002075,0.8087,nist,0,1.8125,5.2791,7,1005.6332,0,0,GPT,1,2,1.0000,-0.1913,0.1913
4,cip_run_7,2002168,0.6691,nist,0,1.3110,4.3667,7,56.8986,0,0,GPT,1,2,0.9537,-0.2846,0.2846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780,uogtr_b_grf_e_gb,2046312,0.6592,nist,0,1.7750,14.8571,6,990.1974,0,0,T5,1,4,0.7044,-0.0452,0.0452
1781,uogtr_b_grf_e_gb,2047836,0.4104,nist,0,0.0955,1.0235,8,1757.3351,0,0,T5,1,4,0.5026,-0.0922,0.0922
1782,uogtr_b_grf_e_gb,2047929,0.5153,nist,0,0.5714,5.2857,7,345.2637,0,0,T5,1,4,0.8125,-0.2972,0.2972
1783,uogtr_b_grf_e_gb,2051782,0.6581,nist,0,0.1696,0.1364,6,1288.9583,0,0,T5,1,4,0.7438,-0.0857,0.0857


In [19]:
model = sm.MixedLM.from_formula("score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", real_queries_diff, groups=real_queries_diff["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score_ab
No. Observations:,1785,Method:,REML
No. Groups:,35,Scale:,0.0435
Min. group size:,51,Log-Likelihood:,236.8533
Max. group size:,51,Converged:,Yes
Mean group size:,51.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-0.154,0.020,-7.595,0.000,-0.193,-0.114
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.038,0.015,-2.587,0.010,-0.066,-0.009
"C(LLM, Treatment(reference='Other'))[T.T5]",-0.023,0.017,-1.323,0.186,-0.057,0.011
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",-0.015,0.013,-1.107,0.268,-0.041,0.011
QL,0.070,0.031,2.238,0.025,0.009,0.132
QDR,0.013,0.003,3.768,0.000,0.006,0.019
QW,-0.009,0.003,-3.221,0.001,-0.015,-0.004
pipeline,-0.000,0.003,-0.094,0.925,-0.006,0.005
Group Var,0.000,,,,,


In [20]:
model = sm.MixedLM.from_formula("score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", real_queries_diff, groups=real_queries_diff["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score_ba
No. Observations:,1785,Method:,REML
No. Groups:,35,Scale:,0.0435
Min. group size:,51,Log-Likelihood:,236.8533
Max. group size:,51,Converged:,Yes
Mean group size:,51.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.154,0.020,7.595,0.000,0.114,0.193
"C(LLM, Treatment(reference='Other'))[T.GPT]",0.038,0.015,2.587,0.010,0.009,0.066
"C(LLM, Treatment(reference='Other'))[T.T5]",0.023,0.017,1.323,0.186,-0.011,0.057
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.015,0.013,1.107,0.268,-0.011,0.041
QL,-0.070,0.031,-2.238,0.025,-0.132,-0.009
QDR,-0.013,0.003,-3.768,0.000,-0.019,-0.006
QW,0.009,0.003,3.221,0.001,0.004,0.015
pipeline,0.000,0.003,0.094,0.925,-0.005,0.006
Group Var,0.000,,,,,


In [21]:
# Merging the dataframes on 'run_id' and 'qid'
synthetic_queries_diff = pd.merge(synthetic_queries_real_judgments, synthetic_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
synthetic_queries_diff['score_ab'] = synthetic_queries_diff['score_a'] - synthetic_queries_diff['score_b']
synthetic_queries_diff['score_ba'] = synthetic_queries_diff['score_b'] - synthetic_queries_diff['score_a']

In [22]:
synthetic_queries_diff

Unnamed: 0,run_id,qid,score_a,Judge,QL,QDR,QDS,QW,isGPT4,LLM,isLLM,pipeline,score_b,score_ab,score_ba
0,cip_run_7,3005001,0.1357,nist,0,0.5885,4.8852,7,0,GPT,1,2,0.6821,-0.5464,0.5464
1,cip_run_7,3010623,0.7387,nist,0,0.0952,0.1979,6,0,GPT,1,2,0.7280,0.0107,-0.0107
2,cip_run_7,3021830,0.1703,nist,0,0.1552,0.0361,8,0,GPT,1,2,0.3234,-0.1531,0.1531
3,cip_run_7,3027897,0.7329,nist,0,0.2544,7.8929,6,0,GPT,1,2,0.8049,-0.0720,0.0720
4,cip_run_7,3030812,0.1765,nist,0,0.1298,0.0386,5,0,GPT,1,2,0.5184,-0.3419,0.3419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080,uogtr_b_grf_e_gb,3100825,0.5348,nist,1,0.7835,4.0882,11,1,T5,1,4,0.6676,-0.1328,0.1328
1081,uogtr_b_grf_e_gb,3100833,0.4907,nist,1,0.1144,0.6350,13,1,T5,1,4,0.8526,-0.3619,0.3619
1082,uogtr_b_grf_e_gb,3100909,0.9511,nist,1,1.1883,2.7444,10,1,T5,1,4,0.8671,0.0840,-0.0840
1083,uogtr_b_grf_e_gb,3100918,0.6336,nist,1,0.4276,2.0563,10,1,T5,1,4,0.9472,-0.3136,0.3136


In [23]:
model = sm.MixedLM.from_formula("score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", synthetic_queries_diff, groups=synthetic_queries_diff["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score_ab
No. Observations:,1085,Method:,REML
No. Groups:,35,Scale:,0.0576
Min. group size:,31,Log-Likelihood:,-15.7672
Max. group size:,31,Converged:,No
Mean group size:,31.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-0.090,0.036,-2.512,0.012,-0.161,-0.020
"C(LLM, Treatment(reference='Other'))[T.GPT]",0.014,0.021,0.646,0.518,-0.028,0.056
"C(LLM, Treatment(reference='Other'))[T.T5]",0.018,0.026,0.685,0.493,-0.033,0.068
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.016,0.020,0.804,0.422,-0.023,0.055
QL,0.071,0.032,2.215,0.027,0.008,0.133
QDR,-0.013,0.018,-0.687,0.492,-0.049,0.023
QW,-0.015,0.005,-3.137,0.002,-0.025,-0.006
pipeline,-0.000,0.004,-0.013,0.990,-0.008,0.008
Group Var,0.000,,,,,


In [24]:
model = sm.MixedLM.from_formula("score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", synthetic_queries_diff, groups=synthetic_queries_diff["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score_ba
No. Observations:,1085,Method:,REML
No. Groups:,35,Scale:,0.0576
Min. group size:,31,Log-Likelihood:,-15.7672
Max. group size:,31,Converged:,No
Mean group size:,31.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.090,0.036,2.512,0.012,0.020,0.161
"C(LLM, Treatment(reference='Other'))[T.GPT]",-0.014,0.021,-0.646,0.518,-0.056,0.028
"C(LLM, Treatment(reference='Other'))[T.T5]",-0.018,0.026,-0.685,0.493,-0.068,0.033
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",-0.016,0.020,-0.804,0.422,-0.055,0.023
QL,-0.071,0.032,-2.215,0.027,-0.133,-0.008
QDR,0.013,0.018,0.687,0.492,-0.023,0.049
QW,0.015,0.005,3.137,0.002,0.006,0.025
pipeline,0.000,0.004,0.013,0.990,-0.008,0.008
Group Var,0.000,,,,,


## Extra Experiments (refer to: "Extra Exp. 1")

In [25]:
def get_result(result_file):
    result_df = pd.read_csv(result_file, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['score'] = result_df['score'].astype(float)
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    return result_df

In [27]:
real_judge_results = get_result(result_file="results/all.pass.nist.ndcgeval")
synthetic_judge_results = get_result(result_file="results/all.pass.gpt4.ndcgeval")

In [42]:
# Merging the dataframes on 'run_id' and 'qid'
results_diff = pd.merge(real_judge_results, synthetic_judge_results, on=['run_id', 'qid'], suffixes=('_real', '_synthetic'))
# Subtracting the 'Score' values
results_diff['score_RS'] = results_diff['score_real'] - results_diff['score_synthetic']
results_diff['score_SR'] = results_diff['score_synthetic'] - results_diff['score_real']

In [43]:
results_diff = pd.merge(results_diff, qid_to_info, on='qid')
results_diff = pd.merge(results_diff, model_to_info, on='run_id')

In [44]:
results_diff

Unnamed: 0,run_id,qid,score_real,score_synthetic,score_RS,score_SR,QL,QDR,QDS,QW,DL,Synthetic,isGPT4,LLM,isLLM,pipeline
0,cip_run_7,2001010,0.8563,0.7281,0.1282,-0.1282,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
1,cip_run_7,2001459,0.6551,0.9303,-0.2752,0.2752,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
2,cip_run_7,2001575,0.3321,0.9299,-0.5978,0.5978,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2
3,cip_run_7,2002075,0.8087,1.0000,-0.1913,0.1913,0,1.8125,5.2791,7,1005.6332,0,0,GPT,1,2
4,cip_run_7,2002168,0.6691,0.9537,-0.2846,0.2846,0,1.3110,4.3667,7,56.8986,0,0,GPT,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,uogtr_b_grf_e_gb,3100825,0.5348,0.6676,-0.1328,0.1328,1,0.7835,4.0882,11,760.7758,1,1,T5,1,4
2866,uogtr_b_grf_e_gb,3100833,0.4907,0.8526,-0.3619,0.3619,1,0.1144,0.6350,13,702.7082,1,1,T5,1,4
2867,uogtr_b_grf_e_gb,3100909,0.9511,0.8671,0.0840,-0.0840,1,1.1883,2.7444,10,995.8028,1,1,T5,1,4
2868,uogtr_b_grf_e_gb,3100918,0.6336,0.9472,-0.3136,0.3136,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4


In [45]:
mixed_model_1 = "score_SR ~ isGPT4 + QL + QDR*C(Synthetic, Treatment(reference=0)) + QW + C(LLM, Treatment(reference='Other')) + pipeline"

In [46]:
model = sm.MixedLM.from_formula(mixed_model_1, results_diff, groups=results_diff["run_id"])
result = model.fit()
result.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score_SR
No. Observations:,2870,Method:,REML
No. Groups:,35,Scale:,0.0489
Min. group size:,82,Log-Likelihood:,220.2626
Max. group size:,82,Converged:,Yes
Mean group size:,82.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.152,0.018,8.264,0.000,0.116,0.188
"C(Synthetic, Treatment(reference=0))[T.1]",-0.048,0.014,-3.485,0.000,-0.075,-0.021
"C(LLM, Treatment(reference='Other'))[T.GPT]",0.018,0.012,1.493,0.135,-0.006,0.042
"C(LLM, Treatment(reference='Other'))[T.T5]",0.008,0.015,0.527,0.598,-0.021,0.036
"C(LLM, Treatment(reference='Other'))[T.T5+GPT]",0.003,0.011,0.287,0.774,-0.019,0.025
isGPT4,0.000,0.018,0.017,0.987,-0.036,0.036
QL,-0.054,0.021,-2.608,0.009,-0.094,-0.013
QDR,-0.012,0.004,-3.343,0.001,-0.019,-0.005
"QDR:C(Synthetic, Treatment(reference=0))[T.1]",0.024,0.017,1.397,0.162,-0.010,0.058
