In [1]:
import pandas as pd
import glob
import statsmodels.api as sm

In [2]:
# nist_result_file = f'results/all.pass.ndcgeval'
# gpt4_result_file = f'results/all.pass.gpt4.ndcgeval'
metric = "ndcg_cut_10" # map # ndcg_cut_10
result_format = "ndcgeval" # treceval # ndcgeval

In [3]:
results_df_list = []

for infile in glob.glob(f'./results/*.{result_format}'):
    judger = infile.split('/')[2].split('.')[2]
    result_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    result_df['judged_by'] = judger
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['score'] = result_df['score'].astype(float)
    results_df_list.append(result_df)
 
results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,judged_by
2,cip_run_7,2001010,0.8563,nist
12,cip_run_7,2001459,0.6551,nist
22,cip_run_7,2001575,0.3321,nist
32,cip_run_7,2002075,0.8087,nist
42,cip_run_7,2002168,0.6691,nist
...,...,...,...,...
28992,naverloo_bm25_splades_RR,3100825,0.7797,gpt4
29002,naverloo_bm25_splades_RR,3100833,0.8526,gpt4
29012,naverloo_bm25_splades_RR,3100909,0.9373,gpt4
29022,naverloo_bm25_splades_RR,3100918,0.9402,gpt4


In [4]:
queries_judged = set(results_dfs['qid'])
real_queries_judged = [x for x in queries_judged if x < 3000000]
t5_queries_judged = [x for x in queries_judged if x > 3000000 and x < 3100000]
gpt4_queries_judged = [x for x in queries_judged if x > 3100000]

In [5]:
qid_to_info = pd.read_csv("infos/query_to_info.txt", sep='\t')
doc_to_info = pd.read_csv("infos/doc_to_info.txt", sep='\t')
model_to_info = pd.read_csv("infos/model_to_info.txt", sep='\t')

In [6]:
data = pd.merge(results_dfs, qid_to_info, on='qid')
# data = pd.merge(data, doc_to_info, on='qid')
data = pd.merge(data, model_to_info, on='run_id')

In [7]:
### Adding human labels as a true relevance
qrel_human_df = pd.read_csv('./qrels/2023.qrels.pass.withDupes.txt', delimiter=' ', header=None, names=['qid', 'Q0', 'docid', 'label_human'])
qrel_llm_df = pd.read_csv('./qrels/2023.qrels.pass.gpt4.txt', delimiter=' ', header=None, names=['qid', 'Q0', 'docid', 'label_llm'])

In [8]:
qrel_human_df.drop(['docid'], axis=1, inplace=True)
qrel_human_df.drop(['Q0'], axis=1, inplace=True)

qrel_llm_df.drop(['docid'], axis=1, inplace=True)
qrel_llm_df.drop(['Q0'], axis=1, inplace=True)

In [9]:
data

Unnamed: 0,run_id,qid,score,judged_by,QL,QDR,QDS,QW,DL,isSynthetic,isGPT4,LLM,isLLM,pipeline
0,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
1,cip_run_7,2001010,0.7281,gpt4,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
2,cip_run_7,2001459,0.6551,nist,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
3,cip_run_7,2001459,0.9303,gpt4,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
4,cip_run_7,2001575,0.3321,nist,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5735,uogtr_b_grf_e_gb,3100909,0.8671,gpt4,1,1.1883,2.7444,10,995.8028,1,1,T5,1,4
5736,uogtr_b_grf_e_gb,3100918,0.6336,nist,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4
5737,uogtr_b_grf_e_gb,3100918,0.9472,gpt4,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4
5738,uogtr_b_grf_e_gb,3100922,0.7062,nist,1,0.0383,0.5447,10,446.6774,1,1,T5,1,4


In [12]:
qrel_human_df

Unnamed: 0,qid,label_human
0,2001010,0
1,2001010,0
2,2001010,0
3,2001010,0
4,2001010,1
...,...,...
22322,3100922,0
22323,3100922,0
22324,3100922,0
22325,3100922,0


In [10]:
data = pd.merge(data, qrel_human_df, on='qid')

In [11]:
data

Unnamed: 0,run_id,qid,score,judged_by,QL,QDR,QDS,QW,DL,isSynthetic,isGPT4,LLM,isLLM,pipeline,label_human
0,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2,0
1,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2,0
2,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2,0
3,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2,0
4,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562885,uogtr_b_grf_e_gb,3100922,0.5474,gpt4,1,0.0383,0.5447,10,446.6774,1,1,T5,1,4,0
1562886,uogtr_b_grf_e_gb,3100922,0.5474,gpt4,1,0.0383,0.5447,10,446.6774,1,1,T5,1,4,0
1562887,uogtr_b_grf_e_gb,3100922,0.5474,gpt4,1,0.0383,0.5447,10,446.6774,1,1,T5,1,4,0
1562888,uogtr_b_grf_e_gb,3100922,0.5474,gpt4,1,0.0383,0.5447,10,446.6774,1,1,T5,1,4,0


In [None]:
qrel_llm_df

In [None]:
data = pd.merge(data, qrel_llm_df, on='qid')

In [None]:
data['llm_human_diff'] = data['label_llm'] - data['label_human']

In [None]:
data

In [15]:
mixed_model = "score ~ judged_by + QL + QDR + QW + DL + pipeline + label"

In [None]:
model = sm.MixedLM.from_formula(mixed_model, data, groups=data["run_id"])
result = model.fit()
result.summary()

In [35]:
mixed_model_int = "score ~ isGPT4 + Judge + QL + QDR + QW + C(LLM, Treatment(reference='Other')) + C(Judge, Treatment(reference='nist')) * C(LLM, Treatment(reference='Other')) + isGPT4 * C(LLM, Treatment(reference='Other')) + pipeline"

In [None]:
model = sm.MixedLM.from_formula(mixed_model_int, data, groups=data["run_id"])
result = model.fit()
result.summary()

In [11]:
real_queries_real_judgments = data[((data['Judge'] == 'nist') & (data['qid'].isin(real_queries_judged)))]
synthetic_queries_real_judgments = data[((data['Judge'] == 'nist') & (data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)))]

In [12]:
real_queries_synthetic_judgments = data[((data['Judge'] == 'gpt4') & (data['qid'].isin(real_queries_judged)))]
synthetic_queries_synthetic_judgments = data[((data['Judge'] == 'gpt4') & (data['qid'].isin(t5_queries_judged) | data['qid'].isin(gpt4_queries_judged)))]

In [13]:
mixed_model_condition_Qreal = "score ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline"
mixed_model_condition_Qsynthetic = "score ~ QL + QDS + QW + C(LLM, Treatment(reference='Other')) + pipeline"

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qreal, real_queries_real_judgments, groups=real_queries_real_judgments["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qsynthetic, synthetic_queries_real_judgments, groups=synthetic_queries_real_judgments["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qreal, real_queries_synthetic_judgments, groups=real_queries_synthetic_judgments["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula(mixed_model_condition_Qsynthetic, synthetic_queries_synthetic_judgments, groups=synthetic_queries_synthetic_judgments["run_id"])
result = model.fit()
result.summary()

In [18]:
# Merging the dataframes on 'run_id' and 'qid'
real_queries_diff = pd.merge(real_queries_real_judgments, real_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
real_queries_diff['score_ab'] = real_queries_diff['score_a'] - real_queries_diff['score_b']
real_queries_diff['score_ba'] = real_queries_diff['score_b'] - real_queries_diff['score_a']

In [None]:
real_queries_diff

In [None]:
model = sm.MixedLM.from_formula("score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", real_queries_diff, groups=real_queries_diff["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula("score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", real_queries_diff, groups=real_queries_diff["run_id"])
result = model.fit()
result.summary()

In [21]:
# Merging the dataframes on 'run_id' and 'qid'
synthetic_queries_diff = pd.merge(synthetic_queries_real_judgments, synthetic_queries_synthetic_judgments[['run_id', 'qid', 'score']], on=['run_id', 'qid'], suffixes=('_a', '_b'))
# Subtracting the 'Score' values
synthetic_queries_diff['score_ab'] = synthetic_queries_diff['score_a'] - synthetic_queries_diff['score_b']
synthetic_queries_diff['score_ba'] = synthetic_queries_diff['score_b'] - synthetic_queries_diff['score_a']

In [None]:
synthetic_queries_diff

In [None]:
model = sm.MixedLM.from_formula("score_ab ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", synthetic_queries_diff, groups=synthetic_queries_diff["run_id"])
result = model.fit()
result.summary()

In [None]:
model = sm.MixedLM.from_formula("score_ba ~ QL + QDR + QW + C(LLM, Treatment(reference='Other')) + pipeline", synthetic_queries_diff, groups=synthetic_queries_diff["run_id"])
result = model.fit()
result.summary()

## Extra Experiments (refer to: "Extra Exp. 1")

In [6]:
def get_result(result_file):
    result_df = pd.read_csv(result_file, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    result_df = result_df[result_df['qid'] != 'all']
    result_df['score'] = result_df['score'].astype(float)
    result_df['qid'] = result_df['qid'].astype(int)
    result_df['metric'] = result_df['metric'].apply(lambda x: x.rstrip())
    result_df = result_df[(result_df['metric'] == metric)]
    result_df.drop(['metric'], axis=1, inplace=True)
    return result_df

In [7]:
real_judge_results = get_result(result_file="results/all.pass.nist.ndcgeval")
synthetic_judge_results = get_result(result_file="results/all.pass.gpt4.ndcgeval")

In [9]:
# Merging the dataframes on 'run_id' and 'qid'
results_diff = pd.merge(real_judge_results, synthetic_judge_results, on=['run_id', 'qid'], suffixes=('_real', '_synthetic'))
# Subtracting the 'Score' values
results_diff['score_RS'] = results_diff['score_real'] - results_diff['score_synthetic']
results_diff['score_SR'] = results_diff['score_synthetic'] - results_diff['score_real']

In [13]:
results_diff = pd.merge(results_diff, qid_to_info, on='qid')
results_diff = pd.merge(results_diff, model_to_info, on='run_id')

In [None]:
results_diff

Selected: score_SR

In [17]:
results_diff['qd_RS'] = results_diff['QDR'] - results_diff['QDS']
results_diff['qd_SR'] = results_diff['QDS'] - results_diff['QDR']

In [21]:
mixed_model_1 = "score_SR ~ Synthetic + QL + qd_SR + QW + DL + pipeline + C(LLM, Treatment(reference='Other')) + Synthetic * C(LLM, Treatment(reference='Other'))"

In [None]:
model = sm.MixedLM.from_formula(mixed_model_1, results_diff, groups=results_diff["run_id"])
result = model.fit()
result.summary()