In [None]:
import glob
import os

import pandas as pd
import statsmodels.api as sm

In [None]:
# Evaluation measure to keep from the result files.
# This notebook uses "map" here and "ndcg_cut_10" in the NDCG section.
metric = 'map'
# File extension of the result files to glob for ("treceval" or "ndcgeval").
result_format = 'treceval'

In [None]:
# Load the per-query scores for `metric` from every result file whose
# extension is `result_format`, and stack them into one long frame with the
# columns run_id, qid, score, judged_by.
results_pattern = f'./results/TRECDL2023/*.{result_format}'
results_df_list = []

# sorted(): glob returns files in arbitrary order; sorting keeps the row
# order of the combined frame reproducible across machines and runs.
for infile in sorted(glob.glob(results_pattern)):
    # The judge label is the third dot-separated field of the file name.
    # Taking the basename first keeps this independent of how deep the
    # results directory is and of the platform's path separator.
    judger = os.path.basename(infile).split('.')[2]
    raw_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    # One chained expression so every step works on a fresh frame; assigning
    # into a boolean-filtered slice would raise SettingWithCopyWarning.
    result_df = (
        raw_df.loc[raw_df['qid'] != 'all']                       # drop the 'all' summary rows
        .assign(metric=lambda df: df['metric'].str.rstrip())     # metric names may carry trailing whitespace
        .loc[lambda df: df['metric'] == metric]                  # keep only the selected measure
        .drop(columns=['metric'])
        .assign(judged_by=judger)
        .astype({'qid': int, 'score': float})
    )
    results_df_list.append(result_df)

if not results_df_list:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"no result files match {results_pattern!r}")

results_dfs = pd.concat(results_df_list)
results_dfs

In [None]:
# Split the distinct judged query ids into three lists by numeric id range.
# presumably the ranges separate real, T5-generated and GPT-4-generated
# queries (going by the variable names) — TODO confirm.
# NOTE(review): an id exactly equal to 3000000 or 3100000 satisfies none of
# the three conditions and ends up in no list — confirm no such ids exist.
queries_judged = set(results_dfs['qid'])
real_queries_judged = [qid for qid in queries_judged if qid < 3000000]
t5_queries_judged = [qid for qid in queries_judged if 3000000 < qid < 3100000]
gpt4_queries_judged = [qid for qid in queries_judged if qid > 3100000]

In [None]:
# Load the tab-separated lookup tables (read_table defaults to sep='\t').
# qid_to_info and model_to_info are merged onto the scores further down;
# doc_to_info is only referenced by a commented-out merge in the cells shown.
qid_to_info = pd.read_table("infos/query_to_info.txt")
doc_to_info = pd.read_table("infos/pass_to_info.txt")
model_to_info = pd.read_table("infos/model_to_info.txt")

In [None]:
# Attach query-level and run-level covariates to every score row.
# Both merges use pandas' default inner join, so score rows without a match
# in either lookup table are dropped.
data = (
    results_dfs
    .merge(qid_to_info, on='qid')
    # .merge(doc_to_info, on='qid')   # merge of doc_to_info is left disabled
    .merge(model_to_info, on='run_id')
)

In [None]:
# Summarise the string-typed columns, then show the merged frame itself.
# A notebook cell only renders its last expression, so the summary must be
# passed to display() explicitly; as a bare statement its result was discarded.
# display() is provided by the IPython kernel without an import.
display(data.describe(include='object'))
data

In [None]:
# Model formula: `score` regressed on the judge (treatment-coded, reference
# level 'nist') crossed with each covariate in the parenthesised sum. In
# formula syntax `a * (b + c)` expands to the main effects plus the a:b and
# a:c interactions. ST is treatment-coded with reference level 'Other'.
# presumably the covariates are columns of the merged info tables — TODO confirm.
mixed_model_int = (
    "score ~ C(judged_by, Treatment(reference='nist')) * "
    "(QDR + QW + DL + isGPT4 + C(ST, Treatment(reference='Other')) + MN) "
)

In [None]:
# Fit the linear mixed-effects model described by `mixed_model_int`.
# Rows are grouped by run_id; no re_formula is passed, so statsmodels'
# default random-effects structure is used (per the MixedLM documentation,
# a random intercept for each group).
model_int = sm.MixedLM.from_formula(
    formula=mixed_model_int,
    data=data,
    groups=data["run_id"],
)
result_int = model_int.fit()
# Last expression of the cell: renders the fitted-model summary table.
result_int.summary()

# NDCG@10 analysis (`ndcg_cut_10`)

In [None]:
# Switch the configuration to the NDCG run: keep the "ndcg_cut_10" rows and
# read the result files with the "ndcgeval" extension.
metric = "ndcg_cut_10"
result_format = "ndcgeval"

In [None]:
# Load the per-query scores for `metric` from every result file whose
# extension is `result_format`, and stack them into one long frame with the
# columns run_id, qid, score, judged_by.
results_pattern = f'./results/TRECDL2023/*.{result_format}'
results_df_list = []

# sorted(): glob returns files in arbitrary order; sorting keeps the row
# order of the combined frame reproducible across machines and runs.
for infile in sorted(glob.glob(results_pattern)):
    # The judge label is the third dot-separated field of the file name.
    # Taking the basename first keeps this independent of how deep the
    # results directory is and of the platform's path separator.
    judger = os.path.basename(infile).split('.')[2]
    raw_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])
    # One chained expression so every step works on a fresh frame; assigning
    # into a boolean-filtered slice would raise SettingWithCopyWarning.
    result_df = (
        raw_df.loc[raw_df['qid'] != 'all']                       # drop the 'all' summary rows
        .assign(metric=lambda df: df['metric'].str.rstrip())     # metric names may carry trailing whitespace
        .loc[lambda df: df['metric'] == metric]                  # keep only the selected measure
        .drop(columns=['metric'])
        .assign(judged_by=judger)
        .astype({'qid': int, 'score': float})
    )
    results_df_list.append(result_df)

if not results_df_list:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"no result files match {results_pattern!r}")

results_dfs = pd.concat(results_df_list)
results_dfs

In [None]:
# Reload the tab-separated lookup tables (read_table defaults to sep='\t').
# qid_to_info and model_to_info are merged onto the scores further down;
# doc_to_info is only referenced by a commented-out merge in the cells shown.
qid_to_info = pd.read_table("infos/query_to_info.txt")
doc_to_info = pd.read_table("infos/pass_to_info.txt")
model_to_info = pd.read_table("infos/model_to_info.txt")

In [None]:
# Split the distinct judged query ids into three lists by numeric id range.
# presumably the ranges separate real, T5-generated and GPT-4-generated
# queries (going by the variable names) — TODO confirm.
# NOTE(review): an id exactly equal to 3000000 or 3100000 satisfies none of
# the three conditions and ends up in no list — confirm no such ids exist.
queries_judged = set(results_dfs['qid'])
real_queries_judged = [qid for qid in queries_judged if qid < 3000000]
t5_queries_judged = [qid for qid in queries_judged if 3000000 < qid < 3100000]
gpt4_queries_judged = [qid for qid in queries_judged if qid > 3100000]

In [None]:
# Attach query-level and run-level covariates to every score row.
# Both merges use pandas' default inner join, so score rows without a match
# in either lookup table are dropped.
data = (
    results_dfs
    .merge(qid_to_info, on='qid')
    # .merge(doc_to_info, on='qid')   # merge of doc_to_info is left disabled
    .merge(model_to_info, on='run_id')
)

In [None]:
# Show the merged frame (scores joined with the query and model info tables)
# as the cell's output.
data

In [None]:
# Model formula: `score` regressed on the judge (treatment-coded, reference
# level 'nist') crossed with each covariate in the parenthesised sum. In
# formula syntax `a * (b + c)` expands to the main effects plus the a:b and
# a:c interactions. ST is treatment-coded with reference level 'Other'.
# presumably the covariates are columns of the merged info tables — TODO confirm.
mixed_model_int = (
    "score ~ C(judged_by, Treatment(reference='nist')) * "
    "(QDR + QW + DL + isGPT4 + C(ST, Treatment(reference='Other')) + MN) "
)

In [None]:
# Fit the linear mixed-effects model described by `mixed_model_int`.
# Rows are grouped by run_id; no re_formula is passed, so statsmodels'
# default random-effects structure is used (per the MixedLM documentation,
# a random intercept for each group).
model_int = sm.MixedLM.from_formula(
    formula=mixed_model_int,
    data=data,
    groups=data["run_id"],
)
result_int = model_int.fit()
# Last expression of the cell: renders the fitted-model summary table.
result_int.summary()