# All Qrels in One Place

In [2]:
import pandas as pd

In [3]:
RESOURCE = "TestJudge"

## Loading GPT-4 Qrels

In [4]:
# Read the synthetic (GPT-4) judgments for the 10-depth pools, one frame per year (dl2019-dl2023).
# Each file is parsed as space-separated with no header row; column names are assigned on load.
final_synthetic_judgment_path = "gpt4-judgments/all-queries-processed-errors"
gpt4_qrel_columns = ['qid', 'Q0', 'docid', 'rel']
dl19_qrel, dl20_qrel, dl21_qrel, dl22_qrel, dl23_qrel = [
    pd.read_csv(
        f"{final_synthetic_judgment_path}/gpt4_judgments_dl{year}_processed.txt",
        sep=' ',
        header=None,
        names=gpt4_qrel_columns,
    )
    for year in (2019, 2020, 2021, 2022, 2023)
]

In [5]:
# Stack the five per-year GPT-4 qrel frames into a single frame with a fresh 0..n-1 index.
gpt4_frames_by_year = [dl19_qrel, dl20_qrel, dl21_qrel, dl22_qrel, dl23_qrel]
dls_qrel = pd.concat(gpt4_frames_by_year, ignore_index=True, sort=False)
dls_qrel.shape

(196846, 4)

In [6]:
# A (qid, docid) pair may appear in the judgment pools of more than one year.
# Keep a single row per pair; drop_duplicates retains the first occurrence by default.
dls_qrel = dls_qrel.drop_duplicates(subset=['qid', 'docid'])
dls_qrel.shape

(196846, 4)

In [7]:
# Write the GPT-4 qrels (dls_qrel) to disk, one space-separated line per row in the
# layout "qid 0 docid rel". The second column is always written as a literal 0,
# regardless of the value held in the frame's Q0 column.
# The `with` block closes the file even if a write raises part-way through,
# which the previous manual open()/close() pair did not guarantee.
with open(f"final-qrels/{RESOURCE}_qrels_pass.txt", 'w') as dls_gpt4_judgments:
    # index=False: the row index is never written, so do not materialise it per tuple.
    for eachqrel in dls_qrel.itertuples(index=False):
        dls_gpt4_judgments.write(f"{eachqrel.qid} 0 {eachqrel.docid} {eachqrel.rel}\n")

## Loading NIST Qrels

In [10]:
# Read the human qrels, one frame per year, from each TREC-DL-<year>/qrels-pass.txt file.
# Each file is parsed as space-separated with no header row; column names are assigned on load.
nist_qrel_columns = ['qid', 'Q0', 'docid', 'rel']
dl19_qrel_nist, dl20_qrel_nist, dl21_qrel_nist, dl22_qrel_nist, dl23_qrel_nist = [
    pd.read_csv(
        f"TREC-DL-{year}/qrels-pass.txt",
        sep=' ',
        header=None,
        names=nist_qrel_columns,
    )
    for year in (2019, 2020, 2021, 2022, 2023)
]

In [11]:
# Stack the five per-year human qrel frames into a single frame with a fresh 0..n-1 index.
nist_frames_by_year = [
    dl19_qrel_nist,
    dl20_qrel_nist,
    dl21_qrel_nist,
    dl22_qrel_nist,
    dl23_qrel_nist,
]
dls_qrel_nist = pd.concat(nist_frames_by_year, ignore_index=True, sort=False)
dls_qrel_nist.shape

(440217, 4)

In [13]:
# Keep one row per (qid, docid) pair in the human qrels;
# drop_duplicates retains the first occurrence by default.
dls_qrel_nist = dls_qrel_nist.drop_duplicates(subset=['qid', 'docid'])
dls_qrel_nist.shape

(440217, 4)

In [14]:
# Combine the human (NIST) and GPT-4 qrels into one frame with a fresh index.
# The NIST rows are placed first, followed by the GPT-4 rows.
nist_then_gpt4 = [dls_qrel_nist, dls_qrel]
dls_qrel_nistXgpt4 = pd.concat(nist_then_gpt4, ignore_index=True, sort=False)
dls_qrel_nistXgpt4.shape

(637063, 4)

In [15]:
# Keep one row per (qid, docid) pair in the combined qrels.
# drop_duplicates retains the first occurrence by default, and the NIST rows were
# concatenated ahead of the GPT-4 rows, so the NIST row wins when both judged a pair.
dls_qrel_nistXgpt4 = dls_qrel_nistXgpt4.drop_duplicates(subset=['qid', 'docid'])
dls_qrel_nistXgpt4.shape

(637063, 4)

In [16]:
# Write the combined NIST + GPT-4 qrels (dls_qrel_nistXgpt4) to disk, one
# space-separated line per row in the layout "qid 0 docid rel". The second column
# is always written as a literal 0, regardless of the value held in the frame's Q0 column.
# The `with` block closes the file even if a write raises part-way through,
# which the previous manual open()/close() pair did not guarantee.
with open(f"final-qrels/{RESOURCE}_qrels_pass_withNIST.txt", 'w') as dls_nistXgpt4_judgments:
    # index=False: the row index is never written, so do not materialise it per tuple.
    for eachqrel in dls_qrel_nistXgpt4.itertuples(index=False):
        dls_nistXgpt4_judgments.write(f"{eachqrel.qid} 0 {eachqrel.docid} {eachqrel.rel}\n")

In [19]:
# Number of distinct query ids in the combined NIST + GPT-4 qrels.
combined_unique_qids = set(dls_qrel_nistXgpt4['qid'])
len(combined_unique_qids)

1988

### Qrel Statistics

In [23]:
# GPT-4 qrels: number of distinct query ids, then the frame's (rows, columns) shape,
# each printed on its own line.
print(len(set(dls_qrel['qid'])), dls_qrel.shape, sep="\n")

1763
(196846, 4)


In [24]:
# Combined NIST + GPT-4 qrels: number of distinct query ids, then the frame's
# (rows, columns) shape, each printed on its own line.
print(len(set(dls_qrel_nistXgpt4['qid'])), dls_qrel_nistXgpt4.shape, sep="\n")

1988
(637063, 4)


In [30]:
# Number of GPT-4 qrel rows whose `rel` equals 0, 1, 2 and 3, one count per line.
for rel_label in range(4):
    print(int((dls_qrel['rel'] == rel_label).sum()))

51966
63212
32658
49010


In [31]:
# Number of combined NIST + GPT-4 qrel rows whose `rel` equals 0, 1, 2 and 3, one count per line.
for rel_label in range(4):
    print(int((dls_qrel_nistXgpt4['rel'] == rel_label).sum()))

369567
126406
86162
54928
