In [1]:
import pandas as pd
import numpy as np
import requests
import tarfile
import json
import gzip
from tqdm import tqdm

In [2]:
# Input locations, relative to the notebook's working directory.
nist_qrel_file = "qrels/2023.qrels.pass.withDupes.txt"  # NIST judgments
gpt4_qrel_file = "qrels/2023.qrels.pass.gpt4.txt"  # GPT-4 judgments
dl_2023_queries = "dl-2023-queries.tsv"  # tab-separated query table

In [3]:
# Load the NIST judgments; each space-separated row is (qid, Q0, docid, rel).
nist_qrels = pd.read_csv(nist_qrel_file, sep=' ', header=None, names=['qid', 'Q0', 'docid', 'rel'])
queries_judged = set(nist_qrels['qid'])

# Number of judgments per (qid, rel) pair.
nist_qrels_counts = nist_qrels.groupby(['qid', 'rel']).size().reset_index(name='count')

# One row per qid, one column per relevance grade; absent (qid, rel) pairs become 0.
pivot_df = nist_qrels_counts.pivot(index='qid', columns='rel', values='count').fillna(0)

# Make sure every grade referenced below has a column, even when that grade
# never occurs in the file; otherwise pivot_df[grade] raises KeyError.
for grade in (0, 1, 2, 3):
    if grade not in pivot_df.columns:
        pivot_df[grade] = 0.0

# Grades 1, 2 and 3 count as relevant; grade 0 counts as non-relevant.
pivot_df['relevant_count'] = pivot_df[1] + pivot_df[2] + pivot_df[3]
pivot_df['non_relevant_count'] = pivot_df[0]

# Despite the column name this is a ratio, not an average: relevant judgments
# divided by non-relevant judgments. A query without grade-0 judgments yields
# inf (or NaN when it has no relevant judgments either).
pivot_df['average_relevant'] = pivot_df['relevant_count'] / pivot_df['non_relevant_count']

# Keep only the derived columns, with qid back as a regular column.
nist_qrels_avg = pivot_df[['relevant_count', 'non_relevant_count', 'average_relevant']].reset_index()

In [4]:
# Per-document statistics, tab-separated with a header row. The 'docid' and
# 'DW' columns are used below.
# NOTE(review): presumably this is the file produced by the doc_to_info writer
# cell at the end of this notebook, moved under infos/ - confirm.
doc_to_info = pd.read_csv("infos/doc_to_info.txt", sep='\t')

# Keep a single row per docid so the merge stays many-to-one. Repeated docid
# rows would otherwise multiply the matching qrels rows and over-weight those
# documents in the per-query mean.
doc_to_info_unique = doc_to_info.drop_duplicates(subset='docid')

# Inner join: judgments whose docid has no entry in doc_to_info are dropped.
nist_qrels_with_docinfo = pd.merge(nist_qrels, doc_to_info_unique, on='docid')

# Mean 'DW' over the judged documents of each query, as {qid: mean}.
qid_to_MeanDocLength = nist_qrels_with_docinfo.groupby('qid')['DW'].agg('mean').to_dict()

In [5]:
# Load the GPT-4 judgments; each space-separated row is (qid, Q0, docid, rel).
gpt4_qrels = pd.read_csv(gpt4_qrel_file, sep=' ', header=None, names=['qid', 'Q0', 'docid', 'rel'])

# Number of judgments per (qid, rel) pair.
gpt4_qrels_counts = gpt4_qrels.groupby(['qid', 'rel']).size().reset_index(name='count')

# One row per qid, one column per relevance grade; absent (qid, rel) pairs become 0.
pivot_df = gpt4_qrels_counts.pivot(index='qid', columns='rel', values='count').fillna(0)

# Make sure every grade referenced below has a column, even when that grade
# never occurs in the file; otherwise pivot_df[grade] raises KeyError.
for grade in (0, 1, 2, 3):
    if grade not in pivot_df.columns:
        pivot_df[grade] = 0.0

# Grades 1, 2 and 3 count as relevant; grade 0 counts as non-relevant.
pivot_df['relevant_count'] = pivot_df[1] + pivot_df[2] + pivot_df[3]
pivot_df['non_relevant_count'] = pivot_df[0]

# Despite the column name this is a ratio, not an average: relevant judgments
# divided by non-relevant judgments. A query without grade-0 judgments yields
# inf (or NaN when it has no relevant judgments either).
pivot_df['average_relevant'] = pivot_df['relevant_count'] / pivot_df['non_relevant_count']

# Keep only the derived columns, with qid back as a regular column.
gpt4_qrels_avg = pivot_df[['relevant_count', 'non_relevant_count', 'average_relevant']].reset_index()

In [6]:
# Lookup tables {qid: relevant/non-relevant ratio}, one per judgment source.
qid_to_avgrelevant_nist = {
    qid: ratio
    for qid, ratio in zip(nist_qrels_avg['qid'], nist_qrels_avg['average_relevant'])
}
qid_to_avgrelevant_gpt4 = {
    qid: ratio
    for qid, ratio in zip(gpt4_qrels_avg['qid'], gpt4_qrels_avg['average_relevant'])
}

In [7]:
# Query table: tab-separated, no header row; columns are named qid and qtext here.
queries = pd.read_csv(
    dl_2023_queries,
    sep='\t',
    header=None,
    names=['qid', 'qtext'],
)
queries.head()

Unnamed: 0,qid,qtext
0,2000138,How does the process of digestion and metaboli...
1,2000438,apatite definition
2,2000727,calculate salary from basic pay
3,2000882,cerebellar disease definition
4,2001010,cost comparison of funerals in australia


In [8]:
def get_query_length(query_text, long_threshold=10):
    """Classify a query as long or short by its word count.

    Words are whitespace-delimited tokens. Splitting on any run of whitespace
    (rather than on single spaces) keeps repeated, leading or trailing
    whitespace from being counted as extra words.

    Args:
        query_text: The query string.
        long_threshold: Minimum number of words for a query to count as long.

    Returns:
        A ``(query_len_type, query_len)`` tuple, where ``query_len_type`` is 1
        for a long query and 0 for a short one, and ``query_len`` is the
        number of words.
    """
    query_len = len(query_text.split())
    # long query == 1, short query == 0
    query_len_type = 1 if query_len >= long_threshold else 0
    return query_len_type, query_len

In [12]:
# Write one tab-separated feature row per judged query.
with open("query_to_info.txt", 'w') as q2i:
    q2i.write("qid\tQL\tQDR\tQDS\tQW\tDL\tSynthetic\tisGPT4\n")
    for row in queries.itertuples(index=True):
        qid = row.qid
        # Queries without NIST judgments are left out of the file.
        if qid not in queries_judged:
            continue

        len_type, n_words = get_query_length(row.qtext)
        ratio_nist = round(qid_to_avgrelevant_nist[qid], 4)
        ratio_gpt4 = round(qid_to_avgrelevant_gpt4[qid], 4)
        mean_doc_len = round(qid_to_MeanDocLength[qid], 4)

        # Flags are derived from the qid range alone:
        # qid >= 3000000 sets Synthetic, qid >= 3100000 sets isGPT4.
        synthetic_flag = 0 if qid < 3000000 else 1
        gpt4_flag = 0 if qid < 3100000 else 1

        q2i.write(f"{qid}\t{len_type}\t{ratio_nist}\t{ratio_gpt4}\t{n_words}\t{mean_doc_len}\t{synthetic_flag}\t{gpt4_flag}\n")

## Documents/Passages

In [None]:
# Distinct docids that appear in the NIST judgments.
docids = set(nist_qrels['docid'].tolist())

In [None]:
def read_bundles(bundlenum):
    """Collect the text of judged passages from one gzipped passage bundle.

    Reads ``msmarco_v2_passage/msmarco_passage_{bundlenum}.gz`` line by line,
    parsing each line as a JSON object. For every record whose ``pid`` is in
    the module-level ``docids`` set, the record's ``passage`` is stored in the
    module-level ``docid_to_passage`` dict under that pid. Both names must be
    defined before this function is called.

    Args:
        bundlenum: Value interpolated into the bundle file name.
    """
    bundle_path = f'msmarco_v2_passage/msmarco_passage_{bundlenum}.gz'
    with gzip.open(bundle_path, 'r') as fpassage:
        for raw_line in fpassage:
            # The file is opened in binary mode, so decode before parsing.
            record = json.loads(raw_line.decode('utf8'))
            pid = record['pid']
            if pid not in docids:
                continue
            docid_to_passage[pid] = record['passage']

In [None]:
# Read TREC passages: scan bundles 00..69 and keep the judged ones.
docid_to_passage = {}

for bundle_index in tqdm(range(0, 70)):
    # Bundle files use a two-digit, zero-padded number. Formatting into a
    # separate string keeps the loop variable an int and always passes the
    # same type to read_bundles.
    read_bundles(bundlenum=f'{bundle_index:02d}')

In [None]:
def get_doc_length(passage):
    """Return the number of whitespace-delimited words in ``passage``.

    Splitting on any run of whitespace (rather than on single spaces) keeps
    repeated, leading or trailing whitespace from being counted as words, and
    treats tabs and newlines as separators too.

    Args:
        passage: The passage text.

    Returns:
        The word count as an int; 0 for an empty or whitespace-only passage.
    """
    return len(passage.split())

In [None]:
# Write one tab-separated (docid, DW) row per distinct judged document, where
# DW is the value returned by get_doc_length for the passage text.
with open("doc_to_info.txt", 'w') as d2i:
    d2i.write("docid\tDW\n")
    # Iterate over distinct docids (first-occurrence order) rather than over
    # qrels rows, so a passage judged for several queries is written once.
    # Repeated docid rows would turn a later merge on docid into a
    # many-to-many join.
    for docid in nist_qrels['docid'].drop_duplicates():
        doc_len = get_doc_length(docid_to_passage[docid])
        d2i.write(f"{docid}\t{doc_len}\n")