In [1]:
import pandas as pd
import json

In [2]:
def generate(out_path : str, index_location : str = None, perturbation_type : str = 'TFC1', stopwords : bool = False, exact_match : bool = False):
    """Build the original + perturbed TREC DL table and save it as a TSV file.

    Every judged (query, document) pair from the DL19 and DL20 qrels is emitted
    twice: once with the document text unchanged (``perturbed=False``) and once
    with the text returned by the selected perturbation (``perturbed=True``).
    The combined table gets a constant ``score`` column of ``0.`` and is
    written to ``{out_path}/{perturbation_type}-data.tsv.gz``.

    NOTE(review): this function relies on ``pt``, ``irds``, ``tqdm``, ``TFC1``,
    ``TDC``, ``MSMARCO_TERRIER``, ``DL19`` and ``DL20`` already being defined in
    the kernel; none of them is imported in the notebook's visible import cell.

    Parameters
    ----------
    out_path : str
        Directory the output file is written into (it is not created here).
    index_location : str, optional
        Index handed to the perturbation. When ``None`` it is resolved via
        ``pt.get_dataset(MSMARCO_TERRIER).get_index("terrier_stemmed_text")``.
    perturbation_type : str
        Either ``'TFC1'`` or ``'TDC'``.
    stopwords : bool
        Forwarded to ``TFC1``; not used for ``'TDC'``.
    exact_match : bool
        Forwarded to ``TFC1``; not used for ``'TDC'``.

    Returns
    -------
    int
        Always ``0``.

    Raises
    ------
    ValueError
        If ``perturbation_type`` is neither ``'TFC1'`` nor ``'TDC'``.
    """
    # Fail fast: reject an unsupported perturbation before touching the index
    # or the datasets, so a typo does not cost an index/dataset load first.
    if perturbation_type not in ('TFC1', 'TDC'):
        raise ValueError("perturbation must be either 'TFC1' or 'TDC'")

    if index_location is None:
        index_location = pt.get_dataset(MSMARCO_TERRIER).get_index("terrier_stemmed_text")

    if perturbation_type == 'TFC1':
        perturbation = TFC1(index_location=index_location, stem=True, stopwords=stopwords, exact_match=exact_match)
    else:
        perturbation = TDC(index_location=index_location)

    DL19_dataset = irds.load(DL19)
    DL20_dataset = irds.load(DL20)

    qrels = pd.concat([pd.DataFrame(DL19_dataset.qrels_iter()), pd.DataFrame(DL20_dataset.qrels_iter())])

    # Only the DL19 dataset's documents are loaded into the lookup table.
    # NOTE(review): presumably DL19 and DL20 share one document collection, so
    # DL20 doc_ids resolve here too -- confirm, otherwise the lookup below
    # raises KeyError for DL20-only documents.
    docs = pd.DataFrame(DL19_dataset.docs_iter()).set_index("doc_id").text.to_dict()
    queries = pd.DataFrame(DL19_dataset.queries_iter()).set_index("query_id").text.to_dict()
    queries.update(pd.DataFrame(DL20_dataset.queries_iter()).set_index("query_id").text.to_dict())

    def convert_to_trec(df : pd.DataFrame):
        """Expand qrels rows into original + perturbed rows with query/doc text attached."""
        output = {
            'qid': [],
            'query': [],
            'docno': [],
            'text': [],
            'relevance': [],
            'perturbed': [],
        }

        for row in tqdm(df.itertuples(), total=len(df), desc="Converting to TREC format"):
            query_text = queries[row.query_id]
            if query_text is None or len(query_text) == 0:
                # The row is still emitted; this only surfaces the data problem.
                print(f"WARNING: empty query text for query_id={row.query_id}")
            output['qid'].append(row.query_id)
            output['query'].append(query_text)
            output['docno'].append(row.doc_id)
            output['text'].append(docs[row.doc_id])
            output['relevance'].append(row.relevance)
            output['perturbed'].append(False)

        output = pd.DataFrame(output)

        # The perturbed half is a copy of the original rows whose text is
        # replaced by perturbation(text, query).
        perturbed_output = output.copy()
        perturbed_output['perturbed'] = True
        perturbed_output['text'] = perturbed_output.apply(lambda x : perturbation(x.text, x.query), axis=1)

        output = pd.concat([output, perturbed_output])
        output['score'] = 0.

        return output

    all_data = convert_to_trec(qrels)
    output_file = f"{out_path}/{perturbation_type}-data.tsv.gz"

    all_data.to_csv(output_file, sep="\t", index=False)

    return 0

In [24]:
# Read the JSON Lines query file, keep only the id and text columns, and rename
# them to the 'qid' / 'query' names used by the other frames in this notebook.
msmarco_queries = (
    pd.read_json('/mnt/qb/work/eickhoff/esx208/axiomatic-ir/data/tfc1/queries.jsonl', lines=True)
    .loc[:, ['_id', 'text']]
    .rename(columns={'_id': 'qid', 'text': 'query'})
)
msmarco_queries

Unnamed: 0,qid,query
0,1185869,)what was the immediate impact of the success ...
1,1185868,_________ justice is designed to repair the ha...
2,597651,what color is amber urine
3,403613,is autoimmune hepatitis a bile acid synthesis ...
4,1183785,elegxo meaning
...,...,...
509957,147073,difference between discrete and process manufa...
509958,243761,how long did abraham lincoln serve
509959,162662,does adult acne rosacea give you blepharitis
509960,247194,how long do you bake muffins


In [25]:
# Load the baseline and the perturbed ("append") corpora. Context managers close
# the file handles deterministically instead of leaving them to the garbage
# collector, and JSON text is decoded explicitly as UTF-8.
with open('/mnt/qb/work/eickhoff/esx208/axiomatic-ir/data/tfc1/tfc1_add_baseline_final_dd_corpus.json', encoding='utf-8') as corpus_file:
    original_documents = json.load(corpus_file)
with open('/mnt/qb/work/eickhoff/esx208/axiomatic-ir/data/tfc1/tfc1_add_append_final_dd_corpus.json', encoding='utf-8') as corpus_file:
    perturbed_documents = json.load(corpus_file)

def convert_to_pd(documents):
    """Flatten a nested corpus mapping into a two-column DataFrame.

    Parameters
    ----------
    documents : dict
        Mapping with a ``'corpus'`` key of the form
        ``{qid: {doc_id: {'text': ...}}}``.

    Returns
    -------
    pd.DataFrame
        Columns ``doc_id`` (``int(doc_id)``) and ``text``; one row per document
        entry, in iteration order. The query id is not kept, and a document
        listed under several queries appears once per listing (no
        de-duplication happens here).

    Raises
    ------
    KeyError
        If ``'corpus'`` or a document's ``'text'`` key is missing.
    ValueError
        If a document id cannot be converted with ``int``.
    """
    records = [
        {'doc_id': int(doc_id), 'text': doc['text']}
        for docs_for_query in documents['corpus'].values()
        for doc_id, doc in docs_for_query.items()
    ]
    # Explicit columns keep the schema stable for an empty corpus, so callers
    # can still select or de-duplicate on 'doc_id'.
    return pd.DataFrame(records, columns=['doc_id', 'text'])

# Replace the raw JSON mappings with their flattened DataFrame form.
# NOTE: both names are rebound from dict to DataFrame, so these two lines
# cannot be re-run on their own; convert_to_pd reads documents['corpus'],
# which the DataFrames do not have. Re-run the whole cell instead.
original_documents = convert_to_pd(original_documents)
perturbed_documents = convert_to_pd(perturbed_documents)

In [59]:
# Score table: only its (qid, doc_id) pairs are used below.
data_msmarco_tfc1 = pd.read_csv('/mnt/qb/work/eickhoff/esx208/axiomatic-ir/data/tfc1/tfc1_add_append_target_qids_scores.csv')

# Attach the unperturbed text to every pair (one text per doc_id).
data_msmarco_tfc1_original = pd.merge(
    data_msmarco_tfc1[['qid', 'doc_id']],
    original_documents.drop_duplicates(subset=['doc_id']),
    on=['doc_id'],
    how='inner',
).assign(perturbed=False)

# Same pairs, this time with the perturbed text.
data_msmarco_tfc1_perturbed = pd.merge(
    data_msmarco_tfc1[['qid', 'doc_id']],
    perturbed_documents.drop_duplicates(subset=['doc_id']),
    on=['doc_id'],
    how='inner',
).assign(perturbed=True)

# Stack both halves, switch to the 'docno' column name, join the query text
# and add constant placeholder relevance/score columns.
data_msmarco_tfc1 = (
    pd.concat([data_msmarco_tfc1_original, data_msmarco_tfc1_perturbed])
    .rename(columns={'doc_id': 'docno'})
    .merge(msmarco_queries.drop_duplicates(), on='qid')
    .assign(relevance=0, score=0)
)

In [63]:
# Preview the combined original + perturbed frame.
data_msmarco_tfc1

Unnamed: 0,qid,docno,text,perturbed,query,relevance,score
0,1089763,7081592,The Miners State Bank Routing Number the miner...,False,the miners state bank routing number,0,0
1,1089763,7081591,THE MINERS STATE BANK ROUTING ABA NUMBER. 0911...,False,the miners state bank routing number,0,0
2,1089763,7081593,Routing numbers depend on the type of transact...,False,the miners state bank routing number,0,0
3,1089763,7081597,Please call The Miners State Bank representati...,False,the miners state bank routing number,0,0
4,1089763,7081594,View THE MINERS STATE BANK routing numbers lis...,False,the miners state bank routing number,0,0
...,...,...,...,...,...,...,...
19995,292225,6000323,"The flood waters did not rush into Paris, crea...",True,how many people died in the molasses flood,0,0
19996,292225,4994296,"Floods and erosion. 1 15 October 1879, in Murc...",True,how many people died in the molasses flood,0,0
19997,292225,296908,"The Forgotten, Non-Kool-Aid-Drinking Victims o...",True,how many people died in the molasses flood,0,0
19998,292225,4798769,The Johnstown Flood. In a river valley in cent...,True,how many people died in the molasses flood,0,0


In [64]:
# Column names of the combined frame.
data_msmarco_tfc1.columns

Index(['qid', 'docno', 'text', 'perturbed', 'query', 'relevance', 'score'], dtype='object')

In [65]:
# Save as tab-separated values without the index column. The path is relative,
# so it resolves against the kernel's working directory; with pandas' default
# compression='infer', the '.gz' suffix makes the output gzip-compressed.
data_msmarco_tfc1.to_csv('data/TFC1-data-msmarco.tsv.gz', sep='\t', index=False)

In [30]:
# Load the tab-separated TFC1 table. The file name matches what generate()
# writes for perturbation_type='TFC1' when out_path is 'data'.
data_trec_tfc1 = pd.read_csv('data/TFC1-data.tsv.gz', sep='\t')

In [31]:
# Preview the loaded frame.
data_trec_tfc1

Unnamed: 0,qid,query,docno,text,relevance,perturbed,score
0,19335,anthropological definition of environment,1017759,Man and environment reciprocal relationship. T...,0,False,0.0
1,19335,anthropological definition of environment,1082489,Ethnographic research is a qualitative method ...,0,False,0.0
2,19335,anthropological definition of environment,109063,1. Identify the fields of anthropology and maj...,0,False,0.0
3,19335,anthropological definition of environment,1160863,Human impact on the environment. Human impact ...,0,False,0.0
4,19335,anthropological definition of environment,1160871,Human impact on the environment. Human impact ...,0,False,0.0
...,...,...,...,...,...,...,...
41287,1136962,why did the ancient egyptians call their land ...,8526087,The Aswan High Dam brought the Nile’s devastat...,0,True,0.0
41288,1136962,why did the ancient egyptians call their land ...,8537921,Why have most Egyptians lived along the Nile R...,0,True,0.0
41289,1136962,why did the ancient egyptians call their land ...,8742482,Egyptian Civilization. The basic element in th...,0,True,0.0
41290,1136962,why did the ancient egyptians call their land ...,937258,1 The people in Ancient Egypt divided Egypt in...,1,True,0.0


In [32]:
# Column names of the loaded frame.
data_trec_tfc1.columns

Index(['qid', 'query', 'docno', 'text', 'relevance', 'perturbed', 'score'], dtype='object')

In [None]:
# NOTE(review): this rebinds data_msmarco_tfc1 to the raw score table,
# replacing the merged original + perturbed frame built in an earlier cell.
# The cell has no execution count; consider a distinct variable name or
# removing it.
data_msmarco_tfc1 = pd.read_csv('/mnt/qb/work/eickhoff/esx208/axiomatic-ir/data/tfc1/tfc1_add_append_target_qids_scores.csv')

In [4]:
# NOTE(review): the saved output of this cell lists score-table columns
# (og_score, p_score, ...), not the columns of the frame read from
# 'data/TFC1-data.tsv.gz'. It looks stale (out-of-order execution); re-run
# after a kernel restart to refresh it.
data_trec_tfc1

Unnamed: 0,qid,doc_id,og_score,p_score,score_diff,percent_change,og_rank,p_rank,change_in_rank
0,1089763,7081592,111.112000,111.044907,-0.067093,-0.060383,1,1,0
1,1089763,7081591,110.666267,110.643227,-0.023041,-0.020820,2,2,0
2,1089763,7081593,109.789818,109.606476,-0.183342,-0.166994,3,3,0
3,1089763,7081597,108.552063,108.311203,-0.240860,-0.221884,4,4,0
4,1089763,7081594,107.231239,107.332611,0.101372,0.094536,5,5,0
...,...,...,...,...,...,...,...,...,...
9995,292225,6000323,97.079063,102.538704,5.459641,5.623911,96,79,17
9996,292225,4994296,97.078011,102.424881,5.346870,5.507808,97,81,16
9997,292225,296908,97.065033,101.472061,4.407028,4.540284,98,90,8
9998,292225,4798769,97.024399,101.900589,4.876190,5.025736,99,88,11


In [67]:
# Load the tab-separated file into top_k_data.
# NOTE(review): absolute, machine-specific path; this cell only runs where
# that path exists.
top_k_data = pd.read_csv('/mnt/qb/work/eickhoff/esx208/MechIR/data/topk_my/sebastian-hofstaetter-distilbert-dot-tas_b-b256-msmarco_bi_TFC1_topk_10000_msmarco.tsv', sep='\t')

In [68]:
top_k_data

Unnamed: 0,qid,docno,text,perturbed,query,relevance,perturbed_score,rank,original_score,score_delta,perturbed_text
0,1090945,7566814,(800) 44-00680,True,ryder phone number,0,114.699036,0,98.044525,16.654510,(800) 44-00680 ryder
1,1090945,4217534,Phone Contact Numbers. 1 Customer Service: 1-8...,True,ryder phone number,0,112.238360,1,96.861860,15.376495,Phone Contact Numbers. 1 Customer Service: 1-8...
2,1090945,754683,Voice Phone: 601-936-5856 (Please use this num...,True,ryder phone number,0,111.933380,2,96.629410,15.303970,Voice Phone: 601-936-5856 (Please use this num...
3,1100403,2495459,+1 519-885-6680,True,arcadis phone number,0,110.384600,0,96.137440,14.247154,+1 519-885-6680 arcadis
4,477286,6263214,"Population Estimates: 2030: 38,538: 2025: 37,8...",True,population of antigonish,0,108.876495,0,94.646850,14.229645,"Population Estimates: 2030: 38,538: 2025: 37,8..."
...,...,...,...,...,...,...,...,...,...,...,...
9995,1058515,7171315,"Cladribine, sold under the brand name Leustati...",True,what is cladribine,0,104.786770,8,105.737150,-0.950378,"Cladribine, sold under the brand name Leustati..."
9996,1090945,7668174,"318 Corporate Woods Drive, Magnolia, TX 77354....",True,ryder phone number,0,99.163605,103,100.211334,-1.047729,"318 Corporate Woods Drive, Magnolia, TX 77354...."
9997,1091654,1870899,Phone: 718-409-7221 / 800-642-1874 / 800-654-1...,True,phone number for parisian beauty school,0,94.384990,198,95.949830,-1.564842,Phone: 718-409-7221 / 800-642-1874 / 800-654-1...
9998,1091654,7901585,Phone 800-765-2122 (312-715-1010 in the Chicag...,True,phone number for parisian beauty school,0,95.213580,197,97.300720,-2.087143,Phone 800-765-2122 (312-715-1010 in the Chicag...
