In [1]:
# TASK 1: Analysis of step-one performance retrieval.
# First thing, start easy: build two-indexes, evaluate them top-m retrievals
# Hit-once and Hit-all. Then see other metrics like precision & recall + Roni's analysis.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# cleaning sample_data directory
!rm -r sample_data

In [3]:
!ls drive/My\ Drive/touche-2022-prototyping

dataset-prep-and-retrieval-diagnostic-analysis.ipynb
indexes
initial-retrieval-metric-analysis.ipynb
mon-duo-retrieval-prototyping-and-analysis.ipynb
topics-task2-51-100.xml
topics-task-2.xml
touche2020-task2-relevance-withbaseline.qrels
touche_results_2021.csv
touche-task2-51-100-relevance.qrels
touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl


In [4]:
# installing important packages for analyzing the code.
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [5]:
# import statements
import pandas as pd
import jsonlines
from xml.dom import minidom
import numpy as np

In [None]:
# original corpus, not loading this corpus for the analysis.
# corpus_list = list(jsonlines.open('drive/My Drive/touche-2022-prototyping/touche-task2-passages-version-002.jsonl'))

In [6]:
# query expansion corpus, the latest one.
qcorpus_list = list(jsonlines.open('drive/My Drive/touche-2022-prototyping/touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl'))

In [7]:
# printing demo query, only part of the document is available for retrieval.
qcorpus_list[0]

{'chatNoirUrl': 'https://chatnoir.eu/cache?uuid=25e04d49-8df7-58c9-8fae-c5bd54a070ba&index=cw12&raw&plain',
 'contents': "Do Asian-Americans Face Bias in Admissions at Elite Colleges? - NYTimes.com Home Page Today's Paper Video Most Popular Times Topics Search All NYTimes.com Education World U.S. Politics Education Bay Area Chicago Texas N.Y. / Region Business Technology Science Health Sports Opinion Arts Style Travel Jobs Real Estate Autos February 8, 2012, 1:43 pm Do Asian-Americans Face Bias in Admissions at Elite Colleges? By DANIEL E. SLOTNIK 6:08 p.m. | Updated A statement from Princeton was added to the story. The Department of Education’s Office for Civil Rights is examining complaints thatHarvard and Princeton have discriminated against Asian-American undergraduate applicants, highlighting a concern of many Asian-American parents. The inquiry was first reported by Bloomberg News.<query> do asian americans face bias in admissions</query>",
 'id': 'clueweb12-0000tw-00-14115___1'

In [8]:
# define function for loading all the topics from the topics files.
def parse_xml(path):
  answer_list = []
  xmldoc = minidom.parse(path)
  itemlist = xmldoc.getElementsByTagName('topics')
  topic_list = itemlist[0].getElementsByTagName('topic')
  for topic in topic_list:
    tuple_for_add = tuple((topic.getElementsByTagName('number')[0].firstChild.nodeValue, topic.getElementsByTagName('title')[0].firstChild.nodeValue))
    answer_list.append(tuple_for_add)
  parsed=pd.DataFrame(answer_list, columns=["Number","Title"])
  return parsed

In [9]:
topics_2020 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task-2.xml")
topics_2020.head()

Unnamed: 0,Number,Title
0,1,\nWhat is the difference between sex and love?\n
1,2,"\nWhich is better, a laptop or a desktop?\n"
2,3,"\nWhich is better, Canon or Nikon?\n"
3,4,\nWhat are the best dish detergents?\n
4,5,\nWhat are the best cities to live in?\n


In [10]:
topics_2021 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-51-100.xml")
topics_2021.head()

Unnamed: 0,Number,Title
0,51,"What is better at reducing fever in children, ..."
1,52,What are the best rice cookers?
2,53,Should I buy steel or ceramic knives?
3,54,Is morning or afternoon sun the best for fruit...
4,55,"What is better for back pain, chiropractic the..."


In [16]:
touche_topics = topics_2020.append(topics_2021, ignore_index=True)
touche_topics

Unnamed: 0,Number,Title
0,1,\nWhat is the difference between sex and love?\n
1,2,"\nWhich is better, a laptop or a desktop?\n"
2,3,"\nWhich is better, Canon or Nikon?\n"
3,4,\nWhat are the best dish detergents?\n
4,5,\nWhat are the best cities to live in?\n
...,...,...
95,96,"Which is healthier to wear, boxers or briefs?"
96,97,What is the difference between a blender vs a ...
97,98,"Which is better, rock or rap?"
98,99,Do you think imagination is better than knowle...


In [None]:
# Run loop for topics_2021 data, store retrieved in order of appearance m=275
solution_dict = {} # topic_id, corresponding_document list.
for id_, q_ in zip(topics_2021['Number'], topics_2021['Title']):
    hits = searcher.search(q_, k=500)
    d_list = []
    for h_ in hits:
        d_= h_.docid.split('___')[0]
        if d_ not in d_list:
            d_list.append(d_)

    solution_dict[id_] = d_list

In [None]:
searcher_opt = LuceneSearcher('/content/drive/MyDrive/touche-2022-prototyping/indexes/baseline_index')
searcher_opt.set_bm25(1.2, 0.68)
hits = searcher_opt.search('Who is stronger, Hulk or Superman?', k=12)
# Value of K determines the number of documents that can be returned.
print('Number of document matches from the index: '+str(len(hits)))
for i in range(len(hits)):
    print(f'{i+1} {hits[i].docid} {hits[i].score:.5f}')

Number of document matches from the index: 12
1 clueweb12-1400wb-06-30596___3 15.37180
2 clueweb12-1709wb-12-07226___7 14.85930
3 clueweb12-0605wb-29-30060___11 14.84560
4 clueweb12-0605wb-29-30060___10 13.93880
5 clueweb12-0605wb-29-30060___2 13.85450
6 clueweb12-1400wb-06-30596___4 13.81460
7 clueweb12-0904wb-33-14589___7 13.78280
8 clueweb12-1411wb-25-14634___8 13.60230
9 clueweb12-1014wb-84-07457___12 13.35340
10 clueweb12-1400tw-67-08199___28 13.04680
11 clueweb12-0306wb-63-29079___5 12.96960
12 clueweb12-0605wb-57-17257___8 12.88170


In [None]:
# Run loop for topics_2021 data, store retrieved in order of appearance m=275
solution_dict_opt = {} # topic_id, corresponding_document list.
for id_, q_ in zip(topics_2021['Number'], topics_2021['Title']):
    hits = searcher_opt.search(q_, k=500)
    d_list = []
    for h_ in hits:
        d_= h_.docid.split('___')[0]
        if d_ not in d_list:
            d_list.append(d_)

    solution_dict_opt[id_] = d_list

In [17]:
# creating new json formatted file which only has the matching doc-ids from corpus.
jsonObj = pd.read_json(path_or_buf='drive/My Drive/touche-2022-prototyping/touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl', lines=True)
jsonObj["old_id"]=jsonObj["id"].str.split("___",expand=True)[0]
documents_2021=jsonObj.groupby(by="old_id")["contents"].apply(" ".join).reset_index()
del jsonObj
documents_2021.head()

Unnamed: 0,old_id,contents
0,clueweb12-0000tw-00-14115,Do Asian-Americans Face Bias in Admissions at ...
1,clueweb12-0000tw-00-15738,Big Data Analytics a Key Enabler for Social CR...
2,clueweb12-0000tw-00-17905,Best Bets: Fern brings magic to Milford - Delm...
3,clueweb12-0000tw-01-02230,Python Data Analysis Library — pandas: Python ...
4,clueweb12-0000tw-01-15084,MILLARWORLD WEEK: Mark Millar’s Master Plan | ...


In [19]:
# creating new jsonl file for building newer indexes for the pyserini library.
output = jsonlines.open('/content/drive/MyDrive/touche-2022-prototyping/documents.jsonl', 'w')
for id_, cn_ in zip(documents_2021['old_id'], documents_2021['contents']):
    output.write({
        'id': id_,
        'contents': cn_
})

In [18]:
rel_2021 = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/touche-task2-51-100-relevance.qrels',
                        sep="\s+", 
                        names=["qid", "no", "doc", "rel"])
rel_2021 = rel_2021.astype(str)
rel_2021.head()

Unnamed: 0,qid,no,doc,rel
0,54,0,clueweb12-0205wb-64-11095,0
1,54,0,clueweb12-0501wb-64-06459,1
2,54,0,clueweb12-0207wb-30-15337,0
3,54,0,clueweb12-0906wb-38-27123,0
4,54,0,clueweb12-0907wb-55-11510,0


In [21]:
rel_2020 = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/touche2020-task2-relevance-withbaseline.qrels',
                        sep="\s+", 
                        names=["qid", "no", "doc", "rel"])
rel_2020 = rel_2020.astype(str)
rel_2020.head()

Unnamed: 0,qid,no,doc,rel
0,1,0,clueweb12-0001wb-05-12311,0
1,1,0,clueweb12-1811wb-62-08424,1
2,1,0,clueweb12-1811wb-62-08423,1
3,1,0,clueweb12-1217wb-47-14048,0
4,1,0,clueweb12-1811wb-62-08425,1


In [22]:
qrel_topics = rel_2020.append(rel_2021, ignore_index=True)
qrel_topics

Unnamed: 0,qid,no,doc,rel
0,1,0,clueweb12-0001wb-05-12311,0
1,1,0,clueweb12-1811wb-62-08424,1
2,1,0,clueweb12-1811wb-62-08423,1
3,1,0,clueweb12-1217wb-47-14048,0
4,1,0,clueweb12-1811wb-62-08425,1
...,...,...,...,...
3854,86,0,clueweb12-0008wb-85-29076,0
3855,86,0,clueweb12-1008wb-62-10779,0
3856,86,0,clueweb12-0202wb-34-19787,0
3857,86,0,clueweb12-0915wb-71-22856,0


In [24]:
# printing number of 'qid' values in the qrel dataframe.
print(len(set(qrel_topics['qid'])))

100


In [25]:
# printing the data types
print(touche_topics.dtypes)
print('\n')
print(qrel_topics.dtypes)

Number    object
Title     object
dtype: object


qid    object
no     object
doc    object
rel    object
dtype: object


In [26]:
percentage=[]
not_common_docs=[]
for qid in touche_topics["Number"]:
    relevant=qrel_topics.loc[qrel_topics["qid"]==qid]["doc"]
    intersection=set(relevant) & set(documents_2021["old_id"])
    not_common=list(set(relevant) -set(documents_2021["old_id"]))
    percent=len(intersection)/len(relevant)
    not_common_docs.append(not_common)
    percentage.append({"title_id":qid,"Percentage":percent,"Not_common":not_common})
df=pd.DataFrame(percentage)
df.head()

Unnamed: 0,title_id,Percentage,Not_common
0,1,0.735294,"[clueweb12-0412wb-04-03887, clueweb12-1811wb-6..."
1,2,0.909091,"[clueweb12-0109wb-28-26419, clueweb12-1516wb-6..."
2,3,0.96875,[clueweb12-0500wb-68-25390]
3,4,0.972222,[clueweb12-1700tw-38-15104]
4,5,0.972222,[clueweb12-1514wb-77-21531]


In [27]:
df

Unnamed: 0,title_id,Percentage,Not_common
0,1,0.735294,"[clueweb12-0412wb-04-03887, clueweb12-1811wb-6..."
1,2,0.909091,"[clueweb12-0109wb-28-26419, clueweb12-1516wb-6..."
2,3,0.968750,[clueweb12-0500wb-68-25390]
3,4,0.972222,[clueweb12-1700tw-38-15104]
4,5,0.972222,[clueweb12-1514wb-77-21531]
...,...,...,...
95,96,0.903226,"[clueweb12-1203wb-55-00251, clueweb12-0206wb-4..."
96,97,0.914286,"[clueweb12-1019wb-25-02525, clueweb12-1810wb-9..."
97,98,0.927273,"[clueweb12-0805wb-63-13916, clueweb12-0110wb-5..."
98,99,0.886792,"[clueweb12-0400wb-19-11055, clueweb12-0406wb-9..."


In [28]:
not_common_docs=list(np.concatenate(not_common_docs).flat)
new_rel_2021=qrel_topics[~qrel_topics["doc"].isin(not_common_docs)]
new_rel_2021.to_csv("/content/drive/MyDrive/touche-2022-prototyping/touche_ground_truth.csv")
touche_topics.to_csv("/content/drive/MyDrive/touche-2022-prototyping/touche_complete_topics.csv")

In [30]:
# verifying the irrelevant doc-id elimination operation.
percentage=[]
not_common_docs=[]
for qid in touche_topics["Number"]:
    relevant=new_rel_2021.loc[new_rel_2021["qid"]==qid]["doc"]
    intersection=set(relevant) & set(documents_2021["old_id"])
    not_common=list(set(relevant) -set(documents_2021["old_id"]))
    percent=len(intersection)/len(relevant)
    not_common_docs.append(not_common)
    percentage.append({"title_id":qid,"Percentage":percent,"Not_common":not_common})
print(percentage)
# successfully the additionals documents are eliminated.
# -> (2076+1783-3497), loss of 362 documents relevant documents from the ground truth.
# when we convert

[{'title_id': '1', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '2', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '3', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '4', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '5', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '6', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '7', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '8', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '9', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '10', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '11', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '12', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '13', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '14', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '15', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '16', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '17', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '18', 'Percentage': 1.0, '

In [32]:
new_rel_2021 = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/touche_ground_truth.csv')
new_rel_2021.head()

Unnamed: 0.1,Unnamed: 0,qid,no,doc,rel
0,0,1,0,clueweb12-0001wb-05-12311,0
1,3,1,0,clueweb12-1217wb-47-14048,0
2,7,1,0,clueweb12-1214wb-88-29751,2
3,8,1,0,clueweb12-0307wb-05-31620,0
4,9,1,0,clueweb12-1500wb-68-15142,0


In [33]:
new_rel_2021

Unnamed: 0.1,Unnamed: 0,qid,no,doc,rel
0,0,1,0,clueweb12-0001wb-05-12311,0
1,3,1,0,clueweb12-1217wb-47-14048,0
2,7,1,0,clueweb12-1214wb-88-29751,2
3,8,1,0,clueweb12-0307wb-05-31620,0
4,9,1,0,clueweb12-1500wb-68-15142,0
...,...,...,...,...,...
3492,3852,86,0,clueweb12-0304wb-56-05255,1
3493,3853,86,0,clueweb12-1804wb-01-07034,0
3494,3854,86,0,clueweb12-0008wb-85-29076,0
3495,3856,86,0,clueweb12-0202wb-34-19787,0
