In [1]:
# TASK 1: Analysis of step-one performance retrieval.
# First thing, start easy: build two-indexes, evaluate them top-m retrievals
# Hit-once and Hit-all. Then see other metrics like precision & recall + Roni's analysis.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# cleaning sample_data directory
!rm -r sample_data

In [3]:
!ls drive/My\ Drive/touche-2022-prototyping

dataset-prep-and-retrieval-diagnostic-analysis.ipynb
indexes
initial-retrieval-metric-analysis.ipynb
mon-duo-retrieval-prototyping-and-analysis.ipynb
topics-task2-51-100.xml
topics-task-2.xml
touche2020-task2-relevance-withbaseline.qrels
touche_results_2021.csv
touche-task2-51-100-relevance.qrels
touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl


In [4]:
# installing important packages for analyzing the code.
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [5]:
# import statements
import pandas as pd
import jsonlines
from xml.dom import minidom
import numpy as np

In [None]:
# original corpus, not loading this corpus for the analysis.
# corpus_list = list(jsonlines.open('drive/My Drive/touche-2022-prototyping/touche-task2-passages-version-002.jsonl'))

In [6]:
# query expansion corpus, the latest one.
qcorpus_list = list(jsonlines.open('drive/My Drive/touche-2022-prototyping/touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl'))

In [7]:
# printing demo query, only part of the document is available for retrieval.
qcorpus_list[0]

{'chatNoirUrl': 'https://chatnoir.eu/cache?uuid=25e04d49-8df7-58c9-8fae-c5bd54a070ba&index=cw12&raw&plain',
 'contents': "Do Asian-Americans Face Bias in Admissions at Elite Colleges? - NYTimes.com Home Page Today's Paper Video Most Popular Times Topics Search All NYTimes.com Education World U.S. Politics Education Bay Area Chicago Texas N.Y. / Region Business Technology Science Health Sports Opinion Arts Style Travel Jobs Real Estate Autos February 8, 2012, 1:43 pm Do Asian-Americans Face Bias in Admissions at Elite Colleges? By DANIEL E. SLOTNIK 6:08 p.m. | Updated A statement from Princeton was added to the story. The Department of Education’s Office for Civil Rights is examining complaints thatHarvard and Princeton have discriminated against Asian-American undergraduate applicants, highlighting a concern of many Asian-American parents. The inquiry was first reported by Bloomberg News.<query> do asian americans face bias in admissions</query>",
 'id': 'clueweb12-0000tw-00-14115___1'

In [8]:
# define function for loading all the topics from the topics files.
def parse_xml(path):
  answer_list = []
  xmldoc = minidom.parse(path)
  itemlist = xmldoc.getElementsByTagName('topics')
  topic_list = itemlist[0].getElementsByTagName('topic')
  for topic in topic_list:
    tuple_for_add = tuple((topic.getElementsByTagName('number')[0].firstChild.nodeValue, topic.getElementsByTagName('title')[0].firstChild.nodeValue))
    answer_list.append(tuple_for_add)
  parsed=pd.DataFrame(answer_list, columns=["Number","Title"])
  return parsed

In [9]:
topics_2020 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task-2.xml")
topics_2020.head()

Unnamed: 0,Number,Title
0,1,\nWhat is the difference between sex and love?\n
1,2,"\nWhich is better, a laptop or a desktop?\n"
2,3,"\nWhich is better, Canon or Nikon?\n"
3,4,\nWhat are the best dish detergents?\n
4,5,\nWhat are the best cities to live in?\n


In [10]:
topics_2021 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-51-100.xml")
topics_2021.head()

Unnamed: 0,Number,Title
0,51,"What is better at reducing fever in children, ..."
1,52,What are the best rice cookers?
2,53,Should I buy steel or ceramic knives?
3,54,Is morning or afternoon sun the best for fruit...
4,55,"What is better for back pain, chiropractic the..."


In [16]:
touche_topics = topics_2020.append(topics_2021, ignore_index=True)
touche_topics

Unnamed: 0,Number,Title
0,1,\nWhat is the difference between sex and love?\n
1,2,"\nWhich is better, a laptop or a desktop?\n"
2,3,"\nWhich is better, Canon or Nikon?\n"
3,4,\nWhat are the best dish detergents?\n
4,5,\nWhat are the best cities to live in?\n
...,...,...
95,96,"Which is healthier to wear, boxers or briefs?"
96,97,What is the difference between a blender vs a ...
97,98,"Which is better, rock or rap?"
98,99,Do you think imagination is better than knowle...


In [None]:
# Run loop for topics_2021 data, store retrieved in order of appearance m=275
solution_dict = {} # topic_id, corresponding_document list.
for id_, q_ in zip(topics_2021['Number'], topics_2021['Title']):
    hits = searcher.search(q_, k=500)
    d_list = []
    for h_ in hits:
        d_= h_.docid.split('___')[0]
        if d_ not in d_list:
            d_list.append(d_)

    solution_dict[id_] = d_list

In [None]:
searcher_opt = LuceneSearcher('/content/drive/MyDrive/touche-2022-prototyping/indexes/baseline_index')
searcher_opt.set_bm25(1.2, 0.68)
hits = searcher_opt.search('Who is stronger, Hulk or Superman?', k=12)
# Value of K determines the number of documents that can be returned.
print('Number of document matches from the index: '+str(len(hits)))
for i in range(len(hits)):
    print(f'{i+1} {hits[i].docid} {hits[i].score:.5f}')

Number of document matches from the index: 12
1 clueweb12-1400wb-06-30596___3 15.37180
2 clueweb12-1709wb-12-07226___7 14.85930
3 clueweb12-0605wb-29-30060___11 14.84560
4 clueweb12-0605wb-29-30060___10 13.93880
5 clueweb12-0605wb-29-30060___2 13.85450
6 clueweb12-1400wb-06-30596___4 13.81460
7 clueweb12-0904wb-33-14589___7 13.78280
8 clueweb12-1411wb-25-14634___8 13.60230
9 clueweb12-1014wb-84-07457___12 13.35340
10 clueweb12-1400tw-67-08199___28 13.04680
11 clueweb12-0306wb-63-29079___5 12.96960
12 clueweb12-0605wb-57-17257___8 12.88170


In [None]:
# Run loop for topics_2021 data, store retrieved in order of appearance m=275
solution_dict_opt = {} # topic_id, corresponding_document list.
for id_, q_ in zip(topics_2021['Number'], topics_2021['Title']):
    hits = searcher_opt.search(q_, k=500)
    d_list = []
    for h_ in hits:
        d_= h_.docid.split('___')[0]
        if d_ not in d_list:
            d_list.append(d_)

    solution_dict_opt[id_] = d_list

In [17]:
# creating new json formatted file which only has the matching doc-ids from corpus.
jsonObj = pd.read_json(path_or_buf='drive/My Drive/touche-2022-prototyping/touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl', lines=True)
jsonObj["old_id"]=jsonObj["id"].str.split("___",expand=True)[0]
documents_2021=jsonObj.groupby(by="old_id")["contents"].apply(" ".join).reset_index()
del jsonObj
documents_2021.head()

Unnamed: 0,old_id,contents
0,clueweb12-0000tw-00-14115,Do Asian-Americans Face Bias in Admissions at ...
1,clueweb12-0000tw-00-15738,Big Data Analytics a Key Enabler for Social CR...
2,clueweb12-0000tw-00-17905,Best Bets: Fern brings magic to Milford - Delm...
3,clueweb12-0000tw-01-02230,Python Data Analysis Library — pandas: Python ...
4,clueweb12-0000tw-01-15084,MILLARWORLD WEEK: Mark Millar’s Master Plan | ...


In [45]:
# creating a directory for storing the merged json file for index building.
# execute only once to make the directory.
!mkdir /content/drive/MyDrive/touche-2022-prototyping/merged_documents/

In [46]:
# creating new jsonl file for building newer indexes for the pyserini library.
output = jsonlines.open('/content/drive/MyDrive/touche-2022-prototyping/merged_documents/documents.jsonl', 'w')
for id_, cn_ in zip(documents_2021['old_id'], documents_2021['contents']):
    output.write({
        'id': id_,
        'contents': cn_
})

In [47]:
!ls /content/drive/MyDrive/touche-2022-prototyping/merged_documents/

documents.jsonl


In [18]:
rel_2021 = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/touche-task2-51-100-relevance.qrels',
                        sep="\s+", 
                        names=["qid", "no", "doc", "rel"])
rel_2021 = rel_2021.astype(str)
rel_2021.head()

Unnamed: 0,qid,no,doc,rel
0,54,0,clueweb12-0205wb-64-11095,0
1,54,0,clueweb12-0501wb-64-06459,1
2,54,0,clueweb12-0207wb-30-15337,0
3,54,0,clueweb12-0906wb-38-27123,0
4,54,0,clueweb12-0907wb-55-11510,0


In [21]:
rel_2020 = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/touche2020-task2-relevance-withbaseline.qrels',
                        sep="\s+", 
                        names=["qid", "no", "doc", "rel"])
rel_2020 = rel_2020.astype(str)
rel_2020.head()

Unnamed: 0,qid,no,doc,rel
0,1,0,clueweb12-0001wb-05-12311,0
1,1,0,clueweb12-1811wb-62-08424,1
2,1,0,clueweb12-1811wb-62-08423,1
3,1,0,clueweb12-1217wb-47-14048,0
4,1,0,clueweb12-1811wb-62-08425,1


In [22]:
qrel_topics = rel_2020.append(rel_2021, ignore_index=True)
qrel_topics

Unnamed: 0,qid,no,doc,rel
0,1,0,clueweb12-0001wb-05-12311,0
1,1,0,clueweb12-1811wb-62-08424,1
2,1,0,clueweb12-1811wb-62-08423,1
3,1,0,clueweb12-1217wb-47-14048,0
4,1,0,clueweb12-1811wb-62-08425,1
...,...,...,...,...
3854,86,0,clueweb12-0008wb-85-29076,0
3855,86,0,clueweb12-1008wb-62-10779,0
3856,86,0,clueweb12-0202wb-34-19787,0
3857,86,0,clueweb12-0915wb-71-22856,0


In [24]:
# printing number of 'qid' values in the qrel dataframe.
print(len(set(qrel_topics['qid'])))

100


In [25]:
# printing the data types
print(touche_topics.dtypes)
print('\n')
print(qrel_topics.dtypes)

Number    object
Title     object
dtype: object


qid    object
no     object
doc    object
rel    object
dtype: object


In [26]:
percentage=[]
not_common_docs=[]
for qid in touche_topics["Number"]:
    relevant=qrel_topics.loc[qrel_topics["qid"]==qid]["doc"]
    intersection=set(relevant) & set(documents_2021["old_id"])
    not_common=list(set(relevant) -set(documents_2021["old_id"]))
    percent=len(intersection)/len(relevant)
    not_common_docs.append(not_common)
    percentage.append({"title_id":qid,"Percentage":percent,"Not_common":not_common})
df=pd.DataFrame(percentage)
df.head()

Unnamed: 0,title_id,Percentage,Not_common
0,1,0.735294,"[clueweb12-0412wb-04-03887, clueweb12-1811wb-6..."
1,2,0.909091,"[clueweb12-0109wb-28-26419, clueweb12-1516wb-6..."
2,3,0.96875,[clueweb12-0500wb-68-25390]
3,4,0.972222,[clueweb12-1700tw-38-15104]
4,5,0.972222,[clueweb12-1514wb-77-21531]


In [27]:
df

Unnamed: 0,title_id,Percentage,Not_common
0,1,0.735294,"[clueweb12-0412wb-04-03887, clueweb12-1811wb-6..."
1,2,0.909091,"[clueweb12-0109wb-28-26419, clueweb12-1516wb-6..."
2,3,0.968750,[clueweb12-0500wb-68-25390]
3,4,0.972222,[clueweb12-1700tw-38-15104]
4,5,0.972222,[clueweb12-1514wb-77-21531]
...,...,...,...
95,96,0.903226,"[clueweb12-1203wb-55-00251, clueweb12-0206wb-4..."
96,97,0.914286,"[clueweb12-1019wb-25-02525, clueweb12-1810wb-9..."
97,98,0.927273,"[clueweb12-0805wb-63-13916, clueweb12-0110wb-5..."
98,99,0.886792,"[clueweb12-0400wb-19-11055, clueweb12-0406wb-9..."


In [39]:
not_common_docs=list(np.concatenate(not_common_docs).flat)
new_rel_2021=qrel_topics[~qrel_topics["doc"].isin(not_common_docs)]
new_rel_2021.to_csv("/content/drive/MyDrive/touche-2022-prototyping/touche_ground_truth.csv", index=False)
touche_topics.to_csv("/content/drive/MyDrive/touche-2022-prototyping/touche_complete_topics.csv", index=False)

In [30]:
# verifying the irrelevant doc-id elimination operation.
percentage=[]
not_common_docs=[]
for qid in touche_topics["Number"]:
    relevant=new_rel_2021.loc[new_rel_2021["qid"]==qid]["doc"]
    intersection=set(relevant) & set(documents_2021["old_id"])
    not_common=list(set(relevant) -set(documents_2021["old_id"]))
    percent=len(intersection)/len(relevant)
    not_common_docs.append(not_common)
    percentage.append({"title_id":qid,"Percentage":percent,"Not_common":not_common})
print(percentage)
# successfully the additionals documents are eliminated.
# -> (2076+1783-3497), loss of 362 documents relevant documents from the ground truth.
# when we convert

[{'title_id': '1', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '2', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '3', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '4', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '5', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '6', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '7', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '8', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '9', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '10', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '11', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '12', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '13', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '14', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '15', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '16', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '17', 'Percentage': 1.0, 'Not_common': []}, {'title_id': '18', 'Percentage': 1.0, '

In [32]:
new_rel_2021 = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/touche_ground_truth.csv')
new_rel_2021.head()

Unnamed: 0.1,Unnamed: 0,qid,no,doc,rel
0,0,1,0,clueweb12-0001wb-05-12311,0
1,3,1,0,clueweb12-1217wb-47-14048,0
2,7,1,0,clueweb12-1214wb-88-29751,2
3,8,1,0,clueweb12-0307wb-05-31620,0
4,9,1,0,clueweb12-1500wb-68-15142,0


In [33]:
new_rel_2021

Unnamed: 0.1,Unnamed: 0,qid,no,doc,rel
0,0,1,0,clueweb12-0001wb-05-12311,0
1,3,1,0,clueweb12-1217wb-47-14048,0
2,7,1,0,clueweb12-1214wb-88-29751,2
3,8,1,0,clueweb12-0307wb-05-31620,0
4,9,1,0,clueweb12-1500wb-68-15142,0
...,...,...,...,...,...
3492,3852,86,0,clueweb12-0304wb-56-05255,1
3493,3853,86,0,clueweb12-1804wb-01-07034,0
3494,3854,86,0,clueweb12-0008wb-85-29076,0
3495,3856,86,0,clueweb12-0202wb-34-19787,0


In [None]:
# Preparing index on the merged documents and evaluating performance accordingly.
# Finding the documents missed out during the initial retrieval and making
# classifications of the documents that are hard to retrieve on the first attempt.

In [41]:
# installing linux related stuff for pyserini
# removing error "ImportError: No module named '_swigfaiss"
# Reference Link: https://github.com/facebookresearch/faiss/issues/821
!sudo apt-get install libomp-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 1s (402 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty

In [42]:
# installing important packages for building the new index on merged documents.
!pip install pyserini
!pip install faiss

Collecting pyserini
  Downloading pyserini-0.16.0-py3-none-any.whl (84.6 MB)
[K     |████████████████████████████████| 84.6 MB 97 kB/s 
[?25hCollecting sentencepiece>=0.1.95
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 53.6 MB/s 
Collecting transformers>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 23.8 MB/s 
[?25hCollecting lightgbm>=3.3.2
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 38.6 MB/s 
Collecting onnxruntime>=1.8.1
  Downloading onnxruntime-1.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 51.3 MB/s 
[?25hCollecting nmslib>=2.1.1
  Downloading nmslib-2.1.1-cp37-cp37m-manylinux2010_x86_64.whl (13.5 MB)
[K     |████████████████████████████████| 13.5 MB 

In [48]:
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input /content/drive/MyDrive/touche-2022-prototyping/merged_documents/ \
  --index /content/drive/MyDrive/touche-2022-prototyping/indexes/merged_docs_index \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions --storeDocvectors --storeRaw

2022-04-07 23:11:36,425 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-04-07 23:11:36,436 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-04-07 23:11:36,437 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: /content/drive/MyDrive/touche-2022-prototyping/merged_documents/
2022-04-07 23:11:36,437 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-04-07 23:11:36,438 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-04-07 23:11:36,438 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 4
2022-04-07 23:11:36,439 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-04-07 23:11:36,439 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: porter
2022-04-07 23:11:36,440 INFO  [main] index.Ind

In [49]:
from pyserini.search.lucene import LuceneSearcher
# searcher = LuceneSearcher('indexes/sample_collection_jsonl')
searcher = LuceneSearcher('/content/drive/MyDrive/touche-2022-prototyping/indexes/merged_docs_index')
hits = searcher.search('Who is stronger, Hulk or Superman?', k=12)
# Value of K determines the number of documents that can be returned.
print('Number of document matches from the index: '+str(len(hits)))
# Attempted tuning BM25 and RM3, nothing has changed the document ranking below.
for i in range(len(hits)):
    print(f'{i+1} {hits[i].docid} {hits[i].score:.5f}')

Number of document matches from the index: 12
1 clueweb12-0605wb-29-30060 11.76480
2 clueweb12-1400wb-06-30596 11.40190
3 clueweb12-1800tw-43-08878 11.03210
4 clueweb12-1800tw-01-24187 10.90040
5 clueweb12-0307wb-46-08640 10.83760
6 clueweb12-1410wb-69-01807 10.57060
7 clueweb12-1014wb-29-14561 10.45100
8 clueweb12-0105wb-89-26953 10.41560
9 clueweb12-1400tw-67-08199 10.30320
10 clueweb12-1910wb-39-01758 10.29830
11 clueweb12-0008wb-48-28848 10.28410
12 clueweb12-1500tw-54-23242 10.26620


In [50]:
searcher_opt = LuceneSearcher('/content/drive/MyDrive/touche-2022-prototyping/indexes/merged_docs_index')
searcher_opt.set_bm25(1.2, 0.68)
hits = searcher_opt.search('Who is stronger, Hulk or Superman?', k=12)
# Value of K determines the number of documents that can be returned.
print('Number of document matches from the index: '+str(len(hits)))
for i in range(len(hits)):
    print(f'{i+1} {hits[i].docid} {hits[i].score:.5f}')

Number of document matches from the index: 12
1 clueweb12-0605wb-29-30060 11.67780
2 clueweb12-1400wb-06-30596 11.45780
3 clueweb12-1410wb-69-01807 10.66800
4 clueweb12-1800tw-01-24187 10.62450
5 clueweb12-1910wb-39-01758 10.40410
6 clueweb12-1800tw-43-08878 10.39770
7 clueweb12-0105wb-89-26953 10.33510
8 clueweb12-0307wb-46-08640 10.33410
9 clueweb12-1014wb-29-14561 10.28550
10 clueweb12-0306wb-63-29079 10.06140
11 clueweb12-1500tw-54-23242 10.04920
12 clueweb12-0904wb-33-14589 9.92410


In [71]:
# Run loop for topics data, store retrieved in order of appearance m=250
solution_dict = {} # topic_id, corresponding_document list.
for id_, q_ in zip(touche_topics['Number'], touche_topics['Title']):
    hits = searcher.search(q_, k=1000)
    d_list = []
    for h_ in hits:
        d_= h_.docid.split('___')[0]
        if d_ not in d_list:
            d_list.append(d_)

    solution_dict[id_] = d_list

In [182]:
# Run loop for topics_2021 data, store retrieved in order of appearance m=250
solution_dict_opt = {} # topic_id, corresponding_document list.
for id_, q_ in zip(touche_topics['Number'], touche_topics['Title']):
    hits = searcher_opt.search(q_, k=1500)
    d_list = []
    for h_ in hits:
        d_= h_.docid.split('___')[0]
        if d_ not in d_list:
            d_list.append(d_)

    solution_dict_opt[id_] = d_list

In [56]:
from collections import defaultdict
ground_truth_dict = defaultdict(list)
rel0_truth_dict = defaultdict(list)
rel1_truth_dict = defaultdict(list)
rel2_truth_dict = defaultdict(list)
rel3_truth_dict = defaultdict(list)

for i_, d_, x_ in zip(new_rel_2021['qid'], new_rel_2021['doc'], new_rel_2021['rel']):
    i_ = str(i_)
    d_ = str(d_)    
    if int(x_) > 0:
        ground_truth_dict[i_].append(d_)
    if int(x_) == 0:
        rel0_truth_dict[i_].append(d_)
    if int(x_) == 1:
        rel1_truth_dict[i_].append(d_)
    if int(x_) == 2:
        rel2_truth_dict[i_].append(d_)
    if int(x_) == 3:
        rel2_truth_dict[i_].append(d_)

In [73]:
# the final dictionaries for basic metric evaluation and analysis.
# Average percentage common, Hit-once and Hit-all metric basic definition.
hit_one = 0
hit_all = 0
total = 100
per_comm_avg = 0

for id_i, doc_i in ground_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit_one += 1
            if doc_j.issuperset(doc_i):
                hit_all += 1
            per_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'Hit one: {round(hit_one / total, 4)}')
print(f'Hit all: {round(hit_all / total, 4)}')
print(f'Average common ratio: {round(per_comm_avg / total, 4)}')

hit0_one = 0
hit0_all = 0
per0_comm_avg = 0

for id_i, doc_i in rel0_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit0_one += 1
            if doc_j.issuperset(doc_i):
                hit0_all += 1
            per0_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'Zero Relevance, Hit one: {round(hit0_one / total, 4)}')
print(f'Zero Relevance, Hit all: {round(hit0_all / total, 4)}')
print(f'Zero Relevance, Average common ratio: {round(per0_comm_avg / total, 4)}')

hit1_one = 0
hit1_all = 0
per1_comm_avg = 0

for id_i, doc_i in rel1_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit1_one += 1
            if doc_j.issuperset(doc_i):
                hit1_all += 1
            per1_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'One Relevance, Hit one: {round(hit1_one / total, 4)}')
print(f'One Relevance, Hit all: {round(hit1_all / total, 4)}')
print(f'One Relevance, Average common ratio: {round(per1_comm_avg / total, 4)}')

hit2_one = 0
hit2_all = 0
per2_comm_avg = 0

for id_i, doc_i in rel2_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit2_one += 1
            if doc_j.issuperset(doc_i):
                hit2_all += 1
            per2_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'Two Relevance, Hit one: {round(hit2_one / total, 4)}')
print(f'Two Relevance, Hit all: {round(hit2_all / total, 4)}')
print(f'Two Relevance, Average common ratio: {round(per2_comm_avg / total, 4)}')

Hit one: 1.0
Hit all: 0.34
Average common ratio: 0.8907
Zero Relevance, Hit one: 1.0
Zero Relevance, Hit all: 0.11
Zero Relevance, Average common ratio: 0.8299
One Relevance, Hit one: 0.98
One Relevance, Hit all: 0.42
One Relevance, Average common ratio: 0.867
Two Relevance, Hit one: 0.9
Two Relevance, Hit all: 0.52
Two Relevance, Average common ratio: 0.814


In [78]:
# the final dictionaries for basic metric evaluation and analysis.
# Average percentage common, Hit-once and Hit-all metric basic definition.
hit_one = 0
hit_all = 0
total = 100
per_comm_avg = 0

for id_i, doc_i in ground_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict_opt.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit_one += 1
            if doc_j.issuperset(doc_i):
                hit_all += 1
            per_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'Hit one: {round(hit_one / total, 4)}')
print(f'Hit all: {round(hit_all / total, 4)}')
print(f'Average common ratio: {round(per_comm_avg / total, 4)}')

hit0_one = 0
hit0_all = 0
per0_comm_avg = 0

for id_i, doc_i in rel0_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict_opt.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit0_one += 1
            if doc_j.issuperset(doc_i):
                hit0_all += 1
            per0_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'Zero Relevance, Hit one: {round(hit0_one / total, 4)}')
print(f'Zero Relevance, Hit all: {round(hit0_all / total, 4)}')
print(f'Zero Relevance, Average common ratio: {round(per0_comm_avg / total, 4)}')

hit1_one = 0
hit1_all = 0
per1_comm_avg = 0

for id_i, doc_i in rel1_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict_opt.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit1_one += 1
            if doc_j.issuperset(doc_i):
                hit1_all += 1
            per1_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'One Relevance, Hit one: {round(hit1_one / total, 4)}')
print(f'One Relevance, Hit all: {round(hit1_all / total, 4)}')
print(f'One Relevance, Average common ratio: {round(per1_comm_avg / total, 4)}')

hit2_one = 0
hit2_all = 0
per2_comm_avg = 0

for id_i, doc_i in rel2_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict_opt.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            if doc_j.intersection(doc_i):
                hit2_one += 1
            if doc_j.issuperset(doc_i):
                hit2_all += 1
            per2_comm_avg += len(doc_j.intersection(doc_i))/len(doc_i)
            break

print(f'Two Relevance, Hit one: {round(hit2_one / total, 4)}')
print(f'Two Relevance, Hit all: {round(hit2_all / total, 4)}')
print(f'Two Relevance, Average common ratio: {round(per2_comm_avg / total, 4)}')

Hit one: 1.0
Hit all: 0.35
Average common ratio: 0.8967
Zero Relevance, Hit one: 1.0
Zero Relevance, Hit all: 0.13
Zero Relevance, Average common ratio: 0.8456
One Relevance, Hit one: 0.99
One Relevance, Hit all: 0.45
One Relevance, Average common ratio: 0.8816
Two Relevance, Hit one: 0.9
Two Relevance, Hit all: 0.52
Two Relevance, Average common ratio: 0.814


In [None]:
# Result summarization and A case for query expansion.
# Measuring on an average what's the document retrieval coverage.

# 1. k = 250, opt
# Hit one: 1.0
# Hit all: 0.2
# Average common ratio: 0.8095
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.02
# Zero Relevance, Average common ratio: 0.6274
# One Relevance, Hit one: 0.98
# One Relevance, Hit all: 0.26
# One Relevance, Average common ratio: 0.7671
# Two Relevance, Hit one: 0.89
# Two Relevance, Hit all: 0.43
# Two Relevance, Average common ratio: 0.7774

# 2. k = 500, opt
# Hit one: 1.0
# Hit all: 0.28
# Average common ratio: 0.8696
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.08
# Zero Relevance, Average common ratio: 0.7825
# One Relevance, Hit one: 0.98
# One Relevance, Hit all: 0.34
# One Relevance, Average common ratio: 0.8439
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.5
# Two Relevance, Average common ratio: 0.8055

# 3. k = 786, opt
# Hit one: 1.0
# Hit all: 0.32
# Average common ratio: 0.8931
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.11
# Zero Relevance, Average common ratio: 0.818
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.42
# One Relevance, Average common ratio: 0.8754
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.51
# Two Relevance, Average common ratio: 0.8131

# 4. k = 1000, opt
# Hit one: 1.0
# Hit all: 0.35
# Average common ratio: 0.8955
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.12
# Zero Relevance, Average common ratio: 0.8323
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.44
# One Relevance, Average common ratio: 0.8795
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.52
# Two Relevance, Average common ratio: 0.814

# 5. k = 1200, opt
# Hit one: 1.0
# Hit all: 0.35
# Average common ratio: 0.8961
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.12
# Zero Relevance, Average common ratio: 0.8409
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.45
# One Relevance, Average common ratio: 0.8807
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.52
# Two Relevance, Average common ratio: 0.814

# 6. k = 1375, opt
# Hit one: 1.0
# Hit all: 0.35
# Average common ratio: 0.8967
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.13
# Zero Relevance, Average common ratio: 0.8456
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.45
# One Relevance, Average common ratio: 0.8816
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.52
# Two Relevance, Average common ratio: 0.814

In [183]:
# preparing the missing document analysis, alongside the content
# for observing the limitations of the existing one-query retrieval approach.

tp_rel1_dict = {}
tp_rel2_dict = {}

for id_i, doc_i in rel1_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict_opt.items():
        doc_j = set(doc_j)
        # print(tp_rel1_dict)
        # print(id_i)
        if id_i == id_j :
            # print(tp_rel1_dict)
            # print(id_i)
            d_ = doc_i.difference(doc_j)
            tp_rel1_dict[id_i] = (list(d_))
            break

for id_i, doc_i in rel2_truth_dict.items():
    doc_i = set(doc_i)
    for id_j , doc_j in solution_dict_opt.items():
        doc_j = set(doc_j)
        if id_i == id_j:
            d_ = doc_i.difference(doc_j)
            tp_rel2_dict[id_i] = (list(d_))
            break

In [122]:
# documents_2021['old_id'] = documents_2021['old_id'].astype('str') 
documents_2021 = documents_2021.astype({"old_id": str})
print(documents_2021.dtypes)

old_id      object
contents    object
dtype: object


In [160]:
documents_2021_ids = documents_2021['old_id']
documents_2021_contents =  documents_2021['contents']
print(len(documents_2021_ids),len(documents_2021_contents))

33401 33401


In [184]:
# storing the whole result in a csv file dataframe for k=250, 500, 786, 1000, 1375, 1500.
docs_id_list = []
topics_id_list = []
rel_id_list = []
content_list = []

for t_, docl_ in tp_rel1_dict.items():
    if len(docl_) > 0 :
        for di_ in docl_:
            if str(di_) in list(documents_2021['old_id']):
                docs_id_list.append(di_)
                topics_id_list.append(t_)
                rel_id_list.append(int(1))
                for x_, y_ in zip(documents_2021_ids, documents_2021_contents):
                    if str(di_) == str(x_):
                        content_list.append(y_)
                        break

for t_, docl_ in tp_rel2_dict.items():
    if len(docl_) > 0 :
        for di_ in docl_:
            if str(di_) in list(documents_2021['old_id']):
                docs_id_list.append(di_)
                topics_id_list.append(t_)
                rel_id_list.append(int(2))
                for x_, y_ in zip(documents_2021_ids, documents_2021_contents):
                    if str(di_) == str(x_):
                        content_list.append(y_)
                        break

In [185]:
missed = pd.DataFrame(list(zip(docs_id_list, topics_id_list, rel_id_list, content_list)),
               columns =['doc_id', 'qid', 'rel', 'content'])
missed.to_csv("/content/drive/MyDrive/touche-2022-prototyping/missed_full_1500.csv", index=False)