In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np 
from gensim.parsing.preprocessing import STOPWORDS
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Characters stripped from every token: ASCII punctuation plus the curly
# quotes, typographic apostrophe and em dash.
special_characters = string.punctuation + '“”’—'

# Shared NLTK helpers. The lemmatizer is used when expanded queries are built;
# the stemmer is not referenced by the cells visible in this part of the notebook.
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()


def spl_chars_removal(lst):
  """Remove every character found in ``special_characters`` from each string.

  Returns a new list holding one cleaned string per element of ``lst``, in the
  same order. An element made up only of such characters becomes ''.
  """
  return ["".join(ch for ch in token if ch not in special_characters)
          for token in lst]

def stopwords_removal_gensim_custom(lst):
  """Return the tokens of ``lst`` that are not in gensim's ``STOPWORDS``, order preserved."""
  kept = []
  for token in lst:
    if token in STOPWORDS:
      continue
    kept.append(token)
  return kept

In [None]:
def preprocessing(documents):
  """Clean the ``contents`` column of ``documents`` for term counting.

  For each row: remove ``<query`` / ``/query`` markers, lower-case the text,
  split it into whitespace-separated tokens, strip the characters listed in
  ``special_characters``, drop empty tokens and gensim stop words, and re-join
  the remaining tokens with single spaces.

  Args:
    documents: DataFrame with at least the columns ``contents`` (str) and ``id``.

  Returns:
    DataFrame with the columns ``contents`` (cleaned text) and ``id``, one row
    per input row, in input order.
  """
  new=[]
  # Iterate over the two needed columns directly: iterrows() builds a Series
  # per row, which is slow on large frames. total= gives tqdm a real progress bar.
  for contents, doc_id in tqdm(zip(documents['contents'], documents['id']), total=len(documents)):
    i=re.sub("[</]query","",contents)
    i=i.lower()
    # split() without an argument splits on any whitespace run, so tokens are
    # no longer glued together by newlines/tabs and repeated spaces yield no ''.
    z=i.split()
    z=spl_chars_removal(z)
    # Tokens that consisted only of punctuation are now ''; drop them so they
    # do not end up as double spaces (and as an empty "term") downstream.
    z=[token for token in z if token]
    z=stopwords_removal_gensim_custom(z)
    z=' '.join(z)
    new.append({"contents":z,"id": doc_id})
  # Explicit columns keep the schema stable even for an empty input frame.
  new=pd.DataFrame(new, columns=["contents","id"])
  return new

In [None]:
def findwords(k, reranked,preprocessed):
  """Count term frequencies over the top-``k`` reranked documents of a topic.

  Args:
    k: maximum number of document ids to take from the front of ``reranked``.
    reranked: list of document ids, best first.
    preprocessed: DataFrame with the columns ``id`` and ``contents``.

  Returns:
    dict mapping each term to its total number of occurrences in the selected
    documents. Ids that are missing from ``preprocessed`` are skipped, and
    ``reranked`` may hold fewer than ``k`` ids (both cases used to raise
    IndexError).
  """
  from collections import Counter  # local import keeps this cell self-contained

  top_docs = list(reranked[:k])
  # One pass over the (potentially very large) frame for all ids at once,
  # instead of one full boolean-mask scan per document.
  matches = preprocessed.loc[preprocessed["id"].isin(top_docs)]
  # Keep the first row per id, which is what ``.iloc[0]`` selected before.
  first_rows = matches.drop_duplicates(subset="id")
  contents_by_id = dict(zip(first_rows["id"], first_rows["contents"]))

  relIndex = Counter()
  for doc in top_docs:
    text = contents_by_id.get(doc)
    if text is None:
      continue
    # split() ignores repeated spaces, so '' is never counted as a term.
    relIndex.update(text.split())
  return dict(relIndex)

In [None]:
def psuedo(topics,preprocessed,reranked,top_n,top_k,combined=True):
  """Build pseudo-relevance-feedback queries for every topic.

  For each title in ``topics['Title']`` the ids of that title's rows in
  ``reranked`` are sorted by ``Score`` (best first) and handed to ``findwords``
  together with ``top_k``. The most frequent terms whose lemma does not already
  occur in the lemmatized title are appended to the original title.

  Args:
    topics: DataFrame with a ``Title`` column.
    preprocessed: DataFrame of cleaned documents, forwarded to ``findwords``.
    reranked: DataFrame with the columns ``title``, ``Score`` and ``id``.
    top_n: number of expansion terms to pick per topic.
    top_k: number of top-scored documents whose terms are counted.
    combined: if True, emit one query per topic containing all picked terms;
      if False, emit one query per picked term (title + that single term).

  Returns:
    list of dicts with the keys ``Original`` (title), ``New`` (expanded query)
    and ``Frequency`` (count of the last term added to ``New``; 0 when no term
    could be added).
  """
  # Only the most frequent terms are considered as expansion candidates.
  max_candidates = 100
  final=[]
  for title in tqdm(topics['Title'], total=len(topics)):
    d_list=reranked.loc[reranked["title"]==title].sort_values(by="Score",ascending=False)["id"].tolist()
    freq_words=findwords(top_k,d_list,preprocessed)
    sorted_words = sorted(freq_words.items(), key=lambda x:x[1], reverse=True)
    new_title=' '.join(wordnet_lemmatizer.lemmatize(word) for word in title.lower().split(" "))
    newQuery=title
    added=0
    last_frequency=0
    # Slicing instead of indexing 0..99 avoids an IndexError when the
    # documents contain fewer than 100 distinct terms.
    for term,frequency in sorted_words[:max_candidates]:
      if added>=top_n:
        break
      # NOTE: this is a substring test against the lemmatized title string, so
      # a term is also rejected when it merely occurs inside a longer title word.
      if wordnet_lemmatizer.lemmatize(term) in new_title:
        continue
      added+=1
      last_frequency=frequency
      if combined:
        newQuery += " " + term
      else:
        final.append({"Original":title,"New":title+" "+term,"Frequency":frequency})
    if combined:
      # Emit exactly one row per topic, even when fewer than top_n terms
      # qualified. Previously such topics were silently dropped, which left
      # NaN queries after joining the result back onto the topics.
      final.append({"Original":title,"New":newQuery,"Frequency":last_frequency})
  return final


In [None]:
topics=pd.read_csv("/content/drive/MyDrive/Touche/topics-task2.csv")
topics.head()

In [None]:
documents= pd.read_json(path_or_buf='/content/drive/MyDrive/Touche/touche-task2-passages-expanded-with-queries.jsonl', lines=True)
documents.head()

In [None]:
reranked=pd.read_csv("/content/drive/MyDrive/Touche/reranked.csv")
reranked.head()

In [None]:
preprocessed=preprocessing(documents)

In [None]:
# Expand every topic with its 3 most frequent new terms, counted over the
# 100 best reranked documents of that topic.
Final = pd.DataFrame(psuedo(topics, preprocessed, reranked, top_n=3, top_k=100))
Final.head()

# Comparison between normal queries and pseudo-relevance queries for 2020 (Initial Retrieval)

In [None]:
!sudo apt-get install libomp-dev
# installing important packages for building the new index on merged documents.
!pip install pyserini
!pip install faiss

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 41 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 1s (397 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty

Collecting faiss
  Downloading faiss-1.5.3-cp37-cp37m-manylinux1_x86_64.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 8.7 MB/s 
Installing collected packages: faiss
Successfully installed faiss-1.5.3


In [None]:
import pyserini
from xml.dom import minidom

In [None]:
def parse_xml(path):
  """Read a topics XML file into a DataFrame with the columns ``Number`` and ``Title``.

  The file must contain a ``<topics>`` element with ``<topic>`` children, each
  holding a ``<number>`` and a ``<title>`` element. The text of their first
  child node is taken verbatim (surrounding whitespace/newlines are kept).
  """
  def first_text(node, tag):
    # Text of the first child of the first <tag> element below ``node``.
    return node.getElementsByTagName(tag)[0].firstChild.nodeValue

  topics_root = minidom.parse(path).getElementsByTagName('topics')[0]
  rows = [(first_text(topic, 'number'), first_text(topic, 'title'))
          for topic in topics_root.getElementsByTagName('topic')]
  return pd.DataFrame(rows, columns=["Number","Title"])

In [None]:
topics=parse_xml("/content/drive/MyDrive/Touche/topics-task-2-2020.xml")
topics.head()

Unnamed: 0,Number,Title
0,1,\nWhat is the difference between sex and love?\n
1,2,"\nWhich is better, a laptop or a desktop?\n"
2,3,"\nWhich is better, Canon or Nikon?\n"
3,4,\nWhat are the best dish detergents?\n
4,5,\nWhat are the best cities to live in?\n


In [None]:
documents=pd.read_csv("/content/drive/MyDrive/Touche/2020/docs_2020.csv")
documents= documents.rename(columns={"old_id":"id"})
documents.head()

Unnamed: 0.1,Unnamed: 0,id,contents
0,0,clueweb12-0000tw-00-14115,Do Asian-Americans Face Bias in Admissions at ...
1,1,clueweb12-0000tw-00-15738,Big Data Analytics a Key Enabler for Social CR...
2,2,clueweb12-0000tw-00-17905,Best Bets: Fern brings magic to Milford - Delm...
3,3,clueweb12-0000tw-01-02230,Python Data Analysis Library — pandas: Python ...
4,4,clueweb12-0000tw-01-15084,MILLARWORLD WEEK: Mark Millar’s Master Plan | ...


In [None]:
reranked=pd.read_csv("/content/drive/MyDrive/Touche/2020/reranked_2020__mono_t5.csv")
reranked= reranked.rename(columns={"doc_id":"id"})
reranked.head()

Unnamed: 0.1,Unnamed: 0,id,Score,title
0,0,clueweb12-0818wb-26-13074,-13.594674,\nWhat is the difference between sex and love?\n
1,1,clueweb12-0010wb-33-27298,-13.818763,\nWhat is the difference between sex and love?\n
2,2,clueweb12-0103wb-41-14265,-13.821753,\nWhat is the difference between sex and love?\n
3,3,clueweb12-1013wb-51-17839,-13.743243,\nWhat is the difference between sex and love?\n
4,4,clueweb12-1400tw-39-23968,-14.184422,\nWhat is the difference between sex and love?\n


In [None]:
preprocessed=preprocessing(documents)

33401it [03:12, 173.09it/s]


In [None]:
topics=parse_xml("/content/drive/MyDrive/Touche/topics-task-2-2020.xml")
Final=psuedo(topics,preprocessed,reranked,1,25)
Final=pd.DataFrame(Final)
Final

50it [00:04, 12.35it/s]


Unnamed: 0,Original,New,Frequency
0,\nWhat is the difference between sex and love?\n,\nWhat is the difference between sex and love?...,120
1,"\nWhich is better, a laptop or a desktop?\n","\nWhich is better, a laptop or a desktop?\n co...",86
2,"\nWhich is better, Canon or Nikon?\n","\nWhich is better, Canon or Nikon?\n camera",365
3,\nWhat are the best dish detergents?\n,\nWhat are the best dish detergents?\n dishwasher,167
4,\nWhat are the best cities to live in?\n,\nWhat are the best cities to live in?\n posted,276
5,\nWhat is the longest river in the U.S.?\n,\nWhat is the longest river in the U.S.?\n mis...,273
6,"\nWhich is healthiest: coffee, green tea or bl...","\nWhich is healthiest: coffee, green tea or bl...",164
7,\nWhat are the advantages and disadvantages of...,\nWhat are the advantages and disadvantages of...,232
8,\nWhy is Linux better than Windows?\n,\nWhy is Linux better than Windows?\n software,130
9,\nHow to sleep better?\n,\nHow to sleep better?\n bed,156


In [None]:
topics=parse_xml("/content/drive/MyDrive/Touche/topics-task-2-2020.xml")
topics=topics.join(Final.set_index("Original"),on="Title")
topics.head()

Unnamed: 0,Number,Title,New,Frequency
0,1,\nWhat is the difference between sex and love?\n,\nWhat is the difference between sex and love?...,120
1,2,"\nWhich is better, a laptop or a desktop?\n","\nWhich is better, a laptop or a desktop?\n co...",86
2,3,"\nWhich is better, Canon or Nikon?\n","\nWhich is better, Canon or Nikon?\n camera",365
3,4,\nWhat are the best dish detergents?\n,\nWhat are the best dish detergents?\n dishwasher,167
4,5,\nWhat are the best cities to live in?\n,\nWhat are the best cities to live in?\n posted,276


In [None]:
from pyserini.search import SimpleSearcher
from pyserini.search.lucene import LuceneSearcher

# BM25 retrieval with the expanded query of every topic.
searcher = LuceneSearcher('/content/drive/MyDrive/Touche/2020/sample_collection_jsonl')
searcher.set_bm25(1.2, 0.68)

h1=[]
for title, expanded_query, number in tqdm(zip(topics["Title"],topics["New"],topics["Number"])):
  hits = searcher.search(expanded_query, k=1000)
  for rank in range(len(hits)):
    hit = hits[rank]
    # Keep what follows the "contents" key of the raw record, then remove
    # closing braces, double quotes and newlines.
    # NOTE(review): this also strips those characters inside the text itself;
    # parsing the raw record as JSON would be more robust.
    contents = str.split(hit.raw, '"contents" :')[1]
    for unwanted in ("}", '"', '\n'):
      contents = contents.replace(unwanted, "")
    h1.append({"title_id":number,"title":title,'doc_id': hit.docid,'score': hit.score,"content": contents})

27it [00:14,  1.33it/s]

In [None]:
df=pd.DataFrame(h1)
df.head()

Unnamed: 0,title_id,title,doc_id,score,content
0,1,\nWhat is the difference between sex and love?\n,clueweb12-0103wb-41-14265,7.1686,lecture series science software engineering...
1,1,\nWhat is the difference between sex and love?\n,clueweb12-0010wb-33-27298,6.6241,relational quantum mechanics stanford encyclo...
2,1,\nWhat is the difference between sex and love?\n,clueweb12-0818wb-26-13074,6.4762,toofly nyc » hip hop culture toofly nyc clien...
3,1,\nWhat is the difference between sex and love?\n,clueweb12-1013wb-51-17839,6.1648,ethical buyers guide toy cars trains planes s...
4,1,\nWhat is the difference between sex and love?\n,clueweb12-0400tw-82-06704,6.0766,practical approach paleo diet 1 myfivefinger...


In [None]:
df=pd.read_csv("/content/drive/MyDrive/Touche/2020/relevant_bm25_2020.csv")
df=df.drop("Unnamed: 0", axis =1)
df.head()

Unnamed: 0,title_id,title,doc_id,score,content
0,1,\nWhat is the difference between sex and love?\n,clueweb12-0818wb-26-13074,7.1866,toofly nyc » hip hop culture toofly nyc clien...
1,1,\nWhat is the difference between sex and love?\n,clueweb12-0010wb-33-27298,7.1624,relational quantum mechanics stanford encyclo...
2,1,\nWhat is the difference between sex and love?\n,clueweb12-0103wb-41-14265,6.9118,lecture series science software engineering...
3,1,\nWhat is the difference between sex and love?\n,clueweb12-1013wb-51-17839,6.4538,ethical buyers guide toy cars trains planes s...
4,1,\nWhat is the difference between sex and love?\n,clueweb12-1400tw-39-23968,6.2936,video video cbr tv movie trailersclips tv tra...


In [None]:
df.to_csv("/content/drive/MyDrive/Touche/2020/Psuedo_Intial_BM25_retrieval.csv")

In [None]:
from sklearn.metrics import ndcg_score

# nDCG@5 per topic, computed over the judged documents that were also retrieved.
new=df
rel_2020=pd.read_csv("/content/drive/MyDrive/Touche/2020/new_rel_2020.csv")
rel_2020=rel_2020.drop(["Unnamed: 0","no"],axis=1)
topics_2020=pd.read_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")
# Attach the topic title to every judgment so it can be matched against new["title"].
rel_2020=rel_2020.join(topics_2020.set_index("Number"),on="qid").drop("qid",axis=1)
scores=[]
for title in tqdm(new["title"].unique()):
   predicted=new.loc[new["title"]==title]
   true=rel_2020.loc[rel_2020["Title"]==title].drop("Unnamed: 0",axis=1)
   # Inner join: only documents that are both judged and retrieved remain.
   joined=true.join(predicted.set_index("doc_id"),on="doc", how="inner")
   joined["rel"]=joined["rel"].astype(float)
   # ndcg_score expects 2-D input: one row (this topic), one column per document.
   x=np.reshape(np.asarray(joined["rel"]),(1,len(joined["rel"])))
   y=np.reshape(np.asarray(joined["score"]),(1,len(joined["rel"])))
   try:
      scores.append({"Title":title,"Score":ndcg_score(x,y,k=5)})
   except ValueError:
      # ndcg_score raises ValueError for input it cannot score (e.g. no judged
      # document was retrieved). Report the topic instead of aborting the loop,
      # as the 2020+2021 evaluation cell does.
      # NOTE: skipped topics are absent from `scores` and therefore from its mean.
      print("No joined records for ",title)

100%|██████████| 50/50 [00:00<00:00, 98.41it/s]


In [None]:
scores=pd.DataFrame(scores)
scores

Unnamed: 0,Title,Score
0,\nWhat is the difference between sex and love?\n,0.242614
1,"\nWhich is better, a laptop or a desktop?\n",0.893007
2,"\nWhich is better, Canon or Nikon?\n",1.0
3,\nWhat are the best dish detergents?\n,0.764817
4,\nWhat are the best cities to live in?\n,0.150393
5,\nWhat is the longest river in the U.S.?\n,0.639945
6,"\nWhich is healthiest: coffee, green tea or bl...",0.285164
7,\nWhat are the advantages and disadvantages of...,0.58557
8,\nWhy is Linux better than Windows?\n,0.469057
9,\nHow to sleep better?\n,0.446854


In [None]:
np.mean(np.asarray(scores["Score"].astype(float)))

0.6075354043432281

In [None]:
# Normal BM25 Ndcg=0.6245066216824341

#100 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.6056172984600517

#Top 3
# Psuedo relevance Intial Retrieval Ndcg=0.5831943937936228

#Top 10
# Psuedo relevance Intial Retrieval Ndcg=0.5606175240906345

#########################################################################################
#10 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.6080456834956907

#########################################################################################
#50 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.6516079239646148

#########################################################################################
#75 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.6075354043432281

In [None]:
from collections import defaultdict

# topic id (str) -> judged document ids (str):
# all documents with rel > 0, plus one dict per relevance level 0, 1 and 2.
ground_truth_dict = defaultdict(list)
rel0_truth_dict = defaultdict(list)
rel1_truth_dict = defaultdict(list)
rel2_truth_dict = defaultdict(list)
truth_by_level = {0: rel0_truth_dict, 1: rel1_truth_dict, 2: rel2_truth_dict}

rel_2020 = pd.read_csv("/content/drive/MyDrive/Touche/2020/new_rel_2020.csv").drop(["Unnamed: 0","no"],axis=1)

for qid, doc, rel in zip(rel_2020['qid'], rel_2020['doc'], rel_2020['rel']):
    qid, doc, level = str(qid), str(doc), int(rel)
    if level > 0:
        ground_truth_dict[qid].append(doc)
    # Levels other than 0, 1 and 2 are not stored in any per-level dict.
    if level in truth_by_level:
        truth_by_level[level][qid].append(doc)

In [None]:
# topic number (as str) -> ids of the up-to-1000 best-scored retrieved documents, best first
solution=df
solution_dict_opt = {}
for title, number in tqdm(zip(topics_2020['Title'], topics_2020['Number'])):
  topic_hits = solution.loc[solution["title"]==title]
  ranked_hits = topic_hits.sort_values(by="score",ascending=False)
  d_list = ranked_hits["doc_id"].iloc[:1000].tolist()
  solution_dict_opt[str(number)] = d_list
solution_dict=solution_dict_opt

50it [00:00, 219.11it/s]


In [None]:
# the final dictionaries for basic metric evaluation and analysis.
# Average percentage common, Hit-once and Hit-all metric basic definition.
def _hit_metrics(truth_dict, retrieved_dict):
    """Compare judged and retrieved documents topic by topic.

    Args:
        truth_dict: topic id -> list of judged document ids (non-empty lists).
        retrieved_dict: topic id -> list of retrieved document ids.

    Returns:
        Tuple ``(hit_one, hit_all, common_ratio_sum)``: the number of topics
        with at least one judged document retrieved, the number of topics with
        all judged documents retrieved, and the sum over topics of the fraction
        of judged documents that were retrieved. Topics of ``truth_dict`` that
        are absent from ``retrieved_dict`` contribute nothing.
    """
    hit_one = 0
    hit_all = 0
    common_ratio_sum = 0
    for topic_id, judged_docs in truth_dict.items():
        # Direct lookup replaces the former inner loop over every retrieved topic.
        if topic_id not in retrieved_dict:
            continue
        judged = set(judged_docs)
        retrieved = set(retrieved_dict[topic_id])
        common = retrieved.intersection(judged)
        if common:
            hit_one += 1
        if retrieved.issuperset(judged):
            hit_all += 1
        common_ratio_sum += len(common)/len(judged)
    return hit_one, hit_all, common_ratio_sum

# Fixed denominator for every ratio below (presumably the number of 2020 topics).
# NOTE(review): topics without any judgment at a given level never match, so they
# lower that level's ratios -- confirm this is intended.
total = 50

hit_one, hit_all, per_comm_avg = _hit_metrics(ground_truth_dict, solution_dict)

print(f'Hit one: {round(hit_one / total, 4)}')
print(f'Hit all: {round(hit_all / total, 4)}')
print(f'Average common ratio: {round(per_comm_avg / total, 4)}')

hit0_one, hit0_all, per0_comm_avg = _hit_metrics(rel0_truth_dict, solution_dict)

print(f'Zero Relevance, Hit one: {round(hit0_one / total, 4)}')
print(f'Zero Relevance, Hit all: {round(hit0_all / total, 4)}')
print(f'Zero Relevance, Average common ratio: {round(per0_comm_avg / total, 4)}')

hit1_one, hit1_all, per1_comm_avg = _hit_metrics(rel1_truth_dict, solution_dict)

print(f'One Relevance, Hit one: {round(hit1_one / total, 4)}')
print(f'One Relevance, Hit all: {round(hit1_all / total, 4)}')
print(f'One Relevance, Average common ratio: {round(per1_comm_avg / total, 4)}')

hit2_one, hit2_all, per2_comm_avg = _hit_metrics(rel2_truth_dict, solution_dict)

print(f'Two Relevance, Hit one: {round(hit2_one / total, 4)}')
print(f'Two Relevance, Hit all: {round(hit2_all / total, 4)}')
print(f'Two Relevance, Average common ratio: {round(per2_comm_avg / total, 4)}')

Hit one: 1.0
Hit all: 0.64
Average common ratio: 0.9526
Zero Relevance, Hit one: 1.0
Zero Relevance, Hit all: 0.24
Zero Relevance, Average common ratio: 0.8502
One Relevance, Hit one: 1.0
One Relevance, Hit all: 0.78
One Relevance, Average common ratio: 0.9537
Two Relevance, Hit one: 0.9
Two Relevance, Hit all: 0.66
Two Relevance, Average common ratio: 0.858


In [None]:
#10 documents used

#Top 1

# Hit one: 1.0
# Hit all: 0.64
# Average common ratio: 0.956
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.24
# Zero Relevance, Average common ratio: 0.8496
# One Relevance, Hit one: 1.0
# One Relevance, Hit all: 0.76
# One Relevance, Average common ratio: 0.9553
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.68
# Two Relevance, Average common ratio: 0.8615

#50 documents used

#Top 1

# Hit one: 1.0
# Hit all: 0.64
# Average common ratio: 0.9488
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.22
# Zero Relevance, Average common ratio: 0.8359
# One Relevance, Hit one: 0.98
# One Relevance, Hit all: 0.76
# One Relevance, Average common ratio: 0.9517
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.68
# Two Relevance, Average common ratio: 0.854

#75 documents used

#Top 1

# Hit one: 1.0
# Hit all: 0.64
# Average common ratio: 0.9526
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.24
# Zero Relevance, Average common ratio: 0.8502
# One Relevance, Hit one: 1.0
# One Relevance, Hit all: 0.78
# One Relevance, Average common ratio: 0.9537
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.66
# Two Relevance, Average common ratio: 0.858


#100 documents used

#Top 1

# Hit one: 1.0
# Hit all: 0.64
# Average common ratio: 0.9547
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.22
# Zero Relevance, Average common ratio: 0.8473
# One Relevance, Hit one: 1.0
# One Relevance, Hit all: 0.76
# One Relevance, Average common ratio: 0.9556
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.68
# Two Relevance, Average common ratio: 0.8607




# Comparison between normal queries and pseudo-relevance queries for 2020+2021 (Initial Retrieval)

In [None]:
!sudo apt-get install libomp-dev
# installing important packages for building the new index on merged documents.
!pip install pyserini
!pip install faiss

In [None]:
topics=pd.read_csv("/content/drive/MyDrive/Touche/touche_complete_topics.csv")
topics.head()

In [None]:
documents= pd.read_json(path_or_buf='/content/drive/MyDrive/Touche/touche-task2-passages-expanded-with-queries.jsonl', lines=True)
documents.head()

In [None]:
reranked=pd.read_csv("/content/drive/MyDrive/Touche/reranked_2020_2021_mono_t5.csv")
reranked= reranked.rename(columns={"doc_id":"id"})
reranked.head()

In [None]:
preprocessed=preprocessing(documents)
preprocessed.head()

868655it [04:24, 3284.57it/s]


In [None]:
preprocessed.head()

Unnamed: 0,contents,id
0,asianamericans face bias admissions elite coll...,clueweb12-0000tw-00-14115___1
1,insisting upholding affirmative action college...,clueweb12-0000tw-00-14115___10
2,dont richfamous daddy mommy want career stem a...,clueweb12-0000tw-00-14115___11
3,speculative opinion way definitely worth looki...,clueweb12-0000tw-00-14115___12
4,probably 23 percent life kent clarkfield 4 fe...,clueweb12-0000tw-00-14115___13


In [None]:
topics=pd.read_csv("/content/drive/MyDrive/Touche/touche_complete_topics.csv")
Final=psuedo(topics,preprocessed,reranked,1,10)
Final=pd.DataFrame(Final)
Final

100it [00:52,  1.92it/s]


Unnamed: 0,Original,New,Frequency
0,\nWhat is the difference between sex and love?\n,\nWhat is the difference between sex and love?...,18
1,"\nWhich is better, a laptop or a desktop?\n","\nWhich is better, a laptop or a desktop?\n home",9
2,"\nWhich is better, Canon or Nikon?\n","\nWhich is better, Canon or Nikon?\n vs",29
3,\nWhat are the best dish detergents?\n,\nWhat are the best dish detergents?\n brands,35
4,\nWhat are the best cities to live in?\n,\nWhat are the best cities to live in?\n places,13
...,...,...,...
95,"Which is healthier to wear, boxers or briefs?","Which is healthier to wear, boxers or briefs? men",16
96,What is the difference between a blender vs a ...,What is the difference between a blender vs a ...,19
97,"Which is better, rock or rap?","Which is better, rock or rap? music",29
98,Do you think imagination is better than knowle...,Do you think imagination is better than knowle...,13


In [None]:
Final.to_csv("/content/drive/MyDrive/Touche/touche_complete_topics_psuedo.csv")

In [None]:
topics=pd.read_csv("/content/drive/MyDrive/Touche/touche_complete_topics.csv")
topics=topics.join(Final.set_index("Original"),on="Title")
topics.head()

Unnamed: 0,Number,Title,New,Frequency
0,1,\nWhat is the difference between sex and love?\n,\nWhat is the difference between sex and love?...,18
1,2,"\nWhich is better, a laptop or a desktop?\n","\nWhich is better, a laptop or a desktop?\n home",9
2,3,"\nWhich is better, Canon or Nikon?\n","\nWhich is better, Canon or Nikon?\n vs",29
3,4,\nWhat are the best dish detergents?\n,\nWhat are the best dish detergents?\n brands,35
4,5,\nWhat are the best cities to live in?\n,\nWhat are the best cities to live in?\n places,13


In [None]:
from pyserini.search import SimpleSearcher
from pyserini.search.lucene import LuceneSearcher

# BM25 retrieval with the expanded query of every topic.
searcher = LuceneSearcher('/content/drive/MyDrive/Touche/sample_collection_jsonl')
searcher.set_bm25(1.2, 0.68)

h1=[]
for title, expanded_query, number in tqdm(zip(topics["Title"],topics["New"],topics["Number"])):
  hits = searcher.search(expanded_query, k=1500)
  for rank in range(len(hits)):
    hit = hits[rank]
    # Keep what follows the "contents" key of the raw record, then remove
    # closing braces, double quotes and newlines.
    # NOTE(review): this also strips those characters inside the text itself;
    # parsing the raw record as JSON would be more robust.
    contents = str.split(hit.raw, '"contents" :')[1]
    for unwanted in ("}", '"', '\n'):
      contents = contents.replace(unwanted, "")
    h1.append({"title_id":number,"title":title,'doc_id': hit.docid,'score': hit.score,"content": contents})

100it [00:10,  9.54it/s]


In [None]:
# Collapse the hits to one row per (topic, document): the part of each hit id
# before the first "__" is used as the document id, and the best score among
# the hits sharing that id is kept.
df = (
    pd.DataFrame(h1)
    .assign(id=lambda hits: hits["doc_id"].str.split("__",expand=True)[0])
    .groupby(by=["title_id","id","title"])["score"].max()
    .reset_index()
    .rename(columns={"id":"doc_id"})
)
df.head()

Unnamed: 0,title_id,doc_id,title,score
0,1,clueweb12-0000tw-14-21168,\nWhat is the difference between sex and love?\n,6.7985
1,1,clueweb12-0000tw-36-21848,\nWhat is the difference between sex and love?\n,4.9699
2,1,clueweb12-0000tw-38-17144,\nWhat is the difference between sex and love?\n,5.9029
3,1,clueweb12-0000wb-01-00005,\nWhat is the difference between sex and love?\n,6.1371
4,1,clueweb12-0000wb-65-26363,\nWhat is the difference between sex and love?\n,4.4143


In [None]:
df=pd.read_csv("/content/drive/MyDrive/Touche/relevant_bm25_2020_2021.csv")
df["id"]=df["doc_id"].str.split("__",expand=True)[0]
df=df.groupby(by=["title_id","id","title"])["score"].max().reset_index()
df=df.rename(columns={"id":"doc_id"})
df.head()

Unnamed: 0,title_id,doc_id,title,score
0,1,clueweb12-0000tw-05-14315,\nWhat is the difference between sex and love?\n,4.6611
1,1,clueweb12-0000tw-14-21168,\nWhat is the difference between sex and love?\n,7.1002
2,1,clueweb12-0000tw-22-19226,\nWhat is the difference between sex and love?\n,5.0118
3,1,clueweb12-0000tw-36-21848,\nWhat is the difference between sex and love?\n,5.2702
4,1,clueweb12-0000tw-38-17144,\nWhat is the difference between sex and love?\n,6.08


In [None]:
# Write the 2020+2021 results to their own file. The previous target
# ("/content/drive/MyDrive/Touche/2020/Psuedo_Intial_BM25_retrieval.csv") is the
# file written by the 2020-only section earlier in this notebook, so running
# this cell silently overwrote the 2020 results.
# NOTE(review): update any later cell that is meant to read the combined results.
df.to_csv("/content/drive/MyDrive/Touche/Psuedo_Intial_BM25_retrieval_2020_2021.csv")

In [None]:
from sklearn.metrics import ndcg_score

# nDCG@5 per topic, computed over the judged documents that were also retrieved.
new=df
rel_2020=pd.read_csv("/content/drive/MyDrive/Touche/touche_ground_truth.csv")
rel_2020=rel_2020.drop(["no"],axis=1)
topics_2020=pd.read_csv("/content/drive/MyDrive/Touche/touche_complete_topics.csv")
# Attach the topic title to every judgment so it can be matched against new["title"].
rel_2020=rel_2020.join(topics_2020.set_index("Number"),on="qid").drop("qid",axis=1)
scores=[]
for title in tqdm(new["title"].unique()):
  predicted=new.loc[new["title"]==title]
  true=rel_2020.loc[rel_2020["Title"]==title]
  # Inner join: only documents that are both judged and retrieved remain.
  joined=true.join(predicted.set_index("doc_id"),on="doc", how="inner")
  joined["rel"]=joined["rel"].astype(float)
  # ndcg_score expects 2-D input: one row (this topic), one column per document.
  x=np.reshape(np.asarray(joined["rel"]),(1,len(joined["rel"])))
  y=np.reshape(np.asarray(joined["score"]),(1,len(joined["rel"])))
  try:
    scores.append({"Title":title,"Score":ndcg_score(x,y,k=5)})
  except ValueError:
    # Only the error ndcg_score raises for input it cannot score is handled
    # here; a bare ``except:`` would also hide typos and KeyboardInterrupt.
    # NOTE: skipped topics are absent from `scores` and therefore from its mean.
    print("No joined records for ",title)

100%|██████████| 100/100 [00:00<00:00, 146.45it/s]


In [None]:
scores=pd.DataFrame(scores)
scores

Unnamed: 0,Title,Score
0,\nWhat is the difference between sex and love?\n,0.699215
1,"\nWhich is better, a laptop or a desktop?\n",0.853932
2,"\nWhich is better, Canon or Nikon?\n",0.616434
3,\nWhat are the best dish detergents?\n,0.722727
4,\nWhat are the best cities to live in?\n,0.701224
...,...,...
95,"Which is healthier to wear, boxers or briefs?",0.393007
96,What is the difference between a blender vs a ...,0.862224
97,"Which is better, rock or rap?",0.539686
98,Do you think imagination is better than knowle...,0.445302


In [None]:
np.mean(np.asarray(scores["Score"].astype(float)))

0.657384879751739

In [None]:
# Normal BM25 Ndcg=0.61926458296273
#########################################################################################
#100 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.6125694233673229

#########################################################################################
#10 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.657384879751739

#########################################################################################
#50 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.620693459545643

#########################################################################################
#75 documents used

#Top 1
# Psuedo relevance Intial Retrieval Ndcg=0.6019248101062263

In [None]:
from collections import defaultdict

# topic id (str) -> judged document ids (str):
# all documents with rel > 0, plus one dict per relevance level 0, 1 and 2.
ground_truth_dict = defaultdict(list)
rel0_truth_dict = defaultdict(list)
rel1_truth_dict = defaultdict(list)
rel2_truth_dict = defaultdict(list)
truth_by_level = {0: rel0_truth_dict, 1: rel1_truth_dict, 2: rel2_truth_dict}

rel_2020 = pd.read_csv("/content/drive/MyDrive/Touche/touche_ground_truth.csv").drop(["no"],axis=1)

for qid, doc, rel in zip(rel_2020['qid'], rel_2020['doc'], rel_2020['rel']):
    qid, doc, level = str(qid), str(doc), int(rel)
    if level > 0:
        ground_truth_dict[qid].append(doc)
    # Levels other than 0, 1 and 2 are not stored in any per-level dict.
    if level in truth_by_level:
        truth_by_level[level][qid].append(doc)

In [None]:
# topic number (as str) -> ids of the up-to-1500 best-scored retrieved documents, best first
solution=df
solution_dict_opt = {}
for title, number in tqdm(zip(topics_2020['Title'], topics_2020['Number'])):
  topic_hits = solution.loc[solution["title"]==title]
  ranked_hits = topic_hits.sort_values(by="score",ascending=False)
  d_list = ranked_hits["doc_id"].iloc[:1500].tolist()
  solution_dict_opt[str(number)] = d_list
solution_dict=solution_dict_opt

100it [00:00, 260.46it/s]


In [None]:
# the final dictionaries for basic metric evaluation and analysis.
# Average percentage common, Hit-once and Hit-all metric basic definition.
def _hit_metrics(truth_dict, retrieved_dict):
    """Compare judged and retrieved documents topic by topic.

    Args:
        truth_dict: topic id -> list of judged document ids (non-empty lists).
        retrieved_dict: topic id -> list of retrieved document ids.

    Returns:
        Tuple ``(hit_one, hit_all, common_ratio_sum)``: the number of topics
        with at least one judged document retrieved, the number of topics with
        all judged documents retrieved, and the sum over topics of the fraction
        of judged documents that were retrieved. Topics of ``truth_dict`` that
        are absent from ``retrieved_dict`` contribute nothing.
    """
    hit_one = 0
    hit_all = 0
    common_ratio_sum = 0
    for topic_id, judged_docs in truth_dict.items():
        # Direct lookup replaces the former inner loop over every retrieved topic.
        if topic_id not in retrieved_dict:
            continue
        judged = set(judged_docs)
        retrieved = set(retrieved_dict[topic_id])
        common = retrieved.intersection(judged)
        if common:
            hit_one += 1
        if retrieved.issuperset(judged):
            hit_all += 1
        common_ratio_sum += len(common)/len(judged)
    return hit_one, hit_all, common_ratio_sum

# Fixed denominator for every ratio below (presumably the number of 2020+2021 topics).
# NOTE(review): topics without any judgment at a given level never match, so they
# lower that level's ratios -- confirm this is intended.
total = 100

hit_one, hit_all, per_comm_avg = _hit_metrics(ground_truth_dict, solution_dict)

print(f'Hit one: {round(hit_one / total, 4)}')
print(f'Hit all: {round(hit_all / total, 4)}')
print(f'Average common ratio: {round(per_comm_avg / total, 4)}')

hit0_one, hit0_all, per0_comm_avg = _hit_metrics(rel0_truth_dict, solution_dict)

print(f'Zero Relevance, Hit one: {round(hit0_one / total, 4)}')
print(f'Zero Relevance, Hit all: {round(hit0_all / total, 4)}')
print(f'Zero Relevance, Average common ratio: {round(per0_comm_avg / total, 4)}')

hit1_one, hit1_all, per1_comm_avg = _hit_metrics(rel1_truth_dict, solution_dict)

print(f'One Relevance, Hit one: {round(hit1_one / total, 4)}')
print(f'One Relevance, Hit all: {round(hit1_all / total, 4)}')
print(f'One Relevance, Average common ratio: {round(per1_comm_avg / total, 4)}')

hit2_one, hit2_all, per2_comm_avg = _hit_metrics(rel2_truth_dict, solution_dict)

print(f'Two Relevance, Hit one: {round(hit2_one / total, 4)}')
print(f'Two Relevance, Hit all: {round(hit2_all / total, 4)}')
print(f'Two Relevance, Average common ratio: {round(per2_comm_avg / total, 4)}')

Hit one: 1.0
Hit all: 0.29
Average common ratio: 0.8672
Zero Relevance, Hit one: 1.0
Zero Relevance, Hit all: 0.09
Zero Relevance, Average common ratio: 0.7398
One Relevance, Hit one: 0.99
One Relevance, Hit all: 0.41
One Relevance, Average common ratio: 0.8459
Two Relevance, Hit one: 0.9
Two Relevance, Hit all: 0.49
Two Relevance, Average common ratio: 0.8047


In [None]:
# Normal BM25

# Hit one: 1.0
# Hit all: 0.29
# Average common ratio: 0.8722
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.09
# Zero Relevance, Average common ratio: 0.7587
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.4
# One Relevance, Average common ratio: 0.8516
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.49
# Two Relevance, Average common ratio: 0.803



#50 documents used
#Top 1

# Hit one: 1.0
# Hit all: 0.29
# Average common ratio: 0.8694
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.08
# Zero Relevance, Average common ratio: 0.7407
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.4
# One Relevance, Average common ratio: 0.8491
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.49
# Two Relevance, Average common ratio: 0.8057

#10 documents used
#Top 1

# Hit one: 1.0
# Hit all: 0.28
# Average common ratio: 0.865
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.08
# Zero Relevance, Average common ratio: 0.7419
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.38
# One Relevance, Average common ratio: 0.8445
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.49
# Two Relevance, Average common ratio: 0.8024


#75 documents used
#Top 1

# Hit one: 1.0
# Hit all: 0.27
# Average common ratio: 0.8654
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.09
# Zero Relevance, Average common ratio: 0.7386
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.38
# One Relevance, Average common ratio: 0.8435
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.49
# Two Relevance, Average common ratio: 0.8057

#100 documents used
#Top 1

# Hit one: 1.0
# Hit all: 0.29
# Average common ratio: 0.8672
# Zero Relevance, Hit one: 1.0
# Zero Relevance, Hit all: 0.09
# Zero Relevance, Average common ratio: 0.7398
# One Relevance, Hit one: 0.99
# One Relevance, Hit all: 0.41
# One Relevance, Average common ratio: 0.8459
# Two Relevance, Hit one: 0.9
# Two Relevance, Hit all: 0.49
# Two Relevance, Average common ratio: 0.8047


# Comparison between normal queries and pseudo-relevance queries for 2020 (Reranking)

In [None]:
# Install pygaggle before transformers (first create the index using pyserini, then install pygaggle).
# Both packages are pinned to exact versions; the imports below raise immediately
# if either installation did not succeed.
!pip install pygaggle==0.0.2
!pip install transformers==4.17.0
import transformers
import pygaggle

In [None]:
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5, DuoT5
# Build the MonoT5 reranker with its default arguments; the cell output shows
# several files being downloaded when this runs.
reranker =  MonoT5()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1841.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691413.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [None]:
import pandas as pd
from tqdm import tqdm
from xml.dom import minidom

In [None]:
# Load the saved BM25 retrieval run (columns shown in the cell output:
# title_id, title, doc_id, score, content) and preview its first rows.
BM25_RUN_CSV = "/content/drive/MyDrive/Touche/2020/Psuedo_Intial_BM25_retrieval.csv"
relevant_bm25 = pd.read_csv(BM25_RUN_CSV)
relevant_bm25.head()

Unnamed: 0.1,Unnamed: 0,title_id,title,doc_id,score,content
0,0,1,\nWhat is the difference between sex and love?\n,clueweb12-1311wb-38-04771,8.0234,diverstity whats point interview rae pica med...
1,1,1,\nWhat is the difference between sex and love?\n,clueweb12-1506wb-16-24791,7.9391,sex pornography media home resources calendar...
2,2,1,\nWhat is the difference between sex and love?\n,clueweb12-1100tw-83-05955,7.8749,im asexual partner wants sex rh reality chec...
3,3,1,\nWhat is the difference between sex and love?\n,clueweb12-0004wb-38-13132,7.869,wait sex academics humanities social sciences...
4,4,1,\nWhat is the difference between sex and love?\n,clueweb12-0916wb-94-14593,7.8487,pure intimacy sex single guy 1 pure intimacy...


In [None]:
# Attach the "New" column of the topics file to every retrieved row by matching
# the row's title against the topics' "Title" column (inner join: rows whose
# title is not listed in the topics file are dropped).
# NOTE(review): "New" presumably holds the expanded pseudo-relevance query - confirm.
topics = pd.read_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")
new_query_by_title = topics[["Title", "New"]].set_index("Title")
relevant_bm25 = relevant_bm25.join(new_query_by_title, on="title", how="inner")
relevant_bm25.head()

Unnamed: 0.1,Unnamed: 0,title_id,title,doc_id,score,content,New
0,0,1,\nWhat is the difference between sex and love?\n,clueweb12-1311wb-38-04771,8.0234,diverstity whats point interview rae pica med...,\nWhat is the difference between sex and love?...
1,1,1,\nWhat is the difference between sex and love?\n,clueweb12-1506wb-16-24791,7.9391,sex pornography media home resources calendar...,\nWhat is the difference between sex and love?...
2,2,1,\nWhat is the difference between sex and love?\n,clueweb12-1100tw-83-05955,7.8749,im asexual partner wants sex rh reality chec...,\nWhat is the difference between sex and love?...
3,3,1,\nWhat is the difference between sex and love?\n,clueweb12-0004wb-38-13132,7.869,wait sex academics humanities social sciences...,\nWhat is the difference between sex and love?...
4,4,1,\nWhat is the difference between sex and love?\n,clueweb12-0916wb-94-14593,7.8487,pure intimacy sex single guy 1 pure intimacy...,\nWhat is the difference between sex and love?...


In [None]:
# Replace the original title column with the "New" query text, so that "title"
# now carries the query that is handed to the reranker.
relevant_bm25 = (
    relevant_bm25
    .drop(columns=["title"])
    .rename(columns={"New": "title"})
)
relevant_bm25.head()

Unnamed: 0.1,Unnamed: 0,title_id,doc_id,score,content,title
0,0,1,clueweb12-1311wb-38-04771,8.0234,diverstity whats point interview rae pica med...,\nWhat is the difference between sex and love?...
1,1,1,clueweb12-1506wb-16-24791,7.9391,sex pornography media home resources calendar...,\nWhat is the difference between sex and love?...
2,2,1,clueweb12-1100tw-83-05955,7.8749,im asexual partner wants sex rh reality chec...,\nWhat is the difference between sex and love?...
3,3,1,clueweb12-0004wb-38-13132,7.869,wait sex academics humanities social sciences...,\nWhat is the difference between sex and love?...
4,4,1,clueweb12-0916wb-94-14593,7.8487,pure intimacy sex single guy 1 pure intimacy...,\nWhat is the difference between sex and love?...


In [None]:
# Mono T5 reranker
# For every distinct query in relevant_bm25["title"], hand all of its candidate
# documents to the reranker and record doc id, reranker score and query for
# the first MAX_RERANKED_PER_TITLE results it returns.
# NOTE(review): rerank() presumably returns the texts sorted by descending
# score - confirm against the pygaggle version in use.
MAX_RERANKED_PER_TITLE = 1000
new=[]
for title in tqdm(relevant_bm25["title"].unique()):
  passages = relevant_bm25.loc[relevant_bm25["title"]==title]
  texts = [Text(row.content, {'docid': row.doc_id}, 0) for _, row in passages.iterrows()]
  query = Query(title)
  reranked = reranker.rerank(query, texts)
  # Slice instead of indexing positions 0..999: a query with fewer than
  # MAX_RERANKED_PER_TITLE candidates no longer raises IndexError.
  for hit in reranked[:MAX_RERANKED_PER_TITLE]:
    new.append({"doc_id":hit.metadata["docid"], "Score":hit.score, "title":title})

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
100%|██████████| 50/50 [1:40:50<00:00, 121.02s/it]


In [None]:
# Collect the accumulated records in `new` into a single DataFrame.
df=pd.DataFrame(new)

In [None]:
from sklearn.metrics import ndcg_score

# Translate the query text stored in df["title"] back to the topic's "Title"
# by looking it up in the topics' "New" column.
title_by_new_query = topics[["Title", "New"]].set_index("New")
new = (
    df.join(title_by_new_query, on="title", how="inner")
    .drop("title", axis=1)
    .rename(columns={"Title": "title"})
)

# Relevance judgements, enriched with the topic columns through qid -> Number.
rel_2020 = pd.read_csv("/content/drive/MyDrive/Touche/2020/new_rel_2020.csv")
rel_2020 = rel_2020.drop(["Unnamed: 0", "no"], axis=1)
topics_2020 = pd.read_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")
rel_2020 = (
    rel_2020
    .join(topics_2020.set_index("Number"), on="qid")
    .drop("qid", axis=1)
)

# NDCG@5 per title. The inner join keeps only documents that are both judged
# and present in the reranked list, so the score is computed over that subset.
scores = []
for title in tqdm(new["title"].unique()):
    predicted = new[new["title"] == title]
    true = rel_2020[rel_2020["Title"] == title]
    joined = true.join(predicted.set_index("doc_id"), on="doc", how="inner")
    joined["rel"] = joined["rel"].astype(float)
    # ndcg_score expects 2-D inputs: one row (this query) by n documents.
    y_true = joined["rel"].to_numpy()[np.newaxis, :]
    y_score = joined["Score"].to_numpy()[np.newaxis, :]
    scores.append({"Title": title, "Score": ndcg_score(y_true, y_score, k=5)})

In [None]:
# Turn the per-title records (Title, Score) into a DataFrame and display it.
scores=pd.DataFrame(scores)
scores

Unnamed: 0,Title,Score
0,\nWhat is the difference between sex and love?\n,0.926966
1,"\nWhich is better, a laptop or a desktop?\n",1.0
2,"\nWhich is better, Canon or Nikon?\n",0.83042
3,\nWhat are the best dish detergents?\n,0.654809
4,\nWhat are the best cities to live in?\n,0.488244
5,\nWhat is the longest river in the U.S.?\n,0.529635
6,"\nWhich is healthiest: coffee, green tea or bl...",0.30744
7,\nWhat are the advantages and disadvantages of...,0.15102
8,\nWhy is Linux better than Windows?\n,0.66084
9,\nHow to sleep better?\n,0.535104


In [None]:
# Mean of the per-title scores, i.e. the figure recorded in the results log.
scores["Score"].astype(float).to_numpy().mean()

0.7031541399074839

In [None]:
# Normal BM25 NDCG=0.6245066216824341
# MonoT5 NDCG@5=0.7337937746825516
# DistilBERT NDCG@5=0.41921988405319743

#Top 1
# Pseudo relevance Initial Retrieval NDCG=0.6056172984600517

#Top 3
# Pseudo relevance Initial Retrieval NDCG=0.5831943937936228

#Top 10
# Pseudo relevance Initial Retrieval NDCG=0.5606175240906345

#Top 1
# Pseudo relevance Reranking NDCG=0.7031541399074839
