In [10]:
from tqdm import tqdm
import pandas as pd
import numpy as np 
from gensim.parsing.preprocessing import STOPWORDS
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

2022-04-21 21:39:43 [INFO] textcleaner: 'pattern' package not found; tag filters are not available for English


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# Characters removed from every token: ASCII punctuation plus the curly quotes
# and the em dash, which string.punctuation does not contain.
special_characters = string.punctuation + '“”’—'

# Shared NLTK helpers, created once for the whole notebook.
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()


def spl_chars_removal(lst, chars=None):
  """Strip unwanted characters from every token in ``lst``.

  Parameters
  ----------
  lst : iterable of str
      Tokens to clean.
  chars : str, optional
      Characters to delete. When omitted, the module-level
      ``special_characters`` string is used, which is what every existing
      caller relies on.

  Returns
  -------
  list of str
      One cleaned token per input token, in the same order. A token made up
      only of removed characters becomes ``""`` (it is not dropped).
  """
  if chars is None:
    chars = special_characters
  # The accumulator is built inline so that no local shadows the built-in `str`.
  return ["".join(ch for ch in token if ch not in chars) for token in lst]

def stopwords_removal_gensim_custom(lst):
  """Return the tokens of ``lst`` that are not in gensim's ``STOPWORDS``, order preserved."""
  kept = []
  for token in lst:
    if token in STOPWORDS:
      continue
    kept.append(token)
  return kept

In [3]:
def preprocessing(documents):
  """Clean the ``contents`` of every document row.

  For each row the text has matches of the ``[</]query`` pattern removed, is
  lower-cased, split on single spaces, stripped of special characters and
  filtered for stopwords, then re-joined with single spaces.

  Parameters
  ----------
  documents : pandas.DataFrame
      Frame with ``contents`` and ``id`` columns.

  Returns
  -------
  pandas.DataFrame
      One row per input row, with the cleaned ``contents`` and the original ``id``.
  """
  cleaned_rows = []
  for _, doc in tqdm(documents.iterrows()):
    text = re.sub("[</]query", "", doc['contents']).lower()
    tokens = stopwords_removal_gensim_custom(spl_chars_removal(text.split(" ")))
    cleaned_rows.append({"contents": ' '.join(tokens), "id": doc["id"]})
  return pd.DataFrame(cleaned_rows)

In [4]:
def findwords(k, reranked, preprocessed):
  """Count term occurrences across the top ``k`` reranked documents of a topic.

  Parameters
  ----------
  k : int
      Number of leading entries of ``reranked`` to use. If ``reranked`` is
      shorter than ``k``, all of its entries are used.
  reranked : list
      Document ids, best first.
  preprocessed : pandas.DataFrame
      Frame with ``id`` and ``contents`` columns. ``contents`` is split on
      single spaces to obtain terms. If an id occurs on several rows, the
      first row is used.

  Returns
  -------
  dict
      Maps each term to its total count; keys are in first-seen order. Ids
      that do not occur in ``preprocessed`` contribute nothing.
  """
  top_ids = list(reranked[:max(k, 0)])

  # Build the id -> contents lookup with one pass over the frame instead of
  # one full scan per document.
  wanted = preprocessed.loc[preprocessed["id"].isin(top_ids)].drop_duplicates(subset="id")
  contents_by_id = dict(zip(wanted["id"], wanted["contents"]))

  relIndex = {}
  for doc in top_ids:
    text = contents_by_id.get(doc)
    if text is None:
      # The document is not in the preprocessed collection; skip it.
      continue
    for term in text.split(" "):
      relIndex[term] = relIndex.get(term, 0) + 1
  return relIndex

In [5]:
def psuedo(topics,documents,reranked,top_n,top_k,combined=True,max_candidates=100):
  """Expand each topic title with frequent terms from its top reranked documents.

  Parameters
  ----------
  topics : pandas.DataFrame
      Frame with a ``Title`` column; one expansion is attempted per title.
  documents : pandas.DataFrame
      Collection passed to ``preprocessing`` (needs ``contents`` and ``id``).
  reranked : pandas.DataFrame
      Frame with ``title``, ``Score`` and ``id`` columns. For each topic, the
      rows with a matching ``title`` are ordered by ``Score``, highest first.
  top_n : int
      Number of expansion terms wanted per topic.
  top_k : int
      Number of top reranked documents whose terms are counted.
  combined : bool, default True
      If true, all chosen terms are appended to one query per topic. If
      false, one query per chosen term is produced (title plus that term).
  max_candidates : int, default 100
      How many of the most frequent terms are considered per topic.

  Returns
  -------
  list of dict
      Records with ``Original`` (the title), ``New`` (the expanded query) and
      ``Frequency`` (count of the last term appended to that query).
  """
  final=[]
  preprocessed=preprocessing(documents)
  for title in tqdm(topics['Title']):
    d_list=reranked.loc[reranked["title"]==title].sort_values(by="Score",ascending=False)["id"].tolist()
    # Never ask for more documents than this topic actually has.
    freq_words=findwords(min(top_k,len(d_list)),d_list,preprocessed)
    sorted_words = sorted(freq_words.items(), key=lambda x:x[1], reverse=True)

    new_title=' '.join(wordnet_lemmatizer.lemmatize(word) for word in title.lower().split(" "))

    j=0
    newQuery=title
    last_frequency=None
    # Slicing instead of indexing: a topic with fewer than `max_candidates`
    # distinct terms no longer raises IndexError.
    for term,frequency in sorted_words[:max_candidates]:
      # NOTE(review): `not in new_title` is a substring test against the joined
      # title string, so a term contained inside a longer title word is also
      # skipped. Kept as-is; confirm this is intended.
      if wordnet_lemmatizer.lemmatize(term) not in new_title:
        j=j+1
        newQuery += " " + term
        last_frequency=frequency
        if not combined:
          final.append({"Original":title,"New":newQuery,"Frequency":frequency})
          newQuery=title
      if j==top_n:
        if combined:
          final.append({"Original":title,"New":newQuery,"Frequency":frequency})
        break
    else:
      # Candidates ran out before `top_n` terms were found: emit the partial
      # expansion instead of silently dropping the topic.
      if combined and 0 < j < top_n:
        final.append({"Original":title,"New":newQuery,"Frequency":last_frequency})
  return final


In [None]:
# Load the topics table from Drive and preview the first rows.
topics=pd.read_csv("/content/drive/MyDrive/Touche/topics-task2.csv")
topics.head()

In [None]:
# Load the passage collection (JSON Lines: one record per line) and preview it.
documents= pd.read_json(path_or_buf='/content/drive/MyDrive/Touche/touche-task2-passages-expanded-with-queries.jsonl', lines=True)
documents.head()

In [None]:
# Load the reranked results that the expansion step draws its documents from.
reranked=pd.read_csv("/content/drive/MyDrive/Touche/reranked.csv")
reranked.head()

In [None]:
# Build expanded queries: top_n=3 expansion terms taken from the top_k=100
# reranked documents of each topic. `Final` is first the list of records
# returned by psuedo(), then the DataFrame built from it.
Final=psuedo(topics,documents,reranked,3,100)
Final=pd.DataFrame(Final)
Final.head()

# Comparison between normal queries and pseudo-relevance queries for 2020 (Initial Retrieval)

In [None]:
# System package installed before the Python packages below.
!sudo apt-get install libomp-dev
# Python packages needed to build the new index on the merged documents.
!pip install pyserini
!pip install faiss

In [7]:
import pyserini
from xml.dom import minidom

In [8]:
def parse_xml(path):
  """Read a topics XML file into a two-column DataFrame.

  The first ``<topics>`` element is located and, for each ``<topic>`` inside
  it, the text of the first ``<number>`` and first ``<title>`` child is taken
  verbatim (surrounding whitespace is not stripped).

  Parameters
  ----------
  path : str or file object
      Whatever ``xml.dom.minidom.parse`` accepts.

  Returns
  -------
  pandas.DataFrame
      Columns ``Number`` and ``Title``, one row per topic, in document order.
  """
  def first_text(node, tag):
    # Text of the first child node of the first <tag> element under `node`.
    return node.getElementsByTagName(tag)[0].firstChild.nodeValue

  root = minidom.parse(path).getElementsByTagName('topics')[0]
  rows = [
    (first_text(topic, 'number'), first_text(topic, 'title'))
    for topic in root.getElementsByTagName('topic')
  ]
  return pd.DataFrame(rows, columns=["Number","Title"])

In [9]:
# Parse the 2020 topic file into a Number/Title frame. This replaces any
# `topics` frame loaded earlier in the notebook.
topics=parse_xml("/content/drive/MyDrive/Touche/topics-task-2-2020.xml")
topics.head()

Unnamed: 0,Number,Title
0,1,\nWhat is the difference between sex and love?\n
1,2,"\nWhich is better, a laptop or a desktop?\n"
2,3,"\nWhich is better, Canon or Nikon?\n"
3,4,\nWhat are the best dish detergents?\n
4,5,\nWhat are the best cities to live in?\n


In [10]:
# Load the 2020 documents. `old_id` is renamed to `id`, the column name that
# preprocessing() and findwords() read.
documents=pd.read_csv("/content/drive/MyDrive/Touche/2020/docs_2020.csv")
documents= documents.rename(columns={"old_id":"id"})
documents.head()

Unnamed: 0.1,Unnamed: 0,id,contents
0,0,clueweb12-0000tw-00-14115,Do Asian-Americans Face Bias in Admissions at ...
1,1,clueweb12-0000tw-00-15738,Big Data Analytics a Key Enabler for Social CR...
2,2,clueweb12-0000tw-00-17905,Best Bets: Fern brings magic to Milford - Delm...
3,3,clueweb12-0000tw-01-02230,Python Data Analysis Library — pandas: Python ...
4,4,clueweb12-0000tw-01-15084,MILLARWORLD WEEK: Mark Millar’s Master Plan | ...


In [11]:
# Load the reranked 2020 results (the file name indicates monoT5 scores).
# `doc_id` is renamed to `id`, the column name psuedo() reads.
reranked=pd.read_csv("/content/drive/MyDrive/Touche/2020/reranked_2020__mono_t5.csv")
reranked= reranked.rename(columns={"doc_id":"id"})
reranked.head()

Unnamed: 0.1,Unnamed: 0,id,Score,title
0,0,clueweb12-0818wb-26-13074,-13.594674,\nWhat is the difference between sex and love?\n
1,1,clueweb12-0010wb-33-27298,-13.818763,\nWhat is the difference between sex and love?\n
2,2,clueweb12-0103wb-41-14265,-13.821753,\nWhat is the difference between sex and love?\n
3,3,clueweb12-1013wb-51-17839,-13.743243,\nWhat is the difference between sex and love?\n
4,4,clueweb12-1400tw-39-23968,-14.184422,\nWhat is the difference between sex and love?\n


In [12]:
# Expand every 2020 topic with top_n=1 term taken from its top_k=100 reranked
# documents. `Final` is first the list of records, then the DataFrame.
Final=psuedo(topics,documents,reranked,1,100)
Final=pd.DataFrame(Final)
Final.head()

33401it [03:26, 161.68it/s]
50it [00:16,  2.97it/s]


Unnamed: 0,Original,New,Frequency
0,\nWhat is the difference between sex and love?\n,\nWhat is the difference between sex and love?...,495
1,"\nWhich is better, a laptop or a desktop?\n","\nWhich is better, a laptop or a desktop?\n new",456
2,"\nWhich is better, Canon or Nikon?\n","\nWhich is better, Canon or Nikon?\n camera",1599
3,\nWhat are the best dish detergents?\n,\nWhat are the best dish detergents?\n laundry,670
4,\nWhat are the best cities to live in?\n,\nWhat are the best cities to live in?\n new,1063


In [13]:
# Attach the expanded query ("New") and its term frequency to each topic.
# Columns added by a previous run of this cell are dropped first, so that
# re-running the cell does not fail on overlapping column names.
topics=topics.drop(columns=["New","Frequency"],errors="ignore").join(Final.set_index("Original"),on="Title")
topics.head()

Unnamed: 0,Number,Title,New,Frequency
0,1,\nWhat is the difference between sex and love?\n,\nWhat is the difference between sex and love?...,495
1,2,"\nWhich is better, a laptop or a desktop?\n","\nWhich is better, a laptop or a desktop?\n new",456
2,3,"\nWhich is better, Canon or Nikon?\n","\nWhich is better, Canon or Nikon?\n camera",1599
3,4,\nWhat are the best dish detergents?\n,\nWhat are the best dish detergents?\n laundry,670
4,5,\nWhat are the best cities to live in?\n,\nWhat are the best cities to live in?\n new,1063


In [25]:
# Persist the topics with their expanded queries; later cells reload this
# file. The index is written as well, so readers see an "Unnamed: 0" column.
topics.to_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")

In [26]:
import json

from pyserini.search import SimpleSearcher
from pyserini.search.lucene import LuceneSearcher

# BM25 retrieval with the expanded queries: up to 1000 hits per topic.
searcher = LuceneSearcher('/content/drive/MyDrive/Touche/2020/sample_collection_jsonl')
searcher.set_bm25(1.2, 0.68)
h1=[]
for title, expanded_query, number in tqdm(zip(topics["Title"],topics["New"],topics["Number"]), total=len(topics)):
  hits = searcher.search(expanded_query, k=1000)
  for hit in hits:
    # Parse the stored raw document as JSON rather than splitting the string
    # on '"contents" :' and deleting every '}' and '"'. That approach also
    # removed those characters from the document text itself and would have
    # pulled in any field stored after "contents".
    # NOTE(review): assumes `raw` is the stored JSON document - confirm against the index.
    contents = json.loads(hit.raw)["contents"]
    h1.append({"title_id":number,"title":title,'doc_id': hit.docid,'score': hit.score,"content": contents})

50it [01:08,  1.37s/it]


In [27]:
# Collect the BM25 hits gathered in `h1` into a DataFrame and preview it.
df=pd.DataFrame(h1)
df.head()

Unnamed: 0,title_id,title,doc_id,score,content
0,1,\nWhat is the difference between sex and love?\n,clueweb12-1311wb-38-04771,8.0234,diverstity whats point interview rae pica med...
1,1,\nWhat is the difference between sex and love?\n,clueweb12-1506wb-16-24791,7.9391,sex pornography media home resources calendar...
2,1,\nWhat is the difference between sex and love?\n,clueweb12-1100tw-83-05955,7.8749,im asexual partner wants sex rh reality chec...
3,1,\nWhat is the difference between sex and love?\n,clueweb12-0004wb-38-13132,7.869,wait sex academics humanities social sciences...
4,1,\nWhat is the difference between sex and love?\n,clueweb12-0916wb-94-14593,7.8487,pure intimacy sex single guy 1 pure intimacy...


In [19]:
# Previously saved BM25 run (presumably the plain, unexpanded-query baseline -
# confirm), loaded for comparison only. It gets its own name because assigning
# it to `df` would replace the expanded-query hits built above. On a
# top-to-bottom run the cells below would then save and score this run instead.
baseline_bm25=pd.read_csv("/content/drive/MyDrive/Touche/2020/relevant_bm25_2020.csv")
baseline_bm25=baseline_bm25.drop("Unnamed: 0", axis =1)
baseline_bm25.head()

Unnamed: 0,title_id,title,doc_id,score,content
0,1,\nWhat is the difference between sex and love?\n,clueweb12-0818wb-26-13074,7.1866,toofly nyc » hip hop culture toofly nyc clien...
1,1,\nWhat is the difference between sex and love?\n,clueweb12-0010wb-33-27298,7.1624,relational quantum mechanics stanford encyclo...
2,1,\nWhat is the difference between sex and love?\n,clueweb12-0103wb-41-14265,6.9118,lecture series science software engineering...
3,1,\nWhat is the difference between sex and love?\n,clueweb12-1013wb-51-17839,6.4538,ethical buyers guide toy cars trains planes s...
4,1,\nWhat is the difference between sex and love?\n,clueweb12-1400tw-39-23968,6.2936,video video cbr tv movie trailersclips tv tra...


In [28]:
# Save the hits held in `df`; the reranking section reloads this file.
df.to_csv("/content/drive/MyDrive/Touche/2020/Psuedo_Intial_BM25_retrieval.csv")

In [29]:
from sklearn.metrics import ndcg_score

# Score the run held in `df`: nDCG@5 per topic, computed over the documents
# that are both judged and retrieved (inner join below).
new = df
topics_2020 = pd.read_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")
rel_2020 = (
    pd.read_csv("/content/drive/MyDrive/Touche/2020/new_rel_2020.csv")
    .drop(["Unnamed: 0","no"], axis=1)
    .join(topics_2020.set_index("Number"), on="qid")
    .drop("qid", axis=1)
)

scores = []
for title in tqdm(new["title"].unique()):
  retrieved = new[new["title"] == title].set_index("doc_id")
  judged = rel_2020[rel_2020["Title"] == title].drop("Unnamed: 0", axis=1)
  joined = judged.join(retrieved, on="doc", how="inner")
  # ndcg_score expects 2-D input: one row per query.
  relevance = joined["rel"].astype(float).to_numpy()[np.newaxis, :]
  retrieval_score = joined["score"].to_numpy()[np.newaxis, :]
  scores.append({"Title": title, "Score": ndcg_score(relevance, retrieval_score, k=5)})

100%|██████████| 50/50 [00:00<00:00, 81.53it/s]


In [30]:
# Per-topic nDCG@5 as a table. `scores` changes from a list of records to a
# DataFrame here.
scores=pd.DataFrame(scores)
scores

Unnamed: 0,Title,Score
0,\nWhat is the difference between sex and love?\n,0.650393
1,"\nWhich is better, a laptop or a desktop?\n",0.893007
2,"\nWhich is better, Canon or Nikon?\n",1.0
3,\nWhat are the best dish detergents?\n,0.529635
4,\nWhat are the best cities to live in?\n,0.150393
5,\nWhat is the longest river in the U.S.?\n,0.639945
6,"\nWhich is healthiest: coffee, green tea or bl...",0.285164
7,\nWhat are the advantages and disadvantages of...,0.58557
8,\nWhy is Linux better than Windows?\n,0.510447
9,\nHow to sleep better?\n,0.387483


In [31]:
# Mean of the per-topic nDCG@5 values.
np.mean(np.asarray(scores["Score"].astype(float)))

0.6056172984600517

In [23]:
# Normal BM25 nDCG = 0.6245066216824341

# Top 1
# Pseudo-relevance initial retrieval nDCG = 0.6056172984600517

# Top 3
# Pseudo-relevance initial retrieval nDCG = 0.5831943937936228

# Top 10
# Pseudo-relevance initial retrieval nDCG = 0.5606175240906345


# Comparison between normal queries and pseudo-relevance queries for 2020 (Reranking)

In [None]:
# Install order matters: pygaggle first, then transformers. Build the index
# with pyserini before installing pygaggle.
!pip install pygaggle==0.0.2
!pip install transformers==4.17.0
import transformers
import pygaggle

In [2]:
# Create the MonoT5 reranker with its default arguments. DuoT5 is imported
# but not used in the cells shown here.
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5, DuoT5
reranker =  MonoT5()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1841.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691413.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [3]:
import pandas as pd
from tqdm import tqdm
from xml.dom import minidom

In [4]:
# Reload the BM25 hits that the initial-retrieval section saved to CSV.
relevant_bm25=pd.read_csv("/content/drive/MyDrive/Touche/2020/Psuedo_Intial_BM25_retrieval.csv")
relevant_bm25.head()

Unnamed: 0.1,Unnamed: 0,title_id,title,doc_id,score,content
0,0,1,\nWhat is the difference between sex and love?\n,clueweb12-1311wb-38-04771,8.0234,diverstity whats point interview rae pica med...
1,1,1,\nWhat is the difference between sex and love?\n,clueweb12-1506wb-16-24791,7.9391,sex pornography media home resources calendar...
2,2,1,\nWhat is the difference between sex and love?\n,clueweb12-1100tw-83-05955,7.8749,im asexual partner wants sex rh reality chec...
3,3,1,\nWhat is the difference between sex and love?\n,clueweb12-0004wb-38-13132,7.869,wait sex academics humanities social sciences...
4,4,1,\nWhat is the difference between sex and love?\n,clueweb12-0916wb-94-14593,7.8487,pure intimacy sex single guy 1 pure intimacy...


In [5]:
# Attach each topic's expanded query ("New") to its hits, matching on the
# original title. how="inner" drops hits whose title is not in Topics.csv.
topics= pd.read_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")
relevant_bm25=relevant_bm25.join(topics[["Title","New"]].set_index("Title"),on="title", how="inner")
relevant_bm25.head()

Unnamed: 0.1,Unnamed: 0,title_id,title,doc_id,score,content,New
0,0,1,\nWhat is the difference between sex and love?\n,clueweb12-1311wb-38-04771,8.0234,diverstity whats point interview rae pica med...,\nWhat is the difference between sex and love?...
1,1,1,\nWhat is the difference between sex and love?\n,clueweb12-1506wb-16-24791,7.9391,sex pornography media home resources calendar...,\nWhat is the difference between sex and love?...
2,2,1,\nWhat is the difference between sex and love?\n,clueweb12-1100tw-83-05955,7.8749,im asexual partner wants sex rh reality chec...,\nWhat is the difference between sex and love?...
3,3,1,\nWhat is the difference between sex and love?\n,clueweb12-0004wb-38-13132,7.869,wait sex academics humanities social sciences...,\nWhat is the difference between sex and love?...
4,4,1,\nWhat is the difference between sex and love?\n,clueweb12-0916wb-94-14593,7.8487,pure intimacy sex single guy 1 pure intimacy...,\nWhat is the difference between sex and love?...


In [6]:
# From here on `title` holds the expanded query, which is what the reranking
# cell passes to Query().
# Not idempotent: run once after the join above. A second run would drop the
# renamed column, because there is no "New" column left to rename.
relevant_bm25=relevant_bm25.drop(["title"],axis=1)
relevant_bm25=relevant_bm25.rename(columns={"New":"title"})
relevant_bm25.head()

Unnamed: 0.1,Unnamed: 0,title_id,doc_id,score,content,title
0,0,1,clueweb12-1311wb-38-04771,8.0234,diverstity whats point interview rae pica med...,\nWhat is the difference between sex and love?...
1,1,1,clueweb12-1506wb-16-24791,7.9391,sex pornography media home resources calendar...,\nWhat is the difference between sex and love?...
2,2,1,clueweb12-1100tw-83-05955,7.8749,im asexual partner wants sex rh reality chec...,\nWhat is the difference between sex and love?...
3,3,1,clueweb12-0004wb-38-13132,7.869,wait sex academics humanities social sciences...,\nWhat is the difference between sex and love?...
4,4,1,clueweb12-0916wb-94-14593,7.8487,pure intimacy sex single guy 1 pure intimacy...,\nWhat is the difference between sex and love?...


In [7]:
# Mono T5 reranker: score every retrieved passage against its expanded query.
new=[]
for title in tqdm(relevant_bm25["title"].unique()):
  passages= relevant_bm25.loc[relevant_bm25["title"]==title]
  texts = [ Text(p[1].content, {'docid': p[1].doc_id}, 0) for p in passages.iterrows()]
  query = Query(title)
  # Named `ranked_texts` so the `reranked` DataFrame loaded earlier in the
  # notebook is not overwritten.
  ranked_texts = reranker.rerank(query, texts)
  # Keep at most 1000 results. A query with fewer than 1000 passages no longer
  # raises IndexError.
  for i in range(min(1000, len(ranked_texts))):
    new.append({"doc_id":ranked_texts[i].metadata["docid"], "Score":ranked_texts[i].score, "title":title})

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
100%|██████████| 50/50 [1:40:50<00:00, 121.02s/it]


In [8]:
# Collect the reranker output (doc_id, Score, expanded-query title) into a frame.
df=pd.DataFrame(new)

In [None]:
from sklearn.metrics import ndcg_score

# Map the expanded-query titles back to the original titles, then compute
# nDCG@5 per topic over the documents that are both judged and reranked.
new = (
    df.join(topics[["Title","New"]].set_index("New"), on="title", how="inner")
    .drop("title", axis=1)
    .rename(columns={"Title":"title"})
)
topics_2020 = pd.read_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")
rel_2020 = (
    pd.read_csv("/content/drive/MyDrive/Touche/2020/new_rel_2020.csv")
    .drop(["Unnamed: 0","no"], axis=1)
    .join(topics_2020.set_index("Number"), on="qid")
    .drop("qid", axis=1)
)

scores = []
for title in tqdm(new["title"].unique()):
  reranked_docs = new[new["title"] == title].set_index("doc_id")
  judged = rel_2020[rel_2020["Title"] == title]
  joined = judged.join(reranked_docs, on="doc", how="inner")
  # ndcg_score expects 2-D input: one row per query.
  relevance = joined["rel"].astype(float).to_numpy()[np.newaxis, :]
  rerank_score = joined["Score"].to_numpy()[np.newaxis, :]
  scores.append({"Title": title, "Score": ndcg_score(relevance, rerank_score, k=5)})

In [20]:
# Per-topic nDCG@5 after reranking, as a table. `scores` changes from a list
# of records to a DataFrame here.
scores=pd.DataFrame(scores)
scores

Unnamed: 0,Title,Score
0,\nWhat is the difference between sex and love?\n,0.926966
1,"\nWhich is better, a laptop or a desktop?\n",1.0
2,"\nWhich is better, Canon or Nikon?\n",0.83042
3,\nWhat are the best dish detergents?\n,0.654809
4,\nWhat are the best cities to live in?\n,0.488244
5,\nWhat is the longest river in the U.S.?\n,0.529635
6,"\nWhich is healthiest: coffee, green tea or bl...",0.30744
7,\nWhat are the advantages and disadvantages of...,0.15102
8,\nWhy is Linux better than Windows?\n,0.66084
9,\nHow to sleep better?\n,0.535104


In [21]:
# Mean of the per-topic nDCG@5 values after reranking.
np.mean(np.asarray(scores["Score"].astype(float)))

0.7031541399074839

In [None]:
# Normal BM25 nDCG = 0.6245066216824341
# MonoT5 nDCG@5 = 0.7337937746825516
# DistilBERT nDCG@5 = 0.41921988405319743

# Top 1
# Pseudo-relevance initial retrieval nDCG = 0.6056172984600517

# Top 3
# Pseudo-relevance initial retrieval nDCG = 0.5831943937936228

# Top 10
# Pseudo-relevance initial retrieval nDCG = 0.5606175240906345

# Top 1
# Pseudo-relevance reranking nDCG = 0.7031541399074839
