In [None]:
# TASK 1: Creating retrieval document for the LeviRank system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Cleaning sample_data directory.
!rm -r sample_data

In [None]:
# The list of files that are present in the pwd.
!ls drive/MyDrive/touche-2022-prototyping/topics-task2-2022

backup-duoT5-voting-reranked.csv	 OBJECT1_OBJECT2-CLASSIFIER
duoT5-voting-reranked.csv		 retrieval-1500.jsonl
first_model_classifier-epoch-17-f1-5888  retrieval-2000.jsonl
initial-retrieval-1500.csv		 topics-task2.csv
initial-retrieval-2000.csv		 topics-task2.xml
monoT5-input-1500.csv			 Touche
monoT5-input-2000.csv			 touche_topics_query_expansion.csv
monoT5-voting-reranked.csv


In [None]:
# Import statements, must required for the final script.
import logging
import tarfile
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
import spacy
import string
from tqdm import tqdm
from spacy.lang.en.stop_words import STOP_WORDS
import random
import numpy as np
import pandas as pd
random.seed(10)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Spacy related import statements.
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
sp = spacy.load("en_core_web_sm")
all_stopwords = sp.Defaults.stop_words

In [None]:
# Sample test for checking out the spacy working.
text = sp("Nick likes to play football, however he is not too fond of tennis.")
# text_tokens = word_tokenize(text)
# print(text
#       )
token_list = []
token_tag_list = []
for token in text:
    token_list.append(token.text)
    token_tag_list.append(token.tag_)
print(token_list)
print(token_tag_list)

['Nick', 'likes', 'to', 'play', 'football', ',', 'however', 'he', 'is', 'not', 'too', 'fond', 'of', 'tennis', '.']
['NNP', 'VBZ', 'TO', 'VB', 'NN', ',', 'RB', 'PRP', 'VBZ', 'RB', 'RB', 'JJ', 'IN', 'NN', '.']


In [None]:
# Important function, required to be implemented in the query expansion script.
# Extracts list of synonyms and antonyms from the wordnet.
def synonym_antonym_extractor(w_):
     # extracting relevant synonyms and antonym pairs.
     from nltk.corpus import wordnet
     synonyms = []
     antonyms = []
     for syn in wordnet.synsets(w_):
          for l in syn.lemmas():
               synonyms.append(str(l.name()).replace('_',' '))
               if l.antonyms():
                    antonyms.append(str(l.antonyms()[0].name()).replace('_',' '))
     return list(set(synonyms)), list(set(antonyms))
# testing out outputs of the synonym_antonym_extractor function.
# synonym_antonym_extractor('better')

In [None]:
# Must required and embeddings to be downloaded into the script.
# import statements and GloVe embeddings loading for word vectorization.
import gensim.downloader as api
# overview of all models in gensim: https://github.com/RaRe-Technologies/gensim-data
model_glove = api.load("glove-wiki-gigaword-300")



In [None]:
# Important function required for the script you are making, for extract best
# synonym and antonym words. Second, it also calls synonym_antonym_extractor function in it.
# Also, it requires above loading of the module for its working.
# Getting 3 best synonyms and 2 best antonym pairs for a given 'ADJ'
def best_words_extractor(w_, syn_list, ann_list):
    # prepare dictionaries for scores of syn_list & ann_list with w_
    syns = []
    anns = []
    for s_ in syn_list:
        try:
            if model_glove.get_vector(s_) is not None and s_ != w_:
                syns.append(s_)
        except:
            pass
    # print(syns)
    for a_ in ann_list:
        try:
            if model_glove.get_vector(a_) is not None and a_ != w_:
                anns.append(a_)
        except:
            pass
    # print(anns)
    syn_score_dict = {}
    ann_score_dict = {}
    for s_ in syns:
        syn_score_dict[s_] = model_glove.distance(w_, s_)
    for a_ in anns:
        ann_score_dict[a_] = model_glove.distance(w_, a_)
    # getting only the top-k most similar synonyms and most dissimilar antonyms
    # values from the dictionaries and creating score permutations for them.
    syn_score_dict = {k: v for k, v in sorted(syn_score_dict.items(), key=lambda item: item[1])}
    # print(syn_score_dict)
    syn_score_dict = {k: syn_score_dict[k] for k in list(syn_score_dict)[:20]}
    # print(syn_score_dict)
    ann_score_dict = {k: v for k, v in sorted(ann_score_dict.items(), key=lambda item: item[1])}
    # print(ann_score_dict)
    ann_score_dict = {k: ann_score_dict[k] for k in list(ann_score_dict)[:20]}
    # print(ann_score_dict)
    syn_data = []
    ann_data = []
    c_ = 0
    for i, j in syn_score_dict.items():
        c_ = c_ + 1
        syn_data.append(i)
        if c_ >=3:
            break
    if len(syn_data) < 3:
        syn_data = ['good','well', 'best']
    c_ = 0
    for i, j in ann_score_dict.items():
        c_ = c_ + 1
        ann_data.append(i)
        if c_ >=1:
            break
    if len(ann_data) < 2:
        if len(ann_data) == 0:
            ann_data = ['worse','badly']
        if len(ann_data) == 1 and ann_data[0] != 'worse':
            ann_data.append('worse')
        if len(ann_data) == 1 and ann_data[0] == 'worse':
            ann_data.append('different')
    return syn_data, ann_data 

In [None]:
# Testing the outputs for above defined functions.
# Specifically, 'synonym_antonym_extractor' and 'best_words_extractor' 
syns, anons = synonym_antonym_extractor('better')
print(syns)
print(anons)
print(best_words_extractor('better', syns, anons))

['wagerer', 'serious', 'practiced', 'easily', 'in force', 'sound', 'honest', 'best', 'punter', 'honorable', 'unspoiled', 'substantially', 'skilful', 'skillful', 'undecomposed', 'near', 'expert', 'amend', 'salutary', 'comfortably', 'advantageously', 'effective', 'ameliorate', 'upright', 'bettor', 'intimately', 'adept', 'secure', 'well', 'in effect', 'respectable', 'beneficial', 'safe', 'good', 'improve', 'full', 'just', 'ripe', 'considerably', 'unspoilt', 'right', 'dear', 'estimable', 'dependable', 'proficient', 'break', 'better', 'meliorate']
['badly', 'ill', 'worse', 'bad', 'disadvantageously', 'evil', 'worsen']
(['good', 'well', 'improve'], ['worse', 'different'])


In [None]:
# IMPORTANT: This function combined with above two synonym and antonym
# finding and generating function is required to generate the expanded queries.
# Return multiple expanded query versions.
# nouns only, top-3 synonyms and antonyms queries.
# Note: this is the final function that given an input query generates out
# series of various expanded queries as output.
def get_comparation_superlation_nouns(query):
    nouns_as_string = []
    nouns_only_string = []
    restricted_nouns_as_string = []
    doc = sp(query)
    annotations = ["CC", "CD", "JJ", "JJR", "JJS",
            "RB", "RBR", "RBS", "NN", "NNS", "NNP",
            "NNPS", "VB"]
    annotations_except_nouns = ["CC", "CD", "JJ", "JJR", "JJS",
            "RB", "RBR", "RBS", "VB"]
    annotations_nouns = ["NN", "NNS", "NNP", "NNPS", "VB"]
    adj_flg = 0
    adj_val = 'better' # default value, query objectives.
    # appending data into nouns as string
    for token in doc:
        if token.tag_ in annotations:
            nouns_as_string.append(token.text)
            if token.tag_ in annotations_except_nouns and adj_flg == 0:
                adj_val = token.text
                adj_flg = 1
            if token.tag_ not in annotations_except_nouns:
                restricted_nouns_as_string.append(token.text)
            if token.tag_ in annotations_nouns:
                nouns_only_string.append(token.text)

    # appending top-3 syns and anons to the query
    adj_val= adj_val.lower()
    syns, anons = synonym_antonym_extractor(adj_val)
    # print(syns, anons)
    if len(syns) == 0:
        syns, _ = synonym_antonym_extractor('different')
    if len(anons) == 0:
        _, anons = synonym_antonym_extractor('better')
    
    syns_fin, anons_fin = best_words_extractor(adj_val,syns, anons)
    
    # queries preprepartion
    base_query = " ".join(nouns_as_string)
    noun_query = " ".join(nouns_only_string)
    temp_query = " ".join(restricted_nouns_as_string)
    syn1_query = "".join(syns_fin[0]).strip() + " " + temp_query
    syn2_query = "".join(syns_fin[1]).strip() + " " + temp_query
    syn3_query = "".join(syns_fin[2]).strip() + " " + temp_query
    ant1_query = "".join(anons_fin[0]).strip() + " " + temp_query
    ant2_query = "".join(anons_fin[1]).strip() + " " + temp_query

    return base_query.strip(), noun_query.strip(), syn1_query.strip(), syn2_query.strip(), syn3_query.strip(), \
    ant1_query.strip(), ant2_query.strip()

In [None]:
# Sample example for testing out the function.
topics = [
    "What is the difference between sex and love?",
    "Which is the highest mountain in the world?",
    "Which is better, a laptop or a desktop?",
]
for q in topics:
    print(get_comparation_superlation_nouns(q))
    print('\n')

('difference sex and love', 'difference sex love', 'different difference sex love', 'unlike difference sex love', 'dissimilar difference sex love', 'bad difference sex love', 'worse difference sex love')


('highest mountain world', 'mountain world', 'high mountain world', 'eminent mountain world', 'high-pitched mountain world', 'low mountain world', 'worse mountain world')


('better laptop or desktop', 'laptop desktop', 'good laptop desktop', 'well laptop desktop', 'improve laptop desktop', 'worse laptop desktop', 'different laptop desktop')




In [None]:
# Extensive sample testing example for getting out the function results.
topics_large = [
    "What is the difference between sex and love?",
    "Which is better, a laptop or a desktop?",
    "Which is better, Canon or Nikon?",
    "What are the best dish detergents?",
    "What are the best cities to live in?",
    "What is the longest river in the U.S.?",
    "Which is healthiest: coffee, green tea or black tea and why?",
    "What are the advantages and disadvantages of PHP over Python and vice versa?",
    "Why is Linux better than Windows?",
    "How to sleep better?",
    "Should I buy an LCD TV or a plasma TV?",
    "Train or plane? Which is the better choice?",
    "What is the highest mountain on Earth?",
    "Should one prefer Chinese medicine or Western medicine?",
    "What are the best washing machine brands?",
    "Should I buy or rent?",
    "Do you prefer cats or dogs, and why?",
    "What is the better way to grill outdoors: gas or charcoal?",
    "Which is better, MAC or PC?",
    "What is better: to use a brush or a sponge?",
    "Which is better, Linux or Microsoft?",
    "Which is better, Pepsi or Coke?",
    "What is better, Google search or Yahoo search?",
    "Which one is better, Netflix or Blockbuster?",
    "Which browser is better, Internet Explorer or Firefox?",
    "Which is a better vehicle: BMW or Audi?",
    "Which one is better, an electric stove or a gas stove?",
    "What planes are best, Boeing or Airbus?",
    "Which is better, Disneyland or Disney World?",
    "Should I buy an Xbox or a PlayStation?",
    "Which has more caffeine, coffee or tea?",
    "Which is better, LED or LCD Reception Displays?",
    "What is better: ASP or PHP?",
    "What is better for the environment, a real or a fake Christmas tree?",
    "Do you prefer tampons or pads?",
    "What IDE is better for Java: NetBeans or Eclipse?",
    "Is OpenGL better than Direct3D in terms of portability to different platforms?",
    "What are the differences between MySQL and PostgreSQL in performance?",
    "Is Java code more readable than code written in Scala?",
    "Which operating system has better performance: Windows 7 or Windows 8?",
    "Which smartphone has a better battery life: Xperia or iPhone?",
    "Which four wheel truck is better: Ford or Toyota?",
    "Should I prefer a Leica camera over Nikon for portrait photographs?",
    "Which company has a larger capitalization: Apple or Microsoft?",
    "Which laptop has a better durability: HP or Dell?",
    "Which beverage has more calories per glass: beer or cider?",
    "Is admission rate in Stanford higher than that of MIT?",
    "Is pasta healthier than pizza?",
    "Which city is more expensive to live in: San Francisco or New York?",
    "Whose salary is higher: basketball or soccer players?",
]
for q in topics_large[10:20]:
    print(get_comparation_superlation_nouns(q))
    # print('\n')

('buy LCD TV or plasma TV', 'buy LCD TV plasma TV', 'purchase LCD TV plasma TV', 'steal LCD TV plasma TV', 'bargain LCD TV plasma TV', 'sell LCD TV plasma TV', 'worse LCD TV plasma TV')
('Train or plane better choice', 'Train plane choice', 'good Train plane choice', 'well Train plane choice', 'best Train plane choice', 'bad Train plane choice', 'worse Train plane choice')
('highest mountain Earth', 'mountain Earth', 'high mountain Earth', 'eminent mountain Earth', 'high-pitched mountain Earth', 'low mountain Earth', 'worse mountain Earth')
('prefer Chinese medicine or Western medicine', 'prefer medicine medicine', 'choose medicine medicine', 'opt medicine medicine', 'favor medicine medicine', 'worse medicine medicine', 'different medicine medicine')
('best washing machine brands', 'washing machine brands', 'better washing machine brands', 'good washing machine brands', 'well washing machine brands', 'worst washing machine brands', 'worse washing machine brands')
('buy or rent', 'buy r

In [None]:
# Not required for building your script, it's used hear for testing baseline
# and voting based query expansion methods.
from xml.dom import minidom
# Define function for loading all the topics from the topics files.
def parse_xml(path):
  answer_list = []
  xmldoc = minidom.parse(path)
  itemlist = xmldoc.getElementsByTagName('topics')
  topic_list = itemlist[0].getElementsByTagName('topic')
  for topic in topic_list:
    tuple_for_add = tuple((topic.getElementsByTagName('number')[0].firstChild.nodeValue, topic.getElementsByTagName('title')[0].firstChild.nodeValue))
    answer_list.append(tuple_for_add)
  parsed=pd.DataFrame(answer_list, columns=["number","title"])
  return parsed

# preparing the list of topics and corresponding dataframe.
# topics_2020 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task-2.xml")
# topics_2021 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-51-100.xml")
topics_2022 = parse_xml("drive/MyDrive/touche-2022-prototyping/topics-task2-2022/topics-task2.xml")
touche_topics = topics_2022
touche_topics

Unnamed: 0,number,title
0,2,"Which is better, a laptop or a desktop?"
1,3,"Which is better, Canon or Nikon?"
2,8,What are the advantages and disadvantages of P...
3,9,Why is Linux better than Windows?
4,12,Train or plane? Which is the better choice?
5,14,Should one prefer Chinese medicine or Western ...
6,17,"Do you prefer cats or dogs, and why?"
7,18,What is the better way to grill outdoors: gas ...
8,19,"Which is better, MAC or PC?"
9,22,"Which is better, Pepsi or Coke?"


In [None]:
for q in touche_topics['title']:
    print(get_comparation_superlation_nouns(q))

('better laptop or desktop', 'laptop desktop', 'good laptop desktop', 'well laptop desktop', 'improve laptop desktop', 'worse laptop desktop', 'different laptop desktop')
('better Canon or Nikon', 'Canon Nikon', 'good Canon Nikon', 'well Canon Nikon', 'improve Canon Nikon', 'worse Canon Nikon', 'different Canon Nikon')
('advantages and disadvantages PHP Python and vice versa', 'advantages disadvantages PHP Python vice', 'different advantages disadvantages PHP Python vice', 'unlike advantages disadvantages PHP Python vice', 'dissimilar advantages disadvantages PHP Python vice', 'bad advantages disadvantages PHP Python vice', 'worse advantages disadvantages PHP Python vice')
('Linux better Windows', 'Linux Windows', 'good Linux Windows', 'well Linux Windows', 'improve Linux Windows', 'worse Linux Windows', 'different Linux Windows')
('Train or plane better choice', 'Train plane choice', 'good Train plane choice', 'well Train plane choice', 'best Train plane choice', 'bad Train plane choi

In [None]:
# Some acronyms in the query might be an issue during the retrieval
# hardcoding domain knowledge with their full forms for retrieval.
acr_to_full_word = {'PC': 'Personal Computer (PC)', 'MAC': 'MacBook (MAC)', 'ASP': 'Active Server Pages (ASP)'
                    ,'PHP': 'Hypertext Preprocessor (PHP)' , 'IDE': 'Integrated Development Environment (IDE)'
                    ,'R': 'R (programming language)'
                    ,'IELTS': 'International English Language Testing System (IELTS)'
                    ,'TOEFL': 'Test of English as a Foreign Language (TOEFL)'
                  }

def acr_replace(query):
    for acr_, full_form_ in acr_to_full_word.items():
        full_query = query.replace(acr_, full_form_)
        query = full_query
    return full_query


print(acr_replace('Should I learn Python or R for data analysis?'))
print(acr_replace('What is better: ASP or PHP?'))
print(acr_replace('What IDE is better for Java: NetBeans or Eclipse?'))
print(acr_replace('Should I take the IELTS or the TOEFL?'))
print(acr_replace('What IDE is better for Java: NetBeans or Eclipse?'))

Should I learn Python or R (programming language) for data analysis?
What is better: Active Server Pages (ASP) or Hypertext Preprocessor (PHP)?
What Integrated Development Environment (IDE) is better for Java: NetBeans or Eclipse?
Should I take the International English Language Testing System (IELTS) or the Test of English as a Foreign Language (TOEFL)?
What Integrated Development Environment (IDE) is better for Java: NetBeans or Eclipse?


In [17]:
# Loading the pseudo relevance document from Pujit's topics file as reference.
pseudo_topics = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/Touche/topics-task2-psuedo_relevance.csv')

In [18]:
# Getting output of pseudo relevance query dataframe.
pseudo_topics.head()

Unnamed: 0,Number,Title,Objects,New,Frequency
0,2,"Which is better, a laptop or a desktop?","laptop, desktop","Which is better, a laptop or a desktop? computer",39
1,3,"Which is better, Canon or Nikon?","Canon, Nikon","Which is better, Canon or Nikon? vs",42
2,8,What are the advantages and disadvantages of P...,"PHP, Python",What are the advantages and disadvantages of P...,45
3,9,Why is Linux better than Windows?,"Linux, Windows",Why is Linux better than Windows? software,12
4,12,Train or plane? Which is the better choice?,"Train, plane",Train or plane? Which is the better choice? tr...,25


In [20]:
 # Not required for the script, building the dataframe.
 touche_topics['title'] = touche_topics['title'].apply(lambda x: x.strip())
 touche_topics['title_acr_full'] = touche_topics['title'].apply(lambda x: acr_replace(x.strip()))
 touche_topics['title_pseudo'] = pseudo_topics['New'].apply(lambda x: get_comparation_superlation_nouns(x)[0])
 touche_topics['query_base'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[0])
 touche_topics['query_noun'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[1])
 touche_topics['query_synonym1'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[2])
 touche_topics['query_synonym2'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[3])
 touche_topics['query_synonym3'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[4])
 touche_topics['query_antonym1'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[5])
 touche_topics['query_antonym2'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[6])

In [21]:
# Not required for the script, getting the dataframe output.
touche_topics

Unnamed: 0,number,title,title_acr_full,title_pseudo,query_base,query_noun,query_synonym1,query_synonym2,query_synonym3,query_antonym1,query_antonym2
0,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop?",better laptop or desktop computer,better laptop or desktop,laptop desktop,good laptop desktop,well laptop desktop,improve laptop desktop,worse laptop desktop,different laptop desktop
1,3,"Which is better, Canon or Nikon?","Which is better, Canon or Nikon?",better Canon or Nikon,better Canon or Nikon,Canon Nikon,good Canon Nikon,well Canon Nikon,improve Canon Nikon,worse Canon Nikon,different Canon Nikon
2,8,What are the advantages and disadvantages of P...,What are the advantages and disadvantages of H...,advantages and disadvantages PHP Python and vi...,advantages and disadvantages PHP Python and vi...,advantages disadvantages PHP Python vice,different advantages disadvantages PHP Python ...,unlike advantages disadvantages PHP Python vice,dissimilar advantages disadvantages PHP Python...,bad advantages disadvantages PHP Python vice,worse advantages disadvantages PHP Python vice
3,9,Why is Linux better than Windows?,Why is Linux better than Windows?,Linux better Windows software,Linux better Windows,Linux Windows,good Linux Windows,well Linux Windows,improve Linux Windows,worse Linux Windows,different Linux Windows
4,12,Train or plane? Which is the better choice?,Train or plane? Which is the better choice?,Train or plane better choice travel,Train or plane better choice,Train plane choice,good Train plane choice,well Train plane choice,best Train plane choice,bad Train plane choice,worse Train plane choice
5,14,Should one prefer Chinese medicine or Western ...,Should one prefer Chinese medicine or Western ...,prefer Chinese medicine or Western medicine pe...,prefer Chinese medicine or Western medicine,prefer medicine medicine,choose medicine medicine,opt medicine medicine,favor medicine medicine,worse medicine medicine,different medicine medicine
6,17,"Do you prefer cats or dogs, and why?","Do you prefer cats or dogs, and why?",prefer cats or dogs and people,prefer cats or dogs and,prefer cats dogs,choose cats dogs,opt cats dogs,favor cats dogs,worse cats dogs,different cats dogs
7,18,What is the better way to grill outdoors: gas ...,What is the better way to grill outdoors: gas ...,better way grill outdoors gas or charcoal,better way grill outdoors gas or charcoal,way grill gas charcoal,good way gas charcoal,well way gas charcoal,improve way gas charcoal,worse way gas charcoal,different way gas charcoal
8,19,"Which is better, MAC or PC?","Which is better, MacBook (MAC) or Personal Com...",better MAC or PC windows,better MAC or PC,MAC PC,good MAC PC,well MAC PC,improve MAC PC,worse MAC PC,different MAC PC
9,22,"Which is better, Pepsi or Coke?","Which is better, Pepsi or Coke?",better Pepsi or Coke taste,better Pepsi or Coke,Pepsi Coke,good Pepsi Coke,well Pepsi Coke,improve Pepsi Coke,worse Pepsi Coke,different Pepsi Coke


In [23]:
# Not required for the script, just saving the above dataframe in 'csv' format.
# touche_topics.to_csv("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/touche_topics_query_expansion.csv", index=False)
# Changing the final saved file name for pseudo relevance query retrieval.
touche_topics.to_csv("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/pseudo_touche_topics_query_expansion.csv", index=False)

In [None]:
# Metric Evaluation for the query expansion approach.
# 1. Loading the data from the touche topics file.
# ---
# ---
# IMPORTANT, relevant for script: We perform multiple retrievals for all the expanded queries.
# Note: This might be an important modification required from the current script perspective.

# Second, we use retrieval of our main query as driving matcher, and check whether for each query
# the document is retrieved by any other of the pooled queries (at least 1, threshold).
# If yes, we keep those documents. (We do it for first 1K documents, but we will essentially end up with ,1k documents)

# After this, we select first-k relevant documents from each of the pool queries 
# The selected number of retrieved documents is optimized based on practical experience with retrieval count tuning.

# Finally, those first-k relevant documents that are not present in our list from the second step are appended until 1.5k
# document count is reached. This logic is implemented in the form of a dictionary iterator in the upcoming cells.
# ---
# ---
# 2. Loading the corresponding queries into List, making predictions of k=275 (min) on the built index.
# 3. Merging Logic: For additional queries, remove the matching documents & keep only first non-existing docs.
# 4. Evaluation Metric: Average coverage/recall calculation for the voting based query expansion.

In [24]:
# Import statements for retrieval evaluation.
import pandas as pd

In [25]:
# Loading the saved query expansion dataframe.
touche_topics = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/pseudo_touche_topics_query_expansion.csv')
touche_topics.head()

Unnamed: 0,number,title,title_acr_full,title_pseudo,query_base,query_noun,query_synonym1,query_synonym2,query_synonym3,query_antonym1,query_antonym2
0,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop?",better laptop or desktop computer,better laptop or desktop,laptop desktop,good laptop desktop,well laptop desktop,improve laptop desktop,worse laptop desktop,different laptop desktop
1,3,"Which is better, Canon or Nikon?","Which is better, Canon or Nikon?",better Canon or Nikon,better Canon or Nikon,Canon Nikon,good Canon Nikon,well Canon Nikon,improve Canon Nikon,worse Canon Nikon,different Canon Nikon
2,8,What are the advantages and disadvantages of P...,What are the advantages and disadvantages of H...,advantages and disadvantages PHP Python and vi...,advantages and disadvantages PHP Python and vi...,advantages disadvantages PHP Python vice,different advantages disadvantages PHP Python ...,unlike advantages disadvantages PHP Python vice,dissimilar advantages disadvantages PHP Python...,bad advantages disadvantages PHP Python vice,worse advantages disadvantages PHP Python vice
3,9,Why is Linux better than Windows?,Why is Linux better than Windows?,Linux better Windows software,Linux better Windows,Linux Windows,good Linux Windows,well Linux Windows,improve Linux Windows,worse Linux Windows,different Linux Windows
4,12,Train or plane? Which is the better choice?,Train or plane? Which is the better choice?,Train or plane better choice travel,Train or plane better choice,Train plane choice,good Train plane choice,well Train plane choice,best Train plane choice,bad Train plane choice,worse Train plane choice


In [26]:
# Not relevant for the script, as you'll be getting pool of expanded from the above function.
queries = touche_topics['title']
queries_arc = touche_topics['title_acr_full']
queries_pseudo = touche_topics['title_pseudo']
base_queries = touche_topics['query_base']
noun_queries = touche_topics['query_noun']
syn1_queries = touche_topics['query_synonym1']
syn2_queries = touche_topics['query_synonym2']
syn3_queries = touche_topics['query_synonym3']
ant1_queries = touche_topics['query_antonym1']
ant2_queries = touche_topics['query_antonym2']

In [27]:
# Important for making the 'pyserini' library work.
# Installing linux related stuff for pyserini.
!sudo apt-get install libomp-dev
# Installing important packages for building the new index on merged documents.
!pip install pyserini
!pip install faiss

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following packages were automatically installed and are no longer required:
  libnvidia-common-460 nsight-compute-2020.2.0
Use 'sudo apt autoremove' to remove them.
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 42 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 0s (767 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialo

Collecting faiss
  Downloading faiss-1.5.3-cp37-cp37m-manylinux1_x86_64.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.2 MB/s 
Installing collected packages: faiss
Successfully installed faiss-1.5.3


In [28]:
# Relevant for the script, loading the already stored and built index.
from pyserini.search.lucene import LuceneSearcher
searcher_opt = LuceneSearcher('/content/drive/MyDrive/touche-2022-prototyping/indexes/baseline_index')
searcher_opt.set_bm25(1.2, 0.68)

In [29]:
# Relevant for the script, getting the driving query retrieval in the solution dictionary.
solution_dict = {}
for id_, q_ in zip(touche_topics['number'], queries):
    hits = searcher_opt.search(q_.strip(), k=1500)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict[id_] = d_list

In [30]:
# Relevant for the script, getting the acronym query retrieval in the solution dictionary.
solution_dict_arc = {}
for id_, q_ in zip(touche_topics['number'], queries_arc):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_arc[id_] = d_list

In [31]:
# Relevant for the script, getting the driving query retrieval in the solution dictionary.
solution_dict_pseudo = {}
for id_, q_ in zip(touche_topics['number'], queries_pseudo):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_pseudo[id_] = d_list

In [32]:
# Relevant for the script, getting the noun query retrievals.
solution_dict_noun = {}
for id_, q_ in zip(touche_topics['number'], noun_queries):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_noun[id_] = d_list

In [33]:
# Relevant for the script, getting the base query retrievals.
solution_dict_base = {}
for id_, q_ in zip(touche_topics['number'], base_queries):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_base[id_] = d_list

In [34]:
# Relevant for the script, getting the syn1 query retrievals.
solution_dict_syn1 = {}
for id_, q_ in zip(touche_topics['number'], syn1_queries):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_syn1[id_] = d_list

In [35]:
# Relevant for the script, getting the syn2 query retrievals.
solution_dict_syn2 = {}
for id_, q_ in zip(touche_topics['number'], syn2_queries):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_syn2[id_] = d_list

In [36]:
# Relevant for the script, getting the syn3 query retrievals.
solution_dict_syn3 = {}
for id_, q_ in zip(touche_topics['number'], syn3_queries):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_syn3[id_] = d_list

In [37]:
# Relevant for the script, getting the ant1 query retrievals.
solution_dict_ant1 = {}
for id_, q_ in zip(touche_topics['number'], ant1_queries):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_ant1[id_] = d_list

In [38]:
# Relevant for the script, getting the ant2 query retrievals.
solution_dict_ant2 = {}
for id_, q_ in zip(touche_topics['number'], ant2_queries):
    hits = searcher_opt.search(q_.strip(), k=3000)
    d_list = []
    for h_ in hits:
        # d_= h_.docid.split('___')[0]
        d_= h_.docid
        if d_ not in d_list:
            d_list.append(d_)
    solution_dict_ant2[id_] = d_list

In [40]:
# IMPORTANT: Main document merging logic, you might require a function or I'll recommend straight up adding it in the voting/expansion script.
# Solution dictionary final below, will store list of all document (but not in ordered manner) to be fed into monoT5.
# The order doesn't matter as monoT5 ingnores all the associated scores.
# solution_dict_1500 = {}
solution_dict_2000 = {}
# Iterating over all the pooled queries that we have created, plus our driving query.
for (k,v), (k1,v1), (k2,v2), (k3,v3), (k4,v4), (k5,v5), (k6,v6), (k7,v7), (k8,v8), (k9,v9) \
    in zip(solution_dict.items(), solution_dict_noun.items(), solution_dict_base.items(), solution_dict_syn1.items(), \
           solution_dict_syn2.items(), solution_dict_syn3.items(), solution_dict_ant1.items(), solution_dict_ant2.items(), \
           solution_dict_arc.items(), solution_dict_pseudo.items() ):
    # Check for all k's being equal is quite unnecessary though.
    if k == k1 and k1 == k2 and k2 == k3 and k3 == k4 and k4 == k5 and k5 == k6 and k6 == k7 and k7 == k8 and k8 == k9:
        # For storing the list of 1500 relevant documents as per our merging logic.
        l_temp = []

        # Finding commonality amongst documents from different queries.
        # Below 'v' represents documents from driving main query.
        # We essentially check, whether the given document v_ in v is present
        # in any of the other retrievals by other pool of expanded queries.
        # If, v_ is present in any of the documents we add it into the l_temp.
        v_x = v[:1000] # Limiting only to 1000 common entries only.
        for v_ in v_x:
            c_b = 0
            if v_ in v1:
                c_b = c_b + 1
            if v_ in v2:
                c_b = c_b + 1
            if v_ in v3:
                c_b = c_b + 1
            if v_ in v4:
                c_b = c_b + 1
            if v_ in v5:
                c_b = c_b + 1
            if c_b >= 1:
                l_temp.append(v_)

        # For the query we will append documents
        # till it reaches the 1125 document count for l_temp.
        diff_ = 1050 - len(l_temp)

        # Appending document logic.
        c_ = 0
        for vx_ in v8: # Appending acronym query documents.
            if vx_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(vx_)
            elif c_ > diff_:
                break
            c_ = c_ + 1

        # Appending document logic.
        c_ = 0
        for v1_ in v1: # Appending base query documents.
            if v1_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v1_)
            elif c_ > 125:
                break
            c_ = c_ + 1

        c_ = 0
        for v2_ in v2: # Appending noun query documents.
            if v2_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v2_)
            elif c_ > 125:
                break
            c_ = c_ + 1

        c_ = 0
        for v3_ in v3: # Appending syn1 documents.
            if v3_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v3_)
            elif c_ > 35:
                break
            c_ = c_ + 1

        c_ = 0
        for v4_ in v4: # Appending syn2 documents.
            if v4_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v4_)
            elif c_ > 34:
                break
            c_ = c_ + 1

        c_ = 0
        for v5_ in v5: # Appending syn3 documents.
            if v5_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v5_)
            elif c_ > 33:
                break
            c_ = c_ + 1

        c_ = 0
        for v6_ in v6: # Appending ant documents.
            if v6_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v6_)
            elif c_ > 25:
                break
            c_ = c_ + 1

        c_ = 0
        for v7_ in v7: # Appending ant documents.
            if v7_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v7_)
            elif c_ > 25:
                break
            c_ = c_ + 1
        
        c_ = 0
        for v8_ in v: # Appending org. documents.
            if v8_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v8_)
            elif c_ > 175:
                break
            c_ = c_ + 1

        for v9_ in v9: # Appending pseudo relevance documents.
            if v9_ not in l_temp: # Only relevant non-existing docs added.
                l_temp.append(v9_)

        # solution_dict_1500[k] = l_temp[:1500]
        solution_dict_2000[k] = l_temp[:2000]

In [41]:
# Storing values present in dictionaries.
# ids_1500 = []
ids_2000 = []
# res_1500 = []
res_2000 = []
# for x, y in solution_dict_1500.items():
#     ids_1500.append(x)
#     res_1500.append(y)

for x, y in solution_dict_2000.items():
    ids_2000.append(x)
    res_2000.append(y)

# print(set(ids_1500) == set(ids_2000))
# print(len(ids_1500), len(res_1500), len(res_1500[1]))
print(len(ids_2000), len(res_2000), len(res_2000[1]))

50 50 2000


In [42]:
# Storing the final results in pandas csv form and jsonlines form for the retrieval.
# query1500_df = pd.DataFrame({'query_id': ids_1500,'query':queries,'document_ids': res_1500})
query2000_df = pd.DataFrame({'query_id': ids_2000,'query':queries,'document_ids': res_2000})

In [43]:
# query1500_df.head()

In [44]:
query2000_df.head()

Unnamed: 0,query_id,query,document_ids
0,2,"Which is better, a laptop or a desktop?","[clueweb12-0608wb-66-03868___1, clueweb12-0001..."
1,3,"Which is better, Canon or Nikon?","[clueweb12-0405wb-11-27558___46, clueweb12-110..."
2,8,What are the advantages and disadvantages of P...,"[clueweb12-0008wb-36-19669___42, clueweb12-180..."
3,9,Why is Linux better than Windows?,"[clueweb12-0608wb-30-15417___40, clueweb12-060..."
4,12,Train or plane? Which is the better choice?,"[clueweb12-1204wb-20-15853___8, clueweb12-1019..."


In [47]:
# Saving dataframes as csv files.
# query1500_df.to_csv('drive/MyDrive/touche-2022-prototyping/topics-task2-2022/initial-retrieval-1500.csv', encoding='utf-8', index=False)
query2000_df.to_csv('drive/MyDrive/touche-2022-prototyping/topics-task2-2022/pseudo-initial-retrieval-2000.csv', encoding='utf-8', index=False)

In [46]:
!pip install jsonlines
import jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [49]:
# Writing the above dataframes in jsonline files.
# output1500 = jsonlines.open('drive/MyDrive/touche-2022-prototyping/topics-task2-2022/retrieval-1500.jsonl', 'w')
output2000 = jsonlines.open('drive/MyDrive/touche-2022-prototyping/topics-task2-2022/pseudo-retrieval-2000.jsonl', 'w')

# for x, q, y in zip(query1500_df['query_id'], queries, query1500_df['document_ids']):
#     output1500.write({
#             'query_id': x,
#             'query': q,
#             'document_ids': y
#         })

for x, q, y in zip(query2000_df['query_id'], queries, query2000_df['document_ids']):
    output2000.write({
            'query_id': x,
            'query': q,
            'document_ids': y
        })

In [50]:
# Creating file as input for the monoT5 model.
# Loading the dataframe files of the saved results from the 
# query1500_df = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/initial-retrieval-1500.csv')
# print(query1500_df.head())
query2000_df = pd.read_csv('/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/pseudo-initial-retrieval-2000.csv')
print(query2000_df.head())

   query_id                                              query  \
0         2            Which is better, a laptop or a desktop?   
1         3                   Which is better, Canon or Nikon?   
2         8  What are the advantages and disadvantages of P...   
3         9                  Why is Linux better than Windows?   
4        12        Train or plane? Which is the better choice?   

                                        document_ids  
0  ['clueweb12-0608wb-66-03868___1', 'clueweb12-0...  
1  ['clueweb12-0405wb-11-27558___46', 'clueweb12-...  
2  ['clueweb12-0008wb-36-19669___42', 'clueweb12-...  
3  ['clueweb12-0608wb-30-15417___40', 'clueweb12-...  
4  ['clueweb12-1204wb-20-15853___8', 'clueweb12-1...  


In [51]:
# Storing the input file results from the monoT5 model into list
# for dataframe conversion.
# input_1500 = []
input_2000 = []

In [52]:
# Installing jsonlines library.
!pip install jsonlines
import jsonlines



In [53]:
# Loading the document corpus for getting document content and processing it.
qcorpus_list = list(jsonlines.open('/content/drive/MyDrive/touche-2022-prototyping/touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl'))

In [54]:
# NLTK related import statements.
# import statements, must required for the final script.
# All the document processing related import statements.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [63]:
# Creating an input file for indexing with preprocessing in the pyserini compatible format.
doc_dictionary = {}
# Loading all the stop words frome english language.
# stop_words = set(stopwords.words("english"))
for vals in tqdm(qcorpus_list):
    document = vals['contents']
    document = document.lower()
    # Remove punctuation and numbers.
    document = re.sub(r"[^A-Za-z\s]+", " ", document)
    # Remove extra XML query tag.
    document = document.replace("query"," ")
    # Remove extra white spaces.
    document = re.sub(r"[\s]{2,}", " ", document)
    # Tokenize string.
    # word_tokens = word_tokenize(document)
    # Remove stop words.
    # word_tokens = [token for token in word_tokens if token not in stop_words]
    # word_tokens = [token for token in word_tokens if len(token) > 1]
    # Combining word tokens for results.
    results = document
    document = results.strip()
    doc_dictionary[vals['id']] = document

100%|██████████| 868655/868655 [01:15<00:00, 11577.16it/s]


In [56]:
# For converting string representation of list into actual lists.
import ast

In [61]:
def query_basic_preprocess(input_query):
    input_query = input_query.lower()
    # Remove punctuation and numbers.
    input_query = re.sub(r"[^A-Za-z\s]+", " ", input_query)
    # Remove extra white spaces.
    input_query = re.sub(r"[\s]{2,}", " ", input_query)
    # Tokenize string.
    # word_tokens = word_tokenize(input_query)
    # Remove stop words.
    # word_tokens = [token for token in word_tokens if token not in stop_words]
    # word_tokens = [token for token in word_tokens if len(token) > 1]
    # Combining word tokens for results.
    # results = " ".join(word_tokens)
    output_query = input_query.strip()
    return output_query

In [64]:
# apply basic preprocessing to the input queries.
query2000_df['query'] = query2000_df['query'].apply(lambda x: query_basic_preprocess(x))

In [66]:
for ti_, q_, doc_list in zip(query2000_df['query_id'], query2000_df['query'], query2000_df['document_ids']):
    doc_list = ast.literal_eval(doc_list)
    doc_list = [n.strip() for n in doc_list]
    for doc_ in doc_list:
        input_2000.append({"title_id":ti_,"title":q_,'doc_id': doc_,"content": doc_dictionary[doc_]})

In [67]:
input_2000[:10]

[{'content': 'desktop or laptop what the tech homeforumscontact hijackthissearchhelp please visit our forums for help with malware removal or any tech support question take me to the forums desktop or laptop aug posted by hemal in hardware do i buy a laptop or a desktop computer for my next machine i am not sure if laptops are just replacing desktops and if i should just not waste my money on a desktop or if they are still better computers thanks laptops are becoming more prevalent in the tech world today becoming the staple of computing especially on the college scene desktops the office work horse are slowly becoming edged out of the picture due to their lack of portability but are they being underrated if you re considering buying either a new desktop or laptop you have to view the pros and cons of both and whether which would serve you better let s outline a few do you need a laptop or desktop',
  'doc_id': 'clueweb12-0608wb-66-03868___1',
  'title': 'which is better a laptop or a 

In [None]:
# for ti_, q_, doc_list in zip(query1500_df['query_id'], query1500_df['query'], query1500_df['document_ids']):
#     doc_list = ast.literal_eval(doc_list)
#     doc_list = [n.strip() for n in doc_list]
#     for doc_ in doc_list:
#         input_1500.append({"title_id":ti_,"title":q_,'doc_id': doc_,"content": doc_dictionary[doc_]})

In [57]:
# input_1500[:10]

In [58]:
# input_1500_df=pd.DataFrame(input_1500)
# print(input_1500_df.head())

In [None]:
# input_1500_df.to_csv("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/monoT5-input-1500.csv", index=False)

In [68]:
input_2000_df=pd.DataFrame(input_2000)
print(input_2000_df.head())

   title_id                                  title  \
0         2  which is better a laptop or a desktop   
1         2  which is better a laptop or a desktop   
2         2  which is better a laptop or a desktop   
3         2  which is better a laptop or a desktop   
4         2  which is better a laptop or a desktop   

                          doc_id  \
0  clueweb12-0608wb-66-03868___1   
1  clueweb12-0001wb-72-30848___8   
2  clueweb12-0907wb-60-03588___2   
3  clueweb12-1804wb-90-15876___2   
4  clueweb12-0601wb-28-19879___1   

                                             content  
0  desktop or laptop what the tech homeforumscont...  
1  desktop replacements are usually larger and he...  
2  there has been a turnaround in pricing trends ...  
3  therefore hard drives in a laptop usually have...  
4  do you like laptops or desktops better mister ...  


In [69]:
input_2000_df.to_csv("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-2022/pseudo-monoT5-input-2000.csv", index=False)