In [None]:
# TASK 1: Analysis of step-one performance retrieval.
# First thing, start easy: build two-indexes, evaluate them top-m retrievals
# Hit-once and Hit-all. Then see other metrics like precision & recall + Roni's analysis.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# cleaning sample_data directory
!rm -r sample_data

In [None]:
!ls drive/My\ Drive/touche-2022-prototyping

dataset-prep-and-retrieval-diagnostic-analysis.ipynb
indexes
initial-retrieval-metric-analysis.ipynb
merged_documents
missed_1000.csv
missed_1200.csv
missed_1375.csv
missed_1500.csv
missed_250.csv
missed_500.csv
missed_786.csv
missed_full_1000.csv
missed_full_1375.csv
missed_full_1500.csv
missed_full_250.csv
missed_full_500.csv
missed_full_786.csv
mon-duo-retrieval-prototyping-and-analysis.ipynb
query-expansion-retrieval-metric-analysis.ipynb
topics-task2-51-100.xml
topics-task-2.xml
touche2020-task2-relevance-withbaseline.qrels
touche_complete_topics.csv
touche_ground_truth.csv
touche_results_2021.csv
touche-task2-51-100-relevance.qrels
touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl


In [None]:
import logging
import tarfile
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
# from typing import List
import spacy
# en_core_web_sm
import string
from tqdm import tqdm
from spacy.lang.en.stop_words import STOP_WORDS
import random
import numpy as np
import pandas as pd
random.seed(10)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
topics = [
    "What is the difference between sex and love?",
    "Which is the highest mountain in the world?",
    "Which is better, a laptop or a desktop?",
]

df = pd.DataFrame(
    list(zip(range(0,len(topics)),topics, topics, len(topics) * ["original"])),
    columns=["TopicID","topic", "query", "tag"],
)

expansion = QueryExpansion(df)
df_demo = expansion.expansion(
    relation=True, synonyms=True, sensevec=False, embedded=True
)
df_demo

Relation progress: 100%|██████████| 3/3 [00:00<00:00, 79.64it/s]
Synonyms progress: 100%|██████████| 3/3 [00:02<00:00,  1.33it/s]
Embeddings progress: 100%|██████████| 3/3 [00:01<00:00,  2.00it/s]


Unnamed: 0,TopicID,topic,query,tag
0,0,What is the difference between sex and love?,What is the difference between sex and love?,original
1,1,Which is the highest mountain in the world?,Which is the highest mountain in the world?,original
2,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop?",original
0,0,What is the difference between sex and love?,difference sex and love,annotation
1,1,Which is the highest mountain in the world?,highest mountain world,annotation
2,2,"Which is better, a laptop or a desktop?",better laptop or desktop,annotation
0,0,What is the difference between sex and love?,What is the difference between sex and love co...,syns
1,1,Which is the highest mountain in the world?,Which is the highest mountain in the world dea...,syns
2,2,"Which is better, a laptop or a desktop?",Which is better a laptop or a desktop screen c...,syns
0,0,What is the difference between sex and love?,What is the differance between sex and love?,embedded_1


In [None]:
!ls

drive


In [None]:
# one-time pre-loading of libraries.
import spacy
sp = spacy.load("en_core_web_sm")
all_stopwords = sp.Defaults.stop_words

In [None]:
text = sp("Nick likes to play football, however he is not too fond of tennis.")
# text_tokens = word_tokenize(text)
# print(text
#       )
token_list = []
token_tag_list = []
for token in text:
    token_list.append(token.text)
    token_tag_list.append(token.tag_)
print(token_list)
print(token_tag_list)

['Nick', 'likes', 'to', 'play', 'football', ',', 'however', 'he', 'is', 'not', 'too', 'fond', 'of', 'tennis', '.']
['NNP', 'VBZ', 'TO', 'VB', 'NN', ',', 'RB', 'PRP', 'VBZ', 'RB', 'RB', 'JJ', 'IN', 'NN', '.']


In [None]:
def synonym_antonym_extractor(w_):
     # extracting relevant synonyms and antonym pairs.
     from nltk.corpus import wordnet
     synonyms = []
     antonyms = []

     for syn in wordnet.synsets(w_):
          for l in syn.lemmas():
               synonyms.append(str(l.name()).replace('_',' '))
               if l.antonyms():
                    antonyms.append(str(l.antonyms()[0].name()).replace('_',' '))
     return list(set(synonyms)), list(set(antonyms))
# synonym_antonym_extractor('better')

In [None]:
import gensim.downloader as api
# overview of all models in gensim: https://github.com/RaRe-Technologies/gensim-data
model_glove = api.load("glove-wiki-gigaword-300")



In [None]:
# Getting 3 best synonyms and antonym pairs for a give 'ADJ'
def best_words_extractor(w_, syn_list, ann_list):
    # prepare dictionaries for scores of syn_list & ann_list with w_
    syns = []
    anns = []
    for s_ in syn_list:
        try:
            if model_glove.get_vector(s_) is not None and s_ != w_:
                syns.append(s_)
        except:
            pass
    # print(syns)
    for a_ in ann_list:
        try:
            if model_glove.get_vector(a_) is not None and a_ != w_:
                anns.append(a_)
        except:
            pass
    # print(anns)
    syn_score_dict = {}
    ann_score_dict = {}
    for s_ in syns:
        syn_score_dict[s_] = model_glove.distance(w_, s_)
    for a_ in anns:
        ann_score_dict[a_] = model_glove.distance(w_, a_)

    # getting only the top-k most similar synonyms and most dissimilar antonyms
    # values from the dictionaries and creating score permutations for them.

    syn_score_dict = {k: v for k, v in sorted(syn_score_dict.items(), key=lambda item: item[1])}
    # print(syn_score_dict)
    syn_score_dict = {k: syn_score_dict[k] for k in list(syn_score_dict)[:20]}
    # print(syn_score_dict)
    ann_score_dict = {k: v for k, v in reversed(sorted(ann_score_dict.items(), key=lambda item: item[1]))}
    # print(ann_score_dict)
    ann_score_dict = {k: ann_score_dict[k] for k in list(ann_score_dict)[:20]}
    # print(ann_score_dict)

    # for synonyms preparing min similarity score sum and max difference score of the combinations.
    # rev_syn_score_dict = {}
    syn_min = 1000
    syn_max = -1000
    syn_data = []
    for i_ in syn_score_dict.keys():
        for j_ in syn_score_dict.keys():
            for k_ in syn_score_dict.keys():
                if i_ != j_ and j_ != k_ and i_ != k_ :
                    # print(i[0],j[0],k[0])
                    scr_1 = model_glove.distance(i_, j_) + \
                           model_glove.distance(j_, k_) + \
                           model_glove.distance(k_, i_)
                    scr_1 = round(scr_1, 5)
                    scr_2 = syn_score_dict[i_] + syn_score_dict[j_] + syn_score_dict[k_]
                    scr_2 = round(scr_2, 5)
                    if scr_1 > syn_max and scr_2 < syn_min:
                        syn_max = scr_1
                        syn_min = scr_2
                        syn_data = [i_,j_,k_]
                    
    # for anonyms preparing max disssimilarity score sum and max difference score.
    # score_func = 0.5*max_sim w/ w_ + 0.5*max_sim w/ each other terms.
    # two cooperative scores, best to select the maximum value from the combinations.
    ann_max = -1000
    ann_data = []
    for i_ in ann_score_dict.keys():
        for j_ in ann_score_dict.keys():
            for k_ in ann_score_dict.keys():
                if i_ != j_ and j_ != k_ and i_ != k_ :
                    # print(i[0],j[0],k[0])
                    scr_1 = model_glove.distance(i_, j_) + \
                           model_glove.distance(j_, k_) + \
                           model_glove.distance(k_, i_) + \
                           ann_score_dict[i_] + ann_score_dict[j_] + ann_score_dict[k_]
                    scr_1 = round(scr_1, 5)
                    if scr_1 > ann_max:
                        ann_max = scr_1
                        ann_data = [i_,j_,k_]
    # print(ann_data)
    return syn_data, ann_data 

In [None]:
# testing the outputs for above defined functions.
syns, anons = synonym_antonym_extractor('better')
print(syns)
print(anons)
print(best_words_extractor('better', syns, anons))

['best', 'full', 'amend', 'estimable', 'intimately', 'honorable', 'effective', 'well', 'comfortably', 'advantageously', 'salutary', 'unspoiled', 'expert', 'skilful', 'adept', 'beneficial', 'ameliorate', 'ripe', 'break', 'undecomposed', 'respectable', 'right', 'safe', 'unspoilt', 'better', 'good', 'serious', 'easily', 'substantially', 'meliorate', 'wagerer', 'practiced', 'considerably', 'upright', 'in force', 'secure', 'skillful', 'punter', 'improve', 'just', 'in effect', 'proficient', 'bettor', 'near', 'sound', 'honest', 'dear', 'dependable']
['ill', 'evil', 'disadvantageously', 'worse', 'badly', 'worsen', 'bad']
(['good', 'well', 'improve'], ['evil', 'worsen', 'badly'])


In [None]:
# return two expanded query versions.
# nouns only, top-3 synonyms and antonyms queries.
def get_comparation_superlation_nouns(query):
    nouns_as_string = []
    restricted_nouns_as_string = []
    doc = sp(query)
    annotations = ["CC", "CD", "JJ", "JJR", "JJS",
            "RB", "RBR", "RBS", "NN", "NNS", "NNP",
            "NNPS", "VB"]
    annotations_except_nouns = ["CC", "CD", "JJ", "JJR", "JJS",
            "RB", "RBR", "RBS", "VB"]

    adj_flg = 0
    adj_val = 'better' # default value, query objectives.

    # appending data into nouns as string
    for token in doc:
        if token.tag_ in annotations:
            nouns_as_string.append(token.text)
            if token.tag_ in annotations_except_nouns and adj_flg == 0:
                adj_val = token.text
                adj_flg = 1
            if token.tag_ not in annotations_except_nouns:
                restricted_nouns_as_string.append(token.text)

    # appending top-3 syns and anons to the query
    adj_val= adj_val.lower()
    syns, anons = synonym_antonym_extractor(adj_val)
    # print(syns, anons)
    if len(syns) == 0:
        syns, _ = synonym_antonym_extractor('different')
    if len(anons) == 0:
        _, anons = synonym_antonym_extractor('better')
    
    syns_fin, anons_fin = best_words_extractor(adj_val,syns, anons)

    # queries preprepartion
    base_query = " ".join(nouns_as_string)
    temp_query = " ".join(restricted_nouns_as_string)
    if len(syns) < 3:
        syn_query = " ".join(syns).strip() + " " + temp_query
    else:
        syn_query = " ".join(syns_fin).strip() + " " + temp_query
    if len(anons) < 3:
        ann_query = " ".join(anons).strip() + " " + temp_query
    else:
        ann_query = " ".join(anons_fin).strip() + " " + temp_query

    return base_query.strip(), syn_query.strip(), ann_query.strip()

In [None]:
topics = [
    "What is the difference between sex and love?",
    "Which is the highest mountain in the world?",
    "Which is better, a laptop or a desktop?",
]
for q in topics:
    print(get_comparation_superlation_nouns(q))
    print('\n')

('difference sex and love', 'different unlike dissimilar difference sex love', 'worsen evil badly difference sex love')


('highest mountain world', 'high eminent high-pitched mountain world', 'low mountain world')


('better laptop or desktop', 'good well improve laptop desktop', 'evil worsen badly laptop desktop')




In [None]:
topics_large = [
    "What is the difference between sex and love?",
    "Which is better, a laptop or a desktop?",
    "Which is better, Canon or Nikon?",
    "What are the best dish detergents?",
    "What are the best cities to live in?",
    "What is the longest river in the U.S.?",
    "Which is healthiest: coffee, green tea or black tea and why?",
    "What are the advantages and disadvantages of PHP over Python and vice versa?",
    "Why is Linux better than Windows?",
    "How to sleep better?",
    "Should I buy an LCD TV or a plasma TV?",
    "Train or plane? Which is the better choice?",
    "What is the highest mountain on Earth?",
    "Should one prefer Chinese medicine or Western medicine?",
    "What are the best washing machine brands?",
    "Should I buy or rent?",
    "Do you prefer cats or dogs, and why?",
    "What is the better way to grill outdoors: gas or charcoal?",
    "Which is better, MAC or PC?",
    "What is better: to use a brush or a sponge?",
    "Which is better, Linux or Microsoft?",
    "Which is better, Pepsi or Coke?",
    "What is better, Google search or Yahoo search?",
    "Which one is better, Netflix or Blockbuster?",
    "Which browser is better, Internet Explorer or Firefox?",
    "Which is a better vehicle: BMW or Audi?",
    "Which one is better, an electric stove or a gas stove?",
    "What planes are best, Boeing or Airbus?",
    "Which is better, Disneyland or Disney World?",
    "Should I buy an Xbox or a PlayStation?",
    "Which has more caffeine, coffee or tea?",
    "Which is better, LED or LCD Reception Displays?",
    "What is better: ASP or PHP?",
    "What is better for the environment, a real or a fake Christmas tree?",
    "Do you prefer tampons or pads?",
    "What IDE is better for Java: NetBeans or Eclipse?",
    "Is OpenGL better than Direct3D in terms of portability to different platforms?",
    "What are the differences between MySQL and PostgreSQL in performance?",
    "Is Java code more readable than code written in Scala?",
    "Which operating system has better performance: Windows 7 or Windows 8?",
    "Which smartphone has a better battery life: Xperia or iPhone?",
    "Which four wheel truck is better: Ford or Toyota?",
    "Should I prefer a Leica camera over Nikon for portrait photographs?",
    "Which company has a larger capitalization: Apple or Microsoft?",
    "Which laptop has a better durability: HP or Dell?",
    "Which beverage has more calories per glass: beer or cider?",
    "Is admission rate in Stanford higher than that of MIT?",
    "Is pasta healthier than pizza?",
    "Which city is more expensive to live in: San Francisco or New York?",
    "Whose salary is higher: basketball or soccer players?",
]
for q in topics_large:
    print(get_comparation_superlation_nouns(q))
    print('\n')

('difference sex and love', 'different unlike dissimilar difference sex love', 'worsen evil badly difference sex love')


('better laptop or desktop', 'good well improve laptop desktop', 'evil worsen badly laptop desktop')


('better Canon or Nikon', 'good well improve Canon Nikon', 'evil worsen badly Canon Nikon')


('best dish detergents', 'better good well detergents', 'ill evil badly detergents')


('best cities live', 'better good well cities', 'ill evil badly cities')


('longest river U.S.', 'long tenacious retentive river U.S.', 'short unretentive river U.S.')


('healthiest coffee green tea or black tea and', 'healthy levelheaded level-headed coffee tea tea', 'unhealthy coffee tea tea')


('advantages and disadvantages PHP Python and vice versa', 'different unlike dissimilar advantages disadvantages PHP Python vice', 'worsen evil badly advantages disadvantages PHP Python vice')


('Linux better Windows', 'good well improve Linux Windows', 'evil worsen badly Linux Windows')


(

In [None]:
from xml.dom import minidom
# define function for loading all the topics from the topics files.
def parse_xml(path):
  answer_list = []
  xmldoc = minidom.parse(path)
  itemlist = xmldoc.getElementsByTagName('topics')
  topic_list = itemlist[0].getElementsByTagName('topic')
  for topic in topic_list:
    tuple_for_add = tuple((topic.getElementsByTagName('number')[0].firstChild.nodeValue, topic.getElementsByTagName('title')[0].firstChild.nodeValue))
    answer_list.append(tuple_for_add)
  parsed=pd.DataFrame(answer_list, columns=["number","title"])
  return parsed

# preparing the list of topics and corresponding dataframe.
topics_2020 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task-2.xml")
topics_2021 = parse_xml("/content/drive/MyDrive/touche-2022-prototyping/topics-task2-51-100.xml")
touche_topics = topics_2020.append(topics_2021, ignore_index=True)
touche_topics

Unnamed: 0,number,title
0,1,\nWhat is the difference between sex and love?\n
1,2,"\nWhich is better, a laptop or a desktop?\n"
2,3,"\nWhich is better, Canon or Nikon?\n"
3,4,\nWhat are the best dish detergents?\n
4,5,\nWhat are the best cities to live in?\n
...,...,...
95,96,"Which is healthier to wear, boxers or briefs?"
96,97,What is the difference between a blender vs a ...
97,98,"Which is better, rock or rap?"
98,99,Do you think imagination is better than knowle...


In [61]:
 touche_topics['title'] = touche_topics['title'].apply(lambda x: x.strip())
 touche_topics['query_base'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[0])
 touche_topics['query_synonym'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[1])
 touche_topics['query_antonym'] = touche_topics['title'].apply(lambda x: get_comparation_superlation_nouns(x)[2])

In [62]:
touche_topics

Unnamed: 0,number,title,query_base,query_synonym,query_antonym
0,1,What is the difference between sex and love?,difference sex and love,different unlike dissimilar difference sex love,worsen evil badly difference sex love
1,2,"Which is better, a laptop or a desktop?",better laptop or desktop,good well improve laptop desktop,evil worsen badly laptop desktop
2,3,"Which is better, Canon or Nikon?",better Canon or Nikon,good well improve Canon Nikon,evil worsen badly Canon Nikon
3,4,What are the best dish detergents?,best dish detergents,better good well detergents,ill evil badly detergents
4,5,What are the best cities to live in?,best cities live,better good well cities,ill evil badly cities
...,...,...,...,...,...
95,96,"Which is healthier to wear, boxers or briefs?",healthier wear boxers or briefs,healthy fitter salubrious boxers briefs,unhealthy boxers briefs
96,97,What is the difference between a blender vs a ...,difference blender food processor,good well improve difference blender food proc...,evil worsen badly difference blender food proc...
97,98,"Which is better, rock or rap?",better rock or rap,good well improve rock rap,evil worsen badly rock rap
98,99,Do you think imagination is better than knowle...,think imagination better knowledge,believe guess imagine imagination knowledge,forget imagination knowledge


In [64]:
touche_topics.to_csv("/content/drive/MyDrive/touche-2022-prototyping/touche_topics_query_expansion.csv", index=False)