# Word embeddings for ad-hoctext retrieval

In [1]:
import pandas as pd
import numpy as np
import subprocess
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
import re
# Use anserini and run java code
import os
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/Home'

import jnius_config
jnius_config.set_classpath("/Users/silvan/ucph/Block4-19/IR2019/assignments/assignment_2/anserini/target/anserini-0.5.1-SNAPSHOT-fatjar.jar")

from jnius import autoclass
JString = autoclass('java.lang.String')
JSearcher = autoclass('io.anserini.search.SimpleSearcher')

searcher = JSearcher(JString('/Users/silvan/ucph/Block4-19/IR2019/assignments/assignment_2/anserini/lucene-index.robust04.pos+docvectors+rawdocs'))

## Parse Queries

In [2]:
text_queries = []
with open("data/04.testset") as f:
    lines = f.readlines()
line_num = 0    
for line in lines:
    line_num += 1
    if line.startswith('<num> Number:'):
        currentId = line.replace('<num> Number:', '').strip()
    if line.startswith('<title>'):
        text = line.replace('<title>', '').strip()
        if not text:
            text = lines[line_num]
        if currentId is not None and text is not None:
            text_queries.append((currentId,text))

In [3]:
train_queries = text_queries[:int(0.8 * len(text_queries))]
test_queries = text_queries[int(0.8 * len(text_queries)):]
print(len(train_queries))
print(len(test_queries))

200
50


In [4]:
fold1, fold2, fold3, fold4, fold5 = np.array_split(train_queries,5)
folds = [fold1, fold2, fold3, fold4, fold5]
print(len(fold1))
print(len(fold2))
print(len(fold3))
print(len(fold4))
print(len(fold5))

40
40
40
40
40


In [5]:
# Example query
print(text_queries[0])

('301', 'International Organized Crime')


## BM25

In [6]:
# use anserini to get BM25 rankings
def get_ranking(query_text,run_length=1000,k1=1.5, b=0.75):
    searcher.setBM25Similarity(k1,b)
    hits = searcher.search(JString(query_text),run_length)
    scores, docids = [h.score for h in hits], [h.docid for h in hits]
    return (scores, docids)

In [7]:
def get_ranking_all(queries,output_file,k1=1.5, b=0.75,fixed='Q0',run='BM25',run_length=100):
    all_data = pd.DataFrame()
    topic_list = []
    for query in queries:
        topic, query_text = query
        doc_scores, doc_ids = get_ranking(query_text,run_length,k1=k1,b=b)

        hitslen = len(doc_ids)
        qids, ranks = [topic] * hitslen, np.arange(1,hitslen+1), 
        runs, fixeds = [run] * hitslen, [fixed] * hitslen
        # found scores and ids to dataframe
        topic_df = pd.DataFrame({'docid' : doc_ids,'score' : doc_scores})
        # runs need format: topic fixed docid rank score run
        topic_df.insert(0, 'topic', topic)
        topic_df.insert(1, 'fixed', 'Q0')
        topic_df.reset_index()
        topic_df.insert(3, 'rank', topic_df.index+1)
        topic_df.insert(5, 'run', run)
        topic = topic_df.drop_duplicates(subset='docid', keep='first')
        topic_list.append(topic)
    all_data = pd.concat(topic_list)
    all_data.to_csv(output_file, header=None, index=None, sep=' ', mode='w')

In [8]:
def trec_eval(qrels,run_output):
    trec = "trec_eval.9.0/trec_eval"
    cli_output = subprocess.run([trec, '-q', '-m', 'all_trec', qrels, run_output], stdout=subprocess.PIPE).stdout.decode('utf-8')
    cli_output=cli_output.split("\n")
    map_values = []
    for line in cli_output:
        line = line.split('\t')
        try:
            map_outputs = ['map','map_cut_5','map_cut_10','map_cut_20']
            if line[0].strip() in map_outputs and line[1] == 'all':
                map_values.append((line[0],line[2]))
        except:
            continue
    return map_values

In [9]:
qrels = "data/qrels.robust2004.txt"
run = "data/run.bm25.txt"
k1 = [0.5,1,1.5,2.0]
b = [0.25,0.5,0.75,1]

parameters = []
for i in range(len(k1)):
    for o in range(len(b)):
        for n in range(5):
            get_ranking_all(queries=folds[n],output_file=run,k1=k1[i],b=b[o])
            parameters.append((float(trec_eval(qrels,run)[0][1]),k1[i],b[o]))

In [10]:
sorted_parameters = sorted(parameters, key=lambda tup: tup[0], reverse=True)
_, k1, b = sorted_parameters[0]
get_ranking_all(queries=test_queries,output_file=run,k1=k1,b=b)
# Eval with trac
trec_eval(qrels,run)

[('map                   ', '0.2326'),
 ('map_cut_5             ', '0.0914'),
 ('map_cut_10            ', '0.1358'),
 ('map_cut_20            ', '0.1773')]

## Extending Query BM25

In [11]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

### Centroid

In [12]:
# centroid aproach (taking average)
def qe_centroid_method(queries,n):
    expanded_queries_centroid = []
    for query in queries:
        topic, query_text = query
        query_terms = query_text.lower().split()
        vectors = np.zeros(300) # dim 300
        for word in query_terms:
            if word in model.vocab:
                vectors += model[word]
        similar_words = model.similar_by_vector(vectors,topn=n)
        expansion = " ".join([ x[0] for x in similar_words ])
        query_terms = " ".join(query_terms)
        expanded_query = query_terms + " " + expansion
        expanded_queries_centroid.append((topic,expanded_query))
    return expanded_queries_centroid

In [13]:
run = 'data/run.expanded.centroid.bm25.txt'
n = [1,2,3,4]
parameters = []
for i in range(len(n)):
    for o in range(len(folds)):
        fold_queries = qe_centroid_method(folds[o],n[i])
        get_ranking_all(queries=fold_queries,output_file=run,run="BM25+QE",k1=k1,b=b)
        parameters.append((float(trec_eval(qrels,run)[0][1]),n[i]))

In [14]:
sorted_parameters = sorted(parameters, key=lambda tup: tup[0], reverse=True)
_, n = sorted_parameters[0]
exp_queries_cent = qe_centroid_method(text_queries,n)
get_ranking_all(queries=exp_queries_cent,output_file=run,run="BM25+QE",k1=k1,b=b)

In [15]:
qrels = "data/qrels.robust2004.txt"
trec_eval(qrels,run)

[('map                   ', '0.1832'),
 ('map_cut_5             ', '0.0751'),
 ('map_cut_10            ', '0.1089'),
 ('map_cut_20            ', '0.1408')]

### Fusion-based

In [16]:
def softmax(similar_words):
    exps = [(i,np.exp(v)) for i,v in similar_words]
    exps_sum = [np.exp(v) for i,v in similar_words]
    sum_of_exps = sum(exps_sum)
    return [(i,v/sum_of_exps) for i,v in exps]

In [17]:
# fusion aproach
def fusion_method(queries,n,v):
    expanded_queries_fusion = []
    for query in queries:
        query_dict = {} # get rid of double words, set highest one
        topic, query_text = query
        query_terms = query_text.lower().split()
        expanding_words = []
        for word in query_terms:
            if word in model.vocab:
                expanding_words.append(model.similar_by_word(word,topn=n))
        for word in expanding_words:
            for word,sim in word:
                try:
                    if query_dict[word] < sim:
                        query_dict[word] = sim
                except:
                    query_dict[word] = sim 
        
        sorted_words = sorted(softmax(query_dict.items()),key=lambda x: x[1], reverse=True)[:v]
        expansion = " ".join([ x[0] for x in sorted_words ])
        expanded_queries_fusion.append((topic, query_text + " " + expansion.replace("ñ", "n"))) # quick fix Spanish character

    return expanded_queries_fusion

In [18]:
n = [1,3]
v = [1,3]
run = 'data/expanded.fusion.queries.txt'
parameters = []
for p in range(len(folds)):
    for i in range(len(n)):
        for o in range(len(v)):
            fold_fus = fusion_method(folds[p],n[i],v[o])
            get_ranking_all(queries=fold_fus,output_file=run,run="BM25+QE")
            parameters.append((float(trec_eval(qrels,run)[0][1]),n[i],v[o]))

In [19]:
sorted_parameters = sorted(parameters, key=lambda tup: tup[0], reverse=True)
_, n, v = sorted_parameters[0]
exp_queries_fus = fusion_method(test_queries,n,v)
get_ranking_all(queries=exp_queries_fus,output_file=run,run="BM25+QE")

In [20]:
qrels = "data/qrels.robust2004.txt"
trec_eval(qrels,run)

[('map                   ', '0.1903'),
 ('map_cut_5             ', '0.0778'),
 ('map_cut_10            ', '0.1100'),
 ('map_cut_20            ', '0.1420')]

## BM25 word embeddings

In [21]:
# Text needs to be split in single words, remove unneeded texts
REPLACE_BY_SPACE = re.compile('[/(){}\[\]|@,;]')
BAD_SYMBOLS = re.compile('[^a-z #+_]')
TAGS = re.compile('<[^<]+?>')
STOPWORDS = set(stopwords.words('english'))

def preprocess_text(content):
    content = content.lower()
    content = REPLACE_BY_SPACE.sub(' ', content)
    content = BAD_SYMBOLS.sub(' ', content)
    content = TAGS.sub('', content)
    content = ' '.join(word for word in content.split() if word not in STOPWORDS)
    remove_more_words = ['bfn','b','f'] # not in word2vec vocab and also not needed
    content = ' '.join(word for word in content.split() if word not in remove_more_words)
    content = content.split()
    return content

def get_ranking_with_content(query,run_length=2):
    _, query_text = query
    hits = searcher.search(JString(query_text), run_length)
    topic_content, doc_ids = [h.content for h in hits], [h.docid for h in hits] 
    topic_content_df = pd.DataFrame({'content': topic_content, 'docid': doc_ids})
    topic_content_df['content'] = topic_content_df['content'].apply(preprocess_text)
    return topic_content_df

In [22]:
def bm25_we(queries,output_file,k1=1.5,b=0.75,run='BM25+WE',run_length=100):
    topic_list = []
    for query in queries:
        topic, query_text = query
        query_terms = query_text.lower().split()
        bm25_ranking = get_ranking_with_content(query,run_length=run_length)
        docids = bm25_ranking['docid'].values
        content = bm25_ranking['content'].values
        word_corpus = Dictionary(content)
        
        # average sentence length
        avgsl = int(np.mean(np.array([len(x) for x in content])))

        scores = []
        for document in content:
            score = 0
            # loop through longer text, that we don't overlook any term
            for word in document:
                sem = 0
                sim_list = []
                for term in query_terms:
                    # word might not be in vocab, so similarity = 0
                    try:
                        # cosine similarity
                        sim = model.similarity(word,term)
                    except:
                        sim = 0
                    sim_list.append(sim)
                
                sem = max(sim_list)
                
                token = word_corpus.token2id[word]
                doc_fqs = word_corpus.dfs[token]
                
                idf = np.log(word_corpus.num_docs/doc_fqs)
                top_frac = sem * (k1 + 1)
                text_frac = len(query_terms)/avgsl
                lower_frac = sem + k1 * (1-b+b*text_frac)
                score += idf * top_frac/lower_frac
            scores.append(score)
        topic_df = pd.DataFrame({'docid': docids,'score': scores}).sort_values(by=['score'], ascending=False)
        # runs need format: topic fixed docid rank score run
        topic_df.insert(0, 'topic', topic)
        topic_df.insert(1, 'fixed', 'Q0')
        topic_df.reset_index()
        topic_df.insert(3, 'rank', topic_df.index+1)
        topic_df.insert(5, 'run', run)
        topic_df = topic_df.drop_duplicates(subset='docid', keep='first')
        topic_list.append(topic_df)
    
    all_data = pd.concat(topic_list)
    all_data.to_csv(output_file, header=None, index=None, sep=' ', mode='w')

In [23]:
k1 = [1,1.5,2]
b = [0.5,0.75,1]
parameters = []
for p in range(len(folds)):
    for i in range(len(k1)):
        for o in range(len(b)):
            fold_we = bm25_we(folds[p],'data/run.we.bm25.txt',k1[i],b[o])
            parameters.append((float(trec_eval(qrels,'data/run.we.bm25.txt')[0][1]),k1[i],b[o]))

In [24]:
sorted_parameters = sorted(parameters, key=lambda tup: tup[0], reverse=True)
_, k1, b = sorted_parameters[0]
bm25_we(test_queries,'data/run.we.bm25.txt',k1,b)

In [25]:
trec_eval(qrels,'data/run.we.bm25.txt')

[('map                   ', '0.1458'),
 ('map_cut_5             ', '0.0414'),
 ('map_cut_10            ', '0.0577'),
 ('map_cut_20            ', '0.0897')]