# Information Retrieval assignment 2

In [1]:
from gensim.models import KeyedVectors
import pandas as pd
import time
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from sklearn.metrics.pairwise import cosine_similarity
from gensim.corpora import Dictionary
from sklearn.preprocessing import normalize
from nltk.corpus import stopwords
import re
import subprocess
from tqdm import tqdm_notebook as tqdm
from nltk.corpus import stopwords
#import unidecode

import os
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/Home'

import jnius_config
jnius_config.set_classpath("/Users/silvan/ucph/Block4-19/IR2019/assignments/assignment_2/anserini/target/anserini-0.5.1-SNAPSHOT-fatjar.jar")

from jnius import autoclass
JString = autoclass('java.lang.String')
JSearcher = autoclass('io.anserini.search.SimpleSearcher')

searcher = JSearcher(JString('/Users/silvan/ucph/Block4-19/IR2019/assignments/assignment_2/anserini/lucene-index.robust04.pos+docvectors+rawdocs'))

### Load in the query data

In [2]:
#reads the file into one big string
with open('04.testset', 'r') as file:
    data = file.read()
    
#regular expressions for pulling out the querys and their id's     
idreg = re.compile('<num.*\s(\d+)')
qid = re.findall(idreg, data)

titreg = re.compile('<title>\s(.+)')
qry = re.findall(titreg, data)

qrys = list(zip(qid,qry))

### Load in the GLoVe word embedding

In [3]:
start = time.time()

wv = KeyedVectors.load_word2vec_format('glove/glove.6B.300d.w2vformat.txt')

end = time.time()
print(round(end - start,2))

105.74


## Functions for basic BM25 + TREC_eval

In [4]:
def retrieve_ranking(inp, fixed= 'Q0', run='BM25run', runlen = 1000):
        qid, qry = inp
        hits = searcher.search(JString(qry), runlen)
        scores, docids = [h.score for h in hits], [h.docid for h in hits]
        hitslen = len(hits)
        qids, ranks = [qid] * hitslen, np.arange(1,hitslen+1), 
        runs, fixeds = [run] * hitslen, [fixed] * hitslen 
        df = pd.DataFrame({'topid' : qids, 
                           'fixed' : fixeds,
                           'docid' : docids,
                           'rank'  : ranks,
                           'score' : scores,
                           'runid' : runs})
        #duplicates appear, drop them except first
        df = df.drop_duplicates(subset='docid', keep='first')
        return df
    
def all_rankings(qrys, run='BM25run', runlen=1000):
    frames = []
    for qry in tqdm(qrys):
        df = retrieve_ranking(qry, run=run, runlen=runlen)
        frames.append(df)
    return pd.concat(frames)

In [13]:
def trec(run, qrl):
    te = 'trec_eval.9.0/trec_eval'
    trec = subprocess.run([te, '-q', '-m', 'all_trec', qrl, run], stdout=subprocess.PIPE).stdout.decode('utf-8')
    #MEAN AVERAGE PREICISON
    map_re = re.compile('\smap\s*\tall\t(\d.+)')
    mr = re.findall(map_re, trec)

    #MAP at cutoffs {5,10,20}
    map_5_re = re.compile('\smap_cut_5\s*\tall\t(\d.+)')
    mr5 = re.findall(map_5_re, trec)

    map_10_re = re.compile('\smap_cut_10\s*\tall\t(\d.+)')
    mr10 = re.findall(map_10_re, trec)

    map_20_re = re.compile('\smap_cut_20\s*\tall\t(\d.+)')
    mr20 = re.findall(map_20_re, trec)
    
    return mr, mr5, mr10, mr20

In [14]:
def print_trec(mr, mr5, mr10, mr20):
    print('MAP:        ' +str(mr[0]))
    print('MAP_CUT_5:  '+str(mr5[0]))
    print('MAP_CUT_10: ' +str(mr10[0]))
    print('MAP_CUT_20: '+str(mr20[0]))

## Query expansions

In [15]:
def centroid(qry, we, v=3, dim=300):
    cent = np.zeros(dim)
    qid, qs = qry
    qs = qs.split()
    #build the vector representation of the query
    for q in qs:
        try:
            vec = we.get_vector(q.lower())
            cent += vec
        except:
            continue
    #get v similar words to vector, and append them to query
    v = v + len(qs)
    sim = we.similar_by_vector(cent, topn=v)
    query_exp = [t for t,s in sim[len(qs):v]]
    res = qs + query_exp
    res = ' '.join(res)
    res = unidecode.unidecode(res)
    return (qid,res)

In [16]:
def fusion(qry, we, v=3, n=5, dim=300, comb='sum'):
    qid, qs = qry
    qs = qs.split()
    scores, d = [], {}
    
    #create the top n lists for each query term
    #conversion to probabilities skipped
    for q in qs:
        parens = '(){}<>'
        reg = re.compile('[%s]' % parens)
        q = reg.sub('', q)
        try:
            vec = we.get_vector(q.lower())
            sim = we.similar_by_vector(vec, topn=n+1)
           # res = [k for k,v in sim][1:]
            scores.append(sim)
        except:
            continue
        
    for s in scores:
            for k,val in s[1:]:
                try:
                    if comb == 'sum':
                        d[k] += val
                    if comb == 'max':
                        if d[k] < val:
                            d[k] = val
                except:
                    d[k] = val
    
    tops = (sorted(d.items(), key = lambda x: int(x[1]), reverse  = True))
    top = [k for k,v in tops][:v]
    qry = qs + top
    #qry = list(set(qry))  #removes duplicates, but does not preserve order
    qry = ' '.join(qry)
    qry = unidecode.unidecode(qry)
    return (qid,qry)

## Saliency-weighted semantic network

In [17]:
def preprocess_text(df, column):
    #turn to lower case
    df[column] = df[column].apply(lambda x: " ".join(x.lower() for x in x.split()))
    #throw away some punctuation 
    df[column] = df[column].str.replace('[^\w\s]','')
    #throw away stop words
    stop = stopwords.words('english')
    #outcomment next line for production
    extra_words = ['br', 'text', 'bfn', 'language', 'article', 'date', 'headline']
    stop += extra_words
    df[column] = df[column].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    df[column] = df[column].apply(lambda x: x[:].split(' '))
    return df

#retrieves runlen initial rankings using basic BM25
def retrieve_init(inp, runlen=1000):
    qid, qry = inp
    hits = searcher.search(JString(qry), runlen)
    conts, docids = [h.content for h in hits], [h.docid for h in hits] 
    df = pd.DataFrame({'cont' : conts, 'docids' : docids, 'qryid': qid})
    df = preprocess_text(df, 'cont')
        #docs.append(df)
    return df

In [18]:
#saliency-weighted semantic network
def get_swsn(inp, we, runid, b=0.75, k1=1.2, fixed='Q0', runlen=1000, ranks=1):
    
    #unpack inputs
    qid, ss = inp
    ss = ss.split()
    qid = [qid] * runlen

    rank_init = retrieve_init(inp, runlen=1000)
    corpus = Dictionary(rank_init['cont'].values)
    
    #get the average document length, and precompute some values for loop optimization
    avgsl = int(np.mean(np.array([len(x) for x in rank_init['cont'].values])))
    kp1, bm1, bavgsl = k1+1, 1-b, b * avgsl
    scores = []
    
    #loop over each document in initial rankings
    for sl in tqdm(rank_init['cont'].values):
        score = 0
        #loop over each word in the document
        for w in sl:
            sem = 0
            tmp_list = []
            #loop over each word in the query
            for wprime in ss:
                try:
                    tmp = we.similarity(w, wprime.lower())
                except:
                    tmp = 0
                tmp_list.append(tmp)
            sem = max(tmp_list) 
            try:
                tok = corpus.token2id[w]
                dfs = corpus.dfs[tok]
            except:
                dfs = 0
            idf = np.log((corpus.num_docs/dfs))
            num = sem * kp1
            denom = sem + (k1 * (bm1 + bavgsl * len(ss)))
            score += idf*(num/denom)
        scores.append(score)
    #sort the scores and docids
    npscores = np.array(scores)
    sort_idx = np.argsort(npscores)
    sort_scores = np.sort(npscores)
    doc_ids = rank_init['docids'].values
    sort_docids = np.array(doc_ids)[sort_idx]       
    
    #return a single df for a single query, ready to be concat to trec_evals texts
    df = pd.DataFrame({'qryids' : qid, 
                       'fixed' : fixed, 
                       'docids' : sort_docids[::-1], 
                       'ranks' : ranks,
                       'scores': sort_scores[::-1],
                       'run': runid})
    return df


def swsn(qrys, we, runlen=1000, runid='SWSN', b = 0.75, k1=1.2, fixed='Q0'):
    dfs = []
    #these are exactly the same for each run, so precompute them to avoid doing it inside loops
    runid_list = [runid] * runlen
    fixed_list = [fixed] * runlen
    ranks_list = np.arange(1,runlen+1)
    for qry in tqdm(qrys):
        df = get_swsn(qry, we, runid=runid_list, fixed=fixed_list, ranks=ranks_list)
        dfs.append(df)
    return pd.concat(dfs)

## Main entry function for the 3 ranking functions

In [19]:
#main function, wrapper for the 3 different ranking functions
def ad_hoc_retrieval(qrys, 
                     we, 
                     k1=1.2, 
                     b=0.75, 
                     runlen=1000, 
                     mode='BM25', 
                     p=False,
                     v=3,
                     n=5,
                     comb='sum',
                     dim=300):
    
    searcher.setBM25Similarity(k1,b)
    qrl = 'qrels.robust2004.txt'
    
    if mode == 'BM25':
        path = 'BM25run.txt'
        df = all_rankings(qrys, runlen=runlen)
    
    elif mode == 'centroid':
        path = 'Assignment2/centroidrun.txt'
        exp_qrys = [centroid(qry, wv) for qry in tqdm(qrys)]
        df = all_rankings(exp_qrys,runlen=runlen)
        
    elif mode == 'fusion':
        if comb == 'sum':
            path = 'Assignment2/fusion_sumrun.txt'
        elif comb == 'max':
            path = 'Assignment2/fusion_maxrun.txt'
        else:
            print('Comb option for fusion not supported. Choose between \'sum\ and \'max\'')
        exp_qrys = [fusion(qry, wv, v=v, n=n, comb=comb) for qry in tqdm(qrys)]
        df = all_rankings(exp_qrys, runlen=runlen)
        
    elif mode == 'swsn':
        path = 'Assignment2/swsnrun.txt'
        df = swsn(qrys, wv, runlen=runlen, b=b, k1=k1)
    
    else:
        print('Mode not supported')
        return 0
    
    df.to_csv(path, sep=' ', index=False, header=False)
    mr, mr5, mr10, mr20 = trec(path, qrl)
    if p:
        print_trec(mr, mr5, mr10, mr20)
    return mr

## Run the code

In [71]:
swsn_run = ad_hoc_retrieval(qrys[:3], wv, mode='swsn', runlen=1000, b=0.75, k1=1.2, p=True) 
swsn_run

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

['0.0630']

In [73]:
fusion_sum = ad_hoc_retrieval(qrys, wv, mode ='fusion', comb='max', v=3, n=5, p=True)
fusion_sum

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

MAP:        0.1393
MAP_CUT_5:  0.0387
MAP_CUT_10: 0.0579
MAP_CUT_20: 0.0762


['0.1393']

In [52]:
cent = ad_hoc_retrieval(qrys, wv, mode='centroid', p=True)
cent

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

MAP:        0.1570
MAP_CUT_5:  0.0459
MAP_CUT_10: 0.0654
MAP_CUT_20: 0.0879


['0.1570']

In [20]:
mr = ad_hoc_retrieval(qrys, wv, p=True)
mr

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))


MAP:        0.2362
MAP_CUT_5:  0.0712
MAP_CUT_10: 0.1054
MAP_CUT_20: 0.1408


['0.2362']