# Word embeddings for ad-hoctext retrieval

## Parse Queries + Data

In [1]:
text_corpus = dict()
   
corpus = open ("data/trec_corpus.txt", "r")
    
for line in corpus.readlines():
    splitted_line = line.split(" ")
    text = splitted_line[1:]
    docid = splitted_line[0]
    text_corpus[docid] = text

del corpus

In [2]:
# Example document
print(text_corpus["FBIS3-11318"])

['drlat', 'o', 'fbis', 'lat', 'document', 'type', 'daily', 'report', 'mar', 'colombia', 'criticizes', 'drug', 'consuming', 'countries', 'pa', 'santa', 'fe', 'de', 'bogota', 'inravision', 'television', 'cadena', 'in', 'spanish', 'gmt', 'mar', 'pa', 'santa', 'fe', 'de', 'bogota', 'inravision', 'television', 'cadena', 'language', 'spanish', 'article', 'type', 'bfn', 'from', 'the', 'ntc', 'news', 'newscast', 'text', 'faced', 'with', 'criticism', 'triggered', 'by', 'the', 'cali', 'cartel', 'surrender', 'process', 'the', 'prosecutor', 'general', 's', 'office', 'has', 'issued', 'a', 'communique', 'the', 'communique', 'from', 'the', 'prosecutor', 'general', 'contains', 'seven', 'specific', 'points', 'his', 'activities', 'have', 'been', 'directed', 'at', 'the', 'fight', 'against', 'all', 'manifestations', 'of', 'crime', 'the', 'destruction', 'of', 'the', 'medellin', 'cartel', 'was', 'due', 'to', 'constant', 'work', 'by', 'the', 'prosecutor', 'general', 's', 'office', 'number', 'as', 'heard', 'a

In [3]:
text_queries = []
with open("data/queries.txt") as f:
    lines = ''.join(f.readlines())
text_queries = [line.rstrip().split() for line in lines.split('\n')[:-1]]

In [4]:
# Example query
print(text_queries[0])

['301', 'international', 'organized', 'crime']


## BM25 Calculation

In [8]:
import math
import numpy as np

class BM25:
    def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25):
        self.corpus_size = len(corpus)
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        
        # Calc parameters
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon

        nd = self._initialize(corpus)
        self._calc_idf(nd)


    def _initialize(self, corpus):
        nd = {}
        num_doc = 0
        for document in corpus:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.doc_freqs.append(frequencies)

            for word, freq in frequencies.items():
                if word not in nd:
                    nd[word] = 0
                nd[word] += 1

        self.avgdl = num_doc / self.corpus_size
        return nd
    
    def _calc_idf(self, nd):
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        # collect words with negative idf to set them a special epsilon value.
        # idf can be negative if word is contained in more than half of documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        self.average_idf = idf_sum / len(self.idf)

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score

## Running BM25

In [9]:
bm25 = BM25(text_corpus.values())

In [10]:
topic = text_queries[0][0]
doc_scores = bm25.get_scores(text_queries[0][1:])

In [11]:
text_queries[0][1:]

['international', 'organized', 'crime']

In [12]:
f = open('data/run.bm25.txt', 'a')
for query in text_queries:
    topic = query[0]
    tokenized_query = query[1:]
    doc_scores = bm25.get_scores(tokenized_query)
    keys = list(text_corpus.keys())

    topic_list = []
    for item in range(len(doc_scores)):
        topic_list.append((topic,keys[item],doc_scores[item]))
    topic_list.sort(key=lambda tup: tup[2], reverse=True)
    # filter out topics with relavance score 0
    topics_list_filtered = [tup for tup in topic_list if tup[2] != 0]
    for item in range(len(topics_list_filtered)):
        (topic, docid, score) = topics_list_filtered[item]
        print(topic," ","Q0"," ",docid," ",item+1," ",score," ","BM25", file=f)   
    print(topics_list_filtered[0])   
f.close() 


('301', 'FBIS4-41991', 18.50459297775577)
('302', 'FBIS4-67701', 37.802274284004476)
('303', 'FT921-7107', 35.62332964986995)
('304', 'FR940419-2-00009', 31.402554438878067)
('305', 'LA112489-0003', 14.380538952839482)
('306', 'FT921-13505', 19.769796389716703)
('307', 'FBIS4-33622', 21.05319224961245)
('308', 'LA070489-0051', 26.137889127694713)
('309', 'LA090190-0055', 23.040548848799215)
('310', 'FT931-11958', 37.44866841403548)
('311', 'FT944-15440', 19.399126422497098)
('312', 'FBIS4-67023', 12.216144829799484)
('313', 'FBIS3-59655', 44.31177385782762)
('314', 'FR941104-1-00033', 15.092100297277502)
('315', 'FR940628-1-00073', 21.005182966866162)
('316', 'FT932-4228', 17.096984424217453)
('317', 'LA080889-0065', 18.58488534454721)
('318', 'LA101890-0067', 15.016472624550534)
('319', 'FBIS4-67889', 15.14589948644035)
('320', 'FBIS4-25142', 38.929938795389546)
('321', 'FBIS4-26295', 21.68415019488701)
('322', 'FT944-8297', 12.99205728641839)
('323', 'LA102689-0011', 23.9625919631275

('640', 'FT931-8172', 26.2894494736206)
('641', 'LA120989-0014', 29.24495311376587)
('642', 'FBIS3-3664', 27.821347175699106)
('643', 'FT943-6186', 29.168951750794868)
('644', 'LA070190-0086', 22.285888471917865)
('645', 'FT944-16684', 26.579627080835625)
('646', 'LA041590-0171', 24.08434273027045)
('647', 'FBIS4-47519', 19.406024589680282)
('648', 'FR941202-0-00181', 15.919130158625375)
('649', 'FT944-9024', 23.163291011301677)
('650', 'LA102690-0100', 25.5789525059822)
('651', 'LA070290-0060', 25.605861962307713)
('652', 'FBIS4-54764', 26.610033659203626)
('653', 'LA093090-0190', 35.42400410606862)
('654', 'LA120789-0105', 19.315502176154318)
('655', 'FR940106-1-00025', 20.607319133577818)
('656', 'LA032590-0171', 25.05972412167151)
('657', 'LA090389-0150', 25.21679585909343)
('658', 'LA041890-0107', 22.12546083574931)
('659', 'LA090990-0124', 19.566233193159682)
('660', 'LA123189-0136', 28.206301071318542)
('661', 'LA080190-0001', 25.614995650433958)
('662', 'LA020790-0135', 9.55507

## Extending Query BM25

In [13]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
model.most_similar(positive=text_queries[0][1:],topn=4)

[('transnational_organized', 0.565650224685669),
 ('Organised', 0.5449141263961792),
 ('el_Gweini', 0.5151761770248413),
 ('crimes', 0.5112009048461914)]

### Centroid

In [None]:
# centroid aproach (taking average)
f = open('data/expanded.centroid.queries.txt', 'a')
for query in text_queries:
    vectors = np.zeros(300)
    for word in query[1:]:
        if word in model.vocab:
            vectors += model[word]
    similar_words = model.similar_by_vector(vectors/len(query[1:]),topn=3)
    expansion = " ".join([ x[0] for x in similar_words ])
    print(" ".join(query),expansion, file=f)
f.close()

In [None]:
expanded_queries = []
with open("data/expanded.centroid.queries.txt") as f:
    lines = ''.join(f.readlines())
expanded_queries = [line.rstrip().split() for line in lines.split('\n')[:-1]]

In [None]:
f = open('data/run.expanded.centroid.bm25.txt', 'a')
for query in expanded_queries:
    topic = query[0]
    tokenized_query = query[1:]
    doc_scores = bm25.get_scores(tokenized_query)
    keys = list(text_corpus.keys())

    topic_list = []
    for item in range(len(doc_scores)):
        topic_list.append((topic,keys[item],doc_scores[item]))
    topic_list.sort(key=lambda tup: tup[2], reverse=True)
    # filter out topics with relavance score 0
    topics_list_filtered = [tup for tup in topic_list if tup[2] != 0]
    for item in range(len(topics_list_filtered)):
        (topic, docid, score) = topics_list_filtered[item]
        print(topic," ","Q0"," ",docid," ",item+1," ",score," ","BM25+QE", file=f)   
    print(topics_list_filtered[0])   
f.close()

('301', 'FBIS4-18122', 35.45937763206223)
('302', 'FBIS4-67701', 68.81975756051388)
('303', 'FT921-7107', 77.15575738076538)
('304', 'FR940617-0-00103', 53.37472021960415)
('305', 'LA113089-0027', 30.343961685514365)
('306', 'FT921-13505', 39.539592779433406)
('307', 'FBIS4-33116', 43.41750447047306)
('308', 'LA070489-0051', 67.3745070334407)
('309', 'LA090190-0055', 39.52033769478694)
('310', 'FT931-11958', 64.23601632777786)
('311', 'FBIS4-19393', 48.08824677745489)
('312', 'LA040289-0050', 37.329566327515515)
('313', 'FBIS3-59655', 76.3244558706656)
('314', 'FBIS3-22391', 35.301520392583654)
('315', 'LA092789-0116', 37.31830258761538)
('316', 'FT932-4228', 34.193968848434906)
('317', 'LA080889-0065', 37.16977068909442)
('318', 'LA101890-0067', 35.876710139064656)
('319', 'FBIS4-46063', 36.23823813415579)
('320', 'FBIS4-20427', 61.733618261607674)
('321', 'FBIS4-26295', 37.42402339751883)
('322', 'FT944-8297', 25.98411457283678)
('323', 'LA102689-0011', 47.92518392625508)
('324', 'FB

### Fusion-based

In [None]:
# fusion aproach (taking average)
f = open('data/expanded.fusion.queries.txt', 'a')
for query in text_queries:
    expanding_words = []
    for word in query[1:]:
        if word in model.vocab:
            expanding_words.append(model.similar_by_word(word,topn=1))
    expanding_words = sum(expanding_words, [])
    expansion = " ".join([ x[0] for x in expanding_words ])
    print(" ".join(query),expansion, file=f)
f.close()

In [None]:
expanded_queries = []
with open("data/expanded.fusion.queries.txt") as f:
    lines = ''.join(f.readlines())
expanded_queries = [line.rstrip().split() for line in lines.split('\n')[:-1]]

In [None]:
f = open('data/run.expanded.fusion.bm25.txt', 'a')
for query in expanded_queries:
    topic = query[0]
    tokenized_query = query[1:]
    doc_scores = bm25.get_scores(tokenized_query)
    keys = list(text_corpus.keys())

    topic_list = []
    for item in range(len(doc_scores)):
        topic_list.append((topic,keys[item],doc_scores[item]))
    topic_list.sort(key=lambda tup: tup[2], reverse=True)
    # filter out topics with relavance score 0
    topics_list_filtered = [tup for tup in topic_list if tup[2] != 0]
    for item in range(len(topics_list_filtered)):
        (topic, docid, score) = topics_list_filtered[item]
        print(topic," ","Q0"," ",docid," ",item+1," ",score," ","BM25+QE", file=f)   
    print(topics_list_filtered[0])   
f.close()

## BM25 word embeddings

In [None]:
# Extension of BM25 Class, WIP
class BM25WE(BM25):
    def __init__(self, corpus):
        super().__init__(corpus)
        
    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score