# Word embeddings for ad-hoc text retrieval

## Parse Queries + Data

In [None]:
# Map each document id to the list of tokens of that document.
text_corpus = dict()

# Every non-blank line is read as "<docid> <token> <token> ...".
# The context manager closes the file even if parsing fails, and iterating
# over the handle avoids loading the whole corpus into memory at once.
with open("data/trec_corpus.txt", "r") as corpus_file:
    for line in corpus_file:
        # split() without arguments drops the trailing newline and collapses
        # repeated whitespace, so no empty or newline-suffixed tokens are
        # stored (queries are tokenised the same way).
        fields = line.split()
        if not fields:
            continue  # skip blank lines instead of registering a bogus docid
        docid, *text = fields
        text_corpus[docid] = text

In [None]:
# Example document: print the token list stored for one document id
print(text_corpus["FBIS3-10319"])

In [None]:
# Parse one query per line into a list of whitespace-separated tokens.
# Iterating over the file keeps the final query even when the file does not
# end with a newline; blank lines still yield an empty token list, so list
# positions keep matching line numbers.
with open("data/queries.txt") as f:
    text_queries = [line.split() for line in f]

In [None]:
# Example query: print the token list of the first parsed query
print(text_queries[0])

In [None]:
class InvertedIndex:
	"""Inverted index mapping each word to a ``{docid: count}`` dictionary.

	``count`` is the number of times ``add(word, docid)`` was called for
	that pair.
	"""

	def __init__(self):
		# word -> {docid -> number of occurrences of word in docid}
		self.index = dict()

	def __contains__(self, item):
		"""Return True if ``item`` has been added as a word."""
		return item in self.index

	def __getitem__(self, item):
		"""Return the ``{docid: count}`` dictionary of word ``item``.

		Raises KeyError if the word is not in the index.
		"""
		return self.index[item]

	def add(self, word, docid):
		"""Record one more occurrence of ``word`` in document ``docid``."""
		postings = self.index.setdefault(word, {})
		postings[docid] = postings.get(docid, 0) + 1

	#frequency of word in document
	def get_document_frequency(self, word, docid):
		"""Return how often ``word`` was added for document ``docid``.

		Raises LookupError if the word is unknown or does not occur in
		the given document.
		"""
		if word not in self.index:
			raise LookupError('%s not in index' % str(word))
		postings = self.index[word]
		if docid not in postings:
			raise LookupError('%s not in document %s' % (str(word), str(docid)))
		return postings[docid]

	#frequency of word in index, i.e. number of documents that contain word
	def get_index_frequency(self, word):
		"""Return the number of documents that contain ``word``.

		Raises LookupError if the word is not in the index.
		"""
		if word not in self.index:
			# str() keeps the message formatting safe for tuple-valued words,
			# which would otherwise be unpacked as format arguments.
			raise LookupError('%s not in index' % str(word))
		return len(self.index[word])


class DocumentLengthTable:
	"""Table mapping a document id to the length recorded for it."""

	def __init__(self):
		# docid -> length as passed to add()
		self.table = dict()

	def __len__(self):
		"""Return the number of documents in the table."""
		return len(self.table)

	def add(self, docid, length):
		"""Store ``length`` for ``docid``, replacing any previous value."""
		self.table[docid] = length

	def get_length(self, docid):
		"""Return the length stored for ``docid``.

		Raises LookupError if the document is not in the table.
		"""
		if docid not in self.table:
			raise LookupError('%s not found in table' % str(docid))
		return self.table[docid]

	def get_average_length(self):
		"""Return the mean of all stored lengths as a float.

		An empty table yields 0.0 instead of raising ZeroDivisionError.
		"""
		if not self.table:
			return 0.0
		# Use the builtin sum() rather than a local accumulator that shadows it.
		return float(sum(self.table.values())) / float(len(self.table))


def build_data_structures(corpus):
	"""Build the inverted index and the document length table of a corpus.

	``corpus`` maps a document id to an iterable of words with a length.
	Returns an ``(InvertedIndex, DocumentLengthTable)`` tuple. Both
	structures are keyed by ``str(docid)``, so ids read from the index can
	be looked up in the length table.
	"""
	idx = InvertedIndex()
	dlt = DocumentLengthTable()
	for docid, words in corpus.items():
		# Normalise the id once so that index and table agree on the key.
		key = str(docid)

		#build inverted index
		for word in words:
			idx.add(str(word), key)

		#build document length table
		dlt.add(key, len(words))
	return idx, dlt

In [None]:
class QueryProcessor:
	"""Score a list of queries against a corpus with BM25."""

	def __init__(self, queries, corpus):
		"""Store ``queries`` and index ``corpus`` via build_data_structures."""
		self.queries = queries
		self.index, self.dlt = build_data_structures(corpus)

	def run(self):
		"""Return one ``{docid: score}`` dictionary per query, in query order."""
		return [self.run_query(query) for query in self.queries]

	def run_query(self, query):
		"""Return ``{docid: score}`` for every document matching a query term.

		``query`` is an iterable of terms. Terms missing from the index are
		ignored; the scores of several matching terms are summed per
		document.
		"""
		query_result = dict()
		# Collection statistics do not change while a query is scored, so
		# they are computed once instead of once per posting.
		num_docs = len(self.dlt)
		avdl = None # filled lazily on the first posting
		for term in query:
			if term not in self.index:
				continue
			doc_dict = self.index[term] # retrieve index entry
			num_matching = len(doc_dict)
			for docid, freq in doc_dict.items(): #for each document and its word frequency
				dl = self.dlt.get_length(docid)
				if avdl is None:
					avdl = self.dlt.get_average_length()
				score = score_BM25(n=num_matching, f=freq, qf=1, r=0, N=num_docs,
					dl=dl, avdl=avdl) # calculate score
				if docid in query_result: #this document has already been scored once
					query_result[docid] += score
				else:
					query_result[docid] = score
		return query_result


## BM25 Calculation

In [None]:
from math import log

# BM25 parameters, read as module-level globals by score_BM25 and compute_K.
# k1: used in compute_K and in the document term-frequency factor of score_BM25
k1 = 1.2
# k2: used in the query term-frequency factor of score_BM25
k2 = 100
# b: weight of the dl/avdl ratio inside compute_K
b = 0.75
# R: used together with r in the log factor of score_BM25
# (presumably the number of known relevant documents -- TODO confirm)
R = 0.0


def score_BM25(n, f, qf, r, N, dl, avdl):
	"""Return the BM25 score contribution of one term for one document.

	As called from QueryProcessor.run_query: ``n`` is the number of
	documents containing the term, ``f`` the term count in the document,
	``qf`` the term count in the query, ``N`` the number of documents,
	``dl`` the document length and ``avdl`` the average document length.
	``r`` is combined with the module-level ``R`` in the log factor
	(presumably relevance-feedback counts -- TODO confirm).

	The log factor, and therefore the score, can be negative.
	"""
	length_norm = compute_K(dl, avdl)
	relevant_ratio = (r + 0.5) / (R - r + 0.5)
	non_relevant_ratio = (n - r + 0.5) / (N - n - R + r + 0.5)
	log_weight = log(relevant_ratio / non_relevant_ratio)
	doc_tf_factor = ((k1 + 1) * f) / (length_norm + f)
	query_tf_factor = ((k2+1) * qf) / (k2 + qf)
	return log_weight * doc_tf_factor * query_tf_factor


def compute_K(dl, avdl):
	"""Return the K term of BM25 for a length ``dl`` and average ``avdl``.

	Uses the module-level ``k1`` and ``b``. Raises ZeroDivisionError when
	``avdl`` is zero.
	"""
	length_ratio = float(dl) / float(avdl)
	return k1 * ((1 - b) + b * length_ratio)

## Running BM25

In [10]:
# Index the corpus (done inside the QueryProcessor constructor) ...
proc = QueryProcessor(text_queries, text_corpus)
# ... then score every query; results[i] is the {docid: score} dict of text_queries[i]
results = proc.run()

In [15]:
# Number of per-query result dictionaries returned by proc.run()
print(len(results))

0	Q0	FBIS3-3304	 0	14.012251566222307	NH-BM25
0	Q0	FBIS3-7969	 1	13.111052418677264	NH-BM25
0	Q0	FBIS3-6381	 2	11.420490080390735	NH-BM25
0	Q0	FBIS3-3412	 3	10.648930224417967	NH-BM25
0	Q0	FBIS3-4013	 4	10.540528255618867	NH-BM25
0	Q0	FBIS3-3866	 5	10.132560325184343	NH-BM25
0	Q0	FBIS3-6661	 6	9.106291379223629	NH-BM25
0	Q0	FBIS3-569	 7	9.10529883683841	NH-BM25
0	Q0	FBIS3-3189	 8	8.892306409576655	NH-BM25
0	Q0	FBIS3-2231	 9	8.516608776663968	NH-BM25
0	Q0	FBIS3-5118	10	7.744536500942227	NH-BM25
0	Q0	FBIS3-6459	11	7.272803128453199	NH-BM25
0	Q0	FBIS3-7880	12	7.143684665694022	NH-BM25
0	Q0	FBIS3-2934	13	7.077492304039112	NH-BM25
0	Q0	FBIS3-7993	14	6.937735977955943	NH-BM25
0	Q0	FBIS3-1606	15	6.936831875035309	NH-BM25
0	Q0	FBIS3-3437	16	6.8596082523285205	NH-BM25
0	Q0	FBIS3-7770	17	6.854275664797069	NH-BM25
0	Q0	FBIS3-3020	18	6.79264069847693	NH-BM25
0	Q0	FBIS3-7878	19	6.725011367053665	NH-BM25
0	Q0	FBIS3-7879	20	6.629706566284343	NH-BM25
0	Q0	FBIS3-5913	21	6.459715398946958	NH-BM25
0	Q0	F

20	Q0	FBIS3-7723	 5	-5.918260760110954	NH-BM25
20	Q0	FBIS3-116	 6	-6.004285936704633	NH-BM25
20	Q0	FBIS3-3463	 7	-6.203612258795733	NH-BM25
20	Q0	FBIS3-414	 8	-6.302653976456513	NH-BM25
20	Q0	FBIS3-122	 9	-6.372907904015864	NH-BM25
20	Q0	FBIS3-4743	10	-6.455002165608289	NH-BM25
20	Q0	FBIS3-683	11	-6.722645882716293	NH-BM25
20	Q0	FBIS3-5937	12	-6.747312659559084	NH-BM25
20	Q0	FBIS3-4742	13	-7.2864855793908205	NH-BM25
20	Q0	FBIS3-2935	14	-7.409486196775935	NH-BM25
20	Q0	FBIS3-7186	15	-7.535935263892257	NH-BM25
20	Q0	FBIS3-3632	16	-7.53741818875114	NH-BM25
20	Q0	FBIS3-3597	17	-7.53741818875114	NH-BM25
20	Q0	FBIS3-833	18	-7.551634544704153	NH-BM25
20	Q0	FBIS3-4436	19	-7.661226531744566	NH-BM25
20	Q0	FBIS3-4453	20	-7.702698273055243	NH-BM25
20	Q0	FBIS3-2311	21	-7.94465609383791	NH-BM25
20	Q0	FBIS3-4923	22	-8.070520845276983	NH-BM25
20	Q0	FBIS3-4314	23	-8.164172518993936	NH-BM25
20	Q0	FBIS3-7856	24	-8.261625806531086	NH-BM25
20	Q0	FBIS3-6661	25	-8.324678261025365	NH-BM25
20	Q0	FBIS3-523	26	-

40	Q0	FBIS3-3864	57	5.554125212068672	NH-BM25
40	Q0	FBIS3-305	58	5.449559298914222	NH-BM25
40	Q0	FBIS3-294	59	5.449559298914222	NH-BM25
40	Q0	FBIS3-2867	60	5.430731285594809	NH-BM25
40	Q0	FBIS3-6891	61	5.427261822011227	NH-BM25
40	Q0	FBIS3-7955	62	5.368684307619091	NH-BM25
40	Q0	FBIS3-7771	63	5.359318430392019	NH-BM25
40	Q0	FBIS3-7862	64	5.34704242368199	NH-BM25
40	Q0	FBIS3-6147	65	5.329140359517418	NH-BM25
40	Q0	FBIS3-2977	66	5.297216915251692	NH-BM25
40	Q0	FBIS3-7858	67	5.297216915251692	NH-BM25
40	Q0	FBIS3-3538	68	5.26477733739677	NH-BM25
40	Q0	FBIS3-7860	69	5.260137762609028	NH-BM25
40	Q0	FBIS3-1601	70	5.244852696558179	NH-BM25
40	Q0	FBIS3-2040	71	5.213928162533256	NH-BM25
40	Q0	FBIS3-1661	72	5.2133089527291006	NH-BM25
40	Q0	FBIS3-3338	73	5.205272820251326	NH-BM25
40	Q0	FBIS3-3977	74	5.190126715530351	NH-BM25
40	Q0	FBIS3-6457	75	5.189803288127331	NH-BM25
40	Q0	FBIS3-2111	76	5.153160361357351	NH-BM25
40	Q0	FBIS3-7726	77	5.152061349360679	NH-BM25
40	Q0	FBIS3-1078	78	5.149825878269128

58	Q0	FBIS3-3763	72	4.717499773135403	NH-BM25
58	Q0	FBIS3-4696	73	4.698013924095917	NH-BM25
58	Q0	FBIS3-7378	74	4.690826784325402	NH-BM25
58	Q0	FBIS3-4453	75	4.674066077221055	NH-BM25
58	Q0	FBIS3-5185	76	4.671089751907722	NH-BM25
58	Q0	FBIS3-5517	77	4.664453720652093	NH-BM25
58	Q0	FBIS3-3976	78	4.656264802315917	NH-BM25
58	Q0	FBIS3-2619	79	4.644160620976296	NH-BM25
58	Q0	FBIS3-2542	80	4.6383755516238825	NH-BM25
58	Q0	FBIS3-6174	81	4.628025749055028	NH-BM25
58	Q0	FBIS3-952	82	4.610608557519106	NH-BM25
58	Q0	FBIS3-6018	83	4.607464091998787	NH-BM25
58	Q0	FBIS3-5564	84	4.5921623511705345	NH-BM25
58	Q0	FBIS3-4500	85	4.58066735831228	NH-BM25
58	Q0	FBIS3-2289	86	4.574953719354849	NH-BM25
58	Q0	FBIS3-3518	87	4.572492070215975	NH-BM25
58	Q0	FBIS3-3555	88	4.546860931882692	NH-BM25
58	Q0	FBIS3-4723	89	4.544938110547341	NH-BM25
58	Q0	FBIS3-3259	90	4.5381597052123475	NH-BM25
58	Q0	FBIS3-463	91	4.53195842820713	NH-BM25
58	Q0	FBIS3-4903	92	4.522077577991743	NH-BM25
58	Q0	FBIS3-6344	93	4.4936792557629

80	Q0	FBIS3-5071	21	6.186445029216791	NH-BM25
80	Q0	FBIS3-3548	22	6.1546649611839	NH-BM25
80	Q0	FBIS3-4203	23	6.110717488405622	NH-BM25
80	Q0	FBIS3-7741	24	6.106630944845494	NH-BM25
80	Q0	FBIS3-2390	25	6.040205536560652	NH-BM25
80	Q0	FBIS3-7244	26	6.02467887780695	NH-BM25
80	Q0	FBIS3-7105	27	5.8942645074457625	NH-BM25
80	Q0	FBIS3-1372	28	5.88653054361386	NH-BM25
80	Q0	FBIS3-7387	29	5.833872693457686	NH-BM25
80	Q0	FBIS3-5251	30	5.7034610407953314	NH-BM25
80	Q0	FBIS3-3359	31	5.5345079723352395	NH-BM25
80	Q0	FBIS3-7128	32	5.4841617298590535	NH-BM25
80	Q0	FBIS3-3965	33	5.4442885730059105	NH-BM25
80	Q0	FBIS3-2719	34	5.434410713751049	NH-BM25
80	Q0	FBIS3-1972	35	5.424568633420307	NH-BM25
80	Q0	FBIS3-3182	36	5.419660949658298	NH-BM25
80	Q0	FBIS3-3861	37	5.363445895300124	NH-BM25
80	Q0	FBIS3-7743	38	5.332542208245436	NH-BM25
80	Q0	FBIS3-7907	39	5.301992609430299	NH-BM25
80	Q0	FBIS3-1674	40	5.281810131016861	NH-BM25
80	Q0	FBIS3-7914	41	5.256818927811762	NH-BM25
80	Q0	FBIS3-1201	42	5.22126342156

97	Q0	FBIS3-25	 2	7.407483568967099	NH-BM25
97	Q0	FBIS3-1359	 3	6.96658373039341	NH-BM25
97	Q0	FBIS3-3674	 4	6.305291760269352	NH-BM25
97	Q0	FBIS3-2016	 5	5.633100988767928	NH-BM25
97	Q0	FBIS3-6806	 6	5.566340174628505	NH-BM25
97	Q0	FBIS3-5495	 7	5.562462313545963	NH-BM25
97	Q0	FBIS3-44	 8	5.435268333659581	NH-BM25
97	Q0	FBIS3-1342	 9	5.393380809370402	NH-BM25
97	Q0	FBIS3-4847	10	5.389361330871576	NH-BM25
97	Q0	FBIS3-2762	11	5.375255544070436	NH-BM25
97	Q0	FBIS3-6404	12	5.310870155843536	NH-BM25
97	Q0	FBIS3-6066	13	5.296777500626197	NH-BM25
97	Q0	FBIS3-4454	14	5.013327919807025	NH-BM25
97	Q0	FBIS3-3314	15	4.9951844626021185	NH-BM25
97	Q0	FBIS3-4815	16	4.9865074336991055	NH-BM25
97	Q0	FBIS3-2724	17	4.926712192150911	NH-BM25
97	Q0	FBIS3-7860	18	4.920573576960077	NH-BM25
97	Q0	FBIS3-2672	19	4.908539629851438	NH-BM25
97	Q0	FBIS3-6726	20	4.901420617060807	NH-BM25
97	Q0	FBIS3-2675	21	4.819651330067127	NH-BM25
97	Q0	FBIS3-3001	22	4.8080422099198135	NH-BM25
97	Q0	FBIS3-7880	23	4.68943968405864

118	Q0	FBIS3-7068	19	6.758522245721688	NH-BM25
118	Q0	FBIS3-4872	20	6.757734055387829	NH-BM25
118	Q0	FBIS3-7079	21	6.732862840259403	NH-BM25
118	Q0	FBIS3-3769	22	6.588967342702167	NH-BM25
118	Q0	FBIS3-5555	23	6.588967342702167	NH-BM25
118	Q0	FBIS3-7953	24	6.503312381231593	NH-BM25
118	Q0	FBIS3-5281	25	6.356816358512465	NH-BM25
118	Q0	FBIS3-6436	26	6.342631843936123	NH-BM25
118	Q0	FBIS3-7401	27	6.338514119398021	NH-BM25
118	Q0	FBIS3-5301	28	6.206594109246666	NH-BM25
118	Q0	FBIS3-5643	29	6.101944191856423	NH-BM25
118	Q0	FBIS3-4682	30	6.084377653986109	NH-BM25
118	Q0	FBIS3-7123	31	5.939891827061423	NH-BM25
118	Q0	FBIS3-7224	32	5.930596952814296	NH-BM25
118	Q0	FBIS3-4796	33	5.866913789239418	NH-BM25
118	Q0	FBIS3-7182	34	5.847320870284005	NH-BM25
118	Q0	FBIS3-7227	35	5.786857775720861	NH-BM25
118	Q0	FBIS3-6384	36	5.7829458606162785	NH-BM25
118	Q0	FBIS3-5883	37	5.770240578956666	NH-BM25
118	Q0	FBIS3-5155	38	5.751286989505508	NH-BM25
118	Q0	FBIS3-6897	39	5.738720286633009	NH-BM25
118	Q0	FBIS3

136	Q0	FBIS3-2505	99	5.7585253681674695	NH-BM25
137	Q0	FBIS3-8	 0	8.441925798436408	NH-BM25
137	Q0	FBIS3-1965	 1	8.270075571014749	NH-BM25
137	Q0	FBIS3-3541	 2	8.083037823248688	NH-BM25
137	Q0	FBIS3-3697	 3	8.064894385488044	NH-BM25
137	Q0	FBIS3-4387	 4	7.787899215870889	NH-BM25
137	Q0	FBIS3-1851	 5	7.640678310285923	NH-BM25
137	Q0	FBIS3-7422	 6	7.554957208398603	NH-BM25
137	Q0	FBIS3-4050	 7	7.463932928363853	NH-BM25
137	Q0	FBIS3-7517	 8	7.396589951165966	NH-BM25
137	Q0	FBIS3-7258	 9	7.205828392696795	NH-BM25
137	Q0	FBIS3-7423	10	7.186106599089637	NH-BM25
137	Q0	FBIS3-1745	11	7.1836887260859985	NH-BM25
137	Q0	FBIS3-3482	12	7.157255113681366	NH-BM25
137	Q0	FBIS3-1854	13	7.152857394688935	NH-BM25
137	Q0	FBIS3-2402	14	7.081989420263376	NH-BM25
137	Q0	FBIS3-3546	15	7.026802361406042	NH-BM25
137	Q0	FBIS3-7711	16	6.975818341674257	NH-BM25
137	Q0	FBIS3-4330	17	6.943359388164888	NH-BM25
137	Q0	FBIS3-2034	18	6.814778076880382	NH-BM25
137	Q0	FBIS3-6064	19	6.806951448024201	NH-BM25
137	Q0	FBIS3-7

156	Q0	FBIS3-2648	66	5.220762268033162	NH-BM25
156	Q0	FBIS3-28	67	5.2103986739231445	NH-BM25
156	Q0	FBIS3-2163	68	5.209491645497068	NH-BM25
156	Q0	FBIS3-2495	69	5.160365036659054	NH-BM25
156	Q0	FBIS3-7720	70	5.147993623673785	NH-BM25
156	Q0	FBIS3-2369	71	5.1452754186516065	NH-BM25
156	Q0	FBIS3-2172	72	5.1436231766017215	NH-BM25
156	Q0	FBIS3-1890	73	5.128505335479468	NH-BM25
156	Q0	FBIS3-2498	74	5.120593921768113	NH-BM25
156	Q0	FBIS3-3856	75	5.1203711408275145	NH-BM25
156	Q0	FBIS3-2782	76	5.114221132321781	NH-BM25
156	Q0	FBIS3-6657	77	5.105254641572483	NH-BM25
156	Q0	FBIS3-2513	78	5.080548050953828	NH-BM25
156	Q0	FBIS3-6415	79	5.0788513254414145	NH-BM25
156	Q0	FBIS3-2255	80	5.078017344573111	NH-BM25
156	Q0	FBIS3-3341	81	5.076616587140526	NH-BM25
156	Q0	FBIS3-6079	82	5.068133262909069	NH-BM25
156	Q0	FBIS3-1996	83	5.0076090274303935	NH-BM25
156	Q0	FBIS3-5022	84	5.001060397967319	NH-BM25
156	Q0	FBIS3-2873	85	4.998000338885488	NH-BM25
156	Q0	FBIS3-4150	86	4.994944022304127	NH-BM25
156	Q0	FB

173	Q0	FBIS3-780	14	7.093745335192301	NH-BM25
173	Q0	FBIS3-944	15	7.024344038502699	NH-BM25
173	Q0	FBIS3-2578	16	6.8936715120573	NH-BM25
173	Q0	FBIS3-535	17	6.820004343782349	NH-BM25
173	Q0	FBIS3-181	18	6.787766368454693	NH-BM25
173	Q0	FBIS3-1248	19	6.767771947075408	NH-BM25
173	Q0	FBIS3-5817	20	6.731177687175455	NH-BM25
173	Q0	FBIS3-6642	21	6.692855529449149	NH-BM25
173	Q0	FBIS3-253	22	6.673415547457386	NH-BM25
173	Q0	FBIS3-5444	23	6.6405986579603065	NH-BM25
173	Q0	FBIS3-1956	24	6.595087892953357	NH-BM25
173	Q0	FBIS3-118	25	6.5892043083037395	NH-BM25
173	Q0	FBIS3-135	26	6.520916248299558	NH-BM25
173	Q0	FBIS3-1492	27	6.484109689945858	NH-BM25
173	Q0	FBIS3-7991	28	6.480558864772873	NH-BM25
173	Q0	FBIS3-423	29	6.423407116407545	NH-BM25
173	Q0	FBIS3-5223	30	6.386880174388941	NH-BM25
173	Q0	FBIS3-1518	31	6.310493840234612	NH-BM25
173	Q0	FBIS3-1581	32	6.28209952868759	NH-BM25
173	Q0	FBIS3-3872	33	6.227610076380158	NH-BM25
173	Q0	FBIS3-1313	34	6.052495459865039	NH-BM25
173	Q0	FBIS3-1349	35	6

190	Q0	FBIS3-4814	50	5.3677411637723855	NH-BM25
190	Q0	FBIS3-5684	51	5.319742972614216	NH-BM25
190	Q0	FBIS3-1497	52	5.194334470532431	NH-BM25
190	Q0	FBIS3-2650	53	5.087722198603794	NH-BM25
190	Q0	FBIS3-6524	54	4.952941596114599	NH-BM25
190	Q0	FBIS3-3525	55	4.9357644087353325	NH-BM25
190	Q0	FBIS3-3137	56	4.9357644087353325	NH-BM25
190	Q0	FBIS3-5650	57	4.915374106671164	NH-BM25
190	Q0	FBIS3-1866	58	4.804231346743612	NH-BM25
190	Q0	FBIS3-2138	59	4.7227011322692	NH-BM25
190	Q0	FBIS3-4683	60	4.571647344474257	NH-BM25
190	Q0	FBIS3-7025	61	4.5646319654423735	NH-BM25
190	Q0	FBIS3-5973	62	4.5477231851711615	NH-BM25
190	Q0	FBIS3-7028	63	4.540246787846085	NH-BM25
190	Q0	FBIS3-3417	64	4.532318393565792	NH-BM25
190	Q0	FBIS3-2326	65	4.523930199927245	NH-BM25
190	Q0	FBIS3-6434	66	4.520434037893177	NH-BM25
190	Q0	FBIS3-1928	67	4.507620359230625	NH-BM25
190	Q0	FBIS3-4788	68	4.494839527816323	NH-BM25
190	Q0	FBIS3-6310	69	4.4652758251911955	NH-BM25
190	Q0	FBIS3-6279	70	4.4652758251911955	NH-BM25
190	Q0	F