-
Notifications
You must be signed in to change notification settings - Fork 0
/
query.py
356 lines (292 loc) · 12.5 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
"""
This module implements a (partial, sample) query interface for elasticsearch movie search.
You will need to rewrite and expand sections to support the types of queries over the fields in your UI.
Documentation for elasticsearch query DSL:
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
For python version of DSL:
https://elasticsearch-dsl.readthedocs.io/en/latest/
Search DSL:
https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html
"""
import re
from flask import *
from index import Movie
from pprint import pprint
from elasticsearch_dsl import Q
from elasticsearch_dsl.utils import AttrList
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.analysis import tokenizer, analyzer
app = Flask(__name__)
# Initialize global variables for rendering page
tmp_text = ""
tmp_title = ""
tmp_star = ""
tmp_min = ""
tmp_max = ""
tmp_director = ""
tmp_lan = ""
tmp_country = ""
tmp_loc = ""
tmp_minyear = ""
tmp_maxyear = ""
tmp_cats = ""
gresults = {}
# index name var
index_name = 'wiki_film_index'
# display query page
@app.route("/")
def search():
return render_template('page_query.html')
# display results page for first set of results and "next" sets.
@app.route("/results", defaults = {'page': 1}, methods = ['GET', 'POST'])
@app.route("/results/<page>", methods = ['GET', 'POST'])
def results(page):
global tmp_text
global tmp_title
global tmp_star
global tmp_min
global tmp_max
global tmp_director
global tmp_lan
global tmp_country
global tmp_loc
global tmp_minyear
global tmp_maxyear
global tmp_cats
global gresults
# convert the <page> parameter in url to integer.
if type(page) is not int:
page = int(page.encode('utf-8'))
# if the method of request is post (for initial query), store query in local global variables
# if the method of request is get (for "next" results), extract query contents from client's global variables
if request.method == 'POST':
# if has query, strip() all whitespace
text_query = request.form['query'].strip()
star_query = request.form['starring'].strip()
mintime_query = request.form['mintime'].strip()
if len(mintime_query) != 0:
mintime_query = int(mintime_query)
maxtime_query = request.form['maxtime'].strip()
if len(maxtime_query) != 0:
maxtime_query = int(maxtime_query)
director_query = request.form['director'].strip()
lan_query = request.form['language'].strip()
country_query = request.form['country'].strip()
loc_query = request.form['location'].strip()
minyear_query = request.form['minplottime'].strip()
if len(minyear_query) != 0:
minyear_query = int(minyear_query)
maxyear_query = request.form['maxplottime'].strip()
if len(maxyear_query) != 0:
maxyear_query = int(maxyear_query)
cats_query = request.form['categories'].strip()
# update global variable template data
tmp_text = text_query
tmp_star = star_query
tmp_min = mintime_query
tmp_max = maxtime_query
tmp_director = director_query
tmp_lan = lan_query
tmp_country = country_query
tmp_loc = loc_query
tmp_minyear = minyear_query
tmp_maxyear = maxyear_query
tmp_cats = cats_query
else:
# use the current values stored in global variables.
text_query = tmp_text
star_query = tmp_star
mintime_query = tmp_min
maxtime_query = tmp_max
director_query = tmp_director
lan_query = tmp_lan
country_query = tmp_country
loc_query = tmp_loc
minyear_query = tmp_minyear
maxyear_query = tmp_maxyear
cats_query = tmp_cats
# store query values to display in search boxes in UI
shows = {}
shows['text'] = text_query
shows['star'] = star_query
shows['maxtime'] = maxtime_query
shows['mintime'] = mintime_query
shows['director'] = director_query
shows['lan'] = lan_query
shows['country'] = country_query
shows['loc'] = loc_query
shows['minyear'] = minyear_query
shows['maxyear'] = maxyear_query
shows['cats'] = cats_query
# keep a copy of original text query, in case cull out explicit phrases later
full_text_query = text_query
# Create a search object to query our index
s = Search(index = index_name)
# Build up your elasticsearch query in piecemeal fashion based on the user's parameters passed in.
# The search API is "chainable".
# Each call to search.query method adds criteria to our growing elasticsearch query.
# You will change this section based on how you want to process the query data input into your interface.
# set flag to default to indicate all terms have been matched
all_matched = True
# compile a Regex pattern to extract explicit phrases enclosed by ""
pattern = re.compile(r'(?:\B\")(.*?)(?:\b\")')
phrases = pattern.findall(text_query)
# get the rest free terms
text_query = pattern.sub('', text_query).strip()
# First doing conjunctive search over multiple fields (title and text) using the text_query and phrases passed in
if len(text_query) + len(phrases) > 0:
# save deep copies for disjunctive search later
tmp_s = s.__copy__()
tmp_phrases = phrases.copy()
# conjunctive search for text_query AND phrases, with boosted field weight
if len(text_query) > 0:
s = s.query('multi_match', query = text_query, type = 'cross_fields', fields = ['title^2', 'text'],
operator = 'and')
while len(phrases) > 0:
s = s.query('multi_match', query = phrases.pop(), type = 'phrase_prefix', fields = ['title^2', 'text'])
# if conjunctive search has no result, doing disjunctive ( text_query OR phrases )
if s.count() == 0:
# indicate not all terms are matched
all_matched = False
if len(text_query) > 0:
q = Q('multi_match', query = text_query, type = 'cross_fields', fields = ['title^2', 'text'],
operator = 'or')
else:
q = Q('multi_match', query = tmp_phrases.pop(), type = 'phrase_prefix', fields = ['title^2', 'text'])
while len(tmp_phrases) > 0:
q |= Q('multi_match', query = tmp_phrases.pop(), type = 'phrase_prefix', fields = ['title^2', 'text'])
s = tmp_s.query(q)
# search for multiple fields using chained query (AND)
if len(mintime_query) > 0:
s = s.query('range', runtime = {'gte': mintime_query})
if len(maxtime_query) > 0:
s = s.query('range', runtime = {'lte': maxtime_query})
if len(minyear_query) > 0:
s = s.query('range', runtime = {'gte': minyear_query})
if len(maxyear_query) > 0:
s = s.query('range', runtime = {'lte': maxyear_query})
if len(star_query) > 0:
s = s.query('match', starring = star_query)
if len(director_query) > 0:
s = s.query('match', director = director_query)
if len(lan_query) > 0:
s = s.query('match', language = lan_query)
if len(country_query) > 0:
s = s.query('match', country = country_query)
if len(loc_query) > 0:
s = s.query('match', location = loc_query)
if len(cats_query) > 0:
s = s.query('match', categories = cats_query)
# highlight
s = s.highlight_options(pre_tags = '<mark>', post_tags = '</mark>')
s = s.highlight('text', fragment_size = 999999999, number_of_fragments = 1)
s = s.highlight('title', fragment_size = 999999999, number_of_fragments = 1)
s = s.highlight('starring', fragment_size = 999999999, number_of_fragments = 1)
s = s.highlight('director', fragment_size = 999999999, number_of_fragments = 1)
s = s.highlight('language', fragment_size = 999999999, number_of_fragments = 1)
s = s.highlight('country', fragment_size = 999999999, number_of_fragments = 1)
s = s.highlight('location', fragment_size = 999999999, number_of_fragments = 1)
s = s.highlight('categories', fragment_size = 999999999, number_of_fragments = 1)
# determine the subset of results to display (based on current <page> value)
start = 0 + (page - 1) * 10
end = 10 + (page - 1) * 10
# execute search and return results in specified range.
response = s[start:end].execute()
# insert data into response
resultList = {}
for hit in response.hits:
result = {}
result['score'] = hit.meta.score
if 'highlight' in hit.meta:
if 'title' in hit.meta.highlight:
result['title'] = hit.meta.highlight.title[0]
else:
result['title'] = hit.title
if 'text' in hit.meta.highlight:
result['text'] = hit.meta.highlight.text[0]
else:
result['text'] = hit.text
if 'starring' in hit.meta.highlight:
result['starring'] = hit.meta.highlight.starring[0]
else:
result['starring'] = hit.starring
if 'director' in hit.meta.highlight:
result['director'] = hit.meta.highlight.director[0]
else:
result['director'] = hit.director
if 'language' in hit.meta.highlight:
result['language'] = hit.meta.highlight.language[0]
else:
result['language'] = hit.language
if 'country' in hit.meta.highlight:
result['country'] = hit.meta.highlight.country[0]
else:
result['country'] = hit.country
if 'location' in hit.meta.highlight:
result['location'] = hit.meta.highlight.location[0]
else:
result['location'] = hit.location
if 'categories' in hit.meta.highlight:
result['categories'] = hit.meta.highlight.categories[0]
else:
result['categories'] = hit.categories
else:
result['title'] = hit.title
result['text'] = hit.text
result['starring'] = hit.starring
result['director'] = hit.director
result['language'] = hit.language
result['country'] = hit.country
result['location'] = hit.location
result['categories'] = hit.categories
resultList[hit.meta.id] = result
# make the result list available globally
gresults = resultList
# get the total number of matching results
result_num = response.hits.total
# if we find the results, extract title and text information from doc_data, else do nothing
if result_num > 0:
return render_template('page_SERP.html', results = resultList, res_num = result_num, page_num = page,
queries = shows, all_matched = all_matched)
else:
message = []
if len(full_text_query) > 0:
message.append('Unknown search term: ' + full_text_query)
if len(star_query) > 0:
message.append('Cannot find star: ' + star_query)
if len(director_query) > 0:
message.append('Cannot find director: ' + director_query)
if len(lan_query) > 0:
message.append('Cannot find language: ' + lan_query)
if len(country_query) > 0:
message.append('Cannot find country: ' + country_query)
if len(loc_query) > 0:
message.append('Cannot find location: ' + loc_query)
if len(cats_query) > 0:
message.append('Cannot find categories: ' + cats_query)
return render_template('page_SERP.html', results = message, res_num = result_num, page_num = page,
queries = shows)
# display a particular document given a result number
@app.route("/documents/<res>", methods = ['GET'])
def documents(res):
global gresults
film = gresults[res]
filmtitle = film['title']
for term in film:
if type(film[term]) is AttrList:
s = "\n"
for item in film[term]:
s += item + ",\n"
film[term] = s
# fetch the movie from the elasticsearch index using its id
movie = Movie.get(id = res, index = index_name)
filmdic = movie.to_dict()
# in case of no runtime data
if 'runtime' in filmdic:
film['runtime'] = str(filmdic['runtime']) + " min"
else:
film['runtime'] = "N/A"
return render_template('page_targetArticle.html', film = film, title = filmtitle)
if __name__ == "__main__":
app.run(debug = True)