From ecb54cf0d7bdb02f2079ea561c3bc2a92d968fa0 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 1 Aug 2017 15:17:58 -0500 Subject: [PATCH 1/4] new get_nodes function --- api/api.py | 1 + api/handlers/dataexplorerhandler.py | 31 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/api/api.py b/api/api.py index 36f609f98..3ba0af7bb 100644 --- a/api/api.py +++ b/api/api.py @@ -105,6 +105,7 @@ def prefix(path, routes): route('/dataexplorer/facets', DataExplorerHandler, h='get_facets', m=['POST']), route('/dataexplorer/search/fields', DataExplorerHandler, h='search_fields', m=['POST']), route('/dataexplorer/search/fields/aggregate', DataExplorerHandler, h='aggregate_field_values', m=['POST']), + route('/dataexplorer/search/nodes', DataExplorerHandler, h='get_nodes', m=['POST']), route('/dataexplorer/index/fields', DataExplorerHandler, h='index_field_names', m=['POST']), # Users diff --git a/api/handlers/dataexplorerhandler.py b/api/handlers/dataexplorerhandler.py index 4d37690f8..938984934 100644 --- a/api/handlers/dataexplorerhandler.py +++ b/api/handlers/dataexplorerhandler.py @@ -470,6 +470,37 @@ def get_facets(self): aggs['by_session']['subject.age'] = age_node['subject.age'] return {'facets': aggs} + def get_nodes(self): + + return_type, filters, search_string = self._parse_request() + query = { + "size": 0, + "query": { + "bool": { + "must": { + "match": { + "_all": search_string + } + }, + "filter": { + "bool" : { + "must" : filters + } + } + } + } + } + + query['aggs'] = { + "by_container": { + "terms": { + "field": return_type+"._id", + "size": size + } + } + } + + @require_login def search_fields(self): field_query = self.request.json_body.get('field') From ff1d34758e26a1c3e90b52da2fd09704a10e51cf Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Wed, 2 Aug 2017 10:19:34 -0500 Subject: [PATCH 2/4] get-file-nodes using scroll --- api/handlers/dataexplorerhandler.py | 79 +++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/api/handlers/dataexplorerhandler.py b/api/handlers/dataexplorerhandler.py index 938984934..33d7b425a 100644 --- a/api/handlers/dataexplorerhandler.py +++ b/api/handlers/dataexplorerhandler.py @@ -1,7 +1,7 @@ import copy import json -from elasticsearch import ElasticsearchException, TransportError +from elasticsearch import ElasticsearchException, TransportError, helpers from ..web import base from .. import config @@ -473,32 +473,81 @@ def get_facets(self): def get_nodes(self): return_type, filters, search_string = self._parse_request() + if return_type == 'file': + return get_file_nodes(return_type, filters, search_string) + query = { - "size": 0, - "query": { - "bool": { - "must": { + "bool": { + "must": { "match": { - "_all": search_string + "_all": search_string } - }, - "filter": { + }, + "filter": { "bool" : { - "must" : filters + "must" : [ + { "term" : {"container_type" : return_type}} + ] } - } } } } + + + # Add filters list to filter key on query if exists + if filters: + query['bool']['filter']['bool']['must'].extend(filters) + nodes = [] + results = helpers.scan(client=config.es, query={'query': query}, scroll='5m', size=1000, index='data_explorer', doc_type='flywheel', _source=[return_type+'._id'], aggs={"by_container": { + "terms": { + "field": "session._id", + "size": 1000 + }, + "aggs": { + "by_top_hit": { + "top_hits": { + "_source": "session._id", + "size": 1 + } + } + } + }}) + log.debug(results) + for batch in results: + log.debug(batch) + nodes.append({'level': return_type, '_id': batch['_source'][return_type]['_id']}) + return {'nodes':nodes} + + def get_file_nodes(return_type,filters,search_string): - query['aggs'] = { - "by_container": { - "terms": { - "field": return_type+"._id", - "size": size + query = { + "bool": { + "must": { + "match": { + "_all": search_string + } + }, + "filter": { + "bool" : { + "must" : [ + { "term" : {"container_type" : return_type}} + ] + } } } } + + + # Add filters list to filter key on query if exists + if filters: + query['bool']['filter']['bool']['must'].extend(filters) + nodes = [] + results = helpers.scan(client=config.es, query={'query': query}, scroll='5m', size=1000, index='data_explorer', doc_type='flywheel', _source=[return_type+'._id']) + log.debug(results) + for batch in results: + log.debug(batch) + nodes.append({'level': return_type, '_id': batch['_source'][return_type]['_id']}) + return {'nodes':nodes} @require_login From fb3367168de11c8a05394157149223a669e9f340 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Wed, 2 Aug 2017 12:57:56 -0500 Subject: [PATCH 3/4] get_nodes for upper containers --- api/handlers/dataexplorerhandler.py | 100 ++++++++++++++++++---------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/api/handlers/dataexplorerhandler.py b/api/handlers/dataexplorerhandler.py index 33d7b425a..efa844beb 100644 --- a/api/handlers/dataexplorerhandler.py +++ b/api/handlers/dataexplorerhandler.py @@ -470,55 +470,82 @@ def get_facets(self): aggs['by_session']['subject.age'] = age_node['subject.age'] return {'facets': aggs} + def search_size(self, return_type): + body = { + "size": 0, + "aggs" : { + "count" : { + "cardinality" : { + "field" : return_type + "._id", + "precision_threshold": 100000 + } + } + } + } + + size = config.es.search( + index='data_explorer', + doc_type='flywheel', + body=body)['aggregations']['count']['value'] + size = int(size*1.02) + return size + def get_nodes(self): return_type, filters, search_string = self._parse_request() if return_type == 'file': - return get_file_nodes(return_type, filters, search_string) + return self.get_file_nodes(return_type, filters, search_string) - query = { - "bool": { - "must": { - "match": { - "_all": search_string + size = self.search_size(return_type) + body = { + "size": 0, + "_source": ["session._id"], + 'query': { + "bool": { + "must": { + "match": { + "_all": search_string + } + }, + "filter": { + "bool" : { + "must" : filters + } } - }, - "filter": { - "bool" : { - "must" : [ - { "term" : {"container_type" : return_type}} - ] + } + }, + 'aggs': { + 'by_container': { + 'terms': { + 'field': return_type+'._id', + 'size': size } } } } + # Remove search string if none given + if not search_string: + body['query']['bool'].pop('must') + + # Remove filters list to filter key on query if does not exists + if not filters: + body['query']['bool'].pop('filter') + + if not filters and not search_string: + body['query'] = MATCH_ALL - # Add filters list to filter key on query if exists - if filters: - query['bool']['filter']['bool']['must'].extend(filters) nodes = [] - results = helpers.scan(client=config.es, query={'query': query}, scroll='5m', size=1000, index='data_explorer', doc_type='flywheel', _source=[return_type+'._id'], aggs={"by_container": { - "terms": { - "field": "session._id", - "size": 1000 - }, - "aggs": { - "by_top_hit": { - "top_hits": { - "_source": "session._id", - "size": 1 - } - } - } - }}) - log.debug(results) - for batch in results: - log.debug(batch) - nodes.append({'level': return_type, '_id': batch['_source'][return_type]['_id']}) + results = config.es.search( + index='data_explorer', + doc_type='flywheel', + body=body)['aggregations']['by_container']['buckets'] + + for result in results: + nodes.append({'level': return_type, '_id': result['key']}) return {'nodes':nodes} - def get_file_nodes(return_type,filters,search_string): + def get_file_nodes(self, return_type,filters,search_string): query = { "bool": { @@ -544,9 +571,8 @@ def get_file_nodes(return_type,filters,search_string): nodes = [] results = helpers.scan(client=config.es, query={'query': query}, scroll='5m', size=1000, index='data_explorer', doc_type='flywheel', _source=[return_type+'._id']) log.debug(results) - for batch in results: - log.debug(batch) - nodes.append({'level': return_type, '_id': batch['_source'][return_type]['_id']}) + for result in results: + nodes.append({'level': return_type, '_id': result['_source'][return_type]['_id']}) return {'nodes':nodes} From 6af9b5a9c3eb6e1d2de7bcbcccda05e212a6dbfb Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Fri, 18 Aug 2017 15:50:59 -0500 Subject: [PATCH 4/4] Used _construct_query to construct queries --- api/handlers/dataexplorerhandler.py | 62 +++-------------------------- 1 file changed, 5 insertions(+), 57 deletions(-) diff --git a/api/handlers/dataexplorerhandler.py b/api/handlers/dataexplorerhandler.py index efa844beb..e1e3dee70 100644 --- a/api/handlers/dataexplorerhandler.py +++ b/api/handlers/dataexplorerhandler.py @@ -497,43 +497,10 @@ def get_nodes(self): return self.get_file_nodes(return_type, filters, search_string) size = self.search_size(return_type) - body = { - "size": 0, - "_source": ["session._id"], - 'query': { - "bool": { - "must": { - "match": { - "_all": search_string - } - }, - "filter": { - "bool" : { - "must" : filters - } - } - } - }, - 'aggs': { - 'by_container': { - 'terms': { - 'field': return_type+'._id', - 'size': size - } - } - } - } + body = self._construct_query(return_type, search_string, filters, size) - # Remove search string if none given - if not search_string: - body['query']['bool'].pop('must') - - # Remove filters list to filter key on query if does not exists - if not filters: - body['query']['bool'].pop('filter') - - if not filters and not search_string: - body['query'] = MATCH_ALL + body['aggs']['by_container'].pop('aggs') + body['_source'] = [return_type + "._id"] nodes = [] results = config.es.search( @@ -545,29 +512,10 @@ def get_nodes(self): nodes.append({'level': return_type, '_id': result['key']}) return {'nodes':nodes} - def get_file_nodes(self, return_type,filters,search_string): + def get_file_nodes(self, return_type, filters, search_string): - query = { - "bool": { - "must": { - "match": { - "_all": search_string - } - }, - "filter": { - "bool" : { - "must" : [ - { "term" : {"container_type" : return_type}} - ] - } - } - } - } + query = self._construct_file_query(return_type, filters, search_string)['query'] - - # Add filters list to filter key on query if exists - if filters: - query['bool']['filter']['bool']['must'].extend(filters) nodes = [] results = helpers.scan(client=config.es, query={'query': query}, scroll='5m', size=1000, index='data_explorer', doc_type='flywheel', _source=[return_type+'._id']) log.debug(results)