From 9eed0356fe370e8fe8f54b5da3955da9de5a83ea Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Wed, 20 Sep 2017 12:40:04 -0500 Subject: [PATCH 1/6] returns count, mb size, and nodes for each file type --- api/api.py | 1 + api/download.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/api/api.py b/api/api.py index 96c5b62cf..2409c3354 100644 --- a/api/api.py +++ b/api/api.py @@ -85,6 +85,7 @@ def prefix(path, routes): # General-purpose upload & download route('/download', Download, h='download', m=['GET', 'POST']), + route('/download/summary', Download, h='summary', m=['POST']), route('/upload/', Upload, h='upload', m=['POST']), route('/clean-packfiles', Upload, h='clean_packfile_tokens', m=['POST']), route('/engine', Upload, h='engine', m=['POST']), diff --git a/api/download.py b/api/download.py index 99a4a3d51..aba5162f7 100644 --- a/api/download.py +++ b/api/download.py @@ -13,9 +13,9 @@ from . import validators import os from .dao.containerutil import pluralize - log = config.log +BYTES_IN_MEGABYTE = float(1<<20) def _filter_check(property_filter, property_values): minus = set(property_filter.get('-', [])) @@ -315,3 +315,84 @@ def download(self): log.debug(json.dumps(req_spec, sort_keys=True, indent=4, separators=(',', ': '))) return self._preflight_archivestream(req_spec, collection=self.get_param('collection')) + + def summary(self): + """Return a summary of what has been/will be downloaded based on a given query""" + req = self.request.json_body + req['_id'] = bson.ObjectId(req['_id']) + level = req['level'] + + containers = ['projects', 'sessions', 'acquisitions', 'analyses'] + cont_query = {} + if level == 'projects': + # Grab sessions and their ids + sessions = config.db.sessions.find({'project': req['_id']}, {'_id': 1}) + session_ids = [s['_id'] for s in sessions] + + # Grab acquisitions and their ids + acquisitions = config.db.acquisitions.find({'session': {'$in': session_ids}}, {'_id': 1}) + acquisition_ids = [a['_id'] for a in acquisitions] + parent_ids = [req['_id']] + session_ids + acquisition_ids + + # # Grab analyses and their ids + # analysis_ids = [an['_id'] for an in config.db.analyses.find({'parent.id': {'$in': parent_ids}})] + + # for each type of container below it will have a slightly modified match query + cont_query = { + 'projects': {'_id': {'project': req['_id']}}, + 'sessions': {'project': req['_id']}, + 'acquisitions': {'session': {'$in': session_ids}}, + 'analyses': {'parent.id': {'$in': parent_ids}} + } + if level == 'sessions': + + # Grab acquisitions and their ids + acquisitions = config.db.acquisitions.find({'session': req['_id']}, {'_id': 1}) + acquisition_ids = [a['_id'] for a in acquisitions] + parent_ids = [req['_id']] + acquisition_ids + + # # Grab analyses and their ids + # analysis_ids = [an['_id'] for an in config.db.analyses.find({'parent.id': {'$in': parent_ids}})] + + # for each type of container below it will have a slightly modified match query + cont_query = { + 'sessions': {'_id': req['_id']}, + 'acquisitions': {'session': req['_id']}, + 'analyses': {'parent.id': {'$in': parent_ids}} + } + containers = containers[1:] + + res = {} + for cont_name in containers: + # Aggregate file types + pipeline = [ + {'$match': cont_query[cont_name]}, + {'$unwind': '$files'}, + {'$project': {'_id': '$_id', 'type': '$files.type','mbs': {'$divide': ['$files.size', BYTES_IN_MEGABYTE]}}}, + {'$group': { + '_id': '$type', + 'count': {'$sum' : 1}, + 'mb_total': {'$sum':'$mbs'}, + 'nodes' : { + '$addToSet': {'level': {'$literal':cont_name}, '_id': '$_id'} + } + }} + ] + + try: + result = config.db.command('aggregate', cont_name, pipeline=pipeline) + except Exception as e: # pylint: disable=broad-except + result = e + return result + + if result.get("ok"): + for doc in result.get("result"): + type_ = doc['_id'] + if res.get(type_): + res[type_]['count'] += doc.get('count',0) + res[type_]['mb_total'] += doc.get('mb_total',0) + res[type_]['nodes'] += doc.get('nodes', []) + else: + res[type_] = doc + return res + From 95316aef755a77d6f88458a998eef2d0808aca7a Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Wed, 20 Sep 2017 13:34:51 -0500 Subject: [PATCH 2/6] added tests and fixed up typos --- api/download.py | 5 ++- .../integration_tests/python/test_download.py | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/api/download.py b/api/download.py index aba5162f7..fa75b6c32 100644 --- a/api/download.py +++ b/api/download.py @@ -339,7 +339,7 @@ def summary(self): # for each type of container below it will have a slightly modified match query cont_query = { - 'projects': {'_id': {'project': req['_id']}}, + 'projects': {'_id': req['_id']}, 'sessions': {'project': req['_id']}, 'acquisitions': {'session': {'$in': session_ids}}, 'analyses': {'parent.id': {'$in': parent_ids}} @@ -382,8 +382,7 @@ def summary(self): try: result = config.db.command('aggregate', cont_name, pipeline=pipeline) except Exception as e: # pylint: disable=broad-except - result = e - return result + return e if result.get("ok"): for doc in result.get("result"): diff --git a/test/integration_tests/python/test_download.py b/test/integration_tests/python/test_download.py index 94452c918..4c092f56b 100644 --- a/test/integration_tests/python/test_download.py +++ b/test/integration_tests/python/test_download.py @@ -399,3 +399,38 @@ def test_filters(data_builder, file_form, as_admin): }) assert r.ok assert r.json()['file_cnt'] == 1 + +def test_summary(data_builder, as_admin, file_form): + project = data_builder.create_project(label='project1') + session = data_builder.create_session(label='session1') + session2 = data_builder.create_session(label='session1') + acquisition = data_builder.create_acquisition(session=session) + acquisition2 = data_builder.create_acquisition(session=session2) + + # upload the same file to each container created and use different tags to + # facilitate download filter tests: + # acquisition: [], session: ['plus'], project: ['plus', 'minus'] + file_name = 'test.csv' + as_admin.post('/acquisitions/' + acquisition + '/files', files=file_form( + file_name, meta={'name': file_name, 'type': 'csv'})) + + as_admin.post('/acquisitions/' + acquisition2 + '/files', files=file_form( + file_name, meta={'name': file_name, 'type': 'csv'})) + + as_admin.post('/sessions/' + session + '/files', files=file_form( + file_name, meta={'name': file_name, 'type': 'csv', 'tags': ['plus']})) + + as_admin.post('/projects/' + project + '/files', files=file_form( + file_name, meta={'name': file_name, 'type': 'csv', 'tags': ['plus', 'minus']})) + + missing_object_id = '000000000000000000000000' + + r = as_admin.post('/download/summary', json={"level":"projects", "_id":project}) + assert r.ok + assert len(r.json()) == 1 + assert r.json().get("csv", {}).get("count",0) == 4 + + r = as_admin.post('/download/summary', json={"level":"sessions", "_id":session}) + assert r.ok + assert len(r.json()) == 1 + assert r.json().get("csv", {}).get("count",0) == 2 From efb24c61a9849ee7cebdf907e4e6743d14a4cfee Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Wed, 20 Sep 2017 15:44:09 -0500 Subject: [PATCH 3/6] added acquisition and analysis level filtering --- api/download.py | 13 +++++++++++-- test/integration_tests/python/test_download.py | 9 +++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/api/download.py b/api/download.py index fa75b6c32..fe1c09645 100644 --- a/api/download.py +++ b/api/download.py @@ -344,7 +344,7 @@ def summary(self): 'acquisitions': {'session': {'$in': session_ids}}, 'analyses': {'parent.id': {'$in': parent_ids}} } - if level == 'sessions': + elif level == 'sessions': # Grab acquisitions and their ids acquisitions = config.db.acquisitions.find({'session': req['_id']}, {'_id': 1}) @@ -361,6 +361,15 @@ def summary(self): 'analyses': {'parent.id': {'$in': parent_ids}} } containers = containers[1:] + elif level == 'acquisitions': + + cont_query['acquisitions'] = {'_id': req['_id']} + containers = ['acquisitions'] + elif level == 'analyses': + cont_query['analyses'] = {'_id': req['_id']} + containers = containers[-1:] + else: + self.abort(400, "{} not a recognized level".format(level)) res = {} for cont_name in containers: @@ -382,7 +391,7 @@ def summary(self): try: result = config.db.command('aggregate', cont_name, pipeline=pipeline) except Exception as e: # pylint: disable=broad-except - return e + self.abort(500, str(e)) if result.get("ok"): for doc in result.get("result"): diff --git a/test/integration_tests/python/test_download.py b/test/integration_tests/python/test_download.py index 4c092f56b..8f7d737ef 100644 --- a/test/integration_tests/python/test_download.py +++ b/test/integration_tests/python/test_download.py @@ -434,3 +434,12 @@ def test_summary(data_builder, as_admin, file_form): assert r.ok assert len(r.json()) == 1 assert r.json().get("csv", {}).get("count",0) == 2 + + r = as_admin.post('/download/summary', json={"level":"acquisitions", "_id":acquisition}) + assert r.ok + assert len(r.json()) == 1 + assert r.json().get("csv", {}).get("count",0) == 1 + + r = as_admin.post('/download/summary', json={"level":"groups", "_id":missing_object_id}) + assert r.status_code == 400 + From 7361cb944d8179294a12de635cf7559f3c6e03de Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Mon, 25 Sep 2017 13:14:08 -0500 Subject: [PATCH 4/6] added analysis tests, removed nodes --- api/download.py | 37 ++++--------------- .../integration_tests/python/test_download.py | 9 +++++ 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/api/download.py b/api/download.py index fe1c09645..f7fbe56d0 100644 --- a/api/download.py +++ b/api/download.py @@ -322,52 +322,34 @@ def summary(self): req['_id'] = bson.ObjectId(req['_id']) level = req['level'] - containers = ['projects', 'sessions', 'acquisitions', 'analyses'] + containers = ['projects', 'sessions', 'acquisitions'] cont_query = {} if level == 'projects': # Grab sessions and their ids sessions = config.db.sessions.find({'project': req['_id']}, {'_id': 1}) session_ids = [s['_id'] for s in sessions] - # Grab acquisitions and their ids - acquisitions = config.db.acquisitions.find({'session': {'$in': session_ids}}, {'_id': 1}) - acquisition_ids = [a['_id'] for a in acquisitions] - parent_ids = [req['_id']] + session_ids + acquisition_ids - - # # Grab analyses and their ids - # analysis_ids = [an['_id'] for an in config.db.analyses.find({'parent.id': {'$in': parent_ids}})] - # for each type of container below it will have a slightly modified match query cont_query = { 'projects': {'_id': req['_id']}, 'sessions': {'project': req['_id']}, - 'acquisitions': {'session': {'$in': session_ids}}, - 'analyses': {'parent.id': {'$in': parent_ids}} + 'acquisitions': {'session': {'$in': session_ids}} } elif level == 'sessions': - # Grab acquisitions and their ids - acquisitions = config.db.acquisitions.find({'session': req['_id']}, {'_id': 1}) - acquisition_ids = [a['_id'] for a in acquisitions] - parent_ids = [req['_id']] + acquisition_ids - - # # Grab analyses and their ids - # analysis_ids = [an['_id'] for an in config.db.analyses.find({'parent.id': {'$in': parent_ids}})] - # for each type of container below it will have a slightly modified match query cont_query = { 'sessions': {'_id': req['_id']}, - 'acquisitions': {'session': req['_id']}, - 'analyses': {'parent.id': {'$in': parent_ids}} + 'acquisitions': {'session': req['_id']} } containers = containers[1:] elif level == 'acquisitions': cont_query['acquisitions'] = {'_id': req['_id']} - containers = ['acquisitions'] + containers = containers[-1:] elif level == 'analyses': cont_query['analyses'] = {'_id': req['_id']} - containers = containers[-1:] + containers = ['analyses'] else: self.abort(400, "{} not a recognized level".format(level)) @@ -381,17 +363,15 @@ def summary(self): {'$group': { '_id': '$type', 'count': {'$sum' : 1}, - 'mb_total': {'$sum':'$mbs'}, - 'nodes' : { - '$addToSet': {'level': {'$literal':cont_name}, '_id': '$_id'} - } + 'mb_total': {'$sum':'$mbs'} }} ] try: result = config.db.command('aggregate', cont_name, pipeline=pipeline) except Exception as e: # pylint: disable=broad-except - self.abort(500, str(e)) + log.warning(e) + self.abort(500, "Failure to load summary") if result.get("ok"): for doc in result.get("result"): @@ -399,7 +379,6 @@ def summary(self): if res.get(type_): res[type_]['count'] += doc.get('count',0) res[type_]['mb_total'] += doc.get('mb_total',0) - res[type_]['nodes'] += doc.get('nodes', []) else: res[type_] = doc return res diff --git a/test/integration_tests/python/test_download.py b/test/integration_tests/python/test_download.py index 8f7d737ef..dc5a9a285 100644 --- a/test/integration_tests/python/test_download.py +++ b/test/integration_tests/python/test_download.py @@ -442,4 +442,13 @@ def test_summary(data_builder, as_admin, file_form): r = as_admin.post('/download/summary', json={"level":"groups", "_id":missing_object_id}) assert r.status_code == 400 + + r = as_admin.post('/sessions/' + session + '/analyses', files=file_form( + file_name, meta={'label': 'test', 'inputs':[{'name':file_name}]})) + assert r.ok + analysis = r.json()['_id'] + r = as_admin.post('/download/summary', json={"level":"analyses", "_id":analysis}) + assert r.ok + assert len(r.json()) == 1 + assert r.json().get("tabular data", {}).get("count",0) == 1 From fc86fa33207036d1910023b3245f3859f01014c4 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Wed, 4 Oct 2017 11:08:47 -0500 Subject: [PATCH 5/6] Request body is now list of nodes --- api/download.py | 88 ++++++++++++------- .../integration_tests/python/test_download.py | 12 +-- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/api/download.py b/api/download.py index f7fbe56d0..b31f5e8e7 100644 --- a/api/download.py +++ b/api/download.py @@ -318,42 +318,62 @@ def download(self): def summary(self): """Return a summary of what has been/will be downloaded based on a given query""" + res = {} req = self.request.json_body - req['_id'] = bson.ObjectId(req['_id']) - level = req['level'] - - containers = ['projects', 'sessions', 'acquisitions'] - cont_query = {} - if level == 'projects': - # Grab sessions and their ids - sessions = config.db.sessions.find({'project': req['_id']}, {'_id': 1}) - session_ids = [s['_id'] for s in sessions] - - # for each type of container below it will have a slightly modified match query - cont_query = { - 'projects': {'_id': req['_id']}, - 'sessions': {'project': req['_id']}, - 'acquisitions': {'session': {'$in': session_ids}} - } - elif level == 'sessions': - - # for each type of container below it will have a slightly modified match query - cont_query = { - 'sessions': {'_id': req['_id']}, - 'acquisitions': {'session': req['_id']} - } - containers = containers[1:] - elif level == 'acquisitions': - - cont_query['acquisitions'] = {'_id': req['_id']} - containers = containers[-1:] - elif level == 'analyses': - cont_query['analyses'] = {'_id': req['_id']} - containers = ['analyses'] - else: - self.abort(400, "{} not a recognized level".format(level)) + cont_query = { + 'projects': {'_id': {'$in':[]}}, + 'sessions': {'_id': {'$in':[]}}, + 'acquisitions': {'_id': {'$in':[]}}, + 'analyses' : {'_id': {'$in':[]}} + } + for node in req: + node['_id'] = bson.ObjectId(node['_id']) + level = node['level'] + + containers = {'projects':0, 'sessions':0, 'acquisitions':0, 'analyses':0} + + if level == 'project': + # Grab sessions and their ids + sessions = config.db.sessions.find({'project': node['_id']}, {'_id': 1}) + session_ids = [s['_id'] for s in sessions] + acquisitions = config.db.acquisitions.find({'session': {'$in': session_ids}}, {'_id': 1}) + acquisition_ids = [a['_id'] for a in acquisitions] + + containers['projects']=1 + containers['sessions']=1 + containers['acquisitions']=1 + + # for each type of container below it will have a slightly modified match query + cont_query.get('projects',{}).get('_id',{}).get('$in').append(node['_id']) + cont_query['sessions']['_id']['$in'] = cont_query['sessions']['_id']['$in'] + session_ids + cont_query['acquisitions']['_id']['$in'] = cont_query['acquisitions']['_id']['$in'] + acquisition_ids + + elif level == 'session': + acquisitions = config.db.acquisitions.find({'session': node['_id']}, {'_id': 1}) + acquisition_ids = [a['_id'] for a in acquisitions] + + + # for each type of container below it will have a slightly modified match query + cont_query.get('sessions',{}).get('_id',{}).get('$in').append(node['_id']) + cont_query['acquisitions']['_id']['$in'] = cont_query['acquisitions']['_id']['$in'] + acquisition_ids + + containers['sessions']=1 + containers['acquisitions']=1 + + elif level == 'acquisition': + + cont_query.get('acquisitions',{}).get('_id',{}).get('$in').append(node['_id']) + containers['acquisitions']=1 + + elif level == 'analysis': + cont_query.get('analyses',{}).get('_id',{}).get('$in').append(node['_id']) + containers['analyses'] = 1 + + else: + self.abort(400, "{} not a recognized level".format(level)) + + containers = [cont for cont in containers if containers[cont] == 1] - res = {} for cont_name in containers: # Aggregate file types pipeline = [ diff --git a/test/integration_tests/python/test_download.py b/test/integration_tests/python/test_download.py index dc5a9a285..e210924a6 100644 --- a/test/integration_tests/python/test_download.py +++ b/test/integration_tests/python/test_download.py @@ -425,22 +425,22 @@ def test_summary(data_builder, as_admin, file_form): missing_object_id = '000000000000000000000000' - r = as_admin.post('/download/summary', json={"level":"projects", "_id":project}) + r = as_admin.post('/download/summary', json=[{"level":"project", "_id":project}]) assert r.ok assert len(r.json()) == 1 assert r.json().get("csv", {}).get("count",0) == 4 - r = as_admin.post('/download/summary', json={"level":"sessions", "_id":session}) + r = as_admin.post('/download/summary', json=[{"level":"session", "_id":session}]) assert r.ok assert len(r.json()) == 1 assert r.json().get("csv", {}).get("count",0) == 2 - r = as_admin.post('/download/summary', json={"level":"acquisitions", "_id":acquisition}) + r = as_admin.post('/download/summary', json=[{"level":"acquisition", "_id":acquisition},{"level":"acquisition", "_id":acquisition2}]) assert r.ok assert len(r.json()) == 1 - assert r.json().get("csv", {}).get("count",0) == 1 + assert r.json().get("csv", {}).get("count",0) == 2 - r = as_admin.post('/download/summary', json={"level":"groups", "_id":missing_object_id}) + r = as_admin.post('/download/summary', json=[{"level":"group", "_id":missing_object_id}]) assert r.status_code == 400 r = as_admin.post('/sessions/' + session + '/analyses', files=file_form( @@ -448,7 +448,7 @@ def test_summary(data_builder, as_admin, file_form): assert r.ok analysis = r.json()['_id'] - r = as_admin.post('/download/summary', json={"level":"analyses", "_id":analysis}) + r = as_admin.post('/download/summary', json=[{"level":"analysis", "_id":analysis}]) assert r.ok assert len(r.json()) == 1 assert r.json().get("tabular data", {}).get("count",0) == 1 From d6116b7ac10eab9f923d31eb5e5d725b21307111 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Wed, 4 Oct 2017 14:17:56 -0500 Subject: [PATCH 6/6] null filter --- api/download.py | 6 +++++- test/integration_tests/python/test_download.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/api/download.py b/api/download.py index b31f5e8e7..51ecafe13 100644 --- a/api/download.py +++ b/api/download.py @@ -20,7 +20,11 @@ def _filter_check(property_filter, property_values): minus = set(property_filter.get('-', [])) plus = set(property_filter.get('+', [])) - if not minus.isdisjoint(property_values): + if "null" in plus and not property_values: + return True + if "null" in minus and property_values: + return False + elif not minus.isdisjoint(property_values): return False if plus and plus.isdisjoint(property_values): return False diff --git a/test/integration_tests/python/test_download.py b/test/integration_tests/python/test_download.py index e210924a6..c660d4621 100644 --- a/test/integration_tests/python/test_download.py +++ b/test/integration_tests/python/test_download.py @@ -388,6 +388,8 @@ def test_filters(data_builder, file_form, as_admin): assert r.json()['file_cnt'] == 2 # Filter by type + as_admin.post('/acquisitions/' + acquisition + '/files', files=file_form( + "test", meta={'name': "test", 'tags': ['red', 'blue']})) r = as_admin.post('/download', json={ 'optional': False, 'filters': [ @@ -399,6 +401,17 @@ def test_filters(data_builder, file_form, as_admin): }) assert r.ok assert r.json()['file_cnt'] == 1 + r = as_admin.post('/download', json={ + 'optional': False, + 'filters': [ + {'types': {'+':['null']}} + ], + 'nodes': [ + {'level': 'session', '_id': session}, + ] + }) + assert r.ok + assert r.json()['file_cnt'] == 1 def test_summary(data_builder, as_admin, file_form): project = data_builder.create_project(label='project1')