diff --git a/.gitignore b/.gitignore
index 477cd785c..14982fa1d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ bootstrap.json
 coverage.xml
 /htmlcov
 node_modules/
+/bin/accesslog.csv
diff --git a/api/api.py b/api/api.py
index bddf373c4..d986423ef 100644
--- a/api/api.py
+++ b/api/api.py
@@ -97,6 +97,7 @@ def prefix(path, routes):
     route('/resolve', ResolveHandler, h='resolve', m=['POST']),
     route('/schemas/', SchemaHandler, m=['GET']),
     route('/report/', ReportHandler, m=['GET']),
+    route('/report/accesslog/types', ReportHandler, h='get_types', m=['GET']),

     # Search
diff --git a/api/config.py b/api/config.py
index 43bb3133d..c1fd29cc3 100644
--- a/api/config.py
+++ b/api/config.py
@@ -240,6 +240,7 @@ def initialize_db():

     if __config['core']['access_log_enabled']:
         log_db.access_log.create_index('context.ticket_id')
+        log_db.access_log.create_index([('timestamp', pymongo.DESCENDING)])

     create_or_recreate_ttl_index('authtokens', 'timestamp', 2592000)
     create_or_recreate_ttl_index('uploads', 'timestamp', 60)
diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py
index 174d4d43b..f935f0e25 100644
--- a/api/handlers/reporthandler.py
+++ b/api/handlers/reporthandler.py
@@ -1,17 +1,48 @@
 import copy
+import unicodecsv as csv
 import datetime
+import os

 import bson
 import dateutil
 import pymongo

 from .. import config
+from .. import tempdir as tempfile
 from .. import util
 from ..web import base
+from ..web.request import AccessTypeList
+

 EIGHTEEN_YEARS_IN_SEC = 18 * 365.25 * 24 * 60 * 60
 BYTES_IN_MEGABYTE = float(1<<20)

+ACCESS_LOG_FIELDS = [
+    "_id",
+    "access_type",
+    "context.acquisition.id",
+    "context.acquisition.label",
+    "context.analysis.id",
+    "context.analysis.label",
+    "context.collection.id",
+    "context.collection.label",
+    "context.group.id",
+    "context.group.label",
+    "context.project.id",
+    "context.project.label",
+    "context.session.id",
+    "context.session.label",
+    "context.subject.id",
+    "context.subject.label",
+    "context.ticket_id",
+    "origin.id",
+    "origin.method",
+    "origin.name",
+    "origin.type",
+    "request_method",
+    "request_path",
+    "timestamp"
+]

 class APIReportException(Exception):
     pass
@@ -25,6 +56,9 @@ class ReportHandler(base.RequestHandler):
     def __init__(self, request=None, response=None):
         super(ReportHandler, self).__init__(request, response)

+    def get_types(self):
+        return AccessTypeList
+
     def get(self, report_type):
         report = None

@@ -39,7 +73,26 @@ def get(self, report_type):
             raise NotImplementedError('Report type {} is not supported'.format(report_type))

         if self.superuser_request or report.user_can_generate(self.uid):
-            return report.build()
+            # If csv is true create a temp file to respond with
+            if report_type == 'accesslog' and self.request.params.get('csv') == 'true':
+
+                tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path'))
+                csv_file = open(os.path.join(tempdir.name, 'accesslog.csv'), 'w+')
+                writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS)
+                writer.writeheader()
+                try:
+                    for doc in report.build():
+                        writer.writerow(doc)
+
+                except APIReportException as e:
+                    self.abort(404, str(e))
+                # Need to close and reopen file to flush buffer into file
+                csv_file.close()
+                self.response.app_iter = open(os.path.join(tempdir.name, 'accesslog.csv'), 'r')
+                self.response.headers['Content-Type'] = 'text/csv'
+                self.response.headers['Content-Disposition'] = 'attachment; filename="accesslog.csv"'
+            else:
+                return report.build()
         else:
             self.abort(403, 'User {} does not have required permissions to generate report'.format(self.uid))
@@ -437,6 +490,9 @@ def __init__(self, params):
         :end_date:     ISO formatted timestamp
         :uid:          user id of the target user
         :limit:        number of records to return
+        :subject:      subject code of session accessed
+        :access_types: list of access_types to filter logs
+        :csv:          Boolean if user wants csv file
         """

         super(AccessLogReport, self).__init__(params)
@@ -445,6 +501,12 @@ def __init__(self, params):
         end_date = params.get('end_date')
         uid = params.get('user')
         limit= params.get('limit', 100)
+        subject = params.get('subject', None)
+        if params.get('bin') == 'true':
+            access_types = params.get('access_types', [])
+        else:
+            access_types = params.getall('access_types')
+        csv_bool = (params.get('csv') == 'true')

         if start_date:
             start_date = dateutil.parser.parse(start_date)
@@ -460,12 +522,19 @@ def __init__(self, params):
             raise APIReportParamsException('Limit must be an integer greater than 0.')
         if limit < 1:
             raise APIReportParamsException('Limit must be an integer greater than 0.')
+        elif limit > 10000:
+            raise APIReportParamsException('Limit exceeds 10,000 entries, please contact admin to run script.')
+        for access_type in access_types:
+            if access_type not in AccessTypeList:
+                raise APIReportParamsException('Not a valid access type')

         self.start_date = start_date
         self.end_date = end_date
         self.uid = uid
         self.limit = limit
-
+        self.subject = subject
+        self.access_types = access_types
+        self.csv_bool = csv_bool

     def user_can_generate(self, uid):
         """
@@ -475,6 +544,20 @@ def user_can_generate(self, uid):
             return True
         return False

+    def flatten(self, json_obj, flat, prefix = ""):
+        """
+        flattens a document to not have nested objects
+        """
+
+        for field in json_obj.keys():
+            if isinstance(json_obj[field], dict):
+                flat = self.flatten(json_obj[field], flat, prefix = prefix + field + ".")
+            else:
+                flat[prefix + field] = json_obj[field]
+        return flat
+
+    def make_csv_ready(self, cursor):
+        return [self.flatten(json_obj, {}) for json_obj in cursor]

     def build(self):
         query = {}
@@ -487,8 +570,16 @@ def build(self):
             query['timestamp']['$gte'] = self.start_date
         if self.end_date:
             query['timestamp']['$lte'] = self.end_date
+        if self.subject:
+            query['context.subject.label'] = self.subject
+        if self.access_types:
+            query['access_type'] = {'$in': self.access_types}
+
+        cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING).batch_size(1000)
+        if self.csv_bool:
+            return self.make_csv_ready(cursor)

-        return config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING)
+        return cursor

 class UsageReport(Report):
     """
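
Note on the CSV path above: make_csv_ready()/flatten() turn each nested access-log document into a flat dict whose dotted keys line up with ACCESS_LOG_FIELDS, so csv.DictWriter can serialize it directly. A minimal illustration follows; the sample document and its values are invented for this sketch, not taken from a real log.

    # One access-log entry roughly as stored in log_db.access_log (values invented):
    doc = {
        '_id': 'some-object-id',
        'access_type': 'view_container',
        'context': {'project': {'id': 'p1', 'label': 'Neuro'}},
        'origin': {'type': 'user', 'id': 'admin@example.com'},
    }

    # flatten(doc, {}) recurses into dict values, prefixing child keys with
    # '<parent>.', and leaves everything else as-is, yielding:
    # {'_id': 'some-object-id',
    #  'access_type': 'view_container',
    #  'context.project.id': 'p1',
    #  'context.project.label': 'Neuro',
    #  'origin.type': 'user',
    #  'origin.id': 'admin@example.com'}
    # DictWriter then fills any ACCESS_LOG_FIELDS columns missing from the dict with blanks.
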
diff --git a/api/web/request.py b/api/web/request.py
index 0d6a6c4cf..7421eef65 100644
--- a/api/web/request.py
+++ b/api/web/request.py
@@ -16,6 +16,7 @@
     'user_login': 'user_login',
     'user_logout': 'user_logout'
 })
+AccessTypeList = [type_name for type_name, member in AccessType.__members__.items()]


 class SciTranRequest(Request):
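
For context, a rough client-side sketch of the two routes this change adds. The base URL and the Authorization header value are placeholders (whatever the deployment actually uses), not part of the change itself:

    import requests

    BASE = 'http://localhost:8080/api'                  # placeholder host/port
    HEADERS = {'Authorization': 'API_KEY_PLACEHOLDER'}  # placeholder credentials

    # enumerate the valid access types (e.g. to build a filter UI)
    types = requests.get(BASE + '/report/accesslog/types', headers=HEADERS).json()

    # download the filtered access log as CSV; repeating access_types is how
    # webapp2's params.getall('access_types') receives a list
    resp = requests.get(BASE + '/report/accesslog', headers=HEADERS,
                        params={'csv': 'true', 'subject': 'compliant5', 'limit': 10000,
                                'access_types': ['user_login', 'view_container']})
    with open('accesslog.csv', 'wb') as fh:
        fh.write(resp.content)
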
diff --git a/bin/log_csv.py b/bin/log_csv.py
new file mode 100644
index 000000000..4ed1b7ced
--- /dev/null
+++ b/bin/log_csv.py
@@ -0,0 +1,80 @@
+# This implementation as of July 19 2017 has these resource utilizations of the mongodb container:
+# - 2 million entries: 1.50 Gb
+# - 3 million entries: 2.05 Gb
+# The entire docker application was given 6 Gb to use; when given the default 2 Gb,
+# the process would frequently crash before 1 million entries were downloaded.
+
+import argparse
+import unicodecsv as csv
+import pymongo
+import tarfile
+import sys
+import logging
+import datetime
+
+from api.web.request import AccessTypeList
+from api import config
+from api.handlers.reporthandler import AccessLogReport, ACCESS_LOG_FIELDS
+
+ARG_TO_PARAMS = {
+    'l': 'limit',
+    's': 'start_date',
+    'e': 'end_date',
+    'u': 'user',
+    'j': 'subject',
+    't': 'access_types'
+}
+
+def download_large_csv(params):
+    """
+    Script to download large csv files to avoid the uwsgi worker running out of memory.
+    """
+    entries = int(params['limit'])
+    params['csv'] = "true"
+    params['bin'] = "true"
+    params['limit'] = "100000"
+
+    csv_file = open('accesslog.csv', 'w+')
+    writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS)
+    writer.writeheader()
+    unicode_err_count = 0
+    while entries > 0:
+        print "{} entries left".format(entries)
+        params['limit'] = str(min(entries, 100000))
+        report = AccessLogReport(params)
+        rep = report.build()
+        end_date = str(rep[-1]['timestamp'])
+        for doc in rep[:-1]:
+            entries = entries - 1
+            writer.writerow(doc)
+
+        if len(rep) == 1:
+            entries = 0
+            writer.writerow(rep[0])
+        if len(rep) < int(params['limit']) - 1:
+            entries = 0
+        csv_file.flush()
+        params['end_date'] = end_date
+
+
+    print "Encountered unicode errors and skipped {} entries".format(unicode_err_count)
+    csv_file.close()
+
+def format_arg(args):
+    return {ARG_TO_PARAMS[arg]: args[arg] for arg in args if args[arg] != None}
+
+if __name__ == '__main__':
+    try:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("-s", help="Start date", type=str)
+        parser.add_argument("-e", help="End date", type=str)
+        parser.add_argument("-u", help="User id", type=str)
+        parser.add_argument("-l", help="Limit", type=str)
+        parser.add_argument("-j", help="subJect", type=str)
+        parser.add_argument("-t", help="list of access Types", type=str, nargs='+')
+
+        args = vars(parser.parse_args())
+        download_large_csv(format_arg(args))
+    except Exception as e:
+        logging.exception('Unexpected error in log_csv.py')
+        sys.exit(1)
diff --git a/requirements.txt b/requirements.txt
index 8685fa0b7..33113dfdc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ uwsgi==2.0.13.1
 webapp2==2.5.2
 WebOb==1.5.1
 git+https://github.com/flywheel-io/gears.git@v0.1.1#egg=gears
+unicodecsv==0.9.0
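
How bin/log_csv.py pages through the log, in brief: each query returns up to 100,000 rows sorted newest-first, the last row's timestamp becomes the next end_date, and that last row is held back so the following query (whose inclusive $lte bound re-includes it) emits it first, avoiding gaps at the page boundary. An invocation might look like "python bin/log_csv.py -l 2000000 -s 2017-01-01 -t user_login user_logout" (argument values are illustrative). Below is a toy, self-contained model of that loop with invented timestamps; it is not the script itself and omits the entry counting.

    # Full "log", newest first, standing in for the sorted Mongo cursor.
    rows = ['10:05', '10:04', '10:03', '10:02', '10:01']

    def fetch(end_date, limit):
        """Mimic the newest-first query bounded by timestamp <= end_date."""
        eligible = [r for r in rows if end_date is None or r <= end_date]
        return eligible[:limit]

    written, end_date = [], None
    while True:
        batch = fetch(end_date, 3)
        if len(batch) == 1:          # final row: write it and stop
            written.append(batch[0])
            break
        written.extend(batch[:-1])   # hold back the boundary row...
        end_date = batch[-1]         # ...it will lead the next batch

    assert written == rows           # no gaps, no duplicates
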
diff --git a/test/integration_tests/python/test_reports.py b/test/integration_tests/python/test_reports.py
index 49a8a4170..17ba54e1d 100644
--- a/test/integration_tests/python/test_reports.py
+++ b/test/integration_tests/python/test_reports.py
@@ -1,7 +1,7 @@
 import calendar
 import copy
 import datetime
-
+from api.web.request import AccessTypeList

 # create timestamps for report filtering
 today = datetime.datetime.today()
@@ -93,7 +93,7 @@ def test_project_report(data_builder, as_admin, as_user):
     assert len(project_report['projects']) == 2


-def test_access_log_report(with_user, as_user, as_admin):
+def test_access_log_report(data_builder, with_user, as_user, as_admin):
     # try to get access log report as user
     r = as_user.get('/report/accesslog')
     assert r.status_code == 403
@@ -116,6 +116,12 @@ def test_access_log_report(with_user, as_user, as_admin):
     r = as_admin.get('/report/accesslog', params={'limit': 0})
     assert r.status_code == 400

+    # try to get report w/ limit == 10000 and limit > 10000
+    r = as_admin.get('/report/accesslog', params={'limit': 10000})
+    assert r.ok
+    r = as_admin.get('/report/accesslog', params={'limit': 10001})
+    assert r.status_code == 400
+
     # get access log report for user
     r = as_admin.get('/report/accesslog', params={
         'start_date': yesterday_ts,
         'end_date': tomorrow_ts,
         'user': with_user.user
@@ -135,6 +141,57 @@ def test_access_log_report(with_user, as_user, as_admin):
     assert len(accesslog) == 1
     assert accesslog[0]['access_type'] == 'user_login'

+    # get access log report of certain subject
+    project = data_builder.create_project()
+    r = as_admin.post('/sessions', json={
+        'project': project,
+        'label': 'test-accesslog-session',
+        'timestamp': '1979-01-01T00:00:00+00:00',
+        'subject': {'code': 'compliant5'}
+    })
+    assert r.ok
+    session = r.json()['_id']
+
+    # In order to have two logs of this subject (POST does not create a log)
+    r = as_admin.get('/sessions/' + session)
+    assert r.ok
+    session = r.json()['_id']
+    r = as_admin.get('/sessions/' + session)
+    assert r.ok
+
+    r = as_admin.get('/report/accesslog', params={'subject': 'compliant5'})
+    assert r.ok
+    for count, log in enumerate(r.json(), start=1):
+        assert log.get('context', {}).get('subject', {}).get('label') == 'compliant5'
+    assert count == 2
+    r = as_admin.delete('/sessions/' + session)
+    data_builder.delete_project(project, recursive=True)
+
+    # get access log report of certain access types
+    r = as_admin.get('/report/accesslog', params={'access_types': ['user_login', 'view_container']})
+    assert r.ok
+    ul, vc = False, False
+
+    # test that each item in log is either view_container or user_login
+    for log in r.json():
+        assert log.get('access_type') in ['user_login', 'view_container']
+        if log.get('access_type') == 'user_login':
+            ul = True
+        elif log.get('access_type') == 'view_container':
+            vc = True
+    assert ul and vc
+
+    # Download .csv file
+    r = as_admin.get('/report/accesslog', params={'csv': 'true'})
+    assert r.ok
+
+    assert r.content[:3] == '_id'
+
+    # get the access types
+    r = as_admin.get('/report/accesslog/types')
+    assert r.ok
+    assert r.json() == AccessTypeList
+


 def test_usage_report(data_builder, file_form, as_user, as_admin):
     # try to get usage report as user
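
A follow-up assertion that could be layered onto the CSV check above (a sketch only, not part of this diff): compare the header row of the downloaded file against ACCESS_LOG_FIELDS rather than just its first three characters.

    from api.handlers.reporthandler import ACCESS_LOG_FIELDS

    def csv_header_matches(csv_text):
        """True if the first line of the CSV lists exactly the expected columns."""
        return csv_text.splitlines()[0].split(',') == ACCESS_LOG_FIELDS

    # e.g. inside test_access_log_report, after the csv download:
    #     assert csv_header_matches(r.content)
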