From ff573ff6823d1075a687b8bfa860bc2ec688fca2 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Mon, 12 Jun 2017 11:30:08 -0500 Subject: [PATCH 1/9] subject and access_types params added --- api/handlers/reporthandler.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 174d4d43b..a5875cfb8 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -437,6 +437,8 @@ def __init__(self, params): :end_date: ISO formatted timestamp :uid: user id of the target user :limit: number of records to return + :subject: subject code of session accessed + :access_types: list of access_types to filter logs """ super(AccessLogReport, self).__init__(params) @@ -445,6 +447,8 @@ def __init__(self, params): end_date = params.get('end_date') uid = params.get('user') limit= params.get('limit', 100) + subject = params.get('subject', None) + access_types = params.getall('access_types') if start_date: start_date = dateutil.parser.parse(start_date) @@ -460,12 +464,16 @@ def __init__(self, params): raise APIReportParamsException('Limit must be an integer greater than 0.') if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') + for access_type in access_types: + if access_type not in ['user_login', 'view_container']: + raise APIReportParamsException('Not a valid access type') self.start_date = start_date self.end_date = end_date self.uid = uid self.limit = limit - + self.subject = subject + self.access_types = access_types def user_can_generate(self, uid): """ @@ -487,6 +495,10 @@ def build(self): query['timestamp']['$gte'] = self.start_date if self.end_date: query['timestamp']['$lte'] = self.end_date + if self.subject: + query['context.subject.label'] = self.subject + if self.access_types: + query['access_type'] = {'$in': self.access_types} return config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', 
pymongo.DESCENDING) From a2636e7871ea111e03036355d160d4a188cab51c Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 10:45:06 -0500 Subject: [PATCH 2/9] writes accesslog to csv file --- api/handlers/reporthandler.py | 58 +++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index a5875cfb8..e12f567b3 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -1,4 +1,5 @@ import copy +import csv import datetime import bson @@ -12,6 +13,26 @@ EIGHTEEN_YEARS_IN_SEC = 18 * 365.25 * 24 * 60 * 60 BYTES_IN_MEGABYTE = float(1<<20) +ACCESS_LOG_FIELDS = [ + "context.session.label", + "context.project.id", + "context.subject.label", + "context.ticket_id", + "context.acquisition.id", + "context.acquisition.label", + "timestamp", + "access_type", + "context.group.id", + "request_method", + "context.subject.id", + "request_path", + "context.group.label", + "context.project.label", + "origin.id", + "_id", + "context.session.id", + "origin.type" +] class APIReportException(Exception): pass @@ -39,7 +60,19 @@ def get(self, report_type): raise NotImplementedError('Report type {} is not supported'.format(report_type)) if self.superuser_request or report.user_can_generate(self.uid): - return report.build() + if report_type == 'accesslog' and self.request.params.get('csv') == 'true': + csv_file = open("acceslog.csv", 'w+') + writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) + writer.writeheader() + + for doc in report.build(): + writer.writerow(doc) + + self.response.app_iter = csv_file + self.response.headers['Content-Type'] = 'text/csv' + self.response.headers['Content-Disposition'] = 'attachment; filename="acceslog.csv"' + else: + return report.build() else: self.abort(403, 'User {} does not have required permissions to generate report'.format(self.uid)) @@ -439,6 +472,7 @@ def __init__(self, params): :limit: number of records to 
return :subject: subject code of session accessed :access_types: list of access_types to filter logs + :csv: Boolean if user wants csv file """ super(AccessLogReport, self).__init__(params) @@ -449,6 +483,7 @@ def __init__(self, params): limit= params.get('limit', 100) subject = params.get('subject', None) access_types = params.getall('access_types') + csv_bool = params.get('csv') == 'true' if start_date: start_date = dateutil.parser.parse(start_date) @@ -474,6 +509,7 @@ def __init__(self, params): self.limit = limit self.subject = subject self.access_types = access_types + self.csv_bool = csv_bool def user_can_generate(self, uid): """ @@ -483,6 +519,19 @@ def user_can_generate(self, uid): return True return False + def flatten(self, json_obj, flat, prefix = ""): + """ + flattens a + """ + for field in json_obj.keys(): + if isinstance(json_obj[field], dict): + flat = self.flatten(json_obj[field], flat, prefix = prefix + field + ".") + else: + flat[prefix + field] = json_obj[field] + return flat + + def make_csv(self, cursor): + return [self.flatten(json_obj, {}) for json_obj in cursor] def build(self): query = {} @@ -500,7 +549,12 @@ def build(self): if self.access_types: query['access_type'] = {'$in': self.access_types} - return config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) + cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) + + if self.csv_bool: + return self.make_csv(cursor) + + return cursor class UsageReport(Report): """ From 7a5119c2ba22c786e40cf3fbedb15764b6ffbdec Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 11:19:15 -0500 Subject: [PATCH 3/9] csv file can be downloaded --- api/handlers/reporthandler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index e12f567b3..43e321e00 100644 --- a/api/handlers/reporthandler.py +++ 
b/api/handlers/reporthandler.py @@ -1,12 +1,14 @@ import copy import csv import datetime +import os import bson import dateutil import pymongo from .. import config +from .. import tempdir as tempfile from .. import util from ..web import base @@ -61,14 +63,16 @@ def get(self, report_type): if self.superuser_request or report.user_can_generate(self.uid): if report_type == 'accesslog' and self.request.params.get('csv') == 'true': - csv_file = open("acceslog.csv", 'w+') + tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) + csv_file = open(os.path.join(tempdir.name, 'acceslog.csv'), 'w+') writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) writer.writeheader() for doc in report.build(): writer.writerow(doc) - self.response.app_iter = csv_file + csv_file.close() + self.response.app_iter = open(os.path.join(tempdir.name, 'acceslog.csv'), 'r') self.response.headers['Content-Type'] = 'text/csv' self.response.headers['Content-Disposition'] = 'attachment; filename="acceslog.csv"' else: From 6356e1777ebe2fa0c8f89fab521ca172c804245e Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 15:01:50 -0500 Subject: [PATCH 4/9] tested new functionality of access log reports --- api/handlers/reporthandler.py | 27 ++++++----- test/integration_tests/python/test_reports.py | 47 ++++++++++++++++++- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 43e321e00..379e09363 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -16,24 +16,24 @@ EIGHTEEN_YEARS_IN_SEC = 18 * 365.25 * 24 * 60 * 60 BYTES_IN_MEGABYTE = float(1<<20) ACCESS_LOG_FIELDS = [ - "context.session.label", - "context.project.id", - "context.subject.label", - "context.ticket_id", + "_id", + "access_type", "context.acquisition.id", "context.acquisition.label", - "timestamp", - "access_type", "context.group.id", - "request_method", - 
"context.subject.id", - "request_path", "context.group.label", + "context.project.id", "context.project.label", - "origin.id", - "_id", "context.session.id", - "origin.type" + "context.session.label", + "context.subject.id", + "context.subject.label", + "context.ticket_id", + "origin.id", + "origin.type", + "request_method", + "request_path", + "timestamp" ] class APIReportException(Exception): @@ -71,6 +71,7 @@ def get(self, report_type): for doc in report.build(): writer.writerow(doc) + # Need to close and reopen file to flush buffer into file csv_file.close() self.response.app_iter = open(os.path.join(tempdir.name, 'acceslog.csv'), 'r') self.response.headers['Content-Type'] = 'text/csv' @@ -504,7 +505,7 @@ def __init__(self, params): if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') for access_type in access_types: - if access_type not in ['user_login', 'view_container']: + if access_type not in ['user_login', 'view_container', 'download_file']: raise APIReportParamsException('Not a valid access type') self.start_date = start_date diff --git a/test/integration_tests/python/test_reports.py b/test/integration_tests/python/test_reports.py index 49a8a4170..670594924 100644 --- a/test/integration_tests/python/test_reports.py +++ b/test/integration_tests/python/test_reports.py @@ -2,7 +2,6 @@ import copy import datetime - # create timestamps for report filtering today = datetime.datetime.today() ts_format = '{:%Y-%m-%dT%H:%M:%S+00:00}' @@ -93,7 +92,7 @@ def test_project_report(data_builder, as_admin, as_user): assert len(project_report['projects']) == 2 -def test_access_log_report(with_user, as_user, as_admin): +def test_access_log_report(data_builder, with_user, as_user, as_admin): # try to get access log report as user r = as_user.get('/report/accesslog') assert r.status_code == 403 @@ -135,6 +134,50 @@ def test_access_log_report(with_user, as_user, as_admin): assert len(accesslog) == 1 assert accesslog[0]['access_type'] == 
'user_login' + # get access log report of certain subject + project = data_builder.create_project() + r = as_admin.post('/sessions', json={ + 'project': project, + 'label': 'test-accesslog-session', + 'timestamp': '1979-01-01T00:00:00+00:00', + 'subject': {'code': 'compliant5'} + }) + assert r.ok + session = r.json()['_id'] + r = as_admin.get('/sessions/' + session) + assert r.ok + session = r.json()['_id'] + r = as_admin.get('/sessions/' + session) + assert r.ok + + r = as_admin.get('/report/accesslog', params={'subject': 'compliant5'}) + assert r.ok + for count,log in enumerate(r.json(), start = 1): + assert log.get('context', {}).get('subject', {}).get('label') == 'compliant5' + assert count == 2 + r = as_admin.delete('/sessions/' + session) + data_builder.delete_project(project, recursive=True) + + # get access log report of certain access types + r = as_admin.get('/report/accesslog', params={'access_types': ['user_login', 'view_container']}) + assert r.ok + ul, vc = False, False + + # test that each item in log is either view_container or user_login + for log in r.json(): + assert log.get('access_type') in ['user_login', 'view_container'] + if log.get('access_type') == 'user_login': + ul = True + elif log.get('access_type') == 'view_container': + vc = True + assert ul and vc + + # Download .csv file + r = as_admin.get('/report/accesslog', params={'csv': 'true'}) + assert r.ok + + r.content[0][:3] == '_id' + def test_usage_report(data_builder, file_form, as_user, as_admin): # try to get usage report as user From 4925e1fe2d6f3df18562479b9680b925bb14d1e1 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 16:11:46 -0500 Subject: [PATCH 5/9] added endpoint for types --- api/api.py | 1 + api/handlers/reporthandler.py | 14 ++++++++++---- api/web/request.py | 1 + test/integration_tests/python/test_reports.py | 10 +++++++++- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/api/api.py b/api/api.py index bddf373c4..d986423ef 100644 --- 
a/api/api.py +++ b/api/api.py @@ -97,6 +97,7 @@ def prefix(path, routes): route('/resolve', ResolveHandler, h='resolve', m=['POST']), route('/schemas/', SchemaHandler, m=['GET']), route('/report/', ReportHandler, m=['GET']), + route('/report/accesslog/types', ReportHandler, h='get_types', m=['GET']), # Search diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 379e09363..3e6d4f96b 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -12,6 +12,8 @@ from .. import util from ..web import base +from ..web.request import AccessTypeList + EIGHTEEN_YEARS_IN_SEC = 18 * 365.25 * 24 * 60 * 60 BYTES_IN_MEGABYTE = float(1<<20) @@ -48,6 +50,9 @@ class ReportHandler(base.RequestHandler): def __init__(self, request=None, response=None): super(ReportHandler, self).__init__(request, response) + def get_types(self): + return AccessTypeList + def get(self, report_type): report = None @@ -62,6 +67,7 @@ def get(self, report_type): raise NotImplementedError('Report type {} is not supported'.format(report_type)) if self.superuser_request or report.user_can_generate(self.uid): + # If csv is true create a temp file to respond with if report_type == 'accesslog' and self.request.params.get('csv') == 'true': tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) csv_file = open(os.path.join(tempdir.name, 'acceslog.csv'), 'w+') @@ -505,7 +511,7 @@ def __init__(self, params): if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') for access_type in access_types: - if access_type not in ['user_login', 'view_container', 'download_file']: + if access_type not in AccessTypeList: raise APIReportParamsException('Not a valid access type') self.start_date = start_date @@ -526,7 +532,7 @@ def user_can_generate(self, uid): def flatten(self, json_obj, flat, prefix = ""): """ - flattens a + flattens a document to not have nested objects """ for field in json_obj.keys(): 
if isinstance(json_obj[field], dict): @@ -535,7 +541,7 @@ def flatten(self, json_obj, flat, prefix = ""): flat[prefix + field] = json_obj[field] return flat - def make_csv(self, cursor): + def make_csv_ready(self, cursor): return [self.flatten(json_obj, {}) for json_obj in cursor] def build(self): @@ -557,7 +563,7 @@ def build(self): cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) if self.csv_bool: - return self.make_csv(cursor) + return self.make_csv_ready(cursor) return cursor diff --git a/api/web/request.py b/api/web/request.py index 0d6a6c4cf..7421eef65 100644 --- a/api/web/request.py +++ b/api/web/request.py @@ -16,6 +16,7 @@ 'user_login': 'user_login', 'user_logout': 'user_logout' }) +AccessTypeList = [type_name for type_name, member in AccessType.__members__.items()] class SciTranRequest(Request): diff --git a/test/integration_tests/python/test_reports.py b/test/integration_tests/python/test_reports.py index 670594924..d3cfa0b74 100644 --- a/test/integration_tests/python/test_reports.py +++ b/test/integration_tests/python/test_reports.py @@ -1,6 +1,7 @@ import calendar import copy import datetime +from api.web.request import AccessTypeList # create timestamps for report filtering today = datetime.datetime.today() @@ -144,6 +145,8 @@ def test_access_log_report(data_builder, with_user, as_user, as_admin): }) assert r.ok session = r.json()['_id'] + + # In order to have two logs of this subject (POST does not create a log) r = as_admin.get('/sessions/' + session) assert r.ok session = r.json()['_id'] @@ -176,7 +179,12 @@ def test_access_log_report(data_builder, with_user, as_user, as_admin): r = as_admin.get('/report/accesslog', params={'csv': 'true'}) assert r.ok - r.content[0][:3] == '_id' + r.content[0][:3] == '_id' + + # get the access types + r = as_admin.get('/report/accesslog/types') + assert r.ok + assert r.json() == AccessTypeList def test_usage_report(data_builder, file_form, as_user, as_admin): 
From 53aa27d1c1e552b67d772efc9fd4abf70e8100d6 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Thu, 13 Jul 2017 14:12:53 -0500 Subject: [PATCH 6/9] script to create large csv file started --- api/handlers/reporthandler.py | 34 +++++++++++----- bin/log_csv.py | 77 +++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 11 deletions(-) create mode 100644 bin/log_csv.py diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 3e6d4f96b..c264f435d 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -22,6 +22,10 @@ "access_type", "context.acquisition.id", "context.acquisition.label", + "context.analysis.id", + "context.analysis.label", + "context.collection.id", + "context.collection.label", "context.group.id", "context.group.label", "context.project.id", @@ -32,6 +36,8 @@ "context.subject.label", "context.ticket_id", "origin.id", + "origin.method", + "origin.name", "origin.type", "request_method", "request_path", @@ -69,19 +75,22 @@ def get(self, report_type): if self.superuser_request or report.user_can_generate(self.uid): # If csv is true create a temp file to respond with if report_type == 'accesslog' and self.request.params.get('csv') == 'true': + tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) - csv_file = open(os.path.join(tempdir.name, 'acceslog.csv'), 'w+') + csv_file = open(os.path.join(tempdir.name, 'accesslog.csv'), 'w+') writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) writer.writeheader() - - for doc in report.build(): - writer.writerow(doc) - + try: + for doc in report.build(): + writer.writerow(doc) + + except APIReportException as e: + self.abort(404, str(e)) # Need to close and reopen file to flush buffer into file csv_file.close() - self.response.app_iter = open(os.path.join(tempdir.name, 'acceslog.csv'), 'r') + self.response.app_iter = open(os.path.join(tempdir.name, 'accesslog.csv'), 'r') 
self.response.headers['Content-Type'] = 'text/csv' - self.response.headers['Content-Disposition'] = 'attachment; filename="acceslog.csv"' + self.response.headers['Content-Disposition'] = 'attachment; filename="accesslog.csv"' else: return report.build() else: @@ -493,8 +502,11 @@ def __init__(self, params): uid = params.get('user') limit= params.get('limit', 100) subject = params.get('subject', None) - access_types = params.getall('access_types') - csv_bool = params.get('csv') == 'true' + if params.get('bin') == 'true': + access_types = params.get('access_types', []) + else: + access_types = params.getall('access_types') + csv_bool = (params.get('csv') == 'true') if start_date: start_date = dateutil.parser.parse(start_date) @@ -534,6 +546,7 @@ def flatten(self, json_obj, flat, prefix = ""): """ flattens a document to not have nested objects """ + for field in json_obj.keys(): if isinstance(json_obj[field], dict): flat = self.flatten(json_obj[field], flat, prefix = prefix + field + ".") @@ -560,8 +573,7 @@ def build(self): if self.access_types: query['access_type'] = {'$in': self.access_types} - cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) - + cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING).batch_size(1000) if self.csv_bool: return self.make_csv_ready(cursor) diff --git a/bin/log_csv.py b/bin/log_csv.py new file mode 100644 index 000000000..ae6e65448 --- /dev/null +++ b/bin/log_csv.py @@ -0,0 +1,77 @@ +# This implementation as of July 19 2017 has these resource utilizations of the mongodb container: +# - 2 million entries: 1.50 Gb +# - 3 million entries: 2.05 Gb +# The entire docker application was given 6 Gb to use, when given the default 2 Gb, +# the process would frequently crash before 1 million entries were downloaded. 
+ +import argparse +import csv +import pymongo +import tarfile +import sys +import logging +import datetime + +from api.web.request import AccessTypeList +from api import config +from api.handlers.reporthandler import AccessLogReport, ACCESS_LOG_FIELDS + +ARG_TO_PARAMS= { + 'l': 'limit', + 's': 'start_date', + 'e': 'end_date', + 'u': 'uid', + 'j': 'subject', + 't': 'access_types' +} + +def download_large_csv(params): + """ + Script to download large csv files to avoid uwsgi worker running out of memory. + """ + lim = int(params['limit']) + params['csv'] = "true" + params['bin'] = "true" + params['limit'] = "100000" + + csv_file = open('accesslog.csv', 'w+') + writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) + writer.writeheader() + + while lim > 0: + print lim + params['limit'] = str(min(lim, 100000)) + report = AccessLogReport(params) + retort = report.build() + start_date = str(retort[-1]['timestamp']) + for doc in retort: + lim = lim - 1 + try: + writer.writerow(doc) + except UnicodeEncodeError as e: + continue + csv_file.flush() + params['start_date'] = start_date + + + + csv_file.close() + +def format_arg(args): + return {ARG_TO_PARAMS[arg]: args[arg] for arg in args if args[arg] != None} + +if __name__ == '__main__': + try: + parser = argparse.ArgumentParser() + parser.add_argument("-s", help="Start date", type=str) + parser.add_argument("-e", help="End date", type=str) + parser.add_argument("-u", help="User id", type=str) + parser.add_argument("-l", help="Limit", type=str) + parser.add_argument("-j", help="subJect", type=str) + parser.add_argument("-t", help="list of access Types", type=str, nargs='+') + + args = vars(parser.parse_args()) + download_large_csv(format_arg(args)) + except Exception as e: + logging.exception('Unexpected error in log_csv.py') + sys.exit(1) \ No newline at end of file From 52f3fad9d313134e7ac3394fb5279ce900b7bc20 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Thu, 20 Jul 2017 14:05:24 -0500 Subject: [PATCH 7/9] fixed 
duplicate writes --- api/config.py | 1 + bin/log_csv.py | 35 +++++++++++++++++++++++------------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/api/config.py b/api/config.py index 43bb3133d..c1fd29cc3 100644 --- a/api/config.py +++ b/api/config.py @@ -240,6 +240,7 @@ def initialize_db(): if __config['core']['access_log_enabled']: log_db.access_log.create_index('context.ticket_id') + log_db.access_log.create_index([('timestamp', pymongo.DESCENDING)]) create_or_recreate_ttl_index('authtokens', 'timestamp', 2592000) create_or_recreate_ttl_index('uploads', 'timestamp', 60) diff --git a/bin/log_csv.py b/bin/log_csv.py index ae6e65448..1b2c07ac9 100644 --- a/bin/log_csv.py +++ b/bin/log_csv.py @@ -29,7 +29,7 @@ def download_large_csv(params): """ Script to download large csv files to avoid uwsgi worker running out of memory. """ - lim = int(params['limit']) + entries = int(params['limit']) params['csv'] = "true" params['bin'] = "true" params['limit'] = "100000" @@ -37,24 +37,35 @@ def download_large_csv(params): csv_file = open('accesslog.csv', 'w+') writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) writer.writeheader() - - while lim > 0: - print lim - params['limit'] = str(min(lim, 100000)) + unicode_err_count = 0 + while entries > 0: + print "{} entries left".format(entries) + params['limit'] = str(min(entries, 100000)) report = AccessLogReport(params) - retort = report.build() - start_date = str(retort[-1]['timestamp']) - for doc in retort: - lim = lim - 1 + rep = report.build() + end_date = str(rep[-1]['timestamp']) + for doc in rep[:-1]: + entries = entries - 1 try: writer.writerow(doc) except UnicodeEncodeError as e: + unicode_err_count += 1 + continue + + if len(rep) == 1: + entries = 0 + try: + writer.writerow(rep[0]) + except UnicodeEncodeError as e: + unicode_err_count += 1 continue + if len(rep) < int(params['limit']) - 1: + entries = 0 csv_file.flush() - params['start_date'] = start_date + params['end_date'] = end_date - + print 
"Encountered unicode errors and skipped {} entries".format(unicode_err_count) csv_file.close() def format_arg(args): @@ -74,4 +85,4 @@ def format_arg(args): download_large_csv(format_arg(args)) except Exception as e: logging.exception('Unexpected error in log_csv.py') - sys.exit(1) \ No newline at end of file + sys.exit(1) From 8a35f9265cb0ce1644d047238d2330c8e4ee5aa2 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Fri, 21 Jul 2017 11:02:37 -0500 Subject: [PATCH 8/9] using unicodecsv --- api/handlers/reporthandler.py | 2 +- bin/log_csv.py | 14 +++----------- requirements.txt | 1 + 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index c264f435d..dd7210729 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -1,5 +1,5 @@ import copy -import csv +import unicodecsv as csv import datetime import os diff --git a/bin/log_csv.py b/bin/log_csv.py index 1b2c07ac9..4ed1b7ced 100644 --- a/bin/log_csv.py +++ b/bin/log_csv.py @@ -5,7 +5,7 @@ # the process would frequently crash before 1 million entries were downloaded. 
import argparse -import csv +import unicodecsv as csv import pymongo import tarfile import sys @@ -46,19 +46,11 @@ def download_large_csv(params): end_date = str(rep[-1]['timestamp']) for doc in rep[:-1]: entries = entries - 1 - try: - writer.writerow(doc) - except UnicodeEncodeError as e: - unicode_err_count += 1 - continue + writer.writerow(doc) if len(rep) == 1: entries = 0 - try: - writer.writerow(rep[0]) - except UnicodeEncodeError as e: - unicode_err_count += 1 - continue + writer.writerow(rep[0]) if len(rep) < int(params['limit']) - 1: entries = 0 csv_file.flush() diff --git a/requirements.txt b/requirements.txt index 8685fa0b7..33113dfdc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ uwsgi==2.0.13.1 webapp2==2.5.2 WebOb==1.5.1 git+https://github.com/flywheel-io/gears.git@v0.1.1#egg=gears +unicodecsv==0.9.0 From dbd6da10af356362db896ba6d7032d8ea00d0042 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Thu, 27 Jul 2017 15:50:34 -0500 Subject: [PATCH 9/9] limit for limit parameter --- .gitignore | 1 + api/handlers/reporthandler.py | 2 ++ test/integration_tests/python/test_reports.py | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/.gitignore b/.gitignore index 477cd785c..14982fa1d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ bootstrap.json coverage.xml /htmlcov node_modules/ +/bin/accesslog.csv diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index dd7210729..f935f0e25 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -522,6 +522,8 @@ def __init__(self, params): raise APIReportParamsException('Limit must be an integer greater than 0.') if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') + elif limit > 10000: + raise APIReportParamsException('Limit exceeds 10,000 entries, please contact admin to run script.') for access_type in access_types: if access_type not in AccessTypeList: raise APIReportParamsException('Not a 
valid access type') diff --git a/test/integration_tests/python/test_reports.py b/test/integration_tests/python/test_reports.py index d3cfa0b74..17ba54e1d 100644 --- a/test/integration_tests/python/test_reports.py +++ b/test/integration_tests/python/test_reports.py @@ -116,6 +116,12 @@ def test_access_log_report(data_builder, with_user, as_user, as_admin): r = as_admin.get('/report/accesslog', params={'limit': 0}) assert r.status_code == 400 + # try to get report w/ limit == 10000 and limit > 10000 + r = as_admin.get('/report/accesslog', params={'limit': 10000}) + assert r.ok + r = as_admin.get('/report/accesslog', params={'limit': 10001}) + assert r.status_code == 400 + # get access log report for user r = as_admin.get('/report/accesslog', params={ 'start_date': yesterday_ts, 'end_date': tomorrow_ts, 'user': with_user.user