From ff573ff6823d1075a687b8bfa860bc2ec688fca2 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Mon, 12 Jun 2017 11:30:08 -0500 Subject: [PATCH 1/9] subject and access_types params added --- api/handlers/reporthandler.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 174d4d43b..a5875cfb8 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -437,6 +437,8 @@ def __init__(self, params): :end_date: ISO formatted timestamp :uid: user id of the target user :limit: number of records to return + :subject: subject code of session accessed + :access_types: list of access_types to filter logs """ super(AccessLogReport, self).__init__(params) @@ -445,6 +447,8 @@ def __init__(self, params): end_date = params.get('end_date') uid = params.get('user') limit= params.get('limit', 100) + subject = params.get('subject', None) + access_types = params.getall('access_types') if start_date: start_date = dateutil.parser.parse(start_date) @@ -460,12 +464,16 @@ def __init__(self, params): raise APIReportParamsException('Limit must be an integer greater than 0.') if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') + for access_type in access_types: + if access_type not in ['user_login', 'view_container']: + raise APIReportParamsException('Not a valid access type') self.start_date = start_date self.end_date = end_date self.uid = uid self.limit = limit - + self.subject = subject + self.access_types = access_types def user_can_generate(self, uid): """ @@ -487,6 +495,10 @@ def build(self): query['timestamp']['$gte'] = self.start_date if self.end_date: query['timestamp']['$lte'] = self.end_date + if self.subject: + query['context.subject.label'] = self.subject + if self.access_types: + query['access_type'] = {'$in': self.access_types} return config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', 
pymongo.DESCENDING) From a2636e7871ea111e03036355d160d4a188cab51c Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 10:45:06 -0500 Subject: [PATCH 2/9] writes accesslog to csv file --- api/handlers/reporthandler.py | 58 +++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index a5875cfb8..e12f567b3 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -1,4 +1,5 @@ import copy +import csv import datetime import bson @@ -12,6 +13,26 @@ EIGHTEEN_YEARS_IN_SEC = 18 * 365.25 * 24 * 60 * 60 BYTES_IN_MEGABYTE = float(1<<20) +ACCESS_LOG_FIELDS = [ + "context.session.label", + "context.project.id", + "context.subject.label", + "context.ticket_id", + "context.acquisition.id", + "context.acquisition.label", + "timestamp", + "access_type", + "context.group.id", + "request_method", + "context.subject.id", + "request_path", + "context.group.label", + "context.project.label", + "origin.id", + "_id", + "context.session.id", + "origin.type" +] class APIReportException(Exception): pass @@ -39,7 +60,19 @@ def get(self, report_type): raise NotImplementedError('Report type {} is not supported'.format(report_type)) if self.superuser_request or report.user_can_generate(self.uid): - return report.build() + if report_type == 'accesslog' and self.request.params.get('csv') == 'true': + csv_file = open("acceslog.csv", 'w+') + writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) + writer.writeheader() + + for doc in report.build(): + writer.writerow(doc) + + self.response.app_iter = csv_file + self.response.headers['Content-Type'] = 'text/csv' + self.response.headers['Content-Disposition'] = 'attachment; filename="acceslog.csv"' + else: + return report.build() else: self.abort(403, 'User {} does not have required permissions to generate report'.format(self.uid)) @@ -439,6 +472,7 @@ def __init__(self, params): :limit: number of records to 
return :subject: subject code of session accessed :access_types: list of access_types to filter logs + :csv: Boolean if user wants csv file """ super(AccessLogReport, self).__init__(params) @@ -449,6 +483,7 @@ def __init__(self, params): limit= params.get('limit', 100) subject = params.get('subject', None) access_types = params.getall('access_types') + csv_bool = params.get('csv') == 'true' if start_date: start_date = dateutil.parser.parse(start_date) @@ -474,6 +509,7 @@ def __init__(self, params): self.limit = limit self.subject = subject self.access_types = access_types + self.csv_bool = csv_bool def user_can_generate(self, uid): """ @@ -483,6 +519,19 @@ def user_can_generate(self, uid): return True return False + def flatten(self, json_obj, flat, prefix = ""): + """ + flattens a + """ + for field in json_obj.keys(): + if isinstance(json_obj[field], dict): + flat = self.flatten(json_obj[field], flat, prefix = prefix + field + ".") + else: + flat[prefix + field] = json_obj[field] + return flat + + def make_csv(self, cursor): + return [self.flatten(json_obj, {}) for json_obj in cursor] def build(self): query = {} @@ -500,7 +549,12 @@ def build(self): if self.access_types: query['access_type'] = {'$in': self.access_types} - return config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) + cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) + + if self.csv_bool: + return self.make_csv(cursor) + + return cursor class UsageReport(Report): """ From 7a5119c2ba22c786e40cf3fbedb15764b6ffbdec Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 11:19:15 -0500 Subject: [PATCH 3/9] csv file can be downloaded --- api/handlers/reporthandler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index e12f567b3..43e321e00 100644 --- a/api/handlers/reporthandler.py +++ 
b/api/handlers/reporthandler.py @@ -1,12 +1,14 @@ import copy import csv import datetime +import os import bson import dateutil import pymongo from .. import config +from .. import tempdir as tempfile from .. import util from ..web import base @@ -61,14 +63,16 @@ def get(self, report_type): if self.superuser_request or report.user_can_generate(self.uid): if report_type == 'accesslog' and self.request.params.get('csv') == 'true': - csv_file = open("acceslog.csv", 'w+') + tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) + csv_file = open(os.path.join(tempdir.name, 'acceslog.csv'), 'w+') writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) writer.writeheader() for doc in report.build(): writer.writerow(doc) - self.response.app_iter = csv_file + csv_file.close() + self.response.app_iter = open(os.path.join(tempdir.name, 'acceslog.csv'), 'r') self.response.headers['Content-Type'] = 'text/csv' self.response.headers['Content-Disposition'] = 'attachment; filename="acceslog.csv"' else: From 6356e1777ebe2fa0c8f89fab521ca172c804245e Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 15:01:50 -0500 Subject: [PATCH 4/9] tested new functionality of access log reports --- api/handlers/reporthandler.py | 27 ++++++----- test/integration_tests/python/test_reports.py | 47 ++++++++++++++++++- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 43e321e00..379e09363 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -16,24 +16,24 @@ EIGHTEEN_YEARS_IN_SEC = 18 * 365.25 * 24 * 60 * 60 BYTES_IN_MEGABYTE = float(1<<20) ACCESS_LOG_FIELDS = [ - "context.session.label", - "context.project.id", - "context.subject.label", - "context.ticket_id", + "_id", + "access_type", "context.acquisition.id", "context.acquisition.label", - "timestamp", - "access_type", "context.group.id", - "request_method", - 
"context.subject.id", - "request_path", "context.group.label", + "context.project.id", "context.project.label", - "origin.id", - "_id", "context.session.id", - "origin.type" + "context.session.label", + "context.subject.id", + "context.subject.label", + "context.ticket_id", + "origin.id", + "origin.type", + "request_method", + "request_path", + "timestamp" ] class APIReportException(Exception): @@ -71,6 +71,7 @@ def get(self, report_type): for doc in report.build(): writer.writerow(doc) + # Need to close and reopen file to flush buffer into file csv_file.close() self.response.app_iter = open(os.path.join(tempdir.name, 'acceslog.csv'), 'r') self.response.headers['Content-Type'] = 'text/csv' @@ -504,7 +505,7 @@ def __init__(self, params): if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') for access_type in access_types: - if access_type not in ['user_login', 'view_container']: + if access_type not in ['user_login', 'view_container', 'download_file']: raise APIReportParamsException('Not a valid access type') self.start_date = start_date diff --git a/test/integration_tests/python/test_reports.py b/test/integration_tests/python/test_reports.py index 49a8a4170..670594924 100644 --- a/test/integration_tests/python/test_reports.py +++ b/test/integration_tests/python/test_reports.py @@ -2,7 +2,6 @@ import copy import datetime - # create timestamps for report filtering today = datetime.datetime.today() ts_format = '{:%Y-%m-%dT%H:%M:%S+00:00}' @@ -93,7 +92,7 @@ def test_project_report(data_builder, as_admin, as_user): assert len(project_report['projects']) == 2 -def test_access_log_report(with_user, as_user, as_admin): +def test_access_log_report(data_builder, with_user, as_user, as_admin): # try to get access log report as user r = as_user.get('/report/accesslog') assert r.status_code == 403 @@ -135,6 +134,50 @@ def test_access_log_report(with_user, as_user, as_admin): assert len(accesslog) == 1 assert accesslog[0]['access_type'] == 
'user_login' + # get access log report of certain subject + project = data_builder.create_project() + r = as_admin.post('/sessions', json={ + 'project': project, + 'label': 'test-accesslog-session', + 'timestamp': '1979-01-01T00:00:00+00:00', + 'subject': {'code': 'compliant5'} + }) + assert r.ok + session = r.json()['_id'] + r = as_admin.get('/sessions/' + session) + assert r.ok + session = r.json()['_id'] + r = as_admin.get('/sessions/' + session) + assert r.ok + + r = as_admin.get('/report/accesslog', params={'subject': 'compliant5'}) + assert r.ok + for count,log in enumerate(r.json(), start = 1): + assert log.get('context', {}).get('subject', {}).get('label') == 'compliant5' + assert count == 2 + r = as_admin.delete('/sessions/' + session) + data_builder.delete_project(project, recursive=True) + + # get access log report of certain access types + r = as_admin.get('/report/accesslog', params={'access_types': ['user_login', 'view_container']}) + assert r.ok + ul, vc = False, False + + # test that each item in log is either view_container or user_login + for log in r.json(): + assert log.get('access_type') in ['user_login', 'view_container'] + if log.get('access_type') == 'user_login': + ul = True + elif log.get('access_type') == 'view_container': + vc = True + assert ul and vc + + # Download .csv file + r = as_admin.get('/report/accesslog', params={'csv': 'true'}) + assert r.ok + + r.content[0][:3] == '_id' + def test_usage_report(data_builder, file_form, as_user, as_admin): # try to get usage report as user From 4925e1fe2d6f3df18562479b9680b925bb14d1e1 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Tue, 13 Jun 2017 16:11:46 -0500 Subject: [PATCH 5/9] added endpoint for types --- api/api.py | 1 + api/handlers/reporthandler.py | 14 ++++++++++---- api/web/request.py | 1 + test/integration_tests/python/test_reports.py | 10 +++++++++- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/api/api.py b/api/api.py index bddf373c4..d986423ef 100644 --- 
a/api/api.py +++ b/api/api.py @@ -97,6 +97,7 @@ def prefix(path, routes): route('/resolve', ResolveHandler, h='resolve', m=['POST']), route('/schemas/', SchemaHandler, m=['GET']), route('/report/', ReportHandler, m=['GET']), + route('/report/accesslog/types', ReportHandler, h='get_types', m=['GET']), # Search diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 379e09363..3e6d4f96b 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -12,6 +12,8 @@ from .. import util from ..web import base +from ..web.request import AccessTypeList + EIGHTEEN_YEARS_IN_SEC = 18 * 365.25 * 24 * 60 * 60 BYTES_IN_MEGABYTE = float(1<<20) @@ -48,6 +50,9 @@ class ReportHandler(base.RequestHandler): def __init__(self, request=None, response=None): super(ReportHandler, self).__init__(request, response) + def get_types(self): + return AccessTypeList + def get(self, report_type): report = None @@ -62,6 +67,7 @@ def get(self, report_type): raise NotImplementedError('Report type {} is not supported'.format(report_type)) if self.superuser_request or report.user_can_generate(self.uid): + # If csv is true create a temp file to respond with if report_type == 'accesslog' and self.request.params.get('csv') == 'true': tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) csv_file = open(os.path.join(tempdir.name, 'acceslog.csv'), 'w+') @@ -505,7 +511,7 @@ def __init__(self, params): if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') for access_type in access_types: - if access_type not in ['user_login', 'view_container', 'download_file']: + if access_type not in AccessTypeList: raise APIReportParamsException('Not a valid access type') self.start_date = start_date @@ -526,7 +532,7 @@ def user_can_generate(self, uid): def flatten(self, json_obj, flat, prefix = ""): """ - flattens a + flattens a document to not have nested objects """ for field in json_obj.keys(): 
if isinstance(json_obj[field], dict): @@ -535,7 +541,7 @@ def flatten(self, json_obj, flat, prefix = ""): flat[prefix + field] = json_obj[field] return flat - def make_csv(self, cursor): + def make_csv_ready(self, cursor): return [self.flatten(json_obj, {}) for json_obj in cursor] def build(self): @@ -557,7 +563,7 @@ def build(self): cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) if self.csv_bool: - return self.make_csv(cursor) + return self.make_csv_ready(cursor) return cursor diff --git a/api/web/request.py b/api/web/request.py index 0d6a6c4cf..7421eef65 100644 --- a/api/web/request.py +++ b/api/web/request.py @@ -16,6 +16,7 @@ 'user_login': 'user_login', 'user_logout': 'user_logout' }) +AccessTypeList = [type_name for type_name, member in AccessType.__members__.items()] class SciTranRequest(Request): diff --git a/test/integration_tests/python/test_reports.py b/test/integration_tests/python/test_reports.py index 670594924..d3cfa0b74 100644 --- a/test/integration_tests/python/test_reports.py +++ b/test/integration_tests/python/test_reports.py @@ -1,6 +1,7 @@ import calendar import copy import datetime +from api.web.request import AccessTypeList # create timestamps for report filtering today = datetime.datetime.today() @@ -144,6 +145,8 @@ def test_access_log_report(data_builder, with_user, as_user, as_admin): }) assert r.ok session = r.json()['_id'] + + # In order to have two logs of this subject (POST does not create a log) r = as_admin.get('/sessions/' + session) assert r.ok session = r.json()['_id'] @@ -176,7 +179,12 @@ def test_access_log_report(data_builder, with_user, as_user, as_admin): r = as_admin.get('/report/accesslog', params={'csv': 'true'}) assert r.ok - r.content[0][:3] == '_id' + r.content[0][:3] == '_id' + + # get the access types + r = as_admin.get('/report/accesslog/types') + assert r.ok + assert r.json() == AccessTypeList def test_usage_report(data_builder, file_form, as_user, as_admin): 
From 53aa27d1c1e552b67d772efc9fd4abf70e8100d6 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Thu, 13 Jul 2017 14:12:53 -0500 Subject: [PATCH 6/9] script to create large csv file started --- api/handlers/reporthandler.py | 34 +++++++++++----- bin/log_csv.py | 77 +++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 11 deletions(-) create mode 100644 bin/log_csv.py diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index 3e6d4f96b..c264f435d 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -22,6 +22,10 @@ "access_type", "context.acquisition.id", "context.acquisition.label", + "context.analysis.id", + "context.analysis.label", + "context.collection.id", + "context.collection.label", "context.group.id", "context.group.label", "context.project.id", @@ -32,6 +36,8 @@ "context.subject.label", "context.ticket_id", "origin.id", + "origin.method", + "origin.name", "origin.type", "request_method", "request_path", @@ -69,19 +75,22 @@ def get(self, report_type): if self.superuser_request or report.user_can_generate(self.uid): # If csv is true create a temp file to respond with if report_type == 'accesslog' and self.request.params.get('csv') == 'true': + tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) - csv_file = open(os.path.join(tempdir.name, 'acceslog.csv'), 'w+') + csv_file = open(os.path.join(tempdir.name, 'accesslog.csv'), 'w+') writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) writer.writeheader() - - for doc in report.build(): - writer.writerow(doc) - + try: + for doc in report.build(): + writer.writerow(doc) + + except APIReportException as e: + self.abort(404, str(e)) # Need to close and reopen file to flush buffer into file csv_file.close() - self.response.app_iter = open(os.path.join(tempdir.name, 'acceslog.csv'), 'r') + self.response.app_iter = open(os.path.join(tempdir.name, 'accesslog.csv'), 'r') 
self.response.headers['Content-Type'] = 'text/csv' - self.response.headers['Content-Disposition'] = 'attachment; filename="acceslog.csv"' + self.response.headers['Content-Disposition'] = 'attachment; filename="accesslog.csv"' else: return report.build() else: @@ -493,8 +502,11 @@ def __init__(self, params): uid = params.get('user') limit= params.get('limit', 100) subject = params.get('subject', None) - access_types = params.getall('access_types') - csv_bool = params.get('csv') == 'true' + if params.get('bin') == 'true': + access_types = params.get('access_types', []) + else: + access_types = params.getall('access_types') + csv_bool = (params.get('csv') == 'true') if start_date: start_date = dateutil.parser.parse(start_date) @@ -534,6 +546,7 @@ def flatten(self, json_obj, flat, prefix = ""): """ flattens a document to not have nested objects """ + for field in json_obj.keys(): if isinstance(json_obj[field], dict): flat = self.flatten(json_obj[field], flat, prefix = prefix + field + ".") @@ -560,8 +573,7 @@ def build(self): if self.access_types: query['access_type'] = {'$in': self.access_types} - cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING) - + cursor = config.log_db.access_log.find(query).limit(self.limit).sort('timestamp', pymongo.DESCENDING).batch_size(1000) if self.csv_bool: return self.make_csv_ready(cursor) diff --git a/bin/log_csv.py b/bin/log_csv.py new file mode 100644 index 000000000..ae6e65448 --- /dev/null +++ b/bin/log_csv.py @@ -0,0 +1,77 @@ +# This implementation as of July 19 2017 has these resource utilizations of the mongodb container: +# - 2 million entries: 1.50 Gb +# - 3 million entries: 2.05 Gb +# The entire docker application was given 6 Gb to use, when given the default 2 Gb, +# the process would frequently crash before 1 million entries were downloaded. 
+ +import argparse +import csv +import pymongo +import tarfile +import sys +import logging +import datetime + +from api.web.request import AccessTypeList +from api import config +from api.handlers.reporthandler import AccessLogReport, ACCESS_LOG_FIELDS + +ARG_TO_PARAMS= { + 'l': 'limit', + 's': 'start_date', + 'e': 'end_date', + 'u': 'uid', + 'j': 'subject', + 't': 'access_types' +} + +def download_large_csv(params): + """ + Script to download large csv files to avoid uwsgi worker running out of memory. + """ + lim = int(params['limit']) + params['csv'] = "true" + params['bin'] = "true" + params['limit'] = "100000" + + csv_file = open('accesslog.csv', 'w+') + writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) + writer.writeheader() + + while lim > 0: + print lim + params['limit'] = str(min(lim, 100000)) + report = AccessLogReport(params) + retort = report.build() + start_date = str(retort[-1]['timestamp']) + for doc in retort: + lim = lim - 1 + try: + writer.writerow(doc) + except UnicodeEncodeError as e: + continue + csv_file.flush() + params['start_date'] = start_date + + + + csv_file.close() + +def format_arg(args): + return {ARG_TO_PARAMS[arg]: args[arg] for arg in args if args[arg] != None} + +if __name__ == '__main__': + try: + parser = argparse.ArgumentParser() + parser.add_argument("-s", help="Start date", type=str) + parser.add_argument("-e", help="End date", type=str) + parser.add_argument("-u", help="User id", type=str) + parser.add_argument("-l", help="Limit", type=str) + parser.add_argument("-j", help="subJect", type=str) + parser.add_argument("-t", help="list of access Types", type=str, nargs='+') + + args = vars(parser.parse_args()) + download_large_csv(format_arg(args)) + except Exception as e: + logging.exception('Unexpected error in log_csv.py') + sys.exit(1) \ No newline at end of file From 52f3fad9d313134e7ac3394fb5279ce900b7bc20 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Thu, 20 Jul 2017 14:05:24 -0500 Subject: [PATCH 7/9] fixed 
duplicate writes --- api/config.py | 1 + bin/log_csv.py | 35 +++++++++++++++++++++++------------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/api/config.py b/api/config.py index 43bb3133d..c1fd29cc3 100644 --- a/api/config.py +++ b/api/config.py @@ -240,6 +240,7 @@ def initialize_db(): if __config['core']['access_log_enabled']: log_db.access_log.create_index('context.ticket_id') + log_db.access_log.create_index([('timestamp', pymongo.DESCENDING)]) create_or_recreate_ttl_index('authtokens', 'timestamp', 2592000) create_or_recreate_ttl_index('uploads', 'timestamp', 60) diff --git a/bin/log_csv.py b/bin/log_csv.py index ae6e65448..1b2c07ac9 100644 --- a/bin/log_csv.py +++ b/bin/log_csv.py @@ -29,7 +29,7 @@ def download_large_csv(params): """ Script to download large csv files to avoid uwsgi worker running out of memory. """ - lim = int(params['limit']) + entries = int(params['limit']) params['csv'] = "true" params['bin'] = "true" params['limit'] = "100000" @@ -37,24 +37,35 @@ def download_large_csv(params): csv_file = open('accesslog.csv', 'w+') writer = csv.DictWriter(csv_file, ACCESS_LOG_FIELDS) writer.writeheader() - - while lim > 0: - print lim - params['limit'] = str(min(lim, 100000)) + unicode_err_count = 0 + while entries > 0: + print "{} entries left".format(entries) + params['limit'] = str(min(entries, 100000)) report = AccessLogReport(params) - retort = report.build() - start_date = str(retort[-1]['timestamp']) - for doc in retort: - lim = lim - 1 + rep = report.build() + end_date = str(rep[-1]['timestamp']) + for doc in rep[:-1]: + entries = entries - 1 try: writer.writerow(doc) except UnicodeEncodeError as e: + unicode_err_count += 1 + continue + + if len(rep) == 1: + entries = 0 + try: + writer.writerow(rep[0]) + except UnicodeEncodeError as e: + unicode_err_count += 1 continue + if len(rep) < int(params['limit']) - 1: + entries = 0 csv_file.flush() - params['start_date'] = start_date + params['end_date'] = end_date - + print 
"Encountered unicode errors and skipped {} entries".format(unicode_err_count) csv_file.close() def format_arg(args): @@ -74,4 +85,4 @@ def format_arg(args): download_large_csv(format_arg(args)) except Exception as e: logging.exception('Unexpected error in log_csv.py') - sys.exit(1) \ No newline at end of file + sys.exit(1) From 8a35f9265cb0ce1644d047238d2330c8e4ee5aa2 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Fri, 21 Jul 2017 11:02:37 -0500 Subject: [PATCH 8/9] using unicodecsv --- api/handlers/reporthandler.py | 2 +- bin/log_csv.py | 14 +++----------- requirements.txt | 1 + 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index c264f435d..dd7210729 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -1,5 +1,5 @@ import copy -import csv +import unicodecsv as csv import datetime import os diff --git a/bin/log_csv.py b/bin/log_csv.py index 1b2c07ac9..4ed1b7ced 100644 --- a/bin/log_csv.py +++ b/bin/log_csv.py @@ -5,7 +5,7 @@ # the process would frequently crash before 1 million entries were downloaded. 
import argparse -import csv +import unicodecsv as csv import pymongo import tarfile import sys @@ -46,19 +46,11 @@ def download_large_csv(params): end_date = str(rep[-1]['timestamp']) for doc in rep[:-1]: entries = entries - 1 - try: - writer.writerow(doc) - except UnicodeEncodeError as e: - unicode_err_count += 1 - continue + writer.writerow(doc) if len(rep) == 1: entries = 0 - try: - writer.writerow(rep[0]) - except UnicodeEncodeError as e: - unicode_err_count += 1 - continue + writer.writerow(rep[0]) if len(rep) < int(params['limit']) - 1: entries = 0 csv_file.flush() diff --git a/requirements.txt b/requirements.txt index 8685fa0b7..33113dfdc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ uwsgi==2.0.13.1 webapp2==2.5.2 WebOb==1.5.1 git+https://github.com/flywheel-io/gears.git@v0.1.1#egg=gears +unicodecsv==0.9.0 From dbd6da10af356362db896ba6d7032d8ea00d0042 Mon Sep 17 00:00:00 2001 From: Harsha Kethineni Date: Thu, 27 Jul 2017 15:50:34 -0500 Subject: [PATCH 9/9] limit for limit parameter --- .gitignore | 1 + api/handlers/reporthandler.py | 2 ++ test/integration_tests/python/test_reports.py | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/.gitignore b/.gitignore index 477cd785c..14982fa1d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ bootstrap.json coverage.xml /htmlcov node_modules/ +/bin/accesslog.csv diff --git a/api/handlers/reporthandler.py b/api/handlers/reporthandler.py index dd7210729..f935f0e25 100644 --- a/api/handlers/reporthandler.py +++ b/api/handlers/reporthandler.py @@ -522,6 +522,8 @@ def __init__(self, params): raise APIReportParamsException('Limit must be an integer greater than 0.') if limit < 1: raise APIReportParamsException('Limit must be an integer greater than 0.') + elif limit > 10000: + raise APIReportParamsException('Limit exceeds 10,000 entries, please contact admin to run script.') for access_type in access_types: if access_type not in AccessTypeList: raise APIReportParamsException('Not a 
valid access type') diff --git a/test/integration_tests/python/test_reports.py b/test/integration_tests/python/test_reports.py index d3cfa0b74..17ba54e1d 100644 --- a/test/integration_tests/python/test_reports.py +++ b/test/integration_tests/python/test_reports.py @@ -116,6 +116,12 @@ def test_access_log_report(data_builder, with_user, as_user, as_admin): r = as_admin.get('/report/accesslog', params={'limit': 0}) assert r.status_code == 400 + # try to get report w/ limit == 10000 and limit > 10000 + r = as_admin.get('/report/accesslog', params={'limit': 10000}) + assert r.ok + r = as_admin.get('/report/accesslog', params={'limit': 10001}) + assert r.status_code == 400 + # get access log report for user r = as_admin.get('/report/accesslog', params={ 'start_date': yesterday_ts, 'end_date': tomorrow_ts, 'user': with_user.user