From 18265a2cb2dfd9ab0a24b021638b75a85f0bc1c8 Mon Sep 17 00:00:00 2001 From: David Caplan Date: Mon, 17 Apr 2017 21:46:02 -0400 Subject: [PATCH 1/5] add DatasetExport support (closes #150) --- setup.py | 4 +- solvebio/__init__.py | 4 +- solvebio/cli/ipython.py | 1 + solvebio/query.py | 18 +++++-- solvebio/resource/__init__.py | 2 + solvebio/resource/dataset.py | 86 ++++++------------------------ solvebio/resource/datasetexport.py | 56 +++++++++++++++++++ solvebio/test/test_exports.py | 27 +--------- 8 files changed, 95 insertions(+), 103 deletions(-) create mode 100644 solvebio/resource/datasetexport.py diff --git a/setup.py b/setup.py index 3b3b586f..2e58e895 100644 --- a/setup.py +++ b/setup.py @@ -55,9 +55,7 @@ include_package_data=True, install_requires=install_requires, platforms='any', - extras_require={ - 'exporters': ['XlsxWriter>=0.8.0', 'pandas>=0.10.0'], - }, + extras_require={}, entry_points={ 'console_scripts': ['solvebio = solvebio.cli.main:main'] }, diff --git a/solvebio/__init__.py b/solvebio/__init__.py index 43deb33b..633c652c 100644 --- a/solvebio/__init__.py +++ b/solvebio/__init__.py @@ -95,7 +95,8 @@ def emit(self, record): Depository, DepositoryVersion, Upload, Manifest, User, Dataset, DatasetField, - DatasetImport, DatasetCommit, DatasetTemplate + DatasetImport, DatasetExport, + DatasetCommit, DatasetTemplate ) @@ -134,6 +135,7 @@ def login(**kwargs): 'Dataset', 'DatasetField', 'DatasetImport', + 'DatasetExport', 'DatasetCommit', 'DatasetTemplate', 'Depository', diff --git a/solvebio/cli/ipython.py b/solvebio/cli/ipython.py index 1ce98221..1654721a 100644 --- a/solvebio/cli/ipython.py +++ b/solvebio/cli/ipython.py @@ -57,6 +57,7 @@ def launch_ipython_shell(args): # pylint: disable=unused-argument from solvebio import DatasetCommit # noqa from solvebio import DatasetField # noqa from solvebio import DatasetImport # noqa + from solvebio import DatasetExport # noqa from solvebio import DatasetTemplate # noqa from solvebio import Depository # 
noqa from solvebio import DepositoryVersion # noqa diff --git a/solvebio/query.py b/solvebio/query.py index d3af78a5..dc416535 100644 --- a/solvebio/query.py +++ b/solvebio/query.py @@ -6,7 +6,6 @@ from .client import client from .utils.printing import pretty_int from .utils.tabulate import tabulate -from .exporters import exporters from .errors import SolveError import copy @@ -629,8 +628,21 @@ def execute(self, offset=0, **query): % self._response) return _params, self._response - def export(self, exporter, *args, **kwargs): - return exporters.export(exporter, self, *args, **kwargs) + def export(self, format='json', follow=True, limit=None): + from solvebio import DatasetExport + + params = self._build_query(limit=limit) + params.pop('offset', None) + params.pop('ordering', None) + + export = DatasetExport.create( + dataset_id=self._dataset_id, + params=params) + + if follow: + export.follow() + + return export class BatchQuery(object): diff --git a/solvebio/resource/__init__.py b/solvebio/resource/__init__.py index f246bf4d..203167a6 100644 --- a/solvebio/resource/__init__.py +++ b/solvebio/resource/__init__.py @@ -7,6 +7,7 @@ from .dataset import Dataset from .datasetfield import DatasetField from .datasetimport import DatasetImport +from .datasetexport import DatasetExport from .datasetcommit import DatasetCommit from .datasettemplate import DatasetTemplate from .upload import Upload @@ -16,6 +17,7 @@ types = { 'Dataset': Dataset, 'DatasetImport': DatasetImport, + 'DatasetExport': DatasetExport, 'DatasetCommit': DatasetCommit, 'DatasetTemplate': DatasetTemplate, 'DatasetField': DatasetField, diff --git a/solvebio/resource/dataset.py b/solvebio/resource/dataset.py index 9cc7f7a9..53f1c3f3 100644 --- a/solvebio/resource/dataset.py +++ b/solvebio/resource/dataset.py @@ -10,8 +10,7 @@ from .apiresource import UpdateableAPIResource from .apiresource import DeletableAPIResource from .datasetfield import DatasetField - -from ..exporters import DatasetExportFile +from 
.datasetexport import DatasetExport class Dataset(CreateableAPIResource, @@ -226,74 +225,19 @@ def import_file(self, path, **kwargs): manifest=manifest.manifest, **kwargs) - def export(self, path, genome_build=None, format='json', - show_progress=True, download=True): - if 'exports_url' not in self: - if 'id' not in self or not self['id']: - raise Exception( - 'No Dataset ID was provided. ' - 'Please instantiate the Dataset ' - 'object with an ID or full_name.') - self['exports_url'] = self.instance_url() + '/exports' - - export = client.post(self['exports_url'], - {'format': format, - 'genome_build': genome_build}) - - print("Exporting dataset {0} to {1}" - .format(self['full_name'], path)) - - total_size = 0 - manifest = export['manifest'] - - for i, manifest_file in enumerate(manifest['files']): - total_size += manifest_file['size'] - export_file = DatasetExportFile( - url=manifest_file['url'], - path=path, - show_progress=show_progress) - - if not download: - print('Downloading is off, skipping file {0} ({1})' - .format(export_file.file_name, - naturalsize(manifest_file['size']))) - continue - - print('Downloading file {0}/{1}: {2} ({3})' - .format(i + 1, len(manifest['files']), - export_file.file_name, - naturalsize(manifest_file['size']))) - export_file.download() - - # Validate the MD5 of the downloaded file. - # Handle's S3's multipart MD5 calculation. 
- md5, blocks = md5sum( - multipart_threshold=manifest['multipart_threshold_bytes'], - multipart_chunksize=manifest['multipart_chunksize_bytes'] - ) - - if md5 != manifest_file['md5']: - print("### Export failed MD5 verification!") - print("### -------------------------------") - print("### File: {0}".format(export_file.file_name)) - print("### Expected MD5: {0}".format(manifest_file['md5'])) - print("### Calculated MD5: {0}".format(md5)) - if blocks and manifest_file['multipart_blocks'] != blocks: - print("### Multipart block size failed verification") - print("### Expected: {0} blocks" - .format(manifest_file['multipart_blocks'])) - print("### Found: {0} blocks".format(blocks)) - print("\n### Delete the following file and try again: {0}" - .format(export_file.file_name)) - print("### If the problem persists, please email " - "support@solvebio.com") - return None - - print("File {0} completed downloading and MD5 verification." - .format(export_file.file_name)) - - print('Number of files: {0}'.format(len(manifest['files']))) - print('Number of records: {0}'.format(export['documents_count'])) - print('Total size: {0}'.format(naturalsize(total_size))) + def export(self, format='json', follow=True, **kwargs): + if 'id' not in self or not self['id']: + raise Exception( + 'No Dataset ID was provided. 
' + 'Please instantiate the Dataset ' + 'object with an ID or full_name.') + + export = DatasetExport.create( + dataset_id=self['id'], + format=format, + **kwargs) + + if follow: + export.follow() return export diff --git a/solvebio/resource/datasetexport.py b/solvebio/resource/datasetexport.py new file mode 100644 index 00000000..2ba48326 --- /dev/null +++ b/solvebio/resource/datasetexport.py @@ -0,0 +1,56 @@ +from .apiresource import CreateableAPIResource +from .apiresource import ListableAPIResource +from .apiresource import DeletableAPIResource +from .apiresource import DownloadableAPIResource + +import time + + +class DatasetExport(CreateableAPIResource, ListableAPIResource, + DownloadableAPIResource, DeletableAPIResource): + """ + DatasetExport represents an export task that takes + a Dataset or filtered Dataset (Query) and exports + the contents to a flat file (CSV, JSON, or XLSX). + + For interactive use, DatasetExport can be "followed" to watch + the progression of the task. + """ + LIST_FIELDS = ( + ('id', 'ID'), + ('title', 'Title'), + ('description', 'Description'), + ('status', 'Status'), + ('created_at', 'Created'), + ) + + def dataset(self): + from .dataset import Dataset + return Dataset.retrieve(self['dataset']) + + def follow(self): + print("Waiting for export (id = {0}) to start...".format(self.id)) + print("View your export status on MESH: " + "https://my.solvebio.com/jobs/export/{0}" + .format(self.id)) + + export_status = self.status + + while self.status in ['queued', 'running']: + if self.status != export_status: + print("Export is now {0} (was {1})" + .format(self.status, export_status)) + export_status = self.status + + if self.status == 'running': + print("Export '{0}' is {1}: {2}/{3} records exported" + .format(self.id, + self.status, + self.metadata['progress']['processed_records'], + self.documents_count)) + + time.sleep(3) + self.refresh() + + if self.status == 'completed': + print("Export complete!") diff --git 
a/solvebio/test/test_exports.py b/solvebio/test/test_exports.py index d8558cec..701a290f 100644 --- a/solvebio/test/test_exports.py +++ b/solvebio/test/test_exports.py @@ -20,37 +20,14 @@ def setUp(self): self.query = self.dataset.query(filters=filters, fields=['rgd_id'], genome_build='GRCh37', limit=10) - # CSVExporter def test_csv_exporter(self): test_file = '/tmp/test_export.csv' reference_file = 'solvebio/test/data/test_export.csv' - self.query.export('csv', filename=test_file) + export = self.query.export(follow=True, format='csv') + export.download(test_file) self.assertTrue(path.isfile(test_file)) self.assertEqual( hashlib.sha1(open(test_file, 'rb').read()).hexdigest(), hashlib.sha1(open(reference_file, 'rb').read()).hexdigest() ) remove(test_file) - - # XLSXExporter - def test_excel_exporter(self): - test_file = '/tmp/test_export.xlsx' - # reference_file = 'solvebio/test/data/test_export.xlsx' - self.query.export('excel', filename=test_file) - self.assertTrue(path.isfile(test_file)) - remove(test_file) - - # JSONExporter - def test_json_exporter(self): - test_file = '/tmp/test_export.json' - reference_file = 'solvebio/test/data/test_export.json' - self.query.export('json', filename=test_file) - self.assertTrue(path.isfile(test_file)) - self.assertEqual( - hashlib.sha1(open(test_file, 'rb').read()).hexdigest(), - hashlib.sha1(open(reference_file, 'rb').read()).hexdigest() - ) - with open(test_file, 'r') as f: - for row in f: - self.assertTrue(json.loads(row)) - remove(test_file) From ac60713e3b233c1f9cc6479023ee33c1c7627a42 Mon Sep 17 00:00:00 2001 From: David Caplan Date: Mon, 17 Apr 2017 21:47:01 -0400 Subject: [PATCH 2/5] add deprecation note --- solvebio/exporters.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/solvebio/exporters.py b/solvebio/exporters.py index 5c7bf1d7..22f7beea 100644 --- a/solvebio/exporters.py +++ b/solvebio/exporters.py @@ -1,4 +1,10 @@ # -*- coding: utf-8 -*- +# +# **DEPRECATED** +# +# These exporters have been 
deprecated in favor +# of the DatasetExport resource. +# from __future__ import print_function from __future__ import unicode_literals From 81c9b80910b982d9efa284dd5f33ab308ef37ea5 Mon Sep 17 00:00:00 2001 From: David Caplan Date: Mon, 17 Apr 2017 21:57:44 -0400 Subject: [PATCH 3/5] fix upload attrs in test --- solvebio/test/test_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solvebio/test/test_upload.py b/solvebio/test/test_upload.py index da7a7bf0..6ae88b3e 100644 --- a/solvebio/test/test_upload.py +++ b/solvebio/test/test_upload.py @@ -10,7 +10,7 @@ class UploadTest(SolveBioTestCase): ('class_name', 'Upload'), ('description', None), ('md5', '587941d21d196eef3c17e7e12d3cc687'), - ('size', "590"), + ('size', 590), ] def test_upload_url(self): From 8b8af04f9781ba9e8f366a5343574d7a551d3018 Mon Sep 17 00:00:00 2001 From: David Caplan Date: Mon, 17 Apr 2017 22:07:18 -0400 Subject: [PATCH 4/5] flake8 and fix query limit issue --- solvebio/query.py | 2 ++ solvebio/resource/dataset.py | 2 -- solvebio/test/test_exports.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/solvebio/query.py b/solvebio/query.py index dc416535..156db406 100644 --- a/solvebio/query.py +++ b/solvebio/query.py @@ -631,6 +631,8 @@ def execute(self, offset=0, **query): def export(self, format='json', follow=True, limit=None): from solvebio import DatasetExport + if not limit and self._limit < float('inf'): + limit = self._limit params = self._build_query(limit=limit) params.pop('offset', None) params.pop('ordering', None) diff --git a/solvebio/resource/dataset.py b/solvebio/resource/dataset.py index 53f1c3f3..45f60f16 100644 --- a/solvebio/resource/dataset.py +++ b/solvebio/resource/dataset.py @@ -1,8 +1,6 @@ from ..client import client from ..help import open_help from ..query import Query -from ..utils.humanize import naturalsize -from ..utils.md5sum import md5sum from .solveobject import convert_to_solve_object from .apiresource import 
CreateableAPIResource diff --git a/solvebio/test/test_exports.py b/solvebio/test/test_exports.py index 701a290f..cdf0ec4c 100644 --- a/solvebio/test/test_exports.py +++ b/solvebio/test/test_exports.py @@ -6,7 +6,6 @@ from .helper import SolveBioTestCase from os import path, remove -import json class ExportsTests(SolveBioTestCase): From e79aa2379c869a32ffba93bc6ba07c73d2b7873d Mon Sep 17 00:00:00 2001 From: David Caplan Date: Tue, 18 Apr 2017 09:25:17 -0400 Subject: [PATCH 5/5] support specified filenames in download function; pass format to exporter --- solvebio/query.py | 1 + solvebio/resource/apiresource.py | 21 +++++++++++++-------- solvebio/test/test_exports.py | 6 ++++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/solvebio/query.py b/solvebio/query.py index 156db406..e80f27c3 100644 --- a/solvebio/query.py +++ b/solvebio/query.py @@ -639,6 +639,7 @@ def export(self, format='json', follow=True, limit=None): export = DatasetExport.create( dataset_id=self._dataset_id, + format=format, params=params) if follow: diff --git a/solvebio/resource/apiresource.py b/solvebio/resource/apiresource.py index 23fe4422..7b4c7861 100644 --- a/solvebio/resource/apiresource.py +++ b/solvebio/resource/apiresource.py @@ -154,18 +154,23 @@ class DownloadableAPIResource(APIResource): def download(self, path=None, **kwargs): """ - Download the file to the specified directory - (or a temp. dir if nothing is specified). + Download the file to the specified directory or file path. + Downloads to a temporary directory if no path is specified. + + Returns the absolute path to the file. 
""" download_url = self.download_url(**kwargs) + # Set default filename to the extracted name filename = download_url.split('%3B%20filename%3D')[1] if path: - path = os.path.dirname(os.path.expanduser(path)) + path = os.path.expanduser(path) + # If the path is a dir, use the extracted filename + if os.path.isdir(path): + path = os.path.join(path, filename) else: - path = tempfile.gettempdir() - - filename = os.path.join(path, filename) + # Create a temporary directory for the file + path = os.path.join(tempfile.gettempdir(), filename) try: response = requests.request(method='get', url=download_url) @@ -175,10 +180,10 @@ def download(self, path=None, **kwargs): if not (200 <= response.status_code < 400): _handle_api_error(response) - with open(filename, 'wb') as fileobj: + with open(path, 'wb') as fileobj: fileobj.write(response._content) - return filename + return path def download_url(self, **kwargs): download_url = self.instance_url() + '/download' diff --git a/solvebio/test/test_exports.py b/solvebio/test/test_exports.py index cdf0ec4c..a6ce0ca6 100644 --- a/solvebio/test/test_exports.py +++ b/solvebio/test/test_exports.py @@ -5,6 +5,7 @@ from .helper import SolveBioTestCase +import gzip from os import path, remove @@ -20,13 +21,14 @@ def setUp(self): genome_build='GRCh37', limit=10) def test_csv_exporter(self): - test_file = '/tmp/test_export.csv' + # CSV exports are compressed + test_file = '/tmp/test_export.csv.gz' reference_file = 'solvebio/test/data/test_export.csv' export = self.query.export(follow=True, format='csv') export.download(test_file) self.assertTrue(path.isfile(test_file)) self.assertEqual( - hashlib.sha1(open(test_file, 'rb').read()).hexdigest(), + hashlib.sha1(gzip.open(test_file, 'rb').read()).hexdigest(), hashlib.sha1(open(reference_file, 'rb').read()).hexdigest() ) remove(test_file)