Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for DatasetExports #154

Merged
merged 5 commits into from
Apr 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@
include_package_data=True,
install_requires=install_requires,
platforms='any',
extras_require={
'exporters': ['XlsxWriter>=0.8.0', 'pandas>=0.10.0'],
},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yay!

extras_require={},
entry_points={
'console_scripts': ['solvebio = solvebio.cli.main:main']
},
Expand Down
4 changes: 3 additions & 1 deletion solvebio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ def emit(self, record):
Depository, DepositoryVersion,
Upload, Manifest, User,
Dataset, DatasetField,
DatasetImport, DatasetCommit, DatasetTemplate
DatasetImport, DatasetExport,
DatasetCommit, DatasetTemplate
)


Expand Down Expand Up @@ -134,6 +135,7 @@ def login(**kwargs):
'Dataset',
'DatasetField',
'DatasetImport',
'DatasetExport',
'DatasetCommit',
'DatasetTemplate',
'Depository',
Expand Down
1 change: 1 addition & 0 deletions solvebio/cli/ipython.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def launch_ipython_shell(args): # pylint: disable=unused-argument
from solvebio import DatasetCommit # noqa
from solvebio import DatasetField # noqa
from solvebio import DatasetImport # noqa
from solvebio import DatasetExport # noqa
from solvebio import DatasetTemplate # noqa
from solvebio import Depository # noqa
from solvebio import DepositoryVersion # noqa
Expand Down
6 changes: 6 additions & 0 deletions solvebio/exporters.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# -*- coding: utf-8 -*-
#
# **DEPRECATED**
#
# These exporters have been deprecated in favor
# of the DatasetExport resource.
#
from __future__ import print_function
from __future__ import unicode_literals

Expand Down
21 changes: 18 additions & 3 deletions solvebio/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from .client import client
from .utils.printing import pretty_int
from .utils.tabulate import tabulate
from .exporters import exporters
from .errors import SolveError

import copy
Expand Down Expand Up @@ -629,8 +628,24 @@ def execute(self, offset=0, **query):
% self._response)
return _params, self._response

def export(self, exporter, *args, **kwargs):
return exporters.export(exporter, self, *args, **kwargs)
def export(self, format='json', follow=True, limit=None):
    """
    Create a DatasetExport from this query and return it.

    The export is created for this query's dataset using the
    parameters built by ``_build_query``, with the 'offset' and
    'ordering' keys removed.

    :param format: value passed through as the export format.
    :param follow: when true, call ``follow()`` on the new export
        before returning it.
    :param limit: value passed to ``_build_query``. When falsy, the
        query's own limit is used if that limit is finite.
    """
    # Imported here rather than at module level.
    from solvebio import DatasetExport

    # Only fall back to the query's limit when it is not infinite.
    if not limit and self._limit < float('inf'):
        limit = self._limit

    query_params = self._build_query(limit=limit)
    for unwanted_key in ('offset', 'ordering'):
        query_params.pop(unwanted_key, None)

    dataset_export = DatasetExport.create(
        dataset_id=self._dataset_id,
        format=format,
        params=query_params)

    if follow:
        dataset_export.follow()

    return dataset_export


class BatchQuery(object):
Expand Down
2 changes: 2 additions & 0 deletions solvebio/resource/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .dataset import Dataset
from .datasetfield import DatasetField
from .datasetimport import DatasetImport
from .datasetexport import DatasetExport
from .datasetcommit import DatasetCommit
from .datasettemplate import DatasetTemplate
from .upload import Upload
Expand All @@ -16,6 +17,7 @@
types = {
'Dataset': Dataset,
'DatasetImport': DatasetImport,
'DatasetExport': DatasetExport,
'DatasetCommit': DatasetCommit,
'DatasetTemplate': DatasetTemplate,
'DatasetField': DatasetField,
Expand Down
21 changes: 13 additions & 8 deletions solvebio/resource/apiresource.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,18 +154,23 @@ class DownloadableAPIResource(APIResource):

def download(self, path=None, **kwargs):
"""
Download the file to the specified directory
(or a temp. dir if nothing is specified).
Download the file to the specified directory or file path.
Downloads to a temporary directory if no path is specified.

Returns the absolute path to the file.
"""
download_url = self.download_url(**kwargs)
# Set default filename to the extracted name
filename = download_url.split('%3B%20filename%3D')[1]

if path:
path = os.path.dirname(os.path.expanduser(path))
path = os.path.expanduser(path)
# If the path is a dir, use the extracted filename
if os.path.isdir(path):
path = os.path.join(path, filename)
else:
path = tempfile.gettempdir()

filename = os.path.join(path, filename)
# Default to a file inside the system temporary directory
path = os.path.join(tempfile.gettempdir(), filename)

try:
response = requests.request(method='get', url=download_url)
Expand All @@ -175,10 +180,10 @@ def download(self, path=None, **kwargs):
if not (200 <= response.status_code < 400):
_handle_api_error(response)

with open(filename, 'wb') as fileobj:
with open(path, 'wb') as fileobj:
fileobj.write(response._content)

return filename
return path

def download_url(self, **kwargs):
download_url = self.instance_url() + '/download'
Expand Down
88 changes: 15 additions & 73 deletions solvebio/resource/dataset.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
from ..client import client
from ..help import open_help
from ..query import Query
from ..utils.humanize import naturalsize
from ..utils.md5sum import md5sum

from .solveobject import convert_to_solve_object
from .apiresource import CreateableAPIResource
from .apiresource import ListableAPIResource
from .apiresource import UpdateableAPIResource
from .apiresource import DeletableAPIResource
from .datasetfield import DatasetField

from ..exporters import DatasetExportFile
from .datasetexport import DatasetExport


class Dataset(CreateableAPIResource,
Expand Down Expand Up @@ -226,74 +223,19 @@ def import_file(self, path, **kwargs):
manifest=manifest.manifest,
**kwargs)

def export(self, path, genome_build=None, format='json',
show_progress=True, download=True):
if 'exports_url' not in self:
if 'id' not in self or not self['id']:
raise Exception(
'No Dataset ID was provided. '
'Please instantiate the Dataset '
'object with an ID or full_name.')
self['exports_url'] = self.instance_url() + '/exports'

export = client.post(self['exports_url'],
{'format': format,
'genome_build': genome_build})

print("Exporting dataset {0} to {1}"
.format(self['full_name'], path))

total_size = 0
manifest = export['manifest']

for i, manifest_file in enumerate(manifest['files']):
total_size += manifest_file['size']
export_file = DatasetExportFile(
url=manifest_file['url'],
path=path,
show_progress=show_progress)

if not download:
print('Downloading is off, skipping file {0} ({1})'
.format(export_file.file_name,
naturalsize(manifest_file['size'])))
continue

print('Downloading file {0}/{1}: {2} ({3})'
.format(i + 1, len(manifest['files']),
export_file.file_name,
naturalsize(manifest_file['size'])))
export_file.download()

# Validate the MD5 of the downloaded file.
# Handles S3's multipart MD5 calculation.
md5, blocks = md5sum(
multipart_threshold=manifest['multipart_threshold_bytes'],
multipart_chunksize=manifest['multipart_chunksize_bytes']
)

if md5 != manifest_file['md5']:
print("### Export failed MD5 verification!")
print("### -------------------------------")
print("### File: {0}".format(export_file.file_name))
print("### Expected MD5: {0}".format(manifest_file['md5']))
print("### Calculated MD5: {0}".format(md5))
if blocks and manifest_file['multipart_blocks'] != blocks:
print("### Multipart block size failed verification")
print("### Expected: {0} blocks"
.format(manifest_file['multipart_blocks']))
print("### Found: {0} blocks".format(blocks))
print("\n### Delete the following file and try again: {0}"
.format(export_file.file_name))
print("### If the problem persists, please email "
"support@solvebio.com")
return None

print("File {0} completed downloading and MD5 verification."
.format(export_file.file_name))

print('Number of files: {0}'.format(len(manifest['files'])))
print('Number of records: {0}'.format(export['documents_count']))
print('Total size: {0}'.format(naturalsize(total_size)))
def export(self, format='json', follow=True, **kwargs):
    """
    Create a DatasetExport for this dataset and return it.

    :param format: value passed through as the export format.
    :param follow: when true, call ``follow()`` on the new export
        before returning it.
    :param kwargs: extra keyword arguments forwarded unchanged to
        ``DatasetExport.create``.
    :raises Exception: if this object has no truthy 'id' entry.
    """
    # Guard clause: an export cannot be created without a dataset ID.
    has_id = 'id' in self and self['id']
    if not has_id:
        message = ('No Dataset ID was provided. '
                   'Please instantiate the Dataset '
                   'object with an ID or full_name.')
        raise Exception(message)

    dataset_export = DatasetExport.create(
        dataset_id=self['id'],
        format=format,
        **kwargs)

    if follow:
        dataset_export.follow()

    return dataset_export
56 changes: 56 additions & 0 deletions solvebio/resource/datasetexport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from .apiresource import CreateableAPIResource
from .apiresource import ListableAPIResource
from .apiresource import DeletableAPIResource
from .apiresource import DownloadableAPIResource

import time


class DatasetExport(CreateableAPIResource, ListableAPIResource,
                    DownloadableAPIResource, DeletableAPIResource):
    """
    DatasetExport represents an export task that takes
    a Dataset or filtered Dataset (Query) and exports
    the contents to a flat file (CSV, JSON, or XLSX).

    For interactive use, DatasetExport can be "followed" to watch
    the progression of the task.
    """
    # (field name, column title) pairs for this resource.
    LIST_FIELDS = (
        ('id', 'ID'),
        ('title', 'Title'),
        ('description', 'Description'),
        ('status', 'Status'),
        ('created_at', 'Created'),
    )

    def dataset(self):
        """Retrieve the Dataset referenced by this export's 'dataset' entry."""
        # Imported here rather than at module level.
        from .dataset import Dataset
        return Dataset.retrieve(self['dataset'])

    def follow(self):
        """
        Poll this export and print its progress until it stops.

        While the status is 'queued' or 'running', the export is
        refreshed every 3 seconds; status changes and (when running)
        the processed record count are printed. When polling ends, the
        final outcome is printed: a completion message for
        'completed', otherwise the status the export ended in.
        """
        print("Waiting for export (id = {0}) to start...".format(self.id))
        print("View your export status on MESH: "
              "https://my.solvebio.com/jobs/export/{0}"
              .format(self.id))

        # Last status reported to the user, used to detect transitions.
        export_status = self.status

        while self.status in ['queued', 'running']:
            if self.status != export_status:
                print("Export is now {0} (was {1})"
                      .format(self.status, export_status))
                export_status = self.status

            if self.status == 'running':
                print("Export '{0}' is {1}: {2}/{3} records exported"
                      .format(self.id,
                              self.status,
                              self.metadata['progress']['processed_records'],
                              self.documents_count))

            time.sleep(3)
            self.refresh()

        if self.status == 'completed':
            print("Export complete!")
        else:
            # Previously any other final status ended silently, so an
            # interactive user could not tell the export did not finish.
            print("Export is not complete (status: {0})"
                  .format(self.status))
34 changes: 6 additions & 28 deletions solvebio/test/test_exports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from .helper import SolveBioTestCase

import gzip
from os import path, remove
import json


class ExportsTests(SolveBioTestCase):
Expand All @@ -20,37 +20,15 @@ def setUp(self):
self.query = self.dataset.query(filters=filters, fields=['rgd_id'],
genome_build='GRCh37', limit=10)

# CSVExporter
def test_csv_exporter(self):
test_file = '/tmp/test_export.csv'
# CSV exports are compressed
test_file = '/tmp/test_export.csv.gz'
reference_file = 'solvebio/test/data/test_export.csv'
self.query.export('csv', filename=test_file)
export = self.query.export(follow=True, format='csv')
export.download(test_file)
self.assertTrue(path.isfile(test_file))
self.assertEqual(
hashlib.sha1(open(test_file, 'rb').read()).hexdigest(),
hashlib.sha1(gzip.open(test_file, 'rb').read()).hexdigest(),
hashlib.sha1(open(reference_file, 'rb').read()).hexdigest()
)
remove(test_file)

# XLSXExporter
def test_excel_exporter(self):
test_file = '/tmp/test_export.xlsx'
# reference_file = 'solvebio/test/data/test_export.xlsx'
self.query.export('excel', filename=test_file)
self.assertTrue(path.isfile(test_file))
remove(test_file)

# JSONExporter
def test_json_exporter(self):
test_file = '/tmp/test_export.json'
reference_file = 'solvebio/test/data/test_export.json'
self.query.export('json', filename=test_file)
self.assertTrue(path.isfile(test_file))
self.assertEqual(
hashlib.sha1(open(test_file, 'rb').read()).hexdigest(),
hashlib.sha1(open(reference_file, 'rb').read()).hexdigest()
)
with open(test_file, 'r') as f:
for row in f:
self.assertTrue(json.loads(row))
remove(test_file)
2 changes: 1 addition & 1 deletion solvebio/test/test_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class UploadTest(SolveBioTestCase):
('class_name', 'Upload'),
('description', None),
('md5', '587941d21d196eef3c17e7e12d3cc687'),
('size', "590"),
('size', 590),
]

def test_upload_url(self):
Expand Down