-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix .bam file handling for visualizations #3484
Changes from 41 commits
0a7ed9c
3c4068e
43db45e
6e8fc3b
28f2c52
bff4862
7a76172
08ae2c6
bedc1f7
312bc8d
529c65a
4999476
f680b4c
82dca41
48e0137
38b6afe
2819048
482cca7
4d18e92
fe17dbc
8dbfa6f
f92a055
5e0f73e
26cbda8
f3b192e
515575f
f374ffc
7ca8a6d
acff06d
06729db
f85df5e
21fb8fd
b1c9706
12df5f5
ba0dfe1
77ebce6
4c31562
4ae5da8
c77d4f4
32e6ca6
6af7517
084449f
023de8d
f38b064
b9b4b02
a762c93
60c1ee8
41b0f6d
74a3821
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import unicode_literals | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Add ``auxiliary_file_task_group_id`` to ``AnalysisStatus``.

    The new column is a nullable, non-editable UUID field.
    # NOTE(review): presumably it stores the Celery task-group id used to
    # track auxiliary-file generation tasks — confirm against AnalysisStatus
    # usage elsewhere in analysis_manager.
    """

    # Must apply after the migration that added
    # galaxy_workflow_task_group_id to AnalysisStatus.
    dependencies = [
        ('analysis_manager', '0008_analysisstatus_galaxy_workflow_task_group_id'),
    ]

    operations = [
        migrations.AddField(
            model_name='analysisstatus',
            name='auxiliary_file_task_group_id',
            # null=True: existing rows get no value; editable=False keeps the
            # field out of model forms/admin.
            field=models.UUIDField(null=True, editable=False),
        ),
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,22 @@ | ||
from datetime import date | ||
import logging | ||
import os | ||
import time | ||
|
||
from django.conf import settings | ||
from django.contrib.auth.models import User | ||
from django.db import transaction | ||
|
||
import botocore | ||
import celery | ||
from celery.task import task | ||
import pysam | ||
import tempfile | ||
|
||
from core.models import DataSet, ExtendedGroup, FileStoreItem | ||
from file_store.models import FileExtension, generate_file_source_translator | ||
from file_store.tasks import FileImportTask | ||
from file_store.tasks import FileImportTask, download_s3_object, \ | ||
copy_file_object | ||
|
||
from .isa_tab_parser import IsaTabParser | ||
from .models import Investigation, Node, initialize_attribute_order | ||
|
@@ -273,22 +277,26 @@ def parse_isatab(username, public, path, identity_id=None, | |
return data_set_uuid | ||
|
||
|
||
@task() | ||
def generate_auxiliary_file(auxiliary_node, parent_node_file_store_item): | ||
@task(soft_time_limit=3600) | ||
def generate_auxiliary_file(parent_node_uuid): | ||
"""Task that will generate an auxiliary file for visualization purposes | ||
with specific file generation tasks going on for different FileTypes | ||
flagged as: `used_for_visualization`. | ||
:param auxiliary_node: a Node instance | ||
:type auxiliary_node: Node | ||
:param datafile_path: relative path to datafile used to generate aux file | ||
:type datafile_path: String | ||
:param parent_node_file_store_item: FileStoreItem associated with the | ||
parent Node | ||
:type parent_node_file_store_item: FileStoreItem | ||
:param parent_node_uuid: UUID of the parent Node | ||
:type parent_node_uuid: str | ||
""" | ||
generate_auxiliary_file.update_state(state=celery.states.STARTED) | ||
parent_node = Node.objects.get(uuid=parent_node_uuid) | ||
datafile = parent_node.file_item.datafile | ||
auxiliary_file_store_item = FileStoreItem.objects.create() | ||
auxiliary_node = parent_node.create_and_associate_auxiliary_node( | ||
auxiliary_file_store_item | ||
) | ||
try: | ||
datafile_path = parent_node_file_store_item.datafile.path | ||
if not settings.REFINERY_S3_USER_DATA: | ||
datafile_path = datafile.path | ||
else: | ||
datafile_path = datafile.name | ||
except (NotImplementedError, ValueError): | ||
datafile_path = None | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This try block is huge. I think the only function that can raise exceptions is |
||
|
@@ -298,13 +306,15 @@ def generate_auxiliary_file(auxiliary_node, parent_node_file_store_item): | |
# Here we are checking for the FileExtension of the ParentNode's | ||
# FileStoreItem because we will create auxiliary files based on what | ||
# said value is | ||
if parent_node_file_store_item.get_extension().lower() == 'bam': | ||
if parent_node.file_item.get_extension().lower() == 'bam': | ||
hackdna marked this conversation as resolved.
Show resolved
Hide resolved
|
||
generate_bam_index(auxiliary_node.file_item.uuid, datafile_path) | ||
|
||
generate_auxiliary_file.update_state(state=celery.states.SUCCESS) | ||
|
||
logger.debug("Auxiliary file for %s generated in %s " | ||
"seconds." % (datafile_path, time.time() - start_time)) | ||
return auxiliary_file_store_item.uuid | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should the task fail here also? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
|
||
except Exception as e: | ||
logger.error( | ||
"Something went wrong while trying to generate the auxiliary file " | ||
|
@@ -332,13 +342,28 @@ def generate_bam_index(auxiliary_file_store_item_uuid, datafile_path): | |
# fail if we can't get what we want. | ||
bam_index_file_extension = FileExtension.objects.get(name="bai").name | ||
auxiliary_file_store_item = FileStoreItem.objects.get( | ||
uuid=auxiliary_file_store_item_uuid) | ||
uuid=auxiliary_file_store_item_uuid | ||
) | ||
|
||
# Leverage pysam library to generate bam index file | ||
# FIXME: This should be refactored once we don't have a need for | ||
# Standalone IGV because this is creating a bam_index file in the same | ||
# directory as it's bam file | ||
pysam.index(bytes(datafile_path)) | ||
if settings.REFINERY_S3_USER_DATA: | ||
key = datafile_path | ||
bucket = settings.MEDIA_BUCKET | ||
temp_file = os.path.join(tempfile.gettempdir(), key) | ||
os.makedirs(os.path.abspath(os.path.join(temp_file, os.pardir))) | ||
with open(temp_file, 'wb') as destination: | ||
download_s3_object(bucket, key, destination) | ||
pysam.index(bytes(temp_file)) | ||
datafile_path = temp_file | ||
os.remove(temp_file) | ||
else: | ||
temp_file = os.path.join(tempfile.gettempdir(), datafile_path) | ||
os.makedirs(os.path.abspath(os.path.join(temp_file, os.pardir))) | ||
with open(temp_file, 'wb') as destination, \ | ||
open(datafile_path, 'rb') as source: | ||
copy_file_object(source, destination) | ||
pysam.index(bytes(temp_file)) | ||
datafile_path = temp_file | ||
os.remove(temp_file) | ||
|
||
# Map source field of FileStoreItem to path of newly created bam index file | ||
auxiliary_file_store_item.source = "{}.{}".format( | ||
|
@@ -367,7 +392,8 @@ def post_process_file_import(**kwargs): | |
node.update_solr_index() | ||
logger.info("Updated Solr index with file import state for Node '%s'", | ||
node.uuid) | ||
if kwargs['state'] == celery.states.SUCCESS: | ||
if kwargs['state'] == celery.states.SUCCESS and \ | ||
node.is_auxiliary_node_needed(): | ||
node.run_generate_auxiliary_node_task() | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is default 60 seconds not sufficient?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since this is a file-based operation (we have to both move a file and do an operation on it), the timeout should match the FileImportTask's timeout
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The FileImportTask timeout was chosen to accommodate downloads from sites on public Internet which can take a really long time (e.g., from ftp://ftp.sra.ebi.ac.uk).
Here we are dealing with transfers to/from S3 within the AWS network. It would be great to benchmark how long this operation takes for a typical BAM file (download from S3 + indexing + upload to S3) and set the timeout accordingly (perhaps with a 30% margin?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok i'll do that now