# Evaluating Submissions to the SMC-RNA DREAM Challenge on the Seven Bridges CGC

This notebook walks through using the Seven Bridges API to go from a submission's Task ID to having the submitted application and its reference files cached in a new project, and then rerunning that application on evaluation data. The evaluation data and the new project used here are mock stand-ins that can be replaced for the actual evaluation.

In [32]:
from __future__ import print_function
from os import environ
from datetime import datetime
import sevenbridges as sbg
from dream_helpers import *
import pprint 
# Shared pretty-printer used by the later cells to dump the task input dicts
pp = pprint.PrettyPrinter(indent=4)

# Create API object
# The endpoint URL and auth token are read from the API_URL and AUTH_TOKEN
# environment variables; a missing variable raises KeyError here.
api = sbg.Api(config=sbg.Config(url=environ['API_URL'], token=environ['AUTH_TOKEN']))

In [33]:
# ID of the submitted task to evaluate. The commented-out IDs are alternative
# submissions kept for quick switching; only one task_id should be active.
# task_id="6d2becba-c266-4d84-9de5-92b598de1042" # RSEM with TUMOR_FASTQ in labels (not IDs)
# task_id = "21d6c8f7-d75e-4d57-b1ec-884218becef3"
# task_id = "3faacf31-b840-465c-9ba0-df85dce4ef6d"
task_id = "975423f6-ffda-4eae-9935-529d204c496d"

# Fetch the task object for the chosen ID; later cells pass it around as
# `validation_task`. The project and app objects can be fetched the same way
# (left commented out because they are re-fetched where they are needed).
validation_task = api.tasks.get(task_id) # task_object
# validation_project = api.projects.get(validation_task.project) # project_object
# validation_app = api.apps.get(validation_task.app) # app_object

def check_task_status(task_object):
    """Report on stdout whether ``task_object`` has finished.

    Prints a confirmation when the task's ``status`` is ``"COMPLETED"``;
    for any other status prints a warning that includes the current status.
    Nothing is returned and nothing is raised for unfinished tasks.
    """
    status = task_object.status
    if status != "COMPLETED":
        print("WARNING: task not completed. Current status: {}".format(status))
        return
    print("\nTask status: Completed.")

def replace_file_dicts_with_objects(api, project, task_inputs):
    """Replace file-describing dicts in ``task_inputs`` with file objects.

    An input value counts as a file description when it is a dict whose
    ``"class"`` entry equals ``"File"``. Each such value is replaced by the
    result of ``get_file_by_name(api, project, <its "name" entry>)``.
    Dict values without a ``"class"`` key are left untouched instead of
    raising ``KeyError``.

    Args:
        api: API object, passed through to ``get_file_by_name``.
        project: Project to look the file up in, passed through as well.
        task_inputs: Dict of task inputs; it is modified in place.

    Returns:
        The same ``task_inputs`` dict that was passed in.
    """
    # Iterate over a snapshot so values can be reassigned while looping;
    # ``items()`` (unlike ``iteritems()``) exists on both Python 2 and 3.
    for key, value in list(task_inputs.items()):
        if isinstance(value, dict) and value.get("class") == "File":
            task_inputs[key] = get_file_by_name(api, project, value["name"])
    return task_inputs

def empty_tumor_ports_by_id(task_inputs):
    """Blank out every input whose key (port ID) contains ``TUMOR_FASTQ``.

    The value of each matching key is overwritten with the placeholder
    string ``"ID indicates TUMOR_FASTQ"``. The match is case-sensitive.

    Args:
        task_inputs: Dict of task inputs keyed by port ID; modified in place.

    Returns:
        The same ``task_inputs`` dict that was passed in.
    """
    # Snapshot the keys first: this avoids assigning into the dict while it
    # is being iterated and works on both Python 2 and 3 (no ``iteritems``).
    for key in list(task_inputs):
        if "TUMOR_FASTQ" in key:
            task_inputs[key] = "ID indicates TUMOR_FASTQ"
    return task_inputs

def empty_tumor_ports_by_label(app_object, task_inputs):
    """Blank out inputs whose app port *label* mentions TUMOR_FASTQ.

    Walks the port descriptions in ``app_object.raw['inputs']``. For every
    port that has a ``label`` containing "TUMOR_FASTQ" (compared
    case-insensitively), the text after the last ``#`` of the port's ``id``
    is used as the key, and that key in ``task_inputs`` is set to the
    placeholder ``"Label indicates TUMOR_FASTQ"``. The key is written even
    if it was not present in ``task_inputs`` before.

    Args:
        app_object: App whose ``raw`` description lists the input ports.
        task_inputs: Dict of task inputs; modified in place.

    Returns:
        The same ``task_inputs`` dict that was passed in.
    """
    needle = "tumor_fastq"
    for port_spec in app_object.raw['inputs']:
        # Ports without a label cannot match.
        if "label" not in port_spec:
            continue
        if needle not in port_spec['label'].lower():
            continue
        port_name = port_spec['id'].rsplit("#", 1)[-1]
        task_inputs[port_name] = "Label indicates TUMOR_FASTQ"
    return task_inputs

def get_task_input_object(api, task_id):
    """Build a reusable inputs dict from a previously submitted task.

    Steps:
      1. Fetch the task, its app and its project through ``api``.
      2. Print whether the task completed (``check_task_status``).
      3. Copy the task's inputs, dropping entries whose value is ``None``.
      4. Replace file-describing dicts with file objects.
      5. Overwrite every TUMOR_FASTQ port (matched by port ID or by port
         label) with a placeholder string so the submitter's own tumor
         reads are not carried over.
      6. Pretty-print and return the resulting dict.

    Args:
        api: Seven Bridges API object.
        task_id: ID of the task to read the inputs from.

    Returns:
        Dict mapping input port IDs (as ``str``) to their values.
    """
    task_object = api.tasks.get(task_id)

    # Get required objects
    app_object = api.apps.get(task_object.app)
    project = api.projects.get(task_object.project)

    # Check if task is successfully completed
    check_task_status(task_object)

    # Remove keys where value is None/NoneType.
    # Test against None explicitly: a plain truthiness test would also drop
    # legitimate settings such as False, 0 or "" and the rerun would then
    # silently fall back to the app defaults for them.
    task_inputs = {
        str(key): value
        for key, value in task_object.inputs.items()
        if value is not None
    }

    # Replace the values that represent files (currently dicts) with File object
    task_inputs = replace_file_dicts_with_objects(api, project, task_inputs)

    # Empty values where TUMOR_FASTQ is in ID or label
    task_inputs = empty_tumor_ports_by_id(task_inputs)
    task_inputs = empty_tumor_ports_by_label(app_object, task_inputs)

    print("Task_inputs: ")
    pp.pprint(task_inputs)
    return task_inputs

# Build the inputs dict for the selected submission (TUMOR_FASTQ ports hold placeholders)
input_object = get_task_input_object(api, task_id)

Task_inputs: 
{   'TUMOR_FASTQ_1': 'Label indicates TUMOR_FASTQ',
    'TUMOR_FASTQ_2': 'Label indicates TUMOR_FASTQ',
    'bias': True,
    'bootstrap': 30,
    'fasta': <File: id=57b5e008e4b0192c34a4ee55>,
    'index_filename': u'kallisto_k31_Homo_sapiens.GRCh37.75.cdna.all.index',
    'kmer': 31,
    'output_prefix': u'sim11',
    'threads': 5}


In [3]:
"""
NEXT STEPS
- filter through task_inputs, check if file, then copy to new project
- copy application to new project [done]
- replace input object files with new file objects [done]
- rerun a task on a new pair of dummy files
- wrap it all up as a command-line runnable
"""
# Define Evaluation Project
# (uncomment the next line to list the IDs of the projects visible to this token)
# print(*[p.id for p in get_projects_list(api)], sep="\n")
eval_project = "gauravdream/dream-eval" # where you'll do the evaluation of the app

# Copy files
#     - note that identical files in multiple projects have UNIQUE IDs
#     - cannot check to see if the file exists already by ID
#     - so we create a new filename that should be unique to the submitter and check on that
def copy_to_eval_project(api, task_object, evaluation_project, task_inputs):
    """Make sure every File input of a task exists in the evaluation project.

    For each value in ``task_inputs`` whose class is named ``File``, a new
    filename ``<submitter username>_<original name>`` is built (e.g.
    ``gauravdream_rsem_index.tar.gz``). If ``get_file_by_name`` finds a
    file with that name in ``evaluation_project`` it is reused; otherwise
    the file is copied there under the new name. Non-File values are passed
    through unchanged.

    NOTE(review): ``get_file_by_name`` is assumed to return a falsy value
    (e.g. ``None``) when no file with that name exists -- confirm against
    ``dream_helpers``.

    Args:
        api: Seven Bridges API object.
        task_object: Submitted task; ``created_by`` supplies the username.
        evaluation_project: Project the files are looked up in / copied to.
        task_inputs: Inputs dict of the submitted task; it is not modified.

    Returns:
        A ``(new_files, new_task_inputs)`` tuple. ``new_files`` is the list
        of files copied by this call, or ``None`` when nothing was copied.
        ``new_task_inputs`` is a shallow copy of ``task_inputs`` whose File
        values point at the files in the evaluation project.
    """
    # Initialize new files and new_task_inputs
    new_files = []
    new_task_inputs = task_inputs.copy()  # let's not change the original object

    # ``items()`` (unlike ``iteritems()``) exists on both Python 2 and 3.
    for key, obj in task_inputs.items():
        # Only File objects need to be moved into the evaluation project.
        if obj.__class__.__name__ != "File":
            continue

        submitters_username = task_object.created_by
        new_filename = "_".join([submitters_username, obj.name])  # e.g gauravdream_rsem_index.tar.gz

        # Check if the file with that filename already exists in evaluation project (check_file)
        # - if it does,     replace the value in the input object with that file
        # - if it does not, copy to new project and set value in input object
        check_file = get_file_by_name(api, project=evaluation_project, filename=new_filename)
        if check_file:
            # Only touch check_file.name once we know a file was found;
            # doing so before the check crashed whenever a copy was needed.
            print(check_file.name)
            print("\nWARNING: '{}' already in '{}' project.".format(new_filename, evaluation_project))
            new_task_inputs[key] = check_file  # replace file object with object in new project
        else:
            new_file = obj.copy(project=evaluation_project, name=new_filename)
            new_task_inputs[key] = new_file  # replace old file object with new one
            print("\n'{}' copied to '{}' project \n\twith new filename: '{}'".format(obj.name, evaluation_project, new_filename))
            print("New file ID: {}".format(new_file.id))
            new_files.append(new_file)

    # If there are new_files, return them and the new task inputs object, else warn and return old inputs object
    if new_files:
        return new_files, new_task_inputs
    print("\nNo files copied.")
    return None, new_task_inputs

# Copy (or reuse) the submission's File inputs in the evaluation project
new_files, new_input_object = copy_to_eval_project(api, validation_task, eval_project, input_object)

# Print: new filenames
# new_files is None when nothing had to be copied, so the listing is skipped then
if new_files: 
    print("\nNew filenames in {}:".format(eval_project))
    print(*[f.name for f in new_files], sep="\n")
pp.pprint(new_input_object)

gauravdream_rsem_index.tar.gz


No files copied.
{   'f': u'"1,6"',
    'index': <File: id=57a9f513e4b0a2cad67e8581>,
    'input': 'Label indicates TUMOR_FASTQ',
    'input_1': 'Label indicates TUMOR_FASTQ',
    'output_filename': u'rererunning_sim8_rsem_isoform_quant.tsv',
    'pairedend': True,
    'strandspecific': True,
    'threads': 8}


In [4]:
"""
Copy the tool to your project
- we will take the app and rename it
- this will cache the app and prevent the user from potentially making changes after submission
- to do this, we grab the raw CWL, modify the label to rename it, and set a new id with that label
- the label is the submitter's username, the original label, and then the version/revision number (sep="_")
- this will make sure that each submission is uniquely versioned
- we will also do error checking for duplicate apps
"""

def copy_app_to_evaluation_project(api, task_object, evaluation_project):
    """Install a renamed copy of the submitted app in the evaluation project.

    The app's raw description is relabelled to
    ``<submitter username>_<original label>_<revision>`` and installed
    under the id ``<evaluation_project>/<new label>``. If the install call
    fails, the app is assumed to be present already and is looked up by
    that label with ``get_app_by_name``.

    Note that the ``raw`` dict of the fetched app object is relabelled in
    place.

    Args:
        api: Seven Bridges API object.
        task_object: Submitted task; supplies ``app`` and ``created_by``.
        evaluation_project: Project the app is installed into.

    Returns:
        The newly installed app, or the result of ``get_app_by_name`` when
        the install call raised.
    """
    # Get submission info
    submission_app = api.apps.get(task_object.app)  # get app_object using task_object
    submission_username = task_object.created_by

    # Grab RAW CWL & modify label and id
    evaluation_app = submission_app.raw
    evaluation_app['label'] = "_".join([submission_username, evaluation_app['label'], str(submission_app.revision)])
    evaluation_app_id = "/".join([evaluation_project, evaluation_app['label']])

    # Try to install the new app -- if it fails, return the app object in the new project
    try:
        installed_app = api.apps.install_app(raw=evaluation_app, id=evaluation_app_id)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not swallowed. The underlying error is printed too,
        # because a failure other than "already exists" (auth, network, ...)
        # would otherwise be reported misleadingly.
        print("'{}' app already exists in the '{}' project. Returning app in evaluation project.".format(evaluation_app['label'], evaluation_project))
        print("(install error was: {})".format(err))
        return get_app_by_name(api, project=evaluation_project, app_name=evaluation_app['label'])

    print("'{}' app from '{}' installed in '{}' project.".format(evaluation_app['label'], submission_username, evaluation_project))
    return installed_app

# Install (or reuse) the renamed copy of the submitted app in the evaluation project
new_app = copy_app_to_evaluation_project(api, validation_task, evaluation_project=eval_project)

'gauravdream_smcIsoform-RSEM-Workflow_1' app already exists in the 'gauravdream/dream-eval' project. Returning app in evaluation project.


In [5]:
"""
Grab evaluation fastqs here by metadata 
- split_fastqs_tuple returns a list of tuples (contains two fastq file objects)
- we grab the first element here (should only return list of size=1 for unique sample_id)
"""
# Metadata used to select the evaluation fastqs inside the evaluation project
eval_fastq_metadata = {"sample_id":"evalabc123"} # just a dummy sample_id
# Take the first pair returned for that sample.
# NOTE(review): the [0] presumably fails with IndexError if no fastqs match
# the metadata -- confirm what split_fastqs_tuple returns for empty input.
eval_fastqs = split_fastqs_tuple(fastqs=get_files_by_metadata(api, eval_project, eval_fastq_metadata))[0]
print(*[fq.name for fq in eval_fastqs], sep="\n")

Good news, everyone! The tuples are paired nicely (by sample id).
evaluation_abc123_1.fq.gz
evaluation_abc123_2.fq.gz


In [6]:
# Show the inputs before the TUMOR_FASTQ placeholders are filled in
pp.pprint(new_input_object)
"""
Insert evaluation fastqs in input_object
"""

def insert_evaluation_fastqs_into_object(evaluation_fastqs, task_inputs):
    """Fill the TUMOR_FASTQ placeholders with the evaluation fastq files.

    A placeholder is any input whose value is a string containing
    ``"TUMOR_FASTQ"``. Placeholder keys are taken in sorted order and paired
    with ``evaluation_fastqs`` in the order given, so the first fastq goes
    to the first port name (e.g. ``input`` / ``TUMOR_FASTQ_1``) and the
    second to the next (``input_1`` / ``TUMOR_FASTQ_2``). The pairing
    therefore no longer depends on dict iteration order.

    If the number of fastqs and placeholders differ, a warning is printed
    and as many pairs as possible are filled: surplus placeholders stay in
    place and surplus fastqs are ignored. Previously a surplus fastq made
    the loop spin forever and a surplus placeholder raised ``IndexError``.

    Args:
        evaluation_fastqs: Iterable of fastq file objects, read 1 first.
        task_inputs: Inputs dict with placeholders; it is not modified.

    Returns:
        A shallow copy of ``task_inputs`` with the placeholders replaced.
    """
    new_task_inputs = task_inputs.copy()
    fastqs = list(evaluation_fastqs)

    # Collect the placeholder ports up front, in a deterministic order.
    placeholder_keys = sorted(
        key for key, val in new_task_inputs.items()
        if isinstance(val, str) and "TUMOR_FASTQ" in val
    )

    if len(placeholder_keys) != len(fastqs):
        print("WARNING: {} evaluation fastq(s) supplied for {} TUMOR_FASTQ port(s).".format(
            len(fastqs), len(placeholder_keys)))

    # zip stops at the shorter sequence, so this always terminates.
    for key, fastq in zip(placeholder_keys, fastqs):
        new_task_inputs[key] = fastq
    return new_task_inputs

# Swap the TUMOR_FASTQ placeholders for the evaluation fastqs and show the result
new_input_object = insert_evaluation_fastqs_into_object(eval_fastqs, new_input_object)
pp.pprint(new_input_object)

{   'f': u'"1,6"',
    'index': <File: id=57b51294e4b0192c34a4ba63>,
    'input': 'Label indicates TUMOR_FASTQ',
    'input_1': 'Label indicates TUMOR_FASTQ',
    'output_filename': u'rererunning_sim8_rsem_isoform_quant.tsv',
    'pairedend': True,
    'strandspecific': True,
    'threads': 8}
{   'f': u'"1,6"',
    'index': <File: id=57b51294e4b0192c34a4ba63>,
    'input': <File: id=57b48e55e4b0192c34a4b993>,
    'input_1': <File: id=57b48e55e4b0192c34a4b992>,
    'output_filename': u'rererunning_sim8_rsem_isoform_quant.tsv',
    'pairedend': True,
    'strandspecific': True,
    'threads': 8}


In [7]:
"""
Final sanity check
- print final input object names to check filenames
"""
# Display-only copy of the inputs in which File objects are replaced by their
# names, so the filenames can be checked by eye before submitting.
names_dict = new_input_object.copy()
# Iterate the original dict while writing into the copy; ``items()`` (unlike
# ``iteritems()``) exists on both Python 2 and 3.
for k, v in new_input_object.items():
    if v.__class__.__name__ == "File":
        names_dict[k] = v.name
pp.pprint(names_dict)

{   'f': u'"1,6"',
    'index': u'gauravdream_rsem_index.tar.gz',
    'input': u'evaluation_abc123_1.fq.gz',
    'input_1': u'evaluation_abc123_2.fq.gz',
    'output_filename': u'rererunning_sim8_rsem_isoform_quant.tsv',
    'pairedend': True,
    'strandspecific': True,
    'threads': 8}


In [8]:
# Create individualized task names with sample ID and current time
# (the sample ID is read from the metadata of the first evaluation fastq)
sample_id = eval_fastqs[0].metadata['sample_id']
current_time = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
eval_task_name = "Evaluation_{}_{} - {}".format(new_app.name, sample_id, current_time)

# Create the task
# Set debug = True to skip the API call and only print what would be submitted.
debug = False
if debug:
    # Nothing was submitted, so do not claim that a task was created.
    print("\nDEBUG: task NOT created: {}".format(eval_task_name))
else:
    new_task = api.tasks.create(name=eval_task_name,
                                project=eval_project,
                                app=new_app,
                                inputs=new_input_object,
                                run=True)  # IMPORTANT! set run=True if you want to run, not just draft the tasks
    print("\nTask created: {}".format(eval_task_name))
print("\nInput files: {}, {}".format(eval_fastqs[0].name, eval_fastqs[1].name))
print("\nInput object: ")
pp.pprint(new_input_object)


Task created: Evaluation_gauravdream_smcIsoform-RSEM-Workflow_1_evalabc123 - 08-17-2016 22:57:52

Input files: evaluation_abc123_1.fq.gz, evaluation_abc123_2.fq.gz

Input object: 
{   'f': u'"1,6"',
    'index': <File: id=57b51294e4b0192c34a4ba63>,
    'input': <File: id=57b48e55e4b0192c34a4b993>,
    'input_1': <File: id=57b48e55e4b0192c34a4b992>,
    'output_filename': u'rererunning_sim8_rsem_isoform_quant.tsv',
    'pairedend': True,
    'strandspecific': True,
    'threads': 8}
