# Testing & Evaluating Kallisto with SMC-RNA DREAM Challenge Data

Let's see how Kallisto performs on DREAM Challenge training data.

In [8]:
from __future__ import print_function
from os import environ
from datetime import datetime
import time
import sevenbridges as sbg
from dream_helpers import *
import pprint 
pp = pprint.PrettyPrinter(indent=4)

# Build the API client from credentials held in the environment.
# Both API_URL and AUTH_TOKEN must be set; a missing one raises KeyError here.
api_config = sbg.Config(url=environ['API_URL'], token=environ['AUTH_TOKEN'])
api = sbg.Api(config=api_config)

In [2]:
# Fetch the projects reachable with these credentials and list their IDs, one per line
projects_list = get_projects_list(api) # helper defined in dream_helpers.py
print("List of project IDs: ")
project_ids = get_ids(projects_list)
print(*project_ids, sep="\n")

List of project IDs: 
gauravdream/dream-demo
gauravdream/ccle
gauravdream/dream-eval
gauravCGC/kallisto-dream
gauravdream/dream


In [3]:
# Set project: the ID of the project that every later cell queries and submits tasks to
# (one of the IDs printed by the project listing above)
project = "gauravCGC/kallisto-dream"

In [4]:
# Print the name of every app found in the selected project, one per line
project_apps = get_apps_in_project(api, project)
print(*(app.name for app in project_apps), sep="\n")

kallisto-index-quant-dream
kallisto-index-quant
kallisto-quant-dream
clean-kallisto-abundances
DREAM Isoform Quantification Evaluation Workflow
kallisto-quant-batchable
batch-kallisto-quant
kallisto-index
kallisto-h5dump
kallisto-quant


In [5]:
# Look up the kallisto-quant-dream app by its exact name, then show its input ports
# so we know which port IDs to use as keys when building task inputs
KALLISTO_APP_NAME = "kallisto-quant-dream"
kallisto = get_app_by_name(api, project, app_name=KALLISTO_APP_NAME)
print_app_inputs(kallisto)

Input Ports (labels, IDs): 
(u'index', u'#index')
(u'TUMOR_FASTQ_2', u'#TUMOR_FASTQ_2')
(u'TUMOR_FASTQ_1', u'#TUMOR_FASTQ_1')
(u'Bias', u'#bias')
(u'Bootstrap Samples', u'#bootstrap')
(u'Output Filename Prefix', u'#output_prefix')


In [6]:
# Fetch every fastq in the project and let split_fastqs_tuple (dream_helpers.py)
# arrange them into (read 1, read 2) pairs - split, sort, check parity, then zip
all_fastqs = get_all_fastqs(api, project)
fastqs_tuple = split_fastqs_tuple(fastqs=all_fastqs)
print("Fastq files returned: ")
for pair in fastqs_tuple:
    print("({}, {})".format(pair[0].name, pair[1].name))

Fastq files returned: 
(sim11_mergeSort_1.fq.gz, sim11_mergeSort_2.fq.gz)
(sim13_mergeSort_1.fq.gz, sim13_mergeSort_2.fq.gz)
(sim14_mergeSort_1.fq.gz, sim14_mergeSort_2.fq.gz)
(sim15_mergeSort_1.fq.gz, sim15_mergeSort_2.fq.gz)
(sim16_mergeSort_1.fq.gz, sim16_mergeSort_2.fq.gz)
(sim17_mergeSort_1.fq.gz, sim17_mergeSort_2.fq.gz)
(sim19_mergeSort_1.fq.gz, sim19_mergeSort_2.fq.gz)
(sim1_mergeSort_1.fq.gz, sim1_mergeSort_2.fq.gz)
(sim21_mergeSort_1.fq.gz, sim21_mergeSort_2.fq.gz)
(sim2_mergeSort_1.fq.gz, sim2_mergeSort_2.fq.gz)
(sim3_mergeSort_1.fq.gz, sim3_mergeSort_2.fq.gz)
(sim4_mergeSort_1.fq.gz, sim4_mergeSort_2.fq.gz)
(sim5_mergeSort_1.fq.gz, sim5_mergeSort_2.fq.gz)
(sim7_mergeSort_1.fq.gz, sim7_mergeSort_2.fq.gz)
(sim8_mergeSort_1.fq.gz, sim8_mergeSort_2.fq.gz)


In [7]:
# Grab the kallisto index: take the first project file matching the filename filter
index_matches = get_files_by_filename_filter(api, project, "kallisto.GRCh37.75")
kallisto_index = index_matches[0] # IndexError here means no file matched the filter
print("Index file: {}".format(kallisto_index.name))

Index file: kallisto.GRCh37.75.cdna.all.index


In [14]:
# The Main Event: submit one kallisto-quant-dream task per fastq pair

debug = False # True = dry run (nothing is sent to the API); False = create the tasks
run_opt = True # forwarded as run=...: True runs each task, False only drafts it
task_objects = [] # task objects returned by the API, kept so later cells can poll them

for fq in fastqs_tuple:

    # Create individualized task names with sample ID and current time
    sample_id = fq[0].metadata['sample_id']
    current_time = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    TASK_NAME = "FinalAnalysis_{}_{} - {}".format(kallisto.name, sample_id, current_time)
    output_prefix = sample_id + "_kallisto-quant_GRCh37"

    # Create the input object, keyed by the app's input port IDs (as printed by print_app_inputs)
    #     - index is the same for all tasks
    #     - each task gets one pair of fastq files
    #     - set custom output filename prefix
    #     - set bias to True
    #     - request 30 bootstrap samples
    INPUTS = {
        "index": kallisto_index,
        "TUMOR_FASTQ_1": fq[0],
        "TUMOR_FASTQ_2": fq[1],
        "output_prefix": output_prefix,
        "bias": True,
        "bootstrap": 30
        }

    # Create the task (skipped entirely in debug mode)
    if debug:
        print("Task NOT created (debug mode): {}".format(TASK_NAME))
    else:
        new_task = api.tasks.create(name=TASK_NAME,
                         project=project,
                         app=kallisto,
                         inputs=INPUTS,
                         run=run_opt) # IMPORTANT! run=True runs the task, run=False only drafts it
        task_objects.append(new_task) # add this task to your list of tasks
        print("Task created: {}".format(TASK_NAME))
    print("Input files: {}, {}".format(fq[0].name, fq[1].name))
    print("Output file prefix(es): {}".format(output_prefix))
    print("\n")

# Report what was actually submitted, not how many fastq pairs were looped over
print("Number of tasks {}: {}".format("run" if run_opt else "drafted", len(task_objects)))

Task created: FinalAnalysis_kallisto-quant-dream_sim11 - 08-19-2016 13:16:09
Input files: sim11_mergeSort_1.fq.gz, sim11_mergeSort_2.fq.gz
Output file prefix(es): sim11_kallisto-quant_GRCh37


Task created: FinalAnalysis_kallisto-quant-dream_sim13 - 08-19-2016 13:16:11
Input files: sim13_mergeSort_1.fq.gz, sim13_mergeSort_2.fq.gz
Output file prefix(es): sim13_kallisto-quant_GRCh37


Task created: FinalAnalysis_kallisto-quant-dream_sim14 - 08-19-2016 13:16:13
Input files: sim14_mergeSort_1.fq.gz, sim14_mergeSort_2.fq.gz
Output file prefix(es): sim14_kallisto-quant_GRCh37


Task created: FinalAnalysis_kallisto-quant-dream_sim15 - 08-19-2016 13:16:15
Input files: sim15_mergeSort_1.fq.gz, sim15_mergeSort_2.fq.gz
Output file prefix(es): sim15_kallisto-quant_GRCh37


Task created: FinalAnalysis_kallisto-quant-dream_sim16 - 08-19-2016 13:16:16
Input files: sim16_mergeSort_1.fq.gz, sim16_mergeSort_2.fq.gz
Output file prefix(es): sim16_kallisto-quant_GRCh37


Task created: FinalAnalysis_kallist

In [18]:
# Grab the kallisto tasks to evaluate: the ones submitted above if we still hold them,
# otherwise look them up by name.
POLL_SECONDS = 30 # pause between two status checks
ACTIVE_STATUSES = ("QUEUED", "RUNNING") # a task in one of these states has not finished yet

if task_objects:
    print(*[(k.name, k.status) for k in task_objects], sep="\n")
    # Poll until no task is queued or running any more
    while True:
        for k in task_objects:
            k.reload() # re-fetch the task from the server so that .status is current
        statuses = [k.status for k in task_objects]
        if not any(s in ACTIVE_STATUSES for s in statuses):
            break
        time.sleep(POLL_SECONDS)
    print("Final statuses:")
    print(*[(k.name, k.status) for k in task_objects], sep="\n")
    # Keep only COMPLETED tasks - the same filter the lookup branch below applies
    kallisto_tasks = [k for k in task_objects if k.status == "COMPLETED"]
else:
    # Forgot to store or lost those tasks? No problem - query the project and find the ones you need.
    # Careful with this query, make sure the filter is the right one to return your tasks of choice
    task_query = "FinalAnalysis_kallisto"
    # Query once and reuse the result; list() guards against a one-shot iterator
    matching_tasks = list(get_tasks_by_string(api, project, query=task_query))
    # Filter on the task's own status (the name string has no .status attribute)
    # NOTE(review): assumes get_tasks_by_string yields task objects with .name/.status - confirm in dream_helpers.py
    kallisto_tasks = [get_task_by_name(api, project, t.name) for t in matching_tasks if t.status == "COMPLETED"]
    print(*get_names(matching_tasks), sep="\n")

(u'FinalAnalysis_kallisto-quant-dream_sim11 - 08-19-2016 13:16:09', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim13 - 08-19-2016 13:16:11', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim14 - 08-19-2016 13:16:13', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim15 - 08-19-2016 13:16:15', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim16 - 08-19-2016 13:16:16', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim17 - 08-19-2016 13:16:19', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim19 - 08-19-2016 13:16:21', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim1 - 08-19-2016 13:16:22', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim21 - 08-19-2016 13:16:24', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim2 - 08-19-2016 13:16:26', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim3 - 08-19-2016 13:16:28', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim4 - 08-19-2016 13:16:30', u'QUEUED')
(u'FinalAnalysis_kallisto-quant-dream_sim5 - 08-19-2016 

In [None]:
# Fetch the isoform quantification evaluation workflow (used in the next cells)
# and show its input ports
EVAL_APP_NAME = "DREAM Isoform Quantification Evaluation Workflow"
eval_app = get_app_by_name(api, project, EVAL_APP_NAME)
print_app_inputs(eval_app)

In [None]:
"""Inputs, Assemble!"""
# Grab all the abundance TSV files for the kallisto tasks - note that outputs are stored by their Output Port ID
kallisto_outputs = [t.outputs["abundance_tsv"] for t in kallisto_tasks]

# GTF file, I choose you.
gtf = get_file_by_name(api, project, "Homo_sapiens.GRCh37.75.gtf.txt")

# Seek the truth files - isoform
truths = filter_by_all_strings(get_files_in_project(api, project), filter_list=['isoforms_truth'])

# Sort and zip our outputs and truth files.
# Every kallisto output is used (the earlier debug-only [:-1] slice dropped the last sample);
# list() keeps the pairs reusable after the sanity-check loop below.
eval_inputs = list(tuplify_lists_by_name(kallisto_outputs, truths))

# Sanity check our evaluation workflow inputs
print("Evaluation Inputs: ")
for e in eval_inputs:
    print("({}, {})".format(e[0].name, e[1].name))
print("\nGTF file: {}".format(gtf.name))

In [None]:
# Set up the evaluation tasks: one per (kallisto output, truth file) pair
debug = True # True = dry run (nothing is sent to the API); set to False to allow for drafting/running tasks
run_opt = False # forwarded as run=...: True runs each task, False only drafts it
eval_tasks = [] # task objects returned by the API
eval_task_names = [] # every generated task name, including dry-run ones

for ko in eval_inputs:

    # Create individualized task names: leading "_"-separated token of the output filename + current time
    current_time = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    TASK_NAME = "DREAM_Isoform_Eval_{} - {}".format(ko[0].name.split("_")[0], current_time)
    eval_task_names.append(TASK_NAME)

    # Create the input object
    INPUTS = {
                "gtf": gtf,
                "input": ko[0],
                "truth": ko[1]
             }

    # Create the task (skipped entirely in debug mode)
    if debug:
        print("Task NOT created (debug mode): {}".format(TASK_NAME))
    else:
        new_eval_task = api.tasks.create(name=TASK_NAME, project=project, app=eval_app, inputs=INPUTS, run=run_opt)
        eval_tasks.append(new_eval_task)
        print("Task created: {}".format(TASK_NAME))

    print("Input files: {}, {}".format(ko[0].name, ko[1].name))
    print("\n")

In [None]:
"""Check task statuses"""
# NOTE(review): this replaces the eval_tasks list built by the previous cell with a
# single task fetched by a hard-coded ID - confirm that is intended before relying on it.
task_id = "6d2becba-c266-4d84-9de5-92b598de1042" # RSEM with TUMOR_FASTQ in labels (not IDs)
# task_id="b8214119-527e-47e1-8489-d777ee35631d" # RSEM being run
eval_tasks = [api.tasks.get(task_id)]

# Report only when check_task_status_all (dream_helpers.py) returns a truthy value
all_done = check_task_status_all(eval_tasks)
if all_done:
    print("All tasks completed")

# outputs = [task.outputs for task in eval_tasks if task.output]