# CGC API Quickstart
This guide leads you through a simple RNA sequencing analysis, which parallels the GUI Quickstart, using the CGC API. We have written this example in Python, but the concepts can be adapted to your preferred programming language. We encourage you to try this analysis yourself.

## Set project name, application, and AUTH_TOKEN
In the code below, please replace the AUTH_TOKEN string with your authentication token string! Otherwise the code will only mock you. The authentication token is associated with your account; you can get it from the [Developer Dashboard](https://cgc.sbgenomics.com/account/#developer) after logging into your account. Remember to **keep your AUTH_TOKEN secure!**

In [None]:
#  IMPORTS
import time as timer
from requests import request
import json
from urllib2 import urlopen
import os


#  GLOBALS
# Run-time switches shared by the cells below:
#   targetFound - the target project already exists in the CGC
#   taskRunning - a task is still running
#   startTasks  - when False, tasks are created but NOT started
FLAGS = dict(targetFound=False, taskRunning=False, startTasks=True)

# Project we will create in the CGC (Settings > Project name in the GUI)
TARGET_PROJECT = 'Quickstart_API'
# App (workflow) to use
TARGET_APP = 'RNA-seq Alignment - STAR for TCGA PE tar'
# File-name suffix that marks an input file
INPUT_EXT = 'tar.gz'

# TODO: replace 'AUTH_TOKEN' with yours here
AUTH_TOKEN = 'AUTH_TOKEN'

## Functions & Classes
Since we are going to write the functions that interact with the API in Python, we'll prepare a function that converts the information we send and receive into JSON. 
We will not only create things but also need to interact with them, so in this demo we also may use object oriented programming. The class definition is below. Generally, the api_calls will either return a **list of things** (e.g. *myFiles is plural*) or a very **detailed description of one thing** (e.g. *myFile is singular*). The appropriate structure is created automatically in the response_to_fields() method. 

In [None]:
#  FUNCTIONS
def api_call(path, method='GET', query=None, data=None, flagFullPath=False):
    """Send one HTTP request to the CGC API and return the decoded JSON body.

    Code adapted from the Seven Bridges platform API example
    https://docs.sbgenomics.com/display/developerhub/Quickstart
    flagFullPath is novel, added to smoothly resolve API pagination issues.

    Args:
        path: endpoint relative to the v2 base URL, or a complete URL when
            flagFullPath is True.
        method: HTTP verb handed to requests.request.
        query: optional URL query parameters.
        data: request payload; only dict and list payloads are serialised to
            JSON, any other value is dropped and no body is sent.
        flagFullPath: when True, `path` is used verbatim instead of being
            appended to the base URL.

    Returns:
        The decoded response body, or {} when the body is empty.

    Raises:
        Exception: the server answered with a non-2xx status code.
        ValueError: a 2xx response carried a body that is not valid JSON.
    """
    payload = json.dumps(data) if isinstance(data, (dict, list)) else None
    base_url = 'https://cgc-api.sbgenomics.com/v2/'

    headers = {
        'X-SBG-Auth-Token': AUTH_TOKEN,
        'Accept': 'application/json',
        'Content-type': 'application/json',
    }

    url = path if flagFullPath else base_url + path
    response = request(method, url, params=query, data=payload, headers=headers)

    # Floor division keeps the "2xx" test correct on Python 2 and Python 3.
    succeeded = response.status_code // 100 == 2

    try:
        response_dict = json.loads(response.content) if response.content else {}
    except ValueError:
        if succeeded:
            raise
        # A non-JSON error page must not hide the HTTP status reported below.
        response_dict = {}

    if not succeeded:
        # Error bodies do not always carry a 'message' field.
        if isinstance(response_dict, dict) and 'message' in response_dict:
            print(response_dict['message'])
        raise Exception('Server responded with status code %s.' % response.status_code)
    return response_dict

def print_project_details(proj, flag_new):
    """Print a short, bulleted summary of a project.

    Args:
        proj: object exposing `name`, `id` and `description` attributes.
        flag_new: True selects the "new project" header, False the
            "project exists" header.

    Returns:
        None
    """
    if flag_new:
        print("Congratulations, you have made a new project. Details: \n")
    else:
        print("Your project exists. Details: \n")

    bullet = u'\u2022'
    fields = (("Name", "name"), ("ID", "id"), ("Description", "description"))
    for label, attr in fields:
        # The value is passed inside a tuple so that a tuple-valued attribute
        # is printed as-is instead of being unpacked by the % operator.
        print(bullet + "%s: %s \n" % (label, getattr(proj, attr)))
    return None

def download_files(fileList):
    """Download every URL in fileList[1:] into 'files/downloads/'.

    Adapted from a few stackoverflow threads.

    Args:
        fileList: list whose first entry is a text header (skipped) and whose
            remaining entries are download URLs.

    The local file name is the last path segment of the URL without its query
    string; when that segment contains '%2B', the part after the first '%2B'
    is used. A progress line is printed each time the download has advanced
    by more than 20 percentage points since the last report.
    """
    dl_dir = 'files/downloads/'
    if not os.path.isdir(dl_dir):
        hello()                 # kept from the original: greets when the directory is missing
        # makedirs creates missing parents and copes with 'files/' already existing
        os.makedirs(dl_dir)

    block_sz = 1024*1024
    for url in fileList[1:]:    # skip first [0] entry, it is a text header
        file_name = url.split('/')[-1].split('?')[0]
        pieces = file_name.split('%2B')
        if len(pieces) > 1:     # names without '%2B' are used unchanged
            file_name = pieces[1]

        u = urlopen(url)
        try:
            lengths = u.info().getheaders("Content-Length")
            # the header may be absent; progress is then not reported
            file_size = int(lengths[0]) if lengths else None
            print("Downloading: %s Bytes: %s" % (file_name, file_size))

            file_size_dl = 0
            prior_percent = 0
            with open(dl_dir + file_name, 'wb') as f:
                while True:
                    chunk = u.read(block_sz)
                    if not chunk:
                        break
                    file_size_dl += len(chunk)
                    f.write(chunk)
                    if not file_size:
                        continue
                    percent = file_size_dl * 100. / file_size
                    if percent > (prior_percent+20):
                        status = r"%10d  [%3.2f%%]" % (file_size_dl, percent)
                        status = status + chr(8)*(len(status)+1)
                        print(status + '\n')
                        prior_percent = percent
        finally:
            u.close()           # release the connection even if the download fails

def hello():
    """Print a fixed greeting and return True."""
    greeting = "Is it me you're looking for?"
    print(greeting)
    return True


#%% CLASSES
class API(object):
    """Attribute-style view of one CGC API response.

    The constructor performs the request through api_call() and copies the
    response onto the instance:

    * list responses (a dict with an 'items' list) become parallel lists, one
      attribute per key of the first item (e.g. `.id`, `.name`);
    * detail responses become one attribute per top-level key.

    When a list response carries pagination links, the following pages are
    fetched and appended automatically.
    """

    def __init__(self, path, method='GET', query=None, data=None, flagFullPath=False):
        """Issue the request (same arguments as api_call) and load the result."""
        self.flag = {'longList': False}
        response_dict = api_call(path, method, query, data, flagFullPath)
        self.response_to_fields(response_dict)

        if self.flag['longList']:
            self.long_list(response_dict, path, method, query, data)

    def response_to_fields(self, rd):
        """Dispatch the response dict `rd` to the matching reader."""
        if 'items' not in rd:   # details about ONE {file, project, task, app}  (object name singular)
            self.detail_read(rd)
        elif rd['items']:       # get * {files, projects, tasks, apps}          (object name plural)
            self.list_read(rd)
        else:
            self.empty_read(rd)

    def list_read(self, rd):
        """Turn rd['items'] into one list attribute per key of the first item.

        Items lacking a key contribute None so that all lists stay aligned.
        Sets flag['longList'] when the response carries a non-empty 'links'.
        """
        items = rd['items']
        for key in list(items[0]):
            setattr(self, key, [item.get(key) for item in items])

        # .get() avoids a KeyError on responses that have no 'links' entry
        if rd.get('links'):
            self.flag['longList'] = True

    def empty_read(self, rd):    # in case an empty project is queried
        """Provide empty href/id/name/project lists for an empty list response."""
        self.href = []
        self.id = []
        self.name = []
        self.project = []

    def detail_read(self, rd):
        """Copy every top-level key of `rd` onto the instance as an attribute."""
        for key, value in rd.items():
            setattr(self, key, value)

    @staticmethod
    def _next_href(rd):
        """Return the href of the 'next' pagination link in `rd`, or None."""
        for link in rd.get('links') or []:
            if link.get('rel') == 'next':
                return link.get('href')
        return None

    def long_list(self, rd, path, method, query, data):
        """Follow 'next' links from `rd` and append each page to the lists.

        `rd` is the first page, already loaded by list_read(); `path` is
        accepted for interface compatibility and not used. The remaining
        arguments are forwarded unchanged to api_call for every page.
        """
        keys = list(rd['items'][0])

        next_href = self._next_href(rd)
        while next_href is not None:    # no 'next' link means END_OF_LIST
            rd = api_call(next_href, method, query, data, flagFullPath=True)
            for item in rd.get('items', []):
                for key in keys:
                    getattr(self, key).append(item.get(key))
            next_href = self._next_href(rd)


In [None]:
# Stop right away when this file is loaded as a module instead of being run.
if __name__ != "__main__":
    exit()

# The placeholder value means the token was never filled in, so nothing below can work.
token_is_placeholder = (AUTH_TOKEN == 'AUTH_TOKEN')
if token_is_placeholder:
    print("You need to replace 'AUTH_TOKEN' string with your actual token. Please fix it.")
    exit()

## Create a project
Projects are the foundation of any analysis on the CGC. We can either use a project that has already been created, or we can use the API to create one. Here we will create a new project, but first check that it doesn't exist to show both methods. The *project name*, Pilot Fund *billing group*, and a project *description* will be sent in our API call. 

In [None]:
# list all billing groups on your account
billingGroups = API('billing/groups')
if not billingGroups.id:
    raise Exception('No billing group is available on this account, a project can not be created.')
# Select the first billing group, this is "Pilot_funds(USER_NAME)"
print(billingGroups.name[0] +
      ' will be charged for this computation. Approximate price is $4 for example STAR RNA seq (n=1) \n')

# list all projects you are part of
existingProjects = API(path='projects')                         # make sure your project doesn't already exist

# set up the information for your new project
NewProject = {
        'billing_group': billingGroups.id[0],
        'description': "A project created by the API Quickstart",
        'name': TARGET_PROJECT,
        'tags': ['tcga']
}

# Check to make sure your project doesn't already exist on the platform;
# remember its ID explicitly instead of relying on the loop index afterwards
target_project_id = None
for ii, p_name in enumerate(existingProjects.name):
    if TARGET_PROJECT == p_name:
        FLAGS['targetFound'] = True
        target_project_id = existingProjects.id[ii]
        break

# Make a shiny, new project
if FLAGS['targetFound']:
    myProject = API(path=('projects/' + target_project_id))     # GET existing project details (we need them later)
else:
    createdProject = API(method='POST', data=NewProject, path='projects')   # POST new project
    # (re)list all projects, so later cells see the new project as well
    existingProjects = API(path='projects')
    # GET new project details (we will need them later). The ID comes from the
    # POST response, not from the position of the project in the listing.
    myProject = API(path=('projects/' + createdProject.id))

## Add files
Here we have multiple options for adding data to a project, but will only present:
* Copy files from existing project (API)

Here we will take advantage of the already created Quickstart project from the GUI tutorial. This code will look for our three input files from that project and copy them over. 

Note: other options are available in docs (TODO: link)

In [None]:
# Locate the project created by the GUI Quickstart and list its files
filesToCopy = None
for ii, p_id in enumerate(existingProjects.id):
    if existingProjects.name[ii] == 'QuickStart':
        filesToCopy = API(('files?limit=100&project=' + p_id))
        break
if filesToCopy is None:
    # without this check the loop below fails with an unrelated NameError
    raise Exception("Source project 'QuickStart' was not found, there are no files to copy.")

# Don't make extra copies of files (loop through all files because we don't know what we want)
myFiles = API(('files?limit=100&project=' + myProject.id))  # files currently in project
# Suffixes are HARDCODED for RNA Seq STAR workflow
wanted_suffixes = (INPUT_EXT, 'sta', 'gtf')
for jj, f_name in enumerate(filesToCopy.name):
    if f_name.endswith(wanted_suffixes) and f_name not in myFiles.name:    # file currently not in project
        api_call(path=(filesToCopy.href[jj] + '/actions/copy'), method='POST',
                 data={'project': myProject.id, 'name': f_name}, flagFullPath=True)

## Add Applications or Workflows
There are more than 150 public apps available on the Seven Bridges CGC. Here we query all of them, then copy the target workflow to our project. 

In [None]:
# GET files LIST again, regardless of how the files got into the project
myFiles = API(('files?limit=100&project=' + myProject.id))

# Add workflow (copy from other project or GUI, not looping through all apps, we know exactly what we want)
allApps = API(path='apps?limit=100&visibility=public')          # long function call, currently 183
myApps = API(path=('apps?limit=100&project=' + myProject.id))
if TARGET_APP in allApps.name:
    ii = allApps.name.index(TARGET_APP)
    if TARGET_APP not in myApps.name:                           # app not already in project
        # copy app from public repository
        temp_name = allApps.href[ii].split('/')[-2]
        copy_path = 'apps/' + allApps.project[ii] + '/' + temp_name + '/actions/copy'
        api_call(path=copy_path, method='POST',
                 data={'project': myProject.id, 'name': TARGET_APP})
        # update project app list
        myApps = API(path=('apps?limit=100&project=' + myProject.id))
else:
    print("Target app (%s) does not exist in the public repository. Please double-check the spelling" % (TARGET_APP))
del allApps

## Build a file processing list
Most likely, we will only have one input file and two reference files in the project. However, if multiple input files were imported, this will create a batch of *single-input-single-output tasks* - one for each file. This code builds the list of files.

In [None]:
# Build .fileProcessing (inputs) and .fileIndex (references) lists [for workflow]
FileProcList = ['Files to Process']
Ind_GtfFile = None
Ind_FastaFile = None

for ii,f_name in enumerate(myFiles.name):
    # this conditional is for 'RNA seq STAR alignment' in Quickstart_API. _Adapt_ appropriately for other workflows
    if f_name[-len(INPUT_EXT):] == INPUT_EXT:                                      # input file
        FileProcList.append(ii)
    elif f_name[-len('gtf'):] == 'gtf':
        Ind_GtfFile = ii
    elif f_name[-len('sta'):] == 'sta':
        Ind_FastaFile = ii

## Build & Start tasks
Next we will iterate through the File Processing List (FileProcList) to generate one task for each input file. If the Flag *startTasks* is true, the tasks will start running immediately.

In [None]:
myTaskList = []                                                 # hrefs of the tasks that were started
taskInputs = FileProcList[1:]                                   # FileProcList[0] is a header

# Both reference files are required by every task; fail with a clear message
# instead of a TypeError from indexing a list with None.
if taskInputs and (Ind_FastaFile is None or Ind_GtfFile is None):
    raise Exception('Reference files (fasta and gtf) were not found in the project, can not build tasks.')

for ii, f_ind in enumerate(taskInputs):
    NewTask = {'description': 'APIs are awesome',
        'name': ('batch_task_' + str(ii)),
        'app': (myApps.id[0]),                                  # ASSUMES only single workflow in project
        'project': myProject.id,
        'inputs': {
            'genomeFastaFiles': {                               # .fasta reference file
                'class': 'File',
                'path': myFiles.id[Ind_FastaFile],
                'name': myFiles.name[Ind_FastaFile]
            },
            'input_archive_file': {                             # File Processing List
                'class': 'File',
                'path': myFiles.id[f_ind],
                'name': myFiles.name[f_ind]
            },
            # .gtf reference file, !NOTE: this workflow expects a _list_ for this input
            'sjdbGTFfile': [
               {
                'class': 'File',
                'path': myFiles.id[Ind_GtfFile],
                'name': myFiles.name[Ind_GtfFile]
               }
            ]
        }
    }
    # Create the tasks, run if FLAGS['startTasks']
    if FLAGS['startTasks']:
        myTask = api_call(method='POST', data=NewTask, path='tasks/', query={'action': 'run'})  # task created and run
        myTaskList.append(myTask['href'])
    else:
        myTask = api_call(method='POST', data=NewTask, path='tasks/')                           # task created, NOT started

# Count the inputs directly: the loop variable is undefined (or stale) when there were none
print("%i tasks have been created. \n" % len(taskInputs))
print("Enjoy a break, come back to us once you've gotten an email that tasks are done")

## Check task completion
These tasks may take a long time to complete; here are two ways to check in on them:
* Wait for email confirmation. No additional code is needed; emails will arrive whether the task was started by GUI or API.
* Poll the API for the status of each task, as the cell below does.

In [None]:
# if tasks were started, check if they've finished
for href in myTaskList:
    # check on one task at a time, if any running, can not continue (no sense to query others)
    print "Pinging CGC for task completion, will download summary files once all tasks completed."
    FLAGS['taskRunning'] = True
    while FLAGS['taskRunning']:
        task = api_call(path=href, flagFullPath=True)
        if task['status'] == 'COMPLETED':
            FLAGS['taskRunning'] = False
        elif task['status'] == 'FAILED':                        # NOTE: also leaving loop on "FAILED" statuses
            print "Task failed, can not continue"
            exit()
        timer.sleep(600)

## EXTRA CELLS 
From the Quickstart, these are the cells for:
* downloading files
* uploading local files
* setting file metadata

In [None]:
from urllib2 import urlopen
import os
 
def download_files(fileList):
    """Download every URL in fileList[1:] into 'downloads/'.

    Adapted from a few stackoverflow threads.

    Args:
        fileList: list whose first entry is a text header (skipped) and whose
            remaining entries are download URLs.

    The local file name is the last path segment of the URL without its query
    string; when that segment contains '%2B', the part after the first '%2B'
    is used. A progress line is printed each time the download has advanced
    by more than 20 percentage points since the last report.
    """
    dl_dir = 'downloads/'
    if not os.path.isdir(dl_dir):   # make sure we have the download directory
        os.makedirs(dl_dir)

    block_sz = 1024*1024
    for url in fileList[1:]:        # skip first [0] entry, it is a text header
        file_name = url.split('/')[-1].split('?')[0]
        pieces = file_name.split('%2B')
        if len(pieces) > 1:         # names without '%2B' are used unchanged
            file_name = pieces[1]

        u = urlopen(url)
        try:
            lengths = u.info().getheaders("Content-Length")
            # the header may be absent; progress is then not reported
            file_size = int(lengths[0]) if lengths else None
            print("Downloading: %s Bytes: %s" % (file_name, file_size))

            file_size_dl = 0
            prior_percent = 0
            with open(dl_dir + file_name, 'wb') as f:
                while True:
                    chunk = u.read(block_sz)
                    if not chunk:
                        break
                    file_size_dl += len(chunk)
                    f.write(chunk)
                    if not file_size:
                        continue
                    percent = file_size_dl * 100. / file_size
                    if percent > (prior_percent+20):
                        status = r"%10d  [%3.2f%%]" % (file_size_dl, percent)
                        status = status + chr(8)*(len(status)+1)
                        print(status + '\n')
                        prior_percent = percent
        finally:
            u.close()               # release the connection even if the download fails

# Check which files have been generated (only taking small files to avoid long times)
myNewFiles = API(('files?project=' + myProject.id))        # calling again to see what was generated
dlList = ["links to file downloads"]

for ii, f_name in enumerate(myNewFiles.name):
    # downloading only the summary files. Adapt for whichever files you need
    if (f_name[-4:] == '.out'):
        dlList.append(api_call(path=('files/' + myNewFiles.id[ii] + '/download_info'))['url'])
T0 = timer.time()
download_files(dlList)
print timer.time() - T0, "seconds download time"

In [None]:
# TODO: validate this
# NOTE(review): the project id and the local directory below are hard-coded - confirm before running

print("You need to install the command line uploader before proceeding")
ToUpload = ['G17498.TCGA-02-2483-01A-01R-1849-01.2.tar.gz', 'ucsc.hg19.fasta', 'human_hg19_genes_2014.gtf']
for ii, upload_name in enumerate(ToUpload):
    # one shell invocation of the uploader per local file
    cmds = ("cd ~/cgc-uploader; bin/cgc-uploader.sh -p 0f90eae7-2a76-4332-a233-6d20990189b7 "
            "/Users/digi/PycharmProjects/cgc_API/toUpload/" + upload_name)
    os.system(cmds)
del cmds

In [None]:
# GET files LIST
myFiles = API(('files?project=' + myProject.id))

# Example metadata written to one file of the project
metadata = {
    "name": "readme.md",
    "library": "TEST",
    "file_type": "fastq",
    "sample": "example_human_Illumina",
    "seq_tech": "Illumina",
    "paired_end": "1",
    "gender": "male",
    "data_format": "awesome",
}

# The fourth file in the listing is the target; show the URL, then PUT the metadata to it
metadata_url = myFiles.href[3] + '/metadata'
print(metadata_url)

api_call(path=metadata_url, method='PUT', data=metadata, flagFullPath=True)