In [38]:
# Load IPython's autoreload extension and reload already-imported modules once, now.
# NOTE(review): a bare `%autoreload` is a one-shot reload; `%autoreload 2` would
# reload modules before every cell execution — confirm which behaviour is wanted.
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
import time
import re
import os
import requests
import subprocess
import tarfile
import json
import pandas as pd
from datetime import datetime
import shutil
from tqdm._tqdm_notebook import tqdm_notebook as tqdm 

In [40]:
import sys
# Add the parent directory to the module search path
# (presumably so the local `gdc` package imported below can be found — confirm).
sys.path.append('../')

In [41]:
from gdc.download import gdc_tool_download, api_download_iterative, api_download_batch

Configuration

In [42]:
# Path to the gdc-client executable, handed to gdc_tool_download().
# Raw string: the backslashes in front of the spaces are part of the value, and
# '\ ' is not a valid escape sequence in a normal Python string literal.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
GDC_CLIENT_TOOL = r'/Users/portizdegalisteo/Google\ Drive/Master/TFM/Pablo\ Ortiz/gdc-client'
DATA_ENDPOINT = "https://api.gdc.cancer.gov/data/"

# Directory holding the dataset CSV files (and the temporary download folder).
DATA_DIR = 'data'

# Pipe-separated results file and the column order used for every row.
RESULTS_FILE = 'test_results.csv'
RESULTS_FIELDS = ['dataset', 'n_files', 'avg_size', 'total_size', 'tool',
                  'stream', 'iterative', 'time', 'speed', 'multiprocess', 'status']

# chunk_size passed to the API download helpers when a test does not set its own.
DEFAULT_CHUNK_SIZE = 1

# Absolute path of the scratch directory the downloads are written to.
TMP_DIR = os.path.abspath(os.path.join(DATA_DIR, 'tmp'))

Tests

In [43]:
# Benchmark matrix: one dict per run.
#   dataset      - CSV file (inside DATA_DIR) listing the files to download
#   tool         - 'gdc' (gdc-client executable) or 'api' (HTTP API helpers)
#   stream       - forwarded to the API helpers as `stream`
#   iterative    - True -> api_download_iterative, False -> api_download_batch
#   multiprocess - optional, forwarded to api_download_iterative
tests = [
         {'dataset': 'dataset_1.csv', 'tool': 'gdc'},
         {'dataset': 'dataset_1.csv', 'tool': 'api', 'stream': True, 'iterative': False},
         {'dataset': 'dataset_1.csv', 'tool': 'api', 'stream': True, 'iterative': True},
         # These two previously pointed at dataset_2.csv, duplicating the
         # dataset_2 entries below and leaving dataset_1 without non-stream runs.
         {'dataset': 'dataset_1.csv', 'tool': 'api', 'stream': False, 'iterative': False},
         {'dataset': 'dataset_1.csv', 'tool': 'api', 'stream': False, 'iterative': True},

         {'dataset': 'dataset_2.csv', 'tool': 'gdc'},
         {'dataset': 'dataset_2.csv', 'tool': 'api', 'stream': True, 'iterative': False},
         {'dataset': 'dataset_2.csv', 'tool': 'api', 'stream': True, 'iterative': True},
         {'dataset': 'dataset_2.csv', 'tool': 'api', 'stream': False, 'iterative': False},
         {'dataset': 'dataset_2.csv', 'tool': 'api', 'stream': False, 'iterative': True},

         {'dataset': 'dataset_3.csv', 'tool': 'gdc'},
         {'dataset': 'dataset_3.csv', 'tool': 'api', 'stream': True, 'iterative': False},
         {'dataset': 'dataset_3.csv', 'tool': 'api', 'stream': True, 'iterative': True},
         {'dataset': 'dataset_3.csv', 'tool': 'api', 'stream': False, 'iterative': False},
         {'dataset': 'dataset_3.csv', 'tool': 'api', 'stream': False, 'iterative': True},

         {'dataset': 'dataset_4.csv', 'tool': 'gdc'},
         {'dataset': 'dataset_4.csv', 'tool': 'api', 'stream': True, 'iterative': True},
         {'dataset': 'dataset_4.csv', 'tool': 'api', 'stream': False, 'iterative': True},

         {'dataset': 'dataset_5.csv', 'tool': 'gdc'},
         {'dataset': 'dataset_5.csv', 'tool': 'api', 'stream': True, 'iterative': False},
         {'dataset': 'dataset_5.csv', 'tool': 'api', 'stream': False, 'iterative': False},

         {'dataset': 'dataset_6.csv', 'tool': 'api', 'stream': True, 'iterative': True},
         {'dataset': 'dataset_6.csv', 'tool': 'api', 'stream': True, 'iterative': True, 'multiprocess': 2},
         {'dataset': 'dataset_6.csv', 'tool': 'api', 'stream': True, 'iterative': True, 'multiprocess': 4},
         {'dataset': 'dataset_6.csv', 'tool': 'api', 'stream': True, 'iterative': True, 'multiprocess': 8},
         {'dataset': 'dataset_6.csv', 'tool': 'api', 'stream': True, 'iterative': True, 'multiprocess': 16}
        ]

Auxiliary functions

In [44]:
def clear_dir(directory):
    """Delete `directory` with everything in it and recreate it empty.

    Args:
        directory: Path of the directory to reset. It does not need to exist
            beforehand; missing parent directories are created as well.
    """
    # Act on the argument rather than on the module-level TMP_DIR, so the
    # function really clears the directory it is asked to clear.
    if os.path.exists(directory):
        shutil.rmtree(directory)

    os.makedirs(directory)
    
def data_summary(df, printed=True):
    """Summarise the `file_size` column of a dataset table.

    Args:
        df: DataFrame with a `file_size` column; its row count is reported
            as the number of files.
        printed: When true, the three figures are also printed.

    Returns:
        dict with keys `n_files`, `avg_size` and `total_size`; the mean and
        the sum are rounded to two decimals.
    """
    sizes = df['file_size']
    summary = {
        'n_files': len(df),
        'avg_size': round(sizes.mean(), 2),
        'total_size': round(sizes.sum(), 2),
    }

    if printed:
        print('Number of files: {0:>8}'.format(summary['n_files']))
        print('Avg size (MB):{0:>11.2f}'.format(summary['avg_size']))
        print('Total size (MB):{0:>9.2f}'.format(summary['total_size']))

    return summary

def run_test(test):
    """Run one download benchmark and return its result record.

    Args:
        test: dict describing the run. Required key `dataset` (CSV file name
            inside DATA_DIR) and `tool` ('gdc' or 'api'). For the 'api' tool,
            `stream` and `iterative` are required as well; `multiprocess`
            and `chunk_size` are optional.

    Returns:
        dict holding the test parameters, the dataset summary (`n_files`,
        `avg_size`, `total_size`) and `status` ('OK' or 'ERROR'). On success
        it also holds `time` (elapsed seconds) and `speed` (total_size / time,
        i.e. the same figure that is printed as MB/s).

    Any `Exception` raised by the download is reported and recorded as status
    'ERROR' instead of propagating, so a failing test does not stop a batch.
    """
    print('Running test...\n')
    for key, value in test.items():
        print('\t', key, ': ', value, sep='')
    print()

    df = pd.read_csv(os.path.join(DATA_DIR, test['dataset']), sep='|')

    results = data_summary(df, printed=True)
    results = {**test, **results}
    print()

    chunk_size = test.get('chunk_size', DEFAULT_CHUNK_SIZE)

    time_start = time.time()

    try:

        # Short-circuit `and` (not bitwise `&`): `test['iterative']` is only
        # looked up once the tool is known to be 'api'.
        if test['tool'] == 'gdc':
            gdc_tool_download(df, TMP_DIR, GDC_CLIENT_TOOL)
        elif test['tool'] == 'api' and test['iterative'] is True:
            api_download_iterative(df, TMP_DIR, stream=test['stream'], chunk_size=chunk_size,
                                   multiprocess=test.get('multiprocess', False))
        elif test['tool'] == 'api' and test['iterative'] is False:
            api_download_batch(df, TMP_DIR, stream=test['stream'], chunk_size=chunk_size)
        else:
            raise ValueError('Invalid test parameters combination')

    except Exception as e:
        print('ERROR!')
        print(type(e).__name__, e.args)
        results['status'] = 'ERROR'

    else:

        # Stop the clock before doing any printing.
        time_elapsed = round(time.time() - time_start, 2)

        print('OK\n')

        # Store the same quantity that is printed (size / time). The value was
        # previously stored as time / size, contradicting the printed MB/s;
        # rows already in RESULTS_FILE from that formula hold seconds per MB.
        if time_elapsed > 0:
            speed = round(results['total_size'] / time_elapsed, 2)
        else:
            # Elapsed time rounded down to zero: avoid dividing by zero.
            speed = float('inf')

        results['time'] = time_elapsed
        results['speed'] = speed
        results['status'] = 'OK'

        print('Time: {}s'.format(time_elapsed))
        print('Speed: {}MB/s'.format(speed))

    print('-' * 80 + '\n')

    return results

Clear tmp dir

In [45]:
# Reset the temporary download directory so the tests start from an empty folder.
clear_dir(TMP_DIR)

## Datasets Info

In [46]:
# Load every distinct dataset referenced by the tests, keyed by file name
# (names visited in sorted order).
datasets = {
    dataset_name: pd.read_csv(os.path.join(DATA_DIR, dataset_name), sep='|')
    for dataset_name in sorted({test_case['dataset'] for test_case in tests})
}

In [47]:
# Print the file count and size summary of each loaded dataset.
for dataset_name in datasets:
    print('Dataset:', dataset_name)
    data_summary(datasets[dataset_name])
    print()

Dataset: dataset_1.csv
Number of files:       10
Avg size (MB):       0.44
Total size (MB):     4.36

Dataset: dataset_2.csv
Number of files:        4
Avg size (MB):      26.54
Total size (MB):   106.18

Dataset: dataset_3.csv
Number of files:        4
Avg size (MB):     199.11
Total size (MB):   796.45

Dataset: dataset_4.csv
Number of files:        1
Avg size (MB):    1503.25
Total size (MB):  1503.25

Dataset: dataset_5.csv
Number of files:        3
Avg size (MB):    3280.28
Total size (MB):  9840.83

Dataset: dataset_6.csv
Number of files:       29
Avg size (MB):      41.52
Total size (MB):  1204.07



## Run Tests

In [48]:
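# Uncomment to (re)create the results file containing only the header row.
# The run loop below appends to RESULTS_FILE, so keep this disabled to preserve earlier results.
# with open(RESULTS_FILE, 'w') as f:
#     f.write('|'.join(RESULTS_FIELDS) + '\n')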

In [53]:
# Keep only the tests from index 15 onwards; the first 15 are skipped.
# NOTE: rebinding `tests` makes this cell non-idempotent — running it again
# drops another 15 entries. Re-run the cell that defines `tests` first.
tests = tests[15:]

In [54]:
# Run every test after the first one in `tests`. After each run the temporary
# directory is emptied and the result row is appended to RESULTS_FILE right
# away, so finished runs stay on disk if a later one is interrupted.
for test in tests[1:]:

    results = run_test(test)
    clear_dir(TMP_DIR)

    row = '|'.join(str(results.get(field, '')) for field in RESULTS_FIELDS)
    with open(RESULTS_FILE, 'a') as results_file:
        results_file.write(row + '\n')

Running test...

	dataset: dataset_4.csv
	tool: api
	stream: True
	iterative: True

Number of files:        1
Avg size (MB):    1503.25
Total size (MB):  1503.25



HBox(children=(IntProgress(value=0, description='Files', max=1, style=ProgressStyle(description_width='initial…

 

HBox(children=(IntProgress(value=0, description='TCGA-IB-7889-01Z-00-DX1.6D4EE9ED-5AE0-4AC2-B75C-2B26F562D346.…

KeyboardInterrupt: 