In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import requests
import yaml
import json
import os
import pandas as pd
import gzip
import shutil

In [33]:
from gdc.download import api_download_iterative

Config

In [22]:
%ls conf/

__init__.py  user_conf.yaml


In [23]:
# Load the user configuration; later cells read conf['data_path'] from it.
with open('conf/user_conf.yaml', mode='r') as f:
    # NOTE(review): FullLoader is kept as written. yaml.safe_load would be the
    # stricter choice if this file only holds plain mappings/scalars — confirm.
    conf = yaml.load(f, Loader=yaml.FullLoader)

Paths

In [24]:
# Both metadata tables sit directly under the configured data directory.
data_path = conf['data_path']
slides_metadata_file = os.path.join(data_path, 'slides_metadata.csv')
rnaseq_metadata_file = os.path.join(data_path, 'rnaseq_metadata.csv')

In [25]:
# Target directory handed to the slide download further down.
slides_path = os.path.join(conf['data_path'], 'slides', 'svs')
# makedirs also creates the intermediate 'slides' directory, which a plain
# os.mkdir cannot do; exist_ok=True keeps the cell safe to re-run.
os.makedirs(slides_path, exist_ok=True)

In [26]:
# Target directory handed to the RNA-Seq download further down.
rnaseq_path = os.path.join(conf['data_path'], 'rnaseq')

# Create the directory (and any missing parents) in one idempotent call instead
# of a separate exists-check followed by mkdir.
os.makedirs(rnaseq_path, exist_ok=True)

# Download Files

Read metadata

In [None]:
# Read the pipe-separated slide metadata and keep a fixed subset of its columns.
slides_columns = ['file_name', 'file_id', 'file_size', 'sample_id', 'experimental_strategy']
slides_df = pd.read_csv(slides_metadata_file, sep='|').loc[:, slides_columns]
slides_df.head(3)

In [None]:
# Read the pipe-separated RNA-Seq metadata and keep a fixed subset of its columns.
rnaseq_columns = ['file_name', 'file_id', 'file_size', 'sample_id', 'workflow_type']
rnaseq_df = pd.read_csv(rnaseq_metadata_file, sep='|').loc[:, rnaseq_columns]
rnaseq_df.head(3)

## Slides

*Keep only the ones with RNASeq related data?*
*Keep only tissue slides?*

Number of files and total size per experimental strategy

In [None]:
# File count and summed file size for each experimental strategy.
summary = (
    slides_df
    .groupby('experimental_strategy')
    .agg({'file_name': 'size', 'file_size': 'sum'})
    .rename(columns={'file_name': 'count', 'file_size': 'total_size (gb)'})
)
# NOTE(review): dividing by 1000 only yields GB if file_size is stored in MB —
# confirm the unit used in the metadata file.
summary['total_size (gb)'] = summary['total_size (gb)'].div(1000).round(2)

summary

In [None]:
# Distinct sample ids present in the RNA-Seq metadata.
rna_seq_samples = pd.unique(rnaseq_df['sample_id'])

In [None]:
# Keep only the slides whose sample also appears in the RNA-Seq metadata.
has_rnaseq = slides_df['sample_id'].isin(rna_seq_samples)
slides_df = slides_df.loc[has_rnaseq]

In [None]:
# Same per-strategy summary, computed on the current (filtered) slides_df.
summary = (
    slides_df
    .groupby('experimental_strategy')
    .agg({'file_name': 'size', 'file_size': 'sum'})
    .rename(columns={'file_name': 'count', 'file_size': 'total_size (gb)'})
)
# NOTE(review): dividing by 1000 only yields GB if file_size is stored in MB —
# confirm the unit used in the metadata file.
summary['total_size (gb)'] = summary['total_size (gb)'].div(1000).round(2)

summary

In [None]:
# Download only failed
files_downloaded = os.listdir(slides_path)
slides_df = slides_df[-slides_df['file_name'].isin(files_downloaded)]

In [None]:
# Per-strategy summary of the slides that remain in slides_df at this point.
summary = (
    slides_df
    .groupby('experimental_strategy')
    .agg({'file_name': 'size', 'file_size': 'sum'})
    .rename(columns={'file_name': 'count', 'file_size': 'total_size (gb)'})
)
# NOTE(review): dividing by 1000 only yields GB if file_size is stored in MB —
# confirm the unit used in the metadata file.
summary['total_size (gb)'] = summary['total_size (gb)'].div(1000).round(2)

summary

In [None]:
# Hand the remaining slide rows to the project download helper, targeting slides_path.
# NOTE(review): multiprocess=4 presumably sets the number of parallel workers —
# confirm against gdc.download.
api_download_iterative(slides_df, slides_path, multiprocess=4)

## RNA-Seq

In [None]:
# Number of RNA-Seq files per workflow type.
summary = rnaseq_df.groupby('workflow_type').size()
# Name the count column explicitly; without it the displayed column is labelled `0`.
summary.reset_index(name='count')

In [None]:
# File count and summed file size for each RNA-Seq workflow type.
summary = (
    rnaseq_df
    .groupby('workflow_type')
    .agg({'file_name': 'size', 'file_size': 'sum'})
    .rename(columns={'file_name': 'count', 'file_size': 'total_size (gb)'})
)
# NOTE(review): dividing by 1000 only yields GB if file_size is stored in MB —
# confirm the unit used in the metadata file.
summary['total_size (gb)'] = summary['total_size (gb)'].div(1000).round(2)

summary

In [None]:
# Samples that have both a slide and RNA-Seq data. The list is rebuilt from the
# metadata file because slides_df has, by this point, been narrowed to slides
# that are not yet on disk; deriving it from that frame would drop the RNA-Seq
# files of every sample whose slides were already downloaded.
all_slide_samples = pd.read_csv(slides_metadata_file, sep='|', usecols=['sample_id'])['sample_id']
slides_samples = all_slide_samples[all_slide_samples.isin(rna_seq_samples)].unique()

In [None]:
# Keep only the RNA-Seq files whose sample is listed in slides_samples.
has_slide = rnaseq_df['sample_id'].isin(slides_samples)
rnaseq_df = rnaseq_df.loc[has_slide]

In [None]:
# Per-workflow summary, computed on the current (filtered) rnaseq_df.
summary = (
    rnaseq_df
    .groupby('workflow_type')
    .agg({'file_name': 'size', 'file_size': 'sum'})
    .rename(columns={'file_name': 'count', 'file_size': 'total_size (gb)'})
)
# NOTE(review): dividing by 1000 only yields GB if file_size is stored in MB —
# confirm the unit used in the metadata file.
summary['total_size (gb)'] = summary['total_size (gb)'].div(1000).round(2)

summary

In [None]:
# Hand the selected RNA-Seq rows to the project download helper, targeting rnaseq_path.
# The return value is kept: a later cell iterates over it and joins each item
# onto rnaseq_path as a file name.
# NOTE(review): multiprocess=8 presumably sets the number of parallel workers —
# confirm against gdc.download.
files = api_download_iterative(rnaseq_df, rnaseq_path, multiprocess=8)

In [None]:
def gunzip(source_filepath, dest_filepath):
    """Decompress a gzip file to a separate destination file.

    Args:
        source_filepath: Path of the gzip-compressed input file.
        dest_filepath: Path the decompressed bytes are written to. An existing
            file at this path is overwritten.

    Raises:
        ValueError: If both paths point at the same location. Opening the
            destination for writing would truncate the archive before it
            could be read.
    """
    if os.path.abspath(source_filepath) == os.path.abspath(dest_filepath):
        raise ValueError(
            'source and destination must differ: {!r}'.format(source_filepath)
        )

    # copyfileobj moves the data across in chunks rather than reading it all at once.
    with gzip.open(source_filepath, 'rb') as s_file, open(dest_filepath, 'wb') as d_file:
        shutil.copyfileobj(s_file, d_file)

In [None]:
# Decompress each downloaded archive next to itself, then delete the archive.
# NOTE(review): assumes `files` is an iterable of file names relative to
# rnaseq_path — confirm against api_download_iterative.
for file_name in files:
    # Only '.gz' names are processed. For any other name the stripped path would
    # equal the source path, so extraction would overwrite the file with itself
    # and the os.remove below would then delete it.
    if not file_name.endswith('.gz'):
        continue

    source_filepath = os.path.join(rnaseq_path, file_name)
    # Plain suffix slicing replaces the regex, so no `re` import is needed.
    dest_filepath = source_filepath[:-len('.gz')]

    gunzip(source_filepath, dest_filepath)
    os.remove(source_filepath)