In [1]:
import json
import os
from urllib.parse import quote

import requests

from dcicutils import ff_utils

In [2]:
# Read the credentials file and keep only the entry named by `keyname`.
# Adjust both the path and the key name to match your own setup.
keyname = 'data'
with open('../../keypairs.json', 'r') as keyfile:
    all_keypairs = json.load(keyfile)
my_key = all_keypairs[keyname]

In [3]:
# fetch every biosource in 4DN, requesting the 'object' frame
biosources = ff_utils.search_metadata(
    'biosources/?frame=object',
    key=my_key,
)

In [4]:
# sample json: display the biosource record at index 6 to show which fields
# a record carries (e.g. 'display_title', which is used for matching later)
biosources[6]

{'@id': '/biosources/4DNSRB6N74S2/',
 '@type': ['Biosource', 'Item'],
 'accession': '4DNSRB6N74S2',
 'aliases': ['4dn-dcic-lab:CD1-mouse-wt-thymocytes'],
 'award': '/awards/1U01CA200059-01/',
 'biosource_category': ['Multicellular Tissue'],
 'biosource_name': 'thymocyte',
 'biosource_type': 'tissue',
 'cell_line_tier': 'Unclassified',
 'date_created': '2019-09-26T18:43:00.517775+00:00',
 'description': 'Thymocytes isolated from a WT CD1 mouse',
 'display_title': 'thymocyte - 4DNSRB6N74S2',
 'external_references': [],
 'individual': '/individuals-mouse/4DNIN623JJBI/',
 'lab': '/labs/4dn-dcic-lab/',
 'last_modified': {'date_modified': '2019-10-21T17:17:34.355856+00:00',
  'modified_by': '/users/e2324f87-0625-4bbc-803b-d47677aebe08/'},
 'principals_allowed': {'edit': ['group.admin'], 'view': ['system.Everyone']},
 'project_release': '2019-10-21',
 'public_release': '2019-10-21',
 'schema_version': '2',
 'status': 'released',
 'submitted_by': '/users/e2324f87-0625-4bbc-803b-d47677aebe08/',

In [5]:
# fetch every experiment type in 4DN and keep only its display title
experiment_type_items = ff_utils.search_metadata('experiment-types', key=my_key)
exp_types = [exp_type['display_title'] for exp_type in experiment_type_items]

In [6]:
# display the collected experiment type names
exp_types

['pA-DamID',
 'ChIA-Drop',
 'in situ ChIA-PET',
 'single cell Methyl Hi-C',
 'Methyl Hi-C',
 'Bru-seq',
 'Immunofluorescence',
 'Electron Tomography',
 'TRIP',
 'Dilution Hi-C',
 'DNase Hi-C',
 'Micro-C',
 'ATAC-seq',
 'Capture Hi-C',
 'PLAC-seq',
 '2-stage Repli-seq',
 'DNA SPRITE',
 'single cell Hi-C',
 'sci-Hi-C',
 'GAM',
 'MARGI',
 'TCC',
 'in situ Hi-C',
 'ChIA-PET',
 'DamID-seq',
 'DNA FISH',
 'Multi-stage Repli-seq',
 'TSA-seq',
 'ChIP-seq',
 'CUT&RUN',
 'MC-Hi-C',
 'MC-3C',
 'RNA-DNA SPRITE',
 'SPT',
 'sn-Hi-C',
 'RNA-seq',
 'NAD-seq',
 'TrAC-loop',
 'RNA FISH']

In [7]:
# assays to include in the search - edit as needed, using the exact names
# shown in the experiment type listing
selected_assays = [
    'in situ Hi-C',
    'ChIA-PET',
    'DNase Hi-C',
]

In [8]:
# grab all experiment sets from the selected assays - may take a min
# Each assay name is percent-encoded before it goes into the query string:
# names such as 'CUT&RUN' contain '&', which would otherwise be read as a
# parameter separator and change which experiment sets are searched for.
experiment_sets_url = 'search/?type=ExperimentSetReplicate&experiments_in_set.experiment_type='
experiment_sets_url += '&experiments_in_set.experiment_type='.join(
    quote(assay, safe='') for assay in selected_assays
)
selected_expsets = ff_utils.search_metadata(experiment_sets_url, key=my_key)

In [9]:
# number of experiment sets returned by the search
len(selected_expsets)

269

In [10]:
# function for downloading a file using your credentials
def download_file(url, key, dest_folder=''):
    """Stream the file at `url` to disk and return the path it was written to.

    Args:
        url: Full URL of the file. The text after its last '/' becomes the
            local file name.
        key: Mapping with 'key' and 'secret' entries, sent as HTTP basic auth.
        dest_folder: Directory to write into. The default '' writes into the
            current working directory. A trailing path separator is optional.

    Returns:
        The local path of the downloaded file.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    # os.path.join adds the separator only when it is missing, so 'downloads'
    # and 'downloads/' both work; plain concatenation glued the folder name
    # onto the file name when the trailing separator was omitted.
    local_filename = os.path.join(dest_folder, url.split('/')[-1])
    # stream=True lets the body be written chunk by chunk instead of being
    # loaded in one piece
    with requests.get(url, auth=(key['key'], key['secret']), stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename

In [11]:
# iterate through biosources and assays to find the mcool processed files
def _first_biosource_title(exp):
    """Return the display title of an experiment's first biosource, or None.

    None is returned when the experiment has no 'biosample', or when its
    biosample has a missing or empty 'biosource' list. Such experiments then
    simply match no biosource instead of raising IndexError.
    """
    biosource_list = exp.get('biosample', {}).get('biosource') or []
    return biosource_list[0].get('display_title') if biosource_list else None


# Extract the biosource titles and assay titles of every experiment set once,
# rather than recomputing them for each biosource / assay combination.
expset_summaries = []
for expset in selected_expsets:
    experiments = expset.get('experiments_in_set', [])
    expset_summaries.append((
        expset,
        {_first_biosource_title(exp) for exp in experiments},
        {exp.get('experiment_type', {}).get('display_title') for exp in experiments},
    ))

# Rows are [biosource title, assay, experiment set accession, file href],
# ordered by biosource, then assay, then search-result order.
files_of_interest = []
for biosource in biosources:
    biosource_title = biosource['display_title']
    for assay in selected_assays:
        for expset, biosource_titles, assay_titles in expset_summaries:
            # keep experiment sets with the right biosource + right assay type
            if biosource_title not in biosource_titles or assay not in assay_titles:
                continue
            # find processed files of specified format
            for procfile in expset.get('processed_files', []):
                # change to correct format(s) if different filetype desired;
                # a file without a display title is treated as non-matching
                if (procfile.get('display_title') or '').endswith('.mcool'):
                    files_of_interest.append(
                        [biosource_title, assay, expset['accession'], procfile.get('href')]
                    )  # additional pieces of metadata could be added, if desired

print('\n'.join(['\t'.join(item) for item in files_of_interest]))

WTC-11 AAVS1-GFP C28 - 4DNSR8LE4JQF	in situ Hi-C	4DNESPDEZNWX	/files-processed/4DNFIYGPDLKF/@@download/4DNFIYGPDLKF.mcool
WTC-11 AAVS1-GFP C6 - 4DNSR2SDROUR	in situ Hi-C	4DNESJ7S5NDJ	/files-processed/4DNFIVKQADWM/@@download/4DNFIVKQADWM.mcool
induced pluripotent stem cell - 4DNSRI9P775Y	in situ Hi-C	4DNESYEY3SHT	/files-processed/4DNFI3XS7NGK/@@download/4DNFI3XS7NGK.mcool
induced pluripotent stem cell - 4DNSRI9P775Y	in situ Hi-C	4DNESQDM1ZJH	/files-processed/4DNFIDDBCJWJ/@@download/4DNFIDDBCJWJ.mcool
induced pluripotent stem cell - 4DNSRI9P775Y	in situ Hi-C	4DNESAIFV6CN	/files-processed/4DNFI93QV4DD/@@download/4DNFI93QV4DD.mcool
induced pluripotent stem cell - 4DNSRI9P775Y	in situ Hi-C	4DNESGOHU121	/files-processed/4DNFI3NG5PBI/@@download/4DNFI3NG5PBI.mcool
induced pluripotent stem cell - 4DNSRI9P775Y	in situ Hi-C	4DNESREEJFEH	/files-processed/4DNFIYN1QEJU/@@download/4DNFIYN1QEJU.mcool
induced pluripotent stem cell - 4DNSRI9P775Y	in situ Hi-C	4DNESWL6EBND	/files-processed/4DNFIW8DBZLN/@

In [13]:
# write a small tab-separated table (header + one line per file) recording which
# biosource, assay and experiment set each contact matrix came from
with open('metadata.tsv', 'w') as outfile:
    outfile.write('Biosource\tAssay Type\tExperimentSetAccession\tProcessedFileHref\n')
    outfile.writelines('\t'.join(row) + '\n' for row in files_of_interest)

In [None]:
# download every collected file into the current directory; the href is the
# last field of each row and is relative to the data portal host
for *_, href in files_of_interest:
    download_file('https://data.4dnucleome.org' + href, key=my_key, dest_folder='')