In [6]:
import os as os
from urllib.parse import urljoin

from bs4 import BeautifulSoup as bsoup
import pandas as pd
import requests as req

# Root URL of the remote data portal; every remote location below is
# derived from it.
remc_data = 'http://egg2.wustl.edu/roadmap/data'

# URLs always use '/' as separator, so they are assembled by plain string
# concatenation. os.path.join is meant for local file system paths and would
# insert the platform-specific separator instead.
npeak_url = remc_data + '/byFileType/peaks/consolidated/narrowPeak'
bpeak_url = remc_data + '/byFileType/peaks/consolidated/broadPeak'
hotspot_url = remc_data + '/byFileType/peaks/consolidated/broadPeak/DNase'
seg_url = remc_data + '/byFileType/chromhmmSegmentations/ChmmModels/core_K27ac/jointModel/final'
exp_url = remc_data + '/byDataType/rna/expression'

# Base directory of the code repository; the annotation cache lives below it.
repo_base = '/home/pebert/work/code/mpggit/statediff'
# Base directory of the project; downloaded input data is stored below it.
project_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff'

# Root folder for all files fetched from the remote portal.
dl_dest = os.path.join(project_base, 'loaded_input/remc')

# HDF key of a cached file table -> sub-folder of dl_dest that receives
# the files listed in that table.
dl_subdirs = {
    '/css': 'state_seg',
    '/dnase_narrow': 'dnase_peaks',
    '/hist_narrow': 'hist_peaks',
    '/hist_broad': 'hist_peaks',
}

# HDF key of a cached file table -> suffix appended to the local file name.
dl_ext = {
    '/css': 'segments.bed',
    '/dnase_narrow': 'macs2.npeak.gz',
    '/hist_narrow': 'macs2.npeak.gz',
    '/hist_broad': 'macs2.bpeak.gz',
}

# HDF5 file holding the sample table and the per-data-type file tables.
cache_file = os.path.join(repo_base, 'annotation', 'remc_cache.h5')

# Library names accepted by search_peak_ids when it is called with
# assay == 'histone'.
histone_libs = [
    'H3K4me1',
    'H3K4me3',
    'H3K36me3',
    'H3K27me3',
    'H3K9me3',
    'H3K27ac',
    'Input',
]

# (EID, reason) pairs; these sample IDs are removed from the set of shared
# samples regardless of data availability.
remove_ids = [
    ('E104', 'bad H3K36me3'),
    ('E098', 'weak H3K27me3'),
    ('E113', 'weak H3K27me3'),
    ('E114', 'low quality rating'),
]

def search_expression_eids():
    """Load the table of sample IDs listed in the remote expression folder.

    Reads ``EG.name.txt`` below ``remc_data`` as a tab-separated table and
    labels its two columns ``EID`` and ``desc``.

    Returns:
        pandas.DataFrame: columns ``EID`` and ``desc``; rows with any
        missing value are dropped.
    """
    # Join with '/' explicitly: os.path.join is for local paths and would
    # use the platform separator when building this URL.
    url = remc_data + '/byDataType/rna/expression/EG.name.txt'
    # NOTE(review): header=0 combined with names= discards the first line of
    # the file as a header row - confirm that EG.name.txt actually has one.
    df = pd.read_csv(url, sep='\t', header=0, names=['EID', 'desc'])
    return df.dropna(axis=0, how='any')

def search_peak_ids(base_url, file_ext, assay):
    """Collect the peak files linked on a listing page for one assay.

    The page at ``base_url`` is fetched and every link target that ends with
    ``file_ext`` is parsed as ``<EID>-<library>.<rest>``. A file is kept when
    its library equals ``assay``, or when ``assay`` is ``'histone'`` and the
    library is one of ``histone_libs``.

    Args:
        base_url: URL of the listing page to scan.
        file_ext: Suffix a link target must have to be considered.
        assay: Library name to match exactly (e.g. ``'DNase'``), or
            ``'histone'`` to accept all libraries in ``histone_libs``.

    Returns:
        pandas.DataFrame: one row per accepted file with the columns
        ``EID``, ``library``, ``file_ext`` and ``file_url``.

    Raises:
        requests.HTTPError: if the listing request returns an error status.
    """
    resp = req.get(base_url)
    # Fail loudly instead of parsing an error page, which would silently
    # produce an empty table.
    resp.raise_for_status()
    soup = bsoup(resp.text, 'lxml')
    # urljoin only appends to the base when it ends with a slash
    listing_url = base_url.rstrip('/') + '/'
    collector = []
    for link in soup.find_all('a'):
        item = link.get('href')
        # anchors without an href attribute yield None
        if not item or not item.endswith(file_ext):
            continue
        eid, dash, remain = item.partition('-')
        if not dash:
            # not of the form <EID>-<library>...; nothing to parse
            continue
        lib = remain.split('.', 1)[0]
        if lib == assay or (assay == 'histone' and lib in histone_libs):
            collector.append([eid, lib, file_ext, urljoin(listing_url, item)])
    return pd.DataFrame(collector, columns=['EID', 'library', 'file_ext', 'file_url'])

def search_segment_ids(base_url, file_ext):
    """Collect the segmentation files linked on a listing page.

    The page at ``base_url`` is fetched and every link target that ends with
    ``file_ext`` is recorded; the sample ID is the part of the file name
    before the first underscore.

    Args:
        base_url: URL of the listing page to scan.
        file_ext: Suffix a link target must have to be considered.

    Returns:
        pandas.DataFrame: one row per file with the columns ``EID``,
        ``library`` (always ``'18state'``), ``file_ext`` and ``file_url``.

    Raises:
        requests.HTTPError: if the listing request returns an error status.
    """
    resp = req.get(base_url)
    # Fail loudly instead of parsing an error page, which would silently
    # produce an empty table.
    resp.raise_for_status()
    soup = bsoup(resp.text, 'lxml')
    # urljoin only appends to the base when it ends with a slash
    listing_url = base_url.rstrip('/') + '/'
    collector = []
    for link in soup.find_all('a'):
        item = link.get('href')
        # anchors without an href attribute yield None
        if not item or not item.endswith(file_ext):
            continue
        eid, underscore, _ = item.partition('_')
        if not underscore:
            # not of the form <EID>_<rest>; no sample ID to extract
            continue
        collector.append([eid, '18state', file_ext, urljoin(listing_url, item)])
    return pd.DataFrame(collector, columns=['EID', 'library', 'file_ext', 'file_url'])

def identify_shared_eids(cache_path):
    """Determine the samples present in all data types and cache them.

    Queries the expression sample table, the histone and DNase narrow peak
    listings, the histone broad peak listing and the segmentation listing.
    The sample IDs common to all of them, minus those named in
    ``remove_ids``, define the shared set. Each table is reduced to that set
    and written to a new HDF5 file at ``cache_path`` (an existing file is
    overwritten) under the keys ``samples``, ``hist_narrow``, ``hist_broad``,
    ``dnase_narrow`` and ``css``.

    Args:
        cache_path: Path of the HDF5 cache file to create.
    """
    expression = search_expression_eids()
    hist_npeak = search_peak_ids(npeak_url, 'narrowPeak.gz', 'histone')
    dnase_npeak = search_peak_ids(npeak_url, 'narrowPeak.gz', 'DNase')
    hist_bpeak = search_peak_ids(bpeak_url, 'broadPeak.gz', 'histone')
    # DNase hotspots (hotspot_url, 'fdr0.01.broad.bed.gz') are deliberately
    # not queried: for some reason they limit the selection to 19 samples.
    css = search_segment_ids(seg_url, 'segments.bed')

    # (HDF key, table) pairs in the order they are written to the cache
    tables = [
        ('samples', expression),
        ('hist_narrow', hist_npeak),
        ('hist_broad', hist_bpeak),
        ('dnase_narrow', dnase_npeak),
        ('css', css),
    ]

    # a sample is shared only if every table lists it
    shared_eids = set.intersection(*(set(tab['EID'].tolist()) for _, tab in tables))
    shared_eids -= {eid for eid, _ in remove_ids}

    with pd.HDFStore(cache_path, 'w', complib='blosc', complevel=9) as hdf:
        for key, tab in tables:
            keep = tab['EID'].isin(shared_eids)
            hdf.put(key, tab.loc[keep, :], format='table')
    return

def download_remc_data(cache):
    """Download every file listed in the cache that is not yet on disk.

    Each table in the HDF5 file ``cache`` except ``/samples`` and
    ``/load_log`` is read; its files are stored below ``dl_dest`` in the
    sub-folder given by ``dl_subdirs`` and named
    ``<EID>_<library>_<extension>`` with the extension from ``dl_ext``.
    Files that already exist locally are not fetched again. Afterwards the
    source/target mapping of all listed files (fetched or not) is written to
    the ``load_log`` table of the cache.

    Args:
        cache: Path of the HDF5 cache file.

    Raises:
        KeyError: if the cache holds a table with no entry in
            ``dl_subdirs`` / ``dl_ext``.
        requests.HTTPError: if a download returns an error status; the
            ``load_log`` table is not updated in that case.
    """
    load_log = []
    with pd.HDFStore(cache, 'r') as hdf:
        for k in hdf.keys():
            if k in ('/load_log', '/samples'):
                continue
            subdir = dl_subdirs[k]
            file_ext = dl_ext[k]
            target_dir = os.path.join(dl_dest, subdir)
            os.makedirs(target_dir, exist_ok=True)
            for row in hdf[k].itertuples():
                src_url = row.file_url
                trg_file = '_'.join([row.EID, row.library, file_ext])
                trg_path = os.path.join(target_dir, trg_file)
                load_log.append([row.EID, row.library, src_url, trg_file])
                if os.path.isfile(trg_path):
                    continue
                resp = req.get(src_url)
                # Never store an error page as data: an existing file is
                # skipped on every later run and would not be repaired.
                resp.raise_for_status()
                # Download to a temporary name and rename when complete so
                # that an interrupted run cannot leave a truncated file that
                # looks finished.
                part_path = trg_path + '.part'
                # Always write the raw payload; a detour through decoded
                # text could alter the bytes of the segmentation files.
                with open(part_path, 'wb') as dump:
                    dump.write(resp.content)
                os.replace(part_path, trg_path)
    df_log = pd.DataFrame(load_log, columns=['EID', 'library', 'source', 'target'])
    with pd.HDFStore(cache, 'a') as hdf:
        hdf.put('load_log', df_log, format='table')
    return


def download_remc_expression():
    """Download the gzipped expression files that are not yet on disk.

    The listing page at ``exp_url`` is scanned for links ending in ``.gz``;
    links ending in ``rb.gz`` are skipped. Each remaining file is stored
    under its own name in the ``rna_data`` sub-folder of ``dl_dest`` unless
    a file of that name already exists there.

    Raises:
        requests.HTTPError: if the listing or a download returns an error
            status.
    """
    listing = req.get(exp_url)
    # Fail loudly instead of parsing an error page as an empty listing.
    listing.raise_for_status()
    soup = bsoup(listing.text, 'lxml')
    dl_folder = os.path.join(dl_dest, 'rna_data')
    # The folder is not created anywhere else; without this the first
    # open() below fails on a fresh setup.
    os.makedirs(dl_folder, exist_ok=True)
    # urljoin only appends to the base when it ends with a slash
    listing_url = exp_url.rstrip('/') + '/'
    for link in soup.find_all('a'):
        item = link.get('href')
        # anchors without an href attribute yield None
        if not item or not item.endswith('.gz'):
            continue
        if item.endswith('rb.gz'):
            continue  # ignore ribosomal genes
        # Use only the last path component as local name so that a link
        # target containing slashes cannot write outside of dl_folder.
        trg_file = item.rsplit('/', 1)[-1]
        trg_path = os.path.join(dl_folder, trg_file)
        if os.path.isfile(trg_path):
            continue
        resp = req.get(urljoin(listing_url, item))
        # Never store an error page as data: an existing file is skipped
        # on every later run and would not be repaired.
        resp.raise_for_status()
        # Rename only after the download is complete so that an interrupted
        # run cannot leave a truncated file that looks finished.
        part_path = trg_path + '.part'
        with open(part_path, 'wb') as dump:
            dump.write(resp.content)
        os.replace(part_path, trg_path)
    return

def load_remc_data():
    """Run the complete download workflow.

    The sample cache at ``cache_file`` is built first if that file does not
    exist yet. Then the files listed in the cache are downloaded, followed
    by the expression files.

    Returns:
        bool: always ``True``.
    """
    cache_missing = not os.path.isfile(cache_file)
    if cache_missing:
        identify_shared_eids(cache_file)
    download_remc_data(cache_file)
    download_remc_expression()
    return True
    
    
        
# Run the complete workflow (build the cache if it is missing, then download
# the listed files and the expression files) whenever this code is executed.
load_remc_data()

True