In [13]:

import os as os

import pandas as pd

# Root folder of the normalized cross-species processing output.
project_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species/processing/norm'
# Folder holding one sub-folder of test data files per species pairing.
dir_testdata = os.path.join(project_base, 'task_testdata_exp/compfeat_groups')

# Output folder for the generated supplementary tables.
supplement = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/crossspecies/supplement/supp_tables'

# Biosample labels that are collapsed to the common label 'esc'.
norm_samples = dict.fromkeys(('ESE14', 'H1hESC'), 'esc')
# Region labels as found in file names -> region labels used in the output.
norm_region = dict(reg5p='promoter', body='body')

# Columns extracted from promoter (reg5p) tables.
extract_reg5p = [
    'name',
    'symbol',
    'start',
    'end',
    'ftmsig_H3K27ac_abs_mean',
    'ftmsig_H3K4me3_abs_mean',
]

# Columns extracted from gene body tables.
extract_body = [
    'name',
    'symbol',
    'start',
    'end',
    'ftmsig_H3K36me3_abs_mean',
]

# Column selection per (normalized) region type.
select_cols = dict(body=extract_body, promoter=extract_reg5p)

# For each reference assembly: rank of every query assembly, used to order
# the query species in the output (rank = position in the tuple below).
query_sort_order = {
    reference: {query: rank for rank, query in enumerate(queries)}
    for reference, queries in (
        ('hg19', ('rheMac2', 'mm9', 'oryCun2', 'rn5',
                  'felCat5', 'bosTau7', 'susScr2', 'canFam3',
                  'equCab2', 'oviAri3', 'monDom5', 'galGal3')),
        ('mm9', ('rn5', 'oryCun2', 'hg19', 'rheMac2',
                 'bosTau7', 'susScr2', 'equCab2', 'galGal3')),
    )
}


def dump_transfer_signal(data_collector, ref_species):
    """Concatenate the collected per-query tables and write them as one TSV.

    The tables in ``data_collector`` are stacked row-wise. Missing values
    are replaced by the string 'no_transcriptome_available'. The fixed
    identifier/coordinate columns come first, all remaining columns follow
    in sorted order. The table is written to the ``supplement`` folder as
    'Additional-file-3_signal_<ref>-to-any.tsv' for hg19 and
    'Additional-file-4_signal_<ref>-to-any.tsv' for any other reference.

    Args:
        data_collector: list of DataFrames, one per query species.
        ref_species: name of the reference assembly, used in the file name.

    Returns:
        An empty list, so the caller can reset its collector.
    """
    leading = ['assembly', 'name', 'symbol', 'chrom',
               'start_promoter', 'end_promoter', 'start_body', 'end_body']

    table = pd.concat(data_collector, ignore_index=False, axis=0, sort=False)
    table = table.reset_index(drop=True)

    has_missing = table.isnull().any(axis=1).any()
    if has_missing:
        # for species combinations where there is no RNA
        # test data available, no signal should have been
        # transferred (apparently, some error in the pipeline
        # code for hg19; for mm9, this is indeed the case)
        table.fillna('no_transcriptome_available', inplace=True)

    # every column not in the fixed leading set, in sorted order
    trailing = sorted(set(table.columns).difference(leading))
    table = table[leading + trailing]

    prefix = ('Additional-file-3_signal_' if ref_species == 'hg19'
              else 'Additional-file-4_signal_')
    file_name = prefix + '{}-to-any.tsv'.format(ref_species)
    out_path = os.path.join(supplement, file_name)

    print('Writing...')
    table.to_csv(out_path, sep='\t', header=True, index=False)
    return []



def _rename_signal_columns(columns, eid, biosample, regtype):
    """Return the output column names for one per-chromosome signal table.

    Columns starting with 'ftmsig' get that prefix replaced by
    '<eid>_<biosample>' and 'abs_mean' replaced by 'sig_<regtype>';
    'start' and 'end' get '_<regtype>' appended; all other names are kept.
    """
    signal_prefix = '{}_{}'.format(eid, biosample)
    signal_suffix = 'sig_{}'.format(regtype)
    renamed = []
    for col in columns:
        if col.startswith('ftmsig'):
            col = col.replace('ftmsig', signal_prefix)
            col = col.replace('abs_mean', signal_suffix)
        elif col in ('start', 'end'):
            col = col + '_{}'.format(regtype)
        renamed.append(col)
    return renamed


def _load_signal_file(fpath, col_subset, assembly, eid, biosample, regtype):
    """Read all non-metadata groups of one HDF file into a single table.

    Every key not starting with '/metadata' is loaded, reduced to
    ``col_subset``, annotated with 'chrom' (last component of the key) and
    'assembly' columns, and renamed via ``_rename_signal_columns``.
    """
    per_chrom = []
    with pd.HDFStore(fpath, 'r') as hdf:
        for key in hdf.keys():
            if key.startswith('/metadata'):
                continue
            # explicit copy: new columns are assigned below and must not
            # be written to a view of the stored table
            data = hdf[key].loc[:, col_subset].copy()
            data['chrom'] = key.split('/')[-1]
            data['assembly'] = assembly
            data.columns = _rename_signal_columns(data.columns, eid,
                                                  biosample, regtype)
            per_chrom.append(data)
    return pd.concat(per_chrom, axis=0, ignore_index=False)


def collect_testdata(root_path):
    """Collect the transferred signal per reference species and dump it.

    Sub-folders of ``root_path`` are expected to be named with three
    '_'-separated parts, the first being the query and the last the target
    (reference) assembly. They are processed grouped by target and, within
    a target, in the order given by ``query_sort_order``. File names inside
    a sub-folder are parsed as '<x>_<eid>_<qry>_<biosample>' before the
    first '.', with the region type as the second-to-last '.'-separated
    part.

    For each sub-folder, one table per (eid, region type) is loaded and all
    tables are merged on their shared columns. Whenever the target changes,
    and once at the end, the collected tables are handed to
    ``dump_transfer_signal``.

    Args:
        root_path: folder containing the per-pairing sub-folders; both the
            folder listing and the data files are read from here.

    Raises:
        ValueError: if a loaded or merged table contains missing values.
    """
    subfolder = []
    for s in sorted(os.listdir(root_path)):
        qry, _, trg = s.split('_')
        subfolder.append((trg, query_sort_order[trg][qry], s))
    subfolder.sort()

    if not subfolder:
        # nothing to collect; dumping an empty collection would fail
        print('No test data folders found in {}'.format(root_path))
        return

    target = None
    trg_collect = []
    for trg, _, sub in subfolder:
        if target is not None and trg != target:
            print('Dumping data for reference ', target)
            trg_collect = dump_transfer_signal(trg_collect, target)
        target = trg
        # read from the folder that was listed above (root_path), not from
        # a module-level path, so the function works for any root
        sub_path = os.path.join(root_path, sub)
        qry_collect = None
        qry_done = set()
        # sorted: os.listdir order is arbitrary, but it decides which file
        # is used when several share the same (eid, regtype)
        for ff in sorted(os.listdir(sub_path)):
            _, eid, file_qry, biosample = ff.split('.')[0].split('_')
            biosample = norm_samples.get(biosample, biosample)
            if biosample == 'esc':
                if not (trg in ['hg19', 'mm9'] and file_qry in ['hg19', 'mm9']):
                    # this catches the erroneous ESC transfer for hg19
                    continue
            regtype = ff.split('.')[-2]
            regtype = norm_region.get(regtype, regtype)
            if regtype == 'uprr' or (eid, regtype) in qry_done:
                continue
            qry_done.add((eid, regtype))
            file_collect = _load_signal_file(os.path.join(sub_path, ff),
                                             select_cols[regtype], file_qry,
                                             eid, biosample, regtype)
            if pd.isnull(file_collect).any(axis=0).any():
                raise ValueError('File merge failed {}/{}'.format(sub, ff))
            if qry_collect is None:
                qry_collect = file_collect.copy()
            else:
                shared_keys = set(qry_collect.columns).intersection(set(file_collect.columns))
                qry_collect = qry_collect.merge(file_collect, on=sorted(shared_keys), suffixes=('', ''))
                # NOTE(review): merge uses the default inner join, which
                # drops non-matching rows instead of introducing nulls, so
                # this check presumably cannot detect a mismatch - consider
                # comparing row counts; behavior left unchanged here
                if pd.isnull(qry_collect).any(axis=0).any():
                    raise ValueError('Query merge failed {}/{}'.format(sub, ff))
        if qry_collect is not None:
            # sub-folders without any usable file contribute nothing
            trg_collect.append(qry_collect)
    print('Dumping data for reference ', target)
    trg_collect = dump_transfer_signal(trg_collect, target)
    print('Done')
    return
             
            
# Run the collection over the test data folder; writes one supplementary
# TSV table per reference species (see dump_transfer_signal).
collect_testdata(dir_testdata)
            
            
        
        
        

Dumping data for reference  hg19
Writing...
Dumping data for reference  mm9
Writing...
Done
