In [16]:

import os as os

import pandas as pd

# Input location: the per-group test data lives below the project base directory.
project_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species/processing/norm'
dir_testdata = os.path.join(project_base, 'task_testdata_exp/compfeat_groups')

# Output location for the generated supplementary tables.
supplement = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/crossspecies/supplement/supp_tables'

# Lookup tables translating labels found in file names into the labels used
# in the output; labels without an entry are passed through unchanged by the
# code that consults these tables.
norm_samples = dict.fromkeys(('ESE14', 'H1hESC'), 'esc')
norm_region = dict(reg5p='promoter', body='body')

# Annotation columns extracted for every region type.
_gene_cols = ('name', 'symbol', 'start', 'end')

# Columns to extract per region type: shared annotation columns followed by
# the region-specific signal columns.
extract_reg5p = list(_gene_cols) + [
    'ftmsig_H3K27ac_abs_mean',
    'ftmsig_H3K4me3_abs_mean',
]

extract_body = list(_gene_cols) + [
    'ftmsig_H3K36me3_abs_mean',
]

# Keyed by the normalized region label (see norm_region).
select_cols = dict(body=extract_body, promoter=extract_reg5p)

def _split_data_filename(filename):
    """Parse a data file name into (eid, assembly, biosample, regtype).

    The part before the first dot must consist of exactly four
    underscore-separated fields (the first one is ignored); the region type
    is the second-to-last dot-separated field. Biosample and region labels
    are translated via ``norm_samples`` / ``norm_region`` where an entry
    exists.
    """
    dot_parts = filename.split('.')
    _, eid, assembly, biosample = dot_parts[0].split('_')
    biosample = norm_samples.get(biosample, biosample)
    regtype = norm_region.get(dot_parts[-2], dot_parts[-2])
    return eid, assembly, biosample, regtype


def _rename_columns(columns, eid, biosample, regtype):
    """Return output column names: signal and coordinate columns get tagged."""
    renamed = []
    for col in columns:
        if col.startswith('ftmsig'):
            # e.g. ftmsig_<mark>_abs_mean -> <eid>_<biosample>_<mark>_sig_<regtype>
            col = col.replace('ftmsig', '{}_{}'.format(eid, biosample))
            col = col.replace('abs_mean', 'sig_{}'.format(regtype))
        elif col in ('start', 'end'):
            # coordinates differ per region type, so keep them apart
            col = '{}_{}'.format(col, regtype)
        renamed.append(col)
    return renamed


def _load_region_file(fpath, eid, assembly, biosample, regtype):
    """Read all non-metadata groups of one HDF file into a single DataFrame."""
    col_subset = select_cols[regtype]
    chrom_frames = []
    with pd.HDFStore(fpath, 'r') as hdf:
        for key in hdf.keys():
            if key.startswith('/metadata'):
                continue
            # explicit copy: columns are added below
            data = hdf[key].loc[:, col_subset].copy()
            # the last path component of the key is used as chromosome name
            data['chrom'] = key.split('/')[-1]
            data['assembly'] = assembly
            data.columns = _rename_columns(data.columns, eid, biosample, regtype)
            chrom_frames.append(data)
    if not chrom_frames:
        raise ValueError('No data groups found in file {}'.format(fpath))
    return pd.concat(chrom_frames, axis=0, ignore_index=False)


def _collect_query(root_path, sub):
    """Merge all region files of one subfolder into one DataFrame.

    Files with region type 'uprr' are skipped, and only the first file (in
    sorted name order) per (eid, regtype) combination is used. Returns None
    if the subfolder contains no usable file.
    """
    sub_path = os.path.join(root_path, sub)
    qry_collect = None
    qry_done = set()
    # sorted: makes the choice among duplicate (eid, regtype) files and the
    # resulting column order independent of the file system's listing order
    for ff in sorted(os.listdir(sub_path)):
        eid, assembly, biosample, regtype = _split_data_filename(ff)
        if regtype == 'uprr' or (eid, regtype) in qry_done:
            continue
        qry_done.add((eid, regtype))
        file_collect = _load_region_file(os.path.join(sub_path, ff),
                                         eid, assembly, biosample, regtype)
        if file_collect.isnull().any(axis=0).any():
            raise ValueError('File merge failed {}/{}'.format(sub, ff))
        if qry_collect is None:
            qry_collect = file_collect
            continue
        # join on every column both tables have in common
        shared_keys = sorted(set(qry_collect.columns).intersection(file_collect.columns))
        qry_collect = qry_collect.merge(file_collect, on=shared_keys, suffixes=('', ''))
        if qry_collect.isnull().any(axis=0).any():
            raise ValueError('Query merge failed {}/{}'.format(sub, ff))
    return qry_collect


def _dump_target(target, frames):
    """Concatenate the per-query tables of one target and write them as TSV."""
    print('Dumping data for reference ', target)
    # subfolders without usable files contribute None
    frames = [frame for frame in frames if frame is not None]
    if not frames:
        raise ValueError('No data collected for reference {}'.format(target))
    merged = pd.concat(frames, axis=0, ignore_index=False)
    na_rows = merged.isnull().any(axis=1)
    if na_rows.any():
        print(merged.loc[na_rows, :].head())
        raise ValueError('Target merge produced NA')
    prefix = 'Add-file-3_signal_' if target == 'hg19' else 'Add-file-4_signal_'
    out_path = os.path.join(supplement, prefix + '{}-to-any.tsv.bz2'.format(target))
    print('Writing...')
    merged.to_csv(out_path, sep='\t', header=True, index=False,
                  compression='bz2')


def collect_testdata(root_path):
    """Collect signal test data below ``root_path`` and write one table per target.

    Every entry of ``root_path`` must be a folder whose name consists of
    three underscore-separated fields; the third field is the target
    (reference) assembly. All folders sharing a target are combined and
    written as a bz2-compressed, tab-separated file into ``supplement``
    (module-level setting); column selection and label normalization are
    likewise taken from module-level settings.

    Unlike earlier revisions, files are read from ``root_path`` itself rather
    than from the module-level ``dir_testdata``, and the NA check is applied
    to every target, including the last one.

    Raises:
        ValueError: if ``root_path`` is empty, if a target has no data, or if
            any load/merge/concatenation step yields missing values.
    """
    # group folders by target; the folder name is a tie-breaker to make the
    # processing order reproducible
    subfolders = sorted(os.listdir(root_path),
                        key=lambda name: (name.split('_')[2], name))
    if not subfolders:
        raise ValueError('No subfolders found in {}'.format(root_path))
    target = None
    trg_collect = []
    for sub in subfolders:
        _, _, trg = sub.split('_')
        if target is not None and trg != target:
            # target changed: everything gathered so far belongs to the
            # previous target and can be written out
            _dump_target(target, trg_collect)
            trg_collect = []
        target = trg
        trg_collect.append(_collect_query(root_path, sub))
    _dump_target(target, trg_collect)
    print('Done')
    return
             
            
# Run the collection over the configured test-data directory; the progress
# messages printed by collect_testdata appear as cell output.
collect_testdata(dir_testdata)
            
            
        
        
        

Dumping data for reference  hg19
Writing...
Dumping data for reference  mm9
Writing...
Done
