In [4]:

import os as os
import sys as sys
import collections as col
import pandas as pd
import numpy as np

# 2018-04-22
# What does this do?
# After all tissue-specific train and test datasets
# have been generated by the main pipeline, this
# notebook creates averaged datasets as requested
# by Christoph. These averaged datasets can be used
# to train generic classifiers (i.e., classifiers that
# do not just use the epigenetic signal of one cell type).

# Locations of the per-tissue feature files that are averaged below.
# Both roots live under the same base directory.
fhgfs_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species/processing/norm'
train_root = os.path.join(fhgfs_base, 'task_traindata_exp/train_datasets')
test_root = os.path.join(fhgfs_base, 'task_testdata_exp/test_datasets')

# Columns removed from every per-chromosome table before the
# epigenome part of the data is averaged.
drop_cols = [
    'score_body',
    'strand_body',
    'score_uprr',
    'strand_uprr',
    'score_reg5p',
    'strand_reg5p',
]

# Switches selecting which averaged datasets are (re-)built
# when the cell is executed.
create_traindata = False
create_testdata = True


def create_avg_traindata():
    """
    Write one averaged training dataset per folder below ``train_root``.

    Each folder that contains ``*.feat.h5`` files is handled separately.
    Files whose name starts with ``G99`` are ignored (the file written
    here starts with ``G9930``; presumably this keeps earlier averages
    from being read back in). The second and third ``_``-separated
    fields of a file name are treated as epigenome and transcriptome
    identifiers. Within a folder, each epigenome contributes its feature
    columns once and each transcriptome contributes its
    ``tpm_norm``/``rank_norm`` columns once. Per chromosome, both parts
    are averaged per (``name``, ``symbol``) pair and outer-merged.

    The output ``G9930_Ennn_Tnnn_<ref>_any.<suffix>`` is written into the
    same folder. Its metadata is taken from the last contributing file,
    with the group id replaced by ``G9930`` and ``srcfile`` set to the
    comma-joined names of all contributing files.

    Returns:
        True once all folders have been processed.

    Raises:
        ValueError: if a chromosome has epigenome data but no
            transcriptome data to merge with.
    """
    pairs = col.defaultdict(list)
    for root, _, filenames in os.walk(train_root):
        for fname in filenames:
            if fname.endswith('.feat.h5'):
                pairs[root].append(fname)

    trans_cols = ['name', 'symbol', 'tpm_norm', 'rank_norm']

    for folder, featfiles in pairs.items():
        epi_done = set()
        trans_done = set()
        md = None
        used_files = []
        epi_data = col.defaultdict(list)
        trans_data = col.defaultdict(list)
        for ff in featfiles:
            if ff.startswith('G99'):
                continue
            _, epi, trans, _ = ff.split('_', 3)
            # Decide once per file which parts are new. A flag shared by
            # both parts would pull in an already seen epigenome or
            # transcriptome again for all but the first chromosome and
            # thereby skew the averages.
            use_epi = epi not in epi_done
            use_trans = trans not in trans_done
            if not (use_epi or use_trans):
                continue
            fpath = os.path.join(folder, ff)
            with pd.HDFStore(fpath, 'r') as hdf:
                md = hdf['metadata']
                for k in hdf.keys():
                    if k.startswith('/metadata'):
                        continue
                    data = hdf[k]
                    chrom = os.path.split(k)[-1]
                    if use_epi:
                        # drop() returns a new frame, no extra copy needed
                        epi_data[chrom].append(data.drop(drop_cols, axis=1))
                    if use_trans:
                        trans_data[chrom].append(data.loc[:, trans_cols].copy())
            # Record each contributing file exactly once, not once per
            # chromosome, so that 'srcfile' has no repeated entries.
            used_files.append(ff)
            epi_done.add(epi)
            trans_done.add(trans)

        if md is None:
            # Only previously averaged (G99*) files in this folder
            print('Skipping ', os.path.split(folder)[-1], ' - no input feature files')
            continue

        print('Generating training data ', os.path.split(folder)[-1])
        # Fifth '/'-separated component of the group path is the group id
        group_id = md.loc[0, 'group'].split('/')[4]
        md['group'] = md['group'].str.replace(group_id, 'G9930')
        hdf_path = os.path.split(md.loc[0, 'group'])[0]
        md['srcfile'] = ','.join(used_files)
        prefix, suffix = featfiles[-1].split('.', 1)
        ref = prefix.split('_')[3]
        outfile = 'G9930_Ennn_Tnnn_' + ref + '_any.' + suffix
        print('File ', outfile)
        outpath = os.path.join(folder, outfile)
        with pd.HDFStore(outpath, 'w', complib='blosc', complevel=9) as hdf:
            hdf.put('metadata', md, format='table')
            for chrom, epigenomes in epi_data.items():
                transcriptomes = trans_data.get(chrom)
                if not transcriptomes:
                    raise ValueError('No transcriptome data for {} in {}'.format(chrom, folder))

                chrom_epi = pd.concat(epigenomes, axis=0, ignore_index=False)
                chrom_epi.drop(['tpm_norm', 'rank_norm'], axis=1, inplace=True)
                chrom_epi = chrom_epi.groupby(['name', 'symbol']).mean()
                # Restore the grouping keys as regular (trailing) columns
                chrom_epi['name'] = chrom_epi.index.get_level_values('name').values
                chrom_epi['symbol'] = chrom_epi.index.get_level_values('symbol').values
                chrom_epi.reset_index(drop=True, inplace=True)

                chrom_trans = pd.concat(transcriptomes, axis=0, ignore_index=False)
                chrom_trans.reset_index(drop=True, inplace=True)
                chrom_trans = chrom_trans.groupby(['name', 'symbol']).mean()
                chrom_trans['name'] = chrom_trans.index.get_level_values('name').values
                chrom_trans['symbol'] = chrom_trans.index.get_level_values('symbol').values
                # Without this reset 'name'/'symbol' are index levels AND
                # columns, which makes the merge keys ambiguous.
                chrom_trans.reset_index(drop=True, inplace=True)

                chrom_data = chrom_epi.merge(chrom_trans, on=['name', 'symbol'], how='outer')

                chrom_path = os.path.join(hdf_path, chrom)

                hdf.put(chrom_path, chrom_data, format='fixed')
        print('====')
    return True

def create_avg_testdata():
    """
    Write one averaged test dataset per folder below ``test_root``.

    Each folder that contains ``*.feat.h5`` files is handled separately.
    Files whose name starts with ``G99`` are ignored (the file written
    here starts with ``G9930``; presumably this keeps earlier averages
    from being read back in). The second and third ``_``-separated
    fields of a file name are treated as epigenome and transcriptome
    identifiers. Within a folder, each epigenome contributes its feature
    columns once and each transcriptome contributes its
    ``tpm_norm``/``rank_norm`` columns once. Per chromosome, both parts
    are averaged per (``name``, ``symbol``) pair and outer-merged.

    The output ``G9930_Ennn_Tnnn_<ref>_any.<suffix>`` is written into the
    same folder. Its metadata is taken from the last contributing file,
    with the group id replaced by ``G9930`` and ``srcfile`` set to the
    comma-joined names of all contributing files.

    Returns:
        True once all folders have been processed.

    Raises:
        ValueError: if a chromosome has epigenome data but no
            transcriptome data to merge with.
    """
    pairs = col.defaultdict(list)
    for root, _, filenames in os.walk(test_root):
        for fname in filenames:
            if fname.endswith('.feat.h5'):
                pairs[root].append(fname)

    trans_cols = ['name', 'symbol', 'tpm_norm', 'rank_norm']

    for folder, featfiles in pairs.items():
        epi_done = set()
        trans_done = set()
        md = None
        used_files = []
        epi_data = col.defaultdict(list)
        trans_data = col.defaultdict(list)
        for ff in featfiles:
            if ff.startswith('G99'):
                continue
            _, epi, trans, _ = ff.split('_', 3)
            # Decide once per file which parts are new. A flag shared by
            # both parts would pull in an already seen epigenome or
            # transcriptome again for all but the first chromosome and
            # thereby skew the averages.
            use_epi = epi not in epi_done
            use_trans = trans not in trans_done
            if not (use_epi or use_trans):
                continue
            fpath = os.path.join(folder, ff)
            with pd.HDFStore(fpath, 'r') as hdf:
                md = hdf['metadata']
                for k in hdf.keys():
                    if k.startswith('/metadata'):
                        continue
                    data = hdf[k]
                    chrom = os.path.split(k)[-1]
                    if use_epi:
                        # drop() returns a new frame, no extra copy needed
                        epi_data[chrom].append(data.drop(drop_cols, axis=1))
                    if use_trans:
                        trans_data[chrom].append(data.loc[:, trans_cols].copy())
            # Record each contributing file exactly once, not once per
            # chromosome, so that 'srcfile' has no repeated entries.
            used_files.append(ff)
            epi_done.add(epi)
            trans_done.add(trans)

        if md is None:
            # Only previously averaged (G99*) files in this folder
            print('Skipping ', os.path.split(folder)[-1], ' - no input feature files')
            continue

        # Fifth '/'-separated component of the group path is the group id
        group_id = md.loc[0, 'group'].split('/')[4]
        md['group'] = md['group'].str.replace(group_id, 'G9930')
        hdf_path = os.path.split(md.loc[0, 'group'])[0]
        md['srcfile'] = ','.join(used_files)
        prefix, suffix = featfiles[-1].split('.', 1)
        ref = prefix.split('_')[3]
        outfile = 'G9930_Ennn_Tnnn_' + ref + '_any.' + suffix
        outpath = os.path.join(folder, outfile)

        print('Writing ', outfile)
        with pd.HDFStore(outpath, 'w', complib='blosc', complevel=9) as hdf:
            hdf.put('metadata', md, format='table')
            for chrom, epigenomes in epi_data.items():
                transcriptomes = trans_data.get(chrom)
                if not transcriptomes:
                    raise ValueError('No transcriptome data for {} in {}'.format(chrom, folder))

                chrom_epi = pd.concat(epigenomes, axis=0, ignore_index=False)
                chrom_epi.drop(['tpm_norm', 'rank_norm'], axis=1, inplace=True)
                chrom_epi = chrom_epi.groupby(['name', 'symbol']).mean()
                # Restore the grouping keys as regular (trailing) columns
                chrom_epi['name'] = chrom_epi.index.get_level_values('name').values
                chrom_epi['symbol'] = chrom_epi.index.get_level_values('symbol').values
                chrom_epi.reset_index(drop=True, inplace=True)

                chrom_trans = pd.concat(transcriptomes, axis=0, ignore_index=False)
                chrom_trans.reset_index(drop=True, inplace=True)
                chrom_trans = chrom_trans.groupby(['name', 'symbol']).mean()
                chrom_trans['name'] = chrom_trans.index.get_level_values('name').values
                chrom_trans['symbol'] = chrom_trans.index.get_level_values('symbol').values
                # Keys must be plain columns only, otherwise the merge
                # below sees them as both index level and column.
                chrom_trans.reset_index(drop=True, inplace=True)

                chrom_data = chrom_epi.merge(chrom_trans, on=['name', 'symbol'], how='outer')
                chrom_path = os.path.join(hdf_path, chrom)

                hdf.put(chrom_path, chrom_data, format='fixed')
    return True

# Build only the averaged datasets that are switched on via the
# module-level flags. Both functions return True; the value is
# bound to `_` and not used any further.
if create_traindata:
    _ = create_avg_traindata()

# The test data build is independent of the training data build.
if create_testdata:
    _ = create_avg_testdata()


Writing  G9930_Ennn_Tnnn_oviAri3_any.from.hg19.feat.h5




Writing  G9930_Ennn_Tnnn_bosTau7_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_equCab2_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_rn5_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_galGal3_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_hg19_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_rheMac2_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_rn5_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_oryCun2_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_rheMac2_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_oryCun2_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_canFam3_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_susScr2_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_mm9_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_galGal3_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_bosTau7_any.from.mm9.feat.h5
Writing  G9930_Ennn_Tnnn_susScr2_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_equCab2_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_felCat5_any.from.hg19.feat.h5
Writing  G9930_Ennn_Tnnn_monDom5_