In [11]:

import os as os
import collections as collect

import numpy as np
import pandas as pd

# Base directory from which the input and output locations below are derived.
project_root = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species/processing/norm'

# Input: directory tree that cache_aln_ranks() walks for feature files.
feat_folder = os.path.join(
    project_root,
    'task_testdata_exp',
    'compfeat_groups',
)

# Output: directory in which cache_aln_ranks() places its HDF5 cache file.
out_folder = os.path.join(
    project_root,
    'caching',
    'aln_ranking',
)

def load_aln_info(fpath, aln_select, aln_col):
    """Collect one column from every data table of an HDF5 file.

    Every key of the store at ``fpath`` except ``/metadata`` is read. From
    each table only the columns ``name`` and ``aln_select`` are kept, and the
    latter is renamed to ``aln_col``.

    Args:
        fpath: path of the HDF5 file, opened read-only.
        aln_select: name of the column to extract from each table.
        aln_col: name the extracted column carries in the result.

    Returns:
        A DataFrame with the columns ``name`` and ``aln_col`` holding the rows
        of all tables stacked on top of each other; the row labels of the
        individual tables are kept as they are (no re-indexing).
    """
    wanted = ['name', aln_select]
    renamed = ['name', aln_col]
    frames = []
    with pd.HDFStore(fpath, 'r') as store:
        table_keys = [key for key in store.keys() if key != '/metadata']
        for key in table_keys:
            subset = store[key].loc[:, wanted]
            subset.columns = renamed
            frames.append(subset)
    return pd.concat(frames, axis=0, ignore_index=False)


def cache_aln_ranks():
    """Build the HDF5 cache of per-gene alignment scores, ranks and levels.

    Walks ``feat_folder``; every directory that contains files is treated as
    one comparison whose directory name splits on ``_`` into exactly three
    parts, the first being the query and the last the target. For each
    comparison the ``.body.`` and ``.reg5p.`` HDF5 files are read with
    ``load_aln_info`` and outer-merged on ``name``. Three columns are added:

    * ``aln_score`` -- copy of ``body_cons``
    * ``aln_rank``  -- ascending percentile rank of ``aln_score``
    * ``aln_level`` -- zero-based index of the bin (edges 0, 5, ..., 95, 101)
      that contains ``aln_score``

    Each table is stored under the key ``<target>/<query>/aln_ranks``.

    If the cache file already exists, a message is printed and nothing else
    happens. Otherwise the cache is assembled in a temporary file next to the
    final location and renamed only after every table has been written, so an
    interrupted run cannot leave a partial cache that later calls would
    mistake for a finished one.

    Raises:
        IndexError: a comparison directory has no matching body or reg5p file.
        ValueError: a directory name does not split into exactly three parts.
    """
    cache_file = os.path.join(out_folder, '20180618_gene-aln_ranks.h5')
    if os.path.isfile(cache_file):
        print('Nothing to do')
        return
    # Leftovers of an aborted run are truncated by the first write (mode 'w').
    tmp_file = cache_file + '.tmp'
    done = set()
    # 101 here since right=False: the last edge must lie above a score of 100
    bins = np.array(list(range(0, 100, 5)) + [101], dtype=np.float16)
    filemode = 'w'
    for root, _, files in os.walk(feat_folder):
        if not files:
            continue
        query, _, target = os.path.basename(root).split('_')
        if (target, query) in done:
            continue
        body_file = [f for f in files if '.body.' in f and f.endswith('.h5')][0]
        body_data = load_aln_info(os.path.join(root, body_file),
                                  'ftmsig_H3K36me3_pct_cons',
                                  'body_cons')
        prom_file = [f for f in files if '.reg5p.' in f and f.endswith('.h5')][0]
        prom_data = load_aln_info(os.path.join(root, prom_file),
                                  'ftmsig_H3K4me3_pct_cons',
                                  'prom_cons')
        dataset = body_data.merge(prom_data, on='name', how='outer')
        # NOTE(review): names found only in the reg5p file get a NaN score
        # from the outer merge; confirm how np.digitize bins those rows.
        dataset['aln_score'] = dataset['body_cons']
        dataset['aln_rank'] = dataset['aln_score'].rank(pct=True, ascending=True)
        dataset['aln_level'] = np.digitize(dataset['aln_score'].values, bins, right=False)
        # we start counting at 0...
        dataset['aln_level'] -= 1

        os.makedirs(out_folder, exist_ok=True)
        with pd.HDFStore(tmp_file, filemode) as hdf:
            # HDF5 keys always use '/', independent of the OS path separator
            store_path = '/'.join((target, query, 'aln_ranks'))
            hdf.put(store_path, dataset)
        filemode = 'a'
        done.add((target, query))
    if done:
        # Publish the finished cache in a single step.
        os.replace(tmp_file, cache_file)
    return
                
# Build the cache when this cell runs; prints 'Nothing to do' if the cache
# file already exists.
cache_aln_ranks()
                
                
                


                

Nothing to do
