In [27]:

import os as os
import collections as col
import pandas as pd
import numpy as np

# What is this?
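# Extract the sets of gene promoters (H3K4me3) and gene bodies (H3K36me3)
# that show a strong histone signal after cross-species transfer of the
# epigenetic signal. The score is the mean signal; the weighting by
# alignment rate (the `pct_cons` columns) is currently commented out below.
#
# Dump these lists as BED files for a LOLA enrichment analysis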

# ---- input / output locations -------------------------------------------
project_folder = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species/processing/norm'
testdata_root = os.path.join(project_folder, 'task_testdata_exp/test_datasets')
output_folder = os.path.join(project_folder, 'task_lola', 'top_marked_genes')

# The BED files are written here; create the folder up front.
os.makedirs(output_folder, exist_ok=True)

# Epigenome identifiers that were already handled; filled by the
# processing loop below so that no epigenome is processed twice.
done_epigenomes = set()

# ---- column layout --------------------------------------------------------
# Shared building blocks for the three column lists below.
_body_coords = ['start_body', 'end_body', 'strand_body']
_prom_coords = ['start_reg5p', 'end_reg5p', 'strand_reg5p']
_gene_ids = ['name', 'symbol']

# Columns kept from the raw feature tables.
select_cols = (
    ['chrom']
    + _body_coords
    + _prom_coords
    + _gene_ids
    + [
        'ftmsig_H3K36me3_pct_cons_body',
        'ftmsig_H3K36me3_abs_mean_body',
        'ftmsig_H3K4me3_pct_cons_reg5p',
        'ftmsig_H3K4me3_abs_mean_reg5p',
    ]
)

# Columns used for the promoter (reg5p) output and the gene body output.
prom_cols = ['chrom'] + _prom_coords + _gene_ids + ['score_promoter']
body_cols = ['chrom'] + _body_coords + _gene_ids + ['score_gene']

# Percentile-rank cutoff: rows with a rank >= t (i.e. the top 5%) are kept.
# Defined before the loops because the per-tissue dump needs it as well,
# independent of whether the per-file loop body was executed.
t = 0.95


def _load_feature_table(fpath):
    """Read all non-metadata tables of an HDF file into one DataFrame.

    Every key that does not start with '/metadata' is loaded; the last path
    component of the key is stored in a new 'chrom' column. The original
    per-table index is kept (``ignore_index=False``).

    Raises ValueError (from ``pd.concat``) if the file holds no such table.
    """
    per_chrom = []
    with pd.HDFStore(fpath, 'r') as hdf:
        for k in hdf.keys():
            if k.startswith('/metadata'):
                continue
            data = hdf[k]
            data['chrom'] = os.path.split(k)[-1]
            per_chrom.append(data)
    return pd.concat(per_chrom, axis=0, ignore_index=False)


def _write_top_marked_bed(collector, tissue, threshold, region_cols,
                          rank_col, score_col, outpath):
    """Write the top-ranked regions of one tissue as a tab-separated BED file.

    Rows of ``collector`` that belong to ``tissue`` and whose ``rank_col``
    is >= ``threshold`` are selected. One row per (name, symbol) pair is
    kept, and the score column of the output is the mean of ``score_col``
    over all selected rows sharing the same name.

    ``region_cols`` must start with the chrom, start, end and strand column
    names (as ``prom_cols`` / ``body_cols`` do) and contain 'name', 'symbol'
    and ``score_col``.

    Output columns: chrom, start, end, name, mean score, strand, symbol.
    Returns the table that was written.
    """
    chrom_col, start_col, end_col, strand_col = region_cols[:4]
    selected = np.logical_and(collector['tissue'] == tissue,
                              collector[rank_col] >= threshold)
    # Explicit copy: the subset is modified in place below and must not be
    # a view on the collector (avoids pandas' chained-assignment warning).
    subset = collector.loc[selected, region_cols].copy()
    # Average before de-duplicating so that all selected rows contribute.
    mean_score = subset.groupby('name')[score_col].mean()
    subset.drop_duplicates(['name', 'symbol'], inplace=True)
    # Index by gene name so that the per-name means align on assignment.
    subset.index = subset['name']
    subset['mean_score'] = mean_score
    subset.sort_values([chrom_col, start_col, end_col], ascending=True, inplace=True)
    subset.to_csv(outpath, sep='\t', index=False, header=False, mode='w',
                  columns=[chrom_col, start_col, end_col, 'name', 'mean_score',
                           strand_col, 'symbol'])
    return subset


for root, _dirs, featfiles in os.walk(testdata_root):
    if not featfiles:
        continue
    folder_name = os.path.split(root)[-1]
    # Only folders named with three '_'-separated parts (e.g. "hg19_from_mm9")
    # are of interest; anything else is skipped instead of failing on unpack.
    name_parts = folder_name.split('_')
    if len(name_parts) != 3:
        continue
    qry, _, trg = name_parts
    if (trg, qry) not in [('hg19', 'mm9'), ('mm9', 'hg19')]:
        continue
    print('Processing ', folder_name)

    selected_per_file = []
    for ff in featfiles:
        if ff.startswith('G9930'):
            continue
        _, epigenome, _ = ff.split('_', 2)
        if epigenome in done_epigenomes:
            continue
        tissue = ff.split('.')[0].split('_')[-1]
        # Both embryonic stem cell labels are merged into one tissue name.
        if tissue in ['H1hESC', 'ESE14']:
            tissue = 'esc'
        fpath = os.path.join(root, ff)

        dataset = _load_feature_table(fpath)
        dataset = dataset.loc[:, select_cols].copy()
        dataset['tissue'] = tissue

        # NOTE: the scores are the plain mean signal. Weighting by the
        # conservation percentage (abs_mean * pct_cons / 100) is disabled.
        dataset['score_promoter'] = dataset['ftmsig_H3K4me3_abs_mean_reg5p']
        dataset['rank_promoter'] = dataset['score_promoter'].rank(pct=True)
        dataset['score_gene'] = dataset['ftmsig_H3K36me3_abs_mean_body']
        dataset['rank_gene'] = dataset['score_gene'].rank(pct=True)

        # Keep a gene if its promoter OR its body is in the top fraction.
        select_genes = np.logical_or(dataset['rank_promoter'] >= t,
                                     dataset['rank_gene'] >= t)
        dataset = dataset.loc[select_genes, :].copy()
        selected_per_file.append(dataset)
        done_epigenomes.add(epigenome)

    if not selected_per_file:
        # Every file in this folder was skipped; nothing to dump.
        continue
    # Single concat instead of growing the collector file by file.
    gene_collector = pd.concat(selected_per_file, axis=0, ignore_index=False)

    # dump
    for tissue in gene_collector['tissue'].unique():
        outfile = '{}_{}_top-marked-genes_promoter.bed'.format(qry, tissue)
        print('Writing ', outfile)
        prom_subset = _write_top_marked_bed(
            gene_collector, tissue, t, prom_cols,
            'rank_promoter', 'score_promoter',
            os.path.join(output_folder, outfile))

        # same for gene bodies
        outfile = '{}_{}_top-marked-genes_body.bed'.format(qry, tissue)
        print('Writing ', outfile)
        body_subset = _write_top_marked_bed(
            gene_collector, tissue, t, body_cols,
            'rank_gene', 'score_gene',
            os.path.join(output_folder, outfile))
print('Done')


Processing  hg19_from_mm9
Writing  hg19_ncd4_top-marked-genes_promoter.bed
Writing  hg19_ncd4_top-marked-genes_body.bed
Writing  hg19_esc_top-marked-genes_promoter.bed
Writing  hg19_esc_top-marked-genes_body.bed
Writing  hg19_liver_top-marked-genes_promoter.bed
Writing  hg19_liver_top-marked-genes_body.bed
Writing  hg19_heart_top-marked-genes_promoter.bed
Writing  hg19_heart_top-marked-genes_body.bed
Writing  hg19_kidney_top-marked-genes_promoter.bed
Writing  hg19_kidney_top-marked-genes_body.bed
Processing  mm9_from_hg19
Writing  mm9_esc_top-marked-genes_promoter.bed
Writing  mm9_esc_top-marked-genes_body.bed
Writing  mm9_hepa_top-marked-genes_promoter.bed
Writing  mm9_hepa_top-marked-genes_body.bed
Writing  mm9_ncd4_top-marked-genes_promoter.bed
Writing  mm9_ncd4_top-marked-genes_body.bed
Done
