In [9]:

import os as os
import collections as col
import pandas as pd
import numpy as np

# What is this?
# For each (target, query) species pair, collect the genes flagged with
# tp == 1 per tissue/cell type, keep only those genes that occur in exactly
# one tissue/cell type, and dump their loci (gene body extended upstream by
# a promoter window) as HDF, BED and TSV files for use with LOLA.

# REV DATE - also the prefix of the check file that marks a completed run
date = 20181116

# number of bp added upstream of each gene body (strand-aware)
promoter_length = 1000

project_folder = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species/processing/norm'
summary_file = os.path.join(project_folder, 'task_summarize', 'agg_expstat_est.h5')

annotations = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/genemodel/subsets/protein_coding/roi_hdf'

# gene body annotation file per species (relative to `annotations`)
prom_files = {'human': 'hsa_hg19_gencode_v19.body.h5',
              'mouse': 'mmu_mm9_gencode_vM1.body.h5'}

output_folder = os.path.join(project_folder, 'task_lola', 'uniq_tp_genes')
os.makedirs(output_folder, exist_ok=True)

check_file = os.path.join(output_folder, '{}_lola_uniq-tp-genes.chk'.format(date))

if os.path.isfile(check_file):
    print('Dated check file exists - abort...')
else:
    # ((trg, qry), biosample) -> set of gene identifiers with tp == 1
    exp_est = col.defaultdict(set)
    key_prefixes = ('/pos/can/human/mouse', '/pos/can/mouse/human')
    with pd.HDFStore(summary_file, 'r') as hdf:
        relevant_keys = [k for k in hdf.keys()
                         if k.startswith(key_prefixes) and k.endswith('/data')]
        for k in relevant_keys:
            # splitting on '/' yields a leading empty string, hence
            # trg/qry at positions 3/4; the biosample is the last
            # '_'-separated token of path component 6
            parts = k.split('/')
            trg, qry = parts[3], parts[4]
            biosample = parts[6].split('_')[-1]
            ds = hdf[k]
            genes = set(ds.loc[ds['tp'] == 1, :].index.tolist())
            exp_est[((trg, qry), biosample)].update(genes)

    for c in [('human', 'mouse'), ('mouse', 'human')]:
        tissue_genes = [(k[1], v) for k, v in exp_est.items() if k[0] == c]
        unique_genes = dict()
        unique_count = col.Counter()
        for t1, g1 in tissue_genes:
            # genes of this tissue that appear in no other tissue of the
            # same species pair; difference() always returns a new set,
            # so the sets stored in exp_est are never modified
            others = [g2 for t2, g2 in tissue_genes if t2 != t1]
            unique_tp = g1.difference(*others)
            unique_genes[t1] = sorted(unique_tp)
            unique_count[t1] = len(unique_tp)
        print(unique_count)

        # the annotation of the second species in the pair is loaded
        species_file = prom_files[c[1]]
        file_path = os.path.join(annotations, species_file)
        with pd.HDFStore(file_path, 'r') as hdf:
            bodies = []
            for k in hdf.keys():
                if k == '/metadata':
                    continue
                # last component of the HDF key is used as chromosome name
                chrom = k.split('/')[-1]
                data = hdf[k]
                data['chrom'] = chrom
                bodies.append(data)
        bodies = pd.concat(bodies, axis=0, ignore_index=False)

        # builtin bool instead of np.bool: the alias was removed from NumPy
        fw_genes = np.array(bodies['strand'] == '+', dtype=bool)
        rv_genes = np.array(bodies['strand'] == '-', dtype=bool)

        # add promoter length to gene body and store genes as loci;
        # work on raw values so the update is purely positional and does
        # not depend on the index labels kept by concat
        # (NOTE(review): per-chromosome indices may repeat - not visible here)
        starts = bodies['start'].values
        ends = bodies['end'].values
        # BED start coordinates must not be negative, hence the clip at 0
        bodies['start'] = np.where(fw_genes,
                                   np.clip(starts - promoter_length, 0, None),
                                   starts)
        bodies['end'] = np.where(rv_genes, ends + promoter_length, ends)

        for tissue, genes in unique_genes.items():
            outprefix = '{}_{}_uniq-tp-genes_loci'.format(c[1], tissue)
            out_hdf = os.path.join(output_folder, outprefix + '.h5')
            if os.path.isfile(out_hdf):
                # HDF output already exists: skip this tissue entirely
                # (BED and TSV are not rewritten either)
                continue

            subset = bodies.loc[bodies['name'].isin(genes), :].copy()
            subset.sort_values(['chrom', 'start', 'end'], ascending=True, inplace=True)

            with pd.HDFStore(out_hdf, 'w') as hdf:
                hdf.put('/{}/{}/uniqtp'.format(c[1], tissue), subset, format='table')

            # write a leading '#' first, then append the table including
            # its header so that the header line reads '#chrom<TAB>start...'
            out_bed = os.path.join(output_folder, outprefix + '.bed')
            with open(out_bed, 'w') as dump:
                _ = dump.write('#')
            subset.to_csv(out_bed, sep='\t', columns=['chrom', 'start', 'end', 'name',
                                                      'score', 'strand', 'symbol'],
                          index=False, index_label=False, header=True, mode='a')

            # plain two-column gene list (name, symbol) without header
            out_list = os.path.join(output_folder, outprefix + '.tsv')
            subset.to_csv(out_list, sep='\t', columns=['name', 'symbol'],
                          index=False, index_label=False, header=False, mode='w')

    # mark this revision as done; a later run with the same date aborts early
    with open(check_file, 'w') as check:
        _ = check.write('Date: {}'.format(date))


Counter({'liver': 949, 'ESE14': 802, 'ncd4': 550})
Counter({'H1hESC': 933, 'hepa': 909, 'ncd4': 500})
