In [13]:

import os as os
import pandas as pd

# Directory that holds the per-sample gene set files read by load_geneset;
# the UpSet list files produced below are written to the same directory.
out_folder = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/crossspecies/supplement'

# HDF file containing the ortholog tables read by load_orthologs.
orth_file = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/orthologs/hdf/odb9_gene-orthologs.h5'

# Folder with the gene annotation HDF files read by load_gene_annotation
# (one file for the human run and one for the mouse run below).
gene_folder = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/genemodel/subsets/protein_coding/roi_hdf'
hsa_genes_path = os.path.join(gene_folder, 'hsa_hg19_gencode_v19.body.h5')
mmu_genes_path = os.path.join(gene_folder, 'mmu_mm9_gencode_vM1.body.h5')

def load_gene_annotation(fpath):
    """
    Return the gene names stored in an HDF gene annotation file.

    Every group in the store whose key does not contain 'metadata' is read
    and the values of its 'name' column are collected, following the order
    in which the store lists its keys.

    :param fpath: path to the HDF file, opened read-only
    :return: list with the values of all 'name' columns; an empty list if
     the store contains no non-metadata groups
    :raises KeyError: if a non-metadata group has no 'name' column
    """
    names = []
    with pd.HDFStore(fpath, 'r') as hdf:
        for key in hdf.keys():
            if 'metadata' in key:
                continue
            # Only the names are returned, so take the column straight from
            # each group instead of tagging the frame with its chromosome and
            # concatenating whole tables. This also avoids pd.concat failing
            # on an empty list when the store has no gene groups at all.
            names.extend(hdf[key]['name'].tolist())
    return names


def load_orthologs(fpath, which, species):
    """
    Read one species' name column from an ortholog table in an HDF file.

    :param fpath: path to the HDF file, opened read-only
    :param which: key of the table inside the store
    :param species: column prefix; the column read is '<species>_name'
    :return: list with the values of that column, in table order
    """
    with pd.HDFStore(fpath, 'r') as store:
        return store[which][species + '_name'].tolist()


def load_geneset(fname):
    """
    Collect the fourth tab-separated field of every non-blank line of a file.

    :param fname: file name, joined onto the module-level out_folder
    :return: list of str, one entry per non-blank line, in file order
    :raises IndexError: if a non-blank line has fewer than four fields
    """
    location = os.path.join(out_folder, fname)
    with open(location, 'r') as handle:
        # Surrounding whitespace is removed before splitting; lines that are
        # empty afterwards are skipped.
        stripped = (raw.strip() for raw in handle)
        return [record.split('\t')[3] for record in stripped if record]
                

def dump_upset_list(genes, orthologs, subsets, outpath):
    """
    Write one tab-separated row per gene subset: its label, then its members.

    :param genes: accepted as part of the interface; currently not written
    :param orthologs: accepted as part of the interface; currently not written
    :param subsets: iterable of (label, file name) pairs; the names of each
     subset are read with load_geneset
    :param outpath: path of the output file, which is overwritten
    :return: None
    """
    # Rows for the complete gene and ortholog lists ('Genes', 'Orthologs')
    # are deliberately not emitted; only the listed subsets are dumped.
    with open(outpath, 'w') as uplists:
        uplists.writelines(
            ''.join((label, '\t', '\t'.join(load_geneset(subset)), '\n'))
            for label, subset in subsets
        )
    
    


# (label, file name) pairs consumed by dump_upset_list: the label becomes the
# first column of an output row, the file is read by load_geneset relative to
# out_folder.
# NOTE(review): the '_ML' / '_Orth' suffixes mirror the '_genes_' / '_orthologs_'
# part of the file names; their exact meaning is not visible here - confirm.
hsa_files = [('ESC_ML', '201709_human_H1hESC_TP_genes_promoters.bed'),
             ('ESC_Orth', '201709_human_H1hESC_TP_orthologs_promoters.bed'),
             ('CD4_ML', '201709_human_ncd4_TP_genes_promoters.bed'),
             ('CD4_Orth', '201709_human_ncd4_TP_orthologs_promoters.bed'),
             ('Hepa_ML', '201709_human_hepa_TP_genes_promoters.bed'),
             ('Hepa_Orth', '201709_human_hepa_TP_orthologs_promoters.bed')]

# Same layout as hsa_files, for the mouse output file.
mmu_files = [('ESC_ML', '201709_mouse_ESE14_TP_genes_promoters.bed'),
             ('ESC_Orth', '201709_mouse_ESE14_TP_orthologs_promoters.bed'),
             ('CD4_ML', '201709_mouse_ncd4_TP_genes_promoters.bed'),
             ('CD4_Orth', '201709_mouse_ncd4_TP_orthologs_promoters.bed'),
             ('Liver_ML', '201709_mouse_liver_TP_genes_promoters.bed'),
             ('Liver_Orth', '201709_mouse_liver_TP_orthologs_promoters.bed')]


# Human run: load the 'human_name' column of the mouse/human ortholog table
# and the names from the human gene annotation, then dump the subset lists.
# Both lists are passed on, but dump_upset_list currently writes only the
# subsets.
hsa_orth = load_orthologs(orth_file, 'auto/pairs/mouse/human', 'human')
hsa_genes = load_gene_annotation(hsa_genes_path)

# The result is bound to '_' so the call does not echo a value.
_ = dump_upset_list(hsa_genes, hsa_orth, hsa_files,
                    os.path.join(out_folder, '201709_human_upset_lists.tsv'))

# Mouse run: same ortholog table as above, reading the 'mouse_name' column.
mmu_orth = load_orthologs(orth_file, 'auto/pairs/mouse/human', 'mouse')
mmu_genes = load_gene_annotation(mmu_genes_path)

_ = dump_upset_list(mmu_genes, mmu_orth, mmu_files,
                    os.path.join(out_folder, '201709_mouse_upset_lists.tsv'))




13362
