# In [2]:
import os as os

import numpy as np
import sklearn as skl
import pandas as pd

# Step 1 (manual): dump the hg19/mm9 alignment blocks into the dump folder.
dump_folder = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/chainfiles/hdf_map/dump'

# Commands used for the dumps:
# creepiest.py -nod dump --input ../hg19_to_mm9.idx.h5 -ig default -o hg19_to_mm9.qry.aln.tsv.gz -mapref query
# creepiest.py -nod dump --input ../hg19_to_mm9.idx.h5 -ig default -o hg19_to_mm9.trg.aln.tsv.gz -mapref target

# creepiest.py -nod dump --input ../mm9_to_hg19.idx.h5 -ig default -o mm9_to_hg19.qry.aln.tsv.gz -mapref query
# creepiest.py -nod dump --input ../mm9_to_hg19.idx.h5 -ig default -o mm9_to_hg19.trg.aln.tsv.gz -mapref target

# Step 2 (manual): intersect the gene regions of interest with the
# alignment blocks; results live in a subfolder of the dump folder.
isect_folder = os.path.join(dump_folder, 'isect')

# Commands used for the intersection (-wao also reports genes w/o overlap):
# bedtools intersect -wao -a hsa_hg19_gencode_v19.reg5p.bed.gz -b mm9_to_hg19.qry.aln.tsv.gz > hg19_from_mm9.ovl.reg5p.tsv
# bedtools intersect -wao -a hsa_hg19_gencode_v19.body.bed.gz -b mm9_to_hg19.qry.aln.tsv.gz > hg19_from_mm9.ovl.body.tsv

# bedtools intersect -wao -a mmu_mm9_gencode_vM1.body.bed.gz -b hg19_to_mm9.qry.aln.tsv.gz > mm9_from_hg19.ovl.body.tsv
# bedtools intersect -wao -a mmu_mm9_gencode_vM1.reg5p.bed.gz -b hg19_to_mm9.qry.aln.tsv.gz > mm9_from_hg19.ovl.reg5p.tsv

# Per-species overlap tables; in both lists the gene body table comes
# first and the reg5p table second.
hsa_files = [
    'hg19_from_mm9.ovl.body.tsv',
    'hg19_from_mm9.ovl.reg5p.tsv',
]
mmu_files = [
    'mm9_from_hg19.ovl.body.tsv',
    'mm9_from_hg19.ovl.reg5p.tsv',
]

# Column names assigned to the (header-less) overlap tables.
tsv_header = [
    # gene region columns (presumably the "-a" BED file)
    'chrom', 'start', 'end', 'name', 'score', 'strand_char', 'symbol',
    # alignment block columns (presumably the "-b" dump file)
    'aln_chrom', 'aln_start', 'aln_end', 'block_id',
    # last column of the table, used downstream as overlap size
    'overlap',
]


def _replace_strand(x):
    if x == '-':
        return -1
    else:
        return 1


def load_overlap_table(fpath, prefix):
    """
    Load a gene/alignment overlap table and sum up the overlap per gene.

    The table is read without header using the column names in
    `tsv_header`; only the gene columns and the overlap column are kept.
    Rows on chromosomes other than "chr" + digits are discarded.

    :param fpath: path to the tab-separated overlap table
    :param prefix: label prepended to the start, end and overlap columns
     (e.g. 'promoter' gives promoter_start, promoter_end, promoter_overlap)
    :return: DataFrame with the columns chrom, name, symbol,
     <prefix>_start, <prefix>_end, strand (-1 / 1) and <prefix>_overlap,
     where <prefix>_overlap is the overlap summed over all rows of a gene
    """
    start_col = prefix + '_start'
    end_col = prefix + '_end'
    ovl_col = prefix + '_overlap'

    with open(fpath, 'r') as table:
        df = pd.read_csv(table, sep='\t', header=None, names=tsv_header,
                         usecols=['chrom', 'start', 'end', 'name', 'symbol', 'strand_char', 'overlap'],
                         dtype={'chrom': str, 'start': np.int32, 'end': np.int32,
                                'name': str, 'symbol': str, 'strand_char': str, 'overlap': np.int32})
    # str.match only anchors the pattern at the start of the string; the
    # trailing "$" is needed so that names merely starting with a numbered
    # chromosome (e.g. chr1_gl000191_random) are not kept by accident
    df = df.loc[df['chrom'].str.match(r'chr[0-9]+$'), :].copy()
    df[start_col] = df['start']
    df[end_col] = df['end']
    df[ovl_col] = df['overlap']
    df['strand'] = df['strand_char'].map(_replace_strand)
    df.drop(['start', 'end', 'overlap', 'strand_char'], inplace=True, axis=1)

    # a gene appears once per overlapping alignment block; grouping by all
    # gene columns leaves the overlap as the only column to be summed
    ovl_sum = df.groupby(['chrom', start_col, end_col, 'strand', 'symbol', 'name']).sum()
    # not sure if groupby is guaranteed to keep order of items, so carry the
    # gene name along and merge the sums back explicitly by name
    ovl_sum['name'] = ovl_sum.index.get_level_values('name')
    ovl_sum.reset_index(drop=True, inplace=True)

    # reduce the table to one row per gene before attaching the summed overlap
    df.drop([ovl_col], axis=1, inplace=True)
    df.drop_duplicates(subset=['name', 'symbol'], inplace=True)
    df = df.merge(ovl_sum, on='name', how='outer')
    return df


def merge_overlap_frames(fp_left, prefix_left, fp_right, prefix_right, min_overlap=100):
    """
    Load two overlap tables, join them per gene and flag weakly aligned genes.

    Both tables are loaded with `load_overlap_table` and joined (outer
    merge) on name, symbol, strand and chrom. A region counts as
    unaligned / weakly aligned if its summed overlap is below `min_overlap`.

    :param fp_left: path to the overlap table of the promoter regions
    :param prefix_left: column prefix for the left table, must be 'promoter'
    :param fp_right: path to the overlap table of the second region set
    :param prefix_right: column prefix for the right table (e.g. 'body')
    :param min_overlap: summed overlap below which a region is treated as
     unaligned; defaults to 100
    :return: merged DataFrame with three additional int8 indicator columns:
     unaln_both (left and right below threshold),
     unaln_prom (only left below threshold),
     unaln_body (only right below threshold)
    :raises AssertionError: if prefix_left is not 'promoter'
    """
    assert prefix_left == 'promoter', 'Wrong prefix: {}'.format(prefix_left)
    left = load_overlap_table(fp_left, prefix_left)
    right = load_overlap_table(fp_right, prefix_right)

    left_ovl = prefix_left + '_overlap'
    right_ovl = prefix_right + '_overlap'

    left = left.merge(right, on=['name', 'symbol', 'strand', 'chrom'], how='outer')

    # Both directions of the comparison are evaluated explicitly: a gene
    # present in only one table has a missing overlap value after the outer
    # merge, for which "<" and ">=" are both False, so such a gene is not
    # flagged in any of the three columns.
    left_weak = left[left_ovl] < min_overlap
    left_good = left[left_ovl] >= min_overlap
    right_weak = left[right_ovl] < min_overlap
    right_good = left[right_ovl] >= min_overlap

    # mark different subsets of unaligned or weakly aligned genes
    left['unaln_both'] = (left_weak & right_weak).astype(np.int8)
    left['unaln_prom'] = (left_weak & right_good).astype(np.int8)
    left['unaln_body'] = (left_good & right_weak).astype(np.int8)

    return left


# Destination folder for all per-species output files.
out_folder = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/crossspecies/supplement'

# Text dumps written per species: (file name suffix, columns, separator)
text_dumps = [
    ('_names.txt', ['name'], ','),
    ('_symbols.txt', ['symbol'], ','),
    ('_names-symbols.tsv', ['name', 'symbol'], '\t'),
    ('_promoters.bed', ['chrom', 'promoter_start', 'promoter_end',
                        'name', 'symbol', 'strand'], '\t'),
]

for species, files in zip(['hsa', 'mmu'], [hsa_files, mmu_files]):
    # hsa_files / mmu_files list the gene body table first and the reg5p
    # table second; the reg5p table has to be the one loaded with the
    # 'promoter' prefix, otherwise the promoter/body columns and flags
    # (and the promoter BED output below) describe the wrong regions.
    body_file, reg5p_file = files
    spec_ovl = merge_overlap_frames(os.path.join(isect_folder, reg5p_file), 'promoter',
                                    os.path.join(isect_folder, body_file), 'body')

    # genes flagged as unaligned in both the promoter and the body
    subset = spec_ovl.loc[spec_ovl['unaln_both'] > 0, :]

    out_path = os.path.join(out_folder, '201709_{}_unaln_genes'.format(species))

    for suffix, columns, separator in text_dumps:
        with open(out_path + suffix, 'w') as dump:
            subset.to_csv(dump, columns=columns, sep=separator, index=False, header=False)

    # the complete table (not only the subset) goes into the HDF store,
    # saved under the species label; an existing store is opened in append mode
    store_out = out_path + '_store.h5'
    mode = 'a' if os.path.isfile(store_out) else 'w'
    with pd.HDFStore(store_out, mode, complib='blosc', complevel=9) as hdf:
        hdf.put(species, spec_ovl, format='table')

print('Done')

    

    






# Output: Done
