In [18]:

import os as os
import re as re
import csv as csv
import gzip as gzip
import io as io
import pandas as pd
import numpy as np
import shutil as sh

# Root directory of the statediff project on the shared file system.
fhgfs_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff'

# Input: ChromHMM segmentation files; output: everything prepared for ChromDiff.
chromhmm_segment = os.path.join(fhgfs_base, 'chromhmm/deep/segmentation')
output_dir = os.path.join(fhgfs_base, 'chromdiff/input_files')

# Display colour per sample group; the hex codes are stored upper-cased.
epigenome_colors = {
    sample_group: hex_code.upper()
    for sample_group, hex_code in (
        ('LiHG_Ct', '#a6cee3'),
        ('LiHe_Ct', '#1f78b4'),
        ('BlMo_Ct', '#b2df8a'),
        ('BlMa_Ct', '#33a02c'),
    )
}

def step0_adapt_state_names():
    """
    Copy the ChromHMM segmentation files from `chromhmm_segment` into
    `output_dir`, keeping the first three columns and writing the
    fourth column (the state label) with any leading/trailing 'E'
    characters removed (e.g. 'E18' -> '18').

    Files are skipped when
      - the name contains 'LiHe' but neither 'Hf02' nor 'Hf03',
      - the name starts with '52_',
      - a file of the same name already exists in `output_dir`.

    Blank lines in the input are ignored. A non-blank line with fewer
    than four whitespace-separated columns is printed and the
    IndexError is re-raised.

    Returns None.
    """
    for sf in sorted(os.listdir(chromhmm_segment)):
        if 'LiHe' in sf and not ('Hf02' in sf or 'Hf03' in sf):
            continue
        if sf.startswith('52_'):
            continue
        new_file = os.path.join(output_dir, sf)
        if os.path.isfile(new_file):
            continue
        # The converted content is collected in memory and only written
        # once the whole input has been parsed: a failure half-way must
        # not leave a truncated output file behind, because existing
        # output files are skipped on the next run.
        out_buffer = io.StringIO()
        with open(os.path.join(chromhmm_segment, sf), 'r') as bedfile:
            for line in bedfile:
                parts = line.split()
                if not parts:
                    # Lines read from a file keep their newline, so an
                    # "empty" line has to be detected after splitting.
                    continue
                try:
                    new_line = '\t'.join(parts[:3] + [parts[3].strip('E')])
                except IndexError:
                    print(line)
                    raise
                out_buffer.write(new_line + '\n')
        with open(new_file, 'w') as dump:
            dump.write(out_buffer.getvalue())
    return
    

def step1_epigenome_annotation():
    """
    Write the sample annotation table 'sample_annotation.tsv' into
    `output_dir` and return its path.

    One row is written per file in `output_dir` whose name ends with
    'segments.bed' and whose 3rd and 4th '_'-separated name fields
    (joined by '_') are a key of `epigenome_colors`; all other files
    are ignored. The row fields are derived from the file name:
      ID        fields 2-3 joined by '_'
      name      fields 2-4 joined by '_'
      celltype  field 3 without its first two characters
      tissue    first two characters of field 3 mapped Li -> Liver,
                Bl -> Blood
      color     the `epigenome_colors` entry
      filepath  full path of the segmentation file
    """
    metadata_file = os.path.join(output_dir, 'sample_annotation.tsv')
    metadata_header = ['ID', 'filepath', 'color', 'name', 'celltype', 'tissue']
    tissue_map = {'Li': 'Liver', 'Bl': 'Blood'}

    records = []
    for fname in sorted(os.listdir(output_dir)):
        if not fname.endswith('segments.bed'):
            continue
        fields = fname.split('_')
        color_key = '_'.join(fields[2:4])
        if color_key not in epigenome_colors:
            # no colour defined for this sample group -> not part of the analysis
            continue
        tissue_celltype = fields[2]
        records.append({
            'filepath': os.path.join(output_dir, fname),
            'celltype': tissue_celltype[2:],
            'tissue': tissue_map[tissue_celltype[:2]],
            'ID': '_'.join(fields[1:3]),
            'color': epigenome_colors[color_key],
            'name': '_'.join(fields[1:4]),
        })

    with open(metadata_file, 'w', newline='') as table:
        writer = csv.DictWriter(table, metadata_header, delimiter='\t')
        writer.writeheader()
        for record in records:
            writer.writerow(record)
    return metadata_file

def step2_gene_annotation():
    """
    Restrict to protein coding
    genes and keep only chr1-22, X

    The protein-coding gene IDs are taken from the second '|'-separated
    field of the header lines of the GENCODE v21 protein-coding
    transcript FASTA; the gene locations come from the 'gene' records
    of the GENCODE v21 GTF.

    Side effects: writes 'gencode_v21_pc.bed' (tab-separated, header
    line prefixed with '#') and the cache 'gencode_v21_pc.h5' (key
    'genes') into `output_dir`. If the cache already exists, it is
    loaded and returned and nothing is parsed or written.

    Returns:
        (path of the BED file, DataFrame with the columns
        chrom, start, end, strand, name, symbol)

    Raises:
        ValueError: a 'gene' record on a kept chromosome has no
            parsable gene_id or gene_name attribute.
    """
    # NOTE(review): an earlier, unused header definition named the last
    # two columns 'gencode_gene_id' and 'genesymbols'; the file has always
    # been written with 'name' and 'symbol' - confirm which ChromDiff expects.
    columns = ['chrom', 'start', 'end', 'strand', 'name', 'symbol']
    gencode_file = os.path.join(output_dir, 'gencode_v21_pc.bed')

    gencode_cache = os.path.join(output_dir, 'gencode_v21_pc.h5')
    if os.path.isfile(gencode_cache):
        # NOTE(review): the cache is trusted as-is; the BED file path is
        # returned without checking that the file still exists.
        with pd.HDFStore(gencode_cache, 'r') as hdf:
            df = hdf['genes']
        return gencode_file, df

    gencode_pc_genes = os.path.join(fhgfs_base, 'references', 'gencode.v21.pc_transcripts.fa.gz')
    gencode_genes = os.path.join(fhgfs_base, 'references', 'gencode.v21.annotation.gtf.gz')

    genes = set()
    with gzip.open(gencode_pc_genes, 'rt') as fasta:
        for line in fasta:
            if line.startswith('>'):
                genes.add(line.strip('>').split('|')[1].strip())

    keep_chroms = {'chr' + str(x) for x in range(1, 23)}
    keep_chroms.add('chrX')

    # Raw strings: '\s' etc. are regex escapes, not string escapes.
    get_name = re.compile(r'gene_name\s"(?P<SYMBOL>[A-Za-z0-9._:\-]+)"')
    get_geneid = re.compile(r'gene_id\s"(?P<GENEID>ENS[A-Z0-9.a-z]+)"')

    locations = []
    with gzip.open(gencode_genes, 'rt') as gtf:
        for line in gtf:
            if not line.startswith('chr'):
                continue
            c, _, entity, s, e, _, strand, _, attr = line.split('\t')
            if c not in keep_chroms or entity != 'gene':
                continue
            id_match = get_geneid.search(attr)
            if id_match is None:
                raise ValueError('Could not find gene id: {}'.format(attr))
            name_match = get_name.search(attr)
            if name_match is None:
                raise ValueError('Could not find name: {}'.format(attr))
            gene_id = id_match.group('GENEID')
            if gene_id in genes:
                locations.append((c, s, e, strand, gene_id.strip(),
                                  name_match.group('SYMBOL').strip()))
    df = pd.DataFrame(locations, columns=columns)

    # Written with the csv module instead of DataFrame.to_csv: the
    # to_csv keyword for the line terminator was renamed between pandas
    # releases, whereas csv.writer gives '\n' everywhere.
    with open(gencode_file, 'w', newline='') as dump:
        dump.write('#')
        writer = csv.writer(dump, delimiter='\t', lineterminator='\n')
        writer.writerow(columns)
        writer.writerows(locations)
    with pd.HDFStore(gencode_cache, 'w') as hdf:
        hdf.put('genes', df, format='table')
    return gencode_file, df
                
def step3_state_annotation():
    """
    Copy the ChromHMM 18-state info table into `output_dir` (same file
    name) and return the path of the copy.
    """
    annotation_src = '/home/pebert/work/code/mpggit/statediff/annotation/chromhmm_18/chromhmm_18_states_info.tsv'
    _, table_name = os.path.split(annotation_src)
    annotation_dest = os.path.join(output_dir, table_name)
    sh.copyfile(annotation_src, annotation_dest)
    return annotation_dest
    
def step4_expression_table(genes):
    """
    Collect the gene-level TPM values of all samples into one table and
    write it to 'pcgene_exp_tpm.tsv' in `output_dir`.

    Every directory below <fhgfs_base>/salmon/deep/quant whose path ends
    with '_Ct' is treated as one sample and must contain a file
    'quant.genes.sf' with the columns 'Name' and 'TPM'. The column
    label of a sample is taken from characters 3-11 of its directory
    name (presumably e.g. '43_Hf02_LiHe_Ct' -> 'Hf02_LiHe' - confirm
    against the actual directory names). Directories are visited in
    sorted order so that the column order of the table is reproducible.

    Args:
        genes: table with a 'name' column (as returned by
            step2_gene_annotation); only rows whose gene ID occurs in
            that column are kept.

    Returns:
        Path of the written table (tab-separated, index column 'gene_id').

    Raises:
        FileNotFoundError: a sample directory has no 'quant.genes.sf'.
        ValueError: no sample directory was found at all.
    """
    quant_name = 'quant.genes.sf'
    exptab_file = os.path.join(output_dir, 'pcgene_exp_tpm.tsv')
    exp_root = os.path.join(fhgfs_base, 'salmon/deep/quant')
    sample_tables = []
    for root, dirs, datafiles in os.walk(exp_root):
        # os.walk yields directories in arbitrary order; sorting the list
        # in place makes the traversal (and the column order) deterministic.
        dirs.sort()
        if not root.endswith('_Ct'):
            continue
        if quant_name not in datafiles:
            raise FileNotFoundError(
                'No {} found in sample directory {}'.format(quant_name, root))
        eid = os.path.basename(root)[3:12]
        df = pd.read_csv(os.path.join(root, quant_name),
                         usecols=['Name', 'TPM'], sep='\t', index_col=0)
        df.columns = [eid]
        sample_tables.append(df)
    if not sample_tables:
        raise ValueError('No sample directories (*_Ct) found below {}'.format(exp_root))
    exp_table = pd.concat(sample_tables, axis=1, ignore_index=False)
    exp_table = exp_table.loc[exp_table.index.isin(genes['name']), :].copy()
    exp_table.index.name = 'gene_id'
    exp_table.to_csv(exptab_file, sep='\t', header=True, index=True)
    return exptab_file
            
def step5_covariate_matrix(smp_ann):
    """
    Build the 0/1 covariate matrix for the samples listed in the
    annotation table `smp_ann` (tab-separated, needs the columns
    'ID', 'celltype' and 'tissue') and write it to 'covmat.tsv' in
    `output_dir`.

    Rows are the sample IDs; columns are the distinct celltypes followed
    by the distinct tissues, each in order of first appearance. A cell
    is 1 if the sample has that celltype / tissue and 0 otherwise.
    The file is written without a header field for the index column.

    Returns the path of the written file.
    """
    covmat_file = os.path.join(output_dir, 'covmat.tsv')

    md = pd.read_csv(smp_ann, sep='\t')[['ID', 'celltype', 'tissue']].set_index('ID')

    col_names = list(md['celltype'].unique()) + list(md['tissue'].unique())
    cov_mat = pd.DataFrame(np.zeros((len(md), len(col_names)), dtype=np.int8),
                           index=md.index, columns=col_names)

    # flag the celltype and the tissue column of every sample
    for sample_id, celltype, tissue in zip(md.index, md['celltype'], md['tissue']):
        cov_mat.loc[sample_id, celltype] = 1
        cov_mat.loc[sample_id, tissue] = 1

    cov_mat.to_csv(covmat_file, sep='\t', index=True, index_label=False, header=True)
    return covmat_file

def step6_map_vars_cov():
    """
    Write the fixed table 'map_vars_cov.tsv' into `output_dir` and
    return its path.

    The header row lists the six covariate columns; the CELLTYPE row
    is TRUE for HG/HE/MO/MA and the TISSUE row is TRUE for LIVER/BLOOD.
    """
    map_vars_file = os.path.join(output_dir, 'map_vars_cov.tsv')
    rows = (
        ['HG', 'HE', 'MO', 'MA', 'LIVER', 'BLOOD'],
        ['CELLTYPE', 'TRUE', 'TRUE', 'TRUE', 'TRUE', 'FALSE', 'FALSE'],
        ['TISSUE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'TRUE', 'TRUE'],
    )
    with open(map_vars_file, 'w') as table:
        table.writelines('\t'.join(row) + '\n' for row in rows)
    return map_vars_file
    
def create_run_scripts(annotations, chromdiff_script=None, script_dirs=None):
    """
    Generate one ChromDiff run script per group comparison from the
    template shell script.

    The template is processed line by line:
      - lines starting with '## then look at' are dropped,
      - other lines starting with '#' are copied unchanged,
      - a line whose text before the first '=' is a known variable name
        is replaced by the assignment for the current comparison,
      - every other line is copied with surrounding whitespace removed.

    Only the replacement assignments are run through str.format, so
    shell constructs such as ${var} elsewhere in the template are left
    untouched.

    Args:
        annotations: dict with the keys smp_ann_fpath, genes_fpath,
            state_ann_fpath, cov_mat_fpath, map_vars_fpath and
            exptable_fpath. The dict is not modified.
        chromdiff_script: path of the template script; defaults to the
            notes_v2.sh of the local ChromDiff checkout.
        script_dirs: directories that each receive a copy of every
            generated script; defaults to the project's exec_scripts
            directory and the local ChromDiff checkout. Only the path
            in the first directory is printed.

    Raises:
        KeyError: `annotations` lacks a key needed by a replaced line.
    """
    if chromdiff_script is None:
        chromdiff_script = '/home/pebert/work/code/github/ChromDiff/notes_v2.sh'
    if script_dirs is None:
        script_dirs = [os.path.join(fhgfs_base, 'chromdiff/exec_scripts'),
                       '/home/pebert/work/code/github/ChromDiff']
    replace_lines = {'metadatafile': 'metadatafile="{smp_ann_fpath}"',
                     'genefile': 'genefile="{genes_fpath}"',
                     'generegions_label': 'generegions_label="gencode_v21"',
                     'statecalls_label': 'statecalls_label="deep"',
                     'states_info': 'states_info="{state_ann_fpath}"',
                     'covariate_mat_file': 'covariate_mat_file="{cov_mat_fpath}"',
                     'map_covariates_file': 'map_covariates_file="{map_vars_fpath}"',
                     'expfile': 'expfile="{exptable_fpath}"',
                     'state_annotations_file': 'state_annotations_file="{state_ann_fpath}"',
                     'property': 'property="{property}"',
                     'a_option': 'a_option="{a_group}"',
                     'b_option': 'b_option="{b_group}"',
                     'curr_label': "curr_label=${{statecalls_label}}_${{generegions_label}}"}
    comp_groups = [('celltype', 'HG', 'He'), ('celltype', 'HG', 'Ma'),
                   ('celltype', 'HG', 'Mo'), ('celltype', 'He', 'Ma'),
                   ('celltype', 'He', 'Mo'), ('celltype', 'Ma', 'Mo'),
                   ('tissue', 'Liver', 'Blood')]

    with open(chromdiff_script, 'r') as bash:
        template_lines = [line for line in bash
                          if not line.startswith('## then look at')]

    for (p, a, b) in comp_groups:
        # Fresh dict per comparison: the caller's dict must stay untouched.
        settings = dict(annotations, a_group=a, b_group=b, property=p)
        out_lines = []
        for line in template_lines:
            if line.startswith('#'):
                out_lines.append(line)
                continue
            template = replace_lines.get(line.split('=')[0])
            if template is None:
                out_lines.append(line.strip() + '\n')
            else:
                out_lines.append(template.format(**settings) + '\n')
        script = ''.join(out_lines)

        script_name = 'chromdiff_{}_vs_{}.sh'.format(a, b)
        for position, script_dir in enumerate(script_dirs):
            script_path = os.path.join(script_dir, script_name)
            if position == 0:
                print('Dumping ', script_path)
            with open(script_path, 'w') as dump:
                dump.write(script)
    return
    
# Run the preparation steps in order; later steps read what earlier ones
# wrote (step 1 lists the files produced by step 0, step 4 needs the gene
# table of step 2, step 5 reads the sample table of step 1).
step0_adapt_state_names()
sample_file = step1_epigenome_annotation()
gencode_file, genes = step2_gene_annotation()
state_file = step3_state_annotation()
exptab_file = step4_expression_table(genes)
covmat_file = step5_covariate_matrix(sample_file)
mapvars_file = step6_map_vars_cov()

# Paths that get substituted into the generated ChromDiff run scripts.
ann_files = {
    'smp_ann_fpath': sample_file,
    'genes_fpath': gencode_file,
    'state_ann_fpath': state_file,
    'exptable_fpath': exptab_file,
    'cov_mat_fpath': covmat_file,
    'map_vars_fpath': mapvars_file,
}
create_run_scripts(ann_files)

Dumping  /TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/chromdiff/exec_scripts/chromdiff_HG_vs_He.sh
Dumping  /TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/chromdiff/exec_scripts/chromdiff_HG_vs_Ma.sh
Dumping  /TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/chromdiff/exec_scripts/chromdiff_HG_vs_Mo.sh
Dumping  /TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/chromdiff/exec_scripts/chromdiff_He_vs_Ma.sh
Dumping  /TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/chromdiff/exec_scripts/chromdiff_He_vs_Mo.sh
Dumping  /TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/chromdiff/exec_scripts/chromdiff_Ma_vs_Mo.sh
Dumping  /TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/chromdiff/exec_scripts/chromdiff_Liver_vs_Blood.sh
