In [4]:

import os as os
import json as js
import collections as col

import pandas as pd

# Mapping from short sample ID to donor label; the table builders below look
# donors up by keys of the form '<2 chars>-<1 char>' derived from sample names.
# Load inside a context manager so the file handle is closed right away.
with open('/home/pebert/work/code/mpggit/statediff/annotation/misc/donor_char.json') as _donor_fh:
    donor_labels = js.load(_donor_fh)

# Directory whose '.bam' entries are symbolic links to the source files.
data_path = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/linked_input/deep'

# Directory listing the RNA sequence files used for the expression table.
rna_path = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/loaded_input/deep/rna_data/sequence'

# Directory containing the JSON run logs read by collect_runtimes().
cfg_path = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/ruffus/log'

# Output folder for the generated LaTeX tables.
out_root = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/statediff/supplement'

out_tex_runtime_table = os.path.join(out_root, 'supp_table_SX_runtime.tex')
out_tex_dataset_table = os.path.join(out_root, 'supp_table_SX_dataset.tex')
out_tex_expression_table = os.path.join(out_root, 'supp_table_SX_expression.tex')

# Folder receiving TSV copies of the dataset and expression tables.
# Switch to the line below (None) to skip writing the TSV copies.
# out_tsv_version = None
out_tsv_version = '/home/pebert/work/code/mpggit/dissertation/Supplement/diffchrom'


def collect_runtimes(out_tex_table):
    """
    Summarize run logs from ``cfg_path`` as a LaTeX runtime table.

    Every ``*.json`` file in ``cfg_path`` is read, except those whose
    second-to-last underscore-separated file name token is ``dump``.
    That token selects the command ('scan', 'convert', 'score', 'stats').
    From each retained config the function reads ``args.workers``,
    ``time_min`` and ``sciddo_version``; the runtime is rounded, incremented
    by one and rendered as an upper bound string ('< N').

    Args:
        out_tex_table: path of the LaTeX file to write.

    Returns:
        None

    Raises:
        ValueError: if a config passes the 'cmm18' dataset filter but its
            type is not one of the handled commands.
    """
    table_columns = ['step', 'command', 'cores', 'samples', 'runtime (min)', 'SCIDDO version']
    table_rows = []
    for cfg_file in os.listdir(cfg_path):
        if not cfg_file.endswith('.json'):
            continue
        cfg_type = cfg_file.split('_')[-2]
        if cfg_type in ['dump']:
            continue
        fpath = os.path.join(cfg_path, cfg_file)
        # Close each log file right after parsing instead of leaking one
        # open handle per config file.
        with open(fpath, 'r') as cfg_handle:
            config = js.load(cfg_handle)
        cpus = config['args']['workers']
        runtime = config['time_min']
        # Round to whole minutes, add one, and report as an upper bound.
        runtime = int(round(float(runtime), 0) + 1)
        runtime = '< ' + str(runtime)
        if cfg_type == 'scan':
            # Only scans scored with exactly ['penem'] on a 'cmm18' dataset.
            if config['args']['scoring'] != ['penem'] or 'cmm18' not in config['args']['dataset']:
                continue
            # Sample count label is derived from the name of the first group;
            # the 'CELLTYPE_Mo' check takes precedence over the 'TISSUE_Li' one.
            num_samples = '4 v 5' if 'TISSUE_Li' in config['args']['group1'] else '2 v 2'
            num_samples = '3 v 2' if 'CELLTYPE_Mo' in config['args']['group1'] else num_samples
            row = [4, 'scan', cpus, num_samples, runtime, 'v' + config['sciddo_version']]
            table_rows.append(row)
        elif cfg_type == 'convert':
            if 'ChromHMM' not in config['args']['segformat']:
                continue
            row = [1, 'convert', cpus, '9', runtime, 'v' + config['sciddo_version']]
            table_rows.append(row)
        else:
            if 'cmm18' not in config['args']['dataset']:
                continue
            if cfg_type == 'score':
                # The core count is fixed to 1 here; the logged worker count is not used.
                row = [3, 'score', 1, 'n/a', runtime, 'v' + config['sciddo_version']]
                table_rows.append(row)
            elif cfg_type == 'stats':
                row = [2, 'stats', cpus, '9', runtime, 'v' + config['sciddo_version']]
                table_rows.append(row)
            else:
                raise ValueError(
                    'Unexpected config type {!r} in log file {}'.format(cfg_type, fpath)
                )

    df = pd.DataFrame(table_rows, columns=table_columns)
    # NOTE(review): 'runtime (min)' holds strings ('< N') at this point, so the
    # secondary sort key is ordered lexicographically, not numerically.
    df.sort_values(['step', 'runtime (min)'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    # NOTE(review): the row labels are hard-coded, so this selection depends on
    # the set of log files present and on the sort order above - verify when
    # the log folder changes.
    to_table = df.loc[[0, 1, 2, 4, 9], ['command', 'cores', 'samples', 'runtime (min)']].copy()

    # NOTE(review): -1 is the historical "no truncation" value for this option;
    # newer pandas releases expect None instead - confirm against the
    # installed pandas version.
    with pd.option_context('display.max_colwidth', -1):
        to_table.to_latex(out_tex_table, bold_rows=True, column_format='rccc',
                          encoding='ascii', header=True, index=False)
    return None
    
def make_dataset_table(out_tex_table):
    """
    Write the dataset table (LaTeX, plus optional TSV copy).

    Scans ``data_path`` for '.bam' entries belonging to the fixed list of
    samples below, resolves each entry with ``os.readlink`` and records the
    short ID, the sample name and histone mark parsed from the link target's
    file name, the donor label from ``donor_labels`` and the target file name.

    Args:
        out_tex_table: path of the LaTeX file to write. If the module-level
            ``out_tsv_version`` is not None, a TSV copy is written into that
            folder, named after this path's base name with '.tex' replaced by
            '.tsv' and 'SX' replaced by 'S1'.

    Returns:
        None
    """
    used_samples = [
        '01_Hc01_LiHG_Ct', '01_Hc02_LiHG_Ct', '43_Hm01_BlMo_Ct',
        '43_Hm03_BlMa_Ct', '43_Hm03_BlMo_Ct', '43_Hm05_BlMa_Ct',
        '43_Hm05_BlMo_Ct', '41_Hf02_LiHe_Ct', '41_Hf03_LiHe_Ct',
    ]
    header = ['ID', 'DEEP sample', 'donor', 'histone mark', 'filename']

    rows = []
    for entry in os.listdir(data_path):
        if not entry.endswith('.bam'):
            continue
        # Sample name of the link itself: everything before the last three
        # underscore-separated tokens.
        link_sample = entry.rsplit('_', 3)[0]
        if link_sample not in used_samples:
            continue
        short_id = '{}-{}'.format(link_sample[10:12], link_sample[6])
        # The remaining fields are parsed from the link target, not the link.
        target_name = os.path.basename(os.readlink(os.path.join(data_path, entry)))
        mark = target_name.split('_')[4]
        deep_sample = target_name.rsplit('_', 3)[0]
        rows.append((short_id, deep_sample, donor_labels[short_id], mark, target_name))

    table = pd.DataFrame(rows, columns=header)
    table = table.sort_values(['ID', 'histone mark']).reset_index(drop=True)

    with pd.option_context('display.max_colwidth', -1):
        table.to_latex(out_tex_table, bold_rows=True, column_format='clccl',
                       encoding='ascii', header=True, index=False)

    if out_tsv_version is None:
        return None

    tsv_name = os.path.basename(out_tex_table).replace('.tex', '.tsv').replace('SX', 'S1')
    table.to_csv(os.path.join(out_tsv_version, tsv_name),
                 sep='\t', header=True, index=False, mode='w')
    return None


def make_expression_table(out_tex_table):
    """
    Write the expression data table (LaTeX, plus optional TSV copy).

    Lists every entry in ``rna_path``, trims trailing underscore-separated
    tokens from the file name, derives the sample name and short ID from it,
    and looks up the donor in ``donor_labels``. Names containing 'Hc' get
    their 'Hc0<x>' token replaced by 'HepG2', with <x> appended after '_Ct'.

    Args:
        out_tex_table: path of the LaTeX file to write. If the module-level
            ``out_tsv_version`` is not None, a TSV copy is written into that
            folder, named after this path's base name with '.tex' replaced by
            '.tsv' and 'SX' replaced by 'S2'.

    Returns:
        None
    """
    rows = []
    for fastq_name in os.listdir(rna_path):
        # Names containing 'complete' carry one more trailing token to drop.
        n_trailing = 2 if 'complete' in fastq_name else 1
        short_file = fastq_name.rsplit('_', n_trailing)[0]
        smp_id = short_file.rsplit('_', 6)[0]
        # The short ID and donor are taken before any renaming below.
        short_id = '{}-{}'.format(smp_id[10:12], smp_id[6])
        donor = donor_labels[short_id]
        if 'Hc' in short_file:
            id_char = smp_id[6]
            old_tag = 'Hc0' + id_char
            short_file = short_file.replace(old_tag, 'HepG2').replace('_Ct', '_Ct' + id_char)
            smp_id = smp_id.replace(old_tag, 'HepG2') + id_char
        rows.append((short_id, smp_id, donor, short_file))

    table = pd.DataFrame(rows, columns=['ID', 'DEEP sample', 'donor', 'filename'])
    table = table.sort_values(['ID', 'filename']).reset_index(drop=True)

    with pd.option_context('display.max_colwidth', -1):
        table.to_latex(out_tex_table, bold_rows=True, column_format='clcl',
                       encoding='ascii', header=True, index=False)

    if out_tsv_version is None:
        return None

    tsv_name = os.path.basename(out_tex_table).replace('.tex', '.tsv').replace('SX', 'S2')
    table.to_csv(os.path.join(out_tsv_version, tsv_name),
                 sep='\t', header=True, index=False, mode='w')
    return None
        
    
    
        
        
# Generate the three supplementary tables. Each call writes a LaTeX file to
# the given path; the dataset and expression tables also write a TSV copy
# when out_tsv_version is not None.
collect_runtimes(out_tex_runtime_table)
make_dataset_table(out_tex_dataset_table)
make_expression_table(out_tex_expression_table)
        
