In [15]:
import os
import math

import pandas as pd
import numpy as np

"""
What does this do?
Clean up LOLA output tables; replace filenames with more descriptive names
and cap the OR at 1000 (for np.inf cases).
Subset each table to statistically significant rows (-log10(q-value) > 2 and odds ratio > 1).

Dumps tables that have to be manually merged to generate Supp. Table "assembly breaks / enriched annotations"
"""

#print('0.05 - ', -1 * math.log10(0.05))

#qvals = [round(-1 * math.log10(float(q)), 2) for q in qv.strip('"').split()]

# Directory that is scanned for LOLA result tables (*lola.tsv); the cleaned
# tables are written into the same directory.
path = '/home/local/work/data/hgsvc/lola'

# Lookup: annotation file name as listed in the LOLA "filename" column
# -> human-readable label used in the cleaned tables.
# The labels deliberately carry literal double quotes as part of the value.
desc_map = {
    # assembly gaps ("N gaps"), one entry per gap type
    'Ngap_ctg_ucsc.bed': '"N gap (contig)"',
    'Ngap_scf_ucsc.bed': '"N gap (scaffold)"',
    'Ngap_telo_ucsc.bed': '"N gap (telomere)"',
    'Ngap_hetchrom_ucsc.bed': '"N gap (heterochromatin)"',
    'Ngap_shortarm_ucsc.bed': '"N gap (short arm)"',

    # other genomic features
    'centromeres.bed': '"Centromere"',
    'segdups.bed': '"Segmental duplications"',
    'cpg_islands.bed': '"CpG islands"',

    # repeat annotations
    'microsats.bed': '"Microsatellites"',
    # NOTE(review): "RPMSK" may be a typo for "RMSK" (cf. the file name);
    # confirm before changing, the label ends up in the dumped tables.
    'rmsk_highconf_RNArep.tsv': '"RNA repeats (RPMSK)"',
}

# Clean every LOLA result table (*lola.tsv) found in `path` and write a
# reduced, significance-filtered copy (*.clean.tsv) next to the input.

# Metadata columns that are not needed in the cleaned table.
drop_columns = ['cellType', 'tissue', 'antibody', 'treatment', 'dataSource', 'collection']

# Columns written to the cleaned table, in output order.
dump_columns = [
    'description',
    'oddsRatio',
    'qValueLog',
    'support'
]

# os.listdir() returns entries in arbitrary order; sort so that the
# processing (and log) order is reproducible.
for table in sorted(os.listdir(path)):
    if not table.endswith('lola.tsv'):
        continue
    print(table)
    table_path = os.path.join(path, table)
    df = pd.read_csv(table_path, sep='\t', header=0)

    # only the "ucsc_features" collection is of interest here
    df = df.loc[df['collection'] == 'ucsc_features', :].copy()
    df = df.drop(drop_columns, axis=1)
    df['qValueLog'] = -1 * np.log10(df['qValue'])

    # for annotations that are entirely covered in the user set
    # (trivially, e.g., corresponding to N gaps that cannot be aligned to),
    # set the OR to 1000 for easier manipulation of the DataFrame
    select_inf = df['oddsRatio'] == np.inf
    df.loc[select_inf, 'oddsRatio'] = 1000.

    # limit to results considered statistically significant:
    # -log10(q) > 2 (i.e., q < 0.01) and enrichment (OR > 1)
    select_qv = df['qValueLog'] > 2
    select_or = df['oddsRatio'] > 1
    df = df.loc[(select_qv & select_or), :].copy()

    # Replace file names with descriptive labels. A file name without an
    # entry in desc_map is still an error (KeyError, as before), but the
    # message now lists all offending names together with the table.
    description = df['filename'].map(desc_map)
    unmapped = description.isna()
    if unmapped.any():
        missing = sorted(df.loc[unmapped, 'filename'].astype(str).unique())
        raise KeyError(
            'no entry in desc_map for annotation file(s) in {}: {}'.format(
                table, ', '.join(missing)
            )
        )
    df['description'] = description

    df = df.sort_values(['oddsRatio', 'qValueLog'], ascending=False)

    # Swap only the trailing extension; str.replace() on the full path would
    # also rewrite any other ".tsv" occurrence (e.g., in a directory name).
    outfile = os.path.join(path, os.path.splitext(table)[0] + '.clean.tsv')

    # NOTE(review): the labels in desc_map contain literal double quotes;
    # with to_csv's default quoting these are presumably escaped/doubled in
    # the output file - confirm this is the intended format.
    df[dump_columns].to_csv(
        outfile,
        sep='\t',
        index=False,
        header=True
    )


breaks_inv-0-60_any_all.10kb.3c.lola.tsv
breaks_inv-10-60_any_all.10kb.3c.lola.tsv
breaks_inv-20-60_any_all.10kb.3c.lola.tsv
