In [8]:
import os
import importlib
import pandas as pd
import numpy as np
import collections as col

"""
Purpose of this cell:
Dump the supplementary table "MAPQ60 ALT contig coverage"
(or an extension of the existing supplementary table listing MAPQ60 contig coverage).
"""

# "import importlib" alone does not guarantee that the "importlib.util"
# submodule has been loaded (it only works when something else, e.g. the
# notebook kernel, imported it before), so import it explicitly here.
import importlib.util

# Load the project's helper module directly from its file path
# (it is not installed as an importable package).
plot_aux_module = '/home/local/work/code/github/project-diploid-assembly/notebooks/aux_mods/plot_aux.py'
plot_aux_spec = importlib.util.spec_from_file_location("plot_aux", plot_aux_module)
plot_aux = importlib.util.module_from_spec(plot_aux_spec)
plot_aux_spec.loader.exec_module(plot_aux)

# Sample annotation (looked up by sample name below) and population colour codes
samples = plot_aux.load_sample_table()
hexcodes, rgbcodes, popmap = plot_aux.load_population_annotation()

# Reference contig size table and folder with the contig-to-reference alignments
ref_contigs = '/home/local/work/pipeline/run_folder/references/assemblies/GRCh38_HGSVC2_incalt.sizes'
contig_aln_folder = '/home/local/work/data/hgsvc/contig_aln_bed_incalt'
# Alignments with a mapping quality below this value are discarded
min_mapq = 60


def load_alt_contigs(fpath):
    """
    Load all ALT contigs from a whitespace-separated contig size table.

    Only the first two columns (contig name, contig size) are used, so
    tables with additional columns are accepted as well. Blank lines are
    skipped. A contig is kept if its name ends with "_alt" and the part
    of the name before the first underscore is a numbered chromosome,
    chrX or chrY; all other contigs are ignored.

    Args:
        fpath: path to the contig size table

    Returns:
        pandas.DataFrame with columns "sort_order" (chromosome number,
        23 for chrX, 24 for chrY), "size" and "chrom" (the full ALT contig
        name), sorted by chromosome, then by decreasing size, then by name,
        with a fresh 0..n-1 index.

    Raises:
        ValueError: if a non-blank line has fewer than two columns or the
            size column is not an integer
    """
    sex_chrom_order = {'chrX': 23, 'chrY': 24}

    alt_contigs = []
    with open(fpath, 'r') as table:
        for line in table:
            fields = line.split()
            if not fields:
                # blank line, e.g., trailing newline at the end of the file
                continue
            if len(fields) < 2:
                raise ValueError(
                    'Expected at least two columns (name, size), got: {!r}'.format(line)
                )
            contig, size = fields[0], fields[1]
            if not contig.endswith('_alt'):
                continue
            chrom = contig.split('_')[0]
            if chrom in sex_chrom_order:
                sort_order = sex_chrom_order[chrom]
            else:
                # remove the literal "chr" prefix; str.strip('chr') would
                # strip any of the characters c/h/r from both ends instead
                chrom_id = chrom[3:] if chrom.startswith('chr') else chrom
                try:
                    sort_order = int(chrom_id)
                except ValueError:
                    # neither a numbered chromosome nor chrX/chrY
                    continue
            alt_contigs.append((sort_order, int(size), contig))

    df = pd.DataFrame.from_records(alt_contigs, columns=['sort_order', 'size', 'chrom'])
    df = df.sort_values(
        ['sort_order', 'size', 'chrom'],
        ascending=[True, False, True]
    ).reset_index(drop=True)

    return df


def load_sample_info(filename, sample_table=samples, rgb_codes=rgbcodes):
    """
    Derive the sample annotation for an alignment file from its file name.

    Args:
        filename: base name of the alignment file
        sample_table: mapping of sample name to an annotation record with
            the keys "sex", "super_population" and "population"
        rgb_codes: mapping of population to colour code

    Returns:
        None if the file name does not end with "incalt.bed"; otherwise the
        tuple (super_population, population, sample, sex, platform,
        haplotype, relation, colour), where sex is "M" for samples annotated
        as "male" and "F" for everything else, haplotype is 10 for "h1-un"
        files and 20 for "h2-un" files, and relation is "related" for the
        three hard-coded samples below and "unrelated" otherwise.

    Raises:
        ValueError: if the file name contains neither "h1-un" nor "h2-un"
            (previously a bare "raise" that failed with an unrelated
            "No active exception to re-raise" RuntimeError)
    """
    if not filename.endswith('incalt.bed'):
        return None
    sample, platform = plot_aux.extract_sample_platform(filename, mapped_readset=True, long_read_pos=1)
    sample_info = sample_table[sample]

    if 'h1-un' in filename:
        hap = 10
    elif 'h2-un' in filename:
        hap = 20
    else:
        raise ValueError(
            'Cannot determine haplotype (expected "h1-un" or "h2-un") '
            'from file name: {}'.format(filename)
        )

    sex = 'M' if sample_info['sex'] == 'male' else 'F'
    relation = 'related' if sample in ('HG00733', 'HG00514', 'NA19240') else 'unrelated'

    super_pop = sample_info['super_population']
    population = sample_info['population']
    return (super_pop, population, sample, sex, platform, hap, relation, rgb_codes[population])


def _merged_interval_length(intervals):
    """
    Return the total length covered by the union of (start, end) intervals.

    The intervals are sorted first, so the result does not depend on the
    input order. Overlapping and directly adjacent intervals are merged
    and each merged block contributes (end - start).
    """
    total = 0
    block_start = None
    block_end = None
    for start, end in sorted(intervals):
        if block_start is None:
            block_start, block_end = start, end
        elif start <= block_end:
            # overlaps or touches the current block: extend it
            block_end = max(block_end, end)
        else:
            total += block_end - block_start
            block_start, block_end = start, end
    if block_start is not None:
        total += block_end - block_start
    return total


def load_contig_alignments(fpath, alt_contigs):
    """
    Summarize the ALT contig coverage of a single alignment file.

    The file is read as a tab-separated table without header and with the
    columns chrom, start, end, contig, mapq, strand. Only alignments with
    mapq >= min_mapq to references whose name ends with "_alt" are used.

    Args:
        fpath: path to the alignment file; its base name must be
            recognized by load_sample_info
        alt_contigs: DataFrame with the columns "chrom" (ALT contig name)
            and "size", as returned by load_alt_contigs

    Returns:
        pandas.DataFrame with one row per entry of alt_contigs (row index
        levels "alt_contig" and "contig_size") and three columns that carry
        the sample annotation in the column MultiIndex:
        coverage/count (number of distinct aligned contigs, int64),
        coverage/bp (number of covered bases, int64) and
        coverage/pct (covered bases in percent of the ALT contig size,
        rounded to two decimals). ALT contigs without alignments are 0.

    Raises:
        ValueError: if load_sample_info does not recognize the file name
    """
    # Resolve the sample annotation first to fail early, i.e., before
    # reading the file, and with a clear message instead of an unpacking error.
    sample_info = load_sample_info(os.path.basename(fpath))
    if sample_info is None:
        raise ValueError('Not a recognized contig alignment file: {}'.format(fpath))
    spop, pop, sample, sex, platform, hap, relation, _ = sample_info

    df = pd.read_csv(
        fpath,
        sep='\t',
        names=['chrom', 'start', 'end', 'contig', 'mapq', 'strand']
    )
    keep = (df['mapq'] >= min_mapq) & df['chrom'].str.endswith('_alt')
    df = df.loc[keep, :].copy()

    idx_cov_count = (spop, pop, sample, sex, platform, hap, relation, 'coverage', 'count')
    idx_cov_bp = (spop, pop, sample, sex, platform, hap, relation, 'coverage', 'bp')
    idx_cov_pct = (spop, pop, sample, sex, platform, hap, relation, 'coverage', 'pct')

    col_index = pd.MultiIndex.from_tuples(
        [idx_cov_count, idx_cov_bp, idx_cov_pct],
        names=[
            'super_population',
            'population',
            'sample',
            'sex',
            'platform',
            'haplotype',
            'relation',
            'statistic',
            'unit'
        ]
    )

    row_index = pd.MultiIndex.from_tuples(
        list(zip(alt_contigs['chrom'], alt_contigs['size'])),
        names=['alt_contig', 'contig_size']
    )

    # start with zero coverage for all ALT contigs
    new_df = pd.DataFrame(
        np.zeros((alt_contigs.shape[0], 3), dtype=np.float64),
        index=row_index,
        columns=col_index
    )

    for alt_ctg, alt_aln in df.groupby('chrom'):
        alt_contig_length = alt_contigs.loc[alt_contigs['chrom'] == alt_ctg, 'size'].values[0]
        row_idx = alt_ctg, alt_contig_length

        aligned_contigs = alt_aln['contig'].nunique()

        # Bases covered by several alignments must be counted only once;
        # the merge sorts the intervals itself because the order of the
        # records in the input file is not guaranteed.
        total_aln_length = _merged_interval_length(
            zip(alt_aln['start'].tolist(), alt_aln['end'].tolist())
        )
        pct_aln_length = round(total_aln_length / alt_contig_length * 100, 2)

        new_df.loc[row_idx, idx_cov_count] = aligned_contigs
        new_df.loc[row_idx, idx_cov_bp] = total_aln_length
        new_df.loc[row_idx, idx_cov_pct] = pct_aln_length

    # counts and base pairs are integral; only the percentage stays float
    new_df[idx_cov_count] = new_df[idx_cov_count].astype('int64')
    new_df[idx_cov_bp] = new_df[idx_cov_bp].astype('int64')

    return new_df


alt_contigs = load_alt_contigs(ref_contigs)

bed_path = '/home/local/work/data/hgsvc/contig_aln_bed_incalt'
cache_file = '/home/local/work/data/hgsvc/aln_summary/ctg_alt_aln.cache.h5'
cache_key = 'ctg_alt_aln'

# The per-file summaries are only recomputed if there is no cache file yet;
# delete the cache file to force a rebuild.
if not os.path.isfile(cache_file):

    # Only process the files that load_sample_info accepts (any other file
    # in the folder would abort the run) and sort the names to get a
    # reproducible column order; os.listdir returns an arbitrary order.
    bed_files = sorted(
        name for name in os.listdir(bed_path)
        if name.endswith('incalt.bed')
    )
    if not bed_files:
        raise ValueError('No "*incalt.bed" files found in folder: {}'.format(bed_path))

    all_alt_align = []

    for bedfile in bed_files:
        bedfile_path = os.path.join(bed_path, bedfile)
        aln = load_contig_alignments(bedfile_path, alt_contigs)
        all_alt_align.append(aln)

    # all tables share the same row index, hence combine them column-wise
    all_alt_align = pd.concat(all_alt_align, axis=1, ignore_index=False)

    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with pd.HDFStore(cache_file, 'w', complevel=9) as hdf:
        hdf.put(cache_key, all_alt_align, format='fixed')

# always read from the cache, and name the stored table explicitly
df = pd.read_hdf(cache_file, key=cache_key)
print(df.shape)
    
def compute_response_stats(df, thresholds=(50, 75, 90)):
    """
    Print summary statistics about highly covered ALT contigs.

    Only the columns with relation "unrelated" and unit "pct" are used.
    For each threshold, the following is printed: the number of ALT contigs
    (rows) with a value above the threshold in at least one column, the
    summed "contig_size" of these rows, and the median and maximum number
    of columns per row that exceed the threshold (labelled "assm" in the
    output). The comparison is strict (value > threshold).

    Args:
        df: DataFrame with the column levels "relation" and "unit" and the
            row index level "contig_size"
        thresholds: percent thresholds to report; defaults to 50, 75, 90

    Returns:
        None; all results are printed
    """
    # select only unrelated samples; the key must be a tuple because
    # list keys are no longer accepted by DataFrame.xs in pandas >= 2.0
    unrel = df.xs(('unrelated', 'pct'), level=['relation', 'unit'], axis=1)
    print(unrel.shape)

    for t in thresholds:
        print('threshold ', t)
        above_threshold = unrel.loc[(unrel > t).any(axis=1), :]
        above_assm = (above_threshold > t).sum(axis=1)
        print('>>> num alt contigs ', above_threshold.shape[0])
        print('>>> total bp alt contigs ', above_threshold.index.get_level_values('contig_size').values.sum())
        print('>>> median num assm ', above_assm.median())
        print('>>> max num assm ', above_assm.max())
    return
        
    
# Print the threshold summary for the cached coverage table
# (the function only prints; its return value is None and is discarded).
_ = compute_response_stats(df)

def dump_supptable_columns(df, out_tsv='/home/local/work/data/hgsvc/aln_summary/alt_contig_cov.tsv'):
    """
    Write the per-column ALT contig coverage summary as a TSV table.

    For each column group of the input (all column levels except "unit"),
    the output contains:
    total_alt_cov - sum of the "bp" values over all rows
    total_pct_cov - total_alt_cov in percent of the summed "contig_size"
        of all rows, rounded to two decimals
    num_alt_geq50 - number of rows with a "pct" value above 50

    The summed "contig_size" is also printed.

    Args:
        df: DataFrame with the column level "unit" (containing "bp" and
            "pct"), the column levels "platform", "sample" and "haplotype"
            (used for sorting the output), and the row index level
            "contig_size"
        out_tsv: path of the output table; defaults to the location that
            was previously hard-coded

    Returns:
        None; the table is written to out_tsv
    """
    total_alt_size = df.index.get_level_values('contig_size').values.sum()
    print(total_alt_size)
    total_alt_coverage = df.xs('bp', level='unit', axis=1).sum(axis=0)
    total_pct_coverage = (total_alt_coverage / total_alt_size * 100).round(2)

    # NOTE(review): the comparison is strict (> 50) although the column is
    # named "geq50"; kept as is to not change reported numbers - confirm
    # which one is intended.
    num_alt_above = (df.xs('pct', level='unit', axis=1) > 50).sum(axis=0)

    dump = pd.concat(
        [total_alt_coverage, total_pct_coverage, num_alt_above],
        axis=1,
        ignore_index=False
    )
    dump.columns = ['total_alt_cov', 'total_pct_cov', 'num_alt_geq50']
    dump.sort_index(axis=0, level=['platform', 'sample', 'haplotype'], inplace=True)
    dump.to_csv(out_tsv, sep='\t', header=True, index=True)
    
#dump_supptable_columns(df)

(261, 264)
(261, 76)
threshold  50
>>> num alt contigs  34
>>> total bp alt contigs  18159658
>>> median num assm  3.0
>>> max num assm  67
threshold  75
>>> num alt contigs  23
>>> total bp alt contigs  12916125
>>> median num assm  2.0
>>> max num assm  67
threshold  90
>>> num alt contigs  18
>>> total bp alt contigs  10908115
>>> median num assm  2.5
>>> max num assm  67
