In [19]:
import pandas as pd
import os
import collections as col
import numpy as np
import sklearn.cluster as sklclust
import pickle as pck

in_file = '/home/local/work/data/hgsvc/roi/20201026_DP_LOHs_minSeg100.bed'

def split_loh_regions():

    out_path = '/home/local/work/data/hgsvc/roi/loh_splits'

    df = pd.read_csv(in_file, sep='\t', header=0, names=['#chrom', 'start', 'end', 'sample'])

    for sample, regions in df.groupby('sample'):
        sub = regions.sort_values(['#chrom', 'start'], inplace=False)
        sub['name'] = sub['sample'] + '_' + sub['#chrom'] + '_' + sub['start'].astype(str)
        out_file = '{}_loh.bed'.format(sample)
        sub.to_csv(
            os.path.join(out_path, out_file),
            sep='\t',
            columns=['#chrom', 'start', 'end', 'name'],
            header=True,
            index=False)
    return

# split_loh_regions()

def load_segdup_overlap():
    """
     bedtools intersect -wao
     -a 20201026_DP_LOHs_minSeg100.bed
     -b GRCh38_segdups.bed
     > 20201026_DP_LOHs_minSeg100.sd-ovl.bed 
    """
    path = '/home/local/work/data/hgsvc/roi/20201026_DP_LOHs_minSeg100.sd-ovl.bed'
    df = pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=[
            'chrom',
            'start',
            'end',
            'sample',
            'sd_chrom',
            'sd_start',
            'sd_end',
            'sd_name',
            'sd_pct_id',
            'sd_strand',
            'overlap'
        ],
        usecols=['chrom', 'start', 'end', 'sample', 'sd_pct_id', 'overlap']
    )
    df['region_id'] = df['sample'] + '_' + df['chrom'] + '_' + df['start'].astype(str)
    
    uniq_values = []
    uniq_index = []
    
    for idx, row in df.groupby('region_id')[['sd_pct_id', 'overlap']]:
        region_id = row['region_id'].values[0]
        chrom = row['chrom'].values[0]
        start = row['start'].values[0]
        end = row['end'].values[0]
        
        if row['overlap'].values[0] == 0:
            uniq_values.append((chrom, start, end, 0., 0.))
            uniq_index.append(region_id)
            continue
        total_overlap = row['overlap'].sum()
        region_size = (row['end'] - row['start']).values[0]
        sd_weights = row['overlap'] / total_overlap
        avg_sd_pct = (np.average(row['sd_pct_id'], weights=sd_weights) /  1000).round(3)
        avg_sd_ovl = (total_overlap / region_size).round(3)
        
        uniq_values.append((chrom, start, end, avg_sd_pct, avg_sd_ovl))
        uniq_index.append(region_id)
        
    df = pd.DataFrame(
        uniq_values,
        columns=['chrom', 'start', 'end', 'avg_sd_pct_id', 'avg_sd_pct_ovl'],
        index=uniq_index
    )
    df.index.name = 'region_id'
    
    return df


cov_tables = '/home/local/work/data/hgsvc/roi/loh_splits/avg_cov'

table_columns = [
    'region_id',
    'region_size',
    'coverage_bp',
    'coverage_cum',
    'coverage_mean',
    'coverage_nz_mean'
]

def load_cov_tables():
    merged = col.defaultdict(list)
    tables = [f for f in os.listdir(cov_tables) if f.endswith('.tsv')]
    for table in tables:
        sample = table.split('_')[0]
        if '-clr' in table:
            tech = 'CLR'
        else:
            tech = 'HiFi'
        hap = table.rstrip('cov_loh.tsv').rsplit('.', 1)[-1]
        df = pd.read_csv(
            os.path.join(cov_tables, table),
            sep='\t',
            header=None,
            names=table_columns
        )
        df['sample'] = sample
        df['tech'] = tech
        df['coverage_pct'] = (df['coverage_bp'] / df['region_size']).round(2)
        df.columns = [hap + '_' + c if c.startswith('coverage') else c for c in df.columns]
        df.index = pd.MultiIndex.from_tuples(
            [(sample, tech, reg_id) for reg_id in df['region_id']],
            names=['sample', 'tech', 'region_id']
        )
        df.drop(
            [
                'region_size',
                'region_id',
                'sample',
                'tech',

            ], axis=1, inplace=True
        )
        merged[(sample, tech)].append(df)
        
    final_merge = []
    for k, v in merged.items():
        tmp = v[0].join(v[1:], how='outer')
        tmp.drop(
            [
                'h1_coverage_cum',
                'h2_coverage_cum',
                'un_coverage_cum'
            ], axis=1, inplace=True
        )
        final_merge.append(tmp)
    final_merge = pd.concat(final_merge, axis=0, ignore_index=False)
    
    return final_merge


def load_input_coverage():
    path = '/home/local/work/data/hgsvc/fig1_panels/input_read_stats/input_read_stats.tsv'
    df = pd.read_csv(path, sep='\t', header=None, names=['sample', 'super_pop', 'pop', 'tech', 'input_cov'])
    df.index = pd.MultiIndex.from_tuples(
        [(row[0], row[3]) for row in df.itertuples(index=False)],
        names=['sample', 'tech']
    )
    df = df.loc[~df['sample'].isin(['HG00733', 'HG00514', 'NA19240']), :].copy()
    df.drop(['sample', 'tech', 'pop'], axis=1, inplace=True)
    return df


df = load_cov_tables()
input_cov = load_input_coverage()
sd_ovl = load_segdup_overlap()

df = df.join(sd_ovl, how='outer')
df = df.join(input_cov, how='outer')

df['read_cov'] = (df['h1_coverage_mean'] + df['h2_coverage_mean'] + df['un_coverage_mean']).round(3)

df['feat_un_cov'] = (df['un_coverage_mean'] / df['input_cov']).round(3)
df['feat_h1_cov'] = (df['h1_coverage_mean'] / df['input_cov']).round(3)
df['feat_h2_cov'] = (df['h2_coverage_mean'] / df['input_cov']).round(3)
df['feat_locus_cov'] = (df['read_cov'] / df['input_cov']).round(3)
df['feat_h1_loc'] = df['h1_coverage_pct']
df['feat_h2_loc'] = df['h2_coverage_pct']
df['feat_un_loc'] = df['un_coverage_pct']
df['feat_avg_sd_id'] = df['avg_sd_pct_id']
df['feat_avg_sd_ovl'] = df['avg_sd_pct_ovl']

cluster_features = [
    c for c in df.columns if c.startswith('feat')
]

df['cluster_LOH'] = -1
df['cluster_LOHSD'] = -1

model_output = '/home/local/work/data/hgsvc/roi'
if False:

    clustering_models = []

    for num_clust in [2, 3]:
        
        if num_clust == 2:
            use_features = [c for c in cluster_features if '_sd_' not in c]
        else:
            use_features = cluster_features

        kmeans_model = sklclust.KMeans(
            n_clusters=num_clust,
            init='k-means++',
            n_init=50,
            max_iter=500,
            tol=0.0001,
            verbose=False,
            algorithm='full'
        )

        labels = kmeans_model.fit_predict(df[use_features])
        label = 'cluster_LOH' if num_clust == 2 else 'cluster_LOHSD'
        
        for l in set(labels):
            df.loc[labels == l, label] = l
            
        
        model_file = 'kmeans_n{}.pck'.format(num_clust)
        with open(os.path.join(model_output, model_file), 'wb') as dump:
            _ = pck.dump(kmeans_model, dump)
            
    with pd.HDFStore(os.path.join(model_output, 'loh_regions_labeled.h5'), 'w') as hdf:
        hdf.put('loh_regions', df)

with pd.HDFStore(os.path.join(model_output, 'loh_regions_labeled.h5'), 'r') as hdf:
        df = hdf['loh_regions']

df['cLOH_is_LOH'] = -1
df['cLOHSD_is_LOH'] = -1

df.loc[df['cluster_LOH'] == 0, 'cLOH_is_LOH'] = 0
df.loc[df['cluster_LOH'] == 1, 'cLOH_is_LOH'] = 1

df.loc[df['cluster_LOHSD'] == 0, 'cLOHSD_is_LOH'] = 1
df.loc[df['cluster_LOHSD'] == 1, 'cLOHSD_is_LOH'] = 0
df.loc[df['cluster_LOHSD'] == 2, 'cLOHSD_is_LOH'] = -1

df['BNG_SV_DEL_clusterID'] = 'none'
df['BNG_SV_DEL_cluster_size'] = 'none'

sv_cluster_file = '/home/local/work/data/hgsvc/roi/bng_sv_del_nonovl.tsv'
sv_clusters = pd.read_csv(sv_cluster_file, sep='\t', header=0)


for idx, row in df.iterrows():
    chrom_match = row['chrom'].strip('chr') + '$'
    select_chrom = sv_clusters['Chr'].str.match(chrom_match)
    select_start = (sv_clusters['Start'] - 100000) < row['end']
    select_end = (sv_clusters['End'] + 100000) > row['start']
    select_pop = sv_clusters[row['super_pop']] > 0
    select_combined = select_chrom & select_start & select_end & select_pop
    if not select_combined.any():
        continue
    selected = sv_clusters.loc[select_combined, :]
    labels = ','.join(selected['#ClusterID'].astype(str).values)
    sizes = ','.join(selected['ClusterSVsize'].astype(str).values)
    df.loc[idx, 'BNG_SV_DEL_clusterID'] = labels
    df.loc[idx, 'BNG_SV_DEL_cluster_size'] = sizes
    
df.reset_index(drop=False, inplace=True)

total = df.shape[0]
loh_loh = (df['cLOH_is_LOH'] == 1).sum()
lohsd_loh = (df['cLOH_is_LOH'] == 1).sum()


loh_regions = set()
loh_regions_with_del = set()
loh_counts = col.Counter()
loh_regions_bng_cluster = set()

clr_lengths = set()
hifi_lengths = set()

for (sample, tech), regions in df.groupby(['sample', 'tech']):
    
    select_is_loh = regions['cLOH_is_LOH'] == 1
    select_has_cluster = regions['BNG_SV_DEL_clusterID'] != 'none'
    
    sample_loh = regions.loc[select_is_loh, 'region_id'].values
    
    region_lengths = (regions.loc[select_is_loh, 'end'] - regions.loc[select_is_loh, 'start']).values
    if tech == 'CLR':
        clr_lengths = clr_lengths.union(
            set(
                [(region_id, region_length) 
                 for region_id, region_length in zip(sample_loh, region_lengths)
                ]
            )
        )
    else:
        hifi_lengths = hifi_lengths.union(
            set(
                [(region_id, region_length) 
                 for region_id, region_length in zip(sample_loh, region_lengths)
                ]
            )
        )
    
    loh_counts[(sample, tech)] += sample_loh.size
    loh_regions = loh_regions.union(set(sample_loh))
    
    sample_loh_with_del = regions.loc[select_is_loh & select_has_cluster, 'region_id'].values
    loh_regions_with_del = loh_regions_with_del.union(set(sample_loh_with_del))
    

print('clr mean ', np.mean(np.array([l for i, l in clr_lengths], dtype=np.int64)))
print('hifi mean ', np.mean(np.array([l for i, l in hifi_lengths], dtype=np.int64)))
    
print(loh_counts)
print('Unique LOH regions ', len(loh_regions))
print('Unique LOH regions with DEL ', len(loh_regions_with_del))
print('Regions identified ', df['region_id'].nunique())


hifi_loh = [c for (s, t), c in loh_counts.items() if t == 'HiFi']
clr_loh = [c for (s, t), c in loh_counts.items() if t == 'CLR']

print(np.median(hifi_loh))
print(np.median(clr_loh))
raise
    
    

df.sort_values(['sample', 'tech', 'chrom', 'start'], inplace=True)

dump_columns = [
    'chrom',
    'start',
    'end',
    'sample',
    'tech',
    'cLOH_is_LOH',
    'cLOHSD_is_LOH',
    'BNG_SV_DEL_clusterID',
    'BNG_SV_DEL_cluster_size'
]
dump_columns.extend(cluster_features)

out_dump = '/home/local/work/data/hgsvc/roi/20201026_DP_LOHs_minSeg100.BNG-DEL.bed'

with open(out_dump, 'w') as dump:
    _ = dump.write('#')
    df.to_csv(
        dump,
        sep='\t',
        header=True,
        index=False,
        columns=dump_columns
    )



clr mean  2412559.975609756
hifi mean  1857356.5129533678
Counter({('HG00513', 'HiFi'): 35, ('NA12878', 'HiFi'): 35, ('HG00512', 'HiFi'): 25, ('HG00731', 'HiFi'): 25, ('HG00732', 'HiFi'): 24, ('NA24385', 'HiFi'): 15, ('NA19238', 'HiFi'): 9, ('NA19650', 'CLR'): 9, ('HG00171', 'CLR'): 8, ('HG00732', 'CLR'): 8, ('HG03486', 'HiFi'): 8, ('NA19239', 'HiFi'): 8, ('HG00096', 'CLR'): 7, ('HG01114', 'CLR'): 7, ('NA20847', 'CLR'): 7, ('HG02818', 'HiFi'): 5, ('HG03125', 'HiFi'): 4, ('HG00864', 'CLR'): 3, ('HG01505', 'CLR'): 3, ('HG03065', 'CLR'): 3, ('NA12329', 'CLR'): 3, ('HG00731', 'CLR'): 2, ('HG02011', 'CLR'): 2, ('HG03371', 'CLR'): 2, ('HG03683', 'CLR'): 2, ('HG03732', 'CLR'): 2, ('NA18534', 'CLR'): 2, ('NA20509', 'CLR'): 2, ('HG00512', 'CLR'): 1, ('HG00513', 'CLR'): 1, ('HG01596', 'CLR'): 1, ('HG02492', 'CLR'): 1, ('HG02587', 'CLR'): 1, ('HG03009', 'CLR'): 1, ('NA18939', 'CLR'): 1, ('NA19238', 'CLR'): 1, ('NA19239', 'CLR'): 1, ('NA19983', 'CLR'): 1})
Unique LOH regions  261
Unique LOH region

RuntimeError: No active exception to reraise