In [55]:
%matplotlib inline

import os as os
import collections as col
import itertools as itt
import pickle as pck
import time as ti

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import numpy as np
import numpy.random as rng
import scipy.stats as stats
import pandas as pd
import seaborn as sns

# What does this do?
# Plot a histogram of the HSP length
# distribution (intended for merged HSPs)

# Date stamp; used as prefix for the cache file name and for the
# names of all saved figures
date = '20180403'

# Switch: the plotting routine at the end of this cell is only
# executed if this is True
run_plot_hsp_length_dist = True

# Switch: figures are only written to disk if this is True
save_figures = True

# Global seaborn styling (white background, enlarged fonts, DejaVu Sans)
sns.set(style='white',
        font_scale=1.5,
        rc={'font.family': ['sans-serif'],
            'font.sans-serif': ['DejaVu Sans']})

# Project root on the shared file system and the folder that holds
# the pickled notebook caches
fhgfs_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff'
cache_dir = os.path.join(fhgfs_base, 'caching/notebooks')

# Root folder that is searched recursively for 'hsp_run' directories
hsp_files_folder = os.path.join(fhgfs_base, 'solidstate/deep')

# Output locations for figures; only fig_collect is referenced by the
# code visible in this cell (fig_supp / fig_main are presumably used
# by other cells - verify before removing)
base_out = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/statediff'
fig_supp = os.path.join(base_out, 'figures', 'pub', 'supp')
fig_main = os.path.join(base_out, 'figures', 'pub', 'main')
fig_collect = os.path.join(base_out, 'figures', 'pub', 'collection')
                   
    
def collect_hsp_data(rootfolder):
    """
    Collect HSP sizes from all HDF5 files located in 'hsp_run' folders.

    The directory tree below `rootfolder` is walked recursively. In every
    directory whose path ends with 'hsp_run', each '.h5' file is opened
    read-only and the 'num_bins' column of every table stored under a
    '/segments/<scoring>/<chrom>' key is read (keys ending with
    '/thresholds' are ignored).

    File names are expected to look like
    deep_hsp_hg38_ecs10_CELLTYPE_HG_vs_CELLTYPE_Ma.h5
    i.e. the underscore-separated fields at index 3, 5 and 8 supply the
    first three components of the result key. Files without the '.h5'
    extension are skipped; a '.h5' file that violates the naming pattern
    still raises (IndexError), since that indicates a real problem.

    :param rootfolder: top-level directory to search
    :return: defaultdict(list) mapping (field 3, field 5, field 8, scoring)
        to the list of all 'num_bins' values, pooled over all matching
        keys (chromosomes) of the respective file
    """
    collector = col.defaultdict(list)
    for root, _, datafiles in os.walk(rootfolder):
        if not root.endswith('hsp_run'):
            continue
        for fname in datafiles:
            if not fname.endswith('.h5'):
                # stray files (logs, notes, ...) cannot be parsed as
                # HSP containers and must not abort the whole scan
                continue
            # deep_hsp_hg38_ecs10_CELLTYPE_HG_vs_CELLTYPE_Ma.h5
            infos = fname.split('.')[0].split('_')
            seg = infos[3]
            c1, c2 = infos[5], infos[8]
            fpath = os.path.join(root, fname)
            with pd.HDFStore(fpath, 'r') as hdf:
                for key in hdf.keys():
                    if not key.startswith('/segments') or key.endswith('/thresholds'):
                        continue
                    # key layout: '' / 'segments' / scoring / chromosome
                    _, _, scoring, _ = key.split('/')
                    sizes = hdf[key]['num_bins'].values.tolist()
                    collector[(seg, c1, c2, scoring)].extend(sizes)
    return collector
    

def collect_region_overlaps(paths):
    """
    Sum up the overlap between region sets per pair of sample comparisons.

    Every folder in `paths` is walked recursively and each '*.tsv' file is
    read as a headerless, tab-separated table with 23 columns: 11 columns
    describing region A, the same 11 columns describing region B, and a
    final 'overlap' column. The file name (up to the first dot) must have
    the form '<id1>-isect-<id2>'.

    :param paths: iterable of directories to search
    :return: defaultdict(Counter) keyed by (id1, id2); each Counter maps
        (sample1_A, sample2_A, sample1_B, sample2_B) to the summed overlap
    """
    base_names = ('chrom', 'start', 'end', 'name', 'score',
                  'segpv', 'sumpv', 'sample1', 'sample2', 'rank', 'pct')
    header = [name + suffix for suffix in ('_A', '_B') for name in base_names]
    header.append('overlap')
    sample_cols = ['sample1_A', 'sample2_A', 'sample1_B', 'sample2_B']

    overlaps = col.defaultdict(col.Counter)
    for path in paths:
        for folder, _, filenames in os.walk(path):
            for fname in filenames:
                if not fname.endswith('.tsv'):
                    continue
                file_id1, file_id2 = fname.split('.')[0].split('-isect-')
                table = pd.read_csv(os.path.join(folder, fname), sep='\t',
                                    header=None, names=header,
                                    usecols=sample_cols + ['overlap'])
                table['overlap'] = table['overlap'].astype(np.int32)
                per_pair = table.groupby(sample_cols)['overlap'].sum()
                per_pair = per_pair.astype(np.int32)
                # Counter.update adds to counts already collected for
                # the same pair of files
                overlaps[file_id1, file_id2].update(per_pair.to_dict())
    return overlaps


def build_jaccard_dist_matrix(totals, shared, selectors, add_score=False, add_segment=False):
    """
    Build a symmetric, labeled matrix of Jaccard indices.

    For every pair of files in `shared` whose two identifiers both contain
    all strings in `selectors`, the Jaccard index
    overlap / (total_A + total_B - overlap), rounded to 3 digits, is
    computed for each recorded pair of sample comparisons. Cells without a
    recorded overlap (including the diagonal) stay 0.

    Row/column labels are built from the characters at positions 7, 8
    and 3 of the two sample names, joined by ' v '.

    :param totals: mapping file id -> {(sample1, sample2): total}
    :param shared: mapping (file_a, file_b) ->
        {(a1, a2, b1, b2): overlap}
    :param selectors: strings that must occur in both file ids
    :param add_score: append the capitalized first letter of the last
        underscore-separated field of the file id to the label
    :param add_segment: append first (capitalized) and last character of
        the first underscore-separated field of the file id to the label
    :return: pandas DataFrame (float32 zeros on creation) indexed by the
        sorted labels in both dimensions
    """
    def make_label(file_id, first, second):
        label = first[7] + first[8] + first[3] + ' v ' + second[7] + second[8] + second[3]
        if add_score:
            label += ' ' + file_id.split('_')[-1][0].capitalize()
        if add_segment:
            segment = file_id.split('_')[0]
            label += ' ' + segment[0].capitalize() + segment[-1]
        return label

    entries = []
    seen = set()
    for (file_a, file_b), overlaps in shared.items():
        if any((s not in file_a) or (s not in file_b) for s in selectors):
            continue
        for (a1, a2, b1, b2), ovl in overlaps.items():
            a_totals = totals[file_a][(a1, a2)]
            b_totals = totals[file_b][(b1, b2)]
            j = np.round(ovl / (a_totals + b_totals - ovl), 3)
            a_label = make_label(file_a, a1, a2)
            b_label = make_label(file_b, b1, b2)
            # record both orientations to obtain a symmetric matrix
            entries.append((a_label, b_label, j))
            entries.append((b_label, a_label, j))
            seen.update((a_label, b_label))

    labels = sorted(seen)
    matrix = pd.DataFrame(np.zeros((len(labels), len(labels)), dtype=np.float32),
                          index=labels, columns=labels)
    for row, column, value in entries:
        matrix.loc[row, column] = value
    return matrix


def create_histogram(data, title):
    """
    Plot a histogram of HSP sizes, trimmed for visualization.

    Values at or above the (rounded-up) 99th percentile are removed before
    plotting so that a few very long HSPs do not squeeze the bulk of the
    distribution. If that trimming would remove everything (e.g. all
    values are identical), the untrimmed data are plotted instead and the
    legend states 100%. The 75th percentile of the untrimmed data is
    marked by a dashed red line.

    :param data: 1-D numpy array of HSP sizes (boolean-mask indexing is
        used, so a plain list is not sufficient)
    :param title: title of the plot
    :return: tuple (figure, list of extra artists); the list is always
        empty and only exists to be passed on to savefig
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    # both percentiles refer to the complete, untrimmed data
    pct_75 = np.percentile(data, 75)
    pct_99 = np.ceil(np.percentile(data, 99))

    trimmed = data[data < pct_99]
    if trimmed.size > 0:
        data = trimmed
        pct = 99
    else:
        # strict '<' against the ceiled percentile drops all values
        # for (near-)constant data - fall back to the full data
        pct = 100

    num_bins = 25 if data.size < 1000 else 50
    label = '{}% ({} bins)'.format(pct, num_bins)
    # NOTE: distplot is deprecated in recent seaborn releases; kept here
    # to stay compatible with the seaborn version this notebook targets
    sns.distplot(data, kde=False, rug=False, bins=num_bins,
                 ax=ax, color='blue', label=label)
    ax.axvline(pct_75, ymax=0.99, color='red',
               linestyle='dashed', alpha=0.75, label='75%ile')
    ax.legend(loc='upper right')
    ax.set_xlabel('HSP size (genomic bins)')
    ax.set_ylabel('Count')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    tt = ax.set_title(title)
    # lift the title slightly above the axes
    tt.set_position([0.5, 1.02])
    return fig, []
    
            
def plot_hsp_length_dist():
    """
    Create one HSP size histogram per tool / scoring / sample comparison.

    The HSP sizes are read from a pickle cache (named after the module
    level `date`) if it exists; otherwise they are collected from
    `hsp_files_folder` and the cache is written. The cache is a file this
    notebook produced itself - do not point it to pickles of unknown
    origin.

    Combinations of tool, scoring and comparison for which no HSP sizes
    were collected are skipped: the percentile computations in
    create_histogram need at least one value.

    If the module-level switch `save_figures` is set, each figure is saved
    as svg, pdf and png (300 dpi) to `fig_collect`.

    :return: always 0
    """
    cache_file = os.path.join(cache_dir, '{}_plot_hsp_lendist.pck'.format(date))
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as cache:
            sizes = pck.load(cache)
    else:
        sizes = collect_hsp_data(hsp_files_folder)
        with open(cache_file, 'wb') as cache:
            pck.dump(sizes, cache)

    tools = sorted({tool for tool, _, _, _ in sizes})
    scorings = sorted({scoring for _, _, _, scoring in sizes})
    comparisons = sorted({(c1, c2) for _, c1, c2, _ in sizes})

    # file extension -> additional savefig arguments
    save_formats = (('.svg', {}), ('.pdf', {}), ('.png', {'dpi': 300}))

    for tool, scoring, (c1, c2) in itt.product(tools, scorings, comparisons):
        # .get() instead of [] to avoid inserting empty lists for
        # combinations that do not exist in the (default)dict
        values = sizes.get((tool, c1, c2, scoring))
        if not values:
            continue
        dataset = np.array(values, dtype=np.int32)
        title = 'HSP size dist. (N={}): {} vs {} - '\
                '{} ({} scoring)'.format(len(dataset), c1, c2, tool.upper(), scoring)
        fig, exart = create_histogram(dataset, title)
        if save_figures:
            outname = '{}_fig_X_hsp_lendist_{}_{}_{}_vs_{}'.format(date, tool, scoring, c1, c2)
            for ext, extra_args in save_formats:
                fig.savefig(os.path.join(fig_collect, outname + ext),
                            bbox_inches='tight', extra_artists=exart, **extra_args)
        # free the figure, otherwise all of them stay open in pyplot
        plt.close(fig)
    return 0
     
    
# Entry point of this cell: create the HSP length histograms only if
# enabled via the run_plot_hsp_length_dist switch. Kept as a statement
# (not an expression) so the return value 0 is not displayed as cell output.
if run_plot_hsp_length_dist:
    plot_hsp_length_dist()
