# P(s) for one specific region

In [None]:
import pprint
import os
import sys
import collections

import multiprocessing as mp

import bioframe
import click
import cooler
import cooltools
import cooltools.expected
import matplotlib.pyplot as plt
import matplotlib.gridspec
from matplotlib.lines import Line2D
import numpy as np
from numpy.lib.function_base import average
from pairlib.scalings import norm_scaling
import pandas as pd
import pairlib
import pairlib.scalings
import pairtools
from diskcache import Cache
from itertools import combinations
import itertools

from pandas.io.pytables import IndexCol

from diskcache import Cache
from itertools import combinations

from pandas.io.pytables import IndexCol

# Tell NumPy to ignore floating-point "divide by zero" and "invalid operation"
# conditions instead of emitting warnings for them.
np.seterr(divide='ignore', invalid='ignore')

In [None]:
# User-provided parameters (edit these before running the notebook).

# Paths of the pairs files; one scaling curve is computed per entry.
pairs_paths = []
# Destination of the figure written by plot_scalings (passed to savefig).
out_path = ''
# Curve labels; labels[i] is used for pairs_paths[i].
labels = []
# Figure title (used as the suptitle).
title = ''
# Region that is passed to bioframe.select together with the chromsizes table.
# NOTE(review): looks like (chromosome, start, end) -- confirm.
region = ('chrXII', 150000, 468931)
# Assembly name handed to bioframe.fetch_chromsizes.
assembly = 'sacCer3'
# Not read by any code visible in this notebook.
centromeres_path = ''
# Read as a module-level setting by calc_scalings and forwarded to
# calc_pair_freqs, where it enables division by a normalisation factor.
normalized = True
# When true, plot_scalings adds a second panel with the log-log slope.
plot_slope = True
# Not read by any code visible in this notebook.
no_cache = True

In [None]:
def calc_pair_freqs(scalings, trans_levels, calc_avg_trans, normalized):
    """
    Turn aggregated scaling counts into a contact-frequency curve.

    Parameters
    ----------
    scalings
        Table with the columns ``min_dist``, ``max_dist``, ``n_pairs`` and
        ``n_bp2`` (one row per distance bin).
    trans_levels
        Table with the columns ``n_pairs`` and ``n_bp2``; only read when
        ``calc_avg_trans`` is true.
    calc_avg_trans : bool
        If true, also compute the average trans level as
        ``sum(n_pairs) / sum(n_bp2)`` over ``trans_levels``.
    normalized : bool
        If true, divide the frequencies (and the average trans level, when
        present) by ``pairlib.scalings.norm_scaling_factor`` evaluated with
        ``anchor=int(1e3)``.

    Returns
    -------
    tuple
        ``((dist_bin_mids, pair_frequencies), avg_trans)``. Both curves are
        restricted to bins whose un-normalised frequency is positive;
        ``avg_trans`` is ``None`` when ``calc_avg_trans`` is false.
    """
    # Geometric mean of the bin edges is used as the bin position.
    dist_bin_mids = np.sqrt(scalings.min_dist * scalings.max_dist)
    pair_frequencies = scalings.n_pairs / scalings.n_bp2
    # The mask is taken before normalisation, so it reflects the raw data.
    mask = pair_frequencies > 0

    avg_trans = None
    if calc_avg_trans:
        # Fixed: the column is ``n_bp2`` (was misspelled ``np_bp2``, which
        # raised AttributeError whenever calc_avg_trans was enabled).
        avg_trans = (
            trans_levels.n_pairs.astype('float64').sum() /
            trans_levels.n_bp2.astype('float64').sum()
        )

    if normalized:
        norm_fact = pairlib.scalings.norm_scaling_factor(
            dist_bin_mids, pair_frequencies, anchor=int(1e3)
        )
        pair_frequencies = pair_frequencies / norm_fact
        # Explicit None check: a trans level of exactly 0.0 is a valid value
        # and must not be turned into None only on the normalised path.
        if avg_trans is not None:
            avg_trans = avg_trans / norm_fact

    return (dist_bin_mids[mask], pair_frequencies[mask]), avg_trans

In [None]:
def calc_scalings(path, label, regions, chromsizes, dist_range=(int(1e1), int(1e9)), n_dist_bins=128, chunksize=int(1e7)):
    """
    Compute the cis scaling curve of one pairs file.

    Parameters
    ----------
    path
        Pairs file handed to ``pairlib.scalings.compute_scaling``.
    label
        Opaque identifier that is returned unchanged so the caller can match
        the result to its input.
    regions, chromsizes
        Forwarded unchanged to ``compute_scaling``.
    dist_range, n_dist_bins, chunksize
        Forwarded to ``compute_scaling``. (Previously these arguments were
        accepted but ignored in favour of hard-coded values identical to the
        defaults, so default behaviour is unchanged.)

    Returns
    -------
    tuple
        ``(label, (dist_bin_mids, pair_frequencies), avg_trans)``.
        ``avg_trans`` is always ``None`` here because the average trans level
        is not requested from ``calc_pair_freqs``.

    Notes
    -----
    Reads the module-level ``normalized`` setting to decide whether the
    frequencies are normalised.
    """
    cis_scalings, trans_levels = pairlib.scalings.compute_scaling(
        path,
        regions,
        chromsizes,
        dist_range=dist_range,
        n_dist_bins=n_dist_bins,
        chunksize=chunksize,
    )

    # Keep only rows whose four region coordinates are all non-negative.
    # NOTE(review): negative coordinates presumably mark pairs that could not
    # be assigned to a region -- confirm against pairlib.
    has_valid_coords = (
        (cis_scalings.start1 >= 0)
        & (cis_scalings.end1 >= 0)
        & (cis_scalings.start2 >= 0)
        & (cis_scalings.end2 >= 0)
    )
    cis_scalings = cis_scalings[has_valid_coords]

    # Collapse all remaining rows into one row per distance bin.
    sc_agg = (
        cis_scalings
        .groupby(['min_dist', 'max_dist'])
        .agg({'n_pairs': 'sum', 'n_bp2': 'sum'})
        .reset_index()
    )

    cis_scalings, avg_trans = calc_pair_freqs(
        scalings=sc_agg,
        trans_levels=trans_levels,
        calc_avg_trans=False,
        normalized=normalized,
    )

    return (label, cis_scalings, avg_trans)

In [None]:
def plot_scalings(scalings, plot_slope, labels, title, out_path):
    """
    Plot scaling curves and, optionally, their log-log slopes.

    Parameters
    ----------
    scalings : dict
        Maps a curve label to ``((dist_bin_mids, pair_frequencies), avg_trans)``.
        ``avg_trans`` may be ``None``; otherwise it is drawn as a dashed
        horizontal line in the colour of the matching curve.
    plot_slope : bool
        If true, a second panel below the curves shows the discrete
        derivative of log10(frequency) with respect to log10(distance).
    labels
        Unused: legend entries are taken from the keys of ``scalings``.
        Kept so existing callers continue to work.
    title : str
        Figure title.
    out_path
        Where the figure is saved (at 300 dpi).
    """
    fig = plt.figure(figsize=(6, 10))
    gs = matplotlib.gridspec.GridSpec(2, 1, height_ratios=[3, 1.5])
    scale_ax = fig.add_subplot(gs[0, 0])
    slope_ax = fig.add_subplot(gs[1, 0]) if plot_slope else None

    # Tracks whether any curve came with a trans level, so the legend entry
    # does not depend on whichever curve happened to be drawn last (and so an
    # empty ``scalings`` does not hit an undefined loop variable).
    any_trans_drawn = False

    for label, (cis_scalings, avg_trans_level) in scalings.items():
        dist_bin_mids, pair_frequencies = cis_scalings

        curve = scale_ax.loglog(
            dist_bin_mids,
            pair_frequencies,
            label=label,
            lw=1,
            alpha=0.5
        )[0]

        if avg_trans_level is not None:
            # Fixed: the trans level is a single number per curve; it was
            # previously indexed with an undefined (leaked global) ``idx``.
            scale_ax.axhline(
                avg_trans_level,
                ls='dashed',
                c=curve.get_color(),
                lw=1,
                alpha=0.5
            )
            any_trans_drawn = True

        if slope_ax is not None:
            dists = np.asarray(dist_bin_mids)
            freqs = np.asarray(pair_frequencies)
            slope_ax.semilogx(
                # Slopes live between neighbouring bins, so place them at the
                # geometric mean of each pair of adjacent bin positions.
                np.sqrt(dists[1:] * dists[:-1]),
                np.diff(np.log10(freqs)) / np.diff(np.log10(dists)),
                label=label,
                lw=1,
                alpha=0.5
            )

    scale_ax.grid(lw=0.5, color='gray')
    scale_ax.set_aspect(1.0)
    scale_ax.set_xlim(1e2, 1e6)
    scale_ax.set_xlabel('genomic separation (bp)')
    scale_ax.set_ylabel('contact frequency')

    # Local names chosen so the ``labels`` parameter is no longer overwritten.
    legend_handles, legend_labels = scale_ax.get_legend_handles_labels()
    if any_trans_drawn:
        legend_handles.append(Line2D([0], [0], color='black', lw=1, ls='dashed'))
        legend_labels.append('average trans')
    scale_ax.legend(legend_handles, legend_labels, loc=(1.025, 0.5), frameon=False)

    if slope_ax is not None:
        slope_ax.grid(lw=0.5, color='gray')
        slope_ax.set_xlim(1e2, 1e6)
        slope_ax.set_ylim(-3.0, 0.0)
        slope_ax.set_aspect(1.0)
        slope_ax.set_xlabel('distance (bp)')
        slope_ax.set_ylabel('log-log slope')

    fig.suptitle(title)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)

    fig.savefig(out_path, dpi=300)

In [None]:
# Chromosome sizes for the configured `assembly`, obtained through bioframe.
# Used below to build `regions` and passed on to calc_scalings.
chromsizes = bioframe.fetch_chromsizes(assembly, filter_chroms=False, as_bed=True)

In [None]:
# Apply bioframe.select to the chromsizes table with the user-provided
# `region`, then reset the index; the result is handed to calc_scalings.
# NOTE(review): presumably this keeps only the intervals overlapping `region`
# -- confirm against the bioframe version in use.
regions = bioframe.select(chromsizes, region).reset_index()

In [None]:
# Result table shared with the pool callback: one slot per label, holding the
# placeholder 0 until the matching job has reported back.
all_scalings = dict.fromkeys(labels, 0)


def store_scalings(result):
    """Callback that files one finished job into ``all_scalings``.

    ``result`` is the ``(label, scalings, avg_trans)`` triple; the last two
    items are stored as a pair under the label.
    """
    key, curve, trans_level = result
    all_scalings[key] = (curve, trans_level)

In [None]:
# Compute the scalings of all pairs files in parallel (one worker process per
# file), then plot every curve into a single figure.

if not pairs_paths:
    # mp.Pool(0) would otherwise fail with an unrelated-looking message.
    raise ValueError('pairs_paths is empty: at least one pairs file is required')
if len(labels) != len(pairs_paths):
    raise ValueError(
        'labels and pairs_paths must have the same length '
        f'(got {len(labels)} labels for {len(pairs_paths)} paths)'
    )

# The context manager guarantees the worker processes are released even if
# submitting or waiting fails.
with mp.Pool(len(pairs_paths)) as pool:
    pending_jobs = []
    for idx, path in enumerate(pairs_paths):
        pending_jobs.append(
            pool.apply_async(
                calc_scalings,
                args=(path, labels[idx], regions, chromsizes),
                callback=store_scalings,
            )
        )

    pool.close()
    pool.join()

# apply_async only invokes the callback on success; an exception raised in a
# worker stays hidden unless the result is retrieved. Re-raise it here rather
# than failing later while plotting an unfilled placeholder.
for job in pending_jobs:
    job.get()

plot_scalings(all_scalings, plot_slope, labels, title, out_path)