In [1]:
import numpy as np
import torch

In [182]:
# Seed the global torch RNG so a Restart & Run All reproduces the same samples.
torch.manual_seed(0)

# Two independent sample matrices drawn uniformly from [0, 1):
# 100 rows (samples) by 384 columns (features).
A = torch.rand(100, 384)
B = torch.rand(100, 384)

In [183]:
# Number of rows in A (length of its first dimension).
samples = len(A)

In [184]:
def pdf_info(tensor, q1, q2):
    """Return the inter-percentile spread and the full range of a tensor.

    The tensor is flattened and sorted; percentiles are looked up by
    truncating ``len * q / 100`` to an integer index into the sorted values.

    Args:
        tensor: torch tensor of any shape (contiguous or not).
        q1: lower percentile, expressed on a 0-100 scale.
        q2: upper percentile, expressed on a 0-100 scale.

    Returns:
        Tuple ``(iqr, r)`` of 0-dim tensors, where ``iqr`` is the value at
        percentile ``q2`` minus the value at percentile ``q1`` and ``r`` is
        the maximum minus the minimum of all elements.

    Raises:
        IndexError: if ``tensor`` has no elements.
    """
    # reshape (rather than view) also accepts non-contiguous inputs such as
    # transposed tensors.
    vals, _ = torch.sort(tensor.reshape(-1))
    n = vals.numel()

    def _index(q):
        # Truncate toward zero, then clamp so that q == 100 selects the last
        # element instead of indexing one past the end.
        return min(int(n * (q / 100.)), n - 1)

    iqr = vals[_index(q2)] - vals[_index(q1)]
    # The values are already sorted, so the extremes are the two ends; this
    # avoids iterating the tensor element by element with builtin max/min.
    r = vals[-1] - vals[0]
    return iqr, r

In [185]:
# Spread between the 25th and 75th percentiles of A, plus A's full value range.
iqr, r = pdf_info(A, q1=25, q2=75)

In [186]:
# Pick the number of bins from the Freedman-Diaconis bin width,
# width = 2 * IQR / n^(1/3), applied to the full value range r.
if iqr > 1e-5:
    bin_width = 2 * iqr / samples ** (1. / 3.)
    # Clamp to at least one bin: a range much smaller than the bin width
    # would otherwise round to 0 bins, which is not a valid histogram size.
    bins = max(1, int(torch.round(r / bin_width)))
else:
    # MNIST (since it's really only supposed to be either 0 or 1 as output)
    # TODO: bin number
    bins = 2

# Bin data: one histogram per column of A. torch.histc is called with its
# default min/max, so each column is binned over its own data range.
x = torch.stack(
    [torch.histc(A[:, i], bins=bins) for i in range(A.shape[1])],
    dim=0,
).t()  # transpose so rows are bins and columns are features

# Replace empty bins with a small positive value
# (presumably so downstream divergence computations never see zeros).
x[x == 0.] = .0001

In [204]:
# Interactive lookup of the torch.histc signature and defaults (output below).
help(torch.histc)

Help on built-in function histc:

histc(...)
    histc(input, bins=100, min=0, max=0, out=None) -> Tensor
    
    Computes the histogram of a tensor.
    
    The elements are sorted into equal width bins between :attr:`min` and
    :attr:`max`. If :attr:`min` and :attr:`max` are both zero, the minimum and
    maximum values of the data are used.
    
    Args:
        input (Tensor): the input tensor
        bins (int): number of histogram bins
        min (int): lower end of the range (inclusive)
        max (int): upper end of the range (inclusive)
        out (Tensor, optional): the output tensor
    
    Returns:
        Tensor: Histogram represented as a tensor
    
    Example::
    
        >>> torch.histc(torch.tensor([1., 2, 1]), bins=4, min=0, max=3)
        tensor([ 0.,  2.,  1.,  0.])



In [192]:
# Display the binned tensor built in the histogram cell (rows are bins, columns are features).
x

tensor([[17., 25., 18.,  ..., 24., 22., 23.],
        [13., 15., 23.,  ..., 18., 12., 22.],
        [25., 18., 19.,  ..., 26., 14., 19.],
        [29., 19., 22.,  ..., 14., 34., 19.],
        [16., 23., 18.,  ..., 18., 18., 17.]])

In [168]:
# Computing metrics for different archetypes
def compute_divergences(A, B):
    """ Bin predicted distribution A and true distribution B into PDFs.

    The binning statistics (sample count, inter-quartile range and value
    range) are taken from A only and reused for B. The divergence metrics
    themselves (Jensen-Shannon, Kullback-Leibler, Wasserstein distance,
    energy distance) are not computed here; the function returns the two
    binned arrays produced by get_pdf, in the order (A, B). """

    # Binning statistics, all derived from the predicted distribution A
    n_samples = A.shape[0]
    q25, q75 = np.percentile(A, [25, 75])
    spread = q75 - q25
    value_range = np.max(A) - np.min(A)

    # PDFs of the true distribution B and the predicted distribution A
    pdf_true = get_pdf(B, spread, value_range, n_samples)
    pdf_pred = get_pdf(A, spread, value_range, n_samples)

    return pdf_pred, pdf_true

    # Mean
#     m = (np.array(A)+np.array(B))/2

#     # Compute metrics
#     kl = entropy(pk=A, qk=B).sum()/A.shape[1]
#     js = .5*(entropy(pk=A, qk=m)+entropy(pk=B, qk=m)).sum()/A.shape[1]
#     wd = sum([wasserstein_distance(A[:,i], B[:,i]) for i in range(A.shape[1])])
#     ed = sum([energy_distance(A[:,i], B[:,i]) for i in range(A.shape[1])])

#     divergences = {"KL-Divergence": kl,
#                     "Jensen-Shannon": js,
#                     "Wasserstein-Distance": wd,
#                     "Energy-Distance": ed}

#     return divergences


def get_pdf(data, iqr, r, samples):
    """ Compute a binned probability density for every column of ``data``.

    The bin count follows the Freedman-Diaconis bin width
    (``2 * iqr / cbrt(samples)``) applied to the value range ``r``; when
    ``iqr`` is effectively zero a fixed two-bin histogram is used instead.

    Args:
        data: 2-D NumPy array; each column is histogrammed independently
            over its own min/max (np.histogram's default range).
        iqr: inter-quartile range used to size the bins.
        r: value range (max - min) used to derive the bin count.
        samples: number of samples used in the bin-width rule.

    Returns:
        Array of shape ``(bins, data.shape[1])`` holding density values,
        with exact zeros replaced by ``1e-5``.
    """
    if iqr > 1e-5:
        bin_width = 2*iqr/np.cbrt(samples)
        # Clamp to at least one bin: when r is much smaller than the bin
        # width the ratio rounds to 0 and np.histogram raises ValueError.
        bins = max(1, int(round(r/bin_width, 0)))
    else:
        # MNIST (since it's really only supposed to be either 0 or 1 as output)
        # TODO: bin number
        bins = 2

    # Bin data: one density histogram per column
    columns = [np.histogram(data[:, i], bins=bins, density=True)[0]
               for i in range(data.shape[1])]

    # Transpose so rows are bins and columns are features
    res = np.array(columns).T
    # Replace empty bins with a small positive value
    # (presumably so downstream entropy-style metrics never see zeros).
    res[res == 0] = .00001
    return res

In [169]:
# Draw fresh uniform [0, 1) sample matrices (A first, then B), each 100 x 384.
A, B = (torch.rand(100, 384) for _ in range(2))

In [193]:
# Run the NumPy pipeline on the tensors converted with .numpy().
# NOTE(review): this rebinds A and B from torch tensors to the returned arrays,
# so re-running this cell would call .numpy() on an ndarray and fail; confirm
# whether separate result names are wanted.
A, B  = compute_divergences(A.numpy(), B.numpy()) 

In [199]:
# Display A, which at this point holds the binned array returned by compute_divergences.
A

array([[0.87283418, 1.25912741, 0.94811558, ..., 1.20506506, 1.10345383,
        1.17052257],
       [0.66746143, 0.75547645, 1.21148102, ..., 0.90379879, 0.60188391,
        1.11963028],
       [1.28357967, 0.90657174, 1.00078867, ..., 1.30548715, 0.70219789,
        0.96695343],
       [1.48895242, 0.95693683, 1.15880793, ..., 0.70295462, 1.70533774,
        0.96695343],
       [0.82149099, 1.15839722, 0.94811558, ..., 0.90379879, 0.90282586,
        0.86516885]])

In [202]:
# Interactive lookup of the np.histogram signature and parameters (output below).
help(np.histogram)

Help on function histogram in module numpy.lib.function_base:

histogram(a, bins=10, range=None, normed=False, weights=None, density=None)
    Compute the histogram of a set of data.
    
    Parameters
    ----------
    a : array_like
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars or str, optional
        If `bins` is an int, it defines the number of equal-width
        bins in the given range (10, by default). If `bins` is a
        sequence, it defines the bin edges, including the rightmost
        edge, allowing for non-uniform bin widths.
    
        .. versionadded:: 1.11.0
    
        If `bins` is a string from the list below, `histogram` will use
        the method chosen to calculate the optimal bin width and
        consequently the number of bins (see `Notes` for more detail on
        the estimators) from the data that falls within the requested
        range. While the bin width will be optimal for the actua