# Trying to tame matplotlib

Ryan Reece <https://github.com/rreece>        
created: 2019-06-27

## 2-bin example

Following: <https://scikit-hep.org/pyhf/examples/notebooks/binderexample/StatisticalAnalysis.html>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import mplhep as hep
import pyhf
import math

In [2]:
pdf = pyhf.simplemodels.hepdata_like(signal_data=[12.0, 11.0], bkg_data=[50.0, 52.0], bkg_uncerts=[3.0, 7.0])


AttributeError: module 'pyhf.simplemodels' has no attribute 'hepdata_like'

In [None]:
pdf.config.auxdata

In [None]:
CLs_obs, CLs_exp_band = pyhf.infer.hypotest(1.0, [51, 48] + pdf.config.auxdata, pdf, return_expected_set=True)
print('Observed CLs         : %.4f' % (CLs_obs))
print('')
print('Expected CLs -2 sigma: %.4f' % (CLs_exp_band[0]))
print('Expected CLs -1 sigma: %.4f' % (CLs_exp_band[1]))
print('Expected CLs         : %.4f' % (CLs_exp_band[2]))
print('Expected CLs +1 sigma: %.4f' % (CLs_exp_band[3]))
print('Expected CLs +2 sigma: %.4f' % (CLs_exp_band[4]))

## n-bin example

Following: <https://github.com/scikit-hep/pyhf/blob/master/tests/benchmarks/test_benchmark.py>

In [None]:
def generate_source_static(n_bins):
    """
    Create the source structure for the given number of bins.

    Args:
        n_bins: `list` of number of bins

    Returns:
        source
    """
    binning = list(range(n_bins+1))
    data = [120.0] * n_bins
    bkg = [100.0] * n_bins
    bkgerr = [10.0] * n_bins
    sig = [30.0] * n_bins

    source = {
        'binning': binning,
        'bindata': {'data': data, 'bkg': bkg, 'bkgerr': bkgerr, 'sig': sig},
    }
    return source


def generate_source_poisson(n_bins):
    """
    Create the source structure for the given number of bins.
    Sample from a Poisson distribution

    Args:
        n_bins: `list` of number of bins

    Returns:
        source
    """
    np.random.seed(0)  # Fix seed for reproducibility
    binning = list(range(n_bins+1))
    data = np.random.poisson(120.0, n_bins).tolist()
    bkg = np.random.poisson(100.0, n_bins).tolist()
    bkgerr = np.random.poisson(10.0, n_bins).tolist()
    sig = np.random.poisson(30.0, n_bins).tolist()

    source = {
        'binning': binning,
        'bindata': {'data': data, 'bkg': bkg, 'bkgerr': bkgerr, 'sig': sig},
    }
    return source


def hypotest(pdf, data):
    return pyhf.infer.hypotest(
        1.0,
        data,
        pdf,
        pdf.config.suggested_init(),
        pdf.config.suggested_bounds(),
        return_tail_probs=True,
        return_expected=True,
        return_expected_set=True,
        return_test_statistics=True,
    )

In [None]:
n_bins = 3
source = generate_source_poisson(n_bins)
pdf = pyhf.simplemodels.hepdata_like(source['bindata']['sig'], 
                                     source['bindata']['bkg'], 
                                     source['bindata']['bkgerr'])
data = source['bindata']['data'] + pdf.config.auxdata

In [None]:
results = hypotest(pdf, data)
len(results)

In [None]:
results

In [None]:
source['bindata']['sig']

In [None]:
source['bindata']['bkg']

In [None]:
source['bindata']['bkgerr']

In [None]:
source['binning']

In [None]:
#plt.style.use(hep.style.ATLAS)
plt.style.use([hep.style.CMS, hep.style.firamath])

In [None]:
hep.histplot(source['bindata']['bkg'], source['binning'])

In [None]:
axs = hep.histplot(source['bindata']['bkg'], source['binning'], yerr=source['bindata']['bkgerr'])

In [None]:
x = range(1,6)
y = [ [1,4,6,8,9], [2,2,7,10,12], [2,8,5,10,6] ]
labels = ['A','B','C']

ytotal = [sum(i) for i in zip(*y)] 
yerr   = [math.sqrt(_y) for _y in ytotal]
 
fig, ax = plt.subplots()
plt.stackplot(x, y, labels=labels)
plt.errorbar(x, ytotal, yerr=yerr, fmt='o', color='black',
            ecolor='lightgray', elinewidth=3, capsize=0,
            label='sum')
plt.legend(loc='upper left')



In [None]:
x = range(0,20)
y = [ [1,4,6,8,9], [2,2,7,10,12], [2,8,5,10,6] ]
#assert len(x) == len(y[0])+1
labels = ['A','B','C']

ytotal = [sum(i) for i in zip(*y)] 
yerr   = [math.sqrt(_y) for _y in ytotal]
 
fig, ax = plt.subplots()
colors = [plt.cm.Spectral(i/float(len(y)-1)) for i in range(len(y))]
plt.hist(y, x, stacked=True, density=False, color=colors[:len(y)])
#plt.errorbar(x, ytotal, yerr=yerr, fmt='o', color='black',
#            ecolor='lightgray', elinewidth=3, capsize=0,
#            label='sum')
#plt.legend(loc='upper left')



In [None]:
"""
See:
https://en.wikipedia.org/wiki/Poisson_distribution#Confidence_interval
https://en.wikipedia.org/wiki/Chi-square_distribution#Asymptotic_properties
https://www.johndcook.com/blog/wilson_hilferty/
https://github.com/CoffeaTeam/coffea/blob/master/coffea/hist/plot.py
"""

def poisson_error_up(data):
    y1 = data + 1.0
    d = 1.0 - 1.0/(9.0*y1) + 1.0/(3*math.sqrt(y1))
    return y1*d*d*d-data

def poisson_error_down(data):
    y = data
    if y == 0.0: return 0.0
    d = 1.0 - 1.0/(9.0*y) - 1.0/(3.0*math.sqrt(y))
    return data-y*d*d*d

In [None]:
#bins = list(np.arange(-0.5, 5.5, 1.0))
bins = list(range(6))
bin_width = 1
y = [ [1,4,6,8,9], [2,2,7,10,12], [2,8,5,10,6] ]
labels = ['A','B','C']
data = [6, 12, 19, 30, 26]

data.reverse()
for y_i in y:
    y_i.reverse()

#################################

n_bins = len(bins)-1
n_samples = len(y)

for y_i in y:
    assert n_bins == len(y_i)

ytotal  = [sum(i) for i in zip(*y)]
#yerr    = [math.sqrt(_y) for _y in ytotal]
#dataerr = [math.sqrt(_y) for _y in data]
yerr    = [
            [0.2*poisson_error_down(_y) for _y in ytotal], # HACK
            [0.2*poisson_error_up(_y) for _y in ytotal],
          ]
dataerr = [
            [poisson_error_down(_y) for _y in data],
            [poisson_error_up(_y) for _y in data],
          ]

colors = [plt.cm.Spectral(i/float(n_samples-1)) for i in range(n_samples)]
binned = [bins[:-1] for _ in range(n_samples)]
weights = y
bincenters = np.mean(np.vstack([bins[0:-1],bins[1:]]), axis=0)

fig, ax = plt.subplots()

plt.hist(binned, bins, weights=weights,
    stacked=True,
    density=False,
    color=colors,
    label=labels,
    )

plt.xlabel('Dependent variable [unit]')
plt.ylabel('Events / (%i unit)' % bin_width)

plt.errorbar(bincenters, ytotal, yerr=yerr,
    label='stat. unc.',
    fmt='none',
    ecolor='darkgray',
    elinewidth=60., # HACK
    capsize=0,
    alpha=0.4,
    )

plt.errorbar(bincenters, data, yerr=dataerr, 
    label='data',
    fmt='o',
    color='black',
    ecolor='black',
    elinewidth=2,
    capsize=0,
    markersize=8,
    )

total_mean = sum([y_i*x_i for y_i, x_i in zip(ytotal, bincenters)])/n_bins
middle_of_range = (bins[-1] - bins[0])/2

leg_loc = 'upper left'
if total_mean > middle_of_range:
    leg_loc = 'upper right'
leg = plt.legend(loc=leg_loc)


In [None]:
#bins = list(np.arange(-0.5, 5.5, 1.0))
bins = list(range(6))
bin_width = 1
y = [ [1,4,6,8,9], [2,2,7,10,12], [2,8,5,10,6] ]
labels = ['A','B','C']
data = [6, 12, 19, 30, 26]

data.reverse()
for y_i in y:
    y_i.reverse()

#################################

n_bins = len(bins)-1
n_samples = len(y)

for y_i in y:
    assert n_bins == len(y_i)

ytotal  = [sum(i) for i in zip(*y)]
#yerr    = [math.sqrt(_y) for _y in ytotal]
#dataerr = [math.sqrt(_y) for _y in data]
yerr    = [
            [0.2*poisson_error_down(_y) for _y in ytotal],
            [0.2*poisson_error_up(_y) for _y in ytotal],
          ]
dataerr = [
            [poisson_error_down(_y) for _y in data],
            [poisson_error_up(_y) for _y in data],
          ]

colors = [plt.cm.Spectral(i/float(n_samples-1)) for i in range(n_samples)]
binned = [bins[:-1] for _ in range(n_samples)]
weights = y
bincenters = np.mean(np.vstack([bins[0:-1],bins[1:]]), axis=0)

#fig, ax = plt.subplots()
fig = plt.figure()
axes = list()

ax1 = plt.subplot(211)
axes.append(ax1)

plt.hist(binned, bins, weights=weights,
    stacked=True,
    density=False,
    color=colors,
    label=labels,
    )

#plt.xlabel('Dependent variable [unit]')
plt.ylabel('Events / (%i unit)' % bin_width)

plt.errorbar(bincenters, ytotal, yerr=yerr,
    label='stat. unc.',
    fmt='none',
    ecolor='darkgray',
    elinewidth=60., # HACK
    capsize=0,
    alpha=0.4,
    )

plt.errorbar(bincenters, data, yerr=dataerr, 
    label='data',
    fmt='o',
    color='black',
    ecolor='black',
    elinewidth=2,
    capsize=0,
    markersize=8,
    )

total_mean = sum([y_i*x_i for y_i, x_i in zip(ytotal, bincenters)])/n_bins
middle_of_range = (bins[-1] - bins[0])/2

leg_loc = 'upper left'
if total_mean > middle_of_range:
    leg_loc = 'upper right'
leg = plt.legend(loc=leg_loc)

plt.setp(ax1.get_xticklabels(), visible=False)

ax2 = plt.subplot(212, sharex=ax1)
axes.append(ax2)

y_ratio = [d_i/y_i if y_i else 0. for d_i, y_i in zip(data, ytotal)]

plt.errorbar(bincenters, y_ratio, #yerr=y_ratio_err,
    label='ratio',
    fmt='o',
    color='black',
    ecolor='black',
    elinewidth=2,
    capsize=0,
    markersize=8,
    )

ax2.set_ylabel('Data / Model')
ax2.set_ylim(0.7, 1.3) # Hack
plt.xlabel('Dependent variable [unit]')

fig.subplots_adjust(wspace=0, hspace=0)

In [None]:
#bins = list(np.arange(-0.5, 5.5, 1.0))
bins = list(range(6))
bin_width = 1
y = [ [1,4,6,8,9], [2,2,7,10,12], [2,8,5,10,6] ]
labels = ['A','B','C']
data = [6, 12, 19, 30, 26]

data.reverse()
for y_i in y:
    y_i.reverse()

#################################

n_bins = len(bins)-1
n_samples = len(y)

for y_i in y:
    assert n_bins == len(y_i)

ytotal  = [sum(i) for i in zip(*y)]
#yerr    = [math.sqrt(_y) for _y in ytotal]
#dataerr = [math.sqrt(_y) for _y in data]
yerr    = [
            [0.2*poisson_error_down(_y) for _y in ytotal],
            [0.2*poisson_error_up(_y) for _y in ytotal],
          ]
dataerr = [
            [poisson_error_down(_y) for _y in data],
            [poisson_error_up(_y) for _y in data],
          ]

colors = [plt.cm.Spectral(i/float(n_samples-1)) for i in range(n_samples)]
binned = [bins[:-1] for _ in range(n_samples)]
weights = y
bincenters = np.mean(np.vstack([bins[0:-1],bins[1:]]), axis=0)

fig = plt.figure()
axes = list()
gs = fig.add_gridspec(2, 1,  height_ratios=(2, 1),
#                      left=0.1, right=0.9, bottom=0.1, top=0.9,
                      wspace=0, hspace=0.04)
ax1 = fig.add_subplot(gs[0, 0])
axes.append(ax1)

plt.hist(binned, bins, weights=weights,
    stacked=True,
    density=False,
    color=colors,
    label=labels,
    )

plt.ylabel('Events / (%i unit)' % bin_width)

plt.errorbar(bincenters, ytotal, yerr=yerr,
    label='Uncert.',
    fmt='none',
    ecolor='darkgray',
    elinewidth=60., # HACK
    capsize=0,
    alpha=0.4,
    )

plt.errorbar(bincenters, data, yerr=dataerr, 
    label='Data',
    fmt='o',
    color='black',
    ecolor='black',
    elinewidth=2,
    capsize=0,
    markersize=8,
    )

total_mean = sum([y_i*x_i for y_i, x_i in zip(ytotal, bincenters)])/n_bins
middle_of_range = (bins[-1] - bins[0])/2

leg_loc = 'upper left'
if total_mean > middle_of_range:
    leg_loc = 'upper right'
leg = plt.legend(loc=leg_loc)

plt.setp(ax1.get_xticklabels(), visible=False)

ax2 = fig.add_subplot(gs[1, 0], sharex=ax1)
axes.append(ax2)

plt.axhline(y=1.0, color='lightgray', linestyle='-', zorder=-1)

y_ratio = [d_i/y_i if y_i else 0. for d_i, y_i in zip(data, ytotal)]
y_ratio_band = [
    [ye_i/y_i for y_i, ye_i in zip(ytotal, yerr[0])],
    [ye_i/y_i for y_i, ye_i in zip(ytotal, yerr[1])],
]
y_ratio_err = [
    [de_i/d_i for d_i, de_i in zip(data, dataerr[0])],
    [de_i/d_i for d_i, de_i in zip(data, dataerr[1])],
]

plt.errorbar(bincenters, [1.0]*n_bins, yerr=y_ratio_band,
    label='ratio_band',
    fmt='none',
    ecolor='darkgray',
    elinewidth=60., # HACK
    capsize=0,
    alpha=0.4,
    )

plt.errorbar(bincenters, y_ratio, yerr=y_ratio_err,
    label='ratio',
    fmt='o',
    color='black',
    ecolor='black',
    elinewidth=2,
    capsize=0,
    markersize=8,
    )

ax2.set_ylabel('Data / Model')
#ax2.set_ylim(0.7, 1.3) # Hack
plt.xlabel('Dependent variable [unit]')

fig.subplots_adjust(wspace=0, hspace=0)

In [None]:
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
from matplotlib.patches import Patch

def make_error_boxes(ax, xdata, ydata, xerror, yerror,
                     facecolor='darkgray',
                     edgecolor='none',
                     alpha=0.4,
                     hatch=None,
                     zorder=20):
    """
    From:
    https://matplotlib.org/3.1.0/gallery/statistics/errorbars_and_boxes.html
    """

    # Create list for all the error patches
    errorboxes = []
    
    xerror = np.asarray(xerror)
    yerror = np.asarray(yerror)

    # Loop over data points; create box from errors at each point
    for x, y, xe, ye in zip(xdata, ydata, xerror.T, yerror.T):
        rect = Rectangle((x - xe[0], y - ye[0]), xe.sum(), ye.sum())
        errorboxes.append(rect)

    # Create patch collection with specified colour/alpha
    pc = PatchCollection(errorboxes,
                         facecolor=facecolor,
                         alpha=alpha,
                         edgecolor=edgecolor,
                         hatch=hatch,
                         zorder=zorder)

    # Add collection to axes
    ax.add_collection(pc)
    
    return pc

    # Plot errorbars
#    artists = ax.errorbar(xdata, ydata, xerr=xerror, yerr=yerror,
#                          fmt='None', ecolor='k')

#    return artists

In [None]:
#bins = list(np.arange(-0.5, 5.5, 1.0))
bins = list(range(6))
bin_width = 1
y = [ [1,4,6,8,9], [2,2,7,10,12], [2,8,5,10,6] ]
labels = ['A','B','C']
data = [6, 12, 19, 30, 26]

data.reverse()
for y_i in y:
    y_i.reverse()

#################################

y.reverse()
labels.reverse()

n_bins = len(bins)-1
n_samples = len(y)

for y_i in y:
    assert n_bins == len(y_i)

ytotal  = [sum(i) for i in zip(*y)]
#yerr    = [math.sqrt(_y) for _y in ytotal]
#dataerr = [math.sqrt(_y) for _y in data]
yerr    = [
            [0.2*poisson_error_down(_y) for _y in ytotal], # HACK
            [0.2*poisson_error_up(_y) for _y in ytotal],
          ]
dataerr = [
            [poisson_error_down(_y) for _y in data],
            [poisson_error_up(_y) for _y in data],
          ]

colors = [plt.cm.Spectral(i/float(n_samples-1)) for i in range(n_samples)]
binned = [bins[:-1] for _ in range(n_samples)]
weights = y
bincenters = np.mean(np.vstack([bins[0:-1],bins[1:]]), axis=0)
binwidths = [bins[i+1]-bins[i] for i in range(n_bins)]

fig = plt.figure()
axes = list()
gs = fig.add_gridspec(2, 1,  height_ratios=(3, 1),
                      wspace=0, hspace=0.04)
ax1 = fig.add_subplot(gs[0, 0])
axes.append(ax1)

plt.hist(binned, bins, weights=weights,
    stacked=True,
    density=False,
    color=colors,
    label=labels,
    )

plt.ylabel('Events / (%g unit)' % binwidths[0])

#plt.errorbar(bincenters, ytotal, yerr=yerr,
#    label='Uncert.',
#    fmt='none',
#    ecolor='darkgray',
#    elinewidth=60., # HACK
#    capsize=0,
#    alpha=0.4,
#    )

xerr = [
        [w/2 for w in binwidths],
        [w/2 for w in binwidths],
    ]
uncert_boxes = make_error_boxes(ax1, bincenters, ytotal, xerr, yerr,
                                hatch='///')

plt.errorbar(bincenters, data, yerr=dataerr, 
    label='Data',
    fmt='o',
    color='black',
    ecolor='black',
    elinewidth=2,
    capsize=0,
    markersize=8,
    zorder=100,
    )

total_mean = sum([y_i*x_i for y_i, x_i in zip(ytotal, bincenters)])/n_bins
middle_of_range = (bins[-1] - bins[0])/2

leg_handles, leg_labels = ax1.get_legend_handles_labels()
data_handle = leg_handles.pop()
data_label = leg_labels.pop()
assert data_label == 'Data'
leg_handles.reverse()
leg_labels.reverse()
leg_handles.append(Patch(facecolor='darkgray',
                        edgecolor='none',
                        alpha=0.4,
                        hatch='///'))
leg_labels.append('Uncert.')
leg_handles.append(data_handle)
leg_labels.append(data_label)

leg_loc = 'upper left'
if total_mean > middle_of_range:
    leg_loc = 'upper right'
leg = plt.legend(leg_handles, leg_labels, loc=leg_loc)

plt.setp(ax1.get_xticklabels(), visible=False)

ax2 = fig.add_subplot(gs[1, 0], sharex=ax1)
axes.append(ax2)

plt.axhline(y=1.0, color='lightgray', linestyle='-', zorder=-1)

y_ratio = [d_i/y_i if y_i else 0. for d_i, y_i in zip(data, ytotal)]
y_ratio_band = [
    [ye_i/y_i for y_i, ye_i in zip(ytotal, yerr[0])],
    [ye_i/y_i for y_i, ye_i in zip(ytotal, yerr[1])],
]
y_ratio_err = [
    [de_i/d_i for d_i, de_i in zip(data, dataerr[0])],
    [de_i/d_i for d_i, de_i in zip(data, dataerr[1])],
]

#plt.errorbar(bincenters, [1.0]*n_bins, yerr=y_ratio_band,
#    label='ratio_band',
#    fmt='none',
#    ecolor='darkgray',
#    elinewidth=60., # HACK
#    capsize=0,
#    alpha=0.4,
#    )

xerr = [
        [w/2 for w in binwidths],
        [w/2 for w in binwidths],
    ]
make_error_boxes(ax2, bincenters, [1.0]*n_bins, xerr, y_ratio_band)

plt.errorbar(bincenters, y_ratio, yerr=y_ratio_err,
    label='ratio',
    fmt='o',
    color='black',
    ecolor='black',
    elinewidth=2,
    capsize=0,
    markersize=8,
    zorder=100,
    )

ax2.set_ylabel('Data / Model')
#ax2.set_ylim(0.7, 1.3) # HACK
plt.xlabel('Dependent variable [unit]')

fig.subplots_adjust(wspace=0, hspace=0)

In [None]:
poisson_error_up(16)

In [None]:
poisson_error_up(16*16)

In [None]:
poisson_error_down(16*16)

In [None]:
poisson_error_up(100*100)

In [None]:
poisson_error_up(1000*1000)

In [None]:
poisson_error_up(10000*10000)

In [None]:
poisson_error_down(10000*10000)

In [None]:
poisson_error_up(100000**2)

## See also

-   <https://scikit-hep.org/pyhf/>
-   Cowan, G. et al. (2010). Asymptotic formulae for likelihood-based tests of new physics. https://arxiv.org/abs/1007.1727
-   Cranmer, K. et al. (2012). HistFactory: A tool for creating statistical models for use with RooFit and RooStats. CERN-OPEN-2012-016. <https://cds.cern.ch/record/1456844>
-   Feickert, M. (2018). pyhf: a pure Python implementation of HistFactory with tensors and autograd. <https://indico.cern.ch/event/759480/>
-   <http://github.com/CoffeaTeam/coffea/blob/84314e9e0b05df328cd6de6c7192a51e1f68be18/coffea/hist/plot.py#L115>