In [1]:
import zfit
import math
from zfit import z
import numpy as np
import tensorflow as tf
from scipy.optimize import minimize

zfit.settings.options['numerical_grad'] = True
class HistPDF(zfit.pdf.BasePDF):

    def __init__(self, hist_args, hist_bins, obs, name='HistPDF'):
        self.rv_hist = scipy.stats.rv_histogram([hist_args, hist_bins])
        super().__init__(obs=obs, name=name)

    def _unnormalized_pdf(self, x):
        x = z.unstack_x(x)
        probs =  z.py_function(func=self.rv_hist.pdf, inp=[x], Tout=tf.float64)
        probs.set_shape(x.shape)
        return probs

This includes Python itself. Therefore, Python 3.6 will be dropped in the near future (beginning of May 2021)
and 3.9 will be added to the supported versions.

Feel free to contact us in case of problems to upgrade to a more recent version of Python.
  """Entry point for launching an IPython kernel.


In [2]:
# mu2 = zfit.Parameter("mu2", 5., step_size=0)
# sigma2 = zfit.Parameter("sigma2", 1., step_size=0)
# lambd2 = zfit.Parameter("lambda2", -0.2, step_size=0)
# frac2 = zfit.Parameter("fraction2", 0.5, 0, 1)
# frac1 = zfit.Parameter("fraction1", 0.5, step_size=0)
# create space
obs1 = zfit.Space("x", limits=(0, 10))
obs2 = zfit.Space("x", limits=(0, 10))

# parameters
mu1 = zfit.Parameter("mu1", 5., 1, 10, step_size=0)
sigma1 = zfit.Parameter("sigma1", 1., 0.1, 10, step_size=0)
lambd1 = zfit.Parameter("lambda1", -0.2, -1, -0.01, step_size=0)
frac1 = zfit.Parameter("fraction1", 0.5, 0, 1)

mu2 = zfit.Parameter("mu2", 5., step_size=0)
sigma2 = zfit.Parameter("sigma2", 1., step_size=0)
lambd2 = zfit.Parameter("lambda2", -0.2, step_size=0)
frac2 = zfit.Parameter("fraction2", 0.5, step_size=0)


gauss1 = zfit.pdf.Gauss(mu=mu1, sigma=sigma1, obs=obs1)
exponential1 = zfit.pdf.Exponential(lambd1, obs=obs1)
model1 = zfit.pdf.SumPDF([gauss1, exponential1], fracs=frac1)


gauss2 = zfit.pdf.Gauss(mu=mu2, sigma=sigma2, obs=obs2)
exponential2 = zfit.pdf.Exponential(lambd2, obs=obs2)
model2 = zfit.pdf.SumPDF([gauss2, exponential2], fracs=frac2)

In [3]:
n_sample = 1000000

exp_data = exponential2.sample(n=n_sample * (1 - frac1)).numpy()

gauss_data = gauss2.sample(n=n_sample * frac1).numpy()

data = model1.create_sampler(n_sample, limits=obs1)
data.resample()

In [4]:
data_np = data[:, 0].numpy()
exp_data_np = exp_data[:, 0]
gauss_data_np = gauss_data[:, 0]

In [1]:
data_hist = np.histogram(data_np, bins=1000)
exp_data_hist = np.histogram(exp_data_np, bins=1000)
gauss_data_hist = np.histogram(gauss_data_np, bins=1000)
sim_hists = []
sim_hists.append(exp_data_hist)
sim_hists.append(gauss_data_hist)

NameError: name 'np' is not defined

In [589]:
import scipy
import tensorflow.experimental.numpy as tnp  # TODO 1: replace (possible) np calls with tnp
class FractionFitterV3:

    def __init__(self, data_hist, sim_hists, P):
        self.data_hist = data_hist
        self.P = np.array(P)  # vectorization 3
        self.sim_hists = [hist for hist in sim_hists]
        self.d = np.array(self.data_hist[0]) # where d[i] amount of events in bin from data
        self.N_D = np.sum(self.d)#all observable data amount

        # vectorization 3
        self.N = np.array([np.sum(h[0]) for h in sim_hists])# amount of simulation data from sources e.g. N[0] from source 0 .. N[j] from source j
        self.sources_num = len(P)
        self.bins_num = len(data_hist[0])
        self.p = self.N_D * self.P / self.N
        self.a = np.array([self.sim_hists[j][0] for j in range(self.sources_num)])
        self.nonzero_indices = np.where(self.d != 0)[0]
        zfit.run.set_autograd_mode(False)
        zfit.run.set_graph_mode(False)
        
    def norma(self, v):
        return math.sqrt(np.sum(v ** 2))
    #function to minimize for finding optimat t according to (15) from the paper        
    def f(self, t, a, p, i):
        return np.sum((p * a[:, i] / (1 + p * t))) - self.d[i]/(1 - t)
    
    def f_vectorized(self, t, a, p, i):  # add an axis argumend to sum
        term1 = np.sum((p[:, None] * a[:, i] / (1 + p[:, None] * t[None, :])),
                       axis=0)
        term2 = self.d[i]/(1 - t)
        return term1 - term2
    
    def jac_f(self, t, a, p, i):
        return np.diag(np.sum((p[:, None] * a[:, i] * p[:, None])/(1 + p[:, None] * t[None, :])**2, axis=0) + self.d[i]/(1 - t)**2)
    
    def sqF(self, p):
        t = np.ones_like(self.d)
        x0 = t[self.nonzero_indices] * 0 #may be 0.01?
        p = np.asarray(p)
        # TODO 3: replace with https://www.tensorflow.org/probability/api_docs/python/tfp/math/find_root_chandrupatla
        t_solved = scipy.optimize.root(self.f_vectorized,
                                       x0=x0, 
                                    #    x0=0.1 * np.ones_like(nonzero_indices), 
                                       args=(self.a, p, self.nonzero_indices),
                                       jac=self.jac_f,
                                       method='hybr',  # 'krylov',
                                       tol=None,
                                       callback=None,
                                       options={}).x # bounds(-1/max(p), 1)
        t[self.nonzero_indices] = t_solved
        #A[j][i] fitted amount of observations in i bin from j source
        A = self.a/(1 + p[:, None]*t[None, :])
        A = np.maximum(A, 0.05)
        
        return np.sum(np.sum((self.d[None, :] * A)/np.sum(p[None, :] * A.transpose(), axis=1) - A, axis=1)**2, axis=0)
    sqF.errordef = 0.5
    
    def fit(self, eps):
        minimizer = zfit.minimize.Minuit(tol=eps)  # 2: 
        p_new = np.array(minimizer.minimize(self.sqF, self.p).params)   
        print(np.abs(self.norma(p_new) - self.norma(self.p)))
        return p_new

In [590]:
fitter = FractionFitterV3(data_hist=data_hist, sim_hists=sim_hists, P=[0.4, 0.6])

In [505]:
p = fitter.fit(1e-4)

0.02656691298060676


In [591]:
%%timeit for _ in range(2): True
p = fitter.fit(1e-4)

0.02656442626066613
0.02656442626066613
0.02656442626066613
0.02656442626066613
0.02656442626066613
0.02656442626066613
0.02656442626066613
0.02656442626066613
15.6 s ± 147 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [887]:
P = []
for j in range(len(p)):
    P.append(p[j] * fitter.N[j]/fitter.N_D)

In [888]:
P

[25059.392597172402, 6305.526908127644]

In [None]:
a = tf.constant([[1, 2], [3, 4]])                 
b = tf.add(a, 1)

"""This example illustrates how to minimize an arbitrary Python function using the zfit minimizer.
17.4 s ± 514 ms per loop
This may has some overhead in the beginning and won't be instantly fast compared to other libraries if run once.

Copyright (c) 2021 zfit
"""

#  Copyright (c) 2021 zfit

import numpy as np

import zfit

# set everything to numpy mode
zfit.run.set_autograd_mode(False)
zfit.run.set_graph_mode(False)

# create our favorite minimizer
# minimizer = zfit.minimize.IpyoptV1()


# minimizer = zfit.minimize.Minuit()
# minimizer = zfit.minimize.ScipyTrustKrylovV1()
minimizer = zfit.minimize.NLoptLBFGSV1()


def func(x):
    x = np.array(x)  # make sure it's an array
    return np.sum((x - 0.1) ** 2 + x[1] ** 4)


# we can also use a more complicated function instead
# from scipy.optimize import rosen as func


# we need to set the errordef, the definition of "1 sigma"
func.errordef = 0.5

# initial parameters
params = [1, -3, 2, 1.4, 11]
# or for a more fine-grained control
# params = {
#     'value': [1, -3, 2, 1.4, 11],  # mandatory
#     'lower': [-2, -5, -5, -10, -15],  # lower bound, can be omitted
#     'upper': [2, 4, 5, 10, 15],  # upper bound, can be omitted
#     'step_size': [0.1] * 5,  # initial step size, can be omitted
# }

# minimize
result = minimizer.minimize(func, params)

# estimate errors
result.hesse()
result.errors()
print(np.arraresult.params)

In [None]:
data.resample()
data_np = data[:, 0].numpy()
data_hist = np.histogram(data_np, bins=30)

In [None]:
a = np.array([[1,2,3,4,5], [1,2,3,4,5], [1,2,3,4,5], [1,2,3,4,5], [1,2,3,4,5], [1,2,3,4,5], [1,2,3,4,5]])
p = np.array([1,2,3,4,5,6,7])
d = np.array([1,2,3,4,5])
t = np.array([1,2,3,4,5])

A = a/(1 + p[:, None]*t[None, :])

In [None]:
np.sum(np.sum((d * A)/np.sum(p * A.transpose(), axis=1) - A, axis=1)**2, axis=0)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
bins = [math.log2(10), math.log2(100), math.log2(500), math.log2(1000)]
times = [60.5, 123, 7000, 53700]
plt.figure(figsize=(7, 7))
plt.xlabel('log(bins_num) - количество бинов')
plt.ylabel('T - время работы программы, ms')
plt.scatter(bins, times)
plt.plot(bins, times, 'r', label='T(bins_num)')
plt.legend(bbox_to_anchor=(0.6, 1), loc=4)
plt.show()


In [None]:
import zfit 
import numpy as np
import tensorflow as tf
import scipy.stats
from zfit import z
import random
obs = zfit.Space("x", limits=(0, 10))

mu = zfit.Parameter("mu", 5., step_size=0)
sigma = zfit.Parameter("sigma", 1., step_size=0)
lambd = zfit.Parameter("lambda", -0.2, step_size=0)
frac = zfit.Parameter("fraction", 0.5, step_size=0)

mu1 = zfit.Parameter("mu2", 5., 0, 10)
sigma1 = zfit.Parameter("sigma2", 1., 0.1, 2)
lambd1 = zfit.Parameter("lambda2", -0.2, -0.5, 0.5)
frac1 = zfit.Parameter("fraction2", 0.5, 0, 1)

gauss = zfit.pdf.Gauss(mu=mu, sigma=sigma, obs=obs)
exponential = zfit.pdf.Exponential(lambd, obs=obs)
model = zfit.pdf.SumPDF([gauss, exponential], fracs=frac)

gauss1 = zfit.pdf.Gauss(mu=mu1, sigma=sigma1, obs=obs)
exponential1 = zfit.pdf.Exponential(lambd1, obs=obs)
model1 = zfit.pdf.SumPDF([gauss1, exponential1], fracs=frac1)
# data
n_sample = 100

exp_data = exponential.sample(n=n_sample * (1 - frac)).numpy()
gauss_data = gauss.sample(n=n_sample * frac).numpy()
data = model.create_sampler(n_sample, limits=obs)

data.resample()
n_sim = 100

In [None]:
zfit.run.set_graph_mode(False)
zfit.settings.options['numerical_grad'] = True
from scipy.stats import norm
import matplotlib.pyplot as plt
c = 0
arr = np.empty((3, 3), dtype="float32")
arn_sim = [100, 500, 1000]

for i in range(0, 3):
    n_sim = arn_sim[i]
    print(n_sim)
    gauss_hist = gauss.create_sampler(n_sim, limits=obs)
    exp_hist = exponential.create_sampler(n_sim, limits=obs)
    print(exp_hist)
    minimizer = zfit.minimize.Minuit()
    mean_bias = []
    minimizer = zfit.minimize.Minuit(verbosity = 0)
    for m in range(0, 30):
        print("Прогон номер ", m)
        res = []
        models = []
        gauss_hist.resample()
        data_np = gauss_hist[:, 0].numpy()
        histogramm = np.histogram(data_np, bins=100)
        gauss_hist_init = HistPDF(histogramm[0], histogramm[1], obs = obs)
#         exp_hist.resample()
#         exp_data_np = exp_hist[:, 0].numpy()
#         exp_histogramm = np.histogram(exp_data_np, bins=100)
#         exp_hist_init = HistPDF(exp_histogramm[0], exp_histogramm[1], obs = obs)
        modelHist = zfit.pdf.SumPDF([gauss_hist_init, exponential1], fracs=frac1)
        nll2 = zfit.loss.UnbinnedNLL(model=modelHist, data=data) 
        for i in range(0, 30):
            data.resample()
            result1 = minimizer.minimize(nll2, params=[frac1])  
            value_stat = list(result1.error().keys())[0] 
            error_stat = list(result1.error().values())[0] 
            del result1
            frac_value = float(value_stat.value())  
            res.append(final_value) 
        (mu_ar, sigma_ar) = norm.fit(res)
        mean_bias.append(mu_ar)
    (mu_si, sigma_si) = norm.fit(mean_bias)
    arr[c, 0] = n_sim
    arr[c, 1] = mu_si
    arr[c, 2] = sigma_si
    c = c + 1

In [None]:
zfit.run.set_graph_mode(False)
zfit.settings.options['numerical_grad'] = True
from scipy.stats import norm
import matplotlib.pyplot as plt
c = 0
arr = np.empty((7, 3), dtype="float32")
arn_sim = [10, 25, 50, 100, 500, 1000, 3000]

for i in range(0, 7):
    n_sim = arn_sim[i]
    print(n_sim)
    data_hist = gauss.create_sampler(n_sim, limits=obs)
    # data1 = hist_init.create_sampler(n_sample, limits=obs)
    # data1.resample()
    # data_sim = gauss.create_sampler(n_sim, limits=obs)
    minimizer = zfit.minimize.Minuit()
    mean_bias = []
    minimizer = zfit.minimize.Minuit(verbosity = 0)
    # nll1 = zfit.loss.UnbinnedNLL(model=hist_init, data=data_sim)
    for m in range(0, 30):
        print("Прогон номер ", m)
        res = []
        models = []
        data_hist.resample()
        data_np = data_hist[:, 0].numpy()
        histogramm = np.histogram(data_np, bins=100)
        hist_init = HistPDF(histogramm[0], histogramm[1], obs = obs)
        modelHist = zfit.pdf.SumPDF([hist_init, exponential1], fracs=frac1)
        nll2 = zfit.loss.UnbinnedNLL(model=modelHist, data=data) 
        for i in range(0, 30):
            data.resample()
            result1 = minimizer.minimize(nll2, params=[frac1])  
            value_stat = list(result1.error().keys())[0] 
            error_stat = list(result1.error().values())[0] 
            del result1
            frac_value = float(value_stat.value()) 
            frac_up_error = float(error_stat['upper']) 
            frac_low_error = float(error_stat['lower']) 
            if(frac_value - 0.5 > 0): 
                frac_error = frac_up_error 
            else: 
                frac_error = abs(frac_low_error) 
            final_value = (0.5-frac_value)/(frac_error) 
            res.append(final_value)
         
        (mu_ar, sigma_ar) = norm.fit(res)
        mean_bias.append(mu_ar)
    (mu_si, sigma_si) = norm.fit(mean_bias)

    arr[c, 0] = n_sim
    arr[c, 1] = mu_si
    arr[c, 2] = sigma_si
    c = c + 1
arr

In [653]:
%%timeit for _ in range(2): True
np.sum(np.ones((100,100,100,100, 2)) * np.ones((100,100,100,100, 2)), axis=0)

1.92 s ± 7.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [709]:
%%timeit for _ in range(2): True
tnp.sum(tnp.array(np.ones((100,100,100,100, 2))) * tnp.array(np.ones((100,100,100,100, 2))), axis=0)

SyntaxError: invalid syntax (<ipython-input-709-6f613f3da4fa>, line 3)

In [900]:
#tnp version!!!
import scipy
import tensorflow.experimental.numpy as tnp  # TODO 1: replace (possible) np calls with tnp
from tensorflow_probability.python.math.root_search import find_root_chandrupatla
class FractionFitterV3:

    def __init__(self, data_hist, sim_hists, P):
        self.data_hist = data_hist
        self.P = np.array(P)  # vectorization 3
        self.sim_hists = [hist for hist in sim_hists]
        self.d = np.array(self.data_hist[0]) # where d[i] amount of events in bin from data
        self.N_D = np.sum(self.d)#all observable data amount

        # vectorization 3
        self.N = np.array([np.sum(h[0]) for h in sim_hists])# amount of simulation data from sources e.g. N[0] from source 0 .. N[j] from source j
        self.sources_num = len(P)
        self.bins_num = len(data_hist[0])
        self.p = self.N_D * self.P / self.N
        #a[j][i] amount of observations in i bin from j source
        self.a = tf.constant([self.sim_hists[j][0] for j in range(self.sources_num)], dtype="int32")
        #self.a = tnp.array([self.sim_hists[j][0] for j in range(self.sources_num)], dtype="int64")
        self.nonzero_indices = np.where(self.d != 0)[0]
        self.nonzero_indices_tf = np.array([[i] for i in self.nonzero_indices])
        zfit.run.set_autograd_mode(False)
        zfit.run.set_graph_mode(False)
        
    def norma(self, v):
        return math.sqrt(sum(v ** 2))
    #function to minimize for finding optimat t according to (15) from the paper        
    
    def f_vectorized(self, t, p):  # add an axis argumend to sum
        term1 = tnp.sum(p[:, None]* tf.gather(self.a, self.nonzero_indices, axis=1) / (1 + p[:, None] * t[None, :]), axis=0)
        term2 = self.d[self.nonzero_indices]/(1 - t)
        return term1 - term2
    
    def jac_f(self, t, p):
            return tnp.diag(tnp.sum((p[:, None] * tf.gather(self.a, self.nonzero_indices, axis=1) * p[:, None])/(1 + p[:, None] * t[None, :])**2, axis=0) + self.d[self.nonzero_indices]/(1 - t)**2)
    
    def sqF(self, p):
        t = tnp.ones_like(self.d, dtype="int32")
        x0 = t[self.nonzero_indices] * 0 
        p = tnp.array(np.asarray(p))
        # TODO 3: replace with https://www.tensorflow.org/probability/api_docs/python/tfp/math/find_root_chandrupatla
        def func_to_minimize(t):
            return self.f_vectorized(t, p)
        t_solved = scipy.optimize.root(self.f_vectorized,
                                       x0=x0, 
                                    #    x0=0.1 * np.ones_like(nonzero_indices), 
                                       args=(p),
                                       jac=self.jac_f,
                                       method='hybr',  # 'krylov',
                                       tol=None,
                                       callback=None,
                                       options={}).x # bounds(-1/max(p), 1)
        t = tf.tensor_scatter_nd_update(t, self.nonzero_indices_tf, t_solved)
#         A = tf.constant(self.a/(1 + p[:, None]*t[None, :]), dtype="float64") 
        A = tnp.maximum(self.a/(1 + p[:, None]*t[None, :]), 0.05)
        
        return tnp.sum(tnp.sum(self.d[None, :] *  A / tnp.sum(p[None, :] * A.transpose(), axis=1) - A, axis=1)**2, axis=0)
    sqF.errordef = 0.5
    
    def fit(self, eps):
        minimizer = zfit.minimize.Minuit(tol=eps)  # 2: 
        p_new = np.array(minimizer.minimize(self.sqF, self.p).params)   
        print(np.abs(self.norma(p_new) - self.norma(self.p)))
        return p_new

In [901]:
fitter = FractionFitterV3(data_hist=data_hist, sim_hists=sim_hists, P=[0.4, 0.6])

In [902]:
# %%timeit for _ in range(2): True
p = fitter.fit(1e-4)

0.02656691298060676


In [905]:
#tnp version!!!
import scipy
import tensorflow.experimental.numpy as tnp 
from tensorflow_probability.python.math.root_search import find_root_chandrupatla
class FractionFitterV4:

    def __init__(self, data_hist, sim_hists, P):
        self.data_hist = data_hist
        self.P = np.array(P)  # vectorization 3
        self.sim_hists = [hist for hist in sim_hists]
        self.d = np.array(self.data_hist[0]) # where d[i] amount of events in bin from data
        self.N_D = np.sum(self.d)#all observable data amount

        # vectorization 3
        self.N = np.array([np.sum(h[0]) for h in sim_hists])# amount of simulation data from sources e.g. N[0] from source 0 .. N[j] from source j
        self.sources_num = len(P)
        self.bins_num = len(data_hist[0])
        self.p = self.N_D * self.P / self.N
        #a[j][i] amount of observations in i bin from j source
        self.a = tf.constant([self.sim_hists[j][0] for j in range(self.sources_num)], dtype="float64")
        #self.a = tnp.array([self.sim_hists[j][0] for j in range(self.sources_num)], dtype="int64")
        self.nonzero_indices = np.where(self.d != 0)[0]
        self.nonzero_indices_tf = np.array([[i] for i in self.nonzero_indices])
        zfit.run.set_autograd_mode(False)
        zfit.run.set_graph_mode(False)
        
    def norma(self, v):
        return math.sqrt(sum(v ** 2))
    #function to minimize for finding optimal t according to (15) from the paper        
    
    def f_vectorized(self, t, p):  # add an axis argumend to sum
        term1 = tnp.sum(p[:, None]* tf.gather(self.a, self.nonzero_indices, axis=1) / (1 + p[:, None] * t[None, :]), axis=0)
        term2 = self.d[self.nonzero_indices]/(1 - t)
        return term1 - term2
    
    def jac_f(self, t, p):
            return tnp.diag(tnp.sum((p[:, None] * tf.gather(self.a, self.nonzero_indices, axis=1) * p[:, None])/(1 + p[:, None] * t[None, :])**2, axis=0) + self.d[self.nonzero_indices]/(1 - t)**2)
    
    def sqF(self, p):
        t = tnp.ones_like(self.d, dtype="float64")
        x0 = t[self.nonzero_indices] * 0 
        p = tnp.array(np.asarray(p))
        high_bound = tnp.ones_like(t)
        low_bound = high_bound * (-1/tnp.max(p))
        # TODO 3: replace with https://www.tensorflow.org/probability/api_docs/python/tfp/math/find_root_chandrupatla
        def func_to_minimize(t):
            return self.f_vectorized(t, p)
        t_solved = find_root_chandrupatla(func_to_minimize, low_bound, high_bound).estimated_root
        t = tf.tensor_scatter_nd_update(t, self.nonzero_indices_tf, t_solved)
#         A = tf.constant(self.a/(1 + p[:, None]*t[None, :]), dtype="float64") 
        A = tnp.maximum(self.a/(1 + p[:, None]*t[None, :]), 0.05)
        
        return tnp.sum(tnp.sum(self.d[None, :] *  A / tnp.sum(p[None, :] * A.transpose(), axis=1) - A, axis=1)**2, axis=0)
    sqF.errordef = 0.5
    
    def fit(self, eps):
        minimizer = zfit.minimize.Minuit(tol=eps)  # 2: 
        p_new = np.array(minimizer.minimize(self.sqF, self.p).params)   
        print(np.abs(self.norma(p_new) - self.norma(self.p)))
        return p_new

In [906]:
fitter = FractionFitterV4(data_hist=data_hist, sim_hists=sim_hists, P=[0.4, 0.6])
p = fitter.fit(1e-4)

0.02652955209427721
