In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
import datetime
import gc
import warnings
import time
import csv
# Silence all warnings for the rest of the session.
# NOTE(review): this presumably also hides NumPy runtime warnings (e.g. log of
# zero in the likelihood below) — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.ticker as mticker
import pylandau
from pylandau import langau
from importlib import reload
from scipy.optimize import curve_fit, fsolve
from scipy.integrate import quad

In [3]:
# CSV holding the langau fit results; judging by the displayed table there is
# one row per energy bin with columns mpv, eta, sigma, A, e_min, e_max.
fit_data_loc = r'./data/stat_fit_data_full.csv'
# Alternative input file, kept for switching by hand:
# fit_data_loc = r'./data/fit_data_full.csv'
fitdata = pd.read_csv(fit_data_loc)
# Show the full table so the fit parameters can be inspected by eye.
display(fitdata)

Unnamed: 0,mpv,eta,sigma,A,e_min,e_max
0,1.803007,0.093791,0.119056,46997.004267,0.100015,0.775131
1,1.798057,0.089401,0.119167,47823.332739,0.775131,1.051753
2,1.802907,0.088229,0.11979,48442.106488,1.051753,1.329333
3,1.815841,0.090575,0.118259,47281.687815,1.329333,1.61618
4,1.827044,0.089453,0.119206,46258.901199,1.61618,1.922888
5,1.840702,0.088889,0.123704,46919.792355,1.922888,2.261627
6,1.846874,0.089786,0.119922,46861.289391,2.261627,2.633586
7,1.858266,0.091372,0.124476,45887.131602,2.633586,3.050361
8,1.861948,0.090268,0.120616,46809.40842,3.050361,3.529028
9,1.869164,0.091751,0.122918,45821.256348,3.529028,4.081656


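The cell below works around a normalization issue in <code>pylandau.langau_pdf</code> by defining a local <code>langau_pdf</code>, which multiplies the value of <code>pylandau.get_langau_pdf</code> by <code>eta</code>.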

In [4]:
def langau_pdf(dedx, mpv, eta, sig):
    """Return the rescaled langau density evaluated at ``dedx``.

    The value of ``pylandau.get_langau_pdf(dedx, mpv, eta, sig)`` is multiplied
    by ``eta``; the notebook text describes this as a normalization fix.

    Parameters
    ----------
    dedx : value at which the density is evaluated.
    mpv, eta, sig : shape parameters, forwarded unchanged and in this order
        to ``pylandau.get_langau_pdf``.

    Returns
    -------
    ``eta`` times the density reported by pylandau.
    """
    raw_density = pylandau.get_langau_pdf(dedx, mpv, eta, sig)
    return eta * raw_density

In [66]:
# Project-local helper module; the CER object is used further down to generate
# energy-loss samples for each muon.
import cer_util
cer = cer_util.CER()
# presumably reads the muon sample and then discards unusable entries (the cell
# output reports "Removed 5661 muons.") — confirm against cer_util
cer.load_muons()
cer.slim_muons() 

Loading Data...
Loaded!
Sorting into array of muons...
Done!
Removed 5661 muons.


The likelihood will only depend on the dedxs and the landau fit parameters. The true energies are calculated for reference. For a given dedx ($x_i$) the probability that it corresponds to a measurement from energy bin $j$ is given by:
$$p_j(x_i)=\frac{f_j(x_i)}{\sum_{k}f_k(x_i)}$$
where $f_j$ is the langau pdf associated with energy bin $j$ and the denominator is the sum of the langau pdfs at $x_i$ over all energy bins. **Assuming the $x_i$ are independent (which is wrong)**, the likelihood that all the data $x_i$ correspond to energy bin $j$ is:
$$\mathscr{L}_j=\prod_i\frac{f_j(x_i)}{\sum_k f_k(x_i)}$$
We use the log-likelihood:
$$\log\mathscr{L}_j=\sum_i\left[\log f_j(x_i)-\log \sum_k f_k(x_i)\right]$$
The independence assumption is incorrect: in principle, successive $x_i$ should follow the Bethe-Bloch curve, since each subsequent $x_i$ is an energy loss from a muon of slightly lower kinetic energy. Corrections for this effect will be implemented later. Maximizing the likelihood then amounts to selecting the energy bin $j$ with the highest $\mathscr{L}_j$. Note that the term $\log\sum_k f_k(x_i)$ does not depend on $j$, so it shifts every $\log\mathscr{L}_j$ by the same amount and does not change which bin is selected.

In [67]:
def like_max(dedxs):
    """Select the energy bin whose langau pdf best describes ``dedxs``.

    For every row ``j`` of the module-level ``fitdata`` table the binned
    log-likelihood

        log L_j = sum_i [ log f_j(x_i) - log sum_k f_k(x_i) ]

    is computed, where ``f_j`` is ``langau_pdf`` evaluated with that row's
    ``mpv``, ``eta`` and ``sigma``.

    Parameters
    ----------
    dedxs : iterable of dE/dx values (the ``x_i``) for one muon.

    Returns
    -------
    (e_min, e_max, loglike) : the ``e_min`` and ``e_max`` entries of the row
        with the largest log-likelihood, and the array of log-likelihoods for
        all rows (one entry per row of ``fitdata``).

    Notes
    -----
    A density of exactly zero yields ``-inf`` (or ``nan`` when every bin is
    zero at some ``x_i``); ``np.argmax`` then returns the first ``nan`` index.
    """
    # Select the shape parameters by column name rather than by position, so the
    # result does not depend on column order or on a leading unnamed index
    # column that read_csv may have materialised.
    landau_params = fitdata[['mpv', 'eta', 'sigma']].to_numpy()

    # Materialise once: the values are iterated for every bin.
    xs = list(dedxs)

    # pdf[j, i] = f_j(x_i). Each density is evaluated exactly once; the
    # normalisation over bins is then a column sum instead of being recomputed
    # for every candidate bin j.
    pdf = np.array(
        [[langau_pdf(xi, *fj_params) for xi in xs] for fj_params in landau_params],
        dtype=float,
    ).reshape(len(landau_params), len(xs))

    log_norm = np.log(pdf.sum(axis=0))               # log sum_k f_k(x_i), per x_i
    loglike = (np.log(pdf) - log_norm).sum(axis=1)   # sum over x_i, per bin j

    jtilde = np.argmax(loglike)
    best_bin = fitdata.iloc[jtilde]
    return best_bin['e_min'], best_bin['e_max'], loglike

In [68]:
def reconstruct_e(muon):
    """Reconstruct the energy bin of one muon from generated energy losses.

    ``cer.generate_eloss(muon)`` yields a pair; only its second element (the
    dE/dx values) is used and passed to ``like_max``.

    Returns
    -------
    The ``(e_min, e_max, loglike)`` triple produced by ``like_max``.
    """
    _, dedxs = cer.generate_eloss(muon)
    return like_max(dedxs)

In [73]:
# Re-import cer_util (e.g. after editing the module) and rebuild the helper.
# NOTE(review): the new `cer` is not re-populated with load_muons()/slim_muons()
# in this cell — confirm generate_eloss does not depend on them.
reload(cer_util)
cer = cer_util.CER()

In [75]:
# Reconstruct the energy bin of every muon and record it next to the true energy.
# NOTE(review): `muons` is not assigned in any cell visible in this notebook and
# the execution counts are out of order — confirm where it comes from so that
# Restart & Run All works.
truth = []          # true (backtracked) energy of each muon
reconstructed = []  # (e_min, e_max) of the bin chosen for each muon
loglikes = []       # per-bin log-likelihood array of each muon
p_count = 0         # muons processed so far

tot_particles = len(muons)
last_pcnt_shown = -1

print("Generating elosses and reconstructing energy...")
start = time.perf_counter()
for muon in muons:
    # Integer percentage of muons already processed. Computing it directly from
    # the counter (instead of accumulating a float threshold) keeps the display
    # correct for samples smaller than 100 and cannot divide by zero, because
    # the loop body only runs when there is at least one muon.
    pcnt_done = (100 * p_count) // tot_particles
    if pcnt_done != last_pcnt_shown:
        print(f"{pcnt_done}%   ", end = '\r', flush=True)
        last_pcnt_shown = pcnt_done

    p_count += 1
    e_min, e_max, loglike = reconstruct_e(muon)

    true_e = muon['backtracked_e']
    truth.append(true_e)

    guess_e = (e_min, e_max)
    reconstructed.append(guess_e)
    loglikes.append(loglike)

end = time.perf_counter()
t = end-start
print(f"Done! Analysis time: {int(t//60)}m {t%60:0.1f}s")

Generating elosses and reconstructing energy...
Done! Analysis time: 2m 25.8s


In [111]:
# One record per muon: the true energy, the edges of the reconstructed bin, and
# the log-likelihood of every bin under the keys L0, L1, ...
like_data_dict = [
    {
        'truth': truth[idx],
        'reconstructed_min': reconstructed[idx][0],
        'reconstructed_max': reconstructed[idx][1],
        **{f'L{bin_idx}': bin_like for bin_idx, bin_like in enumerate(loglikes[idx])},
    }
    for idx in range(len(truth))
]

like_data = pd.DataFrame(like_data_dict)

In [None]:
# Write the per-muon likelihood table to disk, with a header row and without
# the DataFrame index.
like_data.to_csv(r'./data/stat_binned_likelihood_data.csv', index=False, header=True)

## Notes
- Obviously, this likelihood is pretty much useless, although there are further things to do to hopefully make it work better.
     - Could use langau fits binned by statistics, rather than binned logarithmically. The likelihood can be sensitive to uncertainties, so having each bin have the same statistics might fix this. This is borne out by the fact that the likelihood … (**TODO:** this sentence was left unfinished.)
     - Could implement an improved likelihood function, which accounts for the loss of energy each step, although I am not quite sure how to do this.
     - Adding another dimension to the likelihood in the form of the pitch may also help to fix this, although this is a large project.

In [95]:
# Count the muons whose true energy lies strictly inside the reconstructed bin,
# then print the count, the sample size and the success rate in percent.
correct = sum(
    1
    for idx, true_energy in enumerate(truth)
    if reconstructed[idx][0] < true_energy < reconstructed[idx][1]
)
print(correct)
print(len(truth))
print(correct/len(truth)*100)

4
200
2.0


In [97]:
# Split the (e_min, e_max) pairs into two parallel tuples:
# a holds the lower bin edges, b the upper bin edges.
a,b = zip(*reconstructed)

In [105]:
# Sanity check: number of per-muon log-likelihood arrays collected.
len(loglikes)

200