In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [93]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib as mpl

mpl.rc("text", usetex=True)
mpl.rcParams["text.latex.preamble"] = (
    r'\usepackage{helvet}'
    r'\usepackage{sansmath}'
    r'\sansmath'
)
# P_3b: Gaussian distribution N(0, 1)
# P_4b: Gaussian distribution N(0.1, 1) 
# Signal: Gaussian distribution N(1, 0.1)


# Generate data
np.random.seed(0)
mu_3b, sigma_3b = 1, 4
mu_4b, sigma_4b = -1, 4
mu_s, sigma_s = 7, 0.4

signal_ratio = 0.05
n_samples = 100000

P_3b_samples = np.random.normal(mu_3b, sigma_3b, n_samples)
B_4b_samples = np.random.normal(mu_4b, sigma_4b, n_samples - int(n_samples * signal_ratio))
signal_samples = np.random.normal(mu_s, sigma_s, int(n_samples * signal_ratio))

# Plot histograms
nbins = 50
all_samples = np.concatenate([P_3b_samples, B_4b_samples, signal_samples])
bins = np.linspace(all_samples.min(), all_samples.max(), nbins)
plt.figure(figsize=(10, 6))
plt.hist(P_3b_samples, bins=bins, histtype='step', label="3b")
plt.hist(B_4b_samples, bins=bins, histtype='step', label="bg4b")
plt.hist(signal_samples, bins=bins, histtype='step', label="signal")
plt.legend()
plt.show()

# pdf of P_3b
x = np.linspace(all_samples.min(), all_samples.max(), 10000)

smear_sigma = 1


pdf_P_3b = stats.norm.pdf(x, mu_3b, sigma_3b)
pdf_B_4b = stats.norm.pdf(x, mu_4b, sigma_4b)
pdf_signal = stats.norm.pdf(x, mu_s, sigma_s)
pdf_P_4b = (1 - signal_ratio) * pdf_B_4b + signal_ratio * pdf_signal

pdf_P_3b_smeared = stats.norm.pdf(x, mu_3b, np.sqrt(sigma_3b**2 + smear_sigma**2))
pdf_B_4b_smeared = stats.norm.pdf(x, mu_4b, np.sqrt(sigma_4b**2 + smear_sigma**2))
pdf_signal_smeared = stats.norm.pdf(x, mu_s, np.sqrt(sigma_s**2 + smear_sigma**2))
pdf_4b_smeared = (1 - signal_ratio) * pdf_B_4b_smeared + signal_ratio * pdf_signal_smeared

grid = GridSpec(6, 2)
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(grid[:3, 0])

ax.plot(x, pdf_P_3b, label="3b")
ax.plot(x, pdf_P_4b, label="4b")
ax2 = plt.gca().twinx()
ax2.plot(x, pdf_P_4b / pdf_P_3b, label="4b / 3b", color='red')
ax.legend(loc="upper right")
ax2.legend(loc="upper left")
ax.set_xticks([])
ax2.set_xticks([])

ax = fig.add_subplot(grid[3:, 0])
ax.plot(x, pdf_P_3b_smeared, label="Smeared 3b")
ax.plot(x, pdf_B_4b_smeared, label="Smeared 4b")
ax2 = plt.gca().twinx()
ax2.plot(x, pdf_B_4b_smeared / pdf_P_3b_smeared, label="Smeared 4b / 3b", color='red')
ax.legend(loc="upper right")
ax2.legend(loc="upper left")

ax = fig.add_subplot(grid[0:2, 1])
ax.plot(x, pdf_P_4b / pdf_P_3b, label="Original 4b / 3b")
ax.legend()
ax.set_xticks([])

ax = fig.add_subplot(grid[2:4, 1])
ax.plot(x, pdf_B_4b_smeared / pdf_P_3b_smeared, label="Smeared 4b / 3b")
ax.legend()
ax.set_xticks([])

ax = fig.add_subplot(grid[4:, 1])
ax.plot(x, (pdf_P_4b / pdf_P_3b) / (pdf_B_4b_smeared / pdf_P_3b_smeared), label="Original / Smeared")
ax.legend()


plt.savefig("smear_toy.pdf", dpi=300)
plt.show()
plt.close()




# # Train a classifier
# # logistic regression

# from sklearn.model_selection import train_test_split

# # Prepare data
# X = np.concatenate([P_3b_samples, B_4b_samples, signal_samples]).reshape(-1, 1)
# y = np.concatenate([np.zeros(len(P_3b_samples)), np.ones(len(B_4b_samples)), np.ones(len(signal_samples))])
# is_signal = np.concatenate([np.zeros(len(P_3b_samples) + len(B_4b_samples)), np.ones(len(signal_samples))])

# X_train, X_test, y_train, y_test, is_signal_train, is_signal_test = train_test_split(X, y, is_signal, test_size=0.5)
# X_train_1, X_train_2, y_train_1, y_train_2, is_signal_train_1, is_signal_train_2 = train_test_split(X_train, y_train, is_signal_train, test_size=0.5)

# # smear X_train_2
# X_train_2 = X_train_2 + np.random.normal(0, smear_sigma, X_train_2.shape)

# # Train a classifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import roc_auc_score

# clf_1 = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=100)
# clf_1.fit(X_train_1, y_train_1)

# clf_2 = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=100)
# clf_2.fit(X_train_2, y_train_2)

# # Evaluate the classifier
# # sort X_test for plotting
# sorted_indices = np.argsort(X_test[:, 0])
# X_test = X_test[sorted_indices]
# y_test = y_test[sorted_indices]
# is_signal_test = is_signal_test[sorted_indices]

# y_pred_1 = clf_1.predict_proba(X_test)[:, 1]
# dr_1 = y_pred_1 / (1 - y_pred_1)
# y_pred_2 = clf_2.predict_proba(X_test)[:, 1]
# dr_2 = y_pred_2 / (1 - y_pred_2)

# dr_ratio = dr_1 / dr_2

# # bins = np.linspace(dr_ratio.min(), dr_ratio.max(), nbins)
# # plt.hist(dr_ratio[(is_signal_test == 0) & (y_test == 1)], bins=bins, histtype='step', label="background")
# # plt.hist(dr_ratio[is_signal_test == 1], bins=bins, histtype='step', label="signal")
# # plt.legend()
# # plt.show()

# # bins = np.linspace(dr_1.min(), dr_1.max(), nbins)
# # plt.hist(dr_1[(is_signal_test == 0) & (y_test == 1)], bins=bins, histtype='step', label="background")
# # plt.hist(dr_1[is_signal_test == 1], bins=bins, histtype='step', label="signal")
# # plt.legend()
# # plt.show()

# fig, ax = plt.subplots(3, 1, figsize=(5, 6))
# xlim = (-10, 10)
# ax[0].set_xlim(xlim)
# ax[0].set_ylim((0, 4))
# ax[1].set_xlim(xlim)
# ax[1].set_ylim((0, 4))
# ax[2].set_xlim(xlim)
# ax[0].plot(X_test, dr_1, label="Original density ratio", color='blue')
# ax[1].plot(X_test, dr_2, label="Smeared density ratio", color='green')
# ax[2].plot(X_test, dr_1 / dr_2, label="Original / Smeared", color='red')
# for i in range(3):
#     ax[i].legend()
# plt.savefig("smear_toy.pdf", dpi=300)
# plt.show()


RuntimeError: Failed to process string with tex because latex could not be found

<Figure size 1000x600 with 1 Axes>

RuntimeError: Failed to process string with tex because latex could not be found

Error in callback <function _draw_all_if_interactive at 0x7fef2119ac00> (for post_execute), with arguments args (),kwargs {}:


RuntimeError: Failed to process string with tex because latex could not be found

RuntimeError: Failed to process string with tex because latex could not be found

<Figure size 1000x600 with 7 Axes>