### Adding noise to electron and muon energy signal

The aim of this notebook is to add noise to the energy images of electrons/muons by the addition of both energy and noise numpy arrays

In [1]:
import os, shutil
import matplotlib.pyplot as plt
import numpy as np
import glob

In [2]:
path = '/gpfs/projects/damic/cropped_muons'

Selecting the energy and noise of muons:

In [3]:
muons = glob.glob1(path,"*13.npz") #all the muons

len_mu = len(muons)

all_muon_energy = [np.load(os.path.join(path, muons[_]))['energy'] for _ in range(len_mu)]
all_muon_noise = [np.load(os.path.join(path, muons[_]))['noise'] for _ in range(len_mu)]

In [4]:
all_muon_energy_ = np.dstack(all_muon_energy)
all_muon_energy_ = np.rollaxis(all_muon_energy_,-1)
print(all_muon_energy_.shape)

all_muon_noise_ = np.dstack(all_muon_noise)
all_muon_noise_ = np.rollaxis(all_muon_noise_, -1)
print(all_muon_noise_.shape)

(801, 296, 286)
(801, 296, 286)


In [5]:
#now we add noise to energy images:

muon_noisy_energy = all_muon_energy_+ all_muon_noise_
muon_noisy_energy.shape

(801, 296, 286)

Similary for the electrons:

In [6]:
base_dir = '/gpfs/projects/damic/electrons_padded'

In [7]:
electrons = glob.glob1(base_dir,"*11.npz") #all the muons

elements = len(electrons)

all_electron_energy = [np.load(os.path.join(base_dir, electrons[_]))['energy'] for _ in range(elements)]
all_electron_noise = [np.load(os.path.join(base_dir, electrons[_]))['noise'] for _ in range(elements)]

In [8]:
all_electron_energy_ = np.dstack(all_electron_energy)
all_electron_energy_ = np.rollaxis(all_electron_energy_,-1)
print(all_electron_energy_.shape)

all_electron_noise_ = np.dstack(all_electron_noise)
all_electron_noise_ = np.rollaxis(all_electron_noise_, -1)
print(all_electron_noise_.shape)

(2628, 296, 286)
(2628, 296, 286)


In [9]:
#now we add noise to energy images:

electron_noisy_energy = all_electron_energy_+ all_electron_noise_
electron_noisy_energy.shape

(2628, 296, 286)

We put together the noise:

In [10]:
#putting together all the noise
all_noise = np.concatenate((all_electron_noise_, all_muon_noise_), axis=0) 

Finally, we create the labels:

In [11]:
labels_electron = np.repeat(1, muon_noisy_energy.shape[0])
labels_muon = np.repeat(2, electron_noisy_energy.shape[0])
labels_noise = np.repeat(0, all_noise.shape[0])

We insert everything into `X` and `y`:

In [12]:
X = np.concatenate((electron_noisy_energy, muon_noisy_energy, all_noise), axis=0) #energy followed by noise
y = np.concatenate((labels_electron, labels_muon, labels_noise), axis=0) #energy labels followed by noise labels

In [13]:
print("%f gb of data+labels" % ((X.size * X.itemsize + y.size * y.itemsize) *10**(-9)))

4.644622 gb of data+labels


In [14]:
np.savez_compressed('/gpfs/projects/damic/eVSmuVSn_noisy1', data=X, labels=y)

Now we proceed to **balance** the data:

In [1]:
import imblearn
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

Using TensorFlow backend.


In [3]:
import numpy as np

In [4]:
loaded = np.load('/gpfs/projects/damic/eVSmuVSn_noisy1.npz')
X = loaded['data']
y = loaded['labels']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Let's see how many samples are available in the train datset

In [7]:
print(Counter(y_train))

Counter({0: 2560, 2: 1977, 1: 606})


In [8]:
aux_X = X_train.reshape((X_train.shape[0], X_train.shape[1]* X_train.shape[2]))

In [9]:
#undersample the noise to aprox the sample of electrons

under = RandomUnderSampler(sampling_strategy={0: 1977} )

X_under, y_under = under.fit_resample(aux_X, y_train)

print(Counter(y_under))

Counter({0: 1977, 2: 1977, 1: 606})


In [10]:
oversample = RandomOverSampler(sampling_strategy='minority')

X_over, y_over = oversample.fit_resample(X_under, y_under)

print(Counter(y_over))

Counter({0: 1977, 1: 1977, 2: 1977})


We can observe how the number of samples have been increased

In [11]:
X_over.shape, y_over.shape

((5931, 84656), (5931,))

In [12]:
X_train.shape, y_train.shape

((5143, 296, 286), (5143,))

Finally we reshape the X_train

In [13]:
X_train = X_over.reshape((X_over.shape[0], X_train.shape[1], X_train.shape[2]))

In [14]:
X_train.shape

(5931, 296, 286)

In [15]:
y_train = y_over

We have the same amount of sample of each class! Now we save these arrays (for training and test)

In [16]:
np.savez_compressed('/gpfs/projects/damic/eVSmuVSn_noisytr1', data=X_train, labels=y_train)

In [17]:
np.savez_compressed('/gpfs/projects/damic/eVSmuVSn_noisyte1', data_test=X_test, labels_test=y_test)