# Uncertainty estimation in deep learning-based classifiers of High Energy Physics events using Monte Carlo Dropout.
-----
## Higgs Dataset

R. Pezoa (UV, CCTVal-USM), S. Bórquez(USM), W. Brooks (USM), L. Salinas (USM), C. Torres (USM)

## Libraries

In [1]:
import time

import os
import tqdm
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Ask TensorFlow for the visible GPUs once and report how many were found.
available_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(available_gpus))

Num GPUs Available:  1


2022-09-19 18:22:48.848551: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-19 18:22:48.852976: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-19 18:22:48.853299: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
# Restrict TensorFlow to a single virtual device with memory_limit=7120 on the
# first physical GPU, when at least one GPU is present.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    first_gpu = gpus[0]
    try:
        capped_device = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=7120)
        tf.config.experimental.set_virtual_device_configuration(first_gpu, [capped_device])
    except RuntimeError as config_error:
        # Report the configuration problem without aborting the notebook.
        print(config_error)

In [3]:
# Make tf.function-decorated code run eagerly instead of as traced graphs.
# NOTE(review): this is a process-wide switch and typically slows execution;
# confirm that a later cell still needs it before keeping it enabled.
tf.config.run_functions_eagerly(True)

## Data
-----
Higgs dataset

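- Data is obtained from: https://www.openml.org/d/23512 (note: the loading cell below reads the file `HIGGS.csv`; the train/test sizes printed later add up to 11,000,000 events).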
- Each event is represented by a set of 28 features, including 21 low-level features corresponding to physics properties measured by the detector, and 7 high-level features derived from the previous ones.

In [4]:
# Input CSV with one event per row. An alternative file was used previously:
# "/mnt/storage-large/dataset/higgs/phpZLgL9q.csv"
data_path = "/mnt/storage-large/dataset/higgs/HIGGS.csv"

# Column names passed to the CSV reader: the class label first, then the
# 28 feature columns in file order.
name_columns = [
    "class",
    "lepton_pT", "lepton_eta", "lepton_phi",
    "missing_energy_magnitude", "missing_energy_phi",
    "jet1pt", "jet1eta", "jet1phi", "jet1b-tag",
    "jet2pt", "jet2eta", "jet2phi", "jet2b-tag",
    "jet3pt", "jet3eta", "jet3phi", "jet3b-tag",
    "jet4pt", "jet4eta", "jet4phi", "jet4b-tag",
    "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "m_wbb", "m_wwbb",
]

In [5]:
# Seed shared by the train/test split and the dataset shuffling further down.
seed_ = 420

# Load the CSV (column names supplied explicitly) and call the target column 'label'.
df = pd.read_csv(data_path, names=name_columns).rename(columns={'class': 'label'})

# Target vector and the ordered list of feature column names (everything after the label).
y = df["label"]
features_names = list(df.columns[1:])

# Standardise every feature column.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# performed in a later cell, so test-set statistics influence the scaling.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.iloc[:, 1:])

# The DataFrame is no longer needed once the scaled array exists.
del df

# From here on X is the scaled NumPy array; rows are addressed through dataset_indices.
X = scaled_data
dataset_indices = np.arange(len(X))

In [6]:
# Split the row indices into training and testing subsets (0.5% held out for testing).
idx_train, idx_test = train_test_split(
    dataset_indices,
    test_size=0.005,
    shuffle=True,
    random_state=seed_,
)

In [7]:
# Report how many examples ended up in each split.
print(f"# X_train: {idx_train.shape[0]}")
print(f"# X_test: {idx_test.shape[0]}")

# X_train: 10945000
# X_test: 55000


## Save dataset in TF records

In [9]:
def serialize_example(idx):
    """
    Serialize one dataset row as a ``tf.train.Example`` byte string.

    The row is looked up in the notebook-level ``X`` (feature matrix) and ``y``
    (labels); each feature is stored under its name from ``features_names`` and
    the target under the key ``'label'``, all as single-element float lists.

    Args:
        idx: Index of the row to serialize. May be a scalar tensor (as produced
            by iterating a ``tf.data.Dataset`` of indices) or a plain
            Python/NumPy integer.

    Returns:
        The serialized ``tf.train.Example`` message (bytes), ready to be passed
        to a TFRecord writer.
    """
    # Tensors expose .numpy(); plain integers are used as they are.
    if hasattr(idx, "numpy"):
        idx = idx.numpy()
    example_X = X[idx]
    example_y = y[idx]

    # Map each column name to a tf.train.Example-compatible float feature.
    features = {
        'label': tf.train.Feature(float_list=tf.train.FloatList(value=[example_y]))
    }
    for feature_name, value in zip(features_names, example_X):
        features[feature_name] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    # Wrap the features in an Example message and serialize it.
    example_proto = tf.train.Example(features=tf.train.Features(feature=features))
    return example_proto.SerializeToString()


In [10]:
# Wrap each index array in a tf.data.Dataset so the writer can iterate over it.
train_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(split_indices)
    for split_indices in (idx_train, idx_test)
)

2022-09-19 18:24:05.222900: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-19 18:24:05.229816: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-19 18:24:05.230289: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-19 18:24:05.230588: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [11]:
def write_tf_record(dataset, dataset_size, dataset_path, tf_filename, tf_record_size = 500_000):
    """
    Write a dataset of row indices to sharded TFRecord files.

    Every element of ``dataset`` is passed to ``serialize_example`` and the
    result is appended to the current shard. A new shard named
    ``{tf_filename}_{NNN}.tfrecord`` (NNN starting at 000) is started every
    ``tf_record_size`` examples.

    Args:
        dataset: Iterable of row indices accepted by ``serialize_example``.
        dataset_size: Number of elements, used only for the progress bar total.
        dataset_path: Directory receiving the shards; created if it is missing.
        tf_filename: File-name prefix of the shards.
        tf_record_size: Maximum number of examples per shard (must be >= 1).

    Raises:
        ValueError: If ``tf_record_size`` is smaller than 1.
    """
    if tf_record_size < 1:
        raise ValueError(f"tf_record_size must be >= 1, got {tf_record_size}")

    # TFRecordWriter does not create missing directories itself.
    tf.io.gfile.makedirs(dataset_path)

    writer = None
    try:
        for n, idx in enumerate(tqdm.tqdm(dataset, total=dataset_size)):
            tf_file_id, position_in_shard = divmod(n, tf_record_size)
            if position_in_shard == 0:
                # Shard boundary: finish the previous file before opening the next one.
                if writer is not None:
                    writer.close()
                    writer = None
                writer = tf.io.TFRecordWriter(
                    os.path.join(dataset_path, f'{tf_filename}_{tf_file_id:03}.tfrecord')
                )
            writer.write(serialize_example(idx))
    finally:
        # Close the last shard even if serialization failed; an empty dataset
        # never opens a writer, so there may be nothing to close.
        if writer is not None:
            writer.close()

In [12]:
# Serialize the training split with the default shard size of write_tf_record.
write_tf_record(
    train_dataset,
    len(idx_train),
    "/mnt/storage-large/dataset/higgs/higgs_tfrecords/train",
    'train',
)

  0%|                                                                                                                                        | 0/10945000 [00:00<?, ?it/s]2022-09-19 18:24:06.637693: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 87560000 exceeds 10% of free system memory.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10945000/10945000 [43:04<00:00, 4234.89it/s]


In [15]:
# Serialize the test split, using smaller shards of 10,000 examples each.
write_tf_record(
    test_dataset,
    len(idx_test),
    "/mnt/storage-large/dataset/higgs/higgs_tfrecords/test",
    'test',
    tf_record_size=10_000,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55000/55000 [00:13<00:00, 4193.95it/s]


## Load tf records

In [16]:
# Collect every test shard and open them as one dataset of serialized examples.
test_shard_pattern = "/mnt/storage-large/dataset/higgs/higgs_tfrecords/test/*.tfrecord"
filenames = tf.io.gfile.glob(test_shard_pattern)
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset



<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [65]:
from functools import partial

# Let tf.data choose parallelism and prefetch buffer sizes dynamically.
autotune_ = tf.data.AUTOTUNE

# Names of the 28 input features, in the order they appear in the feature vector.
feature_names = [
    # lepton and missing energy
    'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi',
    # jets 1-4
    'jet1pt', 'jet1eta', 'jet1phi', 'jet1b-tag',
    'jet2pt', 'jet2eta', 'jet2phi', 'jet2b-tag',
    'jet3pt', 'jet3eta', 'jet3phi', 'jet3b-tag',
    'jet4pt', 'jet4eta', 'jet4phi', 'jet4b-tag',
    # mass features
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]


def _parse_function(example_proto, with_label=True):
    """
    Decode one serialized ``tf.train.Example`` into a feature vector.

    The feature order follows the ``feature_names`` list defined in this cell,
    so parsing does not depend on variables created by the data-preparation
    cells earlier in the notebook.

    Args:
        example_proto: Scalar string tensor holding a serialized example.
        with_label: When True, also decode the ``'label'`` entry.

    Returns:
        ``(features, label)`` when ``with_label`` is True, otherwise the
        one-element tuple ``(features,)``. ``features`` is a float32 tensor with
        one entry per name in ``feature_names``.
    """
    # Every entry is a scalar float; entries missing from a record decode to 0.0.
    feature_description = {
        name: tf.io.FixedLenFeature([], tf.float32, default_value=0.0)
        for name in feature_names
    }
    if with_label:
        feature_description['label'] = tf.io.FixedLenFeature([], tf.float32, default_value=0.0)

    # Parse the input `tf.train.Example` proto using the description above.
    example = tf.io.parse_single_example(example_proto, feature_description)

    # Stack the scalars explicitly into a single 1-D tensor.
    features = tf.stack([example[name] for name in feature_names])
    if with_label:
        return features, example['label']
    return features,

def get_dataset(filenames_template="/mnt/storage-large/dataset/higgs/higgs_tfrecords/test/*.tfrecord", with_label=True, batch_size=32, shuffle_buffer_size=10_000):
    """
    Build a shuffled, batched ``tf.data`` pipeline from TFRecord shards.

    Args:
        filenames_template: Glob pattern matching the TFRecord files to read.
        with_label: When True each element is ``(features, label)``, otherwise
            the one-element tuple ``(features,)``.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer. The original code
            passed the seed in this position, which gave an unseeded shuffle
            with a buffer of 420 elements.

    Returns:
        A ``tf.data.Dataset`` yielding batches of parsed examples.

    Raises:
        FileNotFoundError: If no file matches ``filenames_template``.
    """
    # Sort the matches so the read order does not depend on the glob's file order.
    filenames = sorted(tf.io.gfile.glob(filenames_template))
    if not filenames:
        raise FileNotFoundError(f"No TFRecord files match {filenames_template!r}")

    raw_dataset = tf.data.TFRecordDataset(filenames)
    parser = partial(_parse_function, with_label=with_label)
    dataset = raw_dataset.map(parser, num_parallel_calls=autotune_)
    # The seed must be passed by keyword: the first positional argument of
    # shuffle() is the buffer size.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size, seed=seed_)
    dataset = dataset.batch(batch_size)
    # Prefetch last so that complete batches are prepared ahead of the consumer.
    dataset = dataset.prefetch(buffer_size=autotune_)
    return dataset


In [66]:
# Peek at two labelled batches. The loop variables are deliberately not called
# X / y: those names hold the scaled feature matrix and the labels that
# serialize_example reads, and rebinding them here would break a re-run of the
# TFRecord-writing cells.
parsed_dataset = get_dataset()
for i, (X_batch, y_batch) in enumerate(parsed_dataset.take(2)):
    print(f'{i}) X.shape={X_batch.shape!r}, y.shape={y_batch.shape!r}')

0) X.shape=TensorShape([32, 28]), y.shape=TensorShape([32])
1) X.shape=TensorShape([32, 28]), y.shape=TensorShape([32])


In [68]:
# Peek at two unlabelled batches. The loop variable is deliberately not called
# X, so the notebook-level scaled feature matrix of that name is left untouched.
parsed_dataset = get_dataset(with_label=False)
for i, (X_batch,) in enumerate(parsed_dataset.take(2)):
    print(f'{i}) X.shape={X_batch.shape!r}')


0) X.shape=TensorShape([32, 28])
1) X.shape=TensorShape([32, 28])


