In [1]:
import glob
import os 
import tensorflow as tf
import uproot 
import numpy as np
import awkward as ak

In [28]:
# Hyper-parameters for the two network variants, keyed by network name.
# The 'ch' / 'ne' / 'sv' sub-sections use the same group names as the data feature spec.
model_config = {
    'model': {
        'deep_sets': {
            'type': 'mlp',  # "mlp" or "resnet"
            'activation': 'relu',
            'initializer': 'he_normal',
            'head': {'units': [1024, 512, 256, 128, 64], 'batch_norm': False, 'dropout': 0},
            'ch': {'units': [64, 128, 256], 'batch_norm': True, 'dropout': 0},
            'ne': {'units': [64, 128, 256], 'batch_norm': True, 'dropout': 0},
            'sv': {'units': [32, 64, 128], 'batch_norm': True, 'dropout': 0},
        },
        'particle_net': {
            'activation': 'relu',
            'initializer': 'he_normal',
            'batch_norm': True,
            'shortcut': True,
            'pooling': 'average',  # "average" or "max"
            'ch': {
                'K': 16,
                'channels': [[64, 64, 64], [128, 128, 128], [256, 256, 256]],
            },
            'ne': {
                'K': 16,
                'channels': [[64, 64, 64], [128, 128, 128], [256, 256, 256]],
            },
            'sv': {
                'K': 8,
                'channels': [[32, 32, 32], [64, 64, 64], [128, 128, 128]],
            },
            'dropout': 0,
            'units': [512, 256, 128, 64],
        },
    },
}


In [2]:
# Dataset configuration: batch size, train/validation/test sizes, and the branch names
# used per object type ("jet", "ch", "ne", "sv").
data_features = {
    'data': {
        'batch_size': 1024,
        'train_size': 0.6,
        'validation_size': 0.2,
        'test_size': 0.2,

        'features': {
            'jet': {
                'numerical': ['pt', 'eta', 'phi', 'mass', 'area', 'rho', 'num_pv'],
                'categorical': [],
                'synthetic': ['log_pt', 'mult', 'ptD', 'axis2'],
            },
            'ch': {
                'num_points': 64,
                'numerical': [
                    'ch_pt', 'ch_eta', 'ch_phi',
                    'ch_dxy', 'ch_dxy_significance', 'ch_dz',
                    'ch_num_hits', 'ch_num_pixel_hits', 'ch_lost_hits', 'ch_norm_chi2',
                ],
                'categorical': ['ch_id', 'ch_pv_ass'],
                'synthetic': ['ch_rel_pt', 'ch_rel_eta', 'ch_rel_phi'],
            },
            'ne': {
                'num_points': 64,
                'numerical': ['ne_pt', 'ne_eta', 'ne_phi', 'ne_hcal_frac'],
                'categorical': ['ne_id'],
                'synthetic': ['ne_rel_pt', 'ne_rel_eta', 'ne_rel_phi'],
            },
            'sv': {
                'num_points': 16,
                'numerical': [
                    'sv_mass', 'sv_pt', 'sv_eta', 'sv_phi',
                    'sv_distance', 'sv_significance', 'sv_num_tracks',
                ],
                'categorical': [],
                'synthetic': ['sv_rel_pt', 'sv_rel_eta', 'sv_rel_phi'],
            },
        },

        # One list of values per categorical feature declared above.
        # NOTE(review): presumably the allowed categories - confirm against the transform code.
        'transforms': {
            'categorical': {
                'ch_id': [-211, -13, -11, 11, 13, 211],
                'ch_pv_ass': [0, 1, 4, 5, 6, 7],
                'ne_id': [1, 2, 22, 130],
            },
        },
    },
}


In [3]:
# Collect the ROOT shards. glob.glob returns matches in arbitrary, filesystem-dependent
# order, so sort them to get the same file order on every run and every machine.
root_files = sorted(glob.glob(os.path.join("jec-gnn/data/shards", '*.root')))

In [4]:
root_files

[]

In [5]:
def _retrieve_data(net, path, jet, ch, ne, sv):
    """Load one ROOT file and return ``(data, target)`` as TensorFlow tensors.

    The file is read by ``_retrieve_np_data`` through ``tf.numpy_function``. This
    wrapper names the returned tensors, sets the static shapes that are unknown after
    ``tf.numpy_function``, and appends a trailing axis of size 1 to every feature.

    Args:
        net: Python string selecting the layout, ``'deep_sets'`` or ``'particle_net'``.
        path: Path of the ROOT file, forwarded to ``_retrieve_np_data``.
        jet: Feature spec with ``'numerical'`` and ``'categorical'`` lists of branch names.
        ch, ne, sv: Constituent feature specs with ``'numerical'``, ``'categorical'``
            and ``'num_points'`` entries.

    Returns:
        Tuple ``(data, target)``. ``target`` has shape ``(None,)``. ``data`` maps each
        field name to a tensor: jet fields have shape ``(None, 1)``; constituent fields
        are ragged ``(None, None, 1)`` for ``'deep_sets'`` and dense
        ``(None, num_points, 1)`` for ``'particle_net'``.

    Raises:
        ValueError: If ``net`` is not one of the two supported names. Without this
            check the mismatch only surfaces later, inside ``tf.numpy_function``, as a
            wrong number of returned arrays.
    """
    if net not in ('deep_sets', 'particle_net'):
        raise ValueError(
            f"Unsupported net {net!r}: expected 'deep_sets' or 'particle_net'"
        )

    jet_fields = jet['numerical'] + jet['categorical']
    # Constituent groups, in the order their '<group>_size' columns are returned.
    groups = {'ch': ch, 'ne': ne, 'sv': sv}
    group_fields = {
        name: spec['numerical'] + spec['categorical'] for name, spec in groups.items()
    }

    # All numerical constituent branches first, then all categorical ones, so that the
    # dtypes in Tout line up with the order in which _retrieve_np_data appends arrays.
    pf = {
        'numerical': ch['numerical'] + ne['numerical'] + sv['numerical'],
        'categorical': ch['categorical'] + ne['categorical'] + sv['categorical']
    }
    pf_fields = pf['numerical'] + pf['categorical']

    fields = ['target'] + jet_fields + pf_fields
    inp = [
        net, path, jet['numerical'], jet['categorical'],
        pf['numerical'], pf['categorical'],
        ch['num_points'], ne['num_points'], sv['num_points']
    ]
    Tout = (
        [tf.float32] +
        [tf.float32] * len(jet['numerical']) +
        [tf.int32] * len(jet['categorical']) +
        [tf.float32] * len(pf['numerical']) +
        [tf.int32] * len(pf['categorical'])
    )

    if net == 'deep_sets':
        # Flattened constituents need the per-jet multiplicities to be split back into rows.
        Tout.extend([tf.int32, tf.int32, tf.int32])
        fields.extend(['ch_size', 'ne_size', 'sv_size'])

    outputs = tf.numpy_function(_retrieve_np_data, inp=inp, Tout=Tout)
    data = dict(zip(fields, outputs))

    target = data.pop('target')
    target.set_shape((None,))

    for field in jet_fields:
        # Shape from <unknown> to (None,)
        data[field].set_shape((None,))
        # Shape from (None,) to (None, 1)
        data[field] = tf.expand_dims(data[field], axis=1)

    for name, spec in groups.items():
        if net == 'deep_sets':
            row_lengths = data.pop(f'{name}_size')
            row_lengths.set_shape((None,))

        for field in group_fields[name]:
            if net == 'deep_sets':
                # Shape from <unknown> to (None,)
                data[field].set_shape((None,))
                # Shape from (None,) to ragged (None, None)
                data[field] = tf.RaggedTensor.from_row_lengths(
                    data[field], row_lengths=row_lengths
                )
            else:
                # Shape from <unknown> to (None, P)
                data[field].set_shape((None, spec['num_points']))
            # Append the trailing feature axis: (None, *, 1)
            data[field] = tf.expand_dims(data[field], axis=2)

    return (data, target)


def _retrieve_np_data(
        net, path, global_numerical, global_categorical,
        constituent_numerical, constituent_categorical,
        ch_num_points, ne_num_points, sv_num_points
    ):
    """Read the ``Jets`` tree of one ROOT file into a flat list of NumPy arrays.

    Written to be called through ``tf.numpy_function``: string arguments arrive as
    bytes and are decoded before use.

    Args:
        net: Bytes, ``b'deep_sets'`` or ``b'particle_net'``.
        path: Bytes path of the ROOT file.
        global_numerical, global_categorical: Sequences of bytes branch names read
            once per jet.
        constituent_numerical, constituent_categorical: Sequences of bytes branch names
            read per constituent. The first two characters of each name ("ch", "ne",
            "sv") select the padding length in the ``'particle_net'`` layout.
        ch_num_points, ne_num_points, sv_num_points: Padding/clipping length per
            constituent type (used for ``'particle_net'`` only).

    Returns:
        List of arrays in this order: target ``log(pt_gen / pt)`` (float32), global
        numerical (float32), global categorical (int32), constituent numerical
        (float32), constituent categorical (int32). For ``'deep_sets'`` the constituent
        arrays are flattened over all jets and ``ch_size``, ``ne_size``, ``sv_size``
        (int32) are appended. For ``'particle_net'`` every constituent array is padded
        with zeros or clipped along axis 1 to the length of its type.
    """
    # Decode bytestrings
    net = net.decode()
    path = path.decode()
    global_numerical = [field.decode() for field in global_numerical]
    global_categorical = [field.decode() for field in global_categorical]
    constituent_numerical = [field.decode() for field in constituent_numerical]
    constituent_categorical = [field.decode() for field in constituent_categorical]
    num_points = {'ch': ch_num_points, 'ne': ne_num_points, 'sv': sv_num_points}

    # Read the whole tree into memory, then release the file handle right away.
    with uproot.open(path) as root_file:
        jets = root_file['Jets'].arrays()

    target = np.log(jets.pt_gen / jets.pt)

    data = [ak.to_numpy(target).astype(np.float32)]

    data.extend(
        ak.to_numpy(jets[field]).astype(np.float32) for field in global_numerical
    )
    # int32, not float32: the caller declares categorical jet features as tf.int32 in
    # Tout, and tf.numpy_function rejects arrays whose dtype differs from Tout.
    data.extend(
        ak.to_numpy(jets[field]).astype(np.int32) for field in global_categorical
    )

    # (branch names, output dtype) pairs; numerical before categorical, as in Tout.
    constituent_groups = (
        (constituent_numerical, np.float32),
        (constituent_categorical, np.int32),
    )

    if net == 'deep_sets':
        for names, dtype in constituent_groups:
            for field in names:
                data.append(ak.to_numpy(ak.flatten(jets[field])).astype(dtype))

        # Per-jet multiplicities, used by the caller to rebuild the ragged rows.
        data.append(ak.to_numpy(jets.ch_size).astype(np.int32))
        data.append(ak.to_numpy(jets.ne_size).astype(np.int32))
        data.append(ak.to_numpy(jets.sv_size).astype(np.int32))

    if net == 'particle_net':
        for names, dtype in constituent_groups:
            for field in names:
                prefix = field[:2]
                none_padded_constituent = ak.pad_none(
                    jets[field], target=num_points[prefix], clip=True, axis=1
                )
                # Missing (padded) entries become 0.
                zero_padded_constituent = ak.to_numpy(none_padded_constituent).filled(0)
                data.append(zero_padded_constituent.astype(dtype))

    return data


In [6]:
# Sample ROOT file explored in the cells below. Set the JEC_GNN_SAMPLE_FILE environment
# variable to use a copy elsewhere; the original location is kept as the fallback.
path = os.environ.get(
    "JEC_GNN_SAMPLE_FILE",
    "/Users/sandeeppradhan/Desktop/ML_DGCNN_PNet_Regression/052.root",
)

In [7]:
# Shortcut to the per-object feature spec (keys: "jet", "ch", "ne", "sv").
features = data_features["data"]["features"]

In [8]:
# One feature spec per object type, unpacked in a fixed order.
jet, ch, ne, sv = (features[group] for group in ("jet", "ch", "ne", "sv"))

In [9]:
jet

{'numerical': ['pt', 'eta', 'phi', 'mass', 'area', 'rho', 'num_pv'],
 'categorical': [],
 'synthetic': ['log_pt', 'mult', 'ptD', 'axis2']}

In [10]:
ch

{'num_points': 64,
 'numerical': ['ch_pt',
  'ch_eta',
  'ch_phi',
  'ch_dxy',
  'ch_dxy_significance',
  'ch_dz',
  'ch_num_hits',
  'ch_num_pixel_hits',
  'ch_lost_hits',
  'ch_norm_chi2'],
 'categorical': ['ch_id', 'ch_pv_ass'],
 'synthetic': ['ch_rel_pt', 'ch_rel_eta', 'ch_rel_phi']}

In [11]:
ne

{'num_points': 64,
 'numerical': ['ne_pt', 'ne_eta', 'ne_phi', 'ne_hcal_frac'],
 'categorical': ['ne_id'],
 'synthetic': ['ne_rel_pt', 'ne_rel_eta', 'ne_rel_phi']}

In [12]:
# Branch names per object type: numerical features first, then categorical ones.
jet_fet, ch_fet, ne_fet, sv_fet = (
    spec['numerical'] + spec['categorical'] for spec in (jet, ch, ne, sv)
)
# Configured number of points per constituent type (used as the padding length below).
n_ch, n_ne, n_sv = (spec["num_points"] for spec in (ch, ne, sv))

# Get the data

In [13]:
# Jet-level branch lists (these are the spec's own list objects, not copies).
global_categorical, global_numerical = jet['categorical'], jet['numerical']
# Constituent branch lists, concatenated in ch -> ne -> sv order.
constituents_numerical = [name for spec in (ch, ne, sv) for name in spec['numerical']]
constituents_categorical = [name for spec in (ch, ne, sv) for name in spec['categorical']]
# Column names in the order the arrays are appended to `data` in the cells below.
fields = [
    "target",
    *global_categorical,
    *global_numerical,
    *constituents_numerical,
    *constituents_categorical,
]

In [14]:
# Open the "Jets" tree of the sample file.
# NOTE(review): no matching close() is visible in this notebook, so the handle
# presumably stays open for the rest of the session.
file_jet = uproot.open(f"{path}:Jets")

In [15]:
# Read the tree with uproot's default array library; used with ak.pad_none further down.
file_array = file_jet.arrays()

In [16]:
# Read all branches of the tree again, this time requesting NumPy output
# (library="np"); the result is indexed by branch name in the cells below.
np_jet_data = file_jet.arrays(file_jet.keys() , library = "np")

In [17]:
# Regression target: log(pt_gen / pt), the same expression used in _retrieve_np_data.
target = np.log(np_jet_data["pt_gen"]/np_jet_data["pt"])

In [18]:
target

array([ 0.01380491,  0.1220466 , -1.0946437 , ..., -0.0716151 ,
        0.3110676 ,  0.2419985 ], dtype=float32)

In [19]:
# Start the output list with the float32 target; the following cells append the
# feature arrays to this list in place, so re-running them appends again.
data =  [ak.to_numpy(target).astype(np.float32)]

In [20]:
# Append the jet-level arrays: categorical branches first, then numerical ones
# (the same order in which `fields` lists their names).
data.extend(np_jet_data[field] for field in global_categorical + global_numerical)


In [21]:
# Pull out the `ch_pt` branch for the padding experiment in the next cell.
ch_pt = file_array["ch_pt"]

In [22]:
# Pad every row of ch_pt with None up to n_ch entries (longer rows are clipped) ...
ch_pt_64_padd_ = ak.pad_none(ch_pt, target=n_ch, clip=True, axis=1)
# ... then convert to NumPy and replace the padded (missing) entries with 0.
ch_pt_64_padd = ak.to_numpy(ch_pt_64_padd_).filled(0)

In [23]:
# Pad/clip every constituent branch to the length configured for its object type.
# The type is read from the two-letter branch prefix ("ch", "ne", "sv"). A fixed
# target of 64 is wrong for the "sv" branches, whose configured num_points is 16.
pad_lengths = {"ch": n_ch, "ne": n_ne, "sv": n_sv}
# Numerical branches first, then categorical ones, matching the order of `fields`.
for field in constituents_numerical + constituents_categorical:
    none_padded_const = ak.pad_none(
        file_array[field], target=pad_lengths[field[:2]], clip=True, axis=1
    )
    # Padded (missing) entries become 0.
    data.append(ak.to_numpy(none_padded_const).filled(0))

In [24]:
data

[array([ 0.01380491,  0.1220466 , -1.0946437 , ..., -0.0716151 ,
         0.3110676 ,  0.2419985 ], dtype=float32),
 array([435.26413 , 354.909   ,  13.580299, ...,  24.200798, 135.93718 ,
        102.89647 ], dtype=float32),
 array([ 0.31911004, -1.1410998 ,  4.6064377 , ..., -1.3200201 ,
         1.0947984 ,  1.4639744 ], dtype=float32),
 array([ 2.5459013 , -0.9286977 , -1.2750071 , ..., -0.45190284,
        -2.3452923 ,  2.4640775 ], dtype=float32),
 array([41.262806 , 36.98716  ,  3.5321884, ...,  6.1180177, 19.011692 ,
        12.154471 ], dtype=float32),
 array([0.47871888, 0.4986655 , 0.5385587 , ..., 0.52858543, 0.52858543,
        0.42885232], dtype=float32),
 array([15.76019  ,  6.3316035, 29.033247 , ..., 28.593819 , 15.2773075,
        20.323658 ], dtype=float32),
 array([15, 14, 29, ..., 23, 19, 27], dtype=int32),
 array([[156.875    ,  81.25     ,  11.8359375, ...,   0.       ,
           0.       ,   0.       ],
        [164.375    ,  35.8125   ,  26.34375  , ...,   0. 

In [25]:
# Pair each column name with its array; zip stops at the shorter of the two sequences.
data_dict = dict(zip(fields, data))

In [26]:
data_dict

{'target': array([ 0.01380491,  0.1220466 , -1.0946437 , ..., -0.0716151 ,
         0.3110676 ,  0.2419985 ], dtype=float32),
 'pt': array([435.26413 , 354.909   ,  13.580299, ...,  24.200798, 135.93718 ,
        102.89647 ], dtype=float32),
 'eta': array([ 0.31911004, -1.1410998 ,  4.6064377 , ..., -1.3200201 ,
         1.0947984 ,  1.4639744 ], dtype=float32),
 'phi': array([ 2.5459013 , -0.9286977 , -1.2750071 , ..., -0.45190284,
        -2.3452923 ,  2.4640775 ], dtype=float32),
 'mass': array([41.262806 , 36.98716  ,  3.5321884, ...,  6.1180177, 19.011692 ,
        12.154471 ], dtype=float32),
 'area': array([0.47871888, 0.4986655 , 0.5385587 , ..., 0.52858543, 0.52858543,
        0.42885232], dtype=float32),
 'rho': array([15.76019  ,  6.3316035, 29.033247 , ..., 28.593819 , 15.2773075,
        20.323658 ], dtype=float32),
 'num_pv': array([15, 14, 29, ..., 23, 19, 27], dtype=int32),
 'ch_pt': array([[156.875    ,  81.25     ,  11.8359375, ...,   0.       ,
           0.       , 

In [27]:
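# Fails as written: `data_dict` is a notebook variable, not an attribute of awkward (`ak`).
# The popped value is also a NumPy array, which has no `set_shape` (a tf.Tensor method).
# target_ = ak.data_dict.pop('target') 
# target_.set_shape((None,))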

AttributeError: module 'awkward' has no attribute 'data_dict'

In [None]:
# import tensorflow as tf

# target = tf.constant([[1, 2, 3, 4],[13, 22, 35, 54]])       # shape = (2, 4)
# target.set_shape((None,3))               # (None, 3) is incompatible with (2, 4); expected to raise ValueError

In [None]:
target

In [None]:
# tf.constant([[1, 2, 3, 4],[13, 22, 35, 54]]) 

In [29]:
from tensorflow.keras import Input

In [31]:
# Symbolic Keras input; with shape=(20, 5) the reported shape is (None, 20, 5).
ch_fts = Input(name='charged_constituents', shape=(20, 5))

In [35]:
ch_fts.shape

(None, 20, 5)

In [38]:
# Same experiment with a single trailing feature: reported shape is (None, 20, 1).
# NOTE(review): reuses the name 'charged_constituents' already given to `ch_fts` in an
# earlier cell; Keras expects unique layer names within one model, so rename one of
# them before using both inputs together.
ch_ft = Input(name='charged_constituents', shape=(20,1))

In [39]:
ch_ft.shape

(None, 20, 1)

In [40]:
# Sketch of a model signature, kept as a bare string literal so it is displayed
# but never executed (none of the names inside it are defined in this notebook).
'''
model = Model(
        inputs=[
            ch_fts, ch_mask, ch_coord_shift, ch_points,
            ne_fts, ne_mask, ne_coord_shift, ne_points,
            sv_fts, sv_mask, sv_coord_shift, sv_points,
            globals
        ], outputs=outputs
    )
'''


'\nmodel = Model(\n        inputs=[\n            ch_fts, ch_mask, ch_coord_shift, ch_points,\n            ne_fts, ne_mask, ne_coord_shift, ne_points,\n            sv_fts, sv_mask, sv_coord_shift, sv_points,\n            globals\n        ], outputs=outputs\n    )\n'