# Can Max learn the new dataset format?

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import uproot as ur

In [3]:
file_path = '/Users/swiatlow/Data/caloml/graph_data/user.angerami.24559744.OutputStream._000225.root'


In [4]:
file = ur.open(file_path)

In [5]:
tree = file['EventTree']

In [6]:
cell_energy = tree['cluster_cell_E'].array()

In [7]:
import awkward as ak

In [8]:
# get rid of the first index, the event. now we will be indexed first by cluster. no event concept!
cell_energy_flat = ak.flatten(cell_energy, axis = 0)

In [9]:
# now, remove the 'bonus' empty index at the end. this contains no information, weird uproot feature
cell_energy_flat_flat = ak.flatten(cell_energy_flat,axis=2)

In [10]:
# axis 0 is the clusters, axis 1 is the cells. this pads the cells up to length 1253 with the 'None' Value
# Why 1253? I had earlier found that was the max cells of a cluster in this file. your mileage may vary
cell_energy_flat_flat_pad = ak.pad_none(cell_energy_flat_flat, 1253,axis=1)

In [11]:
import numpy as np

In [12]:
# we do a deep copy when we convert to numpy. Otherwise the data is not modifiable, which we need for the next step
numpy_version = np.copy(cell_energy_flat_flat_pad.to_numpy())

In [13]:
# The -1 value in a 'masked' array is 'Masked' value. Here, we just set it to 0. This sets all the Nones to 0.
numpy_version[-1] = 0

In [14]:
# Check the shape-- and it's a nice, well defined 20k x 1253 array
numpy_version.shape

(20000, 1253)

In [15]:
# print the first cluster...
numpy_version[0]

array([0.24690098, 0.00604178, 0.18410762, ..., 0.        , 0.        ,
       0.        ])

In [16]:
# print the length of the cluster...
len(numpy_version[0])

1253

In [17]:
# everything seems to work!

In [18]:
numpy_version

array([[0.24690098, 0.00604178, 0.18410762, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [2.09433913, 0.09909376, 0.23437567, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.88116634, 0.09177048, 1.85253012, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [20]:
import sys
sys.path.append('/Users/swiatlow/Code/ML4P/LCStudies')
from  util import graph_util as gu

In [21]:
cell_id = gu.loadBranchFlat('cluster_cell_ID', tree, 1253)
cell_e  = gu.loadBranchFlat('cluster_cell_E', tree, 1253)

In [22]:
geoDict = gu.loadGraphDictionary(file['CellGeo'])

In [23]:
cell_phi = gu.convertIDToGeo(cell_id, 'cell_geo_eta', geoDict)
cell_eta = gu.convertIDToGeo(cell_id, 'cell_geo_phi', geoDict)

In [24]:
cell_phi.shape

(20000, 1253)

In [25]:
cell_phi[0]

array([0.51627523, 0.51620591, 0.51314598, ..., 0.        , 0.        ,
       0.        ])

In [26]:
cell_id[0]

array([765544462, 765544460, 765543950, ...,         0,         0,
               0])

In [27]:
len(cell_id[0])

1253

In [None]:
# what's missing?
# normalization of the eta/phi
# column stack

In [29]:
X = np.stack((cell_e, cell_eta, cell_phi), axis = 2)

In [34]:
X[0][0]

array([0.24690098, 0.73187315, 0.51627523])

In [None]:
# X is now the right format for PFN!

In [33]:
X[-1].shape

(1253, 3)

In [35]:
X.shape[-1]

3