In [1]:
import uproot as urt
import numpy as np

import uproot.behaviors.TBranch as utb

## Obtain binary data of events

In ROOT files, `TTree` and `TBranch` only store the headers of the event data. It is the `TBasket` that stores the binary data of the events.

In `uproot`, `TBranch.array()` calls the `basket_array()` method of its `Interpretation`, passing it the `TBasket`'s binary data.

Therefore, we can make a pseudo `Interpretation` and intercept basket's binary data coming from `TBranch`.

In [2]:
class MyTestInterpretation(urt.interpretation.Interpretation):
    """Pseudo ``Interpretation`` that records what it is given instead of interpreting it.

    ``basket_array`` stores each of its arguments on the instance under the
    argument's own name and then aborts by raising ``Exception('stop')``.
    """

    def basket_array(
        self, data, byte_offsets, basket, branch, context, cursor_offset, library, options
    ):
        """Store every argument as an instance attribute, then raise to abort."""
        captured = {
            "data": data,
            "byte_offsets": byte_offsets,
            "basket": basket,
            "branch": branch,
            "context": context,
            "cursor_offset": cursor_offset,
            "library": library,
            "options": options,
        }
        for attribute, value in captured.items():
            setattr(self, attribute, value)
        # Only the raw inputs are wanted, so the read is cut short on purpose.
        raise Exception('stop')

    def final_array(self, basket_arrays, entry_start, entry_stop, entry_offsets, library, branch, options):
        """Produce nothing; always returns ``None``."""
        return None

To skip `TBranch`'s value check, we also need some tricks:

In [3]:
# Open the sample file and look up its 'Event' entry.
f = urt.open('../tests/data/test_full_mc_evt_1.rtraw')
evt = f['Event']

# The selected TBranch must be bottom-level
br = evt['TMcEvent/m_mcParticleCol']

# Keep only the first item returned for entries 0-10
# (presumably the first basket of the branch — TODO confirm).
rob = br.entries_to_ranges_or_baskets(0, 10)[0]
# Private uproot helper; the two None arguments presumably request uproot's
# default executors — verify against the installed uproot version.
decompression_executor, interpretation_executor = utb._regularize_executors(
    None, None, br._file
)

With these necessary variables, we can now use the pseudo `Interpretation` to get binary data of events.

In [4]:
# A fresh interceptor per run, so a capture left over from an earlier run
# cannot hide a failure in this one.
test_ip = MyTestInterpretation()

try:
    utb._ranges_or_baskets_to_arrays(
        br,
        [(br, *rob)],
        {br.cache_key: test_ip},
        0,
        10000,
        decompression_executor,
        interpretation_executor,
        urt.interpretation.library.NumPy,
        {},
        False,
        {},
    )
except Exception:
    # The interceptor aborts the read on purpose right after storing its inputs.
    # If nothing was stored, the exception is a genuine failure (bad path,
    # changed uproot internals, ...) and must not be swallowed.
    if "data" not in vars(test_ip):
        raise

Here we have the binary data of events `0-9`:

In [5]:
# The `data` argument that the interceptor's `basket_array` received and stored.
test_ip.data

array([64,  0,  7, ...,  0,  0,  0], dtype=uint8)

## Parse binary data

The rules I have worked out so far are summarized below.

### Parse a standalone element

#### C-type (`int`, `float`, `double`, etc.): read `sizeof(T)` bytes, but in big-endian (`bool` is 1 byte)

In [6]:
# We define some shortcuts for the data types, and return remaining data
def _read_big_endian(data: np.ndarray, dtype: str):
    """Decode one scalar of ``dtype`` from the front of ``data``.

    Returns ``(value, remaining)`` where ``remaining`` is ``data`` with the
    consumed bytes sliced off. The number of bytes consumed is taken from the
    dtype itself, so the width can never disagree with the type.
    """
    width = np.dtype(dtype).itemsize
    return np.frombuffer(data[:width], dtype=dtype)[0], data[width:]


def read_uint8(data: np.ndarray):
    """Read an unsigned 8-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">u1")


def read_uint16(data: np.ndarray):
    """Read a big-endian unsigned 16-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">u2")


def read_uint32(data: np.ndarray):
    """Read a big-endian unsigned 32-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">u4")


def read_uint64(data: np.ndarray):
    """Read a big-endian unsigned 64-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">u8")


def read_int8(data: np.ndarray):
    """Read a signed 8-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">i1")


def read_int16(data: np.ndarray):
    """Read a big-endian signed 16-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">i2")


def read_int32(data: np.ndarray):
    """Read a big-endian signed 32-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">i4")


def read_int64(data: np.ndarray):
    """Read a big-endian signed 64-bit integer; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">i8")


def read_float(data: np.ndarray):
    """Read a big-endian 32-bit float; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">f4")


def read_double(data: np.ndarray):
    """Read a big-endian 64-bit float; return ``(value, remaining_data)``."""
    return _read_big_endian(data, ">f8")


def get_ctype_reader_by_fType(fType: int) -> callable:
    """Return the scalar reader for a streamer element's numeric ``fType``.

    The keys follow ROOT's ``TVirtualStreamerInfo::EReadWrite`` codes for
    basic types. Raises ``KeyError`` for any code that is not listed.
    """
    return {
        1: read_int8,  # kChar
        2: read_int16,  # kShort
        3: read_int32,  # kInt
        4: read_int64,  # kLong
        5: read_float,  # kFloat
        8: read_double,  # kDouble
        11: read_uint8,  # kUChar
        12: read_uint16,  # kUShort
        13: read_uint32,  # kUInt
        14: read_uint64,  # kULong
        16: read_int64,  # kLong64
        17: read_uint64,  # kULong64
        18: read_uint8,  # kBool, read as one byte like the "bool" type name
    }[fType]


def get_ctype_reader_by_fTypeName(fTypeName: str) -> callable:
    """Return the scalar reader for a C type name such as ``"unsigned int"``.

    Raises ``KeyError`` when the name is not one of the listed C types.
    """
    readers_by_name = {
        # one-byte types
        "bool": read_uint8,
        "char": read_int8,
        "unsigned char": read_uint8,
        # signed integers
        "short": read_int16,
        "int": read_int32,
        "long": read_int64,
        # unsigned integers
        "unsigned short": read_uint16,
        "unsigned int": read_uint32,
        "unsigned long": read_uint64,
        # floating point
        "float": read_float,
        "double": read_double,
    }
    return readers_by_name[fTypeName]

#### `fNBytes`: `uint32`, number of remaining bytes, OR'd with `0x40000000`.

"OR'd with `0x40000000`" means that, to recover the number of remaining bytes, we need to:
1. read a `uint32` value
2. clear the flag bit: `NBytes = value & ~0x40000000`

We can also check whether our parsing is correct by verifying that the flag bit is set: `(value & 0x40000000) != 0`

In [7]:
def read_fNBytes(data: np.ndarray):
    """Read a flagged ``uint32`` byte count; return ``(count, remaining_data)``.

    The value must have the ``0x40000000`` bit set; the returned count has that
    bit cleared. Raises ``Exception('Not an fNBytes')`` when the bit is missing.
    """
    flag = np.uint32(0x40000000)
    raw, rest = read_uint32(data)
    if raw & flag:
        return raw & ~flag, rest
    raise Exception('Not an fNBytes')

#### `TString`
1. read 1 byte as length `N`
2. if `N==255`, read 4 more bytes as length `N`
3. then read `N` bytes as string, no `\0` at the end

In [8]:
def read_TString(data: np.ndarray):
    """Read a length-prefixed string; return ``(text, remaining_data)``.

    The length is one byte, unless that byte is 255, in which case the real
    length follows as a ``uint32``. The payload is decoded as UTF-8.
    """
    length, rest = read_uint8(data)
    if length == 255:
        # 255 is an escape marker: the actual length is the next uint32.
        length, rest = read_uint32(rest)
    text = rest[:length].tobytes().decode('utf-8')
    return text, rest[length:]

#### `TObject`: read `fVersion(2), fUniqueID(4), fBits(4)` (total 10 bytes)

In [9]:
def read_TObject(data: np.ndarray):
    """Read a ``TObject`` base; return ``((fVersion, fUniqueID, fBits), remaining_data)``.

    Consumes ``fVersion`` (2 bytes), ``fUniqueID`` (4 bytes) and ``fBits``
    (4 bytes). When ``fBits`` has the ``kIsReferenced`` bit set, ROOT writes an
    additional 2-byte process-ID index after ``fBits``; it is consumed and
    discarded so that the following fields stay aligned.
    """
    fVersion, data = read_uint16(data)
    fUniqueID, data = read_uint32(data)
    fBits, data = read_uint32(data)
    # kIsReferenced is bit 4 of fBits in ROOT's TObject status bits.
    if fBits & np.uint32(1 << 4):
        _pidf, data = read_uint16(data)
    return (fVersion, fUniqueID, fBits), data

#### `vector<T>`: Memory layout: `[header, val1, val2, ..., valN]`
1. read `fNBytes(4), fVersion(2)`
2. read `fSize(4)` as length `N`
3. read `N` elements with `T`'s parsing rule

In [10]:
def read_vector(data: np.ndarray, read_element: callable):
    """Read a ``vector<T>``; return ``(elements, remaining_data)``.

    Layout consumed: flagged byte count, 2-byte version (both discarded),
    a ``uint32`` element count, then that many elements, each decoded by
    ``read_element(buffer) -> (value, remaining_buffer)``.
    """
    _byte_count, rest = read_fNBytes(data)
    _version, rest = read_uint16(rest)
    count, rest = read_uint32(rest)

    elements = []
    for _ in range(count):
        item, rest = read_element(rest)
        elements.append(item)
    return elements, rest

#### `map<TKey,TVal>`, `multimap<TKey,TVal>`: `[header, keys, values]`
1. read `fNBytes(4), fVersion(2)`
2. skip 6 bytes since I don't know what they are
3. read `fSize(4)` as length `N`
4. read `N` key elements with `TKey`'s parsing rule
5. read `N` value elements with `TVal`'s parsing rule

In [11]:
def read_map(data: np.ndarray, read_key: callable, read_value: callable):
    """Read a ``map<TKey,TVal>``; return ``(dict, remaining_data)``.

    Layout consumed: flagged byte count, 2-byte version, 6 bytes that are
    skipped, a ``uint32`` size ``N``, then ``N`` keys followed by ``N`` values.
    Keys and values are paired up in order.
    """
    _byte_count, rest = read_fNBytes(data)
    _version, rest = read_uint16(rest)
    rest = rest[6:]  # skip 6 bytes
    count, rest = read_uint32(rest)

    def take_all(reader, buffer):
        # Decode `count` consecutive items with `reader`.
        items = []
        for _ in range(count):
            item, buffer = reader(buffer)
            items.append(item)
        return items, buffer

    keys, rest = take_all(read_key, rest)
    vals, rest = take_all(read_value, rest)
    return dict(zip(keys, vals)), rest

#### TObjArray

1. read `TObject`
2. read `fName(1)`, `fSize(4)`, `fLowerBound(4)`
3. read sub-object content sequentially

In [12]:
def read_TObjArray_header(data: np.ndarray):
    """Read a ``TObjArray`` header; return ``((fName, fSize, fLowerBound), remaining_data)``.

    Consumes the ``TObject`` base (discarded), one byte for ``fName``, a
    ``uint32`` ``fSize`` and an ``int32`` ``fLowerBound``.
    """
    _tobject, rest = read_TObject(data)
    # NOTE(review): fName is consumed as a single byte; presumably this relies
    # on the name being empty (a lone length byte) — confirm for named arrays.
    name_byte, rest = read_uint8(rest)
    size, rest = read_uint32(rest)
    lower_bound, rest = read_int32(rest)
    header = (name_byte, size, lower_bound)
    return header, rest

#### Object

1. if in containers, read object header:
    1. read `fNBytes(4)`, `fTag(4)`
    2. if `fTag==-1`, read `fClassName(null-terminated)`

2. read object content sequentially by its streamer information

In [13]:
def read_ObjHeader(data: np.ndarray):
    """Read an object header; return ``((fNBytes, fTag, fClassName), remaining_data)``.

    Consumes a flagged byte count and an ``int32`` tag. A tag of ``-1`` is
    followed by a null-terminated class name, which is decoded as UTF-8 and
    consumed together with its terminator; any other tag yields an empty name.
    """
    byte_count, rest = read_fNBytes(data)
    tag, rest = read_int32(rest)

    if tag != -1:
        return (byte_count, tag, ""), rest

    # Scan forward to the terminating zero byte of the class name.
    terminator = 0
    while rest[terminator] != 0:
        terminator += 1
    class_name = rest[:terminator].tobytes().decode("utf-8")
    return (byte_count, tag, class_name), rest[terminator + 1 :]

### C-Type Array of any class

"C-Type Array" means a variable defined as `T[N]`, where `T` is any class and `N` is a constant. When stored in a C-Type Array, some classes behave differently:

#### Base types `<T>` (`int`, `float`, etc.)
repeatedly read `sizeof(T)` bytes for `N` times

In [14]:
def read_ctype_array(data: np.ndarray, read_element: callable, n_elements: int):
    """Read ``n_elements`` consecutive items; return ``(values, remaining_data)``.

    ``read_element(buffer)`` must return ``(value, remaining_buffer)``; it is
    applied ``n_elements`` times, each time to what the previous call left over.
    """
    values = []
    remaining = data
    for _ in range(n_elements):
        value, remaining = read_element(remaining)
        values.append(value)
    return values, remaining

#### `TString`
When `TString` is in a C-Type Array, it will have a header with `fNBytes(4), fVersion(2)`.

In [15]:
def read_TString_array(data: np.ndarray, n_elements: int):
    """Read a C-type array of ``TString``; return ``(strings, remaining_data)``.

    A flagged byte count and a 2-byte version come first (both discarded),
    followed by ``n_elements`` strings read back to back.
    """
    _byte_count, rest = read_fNBytes(data)
    _version, rest = read_uint16(rest)

    strings = []
    for _ in range(n_elements):
        text, rest = read_TString(rest)
        strings.append(text)
    return strings, rest

#### `vector`, `map`
The header appears once, and is followed by `N` bodies that are read sequentially

In [16]:
def read_vector_array(data: np.ndarray, read_element: callable, n_elements: int):
    """Read a C-type array of vectors; return ``(list_of_lists, remaining_data)``.

    One flagged byte count and 2-byte version precede the whole array (both
    discarded). Each of the ``n_elements`` vectors is then a ``uint32`` length
    followed by that many items decoded with ``read_element``.
    """
    _byte_count, rest = read_fNBytes(data)
    _version, rest = read_uint16(rest)

    vectors = []
    for _ in range(n_elements):
        length, rest = read_uint32(rest)
        items = []
        for _ in range(length):
            item, rest = read_element(rest)
            items.append(item)
        vectors.append(items)
    return vectors, rest


def read_map_array(
    data: np.ndarray, read_key: callable, read_value: callable, n_elements: int
):
    """Read a C-type array of maps; return ``(list_of_dicts, remaining_data)``.

    One flagged byte count and 2-byte version precede the whole array (both
    discarded). Each of the ``n_elements`` maps is a ``uint32`` size ``N``
    followed by ``N`` keys and then ``N`` values, paired up in order.
    """
    _byte_count, rest = read_fNBytes(data)
    _version, rest = read_uint16(rest)

    maps = []
    for _ in range(n_elements):
        count, rest = read_uint32(rest)

        keys = []
        for _ in range(count):
            key, rest = read_key(rest)
            keys.append(key)

        vals = []
        for _ in range(count):
            val, rest = read_value(rest)
            vals.append(val)

        maps.append(dict(zip(keys, vals)))

    return maps, rest

### Classes derived from `TObject`, including "Custom Class"

1. read its streamers information, get its `fElements`, which should be a list of dictionaries
2. read sub-objects according to the list of dictionaries
    1. determine the type of the sub-object by `fType`, `fCtype`, `fTypeName`, `fName` (for example, a `vector<MyClass>` will have `fType=500`, `fCtype=?`, `fTypeName="vector<MyClass>"`, `fName=?`)
    2. if the type is also a class derived from `TObject`, recursively read the sub-object

In [17]:
# Show what the file's streamers mapping holds for TMcParticle.
print("{version: streamer-info}:", br.file.streamers["TMcParticle"])
print()
# Take the entry stored under key 1 and fetch its "fElements" member.
streamer_info = br.file.streamers["TMcParticle"][1].member("fElements")
# Collect the `all_members` of every element in fElements.
sub_streamer_list = [i.all_members for i in streamer_info]
print("sub_streamer_list:")
for i in sub_streamer_list:
    print(i)

{version: streamer-info}: {1: <TStreamerInfo for TMcParticle version 1 at 0x7f508c0d6610>}

sub_streamer_list:
{'@fUniqueID': 0, '@fBits': 16777216, 'fName': 'TObject', 'fTitle': 'Basic ROOT object', 'fType': 66, 'fSize': 0, 'fArrayLength': 0, 'fArrayDim': 0, 'fMaxIndex': array([          0, -1877229523,           0,           0,           0],
      dtype='>i4'), 'fTypeName': 'BASE', 'fBaseVersion': 1}
{'@fUniqueID': 0, '@fBits': 16777216, 'fName': 'm_particleID', 'fTitle': '', 'fType': 3, 'fSize': 4, 'fArrayLength': 0, 'fArrayDim': 0, 'fMaxIndex': array([0, 0, 0, 0, 0], dtype='>i4'), 'fTypeName': 'int'}
{'@fUniqueID': 0, '@fBits': 16777216, 'fName': 'm_trackIndex', 'fTitle': '', 'fType': 3, 'fSize': 4, 'fArrayLength': 0, 'fArrayDim': 0, 'fMaxIndex': array([0, 0, 0, 0, 0], dtype='>i4'), 'fTypeName': 'int'}
{'@fUniqueID': 0, '@fBits': 16777216, 'fName': 'm_vertexIndex0', 'fTitle': '', 'fType': 3, 'fSize': 4, 'fArrayLength': 0, 'fArrayDim': 0, 'fMaxIndex': array([0, 0, 0, 0, 0], dtype='>

#### Full example

In [18]:
def read_TMcParticle(data: np.ndarray):
    """Parse one ``TMcParticle`` from ``data``, printing every field as it is read.

    Consumes, in order: the object header, the class's own flagged byte count
    and 2-byte version, the ``TObject`` base, and then each data member listed
    in the table below. Each printed line is ``[index] name: value``.

    Returns the bytes that remain after the object.
    """
    obj_header, data = read_ObjHeader(data)
    print("[0] obj_header:", obj_header)

    # Start of the TMcParticle: byte count and version are consumed but unused.
    _fNBytes, data = read_fNBytes(data)
    _fVersion, data = read_uint16(data)

    # TObject
    t_obj, data = read_TObject(data)
    print("[1] TObject:", t_obj)

    # (fName, reader) for each data member, in the order they are read.
    members = (
        ("m_particleID", read_int32),
        ("m_trackIndex", read_int32),
        ("m_vertexIndex0", read_int32),
        ("m_vertexIndex1", read_int32),
        ("m_statusFlags", read_uint32),
        ("m_xInitialPosition", read_double),
        ("m_yInitialPosition", read_double),
        ("m_zInitialPosition", read_double),
        ("m_tInitialPosition", read_double),
        ("m_xFinalPosition", read_double),
        ("m_yFinalPosition", read_double),
        ("m_zFinalPosition", read_double),
        ("m_tFinalPosition", read_double),
        ("m_xInitialMomentum", read_double),
        ("m_yInitialMomentum", read_double),
        ("m_zInitialMomentum", read_double),
        ("m_eInitialMomentum", read_double),
        ("m_mother", read_int32),
        # 'fTypeName': 'vector<int>', 'fSTLtype': 1, 'fCtype': 3
        ("m_daughters", lambda buffer: read_vector(buffer, read_int32)),
    )

    # Indices 0 and 1 were used by the header and the TObject base above.
    for index, (member_name, read_member) in enumerate(members, start=2):
        value, data = read_member(data)
        print(f"[{index}] {member_name}:", value)

    return data

In [22]:
# Start of the TObjArray
# Consume the leading byte count and version before the array header.
fNBytes, data = read_fNBytes(test_ip.data)
fVersion, data = read_uint16(data)

obj_arr_header, data = read_TObjArray_header(data)
# The header tuple is (fName, fSize, fLowerBound); fSize is used as the element count.
n_elements = obj_arr_header[1]
print("n_elements:", n_elements)

# Each call parses one TMcParticle and hands back the bytes that remain.
for i in range(n_elements):
    print()
    print("----> next element")
    data = read_TMcParticle(data)

n_elements: 12

----> next element
[0] obj_header: (np.uint32(170), np.int32(-1), 'TMcParticle')
[1] TObject: (np.uint16(1), np.uint32(0), np.uint32(0))
[2] m_particleID: 23
[3] m_trackIndex: 0
[4] m_vertexIndex0: 0
[5] m_vertexIndex1: 1
[6] m_statusFlags: 5
[7] m_xInitialPosition: 0.20316340945100783
[8] m_yInitialPosition: -0.1867208330247402
[9] m_zInitialPosition: 0.49041959728497264
[10] m_tInitialPosition: 655.9251579109405
[11] m_xFinalPosition: 0.20316340945100783
[12] m_yFinalPosition: -0.1867208330247402
[13] m_zFinalPosition: 0.49041959728497264
[14] m_tFinalPosition: 655.9251579109405
[15] m_xInitialMomentum: 0.034067402274467924
[16] m_yInitialMomentum: 0.0
[17] m_zInitialMomentum: -0.001225077037119604
[18] m_eInitialMomentum: 3.0970991968142387
[19] m_mother: -99
[20] m_daughters: [np.int32(1), np.int32(2)]

----> next element
[0] obj_header: (np.uint32(150), np.int32(-2147483534), '')
[1] TObject: (np.uint16(1), np.uint32(0), np.uint32(0))
[2] m_particleID: 4
[3] m_trac