# Large hddm file splitter
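Splitting a large hddm file into a series of smaller ones often makes downstream parallel processing more efficient. This notebook demonstrates how to do the splitting itself in parallel using dask: each dask task copies a contiguous range of events from the input file into its own set of output files, which are left on the /local filesystem.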

In [1]:
from gluex import hddm_s
import dask.distributed
import dask
# Start a dask.distributed client with 30 single-threaded workers; the
# diagnostics dashboard is served on localhost port 8789.
client = dask.distributed.Client(
    n_workers=30,
    threads_per_worker=1,
    dashboard_address='localhost:8789',
)

In [2]:
def slicer(infile, slicesize, sliceindex, count=1, outdir="/local"):
    """
    Copies up to count consecutive slices of at most slicesize events each
    from input hddm file infile, starting at event sliceindex*slicesize.
    Slice number sliceindex + i is written to a new output file in outdir
    with a name generated from the name of infile with a subscript
    sliceindex + i. An output file is only created once there is at least
    one event to put in it, so running off the end of the input does not
    leave empty files behind.

    Arguments:
      infile     - path or url of the input file, must contain ".hddm"
      slicesize  - maximum number of events per output file, must be > 0
      sliceindex - index of the first slice to copy
      count      - number of consecutive slices to copy (default 1)
      outdir     - directory that receives the output files (default /local)

    Return value is the total number of events copied, or -1 for error
    (infile has no ".hddm" in its name, or slicesize is not positive).
    """
    basepath = infile.split(".hddm")[0]
    # Validate before opening anything: a non-positive slicesize can never
    # fill a slice, so it is reported through the documented -1 return.
    if basepath == infile or slicesize <= 0:
        return -1
    stem = basepath.split('/')[-1]
    hin = hddm_s.istream(infile)
    hin.skip(sliceindex * slicesize)
    ncopied = 0
    for islice in range(count):
        # Rebinding hout drops the previous output stream before the next
        # one is opened.
        hout = None
        nslice = 0
        for rec in hin:
            if hout is None:
                # Open the output lazily so an exhausted input creates no file.
                hout = hddm_s.ostream(f"{outdir}/{stem}_{sliceindex + islice}.hddm")
            hout.write(rec)
            nslice += 1
            if nslice == slicesize:
                break
        ncopied += nslice
        if nslice < slicesize:
            # Short (or empty) slice means the input is exhausted; any
            # remaining slices would be empty, so stop here.
            break
    return ncopied

In [3]:
infile = "root://cn445.storrs.hpc.uconn.edu/Gluex/resilient/simulation/KLFbeam-8-2024/forced_500k.hddm"
slicesize = 1000        # events per output file
slicecount = 500        # total number of output files to produce
slices_per_task = 25    # consecutive slices copied by a single dask task

# One delayed task per group of slices_per_task slices. Stepping the range
# (instead of dividing slicecount) keeps the trailing partial group when
# slicecount is not a multiple of slices_per_task.
results = [
    dask.delayed(slicer)(infile, slicesize, first,
                         min(slices_per_task, slicecount - first))
    for first in range(0, slicecount, slices_per_task)
]
# NOTE: slicer reports errors by returning -1, which this sum would absorb
# silently; a total lower than expected is the only symptom.
collection = dask.delayed(sum)(results)
final_count = collection.compute()
print("final event count is", final_count)

final event count is 500000
