In [3]:
import numpy as np
import math

In [4]:
# Cumulative row offsets: 76 evenly spaced boundaries, 20_374 rows apart,
# running from 0 up to 1_528_050 (the last entry is the total row count).
file_offsets = [boundary * 20_374 for boundary in range(76)]

# Target number of rows per batch.
batch_size = 5_000

In [5]:

def get_best_batch_size(file_offsets, target_bs=5000):
    """
    Pick the batch size near ``target_bs`` that leaves the smallest remainder
    when the total number of rows is divided into equal batches.

    Every integer batch size in ``[target_bs - d, target_bs + d]`` with
    ``d = int(0.5 * target_bs)`` is tried. Among the sizes sharing the
    smallest ``total_rows % size``, the one closest to ``target_bs`` is
    chosen; if two are equally close, the smaller one wins.

    Args:
        file_offsets (sequence of int): Cumulative row offsets. Only the last
            entry, taken as the total number of rows, is used.
        target_bs (int): Desired batch size; must be at least 1.
    Returns:
        tuple[int, int]: ``(best_batch_size, residual)`` where ``residual``
        is ``total_rows % best_batch_size``.
    Raises:
        ValueError: If ``target_bs`` is smaller than 1.
    """
    if target_bs < 1:
        # A target of 0 would otherwise evaluate ``total % 0`` and silently
        # report a batch size of 0.
        raise ValueError(f"target_bs must be >= 1, got {target_bs}")

    last_offset = file_offsets[-1]
    d_bs = int(0.5 * target_bs)
    batch_sizes = np.arange(target_bs - d_bs, target_bs + d_bs + 1)

    residuals = last_offset % batch_sizes
    min_res   = residuals.min()

    # All bs giving the minimal residual
    candidates = batch_sizes[residuals == min_res]

    # Prefer the one closest to the target (argmin keeps the first, i.e. the
    # smaller, candidate on a tie because candidates are ascending)
    idx = np.argmin(np.abs(candidates - target_bs))

    # Return plain Python ints for both values, not numpy scalars.
    return int(candidates[idx]), int(min_res)

def _build_batch_plan(file_offsets, batch_size, tol = 0.75):
    """
    Pre-compute (row_start, row_end) for every batch.
    If the last batch < 0.5xbatch_size, merge the last two
    and split them evenly, so both new batches are within
    0.5x...1.0xbatch_size.
    """
    total = file_offsets[-1]
    b      = batch_size
    plan   = []
    start  = 0
    while start < total:
        end = min(start + b, total)
        plan.append((start, end))
        start = end

    # Re-balance if the tail is too short
    if len(plan) >= 2:
        last_len = plan[-1][1] - plan[-1][0]
        if last_len < tol * b:
            sec_start = plan[-2][0]
            comb_len  = plan[-1][1] - sec_start
            half      = math.ceil(comb_len / 2)
            plan[-2]  = (sec_start, sec_start + half)
            plan[-1]  = (sec_start + half, sec_start + comb_len)
    return plan


In [6]:
# Sanity check: number of boundaries stored in file_offsets.
len(file_offsets)

76

In [None]:
def build_batch_metadata(batch_size, file_offsets):
    """
    Describe, for every batch, which rows of which files it is made of.

    Rows are numbered globally from 0 up to ``file_offsets[-1]`` and cut into
    consecutive batches of ``batch_size`` rows (the final batch holds the
    remainder). Each batch is then mapped back onto the files it overlaps.
    Files that contribute no rows (repeated offsets) are skipped instead of
    producing an empty segment.

    Args:
        batch_size (int): Target number of rows per batch.
        file_offsets (sequence of int): Non-decreasing cumulative row
            offsets. Entry ``i`` is used as the global index of the first row
            of file ``i`` and the last entry as the total row count.
            Assumes the first entry is 0.
    Returns:
        list[dict]: One dict per batch with the keys
            ``batch_idx``          position of the batch,
            ``target_batch_size``  the requested ``batch_size``,
            ``actual_batch_size``  rows actually assigned to the batch,
            ``segments``           list of ``{"file_idx", "row_start",
                                   "row_end"}`` dicts, where the row numbers
                                   are relative to the file and ``row_end``
                                   is inclusive.
    """
    total_rows = file_offsets[-1]
    num_batches = math.ceil(total_rows / batch_size)

    # Convert once so searchsorted does not re-wrap a Python list per batch.
    offsets = np.asarray(file_offsets)
    num_offsets = len(offsets)
    batch_metadata = []

    for batch_index in range(num_batches):
        start_evt = batch_index * batch_size
        end_evt = min(start_evt + batch_size, total_rows)

        # Last file whose first row is <= start_evt.
        file_idx = int(np.searchsorted(offsets, start_evt, side="right")) - 1

        segments = []
        actual_rows = 0
        evt_cursor = start_evt

        while evt_cursor < end_evt:
            file_start = offsets[file_idx]

            # The segment stops at the end of the current file or at the end
            # of the batch, whichever comes first.
            if file_idx + 1 < num_offsets:
                seg_end = min(end_evt, offsets[file_idx + 1])
            else:
                seg_end = end_evt

            seg_rows = int(seg_end - evt_cursor)
            if seg_rows > 0:
                segments.append({
                    "file_idx"  : file_idx,
                    "row_start" : int(evt_cursor - file_start),
                    "row_end"   : int(seg_end - file_start - 1)
                })
                actual_rows += seg_rows

            evt_cursor = seg_end
            file_idx += 1

        batch_metadata.append({
            "batch_idx"        : batch_index,
            "target_batch_size": int(batch_size),
            "actual_batch_size": actual_rows,
            "segments"         : segments
        })

    return batch_metadata

In [14]:
# Map every batch of `batch_size` rows onto the files/row ranges it covers.
batch_metadata = build_batch_metadata(batch_size, file_offsets)

In [15]:
# Display the per-batch metadata (the whole list is rendered).
batch_metadata

[{'batch_idx': 0,
  'target_batch_size': 5000,
  'actual_batch_size': 5000,
  'segments': [{'file_idx': 0, 'row_start': 0, 'row_end': 4999}]},
 {'batch_idx': 1,
  'target_batch_size': 5000,
  'actual_batch_size': 5000,
  'segments': [{'file_idx': 0, 'row_start': 5000, 'row_end': 9999}]},
 {'batch_idx': 2,
  'target_batch_size': 5000,
  'actual_batch_size': 5000,
  'segments': [{'file_idx': 0, 'row_start': 10000, 'row_end': 14999}]},
 {'batch_idx': 3,
  'target_batch_size': 5000,
  'actual_batch_size': 5000,
  'segments': [{'file_idx': 0, 'row_start': 15000, 'row_end': 19999}]},
 {'batch_idx': 4,
  'target_batch_size': 5000,
  'actual_batch_size': 5000,
  'segments': [{'file_idx': 0, 'row_start': 20000, 'row_end': 20373},
   {'file_idx': 1, 'row_start': 0, 'row_end': 4625}]},
 {'batch_idx': 5,
  'target_batch_size': 5000,
  'actual_batch_size': 5000,
  'segments': [{'file_idx': 1, 'row_start': 4626, 'row_end': 9625}]},
 {'batch_idx': 6,
  'target_batch_size': 5000,
  'actual_batch_size'

In [16]:
# One line per batch: the number of rows that batch actually received.
for rows_in_batch in (entry['actual_batch_size'] for entry in batch_metadata):
    print(rows_in_batch)

5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000


In [8]:

# Search around the 5_000 target for the batch size leaving the smallest remainder.
best_bs, min_residual = get_best_batch_size(file_offsets, target_bs=5_000) 
print(f"Best batch size: {best_bs}, Min residual: {min_residual}")

# NOTE(review): the plan is built from the global `batch_size`, not from the
# `best_bs` computed just above — confirm this is intended.
# tol=0.9: a final batch shorter than 0.9 * batch_size is re-balanced with its predecessor.
plan = _build_batch_plan(file_offsets, batch_size=batch_size, tol = 0.9)
# Show only the last five batches, where the re-balancing takes effect.
print(f"Batch plan: {plan[-5:]}")


Best batch size: 5010, Min residual: 0
Batch plan: [(1505000, 1510000), (1510000, 1515000), (1515000, 1520000), (1520000, 1524025), (1524025, 1528050)]
