In [1]:
import os
import h5py
import numpy as np
from typing import Dict, Tuple


def get_path_in_parent(*args):
    return os.path.abspath(os.path.join(os.getcwd(), '..', *args))

def get_path_in_storage(*args):
    return get_path_in_parent("storage", *args)

def get_logistics_path(*args):
    return get_path_in_storage("logistics", *args)

def save_outputs(model_num: int, outputs: Dict[str, Tuple[np.ndarray, ...]], nms: bool = False, sampled=False) -> None:
    """Save a dict of tuples of arrays to an HDF5 file."""
    path = get_path_in_storage(f"outputs_{model_num}{"_nms" if nms else ""}{"_sampled" if sampled else ""}.h5")
    with h5py.File(path, "w") as f:
        for key, tup in outputs.items():
            grp = f.create_group(str(key))
            for i, arr in enumerate(tup):
                grp.create_dataset(
                    f"array_{i}", data=arr, compression="gzip", compression_opts=1
                )

def load_outputs(model_num: int, nms: bool = False, sampled = False) -> Dict[str, Tuple[np.ndarray, ...]]:
    """Load a dict of tuples of arrays from an HDF5 file."""
    path = get_path_in_storage(f"outputs_{model_num}{"_nms" if nms else ""}.h5")
    outputs_loaded = {}
    with h5py.File(path, "r") as f:
        for key in f.keys():
            grp = f[key]
            arrays = tuple(np.array(grp[subkey]) for subkey in sorted(grp.keys()))
            outputs_loaded[key] = arrays
    return outputs_loaded

In [2]:
import glob

# Collect results in a 1D list
data = []

# Iterate over all .txt files
for filepath in glob.glob(get_logistics_path("*.txt")):
    img_file_name = f"{os.path.splitext(os.path.basename(filepath))[0]}.jpg"

    with open(filepath, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 5:
                # Extract values from line
                obj_id, x, y, w, h = parts[:5]
                # Convert to int or float depending on file format
                obj_id = int(obj_id)
                x, y, w, h = map(float, (x, y, w, h))

                # Append tuple to results
                data.append((obj_id, x, y, w, h, img_file_name))


In [3]:
len(data)

36721

In [None]:
from sklearn.model_selection import train_test_split

# `ids` are the labels we stratify on (the 0th element in each tuple)
ids = [item[-1] for item in data]

# Stratified split
_, sample = train_test_split(
    data,
    test_size=0.2,   # fraction
    stratify=ids,
    random_state=42
)

print(f"Original size: {len(data)}, Sampled size: {len(sample)}")
sample[:10]

Original size: 36721, Sampled size: 7345


[(19,
  0.33046875,
  0.465625,
  0.0265625,
  0.03125,
  '1574676188-741767_jpg.rf.246d2023cef448e477bc88377daf6895.jpg'),
 (13,
  0.47421875,
  0.4953125,
  0.1171875,
  0.20625,
  'ppe_0437_jpg.rf.44e92b769679dc08249b78f98c40b54f.jpg'),
 (2,
  0.85390625,
  0.8859375,
  0.0328125,
  0.03125,
  'Y4Y4IQOWJ9NM_jpg.rf.cdc815ffb2b12fd1715812cde12340bc.jpg'),
 (19,
  0.5671875,
  0.6421875,
  0.103125,
  0.078125,
  '1579163982-3432484_jpg.rf.b75bc8dede42c7ca24554fc21de93e58.jpg'),
 (10,
  0.54921875,
  0.27890625,
  0.0671875,
  0.1703125,
  'ppe_0557_jpg.rf.d3c25c119979adb01c65aba0224a1eb2.jpg'),
 (19,
  0.2484375,
  0.48671875,
  0.096875,
  0.0609375,
  '1564564749643-49_jpg.rf.992a32dab7be12756d39b2ccaab3bc17.jpg'),
 (17,
  0.46015625,
  0.509375,
  0.8203125,
  0.60625,
  'pt-label_6254_jpg.rf.4792b721bc1a462afe574c7f4d2ed9f3.jpg'),
 (2,
  0.67109375,
  0.8765625,
  0.1921875,
  0.09375,
  'img480_jpg.rf.0e698e0adbd3bdcf56464c5ea5fe7864.jpg'),
 (10,
  0.22265625,
  0.6390625,
  0.11

In [7]:
import pickle

# Save list to pickle file
with open(get_path_in_storage("sampled_data.pkl"), "wb") as f:
    pickle.dump(data, f)