In [None]:
import os

# Base directory of the openfwi data on the mounted volume.
root_dir = os.path.join('/mnt', 'datasets', 'openfwi')

# Names of the dataset variants, in their original listing order.
# Presumably each one is a sub-directory of root_dir — TODO confirm.
data_types = [
    'CurveVel_A',
    'CurveFault_A',
    'Style_A',
    'FlatVel_B',
    'FlatFault_B',
    'CurveVel_B',
    'Style_B',
    'CurveFault_B',
    'FlatVel_A',
    'FlatFault_A',
]

In [None]:
import numpy as np
import time
from tqdm import tqdm
# Each of these files holds a batch of 500 samples along the first dimension.
# Let's split the data into fixed-size chunks (one sample per chunk here) and
# measure how long each chunk takes to load.

# Directory of the dataset variant used for this experiment.
ex_dir = os.path.join('/mnt', 'datasets', 'openfwi', 'CurveVel_A')

print(f"Loading data from {ex_dir}")
# Shard files are numbered from 1; shards 1 through 9 are used here.
exs_x = [os.path.join(ex_dir, 'data', f'data{i}.npy') for i in range(1, 10)]
exs_y = [os.path.join(ex_dir, 'model', f'model{i}.npy') for i in range(1, 10)]

# Load every shard and stack them along the first axis.
x = np.concatenate([np.load(ex_x) for ex_x in exs_x])
y = np.concatenate([np.load(ex_y) for ex_y in exs_y])

# x and y are later sliced with the same indices, so they must hold the same
# number of samples; fail loudly here instead of writing misaligned chunks.
assert x.shape[0] == y.shape[0], (
    f"Sample count mismatch: data has {x.shape[0]} rows, model has {y.shape[0]} rows"
)

# Output directory that receives one sub-directory per chunk.
split_dir = "data/split_example"
os.makedirs(split_dir, exist_ok=True)

# Number of samples stored in each chunk (1 => one sample per chunk directory).
chunk_size = 1
# Ceiling division: keep a trailing partial chunk instead of silently dropping
# it when chunk_size does not evenly divide the number of samples.
num_chunks = -(-x.shape[0] // chunk_size)

print(f"Splitting data into {num_chunks} chunks of size {chunk_size}")

# Save each chunk as data.npy / model.npy inside its own chunk_<i> directory.
for i in tqdm(range(num_chunks)):
    start = i * chunk_size
    stop = start + chunk_size  # slicing clamps at the array end for the last chunk
    chunk_x = x[start:stop]
    chunk_y = y[start:stop]

    # Create a directory for this chunk if it doesn't exist
    chunk_dir = os.path.join(split_dir, f'chunk_{i}')
    os.makedirs(chunk_dir, exist_ok=True)

    # Save the chunk data
    np.save(os.path.join(chunk_dir, 'data.npy'), chunk_x)
    np.save(os.path.join(chunk_dir, 'model.npy'), chunk_y)

print(f"Data split into {num_chunks} chunks and saved in {split_dir}")

# Measure how long it takes to read each chunk back from disk.
# time.perf_counter() is a monotonic, high-resolution clock meant for timing
# short intervals; time.time() can jump if the system clock is adjusted.
# NOTE: the chunk files were written earlier in this cell, so these reads are
# probably served from the OS page cache — treat the result as a warm-cache
# figure rather than a cold-disk one.
chunk_load_times = []
for i in tqdm(range(num_chunks)):
    chunk_dir = os.path.join(split_dir, f'chunk_{i}')
    start_time = time.perf_counter()
    chunk_x = np.load(os.path.join(chunk_dir, 'data.npy'))
    chunk_y = np.load(os.path.join(chunk_dir, 'model.npy'))
    end_time = time.perf_counter()
    chunk_load_times.append(end_time - start_time)

print(f"Average time to load a single chunk: {np.mean(chunk_load_times):.6f} seconds")

In [None]:
# Compare the total on-disk size of all chunk files with the total size of the
# original shard files listed in exs_x / exs_y.
chunk_sizes = []
for i in range(num_chunks):
    chunk_dir = os.path.join(split_dir, f'chunk_{i}')
    # Use a dedicated name here: `chunk_size` already holds the samples-per-chunk
    # setting from the splitting cell and must not be overwritten with a byte count.
    chunk_bytes = (
        os.path.getsize(os.path.join(chunk_dir, 'data.npy'))
        + os.path.getsize(os.path.join(chunk_dir, 'model.npy'))
    )
    chunk_sizes.append(chunk_bytes)

# Combined size in bytes of every original data and model shard.
original_bytes = sum(os.path.getsize(path) for path in exs_x + exs_y)

# Sizes are reported in units of 1024 * 1024 bytes.
print(f"Total size of all chunks: {sum(chunk_sizes) / (1024 * 1024):.2f} MB")
print(f"Size of original files: {original_bytes / (1024 * 1024):.2f} MB")

In [None]:
# Shapes of the most recently loaded chunk pair after dropping length-1 axes.
tuple(arr.squeeze().shape for arr in (chunk_x, chunk_y))