In [3]:
# Shell escape: list the entries directly under the OpenFWI dataset root.
!ls /mnt/datasets/openfwi

CurveFault_A  CurveVel_A  FlatFault_A  FlatVel_A  README.md  Style_B
CurveFault_B  CurveVel_B  FlatFault_B  FlatVel_B  Style_A


In [9]:
import os

# Absolute location of the OpenFWI datasets. The path must start at the
# filesystem root: joining 'mnt', 'datasets', 'openfwi' without a leading
# separator produces a path relative to the current working directory.
root_dir = '/mnt/datasets/openfwi'

# Sub-directories of root_dir to process, in processing order.
data_types = [
    'CurveVel_A',
    'CurveFault_A',
    'Style_A',
    'FlatVel_B',
    'FlatFault_B',
    'CurveVel_B',
    'Style_B',
    'CurveFault_B',
    'FlatVel_A',
    'FlatFault_A',
]

In [7]:
import numpy as np
import time
from tqdm import tqdm
# Each source file holds a batch of 500 samples along the first dimension.
# Split the data into single-sample chunks and see how much time it takes to load each chunk.

ex_dir = '/mnt/datasets/openfwi/CurveVel_A'

print(f"Loading data from {ex_dir}")
exs_x = [f'{ex_dir}/data/data{i}.npy' for i in range(1, 10)]
exs_y = [f'{ex_dir}/model/model{i}.npy' for i in range(1, 10)]

# Load the data
x = np.concatenate([np.load(ex_x) for ex_x in exs_x])
y = np.concatenate([np.load(ex_y) for ex_y in exs_y])

# x and y are sliced with the same indices below, so their lengths must agree.
assert x.shape[0] == y.shape[0], (
    f"input/target sample counts differ: {x.shape[0]} vs {y.shape[0]}"
)

split_dir = "data/split_example"
os.makedirs(split_dir, exist_ok=True)

# Number of samples stored per chunk (1 => one directory per sample)
chunk_size = 1
num_chunks = x.shape[0] // chunk_size

print(f"Splitting data into {num_chunks} chunks of size {chunk_size}")

# Save each chunk as a separate pair of .npy files in its own directory
for i in tqdm(range(num_chunks)):
    chunk_x = x[i*chunk_size:(i+1)*chunk_size]
    chunk_y = y[i*chunk_size:(i+1)*chunk_size]

    # Create a directory for this chunk if it doesn't exist
    chunk_dir = os.path.join(split_dir, f'chunk_{i}')
    os.makedirs(chunk_dir, exist_ok=True)

    # Save the chunk data
    np.save(os.path.join(chunk_dir, 'data.npy'), chunk_x)
    np.save(os.path.join(chunk_dir, 'model.npy'), chunk_y)

print(f"Data split into {num_chunks} chunks and saved in {split_dir}")

# Test loading time for each chunk.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock time and may jump.
chunk_load_times = []
for i in tqdm(range(num_chunks)):
    chunk_dir = os.path.join(split_dir, f'chunk_{i}')
    start_time = time.perf_counter()
    chunk_x = np.load(os.path.join(chunk_dir, 'data.npy'))
    chunk_y = np.load(os.path.join(chunk_dir, 'model.npy'))
    end_time = time.perf_counter()
    chunk_load_times.append(end_time - start_time)

print(f"Average time to load a single chunk: {np.mean(chunk_load_times):.6f} seconds")

Loading data from /mnt/datasets/openfwi/CurveVel_A
Splitting data into 4500 chunks of size 1


100%|██████████| 4500/4500 [00:02<00:00, 2226.01it/s]


Data split into 4500 chunks and saved in data/split_example


100%|██████████| 4500/4500 [00:00<00:00, 5205.81it/s]

Average time to load a single chunk: 0.000189 seconds





In [8]:
# Compare the on-disk size of all chunks vs the original source files.
# Byte counts get their own name (`chunk_bytes`) so that the `chunk_size`
# split setting defined by the splitting cell is not overwritten here.
chunk_sizes = []
for i in range(num_chunks):
    chunk_dir = os.path.join(split_dir, f'chunk_{i}')
    chunk_bytes = sum(
        os.path.getsize(os.path.join(chunk_dir, file_name))
        for file_name in ('data.npy', 'model.npy')
    )
    chunk_sizes.append(chunk_bytes)

# Total bytes of every original input/target file pair.
original_bytes = sum(
    os.path.getsize(ex_x) + os.path.getsize(ex_y)
    for ex_x, ex_y in zip(exs_x, exs_y)
)

print(f"Total size of all chunks: {sum(chunk_sizes) / (1024 * 1024):.2f} MB")
print(f"Size of original files: {original_bytes / (1024 * 1024):.2f} MB")

Total size of all chunks: 6093.36 MB
Size of original files: 6092.26 MB


In [12]:
# Shapes of the currently bound chunk arrays with every size-1 axis removed.
# NOTE(review): chunk_x / chunk_y come from whichever cell ran last; the
# execution counter on this cell is out of sequence, so confirm the source.
chunk_x.squeeze().shape, chunk_y.squeeze().shape

((5, 1000, 70), (70, 70))

In [10]:
# Shell escape: recursively delete the data/split_example directory.
!rm -rf data/split_example

In [None]:
# CurveFault_A: pair each directory entry with the text between its last '_'
# and the following '.', then order the pairs by that text read as an integer.
fault_files = sorted(
    (
        (entry, entry.rsplit('_', 1)[-1].partition('.')[0])
        for entry in os.listdir('/mnt/datasets/openfwi/CurveFault_A/')
    ),
    key=lambda pair: int(pair[1]),
)

print(fault_files[:10])

# CurveVel_A/data: the index is whatever follows the first four characters of
# the name, up to the first '.'.
reg_files = sorted(
    (
        (entry, entry.partition('.')[0][4:])
        for entry in os.listdir('/mnt/datasets/openfwi/CurveVel_A/data/')
    ),
    key=lambda pair: int(pair[1]),
)

print(reg_files[:10])

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/datasets/openfwi/CurveFault_A/'

In [None]:
# Split all the data into chunks, each identified by an id that is prefixed
# with the data type, and save them in a single directory.

# Directory for the unpacked chunks.
# NOTE(review): presumably passed as `data_dir` to save_data_chunks — confirm at the call site.
data_dir = 'data/openfwi_unpacked'

def create_data_id(data_type, file_name):
    """Build an identifier of the form ``<data_type>_<stem>``.

    The stem is the base name of ``file_name`` cut at its first ``.``: any
    directory part and everything from the first dot onward are dropped.
    """
    base_name = os.path.basename(file_name)
    stem, _, _ = base_name.partition('.')
    return "{}_{}".format(data_type, stem)
def _natural_sort_key(name):
    """Sort key that compares digit runs numerically (``data2`` before ``data10``)."""
    import re  # local import: this helper is the only user

    # Splitting on a capturing group keeps the digit runs; they always land on
    # the odd positions, so keys stay type-compatible position by position.
    parts = re.split(r'(\d+)', name)
    return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]


def save_data_chunks(data_dir, data_types, root_dir, chunk_size=1):
    """Unpack batched ``.npy`` files into one directory per chunk.

    For every entry of ``data_types`` the matching sub-directory of
    ``root_dir`` is scanned for input/target file pairs:

    * types whose name contains ``'Fault'``: ``.npy`` files directly in the
      directory; names containing ``'seis'`` are inputs, names containing
      ``'vel'`` are targets, and the two naturally sorted lists are paired
      positionally;
    * all other types: ``data/data<K>.npy`` inputs paired with
      ``model/model<K>.npy`` targets. The files actually present are used, so
      numbering may start at any index.

    Files are processed one pair at a time instead of concatenating the whole
    data type in memory. Each chunk is written to
    ``<data_dir>/<data_type>_data_<n>/data.npy`` and ``model.npy``, with ``n``
    counting chunks from 0 within the data type.

    Args:
        data_dir: Output directory (created if missing).
        data_types: Iterable of sub-directory names under ``root_dir``.
        root_dir: Directory containing one sub-directory per data type.
        chunk_size: Samples (first-axis rows) per chunk. Chunks never span two
            source files; the last chunk of a file may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, if a data type has no
            files or unequal numbers of input and target files, or if a file
            pair holds different numbers of samples.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    os.makedirs(data_dir, exist_ok=True)
    for data_type in data_types:
        ex_dir = os.path.join(root_dir, data_type)
        if 'Fault' in data_type:
            npy_names = [f for f in os.listdir(ex_dir) if f.endswith('.npy')]
            seis_names = sorted((f for f in npy_names if 'seis' in f), key=_natural_sort_key)
            vel_names = sorted((f for f in npy_names if 'vel' in f), key=_natural_sort_key)
            exs_x = [os.path.join(ex_dir, f) for f in seis_names]
            exs_y = [os.path.join(ex_dir, f) for f in vel_names]
        else:
            input_dir = os.path.join(ex_dir, 'data')
            data_names = sorted(
                (f for f in os.listdir(input_dir)
                 if f.startswith('data') and f.endswith('.npy')),
                key=_natural_sort_key,
            )
            exs_x = [os.path.join(input_dir, f) for f in data_names]
            # data<K>.npy pairs with model<K>.npy: keep the suffix, swap the prefix.
            exs_y = [
                os.path.join(ex_dir, 'model', 'model' + f[len('data'):])
                for f in data_names
            ]

        if not exs_x or len(exs_x) != len(exs_y):
            raise ValueError(
                f"{data_type}: found {len(exs_x)} input file(s) and "
                f"{len(exs_y)} target file(s) in {ex_dir}"
            )

        chunk_idx = 0  # running chunk number within this data type
        for x_path, y_path in tqdm(list(zip(exs_x, exs_y)), desc=data_type):
            x = np.load(x_path)
            y = np.load(y_path)
            if x.shape[0] != y.shape[0]:
                raise ValueError(
                    f"sample count mismatch: {x_path} has {x.shape[0]}, "
                    f"{y_path} has {y.shape[0]}"
                )

            for start in range(0, x.shape[0], chunk_size):
                stop = start + chunk_size

                # Create a directory for this chunk if it doesn't exist
                chunk_id = create_data_id(data_type, f'data_{chunk_idx}')
                chunk_dir = os.path.join(data_dir, chunk_id)
                os.makedirs(chunk_dir, exist_ok=True)

                # Save the chunk data
                np.save(os.path.join(chunk_dir, 'data.npy'), x[start:stop])
                np.save(os.path.join(chunk_dir, 'model.npy'), y[start:stop])
                chunk_idx += 1
    