In [1]:
import hydra
with hydra.initialize(config_path=".", version_base="1.3"):
    cfg = hydra.compose(config_name="config_new")
cfg

{'dataset': {'base_path': '/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0', 'samples_per_file': 1460}, 'sample': {'height': 721, 'width': 1440, 'channels': 21}, 'include': {'static_data': True, 'spatial_info': True, 'temporal_info': True, 'solar_radiation': True}, 'datapipe': {'batch_size': 1, 'num_threads': 2, 'prefetch_queue_depth': 2}, 'num_testing_steps': 8, 'num_testing_samples_per_rank': 1, 'input_channels': 21, 'additional_input_channels': 10, 'output_channels': 21, 'mesh_level': 6, 'activation_fn': 'silu', 'hidden_dim': 512, 'hidden_layers': 1, 'aggregation_op': 'sum', 'processor_layers': 16, 'dtype': 'bfloat16', 'phase1_iters': 1000, 'phase2_iters': 9000, 'phase3_iters': 11000, 'phase3_increments': 11, 'lr': 0.001, 'phase3_lr': 0.0001, 'testing_frequency': 50, 'cache_dir': '/iopsstor/scratch/cscs/stefschu/DSM500/cache', 'wb_mode': 'online', 'wb_entity': 'schups', 'wb_watch_model': False, 'checkpoint_enabled': True, 'checkpoint_frequency': 5, 'checkpoint_folder': 

## List files

In [2]:
import os
import glob

def list_h5_files(directory):
    return glob.glob(os.path.join(directory, '**', '*.h5'), recursive=True)

h5_files = list_h5_files(cfg.dataset.base_path)
for file in h5_files:
    print(file)
len(h5_files)

/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/out_of_sample/2018.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/test/2017.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/test/2016.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/2008.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/2009.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/2012.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/2006.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/2014.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/2000.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/1996.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/2005.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/1982.h5
/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/train/1998.h5
/iopsstor/scratch/c

40

In [None]:
import numpy as np

climatology = np.zeros((cfg.dataset.samples_per_file, cfg.sample.channels, cfg.sample.height, cfg.sample.width), dtype=np.float32)
climatology.shape

(1460, 21, 721, 1440)

In [None]:
temp_container = np.zeros((len(h5_files), cfg.sample.channels, cfg.sample.height, cfg.sample.width), dtype=np.float32)
temp_container.shape

(40, 21, 721, 1440)

In [10]:
temp_container

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

In [None]:
import h5py
from pathlib import Path

# For each timestep in a year
for idx_in_year in range(cfg.dataset.samples_per_file):
    print(f"{idx_in_year=} {idx_in_year/cfg.dataset.samples_per_file*100:.2f}%")
    # Open each file and pull that time of the year
    for file_i, file in enumerate(h5_files):
        #print("\t", file_i)
        with h5py.File(file, 'r') as f:
            temp_container[file_i] = f["fields"][idx_in_year]
    climatology[idx_in_year] = np.mean(temp_container, axis=0)

np.save(Path(cfg.dataset.base_path) / "climatology.npy", climatology)


idx_in_year=0 0.00%
idx_in_year=1 0.07%
idx_in_year=2 0.14%
idx_in_year=3 0.21%
idx_in_year=4 0.27%


KeyboardInterrupt: 

In [4]:
import h5py
with h5py.File("/iopsstor/scratch/cscs/stefschu/DSM500/data/FCN_ERA5_data_v0/climatology.h5", 'r') as f:
    print(f.keys(), f["climatology"].dtype)

<KeysViewHDF5 ['climatology']> float32
