# Training data preparation

This notebook prepares the data used to train the emulator model.

## Approach

In order to allow the emulator to be applied to arbitrary atmospheric grids, the input profiles are interpolated to equidistant points along the pencil beam. Since the simulations assume a slant pencil beam through a plane-parallel atmosphere, the effective distance along the beam must be calculated from the layer distance (500 m) and the Earth incidence angle $\theta$: a vertical distance $h$ corresponds to a path distance $h / \cos\theta$.

All profiles are interpolated to 36 equidistant points along the pencil beam.

> *NOTE*: For simplicity, I extended the profile to be defined on the layer boundaries instead of in between.

In [9]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from pathlib import Path
# Directory that is searched for profile files.
profile_path = Path('/edata2/spencer/eddington_emulator/era5_profiles_ocean_500m/')
profile_files = sorted(profile_path.glob("*_500m.bin"))

# Keep only the profile files for which a simulation file exists in the same
# directory (same file name with ".bin" replaced by "_simobs.bin"). The two
# lists are index-aligned: sim_files[i] belongs to matched_profile_files[i].
matched_profile_files = []
sim_files = []
for profile_file in profile_files:
    sim_file = profile_file.with_name(profile_file.name.replace(".bin", "_simobs.bin"))
    if not sim_file.exists():
        continue
    matched_profile_files.append(profile_file)
    sim_files.append(sim_file)

In [12]:
# Number of profile files for which a matching simulation file was found.
len(matched_profile_files)

119

In [14]:
from poem.emulator.training_data import read_profile_file, read_sim_file
from filelock import FileLock
from tqdm import tqdm

# Target grid along the pencil beam used by extract_training_data:
# N_STEPS is the number of equidistant points every profile is interpolated to;
# STEP_DISTANCE is the spacing between consecutive points, in the same units as
# the slant-path distance derived from the profile heights
# (presumably km — TODO confirm against the units returned by read_profile_file).
N_STEPS = 36
STEP_DISTANCE = 1.0

def extract_training_data(
    profile_file: Path,
    sim_file: Path,
    output_file: Path,
    rng: "np.random.Generator | None" = None,
) -> None:
    """
    Extract training data from a profile/sim file pair and append it to a file.

    Every profile is put on a slant-path coordinate (height divided by the
    cosine of the earth incidence angle), interpolated to ``N_STEPS`` points
    spaced ``STEP_DISTANCE`` apart (values outside the profile's range are
    filled with 0), and merged with the simulation results of the same
    profile. The profiles are processed in random order and written to
    ``output_file``. If that file already exists, its content is loaded,
    combined with the new samples, and the combined set is reshuffled before
    it is written back.

    The output is first written to a temporary file next to ``output_file``
    and then moved into place, so an interrupted write cannot destroy samples
    accumulated by earlier calls.

    Args:
        profile_file: A path object pointing to the profile file.
        sim_file: A path object pointing to the sim file.
        output_file: The file to which to write the training data. Missing
            parent directories are created.
        rng: Optional ``numpy.random.Generator`` used for all shuffling. If
            ``None`` (the default), numpy's global random state is used.
    """
    output_file = Path(output_file)
    # Create the output directory up front: the lock file lives next to the
    # output file, and failing here is much cheaper than failing after the
    # long-running extraction loop below.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    permute = np.random.permutation if rng is None else rng.permutation

    profiles = read_profile_file(profile_file)
    simulations = read_sim_file(sim_file)

    profile_inds = permute(profiles.profiles.size)
    # The interpolation targets are the same for every profile.
    path_steps = STEP_DISTANCE * np.arange(N_STEPS)

    training_profiles = []

    for profile_ind in tqdm(profile_inds):
        profile_data = profiles[{"profiles": profile_ind}]
        sim_data = simulations[{"profiles": profile_ind}]

        # Convert height to distance along the slant beam.
        # Assumes 'height' is defined along the 'levels' dimension — TODO confirm.
        eia = profile_data.earth_incidence_angle.data
        path_distance = profile_data.height.data / np.cos(np.deg2rad(eia))

        # Use the path distance as the 'levels' coordinate so that the
        # interpolation below is performed along the beam.
        profile_data["levels"] = ("levels", path_distance)
        profile_data = profile_data.interp(
            levels=path_steps,
            kwargs={"fill_value": 0.0}
        )

        training_profile = xr.merge([profile_data, sim_data])
        training_profiles.append(training_profile)

    training_profiles = xr.concat(training_profiles, dim="profiles")

    # The lock serializes the read-modify-write cycle on the output file.
    lock = FileLock(str(output_file) + ".lock")
    with lock:
        if output_file.exists():
            # load_dataset reads everything into memory, so the file can be
            # replaced afterwards.
            existing = xr.load_dataset(output_file)
            training_profiles = xr.concat([existing, training_profiles], dim="profiles")
            training_profiles = training_profiles[
                {"profiles": permute(training_profiles.profiles.size)}
            ]

        # Built from the dataset that is actually written.
        enc = {
            var: {"dtype": "float32", "zlib": True}
            for var in training_profiles.variables
        }

        # Write to a sibling temporary file and move it into place afterwards.
        tmp_file = output_file.with_name(
            output_file.stem + ".partial" + output_file.suffix
        )
        try:
            training_profiles.to_netcdf(tmp_file, encoding=enc)
            tmp_file.replace(output_file)
        finally:
            # Only present here if writing or moving failed.
            if tmp_file.exists():
                tmp_file.unlink()

In [15]:
# Shuffle the matched file pairs with a fixed seed so that the split into
# validation and training files is reproducible across kernel restarts.
# extract_training_data appends to existing output files, so an unseeded split
# could put the same file into both sets when the notebook is re-run.
# NOTE: output files written with an earlier, unseeded split should be
# regenerated before re-running the extraction cells.
SPLIT_SEED = 42
inds = np.random.default_rng(SPLIT_SEED).permutation(len(sim_files))

In [16]:
# The first four shuffled file pairs are written to the validation file.
validation_file = "/home/simon/data/poem/full/validation_data.nc"
for file_index in inds[:4]:
    extract_training_data(
        matched_profile_files[file_index],
        sim_files[file_index],
        validation_file,
    )

100%|██████████████████████████████████████████████████████████| 548036/548036 [26:38<00:00, 342.88it/s]
100%|██████████████████████████████████████████████████████████| 543400/543400 [26:32<00:00, 341.29it/s]
100%|██████████████████████████████████████████████████████████| 541684/541684 [26:10<00:00, 344.85it/s]
100%|██████████████████████████████████████████████████████████| 540104/540104 [26:19<00:00, 341.91it/s]


In [None]:
# All remaining shuffled file pairs are written to the training file.
training_file = "/home/simon/data/poem/training_data.nc"
for file_index in inds[4:]:
    extract_training_data(
        matched_profile_files[file_index],
        sim_files[file_index],
        training_file,
    )

100%|██████████████████████████████████████████████████████████| 543368/543368 [26:23<00:00, 343.09it/s]
100%|██████████████████████████████████████████████████████████| 535848/535848 [25:41<00:00, 347.55it/s]
100%|██████████████████████████████████████████████████████████| 547424/547424 [26:27<00:00, 344.80it/s]
100%|██████████████████████████████████████████████████████████| 547848/547848 [26:23<00:00, 346.06it/s]
100%|██████████████████████████████████████████████████████████| 539684/539684 [25:55<00:00, 346.90it/s]
100%|██████████████████████████████████████████████████████████| 547572/547572 [26:19<00:00, 346.58it/s]
100%|██████████████████████████████████████████████████████████| 549128/549128 [26:17<00:00, 348.07it/s]
100%|██████████████████████████████████████████████████████████| 542584/542584 [25:55<00:00, 348.91it/s]
100%|██████████████████████████████████████████████████████████| 553928/553928 [26:44<00:00, 345.18it/s]
100%|██████████████████████████████████████████████████

In [None]:
# Open the training file written by the extraction cell above through the
# project's TrainingData class (presumably a dataset wrapper — see
# poem.emulator.training_data for its actual behavior).
from poem.emulator.training_data import TrainingData
training_data = TrainingData("/home/simon/data/poem/training_data.nc")

## Inspect a training sample

In [87]:
# Fetch the element at index 0; it unpacks into two values
# (presumably model input x and target y — TODO confirm against TrainingData).
x, y = training_data[0]

In [None]:
# NOTE(review): leftover scratch cell with no effect; can be removed.
1