## TESTING Padding 
The low-resolution (LR) data needs to be padded with NaN values to avoid a dimension mismatch while fitting the U-Net model.
In this notebook I validate that the padding functionality works correctly.

In [2]:
import numpy as np
import pandas as pd
import xarray as xr
from data_loader.data_service import pad_lr_data_to_match_hr


In [7]:
# Seed NumPy's global RNG so the random example fields (and therefore the
# displayed outputs) are identical on every Restart & Run All.
np.random.seed(42)

# Coordinates of the coarse ERA5 (LR) grid: 3 latitudes x 2 longitudes, 3 daily steps
lon_era5 = np.array([10, 15])
lat_era5 = np.array([30, 35, 40])
time_era5 = pd.date_range("2006-01-01", periods=3, freq="D")

# Coordinates of the finer CERRA (HR) grid: 5 latitudes x 4 longitudes.
# Every ERA5 grid point is also present on this grid.
lon_cerra = np.array([10, 12.5, 15, 17.5])
lat_cerra = np.array([30, 32, 35, 37, 40])

# Random example fields for ERA5 (LR) and CERRA (HR); both share the time axis
era5_data = np.random.rand(len(time_era5), len(lat_era5), len(lon_era5))
cerra_data = np.random.rand(len(time_era5), len(lat_cerra), len(lon_cerra))

era5_ds = xr.Dataset(
    {"t2m": (("time", "latitude", "longitude"), era5_data)},
    coords={"longitude": lon_era5, "latitude": lat_era5, "time": time_era5},
)

cerra_ds = xr.Dataset(
    {"t2m": (("time", "latitude", "longitude"), cerra_data)},
    coords={"longitude": lon_cerra, "latitude": lat_cerra, "time": time_era5},
)

# Pad the LR dataset against the HR dataset (note the argument order: HR first, LR second)
padded_lr_data = pad_lr_data_to_match_hr(cerra_ds, era5_ds)

# Show the original LR dataset, the HR dataset and the padded LR result for comparison
display(era5_ds)
display(cerra_ds)
display(padded_lr_data)


In [8]:
# Repeat the padding check for a dataset whose only variable is "lsm".
# Reuses the ERA5 / CERRA coordinate arrays defined in the t2m example cell.
grid_dims = ("time", "latitude", "longitude")
lr_shape = (len(time_era5), len(lat_era5), len(lon_era5))
hr_shape = (len(time_era5), len(lat_cerra), len(lon_cerra))

# LR "lsm" field on the ERA5 grid
lsm_data_lr = np.random.rand(*lr_shape)
lsm_ds_lr = xr.Dataset(
    data_vars={"lsm": (grid_dims, lsm_data_lr)},
    coords=dict(longitude=lon_era5, latitude=lat_era5, time=time_era5),
)

# HR "lsm" field on the CERRA grid
lsm_data_hr = np.random.rand(*hr_shape)
lsm_ds_hr = xr.Dataset(
    data_vars={"lsm": (grid_dims, lsm_data_hr)},
    coords=dict(longitude=lon_cerra, latitude=lat_cerra, time=time_era5),
)

# HR dataset is passed first, LR dataset second
padded_lsm_data = pad_lr_data_to_match_hr(lsm_ds_hr, lsm_ds_lr)

# Show HR input, LR input and the padded LR result, in that order
display(lsm_ds_hr, lsm_ds_lr, padded_lsm_data)


In [10]:
# Padding check for datasets holding two variables each:
# the LR dataset carries "z" and "lsm", the HR dataset carries "orog" and "lsm".
# Reuses the ERA5 / CERRA coordinate arrays defined in the t2m example cell.
grid_dims = ("time", "latitude", "longitude")
lr_shape = (len(time_era5), len(lat_era5), len(lon_era5))
hr_shape = (len(time_era5), len(lat_cerra), len(lon_cerra))

# LR fields on the ERA5 grid
lsm_data_lr = np.random.rand(*lr_shape)
z_data_lr = np.random.rand(*lr_shape)
lsm_zds_lr = xr.Dataset(
    data_vars={
        "z": (grid_dims, z_data_lr),
        "lsm": (grid_dims, lsm_data_lr),
    },
    coords=dict(longitude=lon_era5, latitude=lat_era5, time=time_era5),
)

# HR fields on the CERRA grid
# NOTE: this rebinds `lsm_ds_hr`, replacing the single-variable dataset from the previous cell.
lsm_data_hr = np.random.rand(*hr_shape)
orog_data_hr = np.random.rand(*hr_shape)
lsm_ds_hr = xr.Dataset(
    data_vars={
        "orog": (grid_dims, orog_data_hr),
        "lsm": (grid_dims, lsm_data_hr),
    },
    coords=dict(longitude=lon_cerra, latitude=lat_cerra, time=time_era5),
)

# HR dataset is passed first, LR dataset second
padded_lsm_z_data = pad_lr_data_to_match_hr(lsm_ds_hr, lsm_zds_lr)

# Show HR input, LR input and the padded LR result, in that order
display(lsm_ds_hr, lsm_zds_lr, padded_lsm_z_data)

In [5]:
# Validate the padding of the t2m example: the padded LR array must contain as
# many values as the HR array, and every value that was added must be NaN.
t2m_era5_padded = padded_lr_data['t2m']
t2m_era5 = era5_ds['t2m']
t2m_cerra = cerra_ds['t2m']

# Total number of values in each t2m array
total_values_era5 = t2m_era5.size
total_values_cerra = t2m_cerra.size
diff = total_values_cerra - total_values_era5

total_values_era5_padded = t2m_era5_padded.size

# Count NaN / non-NaN values in the padded array (the mask is computed once)
nan_mask = np.isnan(t2m_era5_padded.values)
nan_count = int(nan_mask.sum())
non_nan_count = int((~nan_mask).sum())

print(f"Number values in cerra t2m: {total_values_cerra}")
print(f"Number values in era5 t2m: {total_values_era5}")
print(f"Diff: {diff}")

print(f"Number values in era5 t2m padded: {total_values_era5_padded}")
print(f"Number of NaN values in t2m padded: {nan_count}")
print(f"Number of non NaN values in t2m padded: {non_nan_count}")

# Turn the visual check into an enforced one. The original ERA5 field comes
# from np.random.rand and therefore contains no NaN, so every NaN in the
# padded array must have been introduced by the padding.
assert total_values_era5_padded == total_values_cerra, (
    "padded LR t2m must have as many values as HR t2m"
)
assert nan_count == diff, (
    "number of NaN values must equal the size difference between HR and LR"
)
assert non_nan_count == total_values_era5, (
    "all original LR values must survive the padding"
)

Number values in cerra t2m: 60
Number values in era5 t2m: 18
Diff: 42
Number values in era5 t2m padded: 60
Number of NaN values in t2m padded: 42
Number of non NaN values in t2m padded: 18
