## Data Processing - Large Dataset (10y)
Contains the steps for loading, preprocessing, and storing the climate data of the large dataset (10y).

### Imports

In [1]:
from downscaling.pipeline import DownscalingPipeline
from data_operations.data_loader import DataLoader
from data_operations.utility import store_to_disk, split_dataset
from data.urls import cerra_url, era5_url
from IPython.display import display
import os
import xarray as xr



### Settings

In [2]:
# General settings: directories read from / written to by the cells below.
data_path = './data/climate_data/'
result_path = './results/'
data_split_path = './data/data_split/'
preprocessed_path = './data/preprocessed_data/'

# Names under which the additional ERA5 / CERRA datasets are stored via
# store_to_disk. Plain literals suffice because nothing is interpolated.
era5_lsm_z_file = 'era5_lsm_z'
cerra_lsm_orog_file = 'cerra_lsm_orog'

In [None]:
# STANDARDIZED ANOMALIES
# This cell and the MINMAX cell assign the same variables; whichever one is
# executed last determines the configuration used further down.
# NOTE(review): the file names carry a "4y" suffix although the notebook title
# refers to the 10y dataset - confirm the suffix is intended.
normalization_type = 'standardized_anomalies'
stats_file = f'{preprocessed_path}climatology_stats_sa_4y.json'
file_cerra = 'cerra_sa_4y'
file_era = 'era5_sa_4y'


In [None]:
# MINMAX
# This cell and the STANDARDIZED ANOMALIES cell assign the same variables;
# whichever one is executed last determines the configuration used further down.
# NOTE(review): the file names carry a "4y" suffix although the notebook title
# refers to the 10y dataset - confirm the suffix is intended.
normalization_type = 'min_max'
stats_file = f'{preprocessed_path}climatology_stats_mm_4y.json'
file_cerra = 'cerra_mm_4y'
file_era = 'era5_mm_4y'

### Code to Load via URL

In [None]:
# Build a pipeline for the normalization selected in the settings cells and
# load the climate data through it.
# NOTE(review): the four returned datasets are not referenced by any other
# visible cell (preprocessing uses era5_ds / cerra_ds / *_add_ds instead) -
# confirm whether this cell is still needed for the 10y workflow.
pipeline = DownscalingPipeline(normalization_type)
cerra_t2m, cerra_lsm_orog, era5_t2m, era5_lsm_z = pipeline.load_climate_data()

Slow loader: fetch the data via URL with `DataLoader` and store each loaded dataset to disk.

In [None]:
# Load the additional ERA5 and CERRA datasets via URL and store each one in
# data_path under the names configured in the settings cell.
# era5_add_ds and cerra_add_ds are passed to pipeline.preprocess_data later on.
data_laoder = DataLoader()
era5_add_ds = data_laoder.load_via_url(era5_url.lsm_geop_url)
store_to_disk(era5_lsm_z_file, era5_add_ds, data_path)

cerra_add_ds = data_laoder.load_via_url(cerra_url.lsm_orog_url)
store_to_disk(cerra_lsm_orog_file, cerra_add_ds, data_path)

In [None]:
# Load every ERA5 t2m URL separately and store each result in data_path as
# "era50<index>", the naming scheme the concat cell reads back.
data_laoder = DataLoader()
urls = era5_url.t2m_urls

# enumerate() yields (index, item) pairs, so the index has to be unpacked
# first; the reversed order passed the integer index to load_via_url and put
# the URL into the file name.
for idx_era5, url in enumerate(urls):
  era5_ds = data_laoder.load_via_url([url])
  store_to_disk("era50"+str(idx_era5), era5_ds, data_path)


In [None]:
# Load every CERRA t2m URL separately and store each result in data_path as
# "cerra0<index>", the naming scheme the concat cell reads back.
data_laoder = DataLoader()
urls = cerra_url.t2m_urls

# enumerate() yields (index, item) pairs, so the index has to be unpacked
# first. The file name uses this loop's own index (not the stale idx_era5),
# and the chunk gets its own variable so era5_ds is not overwritten with
# CERRA data.
for idx_cerra, url in enumerate(urls):
  cerra_chunk_ds = data_laoder.load_via_url([url])
  store_to_disk("cerra0"+str(idx_cerra), cerra_chunk_ds, data_path)


### Concat Climate Data

In [None]:
# Open the per-URL CERRA files (cerra00.nc ... cerra0130.nc) as one dataset
# and report its size.
idx_cerra = 131
file_paths = [
    os.path.join(data_path, f'cerra0{i}.nc')
    for i in range(idx_cerra)
]
cerra_ds = xr.open_mfdataset(file_paths)

size_in_bytes = cerra_ds.nbytes
size_in_gb = size_in_bytes / 1024**3  # bytes -> GiB

print(f"Size of the xarray dataset: {size_in_gb:.2f} GB")

In [None]:
# Open the per-URL ERA5 files (era50<i>.nc for i in 0..130) as one dataset
# and report its size.
idx_era5 = 131
file_paths = [os.path.join(data_path, f'era50{i}.nc') for i in range(0, idx_era5)]
# Bind the result to era5_ds: assigning it to cerra_ds replaced the CERRA
# dataset with ERA5 data and left the preprocessing cell without the
# concatenated ERA5 input it expects.
era5_ds = xr.open_mfdataset(file_paths)

size_in_bytes = era5_ds.nbytes
size_in_gb = size_in_bytes / (1024**3)  # bytes -> GiB

print(f"Size of the xarray dataset: {size_in_gb:.2f} GB")


#### Preprocess Data

In [11]:
# Bounds of the crop region, assembled as [lon_min, lat_min, lon_max, lat_max].
lon_min, lat_min = 8.3, 43.8
lon_max, lat_max = 14.5, 51.5
crop_area = [lon_min, lat_min, lon_max, lat_max]

# Preprocess the ERA5 / CERRA datasets together with their additional fields,
# using the normalization and statistics file chosen in the settings cells.
pipeline = DownscalingPipeline(normalization_type)
preprocessed_lr_data, preprocessed_hr_data = pipeline.preprocess_data(
    era5_ds,
    cerra_ds,
    era5_add_ds,
    cerra_add_ds,
    crop_region=crop_area,
    stats_filename=stats_file,
)

#### Store preprocessed data
Store the preprocessed data to avoid repeating the preprocessing steps every time the model is trained.

In [15]:
# File names embed the active normalization type, so runs with different
# normalizations are written to different files.
era5_preprocessed_file = f'era5_preprocessed_{normalization_type}_10y'
cerra_preprocessed_file = f'cerra_preprocessed_{normalization_type}_10y'

# Persist the low-resolution (ERA5) and high-resolution (CERRA) preprocessed
# datasets in preprocessed_path.
store_to_disk(era5_preprocessed_file, preprocessed_lr_data, preprocessed_path)
store_to_disk(cerra_preprocessed_file, preprocessed_hr_data, preprocessed_path)

Writing to ./data/era5_preprocessed_standardized_anomalies_10y.nc
Writing to ./data/cerra_preprocessed_standardized_anomalies_10y.nc


In [14]:
# Split the preprocessed low-/high-resolution data into train, validation and
# test parts.
(
    lr_train_data, lr_val_data, lr_test_data,
    hr_train_data, hr_val_data, hr_test_data,
) = split_dataset(preprocessed_lr_data, preprocessed_hr_data)

train_data = [lr_train_data, hr_train_data]
val_data = [lr_val_data, hr_val_data]
test_data = [lr_test_data, hr_test_data]

# NOTE(review): the "_sa_4y" suffix is hard-coded, so the names do not follow
# normalization_type or the 10y dataset named in the notebook title - confirm
# this is intended.
split_outputs = [
    ('lr_train_sa_4y', lr_train_data),
    ('hr_train_sa_4y', hr_train_data),
    ('lr_val_sa_4y', lr_val_data),
    ('hr_val_sa_4y', hr_val_data),
    ('lr_test_sa_4y', lr_test_data),
    ('hr_test_sa_4y', hr_test_data),
]

# Write every part to data_split_path, in the order listed above.
for split_name, split_ds in split_outputs:
    store_to_disk(split_name, split_ds, data_split_path)

Writing to ./data_split/lr_train_sa_same_stats_hr_10y.nc
Writing to ./data_split/hr_train_sa_same_stats_hr_10y.nc
Writing to ./data_split/lr_val_sa_same_stats_hr_10y.nc
Writing to ./data_split/hr_val_sa_same_stats_hr_10y.nc
Writing to ./data_split/lr_test_sa_same_stats_hr_10y.nc
Writing to ./data_split/hr_test_sa_same_stats_hr_10y.nc
