## Downscaling Experiment (4 years)

### Set-up

##### Imports

In [2]:
# Standard library
import os

# Third-party
import tensorflow as tf
import xarray as xr
from IPython.display import display

# Project-local modules
from downscaling.pipeline import DownscalingPipeline
from data_operations.utility import store_to_disk, split_dataset  # `store_to_disk` was listed twice
from data_operations.data_loader import DataLoader
from downscaling.modelconfig import UNetModelConfiguration

# TF1-compat session configuration with GPU memory growth switched on.
# NOTE(review): the session created here is neither stored nor used later in
# this notebook — confirm it is still required next to the TF2 setting below.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
tf.compat.v1.Session(config=config)

# TF2 API: switch on memory growth for every GPU TensorFlow lists.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

##### Settings

In [3]:
# General settings: directories (relative to the working directory) and the
# base names of the additional-feature files loaded further down.
data_path = "./data/climate_data/"
result_path = "./results/"
data_split_path = "./data/data_split/"
preprocessed_path = "./data/preprocessed_data/"

era5_lsm_z_file = "era5_lsm_z"
cerra_lsm_orog_file = "cerra_lsm_orog"

In [4]:
# Configuration: STANDARDIZED ANOMALIES
# Sets the normalization type together with the matching preprocessed-file
# names and the climatology statistics file located in `preprocessed_path`.
normalization_type = 'standardized_anomalies'
file_era = 'era5_sa_4y'
file_cerra = 'cerra_sa_4y'
stats_file = f'{preprocessed_path}climatology_stats_sa_4y.json'


In [None]:
# Configuration: MINMAX
# Running this cell replaces the normalization type, the preprocessed-file
# names and the statistics file set by the standardized-anomalies cell.
normalization_type = 'min_max'
file_era = 'era5_mm_4y'
file_cerra = 'cerra_mm_4y'
stats_file = f'{preprocessed_path}climatology_stats_mm_4y.json'

### Preprocessing

#### Load from Disk

In [5]:
# Load the additional feature datasets from `data_path`:
# ERA5 first (file 'era5_lsm_z'), then CERRA (file 'cerra_lsm_orog').
era5_add_ds, cerra_add_ds = (
    DataLoader.load_from_disk(era5_lsm_z_file, data_path),
    DataLoader.load_from_disk(cerra_lsm_orog_file, data_path),
)

Cannot find the ecCodes library


In [6]:
# Open the ERA5 and CERRA files as multi-file datasets and report their size.
# File indices run from `start` (inclusive) up to the end index (exclusive).
# NOTE(review): range(84, 131) selects 47 files (84..130) — confirm this is
# the intended "4 y" window.
idx_era5 = 131
start = 84  # 4 y starting with index 84, index 0 would be start of 10 y
idx_cerra = idx_era5

# --- CERRA ---
file_paths = [
    os.path.join(data_path, f'cerra0{file_idx}.nc')
    for file_idx in range(start, idx_cerra)
]
cerra_ds = xr.open_mfdataset(file_paths)

# Dataset size: bytes -> GB (1 GB = 1024**3 bytes)
size_in_bytes = cerra_ds.nbytes
size_in_gb = size_in_bytes / (1024**3)
print(f"Size of the xarray dataset: {size_in_gb:.2f} GB")

# --- ERA5 ---
file_paths = [
    os.path.join(data_path, f'era50{file_idx}.nc')
    for file_idx in range(start, idx_era5)
]
era5_ds = xr.open_mfdataset(file_paths)

# Dataset size: bytes -> GB (1 GB = 1024**3 bytes)
size_in_bytes = era5_ds.nbytes
size_in_gb = size_in_bytes / (1024**3)
print(f"Size of the xarray dataset: {size_in_gb:.2f} GB")


Size of the xarray dataset: 17.10 GB
Size of the xarray dataset: 2.65 GB


#### Preprocess Data

In [7]:
# Crop region bounds, passed on as [lon_min, lat_min, lon_max, lat_max].
lon_min, lat_min = 8.3, 43.8
lon_max, lat_max = 14.5, 51.5
crop_area = [lon_min, lat_min, lon_max, lat_max]

# Run the preprocessing for the selected normalization type; the call returns
# the low-resolution and the high-resolution dataset, in that order.
pipeline = DownscalingPipeline(normalization_type)
preprocessed_lr_data, preprocessed_hr_data = pipeline.preprocess_data(
    era5_ds,
    cerra_ds,
    era5_add_ds,
    cerra_add_ds,
    crop_region=crop_area,
    stats_filename=stats_file,
)

In [None]:
# Show the low-resolution dataset first, then the high-resolution one.
display(preprocessed_lr_data)
display(preprocessed_hr_data)

##### Store preprocessed data
Store the preprocessed data on disk so that the preprocessing steps do not have to be repeated every time the model is trained.

In [None]:
# Write the preprocessed datasets to `preprocessed_path`.
# The file names come from the settings cell (`file_era` / `file_cerra`), the
# same names the "Loading Preprocessed Data" cell reads back. Hard-coding the
# `sa` names here would overwrite the standardized-anomalies files whenever
# the min-max configuration is active.
era5_preprocessed_file = file_era
cerra_preprocessed_file = file_cerra

store_to_disk(era5_preprocessed_file, preprocessed_lr_data, preprocessed_path)
store_to_disk(cerra_preprocessed_file, preprocessed_hr_data, preprocessed_path)

##### Loading Preprocessed Data

In [None]:
# Read the preprocessed datasets back from `preprocessed_path`, using the
# file names chosen in the settings cell (low resolution first).
preprocessed_lr_data, preprocessed_hr_data = (
    DataLoader.load_from_disk(file_era, preprocessed_path),
    DataLoader.load_from_disk(file_cerra, preprocessed_path),
)

In [None]:
# Show the high-resolution dataset first, then the low-resolution one.
display(preprocessed_hr_data, preprocessed_lr_data)

### Data Splitting

In [8]:
# Split both datasets; the result is unpacked as the three low-resolution
# parts (train, val, test) followed by the three high-resolution parts.
(
    lr_train_data,
    lr_val_data,
    lr_test_data,
    hr_train_data,
    hr_val_data,
    hr_test_data,
) = split_dataset(preprocessed_lr_data, preprocessed_hr_data)

# Pair each split as [low-resolution, high-resolution].
train_data, val_data, test_data = (
    [lr_train_data, hr_train_data],
    [lr_val_data, hr_val_data],
    [lr_test_data, hr_test_data],
)

In [None]:
# Calculate the share of time steps that each split holds, separately for the
# low-resolution and the high-resolution dataset.
total_lr_data = len(preprocessed_lr_data.time)
# The high-resolution total is taken from the high-resolution dataset
# (it was previously read from the low-resolution one).
total_hr_data = len(preprocessed_hr_data.time)

ratio_lr_train = len(lr_train_data.time) / total_lr_data
ratio_lr_val = len(lr_val_data.time) / total_lr_data
ratio_lr_test = len(lr_test_data.time) / total_lr_data

ratio_hr_train = len(hr_train_data.time) / total_hr_data
ratio_hr_val = len(hr_val_data.time) / total_hr_data
ratio_hr_test = len(hr_test_data.time) / total_hr_data

# Print ratios
print("Low-Resolution Data Ratios:")
print(f"Train: {ratio_lr_train:.2%}, Validation: {ratio_lr_val:.2%}, Test: {ratio_lr_test:.2%}")

print("\nHigh-Resolution Data Ratios:")
print(f"Train: {ratio_hr_train:.2%}, Validation: {ratio_hr_val:.2%}, Test: {ratio_hr_test:.2%}")

##### Storing Data Split

In [None]:
# Persist every split to `data_split_path`, in the order train, val, test
# (low resolution before high resolution within each pair).
# NOTE(review): the file names carry a fixed `sa_4y` tag — confirm this is
# intended when the min-max configuration is active.
splits_to_store = [
    ('lr_train_sa_4y', lr_train_data),
    ('hr_train_sa_4y', hr_train_data),
    ('lr_val_sa_4y', lr_val_data),
    ('hr_val_sa_4y', hr_val_data),
    ('lr_test_sa_4y', lr_test_data),
    ('hr_test_sa_4y', hr_test_data),
]
for split_name, split_data in splits_to_store:
    store_to_disk(split_name, split_data, data_split_path)

##### Loading Data Split

In [None]:
# Load the stored splits from `data_split_path`.
# (The redundant `import xarray as xr` was dropped: this cell does not use
# `xr`, and the imports cell at the top of the notebook already provides it.)
lr_train_data = DataLoader.load_from_disk('lr_train_sa_4y', data_split_path)
hr_train_data = DataLoader.load_from_disk('hr_train_sa_4y', data_split_path)

lr_val_data = DataLoader.load_from_disk('lr_val_sa_4y', data_split_path)
hr_val_data = DataLoader.load_from_disk('hr_val_sa_4y', data_split_path)

lr_test_data = DataLoader.load_from_disk('lr_test_sa_4y', data_split_path)
hr_test_data = DataLoader.load_from_disk('hr_test_sa_4y', data_split_path)

# Pair each split as [low-resolution, high-resolution].
train_data = [lr_train_data, hr_train_data]
val_data = [lr_val_data, hr_val_data]
test_data = [lr_test_data, hr_test_data]

### Hyperparameter Optimization

#### Parameter Setting

In [9]:
# Counter appended to result file names; the run cell increments it.
last_idx = 0

# Candidate values; the run cell picks one entry per list by index.
normalization_types = ["standardized_anomalies", "min_max"]
scheduler_types = ["step_decay", "exponential_decay", "time_decay"]
learning_rate_values = [1e-1, 1e-2, 1e-3, 1e-4]
loss_types = ["mse", "mae", "huber_loss"]
num_epochs_list = [2, 4, 10, 15, 20, 30, 50]
batch_sizes = [2, 4, 8, 16, 32, 64]
initial_filters = [16, 32, 56, 64]

# Used by the run cell to derive the filter list from an initial filter count.
model_configuration = UNetModelConfiguration()

### Hyperparameter Optimization for Standardized Anomalies

In [None]:
# One training run for the standardized-anomalies setup. Result files get the
# suffix 'sa_4y_<last_idx>', and `last_idx` is incremented after the run.
filename_suffix = f'sa_4y_{last_idx}'

# Select one entry from each candidate list defined in the parameter cell.
normalization_type = normalization_types[0]
scheduler_type = scheduler_types[2]
learning_rate_value = learning_rate_values[1]
num_epochs = num_epochs_list[5]
batch_size = batch_sizes[1]
loss_type = loss_types[2]
initial_filter = initial_filters[2]
filters = model_configuration.generate_filters(initial_filter)

model_setup = dict(
    scheduler_type=scheduler_type,
    learning_rate_value=learning_rate_value,
    num_epochs=num_epochs,
    batch_size=batch_size,
    loss_type=loss_type,
    filters=filters,
    activation_function='tanh',
    note='4y, cropped area',
)

pipeline = DownscalingPipeline(normalization_type)
pipeline.run_downscaling_pipeline(
    normalization_type=normalization_type,
    train_data=train_data,
    val_data=val_data,
    lr_test_data=lr_test_data,
    hr_test_data=hr_test_data,
    model_setup=model_setup,
    filename_suffix=filename_suffix,
    result_path=result_path,
    stats_file=stats_file,
)
last_idx = last_idx + 1