# Convert climate datasets for ML algorithms

Here we convert the CliMT outputs into a format suitable for ML training. We also create the normalization files used for neural network training.

In [1]:
import xarray as xr
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Variables concatenated (in this order) along `mid_levels` to build the
# network input array.
input_vars = [
    'air_temperature',
    'specific_humidity',
    'eastward_wind',
    'northward_wind',
    'air_pressure',
]

# Variables concatenated (in this order) along `mid_levels` to build the
# network output array.
output_vars = [
    'air_temperature_tendency_from_convection',
    'specific_humidity_tendency_from_convection',
    'eastward_wind_tendency_from_convection',
    'northward_wind_tendency_from_convection',
    'convective_precipitation_rate',
]

In [3]:
# Open the reference simulation files lazily (10 time steps per chunk), then
# put every variable into a fixed (time, mid_levels, lat, lon) dimension order.
inputs = xr.open_mfdataset('inputs_ref.nc', chunks={'time': 10}, combine='by_coords')
inputs = inputs.transpose('time', 'mid_levels', 'lat', 'lon')

outputs = xr.open_mfdataset('outputs_ref.nc', chunks={'time': 10}, combine='by_coords')
outputs = outputs.transpose('time', 'mid_levels', 'lat', 'lon')

## Stack data

The converted arrays have the shape `[sample, stacked_levels]`.

In [4]:
#EXPORT
def convert_data(
    raw_input_fn, raw_output_fn, conv_input_fn, conv_output_fn,
    input_vars=input_vars, output_vars=output_vars
):
    """Convert raw input/output NetCDF files into 2D sample arrays.

    The selected variables of each file are concatenated along `mid_levels`
    and the `time`, `lat` and `lon` dimensions are flattened into one
    `sample` dimension. Both results are written to NetCDF and returned.

    Parameters
    ----------
    raw_input_fn, raw_output_fn :
        Paths (or patterns) handed to `xr.open_mfdataset` for the raw
        input and output datasets.
    conv_input_fn, conv_output_fn :
        Destination paths for the converted input and output arrays.
    input_vars, output_vars :
        Names of the variables to concatenate, in stacking order. The
        defaults are the module-level lists as they were when this function
        was defined.

    Returns
    -------
    tuple
        `(inputs, outputs)`: DataArrays named 'inputs' and 'outputs' with
        dimensions (sample, mid_levels).
    """
    def _load(path):
        # Open lazily with 10 time steps per chunk and fix the dimension order.
        dataset = xr.open_mfdataset(path, chunks={'time': 10}, combine='by_coords')
        return dataset.transpose('time', 'mid_levels', 'lat', 'lon')

    def _merge_levels(dataset, names, label):
        # Concatenate the chosen variables along the vertical dimension.
        selected = [dataset[name] for name in names]
        return xr.concat(selected, dim='mid_levels').rename(label)

    def _flatten(array):
        # Collapse time/lat/lon into `sample`; the argument-free transpose
        # reverses the dimension order so that `sample` comes first.
        stacked = array.stack(sample=('time', 'lat', 'lon'))
        return stacked.transpose().reset_index('sample')

    raw_inputs = _load(raw_input_fn)
    raw_outputs = _load(raw_output_fn)

    inputs = _merge_levels(raw_inputs, input_vars, 'inputs')
    outputs = _merge_levels(raw_outputs, output_vars, 'outputs')

    inputs = _flatten(inputs)
    outputs = _flatten(outputs)

    inputs.to_netcdf(conv_input_fn)
    outputs.to_netcdf(conv_output_fn)

    return inputs, outputs

In [5]:
# Show the files in the current working directory.
!ls

1.0-Reference-simulations.ipynb  nn_data.nc
1.1-Convert-data-for-ML.ipynb	 norm_arrs.pkl
2.0-Train-ML-models.ipynb	 outputs_modified_entrainment.nc
Climt-Copy1.ipynb		 outputs_norm.nc
Climt.ipynb			 outputs_old.nc
CliMT-ML.ipynb			 outputs_ref.nc
functions.py			 __pycache__
inputs_modified_entrainment.nc	 ref_data.nc
inputs_norm.nc			 stacked_inputs_ref.nc
inputs_old.nc			 stacked_outputs_ref.nc
inputs_ref.nc			 state.pkl


In [6]:
# Convert the reference simulation.
convert_data(
    raw_input_fn='inputs_ref.nc',
    raw_output_fn='outputs_ref.nc',
    conv_input_fn='stacked_inputs_ref.nc',
    conv_output_fn='stacked_outputs_ref.nc',
)

(<xarray.DataArray 'inputs' (sample: 2949120, mid_levels: 50)>
 dask.array<shape=(2949120, 50), dtype=float64, chunksize=(20480, 10)>
 Coordinates:
     time     (sample) datetime64[ns] 2001-10-30T13:00:00 ... 2002-04-28T10:00:00
     lat      (sample) int64 0 0 0 0 0 0 0 0 0 0 ... 31 31 31 31 31 31 31 31 31
     lon      (sample) int64 0 1 2 3 4 5 6 7 8 9 ... 55 56 57 58 59 60 61 62 63
 Dimensions without coordinates: sample, mid_levels,
 <xarray.DataArray 'outputs' (sample: 2949120, mid_levels: 41)>
 dask.array<shape=(2949120, 41), dtype=float64, chunksize=(20480, 10)>
 Coordinates:
     time     (sample) datetime64[ns] 2001-10-30T13:30:00 ... 2002-04-28T10:30:00
     lat      (sample) int64 0 0 0 0 0 0 0 0 0 0 ... 31 31 31 31 31 31 31 31 31
     lon      (sample) int64 0 1 2 3 4 5 6 7 8 9 ... 55 56 57 58 59 60 61 62 63
 Dimensions without coordinates: sample, mid_levels)

In [7]:
# Convert the modified-entrainment simulation.
convert_data(
    raw_input_fn='inputs_modified_entrainment.nc',
    raw_output_fn='outputs_modified_entrainment.nc',
    conv_input_fn='stacked_inputs_modified_entrainment.nc',
    conv_output_fn='stacked_outputs_modified_entrainment.nc',
)

(<xarray.DataArray 'inputs' (sample: 2949120, mid_levels: 50)>
 dask.array<shape=(2949120, 50), dtype=float64, chunksize=(20480, 10)>
 Coordinates:
     time     (sample) datetime64[ns] 2001-10-30T13:00:00 ... 2002-04-28T10:00:00
     lat      (sample) int64 0 0 0 0 0 0 0 0 0 0 ... 31 31 31 31 31 31 31 31 31
     lon      (sample) int64 0 1 2 3 4 5 6 7 8 9 ... 55 56 57 58 59 60 61 62 63
 Dimensions without coordinates: sample, mid_levels,
 <xarray.DataArray 'outputs' (sample: 2949120, mid_levels: 41)>
 dask.array<shape=(2949120, 41), dtype=float64, chunksize=(20480, 10)>
 Coordinates:
     time     (sample) datetime64[ns] 2001-10-30T13:30:00 ... 2002-04-28T10:30:00
     lat      (sample) int64 0 0 0 0 0 0 0 0 0 0 ... 31 31 31 31 31 31 31 31 31
     lon      (sample) int64 0 1 2 3 4 5 6 7 8 9 ... 55 56 57 58 59 60 61 62 63
 Dimensions without coordinates: sample, mid_levels)

## Compute normalization files

To avoid weird effects from dividing by small numbers, I will compute a single std for each variable over all levels and use that value for every level of the variable.

In [47]:
def compute_means_stds(ds, sampling_interval=100):
    """Compute one mean and one standard deviation per variable of `ds`.

    Only every `sampling_interval`-th step along `time` is used. Each
    statistic is reduced over all dimensions of the subsample, so every
    variable ends up with a single value.

    Parameters
    ----------
    ds :
        Mapping of variable name to array; iterating it must yield the names.
    sampling_interval : int
        Step of the slice applied along `time`, starting at index 0.

    Returns
    -------
    tuple of dict
        `(means, stds)`, both keyed by variable name and holding the
        `.values` of the respective reduction.
    """
    every_nth = slice(0, None, sampling_interval)
    means, stds = {}, {}
    for name in ds:
        # Take the time subsample once and reuse it for both statistics.
        subsample = ds[name].isel(time=every_nth)
        means[name] = subsample.mean().values
        stds[name] = subsample.std().values
    return means, stds

In [60]:
def broadcast_norm(ds, stat, var):
    """Repeat per-variable statistics so they line up with the stacked levels.

    For each name in `var`, the entry `stat[name]` is repeated once per
    `mid_levels` entry of `ds[name]`, or once if the variable has no
    `mid_levels` dimension. The repeated values are joined in the order
    given by `var`.

    Parameters
    ----------
    ds :
        Mapping of variable name to an array exposing a `sizes` mapping of
        dimension name to length (as xarray objects do).
    stat : dict
        Statistic per variable name; expected to hold scalars, such as the
        dictionaries returned by `compute_means_stds`.
    var : iterable of str
        Variable names in stacking order.

    Returns
    -------
    numpy.ndarray
        Array built from the repeated statistics.
    """
    repeated = []
    for v in var:
        # Read the dimension length directly instead of probing with hasattr:
        # attribute lookup also searches `attrs`, and whether it resolves a
        # dimension that has no coordinate is not guaranteed.
        n_levels = ds[v].sizes.get('mid_levels', 1)
        repeated.extend([stat[v]] * n_levels)
    return np.array(repeated)

In [61]:
# Only the variables in `input_vars` are used by `broadcast_norm`, so restrict
# the dataset before computing statistics instead of reducing every variable.
input_means, input_stds = [
    broadcast_norm(inputs, stat, input_vars)
    for stat in compute_means_stds(inputs[input_vars])
]

In [62]:
# Only the variables in `output_vars` are used by `broadcast_norm`, so restrict
# the dataset before computing statistics instead of reducing every variable.
output_means, output_stds = [
    broadcast_norm(outputs, stat, output_vars)
    for stat in compute_means_stds(outputs[output_vars])
]

In [63]:
# Store the four normalization arrays together in one pickle file, as the
# tuple (input_means, input_stds, output_means, output_stds).
with open('norm_arrs.pkl', mode='wb') as f:
    pickle.dump(
        (input_means, input_stds, output_means, output_stds),
        f,
    )