This notebook examines the functionality of our dataset standardization approach.

In [3]:
import sys
import paths
from itertools import chain
import numpy as np

from TRMM import TRMM
from ERA import ERA
from ModelHelpers import ModelHelpers

def train_test_split(datasets,
                     prediction_ts,
                     prediction_ts_test,
                     onset_ts,
                     years=range(1979, 2018),
                     years_train=range(1979, 2010),
                     years_dev=range(2010, 2013),
                     years_test=range(2013, 2018)):
    """
    Build standardized training and test tensors and print diagnostics.

    All heavy lifting is delegated to ``ModelHelpers``. The prepared arrays
    only live inside this function: nothing is returned, the function is
    useful for the shapes and statistics it prints along the way.

    :datasets: List of datasets to use as features (handed to
               ``ModelHelpers.prepare_datasets``)
    :prediction_ts: Prediction timestamps used for the training outcomes and inputs
    :prediction_ts_test: Prediction timestamps used for the test outcomes and inputs
    :onset_ts: Onset timestamps handed to ``ModelHelpers.generate_outcomes``
    :years: Not used by this function
    :years_train: Years whose data form the training tensors
    :years_dev: Years that are chained after ``years_train`` when generating
                the (non-test) outcomes; no dev tensors are built here
    :years_test: Years whose data form the test tensors

    :return: None
    """

    # both outcome requests share the same flags
    outcome_opts = dict(numerical=True, sequence=True)

    # outcomes for train+dev years, then for the test years
    train_dev_outcomes = ModelHelpers.generate_outcomes(
        prediction_ts, onset_ts, chain(years_train, years_dev), **outcome_opts)
    test_outcomes = ModelHelpers.generate_outcomes(
        prediction_ts_test, onset_ts, years_test, **outcome_opts)
    print(test_outcomes)

    # training inputs and targets
    train_inputs = ModelHelpers.prepare_datasets(years_train, datasets, prediction_ts)
    train_targets = ModelHelpers.stack_outcomes(train_dev_outcomes, years_train, augmented=True)
    print(train_inputs[0][0][0])
    print('> X_train', train_inputs.shape, 'y_train', train_targets.shape)

    # standardize the training inputs and keep the statistics for the test inputs
    train_inputs, train_mean, train_std = ModelHelpers.normalize_channels(train_inputs, seperate=True)
    print(train_mean)
    print(train_std)

    # test inputs and targets
    test_inputs = ModelHelpers.prepare_datasets(years_test, datasets, prediction_ts_test)
    test_targets = ModelHelpers.stack_outcomes(test_outcomes, years_test, augmented=True)
    print(test_inputs.shape)
    print('> X_test', test_inputs.shape, 'y_test', test_targets.shape)

    # the test inputs are standardized with the training statistics
    test_inputs = ModelHelpers.normalize_channels(test_inputs, mean=train_mean, std=train_std)
        
def normalize_channels(arr, standardize=True, seperate=False, mean=None, std=None):
    """
    Standardize a tensor to zero mean and unit variance.

    Statistics are reduced over the first two axes only (``axis=(0, 1)`` with
    ``keepdims=True``), so every position along the remaining axes gets its
    own mean and standard deviation, and the statistics broadcast against
    ``arr``.

    A standard deviation of exactly zero (a position that is constant over
    the first two axes) is replaced by 1 before dividing, so such positions
    come out as ``arr - mean`` instead of NaN/inf.

    see: https://stackoverflow.com/questions/42460217/how-to-normalize-a-4d-numpy-array
    and: https://stackoverflow.com/questions/40956114/numpy-standardize-2d-subsets-of-a-4d-array

    :arr: Array with at least two dimensions
    :standardize: Accepted for interface compatibility; currently not used
    :seperate: Accepted for interface compatibility; currently not used
    :mean: Precomputed mean(s) to apply instead of computing them from ``arr``
    :std: Precomputed standard deviation(s) to apply instead of computing them

    :return: Only the standardized array if both ``mean`` and ``std`` are
             given. Otherwise the tuple ``(standardized, mean, std)`` with
             statistics computed from ``arr`` (if just one of ``mean``/``std``
             is given it is ignored and both are recomputed). The returned
             ``std`` already has zeros replaced by 1, so it can be reused
             safely on other data.
    """

    # reuse externally supplied statistics (e.g. those of the training set)
    if mean is not None and std is not None:
        # guard against division by zero for constant positions
        safe_std = np.where(np.asarray(std) == 0, 1.0, std)
        return (arr - mean) / safe_std

    # statistics over the first two axes, kept broadcastable against arr
    mean = np.mean(arr, axis=(0, 1), keepdims=True)
    std = np.std(arr, axis=(0, 1), keepdims=True)

    # guard against division by zero for constant positions
    std = np.where(std == 0, 1.0, std)

    return (arr - mean) / std, mean, std

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# year ranges: the full span and its two consecutive halves
YEARS = range(2010, 2018)
YEARS_TRAIN = range(2010, 2014)
YEARS_TEST = range(2014, 2018)

# onset dates and timestamps
onset_dates, onset_ts = ModelHelpers.load_onset_dates(
    version='v2',
    objective=True,
)

# prediction timestamps: one sequence per split, generated with identical settings
# NOTE(review): '{}-05-22' looks like a per-year date template; confirm in ModelHelpers
prediction_ts = ModelHelpers.generate_prediction_ts(
    '{}-05-22',
    YEARS_TRAIN,
    onset_dates=onset_dates,
    sequence_length=29,
    sequence_offset=1,
    example_length=60,
)
prediction_ts_test = ModelHelpers.generate_prediction_ts(
    '{}-05-22',
    YEARS_TEST,
    onset_dates=onset_dates,
    sequence_length=29,
    sequence_offset=1,
    example_length=60,
)

# setup a filter function
# this later prevents any data after the prediction timestamp from being fed as input
# we do this externally to allow overriding or extending the filter function if needed
def filter_fun(df, year):
    """
    Filter ``df`` for the given year via ``ModelHelpers.filter_until``.

    The cutoff handed to ``filter_until`` is ``onset_ts[year]``, read from the
    notebook-level ``onset_ts``.

    NOTE(review): the comment preceding this function speaks of the
    prediction timestamp, while the cutoff used here is the onset timestamp
    of the year; confirm which one is intended.

    :df: Data to filter
    :year: Key into ``onset_ts`` selecting the cutoff

    :return: Whatever ``ModelHelpers.filter_until`` returns for ``df`` and the cutoff
    """
    cutoff = onset_ts[year]
    return ModelHelpers.filter_until(df, cutoff)

# load the ERA dataset for the full year span
# (YEARS instead of a second hard-coded range, so the span is defined in one place)
print("> Loading Dataset")
dataset = ERA.load_dataset_v2(
    YEARS,
    invalidate=False,
    level=1000,
    variables=['r', 't'],
    filter_fun=filter_fun,
    aggregation_resolution=None,
)

# build the raw (not yet standardized) input tensors for both splits
# from the same two variables, 'r' and 't'
X_train = ModelHelpers.prepare_datasets(YEARS_TRAIN, [dataset['r'], dataset['t']], prediction_ts)
X_test = ModelHelpers.prepare_datasets(YEARS_TEST, [dataset['r'], dataset['t']], prediction_ts_test)

> Loading Dataset
> Loading from cache...
> Loading from cache...
Processed 2010 (30, 61, 49, 49, 2)
Processed 2011 (30, 61, 49, 49, 2)
Processed 2012 (30, 61, 49, 49, 2)
Processed 2013 (30, 61, 49, 49, 2)
Processed 2014 (30, 61, 49, 49, 2)
Processed 2015 (30, 61, 49, 49, 2)
Processed 2016 (30, 61, 49, 49, 2)
Processed 2017 (30, 61, 49, 49, 2)


In [5]:
# Standardize the training tensor and keep its statistics (X_mean, X_std)
# so they can be applied to the test tensor further down.
# NOTE: this rebinds X_train to the standardized result; re-running this cell
# without first re-running the cell that builds X_train standardizes data
# that is already standardized.
# NOTE(review): `seperate=True` is accepted by normalize_channels but never read there.
X_train, X_mean, X_std = normalize_channels(X_train, seperate=True)

In [6]:
X_train.shape

(120, 61, 49, 49, 2)

In [7]:
X_test.shape

(120, 61, 49, 49, 2)

In [8]:
X_mean.shape

(1, 1, 49, 49, 2)

In [9]:
X_std.shape

(1, 1, 49, 49, 2)

In [10]:
X_mean

array([[[[[ 76.03646987, 300.25469346],
          [ 76.00147785, 300.27541551],
          [ 75.75109012, 300.28128868],
          ...,
          [ 83.39855597, 299.93071621],
          [ 88.18274285, 300.52245945],
          [ 88.25598933, 300.92124126]],

         [[ 75.45281735, 300.3042589 ],
          [ 75.34161266, 300.29842613],
          [ 75.42493168, 300.30611566],
          ...,
          [ 81.83860695, 300.19316423],
          [ 80.94718736, 300.49322597],
          [ 78.32655993, 300.74613697]],

         [[ 75.61353495, 300.27163686],
          [ 75.62301163, 300.24182219],
          [ 75.29544047, 300.31048001],
          ...,
          [ 76.23657294, 300.78393032],
          [ 76.09455551, 300.88782531],
          [ 76.23910005, 300.96528742]],

         ...,

         [[ 39.98055257, 293.501531  ],
          [ 40.99420661, 293.75472701],
          [ 42.37022626, 294.0627037 ],
          ...,
          [ 39.69289786, 293.26210441],
          [ 43.75700562, 293.19795277],

In [11]:
# Overall mean of the test tensor after standardizing it with the TRAINING
# statistics, going through the same helper that produced them rather than
# repeating the formula inline. A value near 0 means the test data is centred
# similarly to the training data.
np.mean(normalize_channels(X_test, mean=X_mean, std=X_std))

0.08179388181343744