 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"></ul></div>

In [1]:
# Imports 
from importlib import reload
import utils; reload(utils); from utils import *
import data_generator; reload(data_generator); from data_generator import *
import models; reload(models); from models import *

Using TensorFlow backend.


In [2]:
# Restrict TensorFlow's GPU memory usage: with allow_growth the session
# allocates as much memory as needed instead of a fixed amount up front.
# Caveat: it's not possible to change the allocation or release memory again.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

In [3]:
# Directory holding the preprocessed SP-CAM files read throughout this notebook.
# NOTE(review): the path ends in 'SOLIN_sample' while the trailing comment says
# "Full dataset" — confirm which of the two is correct.
data_dir = '/project/meteo/w2w/A6/S.Rasp/SP-CAM/preprocessed_data/SOLIN_sample/' # Full dataset

In [4]:
# Feature and target variables, each mapped to its number of dimensions
# (2: [z, sample], 1: [sample]).
# The mappings are built from (name, ndim) pairs instead of a dict literal:
# a dict literal only keeps its order as an implementation detail before
# Python 3.7, whereas a sequence of pairs guarantees the insertion order of
# the OrderedDict on every version. The order matters because .keys() is
# handed to the data set / generator constructors further down
# (presumably fixing the variable order there — TODO confirm).
feature_vars = OrderedDict([
    ('TAP', 2),             # Temperature [z, sample]
    ('QAP', 2),             # Specific humidity [z, sample]
    ('dTdt_adiabatic', 2),  # [z, sample]
    ('dQdt_adiabatic', 2),  # [z, sample]
    ('SHFLX', 1),           # [sample]
    ('LHFLX', 1),           # [sample]
    ('SOLIN', 1),
])
target_vars = OrderedDict([
    ('SPDT', 2),            # SP temperature tendency [z, sample]
    ('SPDQ', 2),            # SP humidity tendency [z, sample]
    ('QRL', 2),
    ('QRS', 2),
    ('PRECT', 1),
    ('FLUT', 1),
])

In [5]:
import xarray as xa

In [6]:
# Open the file with xarray; chunking along 'sample' makes every variable a
# dask-backed array with 10240 samples per chunk (see the dataset repr below).
ds = xa.open_dataset(data_dir + 'SPCAM_outputs_valid_by_lon_flat.nc', chunks={'sample':10240})

In [7]:
ds

<xarray.Dataset>
Dimensions:         (lev: 21, sample: 703872)
Dimensions without coordinates: lev, sample
Data variables:
    TAP             (lev, sample) float32 dask.array<shape=(21, 703872), chunksize=(21, 10240)>
    QAP             (lev, sample) float32 dask.array<shape=(21, 703872), chunksize=(21, 10240)>
    SHFLX           (sample) float32 dask.array<shape=(703872,), chunksize=(10240,)>
    LHFLX           (sample) float32 dask.array<shape=(703872,), chunksize=(10240,)>
    LAT             (sample) float32 dask.array<shape=(703872,), chunksize=(10240,)>
    dTdt_adiabatic  (lev, sample) float32 dask.array<shape=(21, 703872), chunksize=(21, 10240)>
    dQdt_adiabatic  (lev, sample) float32 dask.array<shape=(21, 703872), chunksize=(21, 10240)>
    QRL             (lev, sample) float32 dask.array<shape=(21, 703872), chunksize=(21, 10240)>
    QRS             (lev, sample) float32 dask.array<shape=(21, 703872), chunksize=(21, 10240)>
    SPDT            (lev, sample) float32 dask

In [25]:
# Pull one 512-sample slice of TAP (all levels) into memory as a NumPy array.
ds.TAP[:, 200000:200512].values

array([[ 218.80735779,  218.93823242,  218.96647644, ...,  193.75048828,
         193.75413513,  194.04728699],
       [ 217.0735321 ,  217.43255615,  217.54283142, ...,  202.76734924,
         202.85719299,  203.112854  ],
       [ 214.65428162,  215.18395996,  214.92056274, ...,  212.1003418 ,
         212.34838867,  212.53092957],
       ..., 
       [ 271.40307617,  270.88552856,  271.00317383, ...,  296.21398926,
         296.59500122,  297.65142822],
       [ 272.71487427,  272.23104858,  272.34503174, ...,  297.5007019 ,
         298.18417358,  299.28262329],
       [ 274.06552124,  273.71673584,  274.06304932, ...,  299.44659424,
         299.99035645,  300.76113892]], dtype=float32)

In [5]:
batch_size = 512
# Determine sizes
# Length of the 'sample' dimension of SPCAM_outputs_valid_by_lon_flat.nc
# (see the dataset repr in this notebook).
n_samples = 703872
# Only complete batches are counted; floor division avoids the float round trip.
n_batches = n_samples // batch_size

# Create ID list: the start index of every *full* batch.
# The range stops at n_batches * batch_size so the trailing partial batch
# (n_samples is not a multiple of batch_size) is left out. Otherwise idxs would
# hold n_batches + 1 entries and, after shuffling, the short batch could end up
# among the first n_batches entries that the read benchmarks visit.
idxs = np.arange(0, n_batches * batch_size, batch_size)
# Shuffle with a private, seeded generator so that repeated runs time the same
# access pattern and the global NumPy random state is left untouched.
np.random.RandomState(42).shuffle(idxs)

In [9]:
def test():
    """Read TAP batch by batch through the xarray dataset `ds`.

    For each of the first `n_batches` start indices in `idxs`, a
    `batch_size`-wide slice along the second axis is loaded with `.values`.
    The loaded data is discarded; the function only exists to be timed.
    """
    for batch in range(n_batches):
        start = idxs[batch]
        _ = ds.TAP[:, start:start + batch_size].values

In [10]:
%timeit test()

87 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# test() took about 87 ms per loop (see the %timeit output above).
# Close the xarray dataset; the same file is opened with h5py next.
ds.close()

In [6]:
# Open the same file read-only with h5py to compare raw read speed with xarray.
# NOTE(review): no matching f.close() is visible in this part of the notebook.
f = h5py.File(data_dir + 'SPCAM_outputs_valid_by_lon_flat.nc', 'r')

In [9]:
def test2():
    """Read TAP batch by batch directly from the h5py file `f`.

    Visits the first `n_batches` start indices in `idxs` and slices
    `batch_size` columns from the 'TAP' dataset each time. The result is
    discarded; the function only exists to be timed.
    """
    for batch in range(n_batches):
        start = idxs[batch]
        # The dataset is looked up on every iteration, so the lookup is part
        # of what gets timed.
        _ = f['TAP'][:, start:start + batch_size]

In [10]:
%timeit test2() # 633

633 ms ± 9.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
def test3():
    """Read four variables per batch directly from the h5py file `f`.

    Same access pattern as test2, but each batch slices 'TAP', 'QAP', 'QRS'
    and 'QRL' (in that order). The results are discarded; the function only
    exists to be timed.
    """
    for batch in range(n_batches):
        start = idxs[batch]
        for name in ('TAP', 'QAP', 'QRS', 'QRL'):
            _ = f[name][:, start:start + batch_size]

In [14]:
%timeit test3() # 2.68 

2.68 s ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Sanity check: four reads at the 633 ms measured for test2 would take this
# many seconds; the 2.68 s measured for test3 is close to it.
0.633 * 4

2.532

In [11]:
# Build a DataSet from the output file plus the mean and std files
# (presumably used for normalisation — TODO confirm in DataSet).
# Feature and target names are the keys of feature_vars / target_vars.
valid_set_lon = DataSet(data_dir, 'SPCAM_outputs_valid_by_lon_flat.nc', 'SPCAM_mean_detailed.nc',
                        'SPCAM_std_detailed.nc', feature_vars.keys(), flat_input=True,
                        target_names=target_vars.keys())

In [25]:
# Generator over the same files and variable names as valid_set_lon, with
# shuffling switched off so its first batch can be compared with
# valid_set_lon.features.
gen1 = data_generator1(data_dir, 'SPCAM_outputs_valid_by_lon_flat.nc', 'SPCAM_mean_detailed.nc',
                        'SPCAM_std_detailed.nc', feature_vars.keys(),
                        target_names=target_vars.keys(), shuffle=False)

In [16]:
valid_set_lon.features[0, :5]

array([ 1.79428971,  1.92524457,  1.75502491,  1.05126357,  0.12397254], dtype=float32)

In [26]:
x1, y1 = next(gen1)

7038720 13747


In [28]:
# First five features of the first generated sample; these should equal
# valid_set_lon.features[0, :5].
x1[0, :5]

array([ 1.79428971,  1.92524457,  1.75502491,  1.05126357,  0.12397254], dtype=float32)

In [7]:
# NOTE(review): data_dir_old is not assigned in any visible cell of this
# notebook (it may come from one of the star imports or from leftover kernel
# state) — confirm where it is defined so the cell runs on a fresh kernel.
train_generator = oldDataGenerator(data_dir_old, 'SPCAM_outputs.nc', 512, feature_vars.keys())

In [8]:
x, y = next(train_generator.generate())

In [9]:
x.shape, y.shape

((512, 150), (512, 42))

In [10]:
# NOTE(review): generate() is called anew for each of the 1000 iterations, so
# every next() is the first item of a fresh generator. The timing therefore
# includes generator start-up each time — confirm this is what should be measured.
%timeit [next(train_generator.generate()) for i in range(1000)]

9.8 s ± 23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
train_generator = oldDataGenerator(data_dir_old, 'SPCAM_outputs.nc', 512, feature_vars.keys())

In [12]:
%timeit [next(train_generator.generate()) for i in range(1000)]

9.58 s ± 70.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
train_generator = test_data_generator(data_dir_old, 'SPCAM_outputs.nc', 512, feature_vars.keys())

In [6]:
tr2 = oldDataGenerator(data_dir_old, 'SPCAM_outputs.nc', 512, feature_vars.keys())

In [39]:
# NOTE(review): this raises the AttributeError shown below, because
# train_generator was a 'threadsafe_iter' object here, which has no generate()
# method. Presumably that object is meant to be iterated directly, as in the
# fit_generator(train_generator, ...) call further down — confirm.
g = train_generator.generate()

AttributeError: 'threadsafe_iter' object has no attribute 'generate'

In [7]:
# Minimal model for exercising the generators: one Dense layer with 42 units
# on 150 inputs, matching the (512, 150) / (512, 42) batch shapes printed above.
model = Sequential()
model.add(Dense(42, input_shape=(150,)))
model.compile(optimizer=Adam(), loss='mae')

In [8]:
model.fit_generator(tr2.generate(), 9024)

Epoch 1/1


<keras.callbacks.History at 0x7feafc5617b8>

In [51]:
model.fit_generator(train_generator, 9024, workers=8)

Epoch 1/1
1885/9024 [=====>........................] - ETA: 167s - loss: 0.1580

KeyboardInterrupt: 

In [15]:
model.fit_generator(train_generator.generate(), train_generator.n_batches)

Epoch 1/1


<keras.callbacks.History at 0x7fddec0d6dd8>

In [18]:
# NOTE(review): with workers=4 this run fails (see the traceback below). A
# worker thread calls next() on the generator and gets
# "ValueError: generator already executing", after which fit_generator reports
# that the generator output was None. Presumably the object returned by
# generate() cannot be shared between worker threads — confirm.
model.fit_generator(train_generator.generate(), train_generator.n_batches, workers=4)

Epoch 1/1


Exception in thread Thread-8:
Traceback (most recent call last):
  File "/home/s/S.Rasp/.conda/envs/cbrain_gpu/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/s/S.Rasp/.conda/envs/cbrain_gpu/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/s/S.Rasp/.conda/envs/cbrain_gpu/lib/python3.6/site-packages/keras/engine/training.py", line 612, in data_generator_task
    generator_output = next(self._generator)
ValueError: generator already executing



ValueError: output of generator should be a tuple `(x, y, sample_weight)` or `(x, y)`. Found: None