In [1]:
import pickle
import os
import random
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

2022-04-17 21:23:54.728898: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


# data pre-processing

In [9]:
def wh_ETL(dir_path, idx, mode='train', feature_range=(0, 1), pickle_save_dir='models_save/test1/pickles'):
    # load from folder
    whs = list(Path(dir_path).glob('*.npy'))
    whs = np.stack([np.load(path) for path in whs])
    print(whs.shape)
    nb_test_samples = int(0.2 * whs.shape[0])
    whs = whs[idx]
    if mode is 'train':
        whs = whs[2*nb_test_samples:]
    elif mode is 'val':
        whs = whs[0:nb_test_samples]
    else:
        whs = whs[nb_test_samples:2*nb_test_samples]    
    
    # move sample axis to the last, reshape, and remove the first layer
    whs = np.moveaxis(whs, 1, 3).reshape(-1, 34)[:, 1:]
    
    # Min Max normalization
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled_whs = scaler.fit_transform(whs)
    
    # PCA
    pca = PCA(n_components=0.96, svd_solver='full')
    out = pca.fit_transform(scaled_whs)
    
    # save scaler and pca as pickles
    if mode is 'train':
        Path(pickle_save_dir).mkdir(parents=True, exist_ok=True)
        pickle.dump(scaler, open(f'{pickle_save_dir}/scaler.pkl', 'wb'))
        pickle.dump(pca, open(f'{pickle_save_dir}/pca.pkl', 'wb'))
    return out.astype(np.float32), pca


def load_X(dir_path, feature_range=(0, 1)):
    Vars = []
    names = [p.name for p in Path(X_path).glob('*/')]
    for name in names:
        path = Path(X_path).joinpath(name)
        data = np.stack([np.load(p) for p in path.glob('*.npy')])
        if name == 'cape':
            data = data[:, np.newaxis, ..., np.newaxis]
            data = np.concatenate([data]*34, axis=1)
        else:
            data = data[..., np.newaxis]
        Vars.append(data)
    X = np.concatenate(Vars, axis=-1)[:, 1:, ...]
    X = np.moveaxis(X, 1, -2)
    shape = X.shape
        
    # stdandardlization
    scaler = MinMaxScaler(feature_range=feature_range)
    X = X.reshape(-1, 9)
    X = scaler.fit_transform(X)
    X = X.reshape(shape)
    
    return X.astype(np.float32), scaler


def pad_boundary(arr, kernel_size):
    '''
    arr : input array
    kernel_size : training sample size
    '''

    pad_size = int((kernel_size - 1) / 2)

    if len(arr.shape) == 5:
        pad_arr = np.pad(arr, pad_size, 'wrap')
        pad_arr = pad_arr[pad_size:-pad_size, ..., pad_size:-pad_size, pad_size:-pad_size]
    else:
        raise ValueError('len(arr.shape) should be 5')
    return pad_arr


def X_ETL(path, idx, mode='train', feature_range=(0,1), pickle_save_dir='models_save/test1/pickles'):
    # load_X and select train/val/test set due to lack of memory
    X, scaler = load_X(path, feature_range)
    nb_test_samples = int(0.2 * X.shape[0])
    
    # shuffle
    X = X[idx]
    
    # split
    if mode is 'train':
        X = X[2*nb_test_samples:]
    elif mode is 'val':
        X = X[0:nb_test_samples]
    else:
        X = X[nb_test_samples:2*nb_test_samples]
    
    print(X.shape)
    # CNN input style
    X = pad_boundary(X, 7)
    print(X.shape)
    X = np.lib.stride_tricks.sliding_window_view(X, (7, 7), axis=(1, 2))
    print(X.shape)
    X = np.moveaxis(X, 4, -1)
    print(X.shape)
    
    # save scaler and pca as pickles
    if mode is 'train':
        Path(pickle_save_dir).mkdir(parents=True, exist_ok=True)
        pickle.dump(scaler, open(f'{pickle_save_dir}/scaler_X.pkl', 'wb'))
    return X.reshape(-1, 33, 7, 7, 9)

In [10]:
# split data
seed = 777
idx = np.arange(1314)
random.seed(seed)
random.shuffle(idx)
nb_test_samples = int(0.2 * idx.shape[0])

In [11]:
# y preprocess
wh_path = 'data/target/wh/'
pickle_save_dir='model_save/test1/pickles'
mode = 'test'
y, pca = wh_ETL(wh_path, idx, mode=mode, pickle_save_dir=pickle_save_dir)

KeyboardInterrupt: 

In [64]:
y.shape, y.dtype

((268288, 5), dtype('float32'))

In [45]:
X.shape

(268288, 33, 7, 7, 9)

In [35]:
X_shape

(44402688, 9)

In [34]:
X_shape = X.shape

In [21]:
pca = pickle.load(open('models_save/test1/pickles/pca.pkl', 'rb'),)

In [22]:
pca.inverse_transform(y).shape

(808960, 33)

In [12]:
# X preprocess
mode = 'test'
X_path = 'data/vars/'
X = X_ETL(X_path, idx, mode=mode)

In [13]:
X.shape

(268288, 33, 7, 7, 9)

In [12]:
names = [p.name for p in Path(X_path).glob('*/')]
names

['cape', 'mfd', 'mse', 'q', 't', 'u', 'v', 'w', 'z']

In [13]:
np.save('X_test_ori.npy', X)

In [12]:
X.shape, y.shape

((268288, 33, 7, 7, 9), (268288, 5))

In [60]:
y.dtype

dtype('float64')

# save as tf record dataset

In [65]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a floast_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
    array = tf.io.serialize_tensor(array)
    return array

def parse_single_data(feature, label):
    #define the dictionary -- the structure -- of our single example
    data = {
        'feature' : _bytes_feature(tf.io.serialize_tensor(feature).numpy()),
        'label' : _bytes_feature(tf.io.serialize_tensor(label).numpy())
    }

    out = tf.train.Example(features=tf.train.Features(feature=data))
    return out

def write_data_to_tfr_short(datas, labels, filename:str="data"):
    filename= filename+".tfrecords"
    writer = tf.io.TFRecordWriter(filename) #create a writer that'll store our data to disk
    count = 0

    for index in range(len(datas)):

        #get the data we want to write
        current_data = datas[index]
        current_label = labels[index]

        out = parse_single_data(feature=current_data, label=current_label)
        writer.write(out.SerializeToString())
        count += 1

    writer.close()
    print(f"Wrote {count} elements to TFRecord")
    return count

def _parse_data_function(example_proto):
    data_feature_description = {
        'feature' : tf.io.FixedLenFeature([], tf.string),
        'label' : tf.io.FixedLenFeature([], tf.string)
    }

    # Parse the input tf.train.Example proto using the dictionary above.
    features = tf.io.parse_single_example(example_proto, data_feature_description)
    features['feature'] = tf.io.parse_tensor(features['feature'], 'float')
    features['label'] = tf.io.parse_tensor(features['label'], 'float')
    return features


In [73]:
# write data to TFRecord
# write_data_to_tfr_short(X, y, filename='test_dataset_0417')

def _parse_data_function(example_proto):
    data_feature_description = {
        'feature' : tf.io.FixedLenFeature([], tf.string),
        'label' : tf.io.FixedLenFeature([], tf.string)
    }

    # Parse the input tf.train.Example proto using the dictionary above.
    features = tf.io.parse_single_example(example_proto, data_feature_description)
    data = tf.io.parse_tensor(features['feature'], "float") 
    label = tf.io.parse_tensor(features['label'], "float")
    data.set_shape([33,7,7,9])
    label.set_shape([5,])
    return data, label

# read TFRecord
raw_dataset = tf.data.TFRecordDataset('test_dataset_0417.tfrecords')
parsed_dataset = raw_dataset.map(_parse_data_function)

In [84]:
def _parse_data_function(example_proto):
    data_feature_description = {
        'feature' : tf.io.FixedLenFeature([], tf.string),
        'label' : tf.io.FixedLenFeature([], tf.string)
    }

    # Parse the input tf.train.Example proto using the dictionary above.
    features = tf.io.parse_single_example(example_proto, data_feature_description)
    data = tf.io.parse_tensor(features['feature'], "float") 
    label = tf.io.parse_tensor(features['label'], "float")
    data.set_shape([33,7,7,9])
    label.set_shape([5,])
    return data, label


def get_dataset(dataset, BATCH_SIZE):
    dataset = dataset.shuffle(2048)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset

train_dataset = raw_dataset.map(_parse_data_function)
train_dataset = get_dataset(train_dataset, BATCH_SIZE)

In [88]:
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
model = CNN3D(9)

# Train model on dataset
model.fit(train_dataset,
          epochs=1000,
          batch_size=64)


Epoch 1/1000


2022-04-17 23:45:24.025415: W tensorflow/core/common_runtime/bfc_allocator.cc:456] Allocator (GPU_0_bfc) ran out of memory trying to allocate 56.85MiB (rounded to 59609088)requested by op SameWorkerRecvDone
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-04-17 23:45:24.025590: I tensorflow/core/common_runtime/bfc_allocator.cc:991] BFCAllocator dump for GPU_0_bfc
2022-04-17 23:45:24.025633: I tensorflow/core/common_runtime/bfc_allocator.cc:998] Bin (256): 	Total Chunks: 294, Chunks in use: 293. 73.5KiB allocated for chunks. 73.2KiB in use in bin. 19.8KiB client-requested in use in bin.
2022-04-17 23:45:24.025664: I tensorflow/core/common_runtime/bfc_allocator.cc:998] Bin (512): 	Total Chunks: 44, Chunks in use: 44. 22.2KiB allocated for chunks. 22.2KiB in use in bin. 22.0KiB client-requested in use in bin.
2022-04-17 23:45:24.

ResourceExhaustedError:  SameWorkerRecvDone unable to allocate output tensor. Key: /job:localhost/replica:0/task:0/device:CPU:0;271232ff8806f376;/job:localhost/replica:0/task:0/device:GPU:0;edge_149_IteratorGetNext;0:0
	 [[{{node IteratorGetNext/_2}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_3302167]

Function call stack:
train_function


_runtime/bfc_allocator.cc:1046] InUse at 7f8d268d0500 of size 65536 by op Fill action_count 922 step 0 next 285
2022-04-17 23:45:24.035071: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d268e0500 of size 65536 by op Fill action_count 928 step 0 next 293
2022-04-17 23:45:24.035102: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d268f0500 of size 65536 by op Add action_count 864 step 0 next 292
2022-04-17 23:45:24.035135: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26900500 of size 65536 by op Add action_count 888 step 0 next 271
2022-04-17 23:45:24.035170: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26910500 of size 20480 by op Fill action_count 932 step 0 next 280
2022-04-17 23:45:24.035209: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26915500 of size 512 by op Fill action_count 941 step 0 next 302
2022-04-17 23:45:24.035232: I tensorflow/core/common_runtime/bfc_allocat

2022-04-17 23:45:24.036775: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26b26f00 of size 256 by op Fill action_count 1143 step 0 next 363
2022-04-17 23:45:24.036807: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26b27000 of size 256 by op Fill action_count 1145 step 0 next 365
2022-04-17 23:45:24.036839: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26b27100 of size 512 by op Fill action_count 1147 step 0 next 367
2022-04-17 23:45:24.036870: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26b27300 of size 256 by op Fill action_count 1149 step 0 next 369
2022-04-17 23:45:24.036903: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26b27400 of size 256 by op Fill action_count 1151 step 0 next 371
2022-04-17 23:45:24.036935: I tensorflow/core/common_runtime/bfc_allocator.cc:1046] InUse at 7f8d26b27500 of size 512 by op Fill action_count 1153 step 0 next 373
2022-04-17 23:45:24.03

In [67]:
for data in parsed_dataset.take(10):
    print(data['feature'].shape, data['label'].shape)

(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)
(33, 7, 7, 9) (5,)


# testtest

In [7]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler


def pad_boundary(arr, kernel_size):
    '''
    arr : input array
    kernel_size : training sample size
    '''

    pad_size = int((kernel_size - 1) / 2)

    if len(arr.shape) == 5:
        pad_arr = np.pad(arr, pad_size, 'wrap')
        pad_arr = pad_arr[pad_size:-pad_size, pad_size:-pad_size, ..., pad_size:-pad_size]
    else:
        raise ValueError('len(arr.shape) should be 5')
    return pad_arr


def sliding_window_x(arr, z=33, y=7, x=7, f=9, kernel_size=7):
    arr = np.lib.stride_tricks.sliding_window_view(arr,
                                                   (kernel_size, kernel_size),
                                                   axis=(2, 3))
    arr = np.moveaxis(arr, 4, -1)
    arr = np.moveaxis(arr, 1, 3)
    arr = arr.reshape(-1, z, y, x, f)
    return arr


def load_X(dir_path, feature_range=(0, 1)):
    Vars = []
    names = [p.name for p in Path(dir_path).glob('*/')]
    for name in names:
        path = Path(dir_path).joinpath(name)
        data = np.stack([np.load(p) for p in path.glob('*.npy')])
        if name == 'cape':
            data = data[:, np.newaxis, ..., np.newaxis]
            data = np.concatenate([data]*34, axis=1)
        else:
            data = data[..., np.newaxis]
        Vars.append(data)
    X = np.concatenate(Vars, axis=-1)[:, 1:, ...]
    X = np.moveaxis(X, 1, -2)
    shape = X.shape
        
    # stdandardlization
    scaler = MinMaxScaler(feature_range=feature_range)
    X = X.reshape(-1, 9)
    X = scaler.fit_transform(X)
    X = X.reshape(shape)
    
    return X.astype(np.float32), scaler


def X_ETL(path, idx, nb_test_samples, mode='train', feature_range=(0,1), pickle_save_dir='models_save/test1/pickles'):
    # load_X and select train/val/test set due to lack of memory
    X, scaler = load_X(path, feature_range)
    nb_test_samples = int(0.2 * X.shape[0])
    
    # shuffle
    X = X[idx]
    
    # split
    if mode is 'train':
        X = X[2*nb_test_samples:]
    elif mode is 'val':
        X = X[0:nb_test_samples]
    else:
        X = X[nb_test_samples:2*nb_test_samples]
    
    print(X.shape)
    # CNN input style
    X = pad_boundary(X, 7)
    print(X.shape)
    X = np.lib.stride_tricks.sliding_window_view(X, (7, 7), axis=(1, 2))
    print(X.shape)
    X = np.moveaxis(X, 4, -1)
    print(X.shape)
    
    # save scaler and pca as pickles
    if mode is 'train':
        Path(pickle_save_dir).mkdir(parents=True, exist_ok=True)
        pickle.dump(scaler, open(f'{pickle_save_dir}/scaler_X.pkl', 'wb'))
    return X.reshape(-1, 33, 7, 7, 9)


def y_ETL(dir_path, idx, nb_test_samples, mode='train', feature_range=(0, 1), pickle_save_dir='models_save/test1/pickles'):
    # load from folder
    whs = list(Path(dir_path).glob('*.npy'))
    whs = np.stack([np.load(path) for path in whs])
    print(whs.shape)
    nb_test_samples = int(0.2 * whs.shape[0])
    whs = whs[idx]
    if mode is 'train':
        whs = whs[2*nb_test_samples:]
    elif mode is 'val':
        whs = whs[0:nb_test_samples]
    else:
        whs = whs[nb_test_samples:2*nb_test_samples]    
    
    # move sample axis to the last, reshape, and remove the first layer
    whs = np.moveaxis(whs, 1, 3).reshape(-1, 34)[:, 1:]
    
    # Min Max normalization
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled_whs = scaler.fit_transform(whs)
    
    # PCA
    pca = PCA(n_components=0.96, svd_solver='full')
    out = pca.fit_transform(scaled_whs)
    
    # save scaler and pca as pickles
    if mode is 'train':
        Path(pickle_save_dir).mkdir(parents=True, exist_ok=True)
        pickle.dump(scaler, open(f'{pickle_save_dir}/scaler.pkl', 'wb'))
        pickle.dump(pca, open(f'{pickle_save_dir}/pca.pkl', 'wb'))
    return out.astype(np.float32), pca

# save tf_record data
def parse_single_data(feature, label):
    #define the dictionary -- the structure -- of our single example
    data = {
        'feature' : _bytes_feature(tf.io.serialize_tensor(feature).numpy()),
        'label' : _bytes_feature(tf.io.serialize_tensor(label).numpy())
    }

    out = tf.train.Example(features=tf.train.Features(feature=data))
    return out

def write_data_to_tfr_short(datas, labels, filename:str="data"):
    filename= filename+".tfrecords"
    writer = tf.io.TFRecordWriter(filename) #create a writer that'll store our data to disk
    count = 0

    for index in range(len(datas)):

        #get the data we want to write
        current_data = datas[index]
        current_label = labels[index]

        out = parse_single_data(feature=current_data, label=current_label)
        writer.write(out.SerializeToString())
        count += 1

    writer.close()
    print(f"Wrote {count} elements to TFRecord")
    return count


# parse tf_record data
def _parse_data_function(example_proto):
    data_feature_description = {
        'feature' : tf.io.FixedLenFeature([], tf.string),
        'label' : tf.io.FixedLenFeature([], tf.string)
    }

    # Parse the input tf.train.Example proto using the dictionary above.
    features = tf.io.parse_single_example(example_proto, data_feature_description)
    data = tf.io.parse_tensor(features['feature'], "float") 
    label = tf.io.parse_tensor(features['label'], "float")
    data.set_shape([33,7,7,9])
    label.set_shape([5,])
    return data, label


def get_dataset(dataset, BATCH_SIZE):
    dataset = dataset.shuffle(2048)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE)

    return dataset

In [8]:
import random

# Settings
mode = 'test'
save_name_suffix = '0421'
X_path = 'data/vars/'
y_path = 'data/target/wh/'
pickle_save_dir='model_save/test2/pickles'

# set seed for reproduce
seed = 777
idx = np.arange(1314)
random.seed(seed)
random.shuffle(idx)
nb_test_samples = int(0.2 * idx.shape[0])

# X pre-process
X = X_ETL(X_path, idx, nb_test_samples, mode=mode, pickle_save_dir=pickle_save_dir)


(262, 32, 32, 33, 9)
(262, 32, 38, 39, 9)
(262, 26, 32, 39, 9, 7, 7)
(262, 26, 32, 39, 7, 7, 9)


ValueError: cannot reshape array of size 3749106816 into shape (33,7,7,9)