# Mcfly using tensorflow.data.Dataset API

In [9]:
import os
import random
import numpy as np
import pandas as pd
from glob import glob
import tensorflow as tf
from sktime.datatypes import convert_to
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sktime.datasets import load_from_arff_to_dataframe
from mcfly.modelgen import generate_models
from mcfly.find_architecture import train_models_on_samples
from mcfly.find_architecture import find_best_architecture

In [2]:
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


In [3]:
DATA_PATH = "C:/Users/NABS/Downloads/FruitFlies/"
X, y = load_from_arff_to_dataframe(
    os.path.join(DATA_PATH, "FruitFlies_TRAIN.arff")
)
X.head()

Unnamed: 0,dim_0
0,0 0.000244 1 0.001831 2 -0.00...
1,0 -0.000244 1 0.000275 2 -0.00...
2,0 0.002350 1 -0.002441 2 -0.00...
3,0 0.001465 1 0.001221 2 -0.00...
4,0 0.000397 1 -0.000061 2 -0.00...


In [10]:
np.unique(y)

array(['melanogaster', 'suzukii', 'zaprionus'], dtype='<U12')

In [11]:
y_np = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray()
y_np

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [6]:
X_np = convert_to(X, to_type="numpy3D")
X_np.shape

(17259, 1, 5000)

In [7]:
X_np = np.swapaxes(X_np, 1, 2)
X_np.shape

(17259, 5000, 1)

In [155]:
%%time 

for i in range(X_np.shape[0]):
    with open(os.path.join(DATA_PATH, "instances", f"instance_{i}_label_{y[i]}_.npy"), 'wb') as f:
        np.save(f, X_np[0, :, :])

CPU times: total: 8.38 s
Wall time: 16.7 s


In [12]:
X_np[0, np.newaxis, :, :].shape

(1, 5000, 1)

## tensorflow Dataset

In [3]:
DATA_PATH = "C:/Users/NABS/Downloads/FruitFlies/"

file_names = os.listdir(os.path.join(DATA_PATH, "instances"))
random.shuffle(file_names)

labels = map(lambda x: x.split("_")[3], file_names)
labels = list(labels)

labels[0:5]

['zaprionus', 'zaprionus', 'zaprionus', 'zaprionus', 'suzukii']

In [4]:
file_names[0:2], labels[0:2]

(['instance_9530_label_zaprionus_.npy', 'instance_12530_label_zaprionus_.npy'],
 ['zaprionus', 'zaprionus'])

In [5]:
train_set, val_set, train_label, val_label = train_test_split(
    file_names, 
    labels, 
    train_size=0.80, 
    random_state=42, 
    stratify=labels
)

In [6]:
labels_set = set(labels)
labels_int = list(range(len(labels_set)))
labels_dict = dict(zip(labels_set, labels_int))
NUM_CLASSES = len(labels_set)

In [20]:
def get_dataset_generator(data_set):
    def get_int_label(file_name):
        string_label = file_name.split("_")[3]
        return labels_dict[string_label]
    
    def get_x_y(file_name):
        x = np.load(os.path.join(DATA_PATH, "instances", file_name))
        y = get_int_label(file_name)
        y = tf.constant(y)
        y = tf.one_hot(y, NUM_CLASSES)
        return x, y
    
    def func(i):
        idx = i.numpy() # Decoding from the EagerTensor object
        x, y = get_x_y(data_set[idx])
        return x, y

    z = list(range(len(data_set)))
    dataset = tf.data.Dataset.from_generator(lambda: z, tf.uint16)

    dataset = dataset.shuffle(buffer_size=len(z), seed=0,  
                              reshuffle_each_iteration=True)
    dataset = dataset.map(lambda i: tf.py_function(func=func, 
                                                   inp=[i], 
                                                   Tout=[tf.float32,
                                                         tf.float32]
                                                   ), 
                          num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(8)#.map(_fixup_shape)
    # dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

train_generator = get_dataset_generator(train_set)
val_generator = get_dataset_generator(val_set)

In [21]:
for x_tmp, y_tmp in val_generator.take(1):
    print(x_tmp.shape, y_tmp)

(8, 5000, 1) tf.Tensor(
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]], shape=(8, 3), dtype=float32)


In [None]:
best_model, best_params, best_model_type, knn_acc = \
            find_best_architecture(
                X_train=train_generator,
                y_train=None,
                X_val=val_generator,
                y_val=None
            )

## Using in-memory dataset

In [14]:
X_train, X_val, y_train, y_val = train_test_split(
    X_np, 
    y_np, 
    train_size=0.80, 
    random_state=42, 
    stratify=y_np
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((13807, 5000, 1), (3452, 5000, 1), (13807, 3), (3452, 3))

In [None]:
best_model, best_params, best_model_type, knn_acc = \
            find_best_architecture(
                X_train=X_train,
                y_train=y_train,
                X_val=X_val,
                y_val=y_val
            )

ResourceExhaustedError: OOM when allocating tensor with shape[405000,1704] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

## tensorflow Dataset from Numpy array

In [37]:
X_train, X_val, y_train, y_val = train_test_split(
    X_np, 
    y_np, 
    train_size=0.80, 
    random_state=42, 
    stratify=labels
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((13807, 5000, 1), (3452, 5000, 1), (13807, 3), (3452, 3))

In [60]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((13807, 5000, 1), (3452, 5000, 1), (13807, 3), (3452, 3))

In [61]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

In [None]:
best_model, best_params, best_model_type, knn_acc = \
            find_best_architecture(
                X_train=train_dataset,
                y_train=None,
                X_val=val_dataset,
                y_val=None
            )

AttributeError: 'TensorSliceDataset' object has no attribute 'shape'

In [63]:
for x0, y0 in train_dataset.take(1): 
    print(x0.shape, y0)

(5000, 1) tf.Tensor([0. 1. 0.], shape=(3,), dtype=float64)


In [59]:
import gc
gc.collect()

23003

In [None]:
models = generate_models(x_shape=(13807, 5000, 1), 
                         number_of_classes=3,
                         number_of_models=2,
                         metrics=["accuracy"])

In [66]:
histories, val_accuracies, val_losses =\
    train_models_on_samples(train_dataset.batch(8),
                            None,
                            val_dataset.batch(8),
                            None,
                            models,
                            # batch_size=5,
                            #nr_epochs=20,
                            #subset_size=1000,
                            verbose=True,
                            #outputfile=outputfile,
                            metric="accuracy")

Generated models will be trained on subset of the data (subset size: 100).
Training model 0 DeepConvLSTM
Epoch 1/5
   6/1726 [..............................] - ETA: 11:51 - loss: 2.1406 - accuracy: 0.3750

InternalError: Graph execution error:

Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 23, 80, 1, 5000, 8, 80] 
	 [[{{node gradients/CudnnRNN_grad/CudnnRNNBackprop}}]]
	 [[Adam/gradients/PartitionedCall_1]] [Op:__inference_train_function_83995]

## Keras data generator

In [13]:
labels_dict

{'suzukii': 0, 'melanogaster': 1, 'zaprionus': 2}

In [14]:
new_labels = {}
for file_name in train_set:
    lab = file_name.split("_")[3]
    lab = labels_dict[lab]
    new_labels[file_name] = lab

In [136]:
np.empty((8, *(1, 5000), 1)).shape

(8, 1, 5000, 1)

In [15]:
class DataGenerator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=32, dim=5000, n_channels=1,
                 n_classes=3, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            DATA_PATH = "C:/Users/NABS/Downloads/FruitFlies/"
            X[i, ] = np.load(os.path.join(DATA_PATH, "instances", ID))
            # Store class
            y[i] = self.labels[ID]

        return X, tf.keras.utils.to_categorical(y, num_classes=self.n_classes)

In [180]:
Xtmp = np.empty((8, 5000, 1))
Xtmp[0, ] = np.load(os.path.join(DATA_PATH, "instances", 'instance_13556_label_zaprionus_.npy'))

In [181]:
Xtmp.shape

(8, 5000, 1)

In [23]:
# Parameters
params = {'dim': 5000,
          'batch_size': 3,
          'n_classes': 3,
          'n_channels': 1,
          'shuffle': True}

# Generators
training_generator = DataGenerator(train_set, new_labels, **params)
validation_generator = DataGenerator(val_set, new_labels, **params)

## Model training

In [4]:
models = generate_models(x_shape=(None, 5000, 1), 
                         number_of_classes=3,
                         number_of_models=2,
                         metrics=["accuracy"])

  super(Adam, self).__init__(name, **kwargs)


In [5]:
models

[(<keras.engine.functional.Functional at 0x13db763f310>,
  {'learning_rate': 0.006191830789697886,
   'regularization_rate': 0.02271788149250915,
   'network_depth': 4,
   'filters_number': 63,
   'max_kernel_size': 90},
  'InceptionTime'),
 (<keras.engine.functional.Functional at 0x13dbf051b50>,
  {'learning_rate': 0.006649371027656127,
   'regularization_rate': 0.00016364120810765338,
   'network_depth': 5,
   'min_filters_number': 92,
   'max_kernel_size': 28},
  'ResNet')]

In [6]:
histories, val_accuracies, val_losses =\
    train_models_on_samples(training_generator,
                            None,
                            validation_generator,
                            None,
                            models,
                            # batch_size=5,
                            #nr_epochs=20,
                            #subset_size=1000,
                            verbose=True,
                            #outputfile=outputfile,
                            metric="accuracy")

NameError: name 'training_generator' is not defined

In [None]:
best_model, best_params, best_model_type, knn_acc = \
            find_best_architecture(
                X_train=training_generator,
                y_train=None,
                X_val=validation_generator,
                y_val=None, 
                number_of_models=5,
                nr_epochs=20
            )

In [16]:
from numba import cuda 
device = cuda.get_current_device()
device.reset()