In [144]:
import tensorflow as tf

tf.__version__

'2.4.1'

In [145]:
from dataset import Dataset;
dataset = Dataset("../preprocessed/row_size25_vector_size2000.csv");

In [265]:
import numpy as np;

X_train, X_test, t_train, t_test = dataset.getXSplit();

X_train = (X_train * 10000000000).astype('float64')
X_test = (X_test * 10000000000).astype('float64')
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)
# t_train = np.expand_dims(t_train, axis=1)
# t_test = np.expand_dims(t_test, axis=1)

# t_max = max(t_train.max(axis=0), t_test.max(axis=0))
# t_min = min(t_train.min(axis=0), t_test.min(axis=0))

# bucket_increments = (t_max - t_min) / 100
# for i in range(0, 100):
#     temp_min = t_min + (i * bucket_increments)
#     temp_max = t_min + (i * bucket_increments)
#     t_test.values[([t_test.values > temp_min] and [t_test.values < temp_max])[0]] = i
#     t_train.values[([t_train.values > temp_min] and [t_train.values < temp_max])[0]] = i

t_test = t_test.astype('int64')
t_train = t_train.astype('int64')

## Setup Dataset

In [266]:
import tensorflow as tf

def create_tf_dataset(X, t, bs=4):
    X_ds = tf.data.Dataset.from_tensor_slices(X)
    t_ds = tf.data.Dataset.from_tensor_slices(t)
    ds = tf.data.Dataset.zip((X_ds, t_ds))
    ds = ds.map(lambda x, y: {"source": x, "target": y})
    ds = ds.batch(bs).padded_batch(bs)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds


ds = create_tf_dataset(X_train, t_train, bs=20)
val_ds = create_tf_dataset(X_test, t_test, bs=4)

In [267]:
tf.data.Dataset.from_tensor_slices(X_train)

<TensorSliceDataset shapes: (1, 150), types: tf.float64>

In [268]:
X_ds = tf.data.Dataset.from_tensor_slices(X_train)
t_ds = tf.data.Dataset.from_tensor_slices(t_train)
tf.data.Dataset.zip((X_ds, t_ds)).map(lambda x, y: {"source": x, "target": y}).batch(4).prefetch(tf.data.experimental.AUTOTUNE)

<PrefetchDataset shapes: {source: (None, 1, 150), target: (None,)}, types: {source: tf.float64, target: tf.int64}>

## Configure Model

### Layers

In [269]:
import tensorflow as tf
from tensorflow.keras import layers

class SpeechFeatureEmbedding(layers.Layer):
    def __init__(self, num_hid=64, maxlen=150):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=1, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=1, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=1, padding="same", activation="relu"
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        print(x)
        x = self.conv1(x)
        x = self.conv2(x)
        return self.conv3(x)
    
class TokenEmbedding(layers.Layer):
    def __init__(self, num_vocab=100, maxlen=150, num_hid=64):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        x = self.emb(x)
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions

In [275]:
from tensorflow import keras

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.

        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, enc_out, target):
        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        target_att = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(target_att))
        enc_out = self.enc_att(target_norm, enc_out)
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_out) + target_norm)
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
        return ffn_out_norm

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        print(inputs)
        print(self.att)
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

### Model

In [276]:
class Transformer(keras.Model):
    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=150,
        target_maxlen=1,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=10,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = SpeechFeatureEmbedding(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = layers.Dense(num_classes)

    def decode(self, enc_out, target):
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f"dec_layer_{i}")(enc_out, y)
        return y

    def call(self, inputs):
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source)
        y = self.decode(x, target)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]

    def train_step(self, batch):
        """Processes one batch inside model.fit()."""
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:]
        dec_target = target[:]
        with tf.GradientTape() as tape:
            preds = self([source, dec_input])
            one_hot = tf.one_hot(dec_target, depth=self.num_classes)
            mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
            loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def test_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:]
        dec_target = target[:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding."""
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input)
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input

## Custom Schedule

In [277]:
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(
        self,
        init_lr=0.00001,
        lr_after_warmup=0.001,
        final_lr=0.00001,
        warmup_epochs=15,
        decay_epochs=85,
        steps_per_epoch=203,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.lr_after_warmup = lr_after_warmup
        self.final_lr = final_lr
        self.warmup_epochs = warmup_epochs
        self.decay_epochs = decay_epochs
        self.steps_per_epoch = steps_per_epoch

    def calculate_lr(self, epoch):
        """ linear warm up - linear decay """
        warmup_lr = (
            self.init_lr
            + ((self.lr_after_warmup - self.init_lr) / (self.warmup_epochs - 1)) * epoch
        )
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.lr_after_warmup
            - (epoch - self.warmup_epochs)
            * (self.lr_after_warmup - self.final_lr)
            / (self.decay_epochs),
        )
        return tf.math.minimum(warmup_lr, decay_lr)

    def __call__(self, step):
        epoch = step // self.steps_per_epoch
        return self.calculate_lr(epoch)

# Run Model

In [281]:
batch = next(iter(val_ds))

model = Transformer(
    num_hid=150,
    num_head=150,
    num_feed_forward=150,
    source_maxlen=150,
    target_maxlen=1,
    num_layers_enc=1,
    num_layers_dec=1,
    num_classes=99,
)
loss_fn = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.1,
)
learning_rate = CustomSchedule(
    init_lr=0.00001,
    lr_after_warmup=0.001,
    final_lr=0.00001,
    warmup_epochs=15,
    decay_epochs=85,
    steps_per_epoch=len(ds),
)
optimizer = keras.optimizers.Adam(learning_rate)
model.compile(optimizer=optimizer, loss="mse")
model.build(input_shape=ds.element_spec['source'].shape);

# model.fit(ds, validation_data=val_ds, epochs=1)

Tensor("Placeholder:0", shape=(None, 1, 150), dtype=float32)
Tensor("Placeholder:0", shape=(None, 1, 150), dtype=float32)
<tensorflow.python.keras.layers.multi_head_attention.MultiHeadAttention object at 0x7fa719369fa0>
Tensor("strided_slice:0", shape=(None, 1, 150), dtype=float32)
Tensor("sequential_438/speech_feature_embedding_72/conv1d_211/Relu:0", shape=(None, 1, 150), dtype=float32)
<tensorflow.python.keras.layers.multi_head_attention.MultiHeadAttention object at 0x7fa719369fa0>


ValueError: in user code:

    <ipython-input-155-1116382b478c>:44 call  *
        target_att = self.self_att(target, target, attention_mask=causal_mask)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1012 __call__  **
        outputs = call_fn(inputs, *args, **kwargs)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/keras/layers/multi_head_attention.py:473 call
        attention_output, attention_scores = self._compute_attention(
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/keras/layers/multi_head_attention.py:438 _compute_attention
        attention_scores = self._masked_softmax(attention_scores, attention_mask)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/keras/layers/multi_head_attention.py:399 _masked_softmax
        attention_mask = array_ops.expand_dims(
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/util/deprecation.py:538 new_func
        return func(*args, **kwargs)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:365 expand_dims
        return expand_dims_v2(input, axis, name)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:435 expand_dims_v2
        return gen_array_ops.expand_dims(input, axis, name)
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/ops/gen_array_ops.py:2278 expand_dims
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/framework/op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:590 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:3528 _create_op_internal
        ret = Operation(
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:2015 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    /Users/Fuzzy/research/bofa/venv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:1856 _create_c_op
        raise ValueError(str(e))

    ValueError: dim -5 not in the interval [-4, 3]. for '{{node transformer_decoder_95/multi_head_attention_464/ExpandDims}} = ExpandDims[T=DT_BOOL, Tdim=DT_INT32](transformer_decoder_95/Tile, transformer_decoder_95/multi_head_attention_464/ExpandDims/dim)' with input shapes: [?,?,?], [1] and with computed input tensors: input[1] = <-5>.


In [None]:
model.summary()