In [1]:
import tensorflow as tf
print(tf.__version__)

import sys
from pathlib import Path
# Add the parent of the current working directory to the import path
# (presumably so the project-local `layers` package used in later cells resolves
# -- confirm against the repository layout).
current_path = Path.cwd()
root = current_path.parent
sys.path.append(str(root))

# If any GPU is present, expose only the first one to TensorFlow and enable
# on-demand memory growth for every physical GPU that was listed.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
# Show which GPUs were detected (an empty list means none were found).
print(gpus)



2.10.1
[]


In [None]:
import numpy as np
from layers.selective_attention import SelectiveAttention

def positional_encoding(length, depth):
    """Build a sinusoidal position table.

    Args:
        length: number of positions (rows of the table).
        depth: number of features per position (columns of the table).

    Returns:
        A float32 tensor of shape (length, depth). The sine features come
        first, followed by the cosine features. For an odd `depth` the sine
        part is one column wider than the cosine part.
    """
    depth = int(depth)
    half_depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]
    # For odd depth, np.arange rounds the fractional stop up, giving
    # ceil(depth / 2) frequencies.
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth

    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1
    )
    # With an odd depth the concatenation has depth + 1 columns; crop so the
    # table always matches the requested width. This is a no-op for even depth.
    pos_encoding = pos_encoding[:, :depth]
    return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
  """Embedding lookup scaled by sqrt(d_model) plus a fixed sinusoidal table."""

  def __init__(self, length, d_model):
    super().__init__()
    self.d_model = d_model
    # `length` serves both as the embedding's input_dim and as the number of
    # rows in the positional table.
    self.embedding = tf.keras.layers.Embedding(length, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=length, depth=d_model)

  def call(self, x):
    """Embed `x` and add the positional rows for its second dimension."""
    seq_len = tf.shape(x)[1]
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = self.embedding(x) * scale
    # Only the first `seq_len` rows of the table are used; the new leading
    # axis broadcasts over the batch.
    positions = self.pos_encoding[tf.newaxis, :seq_len, :]
    return embedded + positions

def positional_encoding_2D(length, width, channels, depth):
    """Build a sinusoidal position table over (width, length, channels).

    The `depth` features are shared between the three axes: the first block
    encodes the position along `width`, the second along `length`, the third
    along `channels`. Each block holds its sine features followed by its
    cosine features.

    The previous implementation sized every block from the channel count
    (`ceil(channels / 6) * depth`), which made each block at least `depth`
    wide. A consumer that keeps only the first `depth` features therefore
    received the width encoding alone. The budget is now split across the axes.

    Args:
        length: number of positions along axis 1 of the result.
        width: number of positions along axis 0 of the result.
        channels: number of positions along axis 2 of the result.
        depth: number of features per position (last axis of the result).

    Returns:
        A float32 tensor of shape (width, length, channels, depth).
    """
    depth = int(depth)
    # Three axes, each needing a sine half and a cosine half: round the
    # per-axis share up to an even number so 3 * axis_depth >= depth.
    axis_depth = int(np.ceil(depth / 6)) * 2

    depths = np.arange(0, axis_depth, 2)[np.newaxis, :] / axis_depth
    angle_rates = 1 / (10000 ** depths)

    def _sin_cos(num_positions):
        # (num_positions, axis_depth) table: sines first, then cosines.
        angle_rads = np.arange(num_positions)[:, np.newaxis] * angle_rates
        return np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)

    grid = (width, length, channels, axis_depth)
    pos_encoding_x = np.broadcast_to(
        _sin_cos(width)[:, np.newaxis, np.newaxis, :], grid)
    pos_encoding_y = np.broadcast_to(
        _sin_cos(length)[np.newaxis, :, np.newaxis, :], grid)
    pos_encoding_channel = np.broadcast_to(
        _sin_cos(channels)[np.newaxis, np.newaxis, :, :], grid)

    pos_encoding = np.concatenate(
        [pos_encoding_x, pos_encoding_y, pos_encoding_channel],
        axis=-1
    )
    # Rounding up may overshoot the requested depth; crop to exactly `depth`.
    pos_encoding = pos_encoding[..., :depth]

    return tf.cast(pos_encoding, dtype=tf.float32)


class PositionalEmbedding2D(tf.keras.layers.Layer):
  """Embedding lookup scaled by sqrt(d_model) plus a (width, length, channels) position table."""

  def __init__(self, length, width, channels, d_model):
    super().__init__()
    self.length = length
    self.width = width
    self.channels = channels
    # The embedding table has one row per (length, width, channels) cell.
    self.input_dim = length * width * channels
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(
        input_dim=self.input_dim,
        output_dim=d_model,
        mask_zero=True)
    self.pos_encoding = positional_encoding_2D(
        length=length,
        width=width,
        channels=channels,
        depth=d_model)

  def call(self, x):
    """Embed `x` and add the position table, repeated once per batch item."""
    batch_size = tf.shape(x)[0]
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = self.embedding(x) * scale

    # Crop the table to this layer's dimensions and give it a batch axis.
    table = self.pos_encoding[
        tf.newaxis, :self.width, :self.length, :self.channels, :self.d_model]
    positions = tf.repeat(table, batch_size, axis=0)
    return embedded + positions


class BaseAttention(tf.keras.layers.Layer):
    """Holds the sub-layers shared by the attention blocks below.

    Every keyword argument is forwarded to `SelectiveAttention`; none of them
    reach the Keras `Layer` constructor.
    """

    def __init__(self, **kwargs):
        super().__init__()
        attention_config = kwargs
        self.mha = SelectiveAttention(**attention_config)
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        
class CrossAttention(BaseAttention):
    """Attention from `x` (query) to `context` (key/value), with residual add and layer norm."""

    def call(self, x, context):
        # A rank-4 or rank-5 context is averaged over every axis between the
        # first and the last, leaving a single position: (dim0, 1, last_dim).
        rank = len(context.shape)
        if rank in (4, 5):
            middle_axes = tuple(range(1, rank - 1))
            context = tf.reduce_mean(context, axis=middle_axes)
            context = context[:, tf.newaxis, :]

        att_output, att_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True,
        )

        # Keep the scores from the most recent call for later inspection.
        self.last_att_scores = att_scores

        residual = self.add([x, att_output])
        return self.layer_norm(residual)
    
class SelfAttention(BaseAttention):
    """Attention where query, key and value are all `x`, with residual add and layer norm."""

    def call(self, x):
        attended = self.mha(query=x, key=x, value=x)
        residual = self.add([x, attended])
        return self.layer_norm(residual)
    
    
class FeedForward(tf.keras.layers.Layer):
    """ReLU dense layer followed by dropout, wrapped in a residual add and layer norm.

    The dense layer outputs `dff` features and its result is added straight
    back to the input, so the input's last dimension has to be compatible
    with `dff`.
    """

    def __init__(self, dff, dropout_rate=0.1):
        super().__init__()
        hidden = tf.keras.layers.Dense(dff, activation='relu')
        drop = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([hidden, drop])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        transformed = self.seq(x)
        return self.layer_norm(self.add([x, transformed]))
    
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward block.

    Args:
        num_heads: number of attention heads, forwarded to the attention layer.
        dff: used both as the attention `key_dim` and as the feed-forward width.
        dropout_rate: dropout used in the attention layer and in the
            feed-forward block.
    """

    def __init__(self, *, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.self_attention = SelfAttention(
            num_heads=num_heads,
            key_dim=dff,
            dropout=dropout_rate
        )

        # Forward dropout_rate: previously the feed-forward block silently
        # fell back to its own default of 0.1 whatever the caller asked for.
        self.ffn = FeedForward(dff, dropout_rate)

    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x
    
class Encoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of `num_layers` encoder blocks.

    `dff` is used as the embedding width as well as the width of every
    encoder block.
    """

    def __init__(self, *, num_layers, num_heads, dff,
                 length, width, channels, dropout_rate=0.1):
        super().__init__()

        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding2D(length, width, channels, dff)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                EncoderLayer(num_heads=num_heads,
                             dff=dff,
                             dropout_rate=dropout_rate))
        self.enc_layers = blocks

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        hidden = self.dropout(self.pos_embedding(x))

        # Run the blocks in the order they were created.
        for block in self.enc_layers:
            hidden = block(hidden)
        return hidden
    
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: self-attention, cross-attention over `context`, feed-forward.

    Args:
        num_heads: number of attention heads, forwarded to both attention layers.
        dff: used both as the attention `key_dim` and as the feed-forward width.
        dropout_rate: dropout used in both attention layers and in the
            feed-forward block.
    """

    def __init__(self,
                 *,
                 num_heads,
                 dff,
                 dropout_rate=0.1):
        super().__init__()

        self.self_attention = SelfAttention(
            num_heads=num_heads,
            key_dim=dff,
            dropout=dropout_rate
        )

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=dff,
            dropout=dropout_rate
        )

        # Forward dropout_rate: previously the feed-forward block silently
        # fell back to its own default of 0.1 whatever the caller asked for.
        self.ffn = FeedForward(dff, dropout_rate)

    def call(self, x, context):
        x = self.self_attention(x=x)
        x = self.cross_attention(x=x, context=context)
        # Expose the cross-attention scores of the most recent call.
        self.last_att_scores = self.cross_attention.last_att_scores

        x = self.ffn(x)
        return x
    
class Decoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of `num_layers` decoder blocks.

    `num_cat` is passed to `PositionalEmbedding` as its `length`, and `dff`
    as its `d_model`.
    """

    def __init__(self, *, num_layers, num_heads, dff, num_cat, dropout_rate=0.1):
        super().__init__()

        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(num_cat, dff)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                DecoderLayer(num_heads=num_heads,
                             dff=dff,
                             dropout_rate=dropout_rate))
        self.dec_layers = blocks

        # Filled in by call() with the last block's cross-attention scores.
        self.last_att_scores = None

    def call(self, x, context):
        hidden = self.dropout(self.pos_embedding(x))

        # Every block attends to the same context.
        for block in self.dec_layers:
            hidden = block(hidden, context)

        self.last_att_scores = self.dec_layers[-1].last_att_scores

        return hidden
    
class Transformer(tf.keras.Model):
    """Encoder-decoder model ending in a dense layer with `num_cat` outputs.

    The same `num_layers`, `num_heads`, `dff` and `dropout_rate` configure
    both the encoder and the decoder.
    """

    def __init__(self, *, num_layers, num_heads, dff,
                 length, width, channels, num_cat, dropout_rate=0.1):
        super().__init__()

        # Settings shared by both halves of the model.
        shared = dict(
            num_layers=num_layers,
            num_heads=num_heads,
            dff=dff,
            dropout_rate=dropout_rate,
        )

        self.encoder = Encoder(length=length, width=width, channels=channels, **shared)
        self.decoder = Decoder(num_cat=num_cat, **shared)
        self.final_layer = tf.keras.layers.Dense(num_cat)

    def call(self, inputs):
        """Run the model on a `(context, x)` pair and return the dense layer's output."""
        context, x = inputs
        encoded = self.encoder(context)
        decoded = self.decoder(x, encoded)
        return self.final_layer(decoded)

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  The rate is rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5):
  it grows linearly in `step` while the second term is the smaller one and
  decays as 1 / sqrt(step) afterwards.

  Args:
    d_model: value whose inverse square root scales the whole schedule.
    warmup_steps: controls where the linear and the decaying branch cross.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the value as passed in for get_config(); the float32 tensor is
    # what __call__ uses.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {
        "d_model": self._d_model_config,
        "warmup_steps": self.warmup_steps,
    }

def masked_loss(label, pred):
  """Sparse categorical cross-entropy averaged over entries whose label is non-zero.

  Args:
    label: integer class labels; entries equal to 0 are excluded.
    pred: logits (the loss is built with `from_logits=True`).

  Returns:
    The mean loss over the non-zero-label entries, or 0 when there are none.
  """
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  # A batch in which every label is 0 would otherwise yield 0 / 0 = NaN.
  return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))


def masked_accuracy(label, pred):
  """Fraction of non-zero-label entries whose arg-max prediction equals the label.

  Args:
    label: integer class labels; entries equal to 0 are excluded.
    pred: per-class scores; the arg-max is taken over axis 2.

  Returns:
    The accuracy over the non-zero-label entries, or 0 when there are none.
  """
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  # A batch in which every label is 0 would otherwise yield 0 / 0 = NaN.
  return tf.math.divide_no_nan(tf.reduce_sum(match), tf.reduce_sum(mask))

In [None]:
# Single definition of the model width, shared by the model and the schedule.
dff = 16

transformer = Transformer(
    num_layers=6,
    num_heads=8,
    dff=dff,
    length=16,
    width=16,
    channels=3,
    num_cat=1000,
    dropout_rate=0.1
)

learning_rate = CustomSchedule(dff)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

transformer.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Alternative compile configurations, kept for reference. They used to live in
# a bare triple-quoted string, which the notebook evaluated and echoed as the
# cell's output.
#
# transformer.compile(
#     loss=masked_loss,
#     optimizer=optimizer,
#     metrics=[masked_accuracy],
#     run_eagerly=True)
#
# transformer.compile(
#     optimizer='adam',
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#     metrics=['accuracy'],
#     run_eagerly=False
# )

In [None]:
from layers.image import ImageDataGenerator
TRAIN_PATH = '../datasets/ImageNet/train_data/'

# Training-time augmentation: rescale pixel values plus random shear, zoom and
# horizontal flip.
aug = ImageDataGenerator(rescale=1./255.0,
                         shear_range=0.2,
                         zoom_range=0.2,
                         horizontal_flip=True)

train_generator = aug.flow_from_directory(
    TRAIN_PATH,
    target_size=(16, 16),
    batch_size=64,
    class_mode='test'
)

# Model.fit accepts generators directly; Model.fit_generator is deprecated.
history = transformer.fit(train_generator, steps_per_epoch=20000, epochs=10)

In [None]:
from layers.image import ImageDataGenerator
VAL_PATH = '../datasets/ImageNet/val_data/'

# Validation images are only rescaled. Random shear/zoom/flip are training-time
# augmentations and would make the reported metric change from run to run.
val = ImageDataGenerator(rescale=1./255.0)

val_generator = val.flow_from_directory(
    VAL_PATH,
    target_size=(16, 16),
    batch_size=64,
    class_mode='test'
)

# The generator already produces batches, so `batch_size` is not passed to
# evaluate(); the value given before (16) did not match the generator's 64.
test_loss, test_acc = transformer.evaluate(val_generator, verbose=2)

print('\nTest accuracy:', test_acc)

In [None]:
# Model width, shared by the model and the learning-rate schedule.
dff = 16

transformer2 = Transformer(
    num_layers=6,
    num_heads=8,
    dff=dff,
    length=16,
    width=16,
    channels=3,
    num_cat=1000,
    dropout_rate=0.1,
)

learning_rate = CustomSchedule(dff)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

transformer2.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
from layers.image import ImageDataGenerator
TRAIN_PATH = '../datasets/ImageNet/train_data/'

# Training-time augmentation: rescale pixel values plus random shear, zoom and
# horizontal flip.
aug = ImageDataGenerator(rescale=1./255.0,
                         shear_range=0.2,
                         zoom_range=0.2,
                         horizontal_flip=True)

train_generator = aug.flow_from_directory(
    TRAIN_PATH,
    target_size=(16, 16),
    batch_size=128,
    class_mode='test'
)

# Model.fit accepts generators directly; Model.fit_generator is deprecated.
history2 = transformer2.fit(train_generator, steps_per_epoch=10000, epochs=10)

In [None]:
from layers.image import ImageDataGenerator
VAL_PATH = '../datasets/ImageNet/val_data/'

# Validation images are only rescaled. Random shear/zoom/flip are training-time
# augmentations and would make the reported metric change from run to run.
val = ImageDataGenerator(rescale=1./255.0)

val_generator = val.flow_from_directory(
    VAL_PATH,
    target_size=(16, 16),
    batch_size=64,
    class_mode='test'
)

# The generator already produces batches, so `batch_size` is not passed to
# evaluate(); the value given before (128) did not match the generator's 64.
test_loss, test_acc = transformer2.evaluate(val_generator, verbose=2)

print('\nTest accuracy:', test_acc)

In [None]:
# Model width, shared by the model and the learning-rate schedule.
dff = 16

transformer3 = Transformer(
    num_layers=6,
    num_heads=8,
    dff=dff,
    length=16,
    width=16,
    channels=3,
    num_cat=1000,
    dropout_rate=0.1,
)

learning_rate = CustomSchedule(dff)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

transformer3.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
from layers.image import ImageDataGenerator
from keras.callbacks import TensorBoard

TRAIN_PATH = '../datasets/ImageNet/train_data/'

# Training-time augmentation: rescale pixel values plus random shear, zoom and
# horizontal flip.
aug = ImageDataGenerator(rescale=1./255.0,
                         shear_range=0.2,
                         zoom_range=0.2,
                         horizontal_flip=True)

train_generator = aug.flow_from_directory(
    TRAIN_PATH,
    target_size=(16, 16),
    batch_size=32,
    class_mode='test'
)

# Write TensorBoard logs (including per-epoch histograms) to ./logs.
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)

# Model.fit accepts generators directly; Model.fit_generator is deprecated.
history3 = transformer3.fit(
    train_generator,
    steps_per_epoch=40000,
    epochs=1,
    callbacks=[tensorboard_callback],
)

In [None]:
from keras_flops import get_flops

# Measure the model trained in the previous cell. `Transformer()` with no
# arguments raises TypeError because every constructor argument except
# dropout_rate is a required keyword.
# NOTE(review): get_flops may need a model whose inputs are already known
# (i.e. one that has been called or fitted) -- confirm against keras_flops.
flops = get_flops(transformer3, batch_size=32)

print(flops)

In [None]:
from layers.image import ImageDataGenerator
VAL_PATH = '../datasets/ImageNet/val_data/'

# Validation images are only rescaled. Random shear/zoom/flip are training-time
# augmentations and would make the reported metric change from run to run.
val = ImageDataGenerator(rescale=1./255.0)

val_generator = val.flow_from_directory(
    VAL_PATH,
    target_size=(16, 16),
    batch_size=32,
    class_mode='test'
)

# The generator already produces batches, so `batch_size` is not passed to
# evaluate(); the value given before (128) did not match the generator's 32.
test_loss, test_acc = transformer3.evaluate(val_generator, verbose=2)

print('\nTest accuracy:', test_acc)

In [None]:
from layers.image import ImageDataGenerator
TRAIN_PATH = '../datasets/ImageNet/train_data/'

# Model width, shared by the model and the learning-rate schedule.
dff = 16

transformer4 = Transformer(
    num_layers=6,
    num_heads=8,
    dff=dff,
    length=16,
    width=16,
    channels=3,
    num_cat=1000,
    dropout_rate=0.1
)

learning_rate = CustomSchedule(dff)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

transformer4.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Training-time augmentation: rescale pixel values plus random shear, zoom and
# horizontal flip.
aug = ImageDataGenerator(rescale=1./255.0,
                         shear_range=0.2,
                         zoom_range=0.2,
                         horizontal_flip=True)

train_generator = aug.flow_from_directory(
    TRAIN_PATH,
    target_size=(16, 16),
    batch_size=16,
    class_mode='test'
)

# Stored under its own name: this run used to be assigned to `history3`, which
# overwrote the history of the transformer3 run.
# Model.fit accepts generators directly; Model.fit_generator is deprecated.
history4 = transformer4.fit(train_generator, steps_per_epoch=80000, epochs=10)

In [None]:
from layers.image import ImageDataGenerator
VAL_PATH = '../datasets/ImageNet/val_data/'

# Validation images are only rescaled. Random shear/zoom/flip are training-time
# augmentations and would make the reported metric change from run to run.
val = ImageDataGenerator(rescale=1./255.0)

val_generator = val.flow_from_directory(
    VAL_PATH,
    target_size=(16, 16),
    batch_size=16,
    class_mode='test'
)

# The generator already produces batches, so `batch_size` is not passed to
# evaluate().
test_loss, test_acc = transformer4.evaluate(val_generator, verbose=2)

print('\nTest accuracy:', test_acc)

In [None]:
# Print the Keras summary of the fourth model.
transformer4.summary()