<a href="https://colab.research.google.com/github/sourcecode369/unconventional-neural-networks/blob/master/attention-mechanism-%26-attention-maps/Attention_Mechanism_1_MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### References

1. https://github.com/johnsmithm/multi-heads-attention-image-classification/blob/master/multi-heads-attention-mnist.py

2. https://github.com/RobRomijnders/attention

3. https://medium.com/@moshnoi2000/all-you-need-is-attention-computer-vision-edition-dbe7538330a4

4. https://jacobgil.github.io/deeplearning/class-activation-maps

5. https://www.kaggle.com/kmader/pretrained-vgg16-w-attention-for-seedlings

6. https://www.kaggle.com/kmader/pretrained-vgg16-w-attention-for-tuberculosis

7. https://www.kaggle.com/kmader/attention-on-pretrained-vgg16-for-bone-age

8. https://lab.heuritech.com/attention-mechanism

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
print("TensorFlow version: {}".format(tf.__version__))

TensorFlow 2.x selected.
TensorFlow version: 2.0.0


In [0]:
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Input, Flatten, Dense, Lambda, Reshape, Layer, Add
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import mnist
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, TensorBoard, ModelCheckpoint
from tensorflow.keras.utils import plot_model 
from tensorflow.keras import backend as K

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
%config InlineBackend.figure_format = "retina"

In [3]:
# Fetch MNIST, append a trailing channel axis to the images, and rescale the
# pixel values by 1/255.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = np.expand_dims(X_train, axis=-1) / 255.0
X_test = np.expand_dims(X_test, axis=-1) / 255.0
print("{}, {}".format(X_train.shape, X_test.shape))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
(60000, 28, 28, 1), (10000, 28, 28, 1)


In [0]:
# Baseline CNN: three conv + max-pool stages feeding a dense classifier head.
# The names `input` and `output` are consumed by the Model(...) cell below.
input = Input(shape=(28, 28, 1))

# Stage 1: 32 filters, 3x3 kernel, then halve the spatial size.
x = Conv2D(filters=32, kernel_size=3, padding="same", activation="relu")(input)
x = MaxPooling2D(pool_size=2)(x)

# Stage 2: 64 filters, 3x3 kernel, then halve again.
x = Conv2D(filters=64, kernel_size=3, padding="same", activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)

# Stage 3: 192 filters, 2x2 kernel, then a final pooling.
x = Conv2D(filters=64 * 3, kernel_size=2, padding="same", activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)

# Classifier head: flatten, one hidden layer, 10-way softmax.
x = Flatten()(x)
x = Dense(units=256, activation="relu")(x)
output = Dense(units=10, activation="softmax")(x)

In [5]:
# Wrap the graph defined above into a model and print its layer table.
model = Model(inputs=input, outputs=output)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 28, 28, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 14, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 7, 7, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 192)         49344     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 3, 3, 192)         0     

In [6]:
# Integer labels -> sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# NOTE: the test split is passed as validation data, so early stopping is
# driven by the loss measured on (X_test, y_test).
model_history = model.fit(
    X_train,
    y_train,
    batch_size=4096,
    epochs=50,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[EarlyStopping(monitor="val_loss", patience=3, verbose=1)],
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 00032: early stopping


In [0]:
def MultiHeadsAttModel(l=8*8, d=512, dv=64, dout=512, nv=8):
    """Build a Keras model that applies scaled dot-product attention across heads.

    Each of the three inputs (query, key, value) has shape ``(l, d)``. Every
    input is projected to ``nv * dv`` features and split into ``nv`` heads of
    width ``dv``. For each of the ``l`` positions the heads attend to each
    other: the attention weights have shape ``(l, nv, nv)`` and the attended
    values ``(l, nv, dv)``. The heads are then concatenated, the raw query
    input is added back as a residual, and a final dense layer maps to ``dout``.

    Args:
        l: number of positions in each input sequence.
        d: feature width of each input; must equal ``nv * dv``.
        dv: width of a single head.
        dout: width of the output features.
        nv: number of heads.

    Returns:
        A ``Model`` taking ``[q, k, v]`` (each ``(batch, l, d)``) and returning
        a tensor of shape ``(batch, l, dout)``.

    Raises:
        ValueError: if ``nv * dv != d``; the concatenated heads could not be
            added to the residual query input.
    """
    # Fail early with a readable message instead of an opaque shape error
    # from Reshape/Add further down.
    if nv * dv != d:
        raise ValueError(
            "nv * dv must equal d for the residual connection: "
            "got nv={}, dv={} (nv*dv={}) and d={}".format(nv, dv, nv * dv, d)
        )

    v1 = Input(shape=(l, d))
    q1 = Input(shape=(l, d))
    k1 = Input(shape=(l, d))

    # Linear projections (with ReLU) to nv heads of width dv.
    v2 = Dense(dv * nv, activation="relu")(v1)
    q2 = Dense(dv * nv, activation="relu")(q1)
    k2 = Dense(dv * nv, activation="relu")(k1)

    v = Reshape([l, nv, dv])(v2)
    q = Reshape([l, nv, dv])(q2)
    k = Reshape([l, nv, dv])(k2)

    scale = np.sqrt(dv)

    # (batch, l, nv, dv) @ (batch, l, dv, nv) -> (batch, l, nv, nv).
    # tf.matmul is used instead of K.batch_dot because batch_dot's handling of
    # rank-4 operands (and of the out-of-range axis 4 used previously) differs
    # between Keras backend versions.
    att = Lambda(lambda x: tf.matmul(x[0], x[1], transpose_b=True) / scale,
                 output_shape=(l, nv, nv))([q, k])
    # Normalise over the last axis so each head's weights sum to 1.
    att = Lambda(lambda x: K.softmax(x), output_shape=(l, nv, nv))(att)

    # (batch, l, nv, nv) @ (batch, l, nv, dv) -> (batch, l, nv, dv).
    out = Lambda(lambda x: tf.matmul(x[0], x[1]),
                 output_shape=(l, nv, dv))([att, v])
    # Concatenate the heads back into a single feature axis.
    out = Reshape([l, nv * dv])(out)

    # Residual connection with the un-projected query input.
    out = Add()([out, q1])

    out = Dense(dout, activation="relu")(out)

    return Model(inputs=[q1, k1, v1], outputs=out)

In [0]:
class NormL(tf.keras.layers.Layer):
  """Normalise the last axis to zero mean / unit std, then apply a learned gain and bias."""

  def __init__(self, **kwargs):
    super(NormL, self).__init__(**kwargs)

  def build(self, input_shape):
    """Create the per-feature gain `a` and bias `b`, each of shape (1, features)."""
    param_shape = (1, input_shape[-1])
    # The gain starts at one so the layer initially passes the normalised
    # activations through; a zero gain would output all zeros and block the
    # gradient to every upstream layer.
    self.a = self.add_weight(name="gain",
                             shape=param_shape,
                             initializer="ones",
                             trainable=True)
    # Distinct weight names keep the two variables from colliding.
    self.b = self.add_weight(name="bias",
                             shape=param_shape,
                             initializer="zeros",
                             trainable=True)
    super(NormL, self).build(input_shape)

  def call(self, x):
    """Return ((x - mean) / (std + eps)) * a + b, with statistics over the last axis."""
    eps = 0.000001  # guards against division by zero for constant inputs
    mu = K.mean(x, keepdims=True, axis=-1)
    sigma = K.std(x, keepdims=True, axis=-1)
    ln_out = (x - mu) / (sigma + eps)
    return ln_out*self.a + self.b

In [0]:
# Attention variant of the baseline CNN: same three conv + pool stages, with a
# multi-head attention block and NormL inserted before the classifier head.
input = Input((28,28,1))
x = Conv2D(32, (3,3), activation="relu", padding="same")(input)
x = MaxPooling2D((2,2))(x)
x = Conv2D(64, (3,3), activation="relu", padding="same")(x)
x = MaxPooling2D((2,2))(x)
x = Conv2D(64*3, (2,2), activation="relu", padding="same")(x)
x = MaxPooling2D((2,2))(x)

# Read the feature-map size from the tensor instead of hard-coding it: after
# three 2x2 poolings the 28x28 input is 3x3 (see the baseline summary), so a
# fixed 6x6 reshape cannot work.
_, grid_h, grid_w, channels = K.int_shape(x)
n_positions = grid_h * grid_w
n_heads = 8

# Flatten the grid into a sequence of positions for the attention block.
x = Reshape([n_positions, channels])(x)
# dv is chosen so that n_heads * dv == channels, as the residual connection
# inside MultiHeadsAttModel requires.
att = MultiHeadsAttModel(l=n_positions, d=channels, dv=channels // n_heads, dout=32, nv=n_heads)
x = att([x,x,x])
# Restore the spatial layout; 32 is the attention block's dout.
x = Reshape([grid_h, grid_w, 32])(x)
x = NormL()(x)
x = Flatten()(x)
x = Dense(256, activation="relu")(x)
output = Dense(10, activation="softmax")(x)

In [0]:
# Build the attention model from the graph endpoints defined in the previous
# cell (`input` and the softmax `output`, not the 256-unit hidden tensor `x`).
model = Model(inputs=input, outputs=output)

# y_train / y_test are never one-hot encoded in this notebook, so use the
# sparse loss, as the baseline model does.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

# Log the graph and training curves for TensorBoard.
tbCallBack = TensorBoard(log_dir='./Graph/mhatt1', histogram_freq=0, write_graph=True, write_images=True)
model.fit(X_train, y_train,
          batch_size=128, 
          epochs=100,
          verbose=1,          
          validation_data=(X_test, y_test),
          callbacks=[tbCallBack]
          )