In [3]:
! pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.18.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 16.2 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.18.0


In [4]:
import tensorflow as tf
import sympy
import tensorflow_addons as tfa

import numpy as np
import seaborn as sns
import collections

# visualization tools
%matplotlib inline
import matplotlib.pyplot as plt

#Needed toolboxes
from tensorflow.keras import models, layers, callbacks
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils.generic_utils import get_custom_objects
from keras.layers.core import Activation
import time
import random
from sklearn.model_selection import train_test_split
from math import ceil

# Read in Data

In [8]:
# Fetch the MNIST train/test splits through Keras
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Report the array shapes as loaded (before any preprocessing)
print("x_train shape: {}; y_train shape: {}".format(x_train.shape, y_train.shape))
print("x_test shape: {}; y_test shape: {}".format(x_test.shape, y_test.shape))

# MNIST image preprocessing:
# flatten every image into one row of pixels, cast to float32 and divide by 255
num_pixels = x_train.shape[1] * x_train.shape[2]
x_train = x_train.reshape((x_train.shape[0], num_pixels)).astype('float32') / 255
x_test = x_test.reshape((x_test.shape[0], num_pixels)).astype('float32') / 255

# Turn the integer labels into one-hot rows
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

# Number of categories = width of the one-hot test labels
num_classes = y_test.shape[1]

x_train shape: (60000, 28, 28); y_train shape: (60000,)
x_test shape: (10000, 28, 28); y_test shape: (10000,)


In [6]:
# Training hyperparameters.
# NOTE(review): none of these names are referenced by the cells visible in this
# notebook (run_perfs hard-codes epochs=10 and batch_size=200) - confirm whether
# they are still needed.

# optimisation settings
learning_rate = 0.01
weight_decay = 0.001
batch_size = 256
num_epochs = 100

# patch geometry: side length images are resized to, side length of each
# patch extracted from them, and the resulting number of patches per image
image_size = 72
patch_size = 6
num_patches = (image_size // patch_size) ** 2

# transformer sizes
projection_dim = 64
num_heads = 4
transformer_layers = 8
transformer_units = [projection_dim * 2, projection_dim]

# widths of the dense layers in the final classifier head
mlp_head_units = [2048, 1024]

# Performer Variants

In [7]:
def _fit_and_report(activation, error_label, time_label, x_train, y_train, x_test, y_test):
  '''
  Build, train and evaluate one two-layer dense network in which both the
  hidden layer and the output layer use `activation`, then print its test
  error (in percent) and the wall-clock time spent building and fitting it.

  activation  -- Keras activation name (str) or a callable applied to a tensor
  error_label -- text printed in front of the error percentage
  time_label  -- text printed in front of the elapsed seconds
  x_train, y_train -- flattened training images and one-hot training labels
  x_test, y_test   -- flattened test images and one-hot test labels
  '''
  # Layer widths come from the data that was actually passed in, so the
  # function does not depend on notebook-level globals being up to date.
  num_pixels = x_train.shape[1]
  num_classes = y_train.shape[1]

  start = time.time()

  # initialize model and set architecture configuration and parameters
  model = Sequential()
  model.add(Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation=activation))
  model.add(Dense(num_classes, kernel_initializer='normal', activation=activation))

  # compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  # fit to training dataset (the test split doubles as validation data)
  model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=200, verbose=0)

  end = time.time()

  # evaluate model; scores[1] is the accuracy metric requested in compile()
  scores = model.evaluate(x_test, y_test, verbose=0)
  print(error_label, (100-scores[1]*100))
  print(time_label, end - start)


def run_perfs(x_train, y_train,x_test, y_test):
  '''
  Train and evaluate one model per performer attention kernel activation.

  For each variant (ReLU, softmax, x^4, max(x^4,0), x^2, max(x^2,0)) a fresh
  model is initialized, trained on x_train / y_train, tested on
  x_test / y_test, and its error and run time are printed.

  x_train, x_test -- 2-D arrays of flattened images (samples x pixels)
  y_train, y_test -- one-hot encoded labels (samples x classes)

  Returns None; results are reported through print only.
  '''

  # Custom activations are handed to Dense directly as callables instead of
  # being registered (and repeatedly overwritten) under a single name in
  # Keras's global custom-object registry.
  def x4(x):
    # performer x4 activation
    return tf.math.pow(x,4)

  def quad(x):
    # performer quad activation
    # NOTE(review): x^4 is never negative, so max(x^4, 0) equals x^4 -
    # confirm this variant is meant to differ from the x^4 one.
    return tf.math.maximum(tf.math.pow(x,4),0)

  def x2(x):
    # performer x2 activation
    return tf.math.pow(x,2)

  def relu_x2(x):
    # performer relu x2 activation
    # NOTE(review): x^2 is never negative, so max(x^2, 0) equals x^2 -
    # confirm this variant is meant to differ from the x^2 one.
    return tf.math.maximum(tf.math.pow(x,2),0)

  # (activation, error label, run-time label) in reporting order.
  # Labels are kept exactly as originally printed so recorded outputs match.
  # NOTE(review): every variant, including 'relu', is also used on the output
  # layer; the recorded ReLU runs sit at ~90% error - confirm this is intended.
  variants = [
    ('relu', "ReLU error: %", "ReLU run time: "),
    ('softmax', "Softmax error: %", "Softmax run time: "),
    (x4, "x^4 Error: %", "x^4 run time: "),
    (quad, "Quad error: ", "Quad run time: "),
    (x2, "x^2 error: %", "x^2 run time: "),
    (relu_x2, "max(x^2,0) error: %", "max(x^2,0) run time: "),
  ]

  for activation, error_label, time_label in variants:
    _fit_and_report(activation, error_label, time_label, x_train, y_train, x_test, y_test)

In [9]:
# train and evaluate every performer variant on the current train/test arrays
run_perfs(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

ReLU error: % 90.20000025629997
ReLU run time:  56.77838921546936
Softmax error: % 6.099998950958252
Softmax run time:  48.3295521736145
x^4 Error: % 2.1000027656555176
x^4 run time:  62.79771566390991
Quad error:  3.4699976444244385
Quad run time:  63.37994909286499
x^2 error: % 3.1599998474121094
x^2 run time:  46.91620850563049
max(x^2,0) error: % 3.850001096725464
max(x^2,0) run time:  48.58880925178528


# Ablation Studies

In [34]:
# function to sample 
def random_sample(sample_size, replacement, seed=None):
  '''
  Load in MNIST data and generate a random sample given a sample size
  and whether to draw with or without replacement;
  Process the image data (image flattening, normalization, one hot encoding)

  sample_size -- number of training images to draw; the test sample holds
                 sample_size // 7 images
  replacement -- True to draw with replacement, False to draw without
  seed        -- optional integer seed for a reproducible draw; None (the
                 default) keeps using NumPy's global random state

  Returns x_train, y_train, x_test, y_test, where the x arrays are float32
  of shape (samples, pixels) scaled by 1/255 and the y arrays are one-hot.
  '''

  #loads MNIST dataset for test and train sets
  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

  # Fix the one-hot width from the complete label set. Inferring it from the
  # sampled labels would give train and test different widths whenever a
  # small sample happens to miss the highest label.
  num_classes = int(max(y_train.max(), y_test.max())) + 1

  # np.random and RandomState expose the same choice() signature
  rng = np.random if seed is None else np.random.RandomState(seed)

  # sample train sets
  idx = rng.choice(x_train.shape[0], sample_size, replace=replacement)
  x_train = x_train[idx]
  y_train = y_train[idx]

  # sample test sets (one seventh of the training sample; integer division
  # avoids going through the inexact float 1/7)
  idx = rng.choice(x_test.shape[0], sample_size // 7, replace=replacement)
  x_test = x_test[idx]
  y_test = y_test[idx]

  # print shape of dataset after sampling (still unflattened, labels not encoded)
  print("x_train shape: {}; y_train shape: {}".format(x_train.shape, y_train.shape))
  print("x_test shape: {}; y_test shape: {}".format(x_test.shape, y_test.shape))

  # MNIST image preprocessing
  # flattens images
  num_pixels = x_train.shape[1] * x_train.shape[2]
  x_test = x_test.reshape((x_test.shape[0], num_pixels)).astype('float32')
  x_train = x_train.reshape((x_train.shape[0], num_pixels)).astype('float32')

  # normalizes
  x_test = x_test/255
  x_train = x_train/255

  # one hot encoding with a fixed number of columns
  y_test = np_utils.to_categorical(y_test, num_classes=num_classes)
  y_train = np_utils.to_categorical(y_train, num_classes=num_classes)

  return x_train, y_train, x_test, y_test

### Sampling without Replacement

In [37]:
# draw 10000 training images without replacement
x_train, y_train, x_test, y_test = random_sample(sample_size=10000, replacement=False)

# number of categories = width of the one-hot test labels
num_classes = y_test.shape[1]

# train and evaluate every performer variant on this subsample
run_perfs(x_train, y_train, x_test, y_test)

x_train shape: (10000, 28, 28); y_train shape: (10000,)
x_test shape: (1428, 28, 28); y_test shape: (1428,)
ReLU error: % 89.14565816521645
ReLU run time:  12.481255769729614
Softmax error: % 8.473390340805054
Softmax run time:  9.651816368103027
x^4 Error: % 6.372547149658203
x^4 run time:  10.890950441360474
Quad error:  6.372547149658203
Quad run time:  12.476388692855835
x^2 error: % 6.372547149658203
x^2 run time:  9.540419101715088
max(x^2,0) error: % 6.092435121536255
max(x^2,0) run time:  20.951552867889404


In [38]:
# draw 20000 training images without replacement
x_train, y_train, x_test, y_test = random_sample(sample_size=20000, replacement=False)

# number of categories = width of the one-hot test labels
num_classes = y_test.shape[1]

# train and evaluate every performer variant on this subsample
run_perfs(x_train, y_train, x_test, y_test)

x_train shape: (20000, 28, 28); y_train shape: (20000,)
x_test shape: (2857, 28, 28); y_test shape: (2857,)
ReLU error: % 90.02450108528137
ReLU run time:  15.686229228973389
Softmax error: % 7.280361652374268
Softmax run time:  16.897547960281372
x^4 Error: % 3.6051809787750244
x^4 run time:  19.826745748519897
Quad error:  5.495274066925049
Quad run time:  20.738535165786743
x^2 error: % 5.1452577114105225
x^2 run time:  20.95934224128723
max(x^2,0) error: % 4.2002081871032715
max(x^2,0) run time:  16.72758984565735


In [39]:
# draw 50000 training images without replacement
x_train, y_train, x_test, y_test = random_sample(sample_size=50000, replacement=False)

# number of categories = width of the one-hot test labels
num_classes = y_test.shape[1]

# train and evaluate every performer variant on this subsample
run_perfs(x_train, y_train, x_test, y_test)

x_train shape: (50000, 28, 28); y_train shape: (50000,)
x_test shape: (7142, 28, 28); y_test shape: (7142,)
ReLU error: % 90.21282568573952
ReLU run time:  45.36798691749573
Softmax error: % 6.594789028167725
Softmax run time:  41.87660479545593
x^4 Error: % 2.3942887783050537
x^4 run time:  47.53387761116028
Quad error:  2.5623083114624023
Quad run time:  50.792929887771606
x^2 error: % 5.586671829223633
x^2 run time:  38.026211977005005
max(x^2,0) error: % 10.627275705337524
max(x^2,0) run time:  41.602829456329346


### Sampling with Replacement

In [40]:
# draw 10000 training images with replacement
x_train, y_train, x_test, y_test = random_sample(sample_size=10000, replacement=True)

# number of categories = width of the one-hot test labels
num_classes = y_test.shape[1]

# train and evaluate every performer variant on this subsample
run_perfs(x_train, y_train, x_test, y_test)

x_train shape: (10000, 28, 28); y_train shape: (10000,)
x_test shape: (1428, 28, 28); y_test shape: (1428,)
ReLU error: % 88.51540610194206
ReLU run time:  9.160441160202026
Softmax error: % 14.075630903244019
Softmax run time:  8.8079833984375
x^4 Error: % 3.7114858627319336
x^4 run time:  10.35050344467163
Quad error:  7.0728302001953125
Quad run time:  10.860066652297974
x^2 error: % 7.422971725463867
x^2 run time:  9.58069396018982
max(x^2,0) error: % 8.683472871780396
max(x^2,0) run time:  9.46939730644226


In [41]:
# draw 20000 training images with replacement
x_train, y_train, x_test, y_test = random_sample(sample_size=20000, replacement=True)

# number of categories = width of the one-hot test labels
num_classes = y_test.shape[1]

# train and evaluate every performer variant on this subsample
run_perfs(x_train, y_train, x_test, y_test)

x_train shape: (20000, 28, 28); y_train shape: (20000,)
x_test shape: (2857, 28, 28); y_test shape: (2857,)
ReLU error: % 90.26951342821121
ReLU run time:  16.256171703338623
Softmax error: % 7.4553728103637695
Softmax run time:  20.95085120201111
x^4 Error: % 5.635279417037964
x^4 run time:  20.9736111164093
Quad error:  3.3601701259613037
Quad run time:  20.446016788482666
x^2 error: % 9.100455045700073
x^2 run time:  20.949095249176025
max(x^2,0) error: % 8.225411176681519
max(x^2,0) run time:  16.69939422607422


In [42]:
# draw 50000 training images with replacement
x_train, y_train, x_test, y_test = random_sample(sample_size=50000, replacement=True)

# number of categories = width of the one-hot test labels
num_classes = y_test.shape[1]

# train and evaluate every performer variant on this subsample
run_perfs(x_train, y_train, x_test, y_test)

x_train shape: (50000, 28, 28); y_train shape: (50000,)
x_test shape: (7142, 28, 28); y_test shape: (7142,)
ReLU error: % 90.07280841469765
ReLU run time:  38.50339365005493
Softmax error: % 6.888824701309204
Softmax run time:  43.930743932724
x^4 Error: % 3.136378526687622
x^4 run time:  48.23574495315552
Quad error:  3.962475061416626
Quad run time:  82.41865348815918
x^2 error: % 4.158496856689453
x^2 run time:  40.70253872871399
max(x^2,0) error: % 4.802578687667847
max(x^2,0) run time:  40.54644560813904
