!pip install tensorflow_datasets

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
# Display the installed TensorFlow version.
tf.__version__

'2.3.1'

In [3]:
# `tf.test.is_gpu_available()` is deprecated; list the physical GPU devices
# instead and report whether at least one is present.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [4]:
# Report whether the installed TensorFlow build has GPU support.
tf.test.is_built_with_gpu_support()

True

In [5]:
# Load the MNIST train and test splits together with the dataset metadata.
(ds_train, ds_test), ds_info = tfds.load(
    'mnist',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,  # elements are consumed below as (image, label) pairs
    with_info=True,  # also return the metadata object bound to `ds_info`
)

In [6]:
def normalize_img(image, label):
  """Cast `image` to `float32` and divide by 255; `label` passes through unchanged."""
  scaled = tf.cast(image, tf.float32) / 255.
  return scaled, label

# Training input pipeline: normalize -> cache -> shuffle over the whole
# training split -> batch -> prefetch.
ds_train = (
    ds_train
    .map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .cache()
    .shuffle(ds_info.splits['train'].num_examples)
    .batch(128)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [7]:
# Evaluation input pipeline: normalize -> batch -> cache -> prefetch.
# No shuffling is applied, and caching happens after batching.
ds_test = (
    ds_test
    .map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(128)
    .cache()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [8]:
def get_compiled_model():
    """Build and compile a one-hidden-layer MLP for 28x28x1 inputs.

    The hidden Dense layer has 128 * 1000 units, followed by a 10-way softmax.

    Returns:
        A compiled `tf.keras.models.Sequential` model using Adam (lr 0.001),
        sparse categorical cross-entropy loss, and the accuracy metric.
    """
    layers = [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128 * 1000, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ]
    net = tf.keras.models.Sequential(layers)
    net.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

In [9]:
# Baseline run: build the model outside any distribution strategy and train it.
model = get_compiled_model()
model.fit(ds_train, epochs=2)

Epoch 1/2






KeyboardInterrupt: 

In [None]:
# Create a MirroredStrategy.
# No `devices` argument is passed here; the commented-out line below shows how
# to name the devices explicitly instead.
strategy = tf.distribute.MirroredStrategy()
#strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

In [None]:
# Report how many replicas the strategy keeps in sync.
print(f"Number of devices: {strategy.num_replicas_in_sync}")

In [None]:
# Rebuild the model inside the strategy scope; this rebinds the global `model`.
with strategy.scope():
    # Everything that creates variables should be under the strategy scope.
    # In general this is only model construction & `compile()`.
    model = get_compiled_model()

In [None]:
# Train the current `model` on the training pipeline for two epochs.
model.fit(ds_train, epochs=2)
# Progress lines noted by the author:
#1s 2ms/step - loss: 0.3582 - accuracy: 0.9007 - val_loss: 0.1991 - val_accuracy: 0.9420
#1s 3ms/step - loss: 0.0908 - accuracy: 0.9739

In [22]:
from tensorflow import keras

In [23]:
# Download and load CIFAR-10; the returned arrays are echoed as the cell output.
# NOTE(review): the result is not bound to a name and the full array repr is
# dumped below -- consider assigning it and displaying only the shapes.
keras.datasets.cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


((array([[[[ 59,  62,  63],
           [ 43,  46,  45],
           [ 50,  48,  43],
           ...,
           [158, 132, 108],
           [152, 125, 102],
           [148, 124, 103]],
  
          [[ 16,  20,  20],
           [  0,   0,   0],
           [ 18,   8,   0],
           ...,
           [123,  88,  55],
           [119,  83,  50],
           [122,  87,  57]],
  
          [[ 25,  24,  21],
           [ 16,   7,   0],
           [ 49,  27,   8],
           ...,
           [118,  84,  50],
           [120,  84,  50],
           [109,  73,  42]],
  
          ...,
  
          [[208, 170,  96],
           [201, 153,  34],
           [198, 161,  26],
           ...,
           [160, 133,  70],
           [ 56,  31,   7],
           [ 53,  34,  20]],
  
          [[180, 139,  96],
           [173, 123,  42],
           [186, 144,  30],
           ...,
           [184, 148,  94],
           [ 97,  62,  34],
           [ 83,  53,  34]],
  
          [[177, 144, 116],
           [16

In [18]:
import tensorflow as tf
import tensorflow_datasets as tfds
import time
from tensorflow import keras

from tensorflow.keras.optimizers import Adam


def scale(image, label):
  """Normalize an image and tile it 2x2.

  The image is cast to float32, divided by 255, then concatenated with itself
  along axis -2 and again along axis -3, so a (28, 28, 1) MNIST image becomes
  (56, 56, 1). The label is returned unchanged.
  """
  pixels = tf.cast(image, tf.float32) / 255
  doubled_cols = tf.concat((pixels, pixels), axis=-2)
  tiled = tf.concat((doubled_cols, doubled_cols), axis=-3)
  return tiled, label


def build_model(filters=64, units=32, learning_rate=1e-2):
    """Build and compile the CNN classifier used for the timing experiments.

    The network stacks four Conv2D layers (7x7, 5x5, 3x3 and 3x3 kernels with
    `filters`, 2x, 4x and 8x `filters` channels), a max-pool, a flatten, and
    two `units`-wide Dense layers in front of a 10-way softmax. It expects
    inputs of shape (56, 56, 1).

    Args:
        filters: Channel count of the first convolution; later convolutions
            use multiples of it.
        units: Width of each hidden Dense layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled `tf.keras.Sequential` model using sparse categorical
        cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(filters=filters, kernel_size=(7, 7), activation='relu',
                               input_shape=(28 * 2, 28 * 2, 1)),
        tf.keras.layers.Conv2D(filters=filters * 2, kernel_size=(5, 5), activation='swish'),
        tf.keras.layers.Conv2D(filters=filters * 4, kernel_size=(3, 3), activation='swish'),
        tf.keras.layers.Conv2D(filters=filters * 8, kernel_size=(3, 3), activation='swish'),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units, activation='swish'),
        tf.keras.layers.Dense(units, activation='swish'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model


#strategy = tf.distribute.OneDeviceStrategy(device='/gpu:0')

# Load MNIST as supervised (image, label) pairs plus its metadata.
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
mnist_train = datasets['train']
mnist_test = datasets['test']

# Split sizes taken from the dataset metadata, and the epoch count for every timed run.
num_train_examples = info.splits['train'].num_examples
num_test_examples = info.splits['test'].num_examples
epochs = 1

In [19]:
import numpy as np

In [20]:
# NOTE: BUFFER_SIZE is defined but not referenced by the pipelines below.
BUFFER_SIZE = 10000
BATCH_SIZE = 256

# tfds-backed pipelines: scale/tile each image, batch, then prefetch.
train_dataset = (
    mnist_train
    .map(scale)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
eval_dataset = (
    mnist_test
    .map(scale)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [23]:
# Inspect the element shapes and dtypes of the batched training pipeline.
train_dataset

<PrefetchDataset shapes: ((None, 56, 56, 1), (None,)), types: (tf.float32, tf.int64)>

In [21]:
model_base = build_model()
# `summary()` prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model_base.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 50, 50, 64)        3200      
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 46, 46, 128)       204928    
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 44, 44, 256)       295168    
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 42, 42, 512)       1180160   
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 21, 21, 512)       0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 225792)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)               

In [22]:
# Time one training run of the baseline model on the tfds-backed pipeline.
start = time.perf_counter()
model_base.fit(train_dataset, epochs=epochs)
elapsed = time.perf_counter() - start
print(f'elapsed: {elapsed:0.3f}')





elapsed: 61.113


In [26]:
def _tile_2x2(images):
    """Tile a batch of images 2x2: (N, H, W, C) becomes (N, 2H, 2W, C)."""
    images = np.concatenate((images, images), -2)
    return np.concatenate((images, images), -3)


def get_dataset(batch_size=256, is_multi=False):
    """Return batched MNIST train/validation/test datasets built from NumPy arrays.

    Images are scaled by 1/255, given a trailing channel axis, and tiled 2x2
    to 56x56x1. The tiling is applied to all three splits so that each one
    matches an input shape of (56, 56, 1). The last 10000 training samples
    are held out as the validation split.

    Args:
        batch_size: Batch size for all three datasets.
        is_multi: When True, the batch size is doubled before batching.

    Returns:
        A `(train, val, test)` tuple of `tf.data.Dataset` objects yielding
        `(images, labels)` batches with float32 images and float32 labels.
    """
    if is_multi:
        batch_size *= 2
    print('batch_size', batch_size)
    num_val_samples = 10000

    # Load MNIST as NumPy arrays.
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

    # Add the channel axis and scale pixel values by 1/255.
    x_train = np.expand_dims(x_train, -1).astype("float32") / 255
    x_test = np.expand_dims(x_test, -1).astype("float32") / 255
    y_train = y_train.astype("float32")
    y_test = y_test.astype("float32")

    # Reserve the last num_val_samples training samples for validation.
    x_val = x_train[-num_val_samples:]
    y_val = y_train[-num_val_samples:]
    x_train = x_train[:-num_val_samples]
    y_train = y_train[:-num_val_samples]

    # Tile every split, not only the training images, so validation and test
    # batches have the same spatial size as the training batches.
    x_train = _tile_2x2(x_train)
    x_val = _tile_2x2(x_val)
    x_test = _tile_2x2(x_test)

    return (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size),
    )
# Build the train/validation/test datasets with the default batch size (256).
train_dataset1, val_dataset1, test_dataset1 = get_dataset()

batch_size 256


In [27]:
# Time one training run of the baseline model on the NumPy-backed pipeline.
start = time.perf_counter()
model_base.fit(train_dataset1, epochs=epochs)
elapsed = time.perf_counter() - start
print(f'elapsed: {elapsed:0.3f}')





elapsed: 49.547


In [28]:
# Create the mirrored strategy and report how many replicas it keeps in sync.
strategy = tf.distribute.MirroredStrategy()
print(f'Number of devices: {strategy.num_replicas_in_sync}')

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


Number of devices: 2


In [29]:
# Show the experiment settings: split sizes, replica count, and epoch count.
num_train_examples, num_test_examples, strategy.num_replicas_in_sync, epochs

(60000, 10000, 2, 1)

In [30]:
#BUFFER_SIZE = 10000
#BATCH_SIZE_s = BATCH_SIZE * strategy.num_replicas_in_sync

# Build the model inside the strategy scope.
with strategy.scope():
    model_strategy = build_model()
    #train_dataset_strategy = mnist_train.map(scale).batch(BATCH_SIZE_s).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    #eval_dataset_strategy = mnist_test.map(scale).batch(BATCH_SIZE_s).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# is_multi=True makes get_dataset double the batch size.
train_dataset_2, val_dataset, test_dataset = get_dataset(is_multi=True)

# Time one training run of the strategy-scoped model.
start = time.perf_counter()
model_strategy.fit(train_dataset_2, epochs=epochs)
elapsed = time.perf_counter() - start
print(f'elapsed: {elapsed:0.3f}')
# Progress lines noted by the author:
#204s 3ms/step - loss: 0.0258 - accuracy: 0.9971 - val_loss: 1.9846 - val_accuracy: 0.9851 elapsed: 207.905
#350s 6ms/step - loss: 0.0149 - accuracy: 0.9985 - val_loss: 2.6412 - val_accuracy: 0.9873 elapsed: 353.443
#326s 174ms/step - loss: 30.8885 - accuracy: 0.1101 elapsed: 332.191

batch_size 512
INFO:tensorflow:batch_all_reduce: 14 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 14 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:batch_all_reduce: 14 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 14 all-reduces with algorithm = nccl, num_packs = 1


elapsed: 35.338


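- Elapsed training time in seconds: single (tfds pipeline) / single (NumPy pipeline) / dual (MirroredStrategy, 2 GPUs)
- 59/49/29 (the outputs recorded above show 61.113 / 49.547 / 35.338)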

In [32]:
# Scratch arithmetic: two pairs of numbers, each as a ratio and its inverse.
# NOTE(review): 49 and 29 match the elapsed-time notes above; where 39 and 69
# come from is not shown in this notebook -- TODO confirm.
39/69, 69/39, 49/29, 29/49

(0.5652173913043478,
 1.7692307692307692,
 1.6896551724137931,
 0.5918367346938775)