# Basic training on multiple GPUs of a CNN on imagenet from tfrecord files using TensorFlow's `tf.data` API

Here we will run a simplified training loop for a CNN model on ImageNet. We will create a TensorFlow's [`tf.data` API](https://www.tensorflow.org/guide/data) input pipeline based to feed to model with ImageNet data stored in tfrecord files.

We use [TensorFlow Datasets](https://www.tensorflow.org/datasets) to convert a `tf.data.Dataset` dataset to an iterable of NumPy arrays:
```python
np_dataset = tfds.as_numpy(tf_dataset)
```
from which the data is converted to `torch.tensor` and then moved to the GPU.

In [1]:
import ipcmagic

In [2]:
%ipcluster start -n 2

100%|██████████| 2/2 [00:09<00:00,  4.56s/engine]


In [3]:
%pxconfig --progress-after -1

In [4]:
%%px
import glob
import time
import numpy as np
import tensorflow_datasets as tfds
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.distributed as dist
import tensorflow as tf
from torch.nn.parallel import DistributedDataParallel
from torchvision import models
from pt_distr_env import DistributedEnviron

[stderr:0] 2022-10-12 10:29:48.335462: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-12 10:29:48.489707: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-12 10:29:49.467117: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/daint/UES/6.0.UP04/sandboxes/sarafael/software/cuDNN/8.1.0/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/11.3/compat:/usr/local/cuda-11.3/compat:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/math_libs/1

[stderr:1] 2022-10-12 10:29:48.508854: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-12 10:29:48.938998: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-12 10:29:51.339906: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/daint/UES/6.0.UP04/sandboxes/sarafael/software/cuDNN/8.1.0/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/11.3/compat:/usr/local/cuda-11.3/compat:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/math_libs/1

In [5]:
%%px
tfrec_files = glob.glob(f'/scratch/snx3000/datasets/imagenet/ILSVRC2012_1k//train/*')

In [6]:
%%px
tf.config.set_visible_devices(
    tf.config.list_physical_devices('CPU')
)

In [7]:
%%px
distr_env = DistributedEnviron()
dist.init_process_group(backend="nccl")
world_size = dist.get_world_size()
rank = dist.get_rank()
device = distr_env.local_rank

[stderr:0] [W socket.cpp:401] [c10d] The server socket cannot be initialized on [::]:39591 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03206]:39591 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03206]:39591 (errno: 97 - Address family not supported by protocol).


[stderr:1] [W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03206]:39591 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03206]:39591 (errno: 97 - Address family not supported by protocol).


In [8]:
%%px
batch_size = 128

In [9]:
%%px
def decode(serialized_example):
    """Decode and resize"""
    example = tf.io.parse_single_example(
        serialized_example,
        features={
            'image/encoded': tf.io.FixedLenFeature([], tf.string),
            'image/class/label': tf.io.FixedLenFeature([], tf.int64),
        })
    image = tf.image.decode_jpeg(example['image/encoded'], channels=3)
    image = tf.image.resize_with_crop_or_pad(image, 224, 224)
    image = tf.transpose(image, (2, 0, 1)) # rgb channels to the front
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    label = example['image/class/label'] - 1  # -> [0-999]
    return image, label

In [10]:
%%px
dataset = tf.data.TFRecordDataset(tfrec_files)
dataset = dataset.map(decode, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.batch(batch_size)
dataset = dataset.prefetch(tf.data.AUTOTUNE)
dataset = dataset.shard(world_size, rank)

[stderr:0] 2022-10-12 10:30:09.928024: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[stderr:1] 2022-10-12 10:30:09.929858: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
%%px
dataset_np = tfds.as_numpy(dataset)

In [12]:
%%px
_model = models.resnet50()
_model.to(device);

ddp_model = DistributedDataParallel(_model, device_ids=[device])

[stderr:0] libibverbs: Could not locate libibgni (/usr/lib64/libibgni.so.1: undefined symbol: verbs_uninit_context)


[stderr:1] libibverbs: Could not locate libibgni (/usr/lib64/libibgni.so.1: undefined symbol: verbs_uninit_context)


In [13]:
%%px
optimizer = optim.SGD(ddp_model.parameters(), lr=0.01)

In [14]:
%%px
def benchmark_step(model, imgs, labels):
    optimizer.zero_grad()
    output = model(imgs)
    loss = F.cross_entropy(output, labels)
    loss.backward()
    optimizer.step()

In [15]:
%%px
print()
num_epochs = 5
num_iters = 10
imgs_sec = []
for epoch in range(num_epochs):
    t0 = time.time()
    for step, (imgs, labels) in enumerate(dataset_np):
        if step > num_iters:
            break

        imgs = torch.from_numpy(imgs).to(device)
        labels = torch.from_numpy(labels).to(device)
        benchmark_step(ddp_model, imgs, labels)

    dt = time.time() - t0
    imgs_sec.append(batch_size * num_iters / dt)

    print(f' * Epoch {epoch:2d}: '
          f'{imgs_sec[epoch]:.2f} images/sec per GPU')

[stdout:1] 
 * Epoch  0: 142.36 images/sec per GPU
 * Epoch  1: 196.43 images/sec per GPU
 * Epoch  2: 195.52 images/sec per GPU
 * Epoch  3: 195.78 images/sec per GPU
 * Epoch  4: 197.07 images/sec per GPU


[stdout:0] 
 * Epoch  0: 142.42 images/sec per GPU
 * Epoch  1: 196.55 images/sec per GPU
 * Epoch  2: 195.48 images/sec per GPU
 * Epoch  3: 195.79 images/sec per GPU
 * Epoch  4: 197.01 images/sec per GPU


In [16]:
%ipcluster stop

IPCluster stopped.
