In [4]:
"""
Some helper functions for TensorFlow2.0, including:
    - get_dataset(): download dataset from TensorFlow.
    - get_mean_and_std(): calculate the mean and std value of dataset.
    - normalize(): normalize dataset with the mean and std.
    - dataset_generator(): return `Dataset`.
    - progress_bar(): progress bar mimic xlua.progress.
"""
import tensorflow as tf
from tensorflow.keras import datasets

import numpy as np

# Side length, in pixels, of the square images fed to the model.
image_size = 32
# Number of pixels added on every side of an image before random cropping.
padding = 4
# Side length of the padded canvas from which random crops are taken.
target_size = 2 * padding + image_size


def get_dataset():
    """Load CIFAR-10 via `tf.keras.datasets` and rescale the pixel values.

    Both image arrays are divided by 255.0. The labels are returned exactly
    as `load_data()` provides them: no one-hot encoding is applied here
    (use `_one_hot` on the labels if one-hot targets are needed).

    Returns:
        A 4-tuple `(train_images, train_labels, test_images, test_labels)`.
    """
    train_split, test_split = datasets.cifar10.load_data()
    train_images, train_labels = train_split
    test_images, test_labels = test_split

    # Rescale pixel values by 1/255.
    train_images = train_images / 255.0
    test_images = test_images / 255.0

    return train_images, train_labels, test_images, test_labels


def get_mean_and_std(images):
    """Return the mean and standard deviation of `images`.

    Both statistics are reduced over axes 0, 1 and 2; any remaining axes are
    kept. For the (N, H, W, C) image arrays used in this notebook that gives
    one value per channel.

    Args:
        images: Array-like with at least three dimensions.

    Returns:
        A `(mean, std)` tuple.
    """
    reduce_axes = (0, 1, 2)
    return np.mean(images, axis=reduce_axes), np.std(images, axis=reduce_axes)


def normalize(images, mean, std):
    """Standardize `images`: subtract `mean`, then divide by `std`.

    Plain arithmetic operators are used, so the usual broadcasting rules of
    the operand types apply. No guard against a zero `std` is applied.
    """
    centered = images - mean
    return centered / std


def dataset_generator(images, labels, batch_size):
    """Build the augmented input pipeline as a `tf.data.Dataset`.

    Stages, in order: slice `(images, labels)` into individual examples,
    apply `_augment_fn` to each example, shuffle with a buffer of
    `len(images)` elements, group into batches of `batch_size`, and prefetch.

    Args:
        images: Image array; its length sets the shuffle buffer size.
        labels: Labels sliced alongside `images`.
        batch_size: Number of examples per batch.

    Returns:
        The resulting `tf.data.Dataset`.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices((images, labels))
        .map(_augment_fn, num_parallel_calls=autotune)
        .shuffle(len(images))
        .batch(batch_size)
        .prefetch(buffer_size=autotune)
    )


def _one_hot(train_labels, num_classes, dtype=np.float32):
    """Create a one-hot encoding of labels of size num_classes."""
    return np.array(train_labels == np.arange(num_classes), dtype)


def _augment_fn(images, labels):
    """Apply pad / random-crop / random-flip augmentation to one example.

    The image is placed on a `target_size` x `target_size` canvas with an
    offset of `padding` pixels from the top and left, a random
    `image_size` x `image_size` x 3 window is cropped from it, and the crop
    is randomly flipped left-right. Labels pass through unchanged.
    """
    crop_shape = (image_size, image_size, 3)
    padded = tf.image.pad_to_bounding_box(images, padding, padding, target_size, target_size)
    cropped = tf.image.random_crop(padded, crop_shape)
    flipped = tf.image.random_flip_left_right(cropped)
    return flipped, labels

In [5]:
# Load CIFAR-10; get_dataset() returns images already divided by 255.
train_images, train_labels, test_images, test_labels = get_dataset()

In [6]:
# Inspect the training array's shape and the first image after the 1/255 rescale.
train_images.shape, train_images[0]

((50000, 32, 32, 3),
 array([[[0.23137255, 0.24313725, 0.24705882],
         [0.16862745, 0.18039216, 0.17647059],
         [0.19607843, 0.18823529, 0.16862745],
         ...,
         [0.61960784, 0.51764706, 0.42352941],
         [0.59607843, 0.49019608, 0.4       ],
         [0.58039216, 0.48627451, 0.40392157]],
 
        [[0.0627451 , 0.07843137, 0.07843137],
         [0.        , 0.        , 0.        ],
         [0.07058824, 0.03137255, 0.        ],
         ...,
         [0.48235294, 0.34509804, 0.21568627],
         [0.46666667, 0.3254902 , 0.19607843],
         [0.47843137, 0.34117647, 0.22352941]],
 
        [[0.09803922, 0.09411765, 0.08235294],
         [0.0627451 , 0.02745098, 0.        ],
         [0.19215686, 0.10588235, 0.03137255],
         ...,
         [0.4627451 , 0.32941176, 0.19607843],
         [0.47058824, 0.32941176, 0.19607843],
         [0.42745098, 0.28627451, 0.16470588]],
 
        ...,
 
        [[0.81568627, 0.66666667, 0.37647059],
         [0.78823529

In [7]:
# Labels are integer class ids stored as a column vector (not one-hot).
train_labels

array([[6],
       [9],
       [9],
       ...,
       [9],
       [1],
       [1]], dtype=uint8)

In [6]:
# Number of training examples.
data_size = len(train_images)
data_size

50000

In [7]:
# Standardize both splits with statistics computed on the training set only.
# NOTE: this overwrites train_images/test_images, so re-running the cell would
# normalize already-normalized data; re-run from the get_dataset() cell instead.
mean, std = get_mean_and_std(train_images)
train_images = normalize(train_images, mean, std)
test_images = normalize(test_images, mean, std)



In [8]:
# First training image after standardization.
train_images[0]

array([[[-1.05260405e+00, -9.81666336e-01, -7.62543433e-01],
        [-1.30659965e+00, -1.23936215e+00, -1.03238868e+00],
        [-1.19547658e+00, -1.20715017e+00, -1.06237148e+00],
        ...,
        [ 5.18993668e-01,  1.45752846e-01, -8.79303202e-02],
        [ 4.23745321e-01,  3.30109280e-02, -1.77878735e-01],
        [ 3.60246424e-01,  1.69049397e-02, -1.62887333e-01]],

       [[-1.73521721e+00, -1.65811785e+00, -1.40717374e+00],
        [-1.98921280e+00, -1.98023761e+00, -1.70700179e+00],
        [-1.70346776e+00, -1.85138971e+00, -1.70700179e+00],
        ...,
        [-3.66216884e-02, -5.62910640e-01, -8.82474653e-01],
        [-1.00120586e-01, -6.43440581e-01, -9.57431666e-01],
        [-5.24964129e-02, -5.79016628e-01, -8.52491848e-01]],

       [[-1.59234469e+00, -1.59369389e+00, -1.39218234e+00],
        [-1.73521721e+00, -1.86749569e+00, -1.70700179e+00],
        [-1.21135130e+00, -1.54537593e+00, -1.58707057e+00],
        ...,
        [-1.15995311e-01, -6.27334593e-01,

In [9]:
# Training pipeline: dataset_generator() applies augmentation, shuffling, batching and prefetching.
batch_size = 128
train_ds = dataset_generator(train_images, train_labels, batch_size)


2023-06-22 13:36:05.442190: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-22 13:36:05.456903: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-22 13:36:05.457627: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [10]:
# Evaluation pipeline: batched and prefetched only (no augmentation, no shuffling).
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_images, test_labels))
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

: 