In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import MaxPooling2D, Flatten, Dense, Conv2D, Rescaling
import mpi4py
import os

In [3]:
# load in dataset from imagenette 

batch_size = 128
img_height = 224
img_width = 224
scratch = os.environ['SCRATCH']
train_dir = os.path.join(scratch,'imagenette/imagenette2/train/')
val_dir = os.path.join(scratch,'imagenette/imagenette2/val/')

train_dataset = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    labels='inferred',
    label_mode='int',
    class_names=None,
    color_mode='rgb',
    batch_size=batch_size,
    image_size=(img_height, img_width),
    shuffle=True,
)

val_dataset = tf.keras.utils.image_dataset_from_directory(
    val_dir,
    labels='inferred',
    label_mode='int',
    class_names=None,
    color_mode='rgb',
    batch_size=batch_size,
    image_size=(img_height, img_width),
    shuffle=True,
)

# AUTOTUNE = tf.data.AUTOTUNE

# train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
# val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

Found 9469 files belonging to 10 classes.


2024-05-05 13:07:33.584705: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-05 13:07:35.591661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38218 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:03:00.0, compute capability: 8.0
2024-05-05 13:07:35.593485: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38218 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:41:00.0, compute capability: 8.0
2024-05-05 13:07:35.594982: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/devi

Found 3925 files belonging to 10 classes.


In [4]:
def vgg16():
    model = Sequential([
        Rescaling(1./255),
        Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(224, 224, 3), strides=1),
        Conv2D(64, (3, 3), activation='relu', padding='same', strides=1),
        MaxPooling2D((2, 2), strides=(2, 2)),
        
        Conv2D(128, (3, 3), activation='relu', padding='same', strides=1),
        Conv2D(128, (3, 3), activation='relu', padding='same', strides=1),
        MaxPooling2D((2, 2), strides=(2, 2)),
        
        Conv2D(256, (3, 3), activation='relu', padding='same', strides=1),
        Conv2D(256, (3, 3), activation='relu', padding='same', strides=1),
        Conv2D(256, (3, 3), activation='relu', padding='same', strides=1),
        MaxPooling2D((2, 2), strides=(2, 2)),
        
        Conv2D(512, (3, 3), activation='relu', padding='same', strides=1),
        Conv2D(512, (3, 3), activation='relu', padding='same', strides=1),
        Conv2D(512, (3, 3), activation='relu', padding='same', strides=1),
        MaxPooling2D((2, 2), strides=(2, 2)),
        
        Conv2D(512, (3, 3), activation='relu', padding='same', strides=1),
        Conv2D(512, (3, 3), activation='relu', padding='same', strides=1),
        Conv2D(512, (3, 3), activation='relu', padding='same', strides=1),
        MaxPooling2D((2, 2), strides=(2, 2)),
        
        Flatten(),
        Dense(4096, activation='relu'),
        Dense(4096, activation='relu'),
        Dense(1000, activation='softmax')
    ])
    return model

In [5]:
train_dataset
for image, label in train_dataset.take(1):
    print(image.shape)
    # print(label)

(128, 224, 224, 3)


In [14]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Create VGG16 model
vgg16_model = vgg16()
# vgg16_model.build((10, 224, 224, 3))

# Print model summary
# vgg16_model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004, beta_1=0.9, beta_2=0.999, amsgrad=False)
# optimizer= tf.keras.optimizers.Adam(learning_rate=0.001,beta_1=0.9, beta_2=0.999)

# Compile the model
# vgg16_model.compile(optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# Train on image
# vgg16_model.fit(train_dataset, validation_data=val_dataset, epochs=50)

Num GPUs Available:  4


In [18]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        logits = vgg16_model(x, training=True)
        loss_value = loss_fn(y, logits)
    grads = tape.gradient(loss_value, vgg16_model.trainable_weights)
    optimizer.apply_gradients(zip(grads, vgg16_model.trainable_weights))
    train_acc_metric.update_state(y, logits)
    return loss_value
@tf.function
def test_step(x, y):
    val_logits = vgg16_model(x, training=False)
    val_acc_metric.update_state(y, val_logits)

In [None]:
import time
# Instantiate an optimizer to train the model.
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004, beta_1=0.9, beta_2=0.999, amsgrad=False)
# optimizer= tf.keras.optimizers.Adam(learning_rate=0.001,beta_1=0.9, beta_2=0.999)
# Instantiate a loss function.

# Prepare the metrics.
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

epochs = 30
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        loss_value = train_step(x_batch_train, y_batch_train)

        # Log every 200 batches.
        if step % 20 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))

    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))

    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_dataset:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    print("Time taken: %.2fs" % (time.time() - start_time))



Start of epoch 0
Training loss (for one batch) at step 0: 2.3314
Seen so far: 128 samples
Training loss (for one batch) at step 20: 2.3311
Seen so far: 2688 samples
Training loss (for one batch) at step 40: 2.3077
Seen so far: 5248 samples
Training loss (for one batch) at step 60: 2.2932
Seen so far: 7808 samples
Training acc over epoch: 0.1023
Validation acc: 0.0910
Time taken: 17.99s

Start of epoch 1
Training loss (for one batch) at step 0: 2.3031
Seen so far: 128 samples
Training loss (for one batch) at step 20: 2.3556
Seen so far: 2688 samples
Training loss (for one batch) at step 40: 2.3144
Seen so far: 5248 samples
Training loss (for one batch) at step 60: 2.3148
Seen so far: 7808 samples
Training acc over epoch: 0.1030
Validation acc: 0.1042
Time taken: 17.42s

Start of epoch 2
Training loss (for one batch) at step 0: 2.3083
Seen so far: 128 samples
Training loss (for one batch) at step 20: 2.3073
Seen so far: 2688 samples
Training loss (for one batch) at step 40: 2.3052
Seen 