In [1]:
import os
import sys
import argparse

# run locally without install asynfed package
# root = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))
sys.path.append(root)


# tensorflow 
from asynfed.client_v2.frameworks.tensorflow.tensorflow_framework import TensorflowFramework
from resnet18 import Resnet18
from utils import *


2023-07-02 05:30:52.941490: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-02 05:30:52.977084: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
config = {
    "client_id": "234-gpu-1",
    "queue_consumer": {
        'exchange_name': 'asynfl_exchange',
        'exchange_type': 'topic',
        'queue_name': 'server_queue',
        'routing_key': 'client.#',
        'end_point': 'amqps://gocktdwu:jYQBoATqKHRqXaV4O9TahpPcbd8xjcaw@armadillo.rmq.cloudamqp.com/gocktdwu'
    },
    "queue_producer": {
        'exchange_name': 'asynfl_exchange',
        'exchange_type': 'topic',
        'queue_name': 'server_consumer',
        'routing_key': 'server.#',
        'end_point': "amqps://gocktdwu:jYQBoATqKHRqXaV4O9TahpPcbd8xjcaw@armadillo.rmq.cloudamqp.com/gocktdwu"
    },
    "training_params": {
        "dataset": "cifar10",
        "model": "resnet18",

        "regularization": "l2",
        "lambda_value": 5e-4,
        "learning_rate": 1e-3,

        # setup differently for different device
        "gpu_index": 0,
        "chunk_index": 1,

        "qod": 0.45,
        "batch_size": 64,
        "epoch": 200,

        "tracking_point": 2000,
        "sleeping_time": 10,
        "delta_time": 1000000
    }
}


In [3]:



gpu_index = 0

print("*" * 20)
print("*" * 20)
if tf.config.list_physical_devices('GPU'):
    tf.config.set_visible_devices(tf.config.list_physical_devices('GPU')[gpu_index], 'GPU')
    print("config tensorflow using gpu successfully")
else:
    print("There is no gpu or your tensorflow is not built in with gpu support")
print("*" * 20)
print("*" * 20)


epoch = 200
batch_size = 128
patience = 2000


print('==> Preparing data...')
train_images, train_labels, test_images, test_labels = get_dataset()
data_size = len(train_images)

mean, std = get_mean_and_std(train_images)
train_images = normalize(train_images, mean, std)
test_images = normalize(test_images, mean, std)

train_ds = dataset_generator(train_images, train_labels, batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).\
        batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# # define model
# model = Resnet18(input_features = (32, 32, 3), output_features = 10, lr=1e-1, decay_steps=int(epoch * data_size / batch_size))
# # define framework
# tensorflow_framework = TensorflowFramework(model = model, epoch= 200, data_size= data_size, train_ds= train_ds, test_ds= test_ds, regularization='l2', delta_time= 10000, qod= 0.45)



# Define model
model = Resnet18(input_features= (32, 32, 3), 
                 output_features= 10,
                 lr=config['training_params']['learning_rate'],
                 decay_steps=int(config['training_params']['epoch'] * data_size / config['training_params']['batch_size']))
                #  decay_steps=int(Config.EPOCH * data_size / Config.BATCH_SIZE))
# Define framework
tensorflow_framework = TensorflowFramework(model=model, 
                                           data_size= data_size, 
                                           train_ds= train_ds, 
                                           test_ds= test_ds, 
                                           config=config)



# Initialize variables for early stopping check
best_val_loss = float("inf")
# Number of epochs to wait before stopping training when performance worsens
# already set patience above
waiting = 0
# training with 200 epoch or early stopping
print("*" * 20)
print("*" * 20)
print(f"Training for the total number of epoch {epoch} with batch_size {batch_size} for datasize of {data_size}")
print("*" * 20)
print("*" * 20)
for epoch in range(epoch):
    tensorflow_framework.model.train_loss.reset_states()
    tensorflow_framework.model.train_performance.reset_states()
    tensorflow_framework.model.test_loss.reset_states()
    tensorflow_framework.model.test_performance.reset_states()

    for images, labels in tensorflow_framework.train_ds:
        train_acc, train_loss= tensorflow_framework.fit(images, labels)

    for test_images, test_labels in tensorflow_framework.test_ds:
        test_acc, test_loss = tensorflow_framework.evaluate(test_images, test_labels)

    print("Epoch {} - Train Acc: {:.2f} -- Train Loss {} Test Acc {:.2f}  Test Loss {}".format(epoch+1,
                                                                                       train_acc * 100,
                                                                                       train_loss,
                                                                                       test_acc * 100,
                                                                                       test_loss))
    
    # After each epoch, check the validation loss
    if test_loss < best_val_loss:
        best_val_loss = test_loss
        waiting = 0
    else:
        waiting += 1

    if waiting >= patience:
        print("Early stopping triggered - ending training.")
        break




********************
********************
config tensorflow using gpu successfully
********************
********************
==> Preparing data...


2023-07-02 05:31:05.232361: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22301 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:19:00.0, compute capability: 8.6


********************
********************
Training for the total number of epoch 200 with batch_size 128 for datasize of 50000
********************
********************


2023-07-02 05:31:07.471697: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [50000,32,32,3]
	 [[{{node Placeholder/_0}}]]
2023-07-02 05:31:07.471938: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype float and shape [50000,10]
	 [[{{node Placeholder/_1}}]]
2023-07-02 05:31:11.377341: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-07-02 05:31:12.330801: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.

Epoch 1 - Train Acc: 38.27 -- Train Loss 3.9105777740478516 Test Acc 44.35  Test Loss 1.5151580572128296
Epoch 2 - Train Acc: 51.65 -- Train Loss 3.5630154609680176 Test Acc 53.86  Test Loss 1.2750625610351562
Epoch 3 - Train Acc: 58.70 -- Train Loss 3.372438907623291 Test Acc 58.44  Test Loss 1.216694951057434
Epoch 4 - Train Acc: 63.48 -- Train Loss 3.238180637359619 Test Acc 56.47  Test Loss 1.2350311279296875
Epoch 5 - Train Acc: 66.63 -- Train Loss 3.1420679092407227 Test Acc 59.30  Test Loss 1.192697525024414
Epoch 6 - Train Acc: 69.45 -- Train Loss 3.0574536323547363 Test Acc 65.85  Test Loss 0.9974250197410583
Epoch 7 - Train Acc: 71.58 -- Train Loss 2.987840414047241 Test Acc 69.92  Test Loss 0.8565183281898499
Epoch 8 - Train Acc: 73.97 -- Train Loss 2.921764373779297 Test Acc 74.09  Test Loss 0.7322002649307251
Epoch 9 - Train Acc: 75.57 -- Train Loss 2.8658366203308105 Test Acc 75.19  Test Loss 0.7061163187026978
Epoch 10 - Train Acc: 77.23 -- Train Loss 2.8132710456848145 

In [4]:
# save weights
save_location = "weights1.pkl"
weights = model.get_weights()
with open(save_location, 'wb') as f:
    import pickle
    pickle.dump(weights, f)