In [1]:
import os
import sys
import argparse

# Make the asynfed package importable without installing it: climb four
# directory levels up from the current working directory and put that
# directory on the module search path.
root = os.getcwd()
for _hop in range(4):
    root = os.path.dirname(root)
sys.path.append(root)


# tensorflow 
from asynfed.client_v2.frameworks.tensorflow.tensorflow_framework import TensorflowFramework
from resnet18 import Resnet18
from utils import *


2023-07-02 07:20:18.961377: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-02 07:20:18.998052: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from data_preprocessing import preprocess_dataset


In [3]:
# The broker URL embeds a username and password, so it must never be stored
# in the notebook. Export ASYNFED_AMQP_ENDPOINT (an amqps:// URL) before
# starting the kernel. The credentials that used to be hard-coded here should
# be considered leaked and rotated.
amqp_end_point = os.environ.get("ASYNFED_AMQP_ENDPOINT", "")
if not amqp_end_point:
    print("WARNING: ASYNFED_AMQP_ENDPOINT is not set; queue 'end_point' values are empty")

# Single configuration object handed to TensorflowFramework and read by the
# cells below through config['training_params'].
config = {
    "client_id": "234-gpu-1",
    # Both queue sections share the same exchange and the same endpoint.
    "queue_consumer": {
        'exchange_name': 'asynfl_exchange',
        'exchange_type': 'topic',
        'queue_name': 'server_queue',
        'routing_key': 'client.#',
        'end_point': amqp_end_point
    },
    "queue_producer": {
        'exchange_name': 'asynfl_exchange',
        'exchange_type': 'topic',
        'queue_name': 'server_consumer',
        'routing_key': 'server.#',
        'end_point': amqp_end_point
    },
    "training_params": {
        "dataset": "cifar10",
        "model": "resnet18",

        "regularization": "l2",
        "lambda_value": 5e-4,
        "learning_rate": 1e-1,

        # setup differently for different device
        "gpu_index": 0,
        # selects which training chunk file is loaded (chunk_<index>.pickle)
        "chunk_index": 1,

        "qod": 0.45,
        "batch_size": 128,
        "epoch": 200,

        "tracking_point": 2000,
        "sleeping_time": 10,
        "delta_time": 1000000
    }
}


In [5]:



# Run settings come from `config` so that this cell, the learning-rate
# schedule in the model cell and the framework all use the same values
# instead of separately hard-coded copies.
training_params = config['training_params']
gpu_index = training_params['gpu_index']

print("*" * 20)
print("*" * 20)
# NOTE(review): `tf` is not imported explicitly in the visible cells; it is
# presumably brought in by `from utils import *` - confirm.
available_gpus = tf.config.list_physical_devices('GPU')
if available_gpus:
    if not 0 <= gpu_index < len(available_gpus):
        raise IndexError(
            f"gpu_index {gpu_index} is out of range: {len(available_gpus)} GPU(s) detected"
        )
    # Restrict TensorFlow to the single GPU selected in the config.
    tf.config.set_visible_devices(available_gpus[gpu_index], 'GPU')
    print("config tensorflow using gpu successfully")
else:
    print("There is no gpu or your tensorflow is not built in with gpu support")
print("*" * 20)
print("*" * 20)


epoch = training_params['epoch']
batch_size = training_params['batch_size']
# Number of consecutive non-improving epochs tolerated before early stopping.
# NOTE(review): 2000 is larger than the epoch count, so early stopping can
# never trigger with the current settings - confirm this is intended.
patience = 2000


# Dataset locations, relative to the notebook's working directory.
default_testing_dataset_path = "../../../data/cifar_data/test_set.pickle"
training_dataset_path = f"../../../data/cifar_data/5_chunks/chunk_{training_params['chunk_index']}.pickle"

# preprocess_dataset returns a (dataset, size) pair; only the training size is kept.
train_ds, data_size = preprocess_dataset(training_dataset_path, batch_size = batch_size, training = True)
test_ds, _ = preprocess_dataset(default_testing_dataset_path, training = False)


********************
********************
config tensorflow using gpu successfully
********************
********************


2023-07-02 07:21:32.153736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22301 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:19:00.0, compute capability: 8.6


In [6]:

# Define model
# decay_steps is computed as epoch * data_size / batch_size, using the same
# config values that drive the rest of the run.
model = Resnet18(input_features= (32, 32, 3), 
                 output_features= 10,
                 lr=config['training_params']['learning_rate'],
                 decay_steps=int(config['training_params']['epoch'] * data_size / config['training_params']['batch_size']))
# Define framework
tensorflow_framework = TensorflowFramework(model=model, 
                                           data_size= data_size, 
                                           train_ds= train_ds, 
                                           test_ds= test_ds, 
                                           config=config)



# Initialize variables for early stopping check
best_val_loss = float("inf")
# Number of consecutive epochs without improvement seen so far; training stops
# once it reaches `patience` (set in the data-loading cell).
waiting = 0
# training with `epoch` epochs or early stopping
print("*" * 20)
print("*" * 20)
print(f"Training for the total number of epoch {epoch} with batch_size {batch_size} for datasize of {data_size}")
print("*" * 20)
print("*" * 20)
# Use a dedicated loop index: iterating with `for epoch in range(epoch)` would
# overwrite the global epoch count, so re-running this cell would silently
# train for a different number of epochs.
for epoch_idx in range(epoch):
    # Metrics accumulate across batches, so clear them at the start of each epoch.
    tensorflow_framework.model.train_loss.reset_states()
    tensorflow_framework.model.train_performance.reset_states()
    tensorflow_framework.model.test_loss.reset_states()
    tensorflow_framework.model.test_performance.reset_states()

    # Only the values returned for the last batch are kept for reporting.
    for images, labels in tensorflow_framework.train_ds:
        train_acc, train_loss= tensorflow_framework.fit(images, labels)

    for test_images, test_labels in tensorflow_framework.test_ds:
        test_acc, test_loss = tensorflow_framework.evaluate(test_images, test_labels)

    print("Epoch {} - Train Acc: {:.2f} -- Train Loss {} Test Acc {:.2f}  Test Loss {}".format(epoch_idx+1,
                                                                                       train_acc * 100,
                                                                                       train_loss,
                                                                                       test_acc * 100,
                                                                                       test_loss))
    
    # After each epoch, check the loss used for early stopping.
    # NOTE(review): this is the test-set loss, not a separate validation split.
    if test_loss < best_val_loss:
        best_val_loss = test_loss
        waiting = 0
    else:
        waiting += 1

    if waiting >= patience:
        print("Early stopping triggered - ending training.")
        break



********************
********************
Training for the total number of epoch 200 with batch_size 128 for datasize of 20833
********************
********************


2023-07-02 07:21:49.230406: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype float and shape [20833,10]
	 [[{{node Placeholder/_1}}]]
2023-07-02 07:21:49.230654: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [20833,32,32,3]
	 [[{{node Placeholder/_0}}]]
2023-07-02 07:21:52.910776: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-07-02 07:21:53.886031: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.

Epoch 1 - Train Acc: 22.14 -- Train Loss 5.039610385894775 Test Acc 17.54  Test Loss 2.5480892658233643
Epoch 2 - Train Acc: 33.17 -- Train Loss 3.9001669883728027 Test Acc 32.16  Test Loss 1.8047676086425781
Epoch 3 - Train Acc: 39.98 -- Train Loss 3.4360148906707764 Test Acc 34.29  Test Loss 1.8114347457885742
Epoch 4 - Train Acc: 44.77 -- Train Loss 3.0639326572418213 Test Acc 38.67  Test Loss 1.7052552700042725
Epoch 5 - Train Acc: 48.32 -- Train Loss 2.7619235515594482 Test Acc 48.85  Test Loss 1.4457772970199585
Epoch 6 - Train Acc: 52.60 -- Train Loss 2.479161262512207 Test Acc 48.61  Test Loss 1.4168365001678467
Epoch 7 - Train Acc: 56.39 -- Train Loss 2.228421926498413 Test Acc 34.09  Test Loss 2.1786036491394043
Epoch 8 - Train Acc: 60.65 -- Train Loss 2.0119681358337402 Test Acc 52.07  Test Loss 1.395901083946228
Epoch 9 - Train Acc: 63.36 -- Train Loss 1.8373239040374756 Test Acc 49.93  Test Loss 1.4239997863769531
Epoch 10 - Train Acc: 65.79 -- Train Loss 1.681914687156677

In [4]:
import pickle

# Serialize the model's current weights to a pickle file in the working
# directory.
weights = model.get_weights()
save_location = "weights2.pkl"
with open(save_location, 'wb') as weights_file:
    pickle.dump(weights, weights_file)