From 8424e376a34ac33ed70a5eab669a8f17f091b91d Mon Sep 17 00:00:00 2001
From: "Radionov, Maksim"
Date: Fri, 28 May 2021 12:34:56 +0300
Subject: [PATCH 1/4] Use epochs instead of batch_num. Log epoch count

---
 openfl-workspace/keras_nlp/src/nlp_dataloader.py    | 1 +
 openfl-workspace/tf_cnn_histology/plan/plan.yaml    | 3 +--
 .../workspace/plan/defaults/tasks_tensorflow.yaml   | 2 +-
 .../workspace/plan/defaults/tasks_torch.yaml        | 1 +
 openfl/federated/task/runner_keras.py               | 11 ++++++-----
 openfl/federated/task/runner_pt.py                  | 12 +++++++-----
 openfl/federated/task/runner_tf.py                  | 11 +++--------
 7 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/openfl-workspace/keras_nlp/src/nlp_dataloader.py b/openfl-workspace/keras_nlp/src/nlp_dataloader.py
index 1754c51eec..fd43d87c44 100644
--- a/openfl-workspace/keras_nlp/src/nlp_dataloader.py
+++ b/openfl-workspace/keras_nlp/src/nlp_dataloader.py
@@ -89,6 +89,7 @@ def get_valid_data_size(self):
         """
         return self.X_valid[0].shape[0]
 
+    @staticmethod
     def _batch_generator(X1, X2, y, idxs, batch_size, num_batches):
         """
         Generate batch of data.
diff --git a/openfl-workspace/tf_cnn_histology/plan/plan.yaml b/openfl-workspace/tf_cnn_histology/plan/plan.yaml
index 3fe6f8809a..52a660f949 100644
--- a/openfl-workspace/tf_cnn_histology/plan/plan.yaml
+++ b/openfl-workspace/tf_cnn_histology/plan/plan.yaml
@@ -61,5 +61,4 @@ tasks:
       batch_size: 32
       epochs: 1
       metrics:
-      - loss
-      num_batches: 1
+      - loss
\ No newline at end of file
diff --git a/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml b/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml
index 586a885b40..6d000cc618 100644
--- a/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml
+++ b/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml
@@ -18,6 +18,6 @@ train:
   function : train_batches
   kwargs :
     batch_size : 32
-    num_batches : 1
     metrics :
     - loss
+    epochs : 1
diff --git a/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml b/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml
index a240c2003b..f41b0c3600 100644
--- a/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml
+++ b/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml
@@ -17,3 +17,4 @@ train:
   kwargs :
     metrics :
     - loss
+    epochs : 1
diff --git a/openfl/federated/task/runner_keras.py b/openfl/federated/task/runner_keras.py
index 16ebfee0a7..aa21c177a1 100644
--- a/openfl/federated/task/runner_keras.py
+++ b/openfl/federated/task/runner_keras.py
@@ -59,7 +59,7 @@ def rebuild_model(self, round, input_tensor_dict, validation=False):
         else:
             self.set_tensor_dict(input_tensor_dict, with_opt_vars=False)
 
-    def train(self, col_name, round_num, input_tensor_dict, metrics, num_batches=None, **kwargs):
+    def train(self, col_name, round_num, input_tensor_dict, metrics, epochs=1, batch_size=1, **kwargs):
         """
         Perform the training for a specified number of batches.
 
@@ -80,10 +80,11 @@ def train(self, col_name, round_num, input_tensor_dict, metrics, num_batches=Non
 
         # rebuild model with updated weights
         self.rebuild_model(round_num, input_tensor_dict)
-
-        results = self.train_iteration(self.data_loader.get_train_loader(num_batches),
-                                       metrics=metrics,
-                                       **kwargs)
+        for epoch in range(epochs):
+            self.logger.info(f"Run {epoch} epoch of {round_num} round")
+            results = self.train_iteration(self.data_loader.get_train_loader(batch_size),
+                                           metrics=metrics,
+                                           **kwargs)
 
         # output metric tensors (scalar)
         origin = col_name
diff --git a/openfl/federated/task/runner_pt.py b/openfl/federated/task/runner_pt.py
index 4d9d1f1267..7e22dbd6fb 100644
--- a/openfl/federated/task/runner_pt.py
+++ b/openfl/federated/task/runner_pt.py
@@ -124,7 +124,7 @@ def validate(self, col_name, round_num, input_tensor_dict,
         return output_tensor_dict, {}
 
     def train_batches(self, col_name, round_num, input_tensor_dict,
-                      num_batches=None, use_tqdm=False, **kwargs):
+                      num_batches=None, use_tqdm=False, epochs=1, **kwargs):
         """Train batches.
 
         Train the model on the requested number of batches.
@@ -145,10 +145,12 @@ def train_batches(self, col_name, round_num, input_tensor_dict,
         # set to "training" mode
         self.train()
         self.to(self.device)
-        loader = self.data_loader.get_train_loader(num_batches)
-        if use_tqdm:
-            loader = tqdm.tqdm(loader, desc="train epoch")
-        metric = self.train_epoch(loader)
+        for epoch in range(epochs):
+            self.logger.info(f"Run {epoch} epoch of {round_num} round")
+            loader = self.data_loader.get_train_loader(num_batches)
+            if use_tqdm:
+                loader = tqdm.tqdm(loader, desc="train epoch")
+            metric = self.train_epoch(loader)
         # Output metric tensors (scalar)
         origin = col_name
         tags = ('trained',)
diff --git a/openfl/federated/task/runner_tf.py b/openfl/federated/task/runner_tf.py
index 4f39444a23..9dd2d1bd56 100644
--- a/openfl/federated/task/runner_tf.py
+++ b/openfl/federated/task/runner_tf.py
@@ -83,7 +83,7 @@ def rebuild_model(self, round, input_tensor_dict, validation=False):
         self.set_tensor_dict(input_tensor_dict, with_opt_vars=False)
 
     def train_batches(self, col_name, round_num, input_tensor_dict,
-                      num_batches, use_tqdm=False, **kwargs):
+                      epochs=1, use_tqdm=False, **kwargs):
         """
         Perform the training for a specified number of batches.
 
@@ -106,22 +106,17 @@ def train_batches(self, col_name, round_num, input_tensor_dict,
         self.rebuild_model(round_num, input_tensor_dict)
 
         tf.keras.backend.set_learning_phase(True)
 
-        losses = []
-        batch_num = 0
-        while batch_num < num_batches:
+        for epoch in range(epochs):
+            self.logger.info(f"Run {epoch} epoch of {round_num} round")
             # get iterator for batch draws (shuffling happens here)
             gen = self.data_loader.get_train_loader(batch_size)
             if use_tqdm:
                 gen = tqdm.tqdm(gen, desc="training epoch")
 
             for (X, y) in gen:
-                if batch_num >= num_batches:
-                    break
-                else:
                     losses.append(self.train_batch(X, y))
-                    batch_num += 1
 
         # Output metric tensors (scalar)
         origin = col_name

From 4205e9ee19af7dd68b11fa3763fc956c054835da Mon Sep 17 00:00:00 2001
From: "Radionov, Maksim"
Date: Fri, 28 May 2021 13:02:19 +0300
Subject: [PATCH 2/4] Fix flake8

---
 openfl/federated/task/runner_keras.py | 7 ++++---
 openfl/federated/task/runner_tf.py    | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/openfl/federated/task/runner_keras.py b/openfl/federated/task/runner_keras.py
index aa21c177a1..d37d9b09d5 100644
--- a/openfl/federated/task/runner_keras.py
+++ b/openfl/federated/task/runner_keras.py
@@ -59,7 +59,8 @@ def rebuild_model(self, round, input_tensor_dict, validation=False):
         else:
             self.set_tensor_dict(input_tensor_dict, with_opt_vars=False)
 
-    def train(self, col_name, round_num, input_tensor_dict, metrics, epochs=1, batch_size=1, **kwargs):
+    def train(self, col_name, round_num, input_tensor_dict,
+              metrics, epochs=1, batch_size=1, **kwargs):
         """
         Perform the training for a specified number of batches.
 
@@ -83,8 +84,8 @@ def train(self, col_name, round_num, input_tensor_dict, metrics, epochs=1, batch
         for epoch in range(epochs):
             self.logger.info(f"Run {epoch} epoch of {round_num} round")
             results = self.train_iteration(self.data_loader.get_train_loader(batch_size),
-                                       metrics=metrics,
-                                       **kwargs)
+                                           metrics=metrics,
+                                           **kwargs)
 
         # output metric tensors (scalar)
         origin = col_name
diff --git a/openfl/federated/task/runner_tf.py b/openfl/federated/task/runner_tf.py
index 9dd2d1bd56..6cd8da658b 100644
--- a/openfl/federated/task/runner_tf.py
+++ b/openfl/federated/task/runner_tf.py
@@ -116,7 +116,7 @@ def train_batches(self, col_name, round_num, input_tensor_dict,
                 gen = tqdm.tqdm(gen, desc="training epoch")
 
             for (X, y) in gen:
-                    losses.append(self.train_batch(X, y))
+                losses.append(self.train_batch(X, y))
 
         # Output metric tensors (scalar)
         origin = col_name

From e028fee0a3f299b08ff60581bf242cbcf6fc8c21 Mon Sep 17 00:00:00 2001
From: Ilya Trushkin
Date: Wed, 25 Aug 2021 18:16:09 +0300
Subject: [PATCH 3/4] Resolve conversations

---
 openfl/federated/task/runner_keras.py | 2 +-
 openfl/federated/task/runner_pt.py    | 7 +++----
 openfl/federated/task/runner_tf.py    | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/openfl/federated/task/runner_keras.py b/openfl/federated/task/runner_keras.py
index ec40129912..93ac40b1d0 100644
--- a/openfl/federated/task/runner_keras.py
+++ b/openfl/federated/task/runner_keras.py
@@ -63,7 +63,7 @@ def rebuild_model(self, round_num, input_tensor_dict, validation=False):
     def train(self, col_name, round_num, input_tensor_dict,
               metrics, epochs=1, batch_size=1, **kwargs):
         """
-        Perform the training for a specified number of batches.
+        Perform the training.
 
         Is expected to perform draws randomly, without replacement until data is exausted.
         Then data is replaced and shuffled and draws continue.
diff --git a/openfl/federated/task/runner_pt.py b/openfl/federated/task/runner_pt.py
index 1e8a3fb790..2905f09798 100644
--- a/openfl/federated/task/runner_pt.py
+++ b/openfl/federated/task/runner_pt.py
@@ -129,7 +129,7 @@ def validate(self, col_name, round_num, input_tensor_dict,
         return output_tensor_dict, {}
 
     def train_batches(self, col_name, round_num, input_tensor_dict,
-                      num_batches=None, use_tqdm=False, epochs=1, **kwargs):
+                      use_tqdm=False, epochs=1, **kwargs):
         """Train batches.
 
         Train the model on the requested number of batches.
@@ -138,9 +138,8 @@ def train_batches(self, col_name, round_num, input_tensor_dict,
             col_name: Name of the collaborator
             round_num: What round is it
             input_tensor_dict: Required input tensors (for model)
-            num_batches: The number of batches to train on before
-                         returning
             use_tqdm (bool): Use tqdm to print a progress bar (Default=True)
+            epochs: The number of epochs to train
 
         Returns:
             global_output_dict: Tensors to send back to the aggregator
@@ -152,7 +151,7 @@ def train_batches(self, col_name, round_num, input_tensor_dict,
         self.to(self.device)
         for epoch in range(epochs):
             self.logger.info(f'Run {epoch} epoch of {round_num} round')
-            loader = self.data_loader.get_train_loader(num_batches)
+            loader = self.data_loader.get_train_loader()
             if use_tqdm:
                 loader = tqdm.tqdm(loader, desc='train epoch')
             metric = self.train_epoch(loader)
diff --git a/openfl/federated/task/runner_tf.py b/openfl/federated/task/runner_tf.py
index 9c04e74085..18c7726ce1 100644
--- a/openfl/federated/task/runner_tf.py
+++ b/openfl/federated/task/runner_tf.py
@@ -86,15 +86,15 @@ def rebuild_model(self, round_num, input_tensor_dict, validation=False):
     def train_batches(self, col_name, round_num, input_tensor_dict,
                       epochs=1, use_tqdm=False, **kwargs):
         """
-        Perform the training for a specified number of batches.
+        Perform the training.
 
         Is expected to perform draws randomly, without replacement until data is exausted.
         Then data is replaced and shuffled and draws continue.
 
         Args:
-            num_batches: Number of batches to train on
             use_tqdm (bool): True = use tqdm to print a progress bar (Default=False)
+            epochs (int): Number of epochs to train
 
         Returns:
             float: loss metric
         """

From 38a7fc40830eda3f72c98ba7399adac9192dabd2 Mon Sep 17 00:00:00 2001
From: Ilya Trushkin
Date: Wed, 25 Aug 2021 18:19:55 +0300
Subject: [PATCH 4/4] Fix linter

---
 openfl/federated/task/runner_tf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openfl/federated/task/runner_tf.py b/openfl/federated/task/runner_tf.py
index 18c7726ce1..620a0ca0dd 100644
--- a/openfl/federated/task/runner_tf.py
+++ b/openfl/federated/task/runner_tf.py
@@ -94,7 +94,7 @@ def train_batches(self, col_name, round_num, input_tensor_dict,
         Args:
             use_tqdm (bool): True = use tqdm to print a progress bar (Default=False)
-            epochs (int): Number of epochs to train
+            epochs (int): Number of epochs to train
 
         Returns:
             float: loss metric
         """
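
For reference, below is a minimal, self-contained sketch of the epoch-driven training loop that this series introduces, written outside of OpenFL so it can be run on its own. It assumes only that PyTorch is installed. SimpleDataLoader, train_epoch and train are hypothetical stand-ins invented for illustration, not OpenFL classes or API; only the for-epoch loop, the per-epoch request for a fresh loader and the log message mirror the patched runners.

# Minimal, self-contained sketch of the epoch-driven training loop that the
# patches above introduce in the OpenFL task runners. All names defined here
# (SimpleDataLoader, train_epoch, train) are hypothetical, not OpenFL API.
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SimpleDataLoader:
    """Stand-in for a data loader: each call returns a fresh, shuffled loader."""

    def __init__(self, features, labels, batch_size=32):
        self.dataset = TensorDataset(features, labels)
        self.batch_size = batch_size

    def get_train_loader(self):
        # Shuffling happens here, so every epoch draws the data in a new order,
        # matching the "replaced and shuffled" behavior the docstrings describe.
        return DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)


def train_epoch(model, optimizer, loss_fn, loader):
    """One full pass over the loader; returns the mean batch loss."""
    model.train()
    losses = []
    for x, y in loader:
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return sum(losses) / len(losses)


def train(model, optimizer, loss_fn, data_loader, round_num, epochs=1):
    """Epoch-based training task, analogous to the patched train()/train_batches()."""
    metric = None
    for epoch in range(epochs):
        # Same logging pattern the patches add to the runners.
        logger.info(f'Run {epoch} epoch of {round_num} round')
        loader = data_loader.get_train_loader()
        metric = train_epoch(model, optimizer, loss_fn, loader)
    return metric


if __name__ == '__main__':
    torch.manual_seed(0)
    x = torch.randn(256, 10)
    y = (x.sum(dim=1, keepdim=True) > 0).float()
    model = nn.Sequential(nn.Linear(10, 16), nn.ReLU(), nn.Linear(16, 1))
    final_loss = train(model, optim.SGD(model.parameters(), lr=0.1),
                       nn.BCEWithLogitsLoss(), SimpleDataLoader(x, y),
                       round_num=0, epochs=2)
    print(f'final loss: {final_loss:.4f}')

Requesting a fresh loader on every epoch keeps shuffling inside the data loader, which is what lets the runners drop the num_batches argument and control training length through epochs and batch_size alone, as the plan defaults above now do.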