From 2b1e7469e46ea581fa0835d38144371aa50e212a Mon Sep 17 00:00:00 2001
From: Swati Allabadi
Date: Tue, 21 Jan 2025 09:24:25 +0000
Subject: [PATCH 1/2] Add support for resuming fine-tuning from the
 checkpoints of a previous run that stopped partway through.

There is no need to pass tokenizer_name when a model_name is passed; by
default tokenizer_name takes the value of model_name. If a tokenizer
different from the model is required, its name can still be passed as a
separate argument.

Signed-off-by: Swati Allabadi
---
 QEfficient/finetune/configs/training.py  | 2 +-
 QEfficient/finetune/utils/train_utils.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py
index 41ffa3fb3..698210ace 100644
--- a/QEfficient/finetune/configs/training.py
+++ b/QEfficient/finetune/configs/training.py
@@ -10,7 +10,7 @@
 @dataclass
 class train_config:
     model_name: str = "meta-llama/Llama-3.2-1B"
-    tokenizer_name: str = "meta-llama/Llama-3.2-1B"
+    tokenizer_name: str = None  # if not passed as an argument, it uses the value of model_name
     run_validation: bool = True
     batch_size_training: int = 1
     context_length: int = None
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index fe5493978..59f756a76 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -116,6 +116,10 @@ def train(
         # enable profile for qaic
         qaic_profile.start_profiling(device, 1) if train_config.use_profiler else None
         for step, batch in enumerate(train_dataloader):
+            if train_config.use_peft and train_config.from_peft_checkpoint:
+                intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
+                if step < intermediate_step:
+                    continue
             total_train_steps += 1
             # stop when the maximum number of training steps is reached
             if train_config.max_train_step > 0 and total_train_steps > train_config.max_train_step:

From e037095ac93eac87bfa2efb8701321d88ab68e87 Mon Sep 17 00:00:00 2001
From: Swati Allabadi
Date: Thu, 20 Feb 2025 08:32:11 +0000
Subject: [PATCH 2/2] Combine both changes: resuming fine-tuning from
 checkpoints and the loss-convergence check.

Signed-off-by: Swati Allabadi
---
 QEfficient/finetune/utils/train_utils.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index a32f9429b..3867bd7b6 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -112,7 +112,7 @@ def train(
                 f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
             )
             break
-
+
         if train_config.use_peft and train_config.from_peft_checkpoint:
             intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
             if epoch < intermediate_epoch:
@@ -268,13 +268,17 @@ def train(
         epoch_end_time = time.perf_counter() - epoch_start_time
         epoch_times.append(epoch_end_time)
 
-        if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
-            train_epoch_loss = total_loss / (len(train_dataloader) - intermediate_step)
-        if loss_0_counter.item() == train_config.convergence_counter:
-            train_epoch_loss = total_loss / step
+        if loss_0_counter.item() == train_config.convergence_counter:
+            if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
+                train_epoch_loss = total_loss / (step - intermediate_step)
+            else:
+                train_epoch_loss = total_loss / step
         else:
-            train_epoch_loss = total_loss / len(train_dataloader)
+            if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
+                train_epoch_loss = total_loss / (len(train_dataloader) - intermediate_step)
+            else:
+                train_epoch_loss = total_loss / len(train_dataloader)
 
         train_perplexity = torch.exp(train_epoch_loss)
         train_prep.append(float(train_perplexity))
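
Reviewer note: the sketch below mirrors, outside the patch, how the hunks above recover the resume point from train_config.from_peft_checkpoint. The "<output_dir>/epoch_<n>/step_<m>" directory layout and the helper name parse_resume_point are assumptions inferred from the split("/")[-2] / split("/")[-1] indices in the diff; they are not defined by these patches.

# Sketch only, not part of the patches. Assumes a checkpoint path shaped like
# "<output_dir>/epoch_<n>/step_<m>"; parse_resume_point is a hypothetical helper.
from typing import Tuple

def parse_resume_point(from_peft_checkpoint: str) -> Tuple[int, int]:
    # second-to-last path component carries the epoch: "epoch_2" -> 2 -> zero-based 1
    intermediate_epoch = int(from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
    # last path component carries the step within that epoch: "step_500" -> 500
    intermediate_step = int(from_peft_checkpoint.split("/")[-1].split("_")[-1])
    return intermediate_epoch, intermediate_step

if __name__ == "__main__":
    epoch_idx, step_idx = parse_resume_point("output/epoch_2/step_500")
    print(epoch_idx, step_idx)  # 1 500

With such a path, earlier epochs are skipped entirely, steps before the saved step are skipped within the resumed epoch, and the second patch divides the accumulated loss only by the number of batches actually processed after resuming (step - intermediate_step when training stops on convergence, len(train_dataloader) - intermediate_step otherwise).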