From 2b1e7469e46ea581fa0835d38144371aa50e212a Mon Sep 17 00:00:00 2001
From: Swati Allabadi
Date: Tue, 21 Jan 2025 09:24:25 +0000
Subject: [PATCH 1/2] Add support for resuming fine-tuning from the
 checkpoints of a previous run that stopped partway through.

There is no need to pass tokenizer_name when a model_name is passed; by
default tokenizer_name takes the value of model_name. If a tokenizer
different from the model is required, its name can still be passed as a
separate argument.

Signed-off-by: Swati Allabadi
---
 QEfficient/finetune/configs/training.py  | 2 +-
 QEfficient/finetune/utils/train_utils.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py
index 41ffa3fb3..698210ace 100644
--- a/QEfficient/finetune/configs/training.py
+++ b/QEfficient/finetune/configs/training.py
@@ -10,7 +10,7 @@
 @dataclass
 class train_config:
     model_name: str = "meta-llama/Llama-3.2-1B"
-    tokenizer_name: str = "meta-llama/Llama-3.2-1B"
+    tokenizer_name: str = None  # if not passed as an argument, it uses the value of model_name
     run_validation: bool = True
     batch_size_training: int = 1
     context_length: int = None
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index fe5493978..59f756a76 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -116,6 +116,10 @@ def train(
         # enable profile for qaic
         qaic_profile.start_profiling(device, 1) if train_config.use_profiler else None
         for step, batch in enumerate(train_dataloader):
+            if train_config.use_peft and train_config.from_peft_checkpoint:
+                intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
+                if step < intermediate_step:
+                    continue
             total_train_steps += 1
             # stop when the maximum number of training steps is reached
             if train_config.max_train_step > 0 and total_train_steps > train_config.max_train_step:

From e037095ac93eac87bfa2efb8701321d88ab68e87 Mon Sep 17 00:00:00 2001
From: Swati Allabadi
Date: Thu, 20 Feb 2025 08:32:11 +0000
Subject: [PATCH 2/2] Combine both changes: resuming fine-tuning from
 checkpoints and the loss-convergence check.

Signed-off-by: Swati Allabadi
---
 QEfficient/finetune/utils/train_utils.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index a32f9429b..3867bd7b6 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -112,7 +112,7 @@ def train(
                 f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
             )
             break
-
+
         if train_config.use_peft and train_config.from_peft_checkpoint:
             intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
             if epoch < intermediate_epoch:
@@ -268,13 +268,17 @@ def train(
         epoch_end_time = time.perf_counter() - epoch_start_time
         epoch_times.append(epoch_end_time)
 
-        if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
-            train_epoch_loss = total_loss / (len(train_dataloader) - intermediate_step)
-        if loss_0_counter.item() == train_config.convergence_counter:
-            train_epoch_loss = total_loss / step
+        if loss_0_counter.item() == train_config.convergence_counter:
+            if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
+                train_epoch_loss = total_loss / (step - intermediate_step)
+            else:
+                train_epoch_loss = total_loss / step
         else:
-            train_epoch_loss = total_loss / len(train_dataloader)
+            if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
+                train_epoch_loss = total_loss / (len(train_dataloader) - intermediate_step)
+            else:
+                train_epoch_loss = total_loss / len(train_dataloader)
 
         train_perplexity = torch.exp(train_epoch_loss)
         train_prep.append(float(train_perplexity))
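
Reviewer note: the sketch below mirrors, outside the patch, how the hunks above recover the resume point from train_config.from_peft_checkpoint. The "<output_dir>/epoch_<n>/step_<m>" directory layout and the helper name parse_resume_point are assumptions inferred from the split("/")[-2] / split("/")[-1] indices in the diff; they are not defined by these patches.

# Sketch only, not part of the patches. Assumes a checkpoint path shaped like
# "<output_dir>/epoch_<n>/step_<m>"; parse_resume_point is a hypothetical helper.
from typing import Tuple

def parse_resume_point(from_peft_checkpoint: str) -> Tuple[int, int]:
    # second-to-last path component carries the epoch: "epoch_2" -> 2 -> zero-based 1
    intermediate_epoch = int(from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
    # last path component carries the step within that epoch: "step_500" -> 500
    intermediate_step = int(from_peft_checkpoint.split("/")[-1].split("_")[-1])
    return intermediate_epoch, intermediate_step

if __name__ == "__main__":
    epoch_idx, step_idx = parse_resume_point("output/epoch_2/step_500")
    print(epoch_idx, step_idx)  # 1 500

With such a path, earlier epochs are skipped entirely, steps before the saved step are skipped within the resumed epoch, and the second patch divides the accumulated loss only by the number of batches actually processed after resuming (step - intermediate_step when training stops on convergence, len(train_dataloader) - intermediate_step otherwise).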