In [None]:
def FineTune_model(beauty_qa_standardizedV2, model, tokenizer, max_seq_length, max_steps = 10):
  """
  Fine-tune `model` with LoRA adapters via supervised fine-tuning and return it.

  The model is wrapped with LoRA adapters (parameter-efficient fine-tuning), so
  only a small subset of parameters is trained. Training uses TRL's
  `SFTTrainer`, restricted by `train_on_responses_only` to the assistant turns
  of a ChatML-formatted conversation. GPU memory and runtime statistics are
  printed before and after training.

  NOTE(review): the original author describes `model` as a 4-bit quantized
  model; this function does not load or check the model, so that is the
  caller's responsibility.

  Args:
    beauty_qa_standardizedV2: Training dataset. Must expose a "text" column
      (used as `dataset_text_field`); presumably already rendered with the
      ChatML chat template, since the `<|im_start|>` markers are relied on.
    model: Base model to wrap with LoRA adapters.
    tokenizer: Tokenizer matching `model`.
    max_seq_length: Maximum sequence length passed to the trainer.
    max_steps: Number of optimizer steps to train for. Defaults to 10, the
      value that was previously hard-coded.

  Returns:
    The LoRA-wrapped model after training.
  """
  bytes_per_gb = 1024 ** 3

  # PEFT: train only small adapter matrices on the attention and MLP
  # projections instead of the full weights. This keeps memory and compute
  # low and can match full fine-tuning quality.
  model = FastLanguageModel.get_peft_model(
      model,
      r = 16, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
      target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
      lora_alpha = 16,
      lora_dropout = 0, # Supports any, but = 0 is optimized
      bias = "none",    # Supports any, but = "none" is optimized
      # "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
      use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
      random_state = 3407,
      use_rslora = False,  # Rank stabilized LoRA is supported but not used
      loftq_config = None, # LoftQ is supported but not used
  )

  # Supervised fine-tuning (SFT) setup, to make the model behave like a chatbot.
  training_args = SFTConfig(
      per_device_train_batch_size = 2,
      gradient_accumulation_steps = 4, # effective batch size = 2 * 4 per device
      warmup_steps = 5,
      # Set num_train_epochs instead of max_steps for a full training run.
      max_steps = max_steps,
      learning_rate = 2e-4,
      logging_steps = 1,
      optim = "adamw_8bit",
      weight_decay = 0.01,
      lr_scheduler_type = "linear",
      seed = 3407,
      output_dir = "outputs",
      report_to = "none", # Use this for WandB etc
  )
  trainer = SFTTrainer(
      model = model,
      tokenizer = tokenizer,
      train_dataset = beauty_qa_standardizedV2,
      dataset_text_field = "text",
      max_seq_length = max_seq_length,
      data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
      dataset_num_proc = 2,
      packing = False, # Can make training 5x faster for short sequences.
      args = training_args,
  )

  # Restrict training to the assistant turns of each ChatML conversation.
  # To inspect the masking by hand, decode trainer.train_dataset[0]["labels"]
  # after replacing the -100 (ignored) positions with a real token id.
  trainer = train_on_responses_only(
      trainer,
      instruction_part = "<|im_start|>user\n",
      response_part = "<|im_start|>assistant\n",
  )

  # Memory stats before training. The same device is used for the properties
  # and the usage counters so that the figures describe one GPU.
  device = torch.cuda.current_device()
  gpu_stats = torch.cuda.get_device_properties(device)
  start_gpu_memory = round(torch.cuda.max_memory_reserved(device) / bytes_per_gb, 3)
  max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
  print("Before fine-tuning:\n")
  print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
  print(f"{start_gpu_memory} GB of memory reserved.")

  # Training updates the LoRA adapter weights of `model` in place, so the
  # same object is returned and no separate reload is needed for inference.
  trainer_stats = trainer.train()

  # Final memory and time stats. The peak counter was not reset, so the
  # difference from the pre-training peak is attributed to training.
  train_runtime = trainer_stats.metrics['train_runtime']
  used_memory = round(torch.cuda.max_memory_reserved(device) / bytes_per_gb, 3)
  used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
  used_percentage = round(used_memory / max_memory * 100, 3)
  lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
  print(f"{train_runtime} seconds used for training.")
  print(
      f"{round(train_runtime/60, 2)} minutes used for training."
  )
  print(f"Peak reserved memory = {used_memory} GB.")
  print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
  print(f"Peak reserved memory % of max memory = {used_percentage} %.")
  print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
  return model