In [1]:
%pip install bitsandbytes accelerate
%pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Login to Huggingface

In [2]:
from huggingface_hub import notebook_login
import os

#notebook_login()
# Set the Hugging Face token as an environment variable
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

### Load the model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

#Model configs

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
checkpoint_dir = "../output/checkpoints"
model_output_dir = "../output/final_adapter"
training_data = "../data/combinations.jsonl"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4"
)


In [4]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config ,
    device_map="auto",
    trust_remote_code=True
)


model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Configure PEFT

In [5]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()



trainable params: 13,631,488 || all params: 7,261,655,040 || trainable%: 0.1877


### Load Dataset

In [6]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig

# load the dataset and split it into train, validation and test sets
dataset = load_dataset("json", data_files=training_data, split='train')
shuffled_dataset = dataset.shuffle(seed=42)
train_temp_split = shuffled_dataset.train_test_split(test_size=0.3) #30% for validation and test
temp_dataset = train_temp_split['test']
validation_test_split = temp_dataset.train_test_split(test_size=1/3)# 10% for validation and 20% for test
split_datasets = DatasetDict({
    'train': train_temp_split['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})


tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
config = AutoConfig.from_pretrained(model_name)
MAX_LENGTH = 1500


def tokenize_with_loss_mask(example):
    chat_str = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    tokenized = tokenizer(chat_str, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    input_ids = tokenized["input_ids"]
    labels = []

    seq_start_idx = 0 # start of sequence
    total_length = len(chat_str)

    while seq_start_idx < total_length:

        # print("-----------------------")
        seq_end_idx = chat_str.find("</s>", seq_start_idx)+ len("</s>") # end of sequence
        if seq_end_idx == -1:
            break

        sequence = chat_str[seq_start_idx:seq_end_idx]
        end_user_idx = sequence.find("[/INST]")+ len("[/INST]") # end of user message
        end_assistant_idx = sequence.find("</s>")
        user_content = sequence[:end_user_idx]
        assistant_content = sequence[end_user_idx:end_assistant_idx]

        seq_tokens = tokenizer(sequence, add_special_tokens=False)["input_ids"]
        user_tokens = tokenizer(user_content, add_special_tokens=False)["input_ids"]
        assistant_tokens = tokenizer(assistant_content, add_special_tokens=False)["input_ids"]
        
        labels.extend([-100] * len(user_tokens))
        labels.extend(assistant_tokens)
        labels.extend([-100] * (len(seq_tokens) - len(user_tokens) - len(assistant_tokens)))
        
        seq_start_idx = seq_end_idx

    labels = [-100] * (len(input_ids) - len(labels)) + labels  # Pad to max length

    tokenized["labels"] = labels
    if len(tokenized["input_ids"]) > MAX_LENGTH:
        print("Warning: Input sequence exceeds max length for inputs, truncating.")
    if len(tokenized["labels"]) > MAX_LENGTH:
        print("Warning: Input sequence exceeds max length for labels, truncating.")
    if len(tokenized["input_ids"]) != len(tokenized["labels"]):
        print("Error: Input and label lengths do not match after processing.")

    return tokenized

tokenized_dataset = split_datasets.map(tokenize_with_loss_mask, remove_columns=split_datasets["train"].column_names)
print(tokenized_dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 210
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 60
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30
    })
})


### Tune the model

In [None]:
from transformers import TrainingArguments, Trainer
from transformers.integrations import MLflowCallback
import os
import mlflow
import glob

mlflow.set_tracking_uri("file:../mlruns")  # or your preferred URI
mlflow.set_experiment("Synthetic-data-generator-fine-tuning")


training_args = TrainingArguments(
    per_device_train_batch_size=1, # 1 sample per device due to GPU memory constraints
    per_device_eval_batch_size=1, # 1 sample per device due to GPU memory constraints
    gradient_accumulation_steps=8, # to accumulate gradients update over multiple steps to simulate larger batch sizes
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available(),
    logging_steps=10,#10
    log_level="info", # set log level to info to see training progress
    eval_strategy="epoch",
    save_strategy="epoch",
    # save_steps=500,
    metric_for_best_model="loss",
    load_best_model_at_end=True,
    output_dir=checkpoint_dir,
    label_names=["labels"],
    disable_tqdm=False, # enable tqdm progress bars
    gradient_checkpointing=True, # to train large models with limited GPU memory
    save_total_limit=4, # keep only the last 4 checkpoints to save space
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    args=training_args
)

trainer.add_callback(MLflowCallback)

resume_from_checkpoint = None
if os.path.exists(checkpoint_dir):
    # Find all checkpoint folders in the checkpoint_dir by extracting the checkpoint index
    checkpoint_paths = glob.glob(os.path.join(checkpoint_dir, "checkpoint-*"))
    if checkpoint_paths:
        # Extract checkpoint indices and sort by the integer value
        checkpoint_indices = [
            (int(os.path.basename(path).split("-")[-1]), path)
            for path in checkpoint_paths
            if os.path.basename(path).split("-")[-1].isdigit()
        ]
        if checkpoint_indices:
            # Get the path with the highest checkpoint index
            latest_checkpoint = max(checkpoint_indices, key=lambda x: x[0])[1]
            resume_from_checkpoint = latest_checkpoint
            print(f"Resuming training from latest checkpoint: {resume_from_checkpoint}")
        else:
            print("No valid checkpoint found. Starting training from scratch.")
    else:
        print("No checkpoint found. Starting training from scratch.")
else:
    print("No checkpoint directory found. Starting training from scratch.") 


trainer.train(resume_from_checkpoint=resume_from_checkpoint)

print(trainer.state.log_history)

2025/07/28 22:39:55 INFO mlflow.tracking.fluent: Experiment with name 'Synthetic-data-generator-fine-tuning' does not exist. Creating a new experiment.
PyTorch: setting up devices
average_tokens_across_devices is True but world size is 1. Setting it to False automatically.
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend
Exception ignored in: <function MLflowCallback.__del__ at 0x0000018E334F5F80>
Traceback (most recent call last):
  File "c:\Users\Admin\miniconda3\Lib\site-packages\transformers\integrations\integration_utils.py", line 1500, in __del__
    self._ml_flow.end_run()
  File "c:\Users\Admin\miniconda3\Lib\site-packages\mlflow\tracking\fluent.py", line 547, in end_run
    MlflowClient().set_terminated(last_active_run_id, statu

No checkpoint found. Starting training from scratch.


Epoch,Training Loss,Validation Loss
1,0.5341,0.650067
2,0.7024,0.666562
3,0.5856,0.676275



***** Running Evaluation *****
  Num examples = 60
  Batch size = 1
Saving model checkpoint to ../output/checkpoints\checkpoint-27
loading configuration file config.json from cache at C:\Users\Admin\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.3\snapshots\0d4b76e1efeb5eb6f6b5e757c79870472e04bd3a\config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.0",
  "use_cache": true,
  "vocab_size": 32768
}

loading configuration file

TrainOutput(global_step=81, training_loss=0.598622559029379, metrics={'train_runtime': 1069.2934, 'train_samples_per_second': 0.589, 'train_steps_per_second': 0.076, 'total_flos': 4.079902224384e+16, 'train_loss': 0.598622559029379, 'epoch': 3.0})

### Evaluation

In [31]:
# metrics = trainer.evaluate(eval_dataset=tokenized_dataset['test'])
# print(metrics)
print(trainer.state.log_history)

[{'loss': 0.6365, 'grad_norm': 0.3650476932525635, 'learning_rate': 0.00017777777777777779, 'epoch': 0.38095238095238093, 'step': 10}, {'loss': 0.5341, 'grad_norm': 0.31551894545555115, 'learning_rate': 0.0001530864197530864, 'epoch': 0.7619047619047619, 'step': 20}, {'eval_loss': 0.6500667929649353, 'eval_runtime': 29.8519, 'eval_samples_per_second': 2.01, 'eval_steps_per_second': 2.01, 'epoch': 1.0, 'step': 27}, {'loss': 0.6239, 'grad_norm': 0.25058436393737793, 'learning_rate': 0.00012839506172839505, 'epoch': 1.1142857142857143, 'step': 30}, {'loss': 0.5466, 'grad_norm': 0.2913757264614105, 'learning_rate': 0.0001037037037037037, 'epoch': 1.4952380952380953, 'step': 40}, {'loss': 0.7024, 'grad_norm': 0.3275398313999176, 'learning_rate': 7.901234567901235e-05, 'epoch': 1.8761904761904762, 'step': 50}, {'eval_loss': 0.6665619611740112, 'eval_runtime': 29.8585, 'eval_samples_per_second': 2.009, 'eval_steps_per_second': 2.009, 'epoch': 2.0, 'step': 54}, {'loss': 0.5684, 'grad_norm': 0.

### Save the model

In [29]:
trainer.model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

loading configuration file config.json from cache at C:\Users\Admin\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.3\snapshots\0d4b76e1efeb5eb6f6b5e757c79870472e04bd3a\config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.0",
  "use_cache": true,
  "vocab_size": 32768
}

loading configuration file config.json from cache at C:\Users\Admin\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.3\snapshots\0d4b76e1efeb5

('../output/final_adapter\\tokenizer_config.json',
 '../output/final_adapter\\special_tokens_map.json',
 '../output/final_adapter\\chat_template.jinja',
 '../output/final_adapter\\tokenizer.model',
 '../output/final_adapter\\added_tokens.json',
 '../output/final_adapter\\tokenizer.json')

In [30]:
#! mlflow ui --backend-store-uri file:../mlruns