In [1]:
# %pip install  transformers peft
# %pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129
# %pip install  bitsandbytes accelerate
# %pip install ipywidgets
# %pip install sentencepiece #conda install sentencepiece
# %pip install  mlflow

### Login to Huggingface

In [2]:
from huggingface_hub import notebook_login
import os

# notebook_login()
# Set the Hugging Face token as an environment variable
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
# os.environ["HF_HOME"] = "C:\\Users\\Admin\\.cache\\huggingface"

### Load the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

#Model configs

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
f_checkpoint_dir = "../output/{experiment_name}/{run_id}/checkpoints"
f_model_output_dir = "../output/{experiment_name}/adapters/{epoch}"
training_data = "../data/schema_style_dataset_v2.jsonl"

num_train_epochs = 5
mlflow_experiment_name = "Approach_1_Multiple_Schemas"
# mlflow_experiment_name = "Approach_2_Property_Style"
mlflow_run_name = "run-4"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4"
)


In [4]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config ,
    device_map="auto",
    trust_remote_code=True
)


model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Configure PEFT

In [5]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()



trainable params: 13,631,488 || all params: 7,261,655,040 || trainable%: 0.1877


### Load Dataset

In [6]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig

# load the dataset and split it into train, validation and test sets
dataset = load_dataset("json", data_files=training_data, split='train')
shuffled_dataset = dataset.shuffle(seed=42)
train_temp_split = shuffled_dataset.train_test_split(test_size=0.3) #30% for validation and test
temp_dataset = train_temp_split['test']
validation_test_split = temp_dataset.train_test_split(test_size=1/3)# 10% for validation and 20% for test
split_datasets = DatasetDict({
    'train': train_temp_split['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})


tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
config = AutoConfig.from_pretrained(model_name)
MAX_LENGTH = 3000


def tokenize_with_loss_mask(example):
    chat_str = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    tokenized = tokenizer(chat_str, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    input_ids = tokenized["input_ids"]
    labels = []

    seq_start_idx = 0 # start of sequence
    total_length = len(chat_str)

    while seq_start_idx < total_length:

        # print("-----------------------")
        seq_end_idx = chat_str.find("</s>", seq_start_idx)+ len("</s>") # end of sequence
        if seq_end_idx == -1:
            break

        sequence = chat_str[seq_start_idx:seq_end_idx]
        end_user_idx = sequence.find("[/INST]")+ len("[/INST]") # end of user message
        end_assistant_idx = sequence.find("</s>")
        user_content = sequence[:end_user_idx]
        assistant_content = sequence[end_user_idx:end_assistant_idx]

        seq_tokens = tokenizer(sequence, add_special_tokens=False)["input_ids"]
        user_tokens = tokenizer(user_content, add_special_tokens=False)["input_ids"]
        assistant_tokens = tokenizer(assistant_content, add_special_tokens=False)["input_ids"]
        
        labels.extend([-100] * len(user_tokens))
        labels.extend(assistant_tokens)
        labels.extend([-100] * (len(seq_tokens) - len(user_tokens) - len(assistant_tokens)))
        
        seq_start_idx = seq_end_idx

    labels = [-100] * (len(input_ids) - len(labels)) + labels  # Pad to max length

    tokenized["labels"] = labels
    if len(tokenized["input_ids"]) > MAX_LENGTH:
        print("Warning: Input sequence exceeds max length for inputs, truncating.")
    if len(tokenized["labels"]) > MAX_LENGTH:
        print("Warning: Input sequence exceeds max length for labels, truncating.")
    if len(tokenized["input_ids"]) != len(tokenized["labels"]):
        print("Error: Input and label lengths do not match after processing.")

    return tokenized

tokenized_dataset = split_datasets.map(tokenize_with_loss_mask, remove_columns=split_datasets["train"].column_names)
print(tokenized_dataset)


Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 700
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})


### Setup MLflow tracking

In [7]:
import mlflow

def get_last_run_id_for_run_name(experiment_name, run_name):
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    runs = client.search_runs(experiment.experiment_id, filter_string=f"tags.mlflow.runName='{run_name}'", order_by=["attributes.start_time DESC"], max_results=1)

    if runs:
        return runs[0].info.run_id
    else:
        return None


mlflow_tracking_uri = f"file:{os.path.dirname(os.getcwd())}/mlruns"  
print("MLflow tracking URI:", mlflow_tracking_uri)
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment(mlflow_experiment_name)
mlflow_run_id = get_last_run_id_for_run_name(mlflow_experiment_name, mlflow_run_name)
checkpoint_dir = f_checkpoint_dir.format(experiment_name=mlflow_experiment_name, run_id=mlflow_run_name)
model_output_dir = f_model_output_dir.format(experiment_name=mlflow_experiment_name, epoch=f"epoch-{num_train_epochs}")

mlflow.end_run() # end any previous active runs
if mlflow_run_id:
    print("Starting MLflow run with ID:", mlflow_run_id)
    mlflow.start_run(run_id=mlflow_run_id)
else:
    print("Starting new MLflow run with name:", mlflow_run_name)
    mlflow.start_run(run_name=mlflow_run_name)

mlflow.log_param("dataset", training_data)

MLflow tracking URI: file:c:\Users\Admin\Documents\Projects\Repositories\synthetic-data-generator/mlruns
Starting MLflow run with ID: bad2d1d386c349ec8449867def081965


'../data/schema_style_dataset_v2.jsonl'

### Tune the model

In [8]:
from transformers import TrainingArguments, Trainer
from transformers.integrations import MLflowCallback
import os
import glob

training_args = TrainingArguments(
    per_device_train_batch_size=1, # 1 sample per device due to GPU memory constraints
    per_device_eval_batch_size=1, # 1 sample per device due to GPU memory constraints
    gradient_accumulation_steps=8, # to accumulate gradients update over multiple steps to simulate larger batch sizes
    num_train_epochs=num_train_epochs,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available(),
    logging_steps=10,#10
    log_level="info", # set log level to info to see training progress
    eval_strategy="epoch",
    save_strategy="epoch",
    # save_steps=500,
    metric_for_best_model="loss",
    load_best_model_at_end=True,
    output_dir=checkpoint_dir,
    label_names=["labels"],
    disable_tqdm=False, # enable tqdm progress bars
    gradient_checkpointing=True, # to train large models with limited GPU memory
    save_total_limit=4, # keep only the last 4 checkpoints to save space
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    args=training_args
)

trainer.add_callback(MLflowCallback)

resume_from_checkpoint = None
if os.path.exists(checkpoint_dir):
    # Find all checkpoint folders in the checkpoint_dir by extracting the checkpoint index
    checkpoint_paths = glob.glob(os.path.join(checkpoint_dir, "checkpoint-*"))
    if checkpoint_paths:
        # Extract checkpoint indices and sort by the integer value
        checkpoint_indices = [
            (int(os.path.basename(path).split("-")[-1]), path)
            for path in checkpoint_paths
            if os.path.basename(path).split("-")[-1].isdigit()
        ]
        if checkpoint_indices:
            # Get the path with the highest checkpoint index
            latest_checkpoint = max(checkpoint_indices, key=lambda x: x[0])[1]
            resume_from_checkpoint = latest_checkpoint
            print(f"Resuming training from latest checkpoint: {resume_from_checkpoint}")
        else:
            print("No valid checkpoint found. Starting training from scratch.")
    else:
        print("No checkpoint found. Starting training from scratch.")
else:
    print("No checkpoint directory found. Starting training from scratch.") 


trainer.train(resume_from_checkpoint=resume_from_checkpoint)

print(trainer.state.log_history)

Using auto half precision backend
You are adding a <class 'transformers.integrations.integration_utils.MLflowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
MLflowCallback
NotebookProgressCallback
Loading model from ../output/Approach_1_Multiple_Schemas/run-4/checkpoints\checkpoint-176.


Resuming training from latest checkpoint: ../output/Approach_1_Multiple_Schemas/run-4/checkpoints\checkpoint-176


***** Running training *****
  Num examples = 700
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 264
  Number of trainable parameters = 13,631,488
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 176
  Will skip the first 2 epochs then the first 0 batches in the first epoch.
2025/08/13 18:22:23 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id bad2d1d386c349ec8449867def081965: Failed to log run data: Exception: Changing param values is not allowed. Param with key='logging_dir' was already logged with value='../output/Approach_1_Multiple_Schemas/run-4/checkpoints\runs\Aug13_15-57-26_Ragus-pc' for run ID='bad2d1d386c349ec8449867def081965'. Attempted logging new value '../output/Approach_1_Multiple_Schemas/run-4/checkpoints\runs\Aug13_18-22-

Epoch,Training Loss,Validation Loss
3,0.4282,0.325347



***** Running Evaluation *****
  Num examples = 200
  Batch size = 1
Saving model checkpoint to ../output/Approach_1_Multiple_Schemas/run-4/checkpoints\checkpoint-264
loading configuration file config.json from cache at C:\Users\Admin\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.3\snapshots\0d4b76e1efeb5eb6f6b5e757c79870472e04bd3a\config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.0",
  "use_cache": true,
  "vocab_size":

[{'epoch': 0.11428571428571428, 'grad_norm': 0.5070728063583374, 'learning_rate': 0.00017954545454545456, 'loss': 0.9141, 'step': 10}, {'epoch': 0.22857142857142856, 'grad_norm': 2.2638933658599854, 'learning_rate': 0.00015681818181818182, 'loss': 0.5314, 'step': 20}, {'epoch': 0.34285714285714286, 'grad_norm': 0.5135822892189026, 'learning_rate': 0.0001340909090909091, 'loss': 0.5398, 'step': 30}, {'epoch': 0.45714285714285713, 'grad_norm': 0.7794923782348633, 'learning_rate': 0.00011136363636363636, 'loss': 0.4716, 'step': 40}, {'epoch': 0.5714285714285714, 'grad_norm': 0.2082739770412445, 'learning_rate': 8.863636363636364e-05, 'loss': 0.4515, 'step': 50}, {'epoch': 0.6857142857142857, 'grad_norm': 0.2679755687713623, 'learning_rate': 6.59090909090909e-05, 'loss': 0.4963, 'step': 60}, {'epoch': 0.8, 'grad_norm': 0.21189631521701813, 'learning_rate': 4.318181818181819e-05, 'loss': 0.4963, 'step': 70}, {'epoch': 0.9142857142857143, 'grad_norm': 0.3668287694454193, 'learning_rate': 2.0

### Save the model

In [9]:
import shutil
trainer.model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

mlflow.log_artifact(model_output_dir, artifact_path="adapters")  

mlflow.end_run()  # End the MLflow run
print(f"Model and tokenizer saved to {model_output_dir}")

shutil.rmtree(model_output_dir)

loading configuration file config.json from cache at C:\Users\Admin\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.3\snapshots\0d4b76e1efeb5eb6f6b5e757c79870472e04bd3a\config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.0",
  "use_cache": true,
  "vocab_size": 32768
}

chat template saved in ../output/Approach_1_Multiple_Schemas/adapters/epoch-3\chat_template.jinja
tokenizer config file saved in ../output/Approach_1_Multiple

Model and tokenizer saved to ../output/Approach_1_Multiple_Schemas/adapters/epoch-3


In [10]:
#! python -m mlflow ui --backend-store-uri file:./mlruns