In [9]:
# Install the quantization / device-placement dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, and the notebook also imports transformers, peft,
# datasets, huggingface_hub and jsonschema, none of which are installed here — confirm
# they are provided by the environment. The kernel may need a restart after installing.
%pip install bitsandbytes accelerate

Note: you may need to restart the kernel to use updated packages.


### Log in to Hugging Face

In [1]:
from huggingface_hub import notebook_login

# Shows the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably needed so the checkpoint loaded later can be downloaded
# with the user's credentials — confirm the model actually requires authentication.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# 8-bit alternative kept for reference:
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     llm_int8_enable_fp32_cpu_offload=True
# )

# 4-bit NF4 quantization with double quantization; compute happens in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

# trust_remote_code is deliberately not passed: it allows code shipped in the model
# repository to run locally, and this checkpoint uses an architecture that is
# implemented in transformers itself, so the flag is not needed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

# Prepare the quantized model for k-bit (QLoRA-style) training before adapters are attached.
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# next(model.parameters()).dtype

# for name, module in model.named_modules():
#     print(name)

### Configure PEFT

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapters on the four attention projection layers.
# task_type="CAUSAL_LM" makes get_peft_model return the causal-LM specific PEFT
# wrapper instead of the generic one, so the labels/generation handling matches the
# base model type.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
# NOTE: this rebinds `model` to the wrapped model; re-running the cell without
# reloading the base model would wrap the already-wrapped model again.
model = get_peft_model(model, config)
model.print_trainable_parameters()


trainable params: 13,631,488 || all params: 7,261,655,040 || trainable%: 0.1877


In [5]:
# Sanity check: number of entries in the model's peft_config (the recorded output is 1).
len(model.peft_config)

1

### Load Dataset

In [6]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoConfig


# Read the JSON-lines file and take the "train" split returned by the loader.
raw_dataset = load_dataset("json", data_files="../data/combinations.jsonl")["train"]

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Use the EOS token for padding so that padding="max_length" can be applied below.
# NOTE(review): presumably the tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this rebinds `config`, which held the LoraConfig in the PEFT cell;
# the value is not used by any of the cells visible here.
config = AutoConfig.from_pretrained(model_name)
# Token length every example is padded or truncated to by tokenize_with_loss_mask.
MAX_LENGTH = 1500


def tokenize_with_loss_mask(example):
    """Tokenize one chat example and build labels that supervise only assistant text.

    The conversation in ``example["messages"]`` is rendered with the tokenizer's chat
    template and split on the ``</s>`` marker. Inside each piece, everything up to and
    including ``[/INST]`` is treated as prompt and masked with ``-100``; the text
    between ``[/INST]`` and ``</s>`` keeps its token ids as labels.

    Relies on the notebook-level ``tokenizer`` and ``MAX_LENGTH``.

    Args:
        example: Mapping with a ``"messages"`` entry accepted by
            ``tokenizer.apply_chat_template``.

    Returns:
        The tokenizer output for the rendered chat (padded/truncated to
        ``MAX_LENGTH``) with an extra ``"labels"`` list of the same length as
        ``"input_ids"``.
    """
    eos_marker = "</s>"
    inst_end_marker = "[/INST]"

    def token_ids(text):
        # Per-piece tokenization must not inject BOS/EOS of its own.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    chat_str = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    tokenized = tokenizer(chat_str, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    input_ids = tokenized["input_ids"]

    labels = []
    seq_start_idx = 0  # start of the current prompt/reply piece
    total_length = len(chat_str)

    while seq_start_idx < total_length:
        # Check the raw find() result: adding the marker length first would hide the
        # -1 "not found" sentinel and make the loop restart from the beginning forever.
        eos_idx = chat_str.find(eos_marker, seq_start_idx)
        if eos_idx == -1:
            # Trailing text without a closing marker (e.g. a chat that ends on a user
            # turn): there is no completed reply to learn from, so mask it and stop.
            labels.extend([-100] * len(token_ids(chat_str[seq_start_idx:])))
            break

        seq_end_idx = eos_idx + len(eos_marker)
        sequence = chat_str[seq_start_idx:seq_end_idx]
        seq_tokens = token_ids(sequence)

        inst_idx = sequence.find(inst_end_marker)
        if inst_idx == -1:
            # No prompt/reply boundary in this piece: mask it entirely rather than
            # guessing where the reply starts.
            labels.extend([-100] * len(seq_tokens))
        else:
            end_user_idx = inst_idx + len(inst_end_marker)  # end of user message
            end_assistant_idx = eos_idx - seq_start_idx  # position of "</s>" in the piece
            user_tokens = token_ids(sequence[:end_user_idx])
            assistant_tokens = token_ids(sequence[end_user_idx:end_assistant_idx])

            labels.extend([-100] * len(user_tokens))
            labels.extend(assistant_tokens)
            # Whatever remains of the piece (the closing marker) is masked.
            # NOTE(review): this means the model gets no training signal for emitting
            # "</s>" at the end of a reply — confirm that this is intended.
            labels.extend([-100] * (len(seq_tokens) - len(user_tokens) - len(assistant_tokens)))

        seq_start_idx = seq_end_idx

    # The main tokenizer call may add leading special tokens that the per-piece calls
    # do not produce; mask those positions so labels line up with the full encoding.
    # Assumes any extra tokens are at the front — TODO confirm for other tokenizers.
    full_length = len(tokenizer(chat_str)["input_ids"])
    labels = [-100] * (full_length - len(labels)) + labels

    target_length = len(input_ids)
    if full_length > target_length:
        print("Warning: Input sequence exceeds max length for inputs, truncating.")
    if len(labels) > target_length:
        print("Warning: Input sequence exceeds max length for labels, truncating.")
        # Cut the same end the tokenizer cut from input_ids.
        if tokenizer.truncation_side == "left":
            labels = labels[-target_length:]
        else:
            labels = labels[:target_length]

    # Pad on the same side the tokenizer padded input_ids.
    padding = [-100] * (target_length - len(labels))
    if tokenizer.padding_side == "left":
        labels = padding + labels
    else:
        labels = labels + padding

    tokenized["labels"] = labels
    if len(tokenized["input_ids"]) != len(tokenized["labels"]):
        print("Error: Input and label lengths do not match after processing.")

    return tokenized


# Tokenize every example and drop the original columns, so the resulting dataset
# only carries what tokenize_with_loss_mask returns (input_ids, attention_mask, labels).
tokenized_dataset = raw_dataset.map(tokenize_with_loss_mask, remove_columns=raw_dataset.column_names)
print(tokenized_dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})


In [7]:
# from datasets import load_dataset, Dataset
# from transformers import AutoTokenizer


# raw_dataset = load_dataset("json", data_files="../data/combinations.jsonl")["train"]


# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# tokenizer.pad_token = tokenizer.eos_token

# def tokenize(example):
#     model_inputs = tokenizer(
#         tokenizer.apply_chat_template(example["messages"], tokenize=False),
#         truncation=True,
#         padding="max_length",
#         max_length=1024
#     )
#     model_inputs["labels"] = model_inputs["input_ids"].copy()
#     return model_inputs

# tokenized_dataset = raw_dataset.map(tokenize)

# tokenized_dataset

### Tune the model

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for the fine-tuning run.
# NOTE(review): the recorded output below logs every 0.02 epoch, which does not match
# logging_steps=10 with these batch settings on the 100-row dataset — the output
# appears to come from a run with different settings.
# NOTE(review): with 100 rows, batch size 1, 8 accumulation steps and 3 epochs the run
# has far fewer than 500 optimizer steps, so save_steps=500 would likely never trigger
# an intermediate checkpoint — confirm.
training_args = TrainingArguments(
    per_device_train_batch_size=1,  # alternative value noted by the author: 2
    gradient_accumulation_steps=8,  # alternative value noted by the author: 4
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,  # alternative value noted by the author: 10
    save_steps=500,
    output_dir="../output",
    label_names=["labels"],  # name of the dataset column that holds the labels
    disable_tqdm=False,
    gradient_checkpointing=True
)

# No data collator is passed; every example was already padded to a fixed length
# when the dataset was tokenized.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args
)

# Starts training; the recorded run below was stopped with a KeyboardInterrupt.
trainer.train()


{'loss': 1.0124, 'grad_norm': 2.8352882862091064, 'learning_rate': 0.0002, 'epoch': 0.02}
{'loss': 0.0497, 'grad_norm': 0.40480104088783264, 'learning_rate': 0.00019866666666666668, 'epoch': 0.04}
{'loss': 1.1566, 'grad_norm': 2.475431203842163, 'learning_rate': 0.00019733333333333335, 'epoch': 0.06}
{'loss': 0.6005, 'grad_norm': 2.8346757888793945, 'learning_rate': 0.000196, 'epoch': 0.08}
{'loss': 1.169, 'grad_norm': 1.6370095014572144, 'learning_rate': 0.0001946666666666667, 'epoch': 0.1}
{'loss': 0.4162, 'grad_norm': 0.7328026294708252, 'learning_rate': 0.00019333333333333333, 'epoch': 0.12}
{'loss': 0.9909, 'grad_norm': 1.8748282194137573, 'learning_rate': 0.000192, 'epoch': 0.14}
{'loss': 0.0014, 'grad_norm': 0.06207975372672081, 'learning_rate': 0.00019066666666666668, 'epoch': 0.16}
{'loss': 0.129, 'grad_norm': 0.6751952767372131, 'learning_rate': 0.00018933333333333335, 'epoch': 0.18}
{'loss': 0.9511, 'grad_norm': 1.1241636276245117, 'learning_rate': 0.000188, 'epoch': 0.2}
{'

KeyboardInterrupt: 

### Evaluate

In [None]:
import json
from jsonschema import validate, ValidationError

# Check if a string is valid JSON
def is_valid_json(s):
    """Return True when ``s`` parses as JSON, False when decoding fails."""
    try:
        json.loads(s)
    except json.JSONDecodeError:
        return False
    else:
        return True
    

# Validate a JSON string against a schema
def is_valid_json_schema(json_str, schema):
    """Return True when ``json_str`` parses as JSON and satisfies ``schema``.

    Decoding errors and schema validation errors both yield False.
    """
    try:
        parsed = json.loads(json_str)
        validate(instance=parsed, schema=schema)
    except (json.JSONDecodeError, ValidationError):
        return False
    return True
    
# Compare two JSON strings at field level and match field values
def compare_json_field_values(json_ref, json_gen):
    """Score how many top-level fields of two JSON objects hold equal values.

    Args:
        json_ref: Reference JSON string; must decode to a JSON object.
        json_gen: Generated JSON string; must decode to a JSON object.

    Returns:
        On success, a dict with:
          - "matching_fields": keys of the reference that also occur in the generated object,
          - "matching_values": how many of those keys have equal values,
          - "total_fields": the larger of the two objects' key counts,
          - "percentage": matching_values / total_fields (0 when both are empty).
        If either input is not valid JSON, or decodes to something other than an
        object, a dict with an "error" message and "percentage" 0.
    """
    try:
        ref_obj = json.loads(json_ref)
        gen_obj = json.loads(json_gen)
    except json.JSONDecodeError:
        return {"error": "Invalid JSON input.","percentage": 0}

    # Field-level comparison only makes sense for JSON objects. Valid JSON such as a
    # number, string or list would otherwise raise (len() of an int, list indexed by a
    # string) or be compared element-wise by accident.
    if not isinstance(ref_obj, dict) or not isinstance(gen_obj, dict):
        return {"error": "JSON input is not an object.", "percentage": 0}

    matching_fields = [key for key in ref_obj if key in gen_obj]
    matching_values = sum(1 for key in matching_fields if ref_obj[key] == gen_obj[key])
    # Normalise by the larger object so extra or missing fields lower the score.
    total_fields = max(len(ref_obj), len(gen_obj))
    percentage = matching_values / total_fields if total_fields > 0 else 0
    return {
        "matching_fields": matching_fields,
        "matching_values": matching_values,
        "total_fields": total_fields,
        "percentage": percentage
    }



### Save the model