In [1]:
%pip install bitsandbytes accelerate
%pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Login to Huggingface

In [None]:
from huggingface_hub import notebook_login
import os

#notebook_login()
# Set the Hugging Face token as an environment variable
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load the model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

#Model configs

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4"
)

In [None]:

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config ,
    device_map="auto",
    trust_remote_code=True
)


model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# next(model.parameters()).dtype

# for name, module in model.named_modules():
#     print(name)

### Configure PEFT

In [10]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()


trainable params: 13,631,488 || all params: 7,261,655,040 || trainable%: 0.1877


In [11]:
len(model.peft_config)

1

### Load Dataset

In [12]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig

# load the dataset and split it into train, validation and test sets
dataset = load_dataset("json", data_files="../data/combinations.jsonl", split='train')
shuffled_dataset = dataset.shuffle(seed=42)
train_temp_split = shuffled_dataset.train_test_split(test_size=0.3) #30% for validation and test
temp_dataset = train_temp_split['test']
validation_test_split = temp_dataset.train_test_split(test_size=1/3)# 10% for validation and 20% for test
split_datasets = DatasetDict({
    'train': train_temp_split['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})


tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
config = AutoConfig.from_pretrained(model_name)
MAX_LENGTH = 1500


def tokenize_with_loss_mask(example):
    chat_str = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    tokenized = tokenizer(chat_str, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    input_ids = tokenized["input_ids"]
    labels = []

    seq_start_idx = 0 # start of sequence
    total_length = len(chat_str)

    while seq_start_idx < total_length:

        # print("-----------------------")
        seq_end_idx = chat_str.find("</s>", seq_start_idx)+ len("</s>") # end of sequence
        if seq_end_idx == -1:
            break

        sequence = chat_str[seq_start_idx:seq_end_idx]
        end_user_idx = sequence.find("[/INST]")+ len("[/INST]") # end of user message
        end_assistant_idx = sequence.find("</s>")
        user_content = sequence[:end_user_idx]
        assistant_content = sequence[end_user_idx:end_assistant_idx]

        seq_tokens = tokenizer(sequence, add_special_tokens=False)["input_ids"]
        user_tokens = tokenizer(user_content, add_special_tokens=False)["input_ids"]
        assistant_tokens = tokenizer(assistant_content, add_special_tokens=False)["input_ids"]
        
        labels.extend([-100] * len(user_tokens))
        labels.extend(assistant_tokens)
        labels.extend([-100] * (len(seq_tokens) - len(user_tokens) - len(assistant_tokens)))
        
        seq_start_idx = seq_end_idx

    labels = [-100] * (len(input_ids) - len(labels)) + labels  # Pad to max length

    tokenized["labels"] = labels
    if len(tokenized["input_ids"]) > MAX_LENGTH:
        print("Warning: Input sequence exceeds max length for inputs, truncating.")
    if len(tokenized["labels"]) > MAX_LENGTH:
        print("Warning: Input sequence exceeds max length for labels, truncating.")
    if len(tokenized["input_ids"]) != len(tokenized["labels"]):
        print("Error: Input and label lengths do not match after processing.")

    return tokenized

tokenized_dataset = split_datasets.map(tokenize_with_loss_mask, remove_columns=split_datasets["train"].column_names)
print(tokenized_dataset)


Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 70
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})


In [13]:
# from datasets import load_dataset, Dataset
# from transformers import AutoTokenizer


# raw_dataset = load_dataset("json", data_files="../data/combinations.jsonl")["train"]


# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# tokenizer.pad_token = tokenizer.eos_token

# def tokenize(example):
#     model_inputs = tokenizer(
#         tokenizer.apply_chat_template(example["messages"], tokenize=False),
#         truncation=True,
#         padding="max_length",
#         max_length=1024
#     )
#     model_inputs["labels"] = model_inputs["input_ids"].copy()
#     return model_inputs

# tokenized_dataset = raw_dataset.map(tokenize)

# tokenized_dataset

### Tune the model

In [None]:
from transformers import TrainingArguments, Trainer
import os

checkpoint_dir = "../output/checkpoints"
model_output_dir = "../output/final_adapter"

training_args = TrainingArguments(
    per_device_train_batch_size=1, # 1 sample per device due to GPU memory constraints
    per_device_eval_batch_size=1, # 1 sample per device due to GPU memory constraints
    gradient_accumulation_steps=8, # to accumulate gradients update over multiple steps to simulate larger batch sizes
    num_train_epochs=2,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,#10
    eval_strategy="epoch",
    save_strategy="epoch",
    # save_steps=500,
    metric_for_best_model="loss",
    load_best_model_at_end=True,
    output_dir=checkpoint_dir,
    label_names=["labels"],
    disable_tqdm=False, # enable tqdm progress bars
    gradient_checkpointing=True, # to train large models with limited GPU memory
    save_total_limit=4, # keep only the last 4 checkpoints to save space
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    args=training_args
)

resume_from_checkpoint = None
if os.path.exists(checkpoint_dir) and len(os.listdir(checkpoint_dir)) > 0:
    resume_from_checkpoint = checkpoint_dir
    print(f"Resuming training from checkpoint: {resume_from_checkpoint}")
else:
    print("No checkpoint found. Starting training from scratch.")


trainer.train()

trainer.model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

Epoch,Training Loss,Validation Loss
1,No log,0.484546
2,0.764600,0.479432


('../output/final_adapter\\tokenizer_config.json',
 '../output/final_adapter\\special_tokens_map.json',
 '../output/final_adapter\\chat_template.jinja',
 '../output/final_adapter\\tokenizer.model',
 '../output/final_adapter\\added_tokens.json',
 '../output/final_adapter\\tokenizer.json')

In [None]:
metrics = trainer.evaluate(eval_dataset=tokenized_dataset['test'])
print(metrics)


### Reload model and Test

In [None]:
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(model_name, 
                                                  quantization_config=quantization_config , 
                                                  device_map="auto")

# Load QLoRA adapter on top
model = PeftModel.from_pretrained(base_model, model_output_dir)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_output_dir)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
print(config.max_position_embeddings)

prompt = " <s><INST>You are a Json test data generator generating valid json data based on a json schema. Given the below schema,\n\n    ### Schema: {'$schema': 'http://json-schema.org/draft-07/schema#', 'title': 'X12 NM1 Segment', 'description': \"Schema for X12 NM1 segment, which contains subscriber's or organization name information.\", 'type': 'object', 'properties': {'entity_identifier_code': {'type': 'string', 'description': 'Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)', 'enum': ['IL', '03', 'PR', '1P']}, 'entity_type_qualifier': {'type': 'string', 'description': 'Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)', 'enum': ['1', '2']}, 'name_last_or_organizationName': {'type': 'string', 'description': 'Name Last or Organization Name', 'minLength': 1, 'maxLength': 60}, 'name_first': {'type': 'string', 'description': 'Name First', 'nullable': True, 'minLength': 1, 'maxLength': 35}, 'name_middle': {'type': 'string', 'description': 'Name Middle', 'nullable': True, 'minLength': 1, 'maxLength': 25}, 'name_prefix': {'type': 'string', 'description': 'Name Prefix', 'nullable': True, 'minLength': 1, 'maxLength': 10}, 'name_suffix': {'type': 'string', 'description': 'Name Suffix', 'nullable': True, 'minLength': 1, 'maxLength': 10}, 'identification_code_qualifier': {'type': 'string', 'description': 'Identification Code Qualifier (e.g., MI = Member ID, XX = National Provider Identifier, SV = Service Provider Number, FI = Federal Tax ID, PI = Payer Identifier, ZZ = Mutually Defined)', 'enum': ['MI', 'XX', 'SV', 'FI', 'PI', 'ZZ'], 'nullable': True}, 'identification_code': {'type': 'string', 'description': 'Identification Code', 'nullable': True, 'minLength': 2, 'maxLength': 80}, 'nm1_segment': {'type': 'string', 'description': 'NM1 Segment format of the data (e.g., NM1*IL*1*DOE*JOHN****MI*123456789~)', 'nullable': True, 'maxLength': 255}}, 'required': ['entity_identifier_code', 'entity_type_qualifier', 'name_last_or_organizationName', 'nm1_segment']}\n\n Generate a sample json that confirms to the given schema</INST>"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=1000,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id
    )
    
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Output: ", generated_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Output:  <INST>Generate a sample JSON for the given schema. {"$schema":"http://json-schema.org/draft-07/schema#","title":"X12 NM1 Segment","description":"Schema for X12 NM1 segment, which contains subscriber's or organization name information.","type":"object","properties":{"entity_identifier_code":{"type":"string","description":"Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)","enum":["IL","03","PR","1P"]},"entity_type_qualifier":{"type":"string","description":"Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)","enum":["1","2"]},"name_last_or_organizationName":{"type":"string","description":"Name Last or Organization Name","minLength":1,"maxLength":60},"name_first":{"type":"string","description":"Name First","nullable":true,"minLength":1,"maxLength":35},"name_middle":{"type":"string","description":"Name Middle","nullable":true,"minLength":1,"maxLength":25},"name_prefix":{"type":"string","description":"Name Prefix","nullable"

### Evaluate

In [None]:
import json
import re
import numpy as np
from jsonschema import validate, ValidationError

# function to extact JSON from a string
def extract_json_from_string(input_string):
    try:
        # Use a regular expression to find the first JSON object in the string
        json_match = re.search(r'\{.*?\}', input_string, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)  # Extract the JSON string
            return json.loads(json_str)  # Parse and return the JSON object
        else:
            print("No JSON object found in the string.")
            return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None


# Validate a JSON string against a schema
def is_valid_json_schema(json_generated, schema):
    try:
        validate(instance=json_generated, schema=schema)
        return True
    except (json.JSONDecodeError, ValidationError):
        return False
    
# Compare two JSON strings at field level and match field values
def compare_json_field_values(json_ref, json_gen):
    obj1=json_ref
    obj2=json_gen
    matching_fields = []
    matching_values = 0
    total_fields = max(len(obj1), len(obj2))
    for key in obj1:
        if key in obj2:
            matching_fields.append(key)
            if obj1[key] == obj2[key]:
                matching_values += 1
    score = matching_values
    percentage = score / total_fields if total_fields > 0 else 0
    result = {
        "matching_fields": matching_fields,
        "matching_values": score,
        "total_fields": total_fields,
        "percentage": percentage
    }
    return result['percentage']


def test_model(adapter_dir, test_dataset):

    valid_json = []
    valid_json_schema = []
    comparison_results = []
    
    base_model = AutoModelForCausalLM.from_pretrained(model_name, 
                                                    quantization_config=quantization_config , 
                                                    device_map="auto")
    
    model = PeftModel.from_pretrained(base_model, model_output_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_output_dir)

    for i, row in test_dataset.iterrows():

        prompt = f"<s><INST>{row['messages'][0]['content']}</INST>"
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=1000,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                eos_token_id=tokenizer.eos_token_id
            )

        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        
        if("</INST>" in generated_text):
            generated_json = extract_json_from_string(generated_text.split("<INST>")[-1].strip())
            gold_json = extract_json_from_string(row['messages'][1]['content'])
            schema = extract_json_from_string(generated_text.split("<INST>")[-2].strip())
        
        if generated_json is not None:
            valid_json.append(True)
        else:
            valid_json.append(False)

        
        if not is_valid_json_schema(generated_json, schema):
            valid_json_schema.append(False)
        else:
            valid_json_schema.append(True)
        
        comparison_score = compare_json_field_values(gold_json, generated_json)
        comparison_results.append(comparison_score)

    
    result = {
        "valid_json": np.mean(valid_json) * 100,
        "valid_json_schema": np.mean(valid_json_schema) * 100,
        "comparison_results": np.mean(comparison_results) * 100
    }
    
    return result
    

### Save the model