### Evaluation

In [1]:
from transformers import BitsAndBytesConfig
# --- Model and path configuration ---

# Base model identifier passed to from_pretrained when the model is loaded.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Input data (JSON Lines) and output locations, all relative to this notebook.
training_data = "../data/combinations.jsonl"
checkpoint_dir = "../output/checkpoints"
model_output_dir = "../output/final_adapter"

# bitsandbytes settings: 4-bit NF4 weights, double quantization, float16 compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [2]:
! nvidia-smi

Mon Jul 28 15:13:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.40                 Driver Version: 576.40         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 5070 Ti   WDDM  |   00000000:01:00.0 Off |                  N/A |
|  0%   43C    P8             19W /  300W |    7204MiB /  16303MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
from datasets import load_dataset, Dataset, DatasetDict


# Load the dataset and split it into train, validation and test sets.
#
# Every random step is seeded so that the same rows land in the same split on
# every kernel restart; without a seed, train_test_split reshuffles differently
# on each run and the "test" rows change from run to run.
# NOTE(review): the evaluation is only leak-free if the training run used the
# exact same split procedure and seed -- confirm against the training notebook.
SPLIT_SEED = 42

dataset = load_dataset("json", data_files=training_data, split='train')
shuffled_dataset = dataset.shuffle(seed=SPLIT_SEED)

# Hold out 30% of the rows for validation + test.
train_temp_split = shuffled_dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
temp_dataset = train_temp_split['test']

# Split the held-out rows 2:1 -> ~20% of all rows for validation, ~10% for test.
validation_test_split = temp_dataset.train_test_split(test_size=1/3, seed=SPLIT_SEED)

split_datasets = DatasetDict({
    'train': train_temp_split['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model with the notebook's quantization settings.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

# The tokenizer and the adapter are both read from the adapter output directory.
tokenizer = AutoTokenizer.from_pretrained(model_output_dir)
model = PeftModel.from_pretrained(base_model, model_output_dir)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import json
import re
import numpy as np
import torch
from jsonschema import validate, ValidationError
from transformers import StoppingCriteria, StoppingCriteriaList


# Function to extract a JSON object from a string
def find_and_parse_json(text: str) -> "dict | None":
    """
    Parse the JSON object that starts at the first '{' in ``text``.

    The JSON decoder itself decides where the object ends, so braces that
    appear inside string values (e.g. ``{"a": "}"}``) do not cut the object
    short, and any text that follows the object is ignored.

    Args:
        text: Arbitrary text that may contain a JSON object.

    Returns:
        The decoded object as a dict, or None when ``text`` contains no '{'
        or the text starting at the first '{' is not a complete, valid JSON
        object.
    """
    first_brace_index = text.find('{')
    if first_brace_index == -1:
        return None  # No JSON object found

    try:
        # raw_decode parses one JSON value starting at the given index and
        # tolerates trailing text; the end offset it also returns is not needed.
        parsed, _ = json.JSONDecoder().raw_decode(text, first_brace_index)
    except json.JSONDecodeError:
        # Truncated or otherwise malformed JSON after the first brace
        return None
    return parsed


# Validate a parsed JSON value against a JSON Schema
def is_valid_json_schema(json_generated, schema):
    """
    Report whether ``json_generated`` conforms to ``schema``.

    Args:
        json_generated: Already-parsed JSON value to check.
        schema: Already-parsed JSON Schema to validate against.

    Returns:
        True when validation succeeds. False when the instance violates the
        schema, or when ``schema`` itself is not a valid JSON Schema (so a
        malformed schema cannot abort the caller's loop).
    """
    # SchemaError is not among the names imported at file level.
    from jsonschema.exceptions import SchemaError

    try:
        validate(instance=json_generated, schema=schema)
        return True
    except (ValidationError, SchemaError):
        return False
    
# Score how many top-level fields of two parsed JSON objects carry equal values
def compare_json_field_values(json_ref, json_gen):
    """
    Return the fraction of top-level fields whose values match.

    A field counts as a match when its key is present in both objects and the
    two values compare equal. The denominator is the larger of the two
    objects' field counts.

    Args:
        json_ref: Reference object.
        json_gen: Object being scored.

    Returns:
        Matches divided by the larger field count, or 0 when both objects
        are empty.
    """
    field_count = max(len(json_ref), len(json_gen))
    if field_count <= 0:
        return 0

    equal_count = sum(
        1
        for name in json_ref
        if name in json_gen and json_ref[name] == json_gen[name]
    )
    return equal_count / field_count

def get_result(valid_json, valid_json_schema, comparison_results):
    """
    Summarise the collected per-row scores as a JSON string of percentages.

    Args:
        valid_json: Per-row flags, True when the output parsed as JSON.
        valid_json_schema: Per-row flags, True when the output passed schema
            validation.
        comparison_results: Per-row field-match scores.

    Returns:
        A JSON string with the keys "valid_json", "valid_json_schema" and
        "comparison_results", each holding the mean of the corresponding list
        multiplied by 100. A metric whose list is empty is reported as null
        rather than NaN, which is not valid JSON and triggers a NumPy
        "mean of empty slice" warning.
    """
    def _mean_percentage(values):
        # No samples yet: report "no data" instead of computing a NaN mean.
        if len(values) == 0:
            return None
        return float(np.mean(values)) * 100

    return json.dumps({
        "valid_json": _mean_percentage(valid_json),
        "valid_json_schema": _mean_percentage(valid_json_schema),
        "comparison_results": _mean_percentage(comparison_results)
    })


def test_model(model, tokenizer, test_dataset):
    """
    Generate an answer for every row of ``test_dataset`` and score it.

    For each row the first message's content is wrapped in the instruction
    template and sent to ``model.generate``. Three metrics are collected:

    * valid_json: whether a JSON object could be parsed from the generation.
    * valid_json_schema: whether that object validates against the JSON
      object found in the prompt (recorded only when the prompt contains one).
    * comparison_results: field-level match against the JSON object found in
      the second message (recorded only when that message contains one).

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        test_dataset: Iterable of rows with a ``messages`` list whose first
            two entries each have a ``content`` field.

    Returns:
        The summary produced by ``get_result`` over all rows.
    """
    torch.cuda.empty_cache()

    valid_json = []
    valid_json_schema = []
    comparison_results = []

    stop_words = ["[INST]"]
    # Only the first token id of each stop word is used as a stop id.
    stop_word_ids = [tokenizer(stop_word, add_special_tokens=False).input_ids[0] for stop_word in stop_words]
    stop_word_ids.append(tokenizer.eos_token_id)  # Add EOS token ID to the stop words
    print(f"Tokens to stop on: {stop_words}")
    print(f"Corresponding Token IDs: {stop_word_ids}")

    for i, row in enumerate(test_dataset):

        print(f"Processing row {i+1}/{len(test_dataset)}")

        user_content = row['messages'][0]['content']
        # NOTE(review): the prompt already starts with "<s>" and the tokenizer
        # is called with its default special-token handling -- check whether
        # this yields a duplicated BOS token.
        prompt = f"<s>[INST] {user_content} [/INST]"
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        # NOTE(review): sampling is enabled and no seed is set in this
        # function, so scores can vary between runs.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=1000,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                eos_token_id=stop_word_ids
            )

        # Decode only the newly generated tokens. Parsing no longer depends on
        # finding "[/INST]" in the decoded text, so every row gets fresh values
        # instead of a NameError or a stale result from the previous row.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

        generated_json = find_and_parse_json(generated_text)
        gold_json = find_and_parse_json(row['messages'][1]['content'])
        # Read the schema from the prompt content itself, never from model output.
        schema = find_and_parse_json(user_content)

        valid_json.append(generated_json is not None)

        if schema is not None:
            valid_json_schema.append(
                generated_json is not None and is_valid_json_schema(generated_json, schema)
            )

        if gold_json is not None:
            if generated_json is not None:
                comparison_score = compare_json_field_values(gold_json, generated_json)
            else:
                comparison_score = 0.0
            comparison_results.append(comparison_score)

        # Running totals after each row
        print(get_result(valid_json, valid_json_schema, comparison_results))

    return get_result(valid_json, valid_json_schema, comparison_results)
    

In [32]:
# Evaluate the adapter-equipped model on the held-out test split; the cell
# displays the aggregate metrics returned by test_model.
test_model(model, tokenizer, split_datasets["test"])

Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.


Tokens to stop on: ['[INST]']
Corresponding Token IDs: [3, 2]
Processing row 1/1
{'valid_json': np.float64(100.0), 'valid_json_schema': np.float64(100.0), 'comparison_results': np.float64(10.0)}


{'valid_json': np.float64(100.0),
 'valid_json_schema': np.float64(100.0),
 'comparison_results': np.float64(10.0)}