# Evaluating Each Fine-Tuned Model

In [1]:
import json
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from torch import autocast
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, logging
from trl import SFTTrainer, SFTConfig

# Read the Hugging Face access token from the environment rather than embedding
# it in the notebook: anything committed here is visible to every reader of the
# file. When HF_TOKEN is unset this is None and no explicit token is passed.
# NOTE(review): the token that used to be hard-coded on this line should be
# treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

# Directories used for fine-tuned checkpoints and logs.
output_dir = "./results"
logging_dir = "./logs"

# Base models whose fine-tuned variants are evaluated below.
model_one_name = "google/gemma-2b-it" # already instruction fine-tuned
model_two_name = "microsoft/phi-2" # focused on code, chat and QA tasks
model_three_name = "mistralai/Mistral-7B-v0.1" # only pretrained, but much larger (7B params)

In [2]:
dataset_name = "Clinton/Text-to-sql-v1"

# Keep the first 25,000 rows of the train split, then carve out ten rows
# (indices 20000-20009) to use as the evaluation set.
dataset = load_dataset(dataset_name, split="train").select(range(25000))
test_set = dataset.select(range(20000, 20010))

# Show one evaluation example to confirm the record layout.
print(test_set[0])

{'instruction': 'What fraction has parentheses of 0.(3)?', 'input': 'CREATE TABLE table_36623 (\n    "Fraction" text,\n    "Ellipsis" text,\n    "Vinculum" text,\n    "Dots" text,\n    "Parentheses" text\n)', 'response': 'SELECT "Fraction" FROM table_36623 WHERE "Parentheses" = \'0.(3)\'', 'source': 'wikisql', 'text': 'Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables. ### Instruction: What fraction has parentheses of 0.(3)? ### Input: CREATE TABLE table_36623 (\n    "Fraction" text,\n    "Ellipsis" text,\n    "Vinculum" text,\n    "Dots" text,\n    "Parentheses" text\n) ### Response: SELECT "Fraction" FROM table_36623 WHERE "Parentheses" = \'0.(3)\''}


## Defining Functions

In [3]:
def format_prompt_gemma(instruction, input_text):
    """Build a Gemma chat-style prompt.

    The instruction and input text are placed in a single user turn, followed
    by an opened model turn for the model to complete.

    Args:
        instruction: Natural-language task description.
        input_text: Additional context for the task (here, the table schema).

    Returns:
        The formatted prompt string.
    """
    template = "<start_of_turn>user\n{}\n\n{}\n<end_of_turn>\n<start_of_turn>model\n"
    return template.format(instruction, input_text)

def format_prompt_phi2(instruction, input_text):
    """Build an "Instruct / Input / Output" prompt for Phi-2.

    Args:
        instruction: Natural-language task description.
        input_text: Additional context for the task (here, the table schema).

    Returns:
        The formatted prompt string, ending with "Output:" so the model
        continues with its answer.
    """
    prompt_lines = [
        f"Instruct: {instruction}",
        f"Input: {input_text}",
        "Output:",
    ]
    return "\n".join(prompt_lines)

def format_prompt_mistral(instruction, input_text):
    """Build a "### Instruction / ### Input / ### Response" prompt for Mistral.

    Args:
        instruction: Natural-language task description.
        input_text: Additional context for the task (here, the table schema).

    Returns:
        The formatted prompt string, ending with an empty response section
        for the model to fill in.
    """
    sections = (
        f"### Instruction:\n{instruction}",
        f"### Input:\n{input_text}",
        "### Response:\n",
    )
    # Sections are separated by one blank line.
    return "\n\n".join(sections)

def get_model_prompt(instruction, input_text, model_name):
    """Format a prompt using the template that matches ``model_name``.

    Args:
        instruction: Natural-language task description.
        input_text: Additional context for the task.
        model_name: One of "gemma", "phi2" or "mistral".

    Returns:
        The prompt string produced by the matching ``format_prompt_*`` helper.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Reject unknown names up front so the happy path below stays flat.
    if model_name not in ("gemma", "phi2", "mistral"):
        raise ValueError(f"Unsupported model: {model_name}")

    if model_name == "gemma":
        return format_prompt_gemma(instruction, input_text)
    if model_name == "phi2":
        return format_prompt_phi2(instruction, input_text)
    return format_prompt_mistral(instruction, input_text)

In [6]:
def _after_gemma_model_turn(full_output):
    """Return the text that follows the start of Gemma's model turn."""
    marker = "<start_of_turn>model"
    if marker in full_output:
        # Turn markers survived decoding: split on the explicit marker.
        return full_output.split(marker, 1)[1]

    # Turn markers were stripped during decoding, leaving a bare "model" line.
    # Match only a line that is exactly "model", so that the word appearing
    # inside the instruction, the schema or the SQL itself is not mistaken
    # for the turn boundary.
    lines = full_output.splitlines(keepends=True)
    for index, line in enumerate(lines):
        if line.strip() == "model":
            return "".join(lines[index + 1:])

    # No turn boundary found: treat the whole text as the response.
    return full_output


def extract_model_response(full_output, model_name):
    """Strip the echoed prompt from a decoded generation.

    Args:
        full_output: Decoded model output (prompt followed by the completion).
        model_name: One of "gemma", "phi2" or "mistral"; selects which prompt
            delimiter is used to locate the start of the response.

    Returns:
        The response text with surrounding whitespace removed. If the expected
        delimiter is absent, the whole (stripped) output is returned.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "gemma":
        response = _after_gemma_model_turn(full_output).strip()

        # Optional: strip any <end_of_turn> after the response
        response = response.replace("<end_of_turn>", "").strip()

    elif model_name == "phi2":
        # Expecting: "Output: [response]"
        if "Output:" in full_output:
            response = full_output.split("Output:", 1)[1].strip()
        else:
            response = full_output.strip()

    elif model_name == "mistral":
        # Expecting: "### Response:\n[response]"
        if "### Response:" in full_output:
            response = full_output.split("### Response:", 1)[1].strip()
        else:
            response = full_output.strip()

    else:
        raise ValueError(f"Unsupported model name: {model_name}")

    return response

In [7]:
def attach_model_responses(test_data, model, tokenizer, model_name, response_key, max_tokens=100):
    """Generate one response per test example with the given model.

    Args:
        test_data: Iterable of examples, each with "instruction" and "input".
        model: Causal LM used for generation.
        tokenizer: Tokenizer paired with ``model``.
        model_name: "gemma", "phi2" or "mistral"; selects the prompt template
            and the response-extraction rule.
        response_key: Not used by this function; kept for the call signature.
        max_tokens: Maximum number of newly generated tokens per example.

    Returns:
        Dict mapping the example's position (int, starting at 0) to the
        extracted response text.
    """
    collected = {}
    for idx, row in enumerate(test_data):
        print(f"Example {idx+1}")

        # Build the model-specific prompt and move its tensors to the model's device.
        prompt_text = get_model_prompt(row["instruction"], row["input"], model_name)
        encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)

        # Sampling configuration (stochastic: results vary between runs).
        sampling_options = {
            "max_new_tokens": max_tokens,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.95,
            "pad_token_id": tokenizer.eos_token_id,
        }

        # Inference only: no gradients, bfloat16 autocast on CUDA.
        with torch.no_grad(), autocast(device_type="cuda", dtype=torch.bfloat16):
            generated = model.generate(**encoded, **sampling_options)

        # Decode the first returned sequence, then extract the response from it.
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        collected[idx] = extract_model_response(decoded, model_name)

    return collected

In [8]:
# Each checkpoint directory is "<output_dir>/<base model name>".
gemma_path = "/".join((output_dir, model_one_name))
phi_path = "/".join((output_dir, model_two_name))
mistral_path = "/".join((output_dir, model_three_name))

In [9]:
# Quantization settings shared by all three model loads below:
# 4-bit NF4 weights, bfloat16 compute, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

## Gemma Response Generation

In [19]:
# Load the Gemma checkpoint from the local results directory, quantized with
# the shared 4-bit config and placed on the GPU.
gemma_model = AutoModelForCausalLM.from_pretrained(
    gemma_path,
    token=hf_token,
    quantization_config=bnb_config,
    device_map="cuda"
)

# The tokenizer is resolved from the base model name rather than the local
# checkpoint, so pass the same token as the model load; otherwise a repository
# that requires authentication cannot be resolved with these credentials.
gemma_tokenizer = AutoTokenizer.from_pretrained(model_one_name, token=hf_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# Generate Gemma's answers for the evaluation set.
gemma_responses = attach_model_responses(
    test_set,
    gemma_model,
    gemma_tokenizer,
    model_name="gemma",
    response_key="gemma_response",
)

# Persist the answers; the scoring step later reloads this file and looks
# responses up by string index.
with open("gemma_responses.json", "w") as f:
    f.write(json.dumps(gemma_responses, indent=2))

Example 1
Example 2
Example 3
Example 4
Example 5
Example 6
Example 7
Example 8
Example 9
Example 10


In [50]:
# Spot-check: show the response generated for the first evaluation example.
print(gemma_responses[0])

SELECT "Fraction" FROM table_36623 WHERE "Parentheses" = '0.(3)'


## Phi-2 Response Generation

In [10]:
# Load the Phi-2 checkpoint from the local results directory, quantized with
# the shared 4-bit config and placed on the GPU.
phi_model = AutoModelForCausalLM.from_pretrained(
    phi_path,
    device_map="cuda",
    quantization_config=bnb_config,
    token=hf_token,
)

# The tokenizer comes from the base model name, not the local checkpoint.
phi_tokenizer = AutoTokenizer.from_pretrained(model_two_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Generate Phi-2's answers for the evaluation set (the "phi2" name selects the
# Instruct/Input/Output prompt template and matching response extraction).
phi_responses = attach_model_responses(test_set, phi_model, phi_tokenizer, "phi2", "phi2_response")

Example 1
Example 2
Example 3
Example 4
Example 5
Example 6
Example 7
Example 8
Example 9
Example 10


In [18]:
# Persist Phi-2's answers so the scoring step can reload them from disk.
with open("phi_responses.json", "w") as f:
    f.write(json.dumps(phi_responses, indent=2))

In [15]:
# Spot-check: show the response generated for the first evaluation example.
print(phi_responses[0])

SELECT "Fraction" FROM table_36623 WHERE "Parentheses" = '0.(3)' AND "Ellipsis" = '\u00a3\u00a3\u00a3\u00a3\u00a3\u00a3\u00a3' AND "Dots" = '\u00a3\u00a3\u00a3\u00a3\u00a3\u00


## Mistral Response Generation

In [16]:
# Load the Mistral checkpoint from the local results directory, quantized with
# the shared 4-bit config and placed on the GPU.
mistral_model = AutoModelForCausalLM.from_pretrained(
    mistral_path,
    token=hf_token,
    quantization_config=bnb_config,
    device_map="cuda"
)

# The tokenizer is resolved from the base model name rather than the local
# checkpoint, so pass the same token as the model load; otherwise a repository
# that requires authentication cannot be resolved with these credentials.
mistral_tokenizer = AutoTokenizer.from_pretrained(model_three_name, token=hf_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Generate Mistral's answers for the evaluation set (the "mistral" name selects
# the "### Instruction / ### Response" template and matching extraction).
mistral_responses = attach_model_responses(test_set, mistral_model, mistral_tokenizer, "mistral", "mistral_response")

Example 1
Example 2
Example 3
Example 4
Example 5
Example 6
Example 7
Example 8
Example 9
Example 10


In [19]:
# Persist Mistral's answers so the scoring step can reload them from disk.
with open("mistral_responses.json", "w") as f:
    f.write(json.dumps(mistral_responses, indent=2))

In [20]:
# Spot-check: show the response generated for the first evaluation example.
print(mistral_responses[0])

SELECT "Fraction" FROM table_36623 WHERE "Parentheses" = '0.(3)'


# Evaluating Responses

In [33]:
def get_query(entry, model_response):
    """Build the grading prompt sent to the judge model.

    Args:
        entry: Test example with "instruction", "input" and "response" keys,
            where "response" is the reference answer.
        model_response: The answer produced by the model being evaluated.

    Returns:
        A prompt that shows the task, the reference answer and the model's
        answer, and asks for a 0-100 integer score with no other text.
    """
    task = entry['instruction']
    context = entry['input']
    reference = entry['response']

    parts = [
        f"Instruction:\n{task}\n\n",
        f"Input:\n{context}\n\n",
        f"Correct Answer:\n{reference}\n\n",
        f"Model's Answer:\n{model_response}\n\n",
        "Score the model's answer from 0 to 100 based on how correct and complete it is. ",
        "Only respond with the score as an integer (no other text).",
    ]
    return "".join(parts)


import urllib.request as req    
def query_model(prompt, model="gemma3:4b", url="http://localhost:11434/api/chat"):
    """Send a single-turn chat request and return the concatenated reply text.

    The request is POSTed as JSON with fixed generation options (seed 123,
    temperature 0, context size 5000). The response body is read as
    newline-delimited JSON chunks, and each chunk's ``message.content`` is
    appended to the result. NOTE(review): the default URL looks like a local
    Ollama server; confirm against the deployment.

    Args:
        prompt: User message content.
        model: Model identifier sent in the request body.
        url: Chat endpoint to POST to.

    Returns:
        The reply text assembled from all streamed chunks.

    Raises:
        RuntimeError: If a chunk carries an "error" field.
    """
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "options": {
            "seed": 123,
            "temperature": 0,
            "num_ctx": 5000
        }
    }

    payload = json.dumps(data).encode("utf-8")

    request = req.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    pieces = []
    with req.urlopen(request) as response:
        # readline() returns b"" at end of stream, which ends the iteration.
        for raw_line in iter(response.readline, b""):
            line = raw_line.decode("utf-8").strip()
            if not line:
                # Blank separator lines carry no JSON; skip them.
                continue
            chunk = json.loads(line)
            if "error" in chunk:
                # Surface server-side failures instead of a bare KeyError.
                raise RuntimeError(f"Model server returned an error: {chunk['error']}")
            # Chunks without message content contribute nothing.
            pieces.append(chunk.get("message", {}).get("content", ""))
    return "".join(pieces)


def generate_model_scores(test_set, model_responses, model='gemma3:4b'):
    """Ask the judge model to score each model response against its reference.

    Args:
        test_set: Iterable of test examples passed to ``get_query``.
        model_responses: Responses indexed by example position. Keys may be
            strings (as after a JSON round trip) or ints (the in-memory dict
            or a list).
        model: Judge model identifier forwarded to ``query_model``.

    Returns:
        List of integer scores. Examples whose judge reply cannot be parsed
        as an integer are reported and skipped, so the list may be shorter
        than ``test_set``.
    """
    scores = []
    for i, entry in enumerate(test_set):
        # JSON object keys are strings; fall back to the int index otherwise.
        key = str(i) if str(i) in model_responses else i
        prompt = get_query(entry, model_response=model_responses[key])
        print(f"Scoring example {i+1}")
        score = query_model(prompt, model)
        try:
            scores.append(int(score.strip().split()[0]))
        except (ValueError, IndexError):
            # IndexError covers an empty or whitespace-only reply.
            print(f"⚠️ Could not convert score: {score}")
            continue
    return scores

In [34]:
# Reload Gemma's saved answers (keys are string indices after the JSON round trip).
with open("gemma_responses.json", "r") as f:
    gemma_data = json.loads(f.read())

# Score every answer with the judge model.
gemma_scores = generate_model_scores(test_set, model_responses=gemma_data)

Scoring example 1
Scoring example 2
Scoring example 3
Scoring example 4
Scoring example 5
Scoring example 6
Scoring example 7
Scoring example 8
Scoring example 9
Scoring example 10


In [36]:
# Per-example judge scores for Gemma.
print(gemma_scores)

[100, 100, 100, 100, 0, 100, 100, 100, 100, 40]


In [38]:
# Reload Phi-2's saved answers (keys are string indices after the JSON round trip).
with open("phi_responses.json", "r") as f:
    phi_data = json.loads(f.read())

# Score every answer with the judge model.
phi_scores = generate_model_scores(test_set, model_responses=phi_data)

Scoring example 1
Scoring example 2
Scoring example 3
Scoring example 4
Scoring example 5
Scoring example 6
Scoring example 7
Scoring example 8
Scoring example 9
Scoring example 10


In [39]:
# Per-example judge scores for Phi-2.
print(phi_scores)

[100, 80, 0, 100, 0, 100, 90, 30, 0, 20]


In [40]:
# Reload Mistral's saved answers (keys are string indices after the JSON round trip).
with open("mistral_responses.json", "r") as f:
    mistral_data = json.loads(f.read())

# Score every answer with the judge model.
mistral_scores = generate_model_scores(test_set, model_responses=mistral_data)

Scoring example 1
Scoring example 2
Scoring example 3
Scoring example 4
Scoring example 5
Scoring example 6
Scoring example 7
Scoring example 8
Scoring example 9
Scoring example 10


In [41]:
# Per-example judge scores for Mistral.
print(mistral_scores)

[100, 60, 100, 100, 100, 100, 100, 30, 95, 20]


In [42]:
# Average over the scores actually collected rather than a fixed 10:
# generate_model_scores skips replies it cannot parse, so a list may hold
# fewer entries than the test set. An empty list reports nan.
for label, scores in (
    ("Gemma", gemma_scores),
    ("Phi", phi_scores),
    ("Mistral", mistral_scores),
):
    average = sum(scores) / len(scores) if scores else float("nan")
    print(f"{label} average score: {average}")

Gemma average score: 84.0
Phi average score: 52.0
Mistral average score: 80.5
