# 📊 Model Evaluation on instruct_dataset.jsonl

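This notebook evaluates a fine-tuned model on its original training dataset, `instruct_dataset.jsonl`. Because these are the same samples the model was trained on, the resulting score measures how well the model fits the training data, not how well it generalizes to unseen inputs.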

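For each sample, the model's generated text is parsed as JSON and compared for exact equality with the ground-truth JSON output; generations that are not valid JSON are counted separately as parse failures.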

In [None]:
!pip install transformers datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory holding the fine-tuned checkpoint (adjust path if needed).
model_path = "./checkpoints/final"

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The tokenizer and the causal-LM weights are both read from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)


In [None]:
import json

# Load instruct_dataset.jsonl: every non-empty line is parsed as one JSON document.
# Blank or whitespace-only lines are skipped so that stray empty lines in the file
# do not abort the whole load with a JSONDecodeError.
with open("./dataset/instruct_dataset.jsonl", "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(dataset)} samples")


In [None]:
from tqdm import tqdm

# Evaluation counters:
#   correct - generations whose parsed JSON equals the reference output
#   total   - samples processed
#   failed  - generations that could not be parsed as JSON
correct = 0
total = 0
failed = 0

for item in tqdm(dataset):
    # The prompt is the instruction and the input separated by one blank line.
    prompt = f"{item['instruction']}\n\n{item['input']}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation. Decode only the newly generated tokens; otherwise the prompt
    # text is fed to json.loads and practically every sample fails to parse.
    prompt_len = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Acceptable reference values. The dataset schema is not visible in this
    # notebook: if `output` is stored as serialized JSON text rather than as an
    # already-parsed object, also accept its parsed form so that a parsed
    # prediction can match it. The raw value is always kept as a candidate.
    reference = item["output"]
    acceptable = [reference]
    if isinstance(reference, str):
        try:
            acceptable.append(json.loads(reference))
        except json.JSONDecodeError:
            pass  # reference is plain text, not JSON; compare against it as-is

    try:
        predicted_json = json.loads(generated)
    except json.JSONDecodeError:
        # JSON parse failure. Only decoding errors are counted here, so that
        # KeyboardInterrupt and genuine runtime errors still propagate.
        failed += 1
    else:
        if predicted_json in acceptable:
            correct += 1

    total += 1

print(f"Total: {total}")
print(f"Correct JSON Matches: {correct}")
print(f"Parse Failures: {failed}")
# Guard against an empty dataset, where the ratio is undefined.
if total:
    print(f"Accuracy: {correct / total:.2%}")
else:
    print("Accuracy: n/a (no samples evaluated)")
