In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/test.jsonl
/kaggle/input/dataset/train.jsonl
/kaggle/input/lora-output-1/transformers/default/2/__huggingface_repos__.json
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/adapter_model.safetensors
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/merges.txt
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/adapter_config.json
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/README.md
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/tokenizer.json
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/vocab.json
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/tokenizer_config.json
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/chat_template.jinja
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/special_tokens_map.json
/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish/added_tokens.json
/kaggle/input/lora-output-1/

In [2]:
import json
import torch

def generate(model, tokenizer, prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    return text[len(prompt):].strip()

In [3]:
import json
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# -----------------------------
# Utils
# -----------------------------
DEVANAGARI_RE = re.compile(r"[\u0900-\u097F]")
LATIN_RE = re.compile(r"[A-Za-z]")

def is_valid_json(text):
    try:
        json.loads(text)
        return True
    except:
        return False

def build_prompt(example):
    return f"""<|system|>
You are an information extraction system.
Rules:
- Output ONLY valid JSON
- Do NOT add explanations
- Hindi words must be in Devanagari
- English words must remain Latin

<|user|>
Input: {example['input']}
Instruction: {example['instruction']}

<|assistant|>
"""

def script_compliance_score(predicted_text, gold_normalized_text):
    import re

    DEV_RE = re.compile(r'^[\u0900-\u097F]+$')
    LAT_RE = re.compile(r'^[A-Za-z]+$')

    pred_tokens = predicted_text.split()
    gold_tokens = gold_normalized_text.split()

    valid, total = 0, 0

    for p, g in zip(pred_tokens, gold_tokens):
        if not (DEV_RE.match(g) or LAT_RE.match(g)):
            continue

        total += 1
        if (DEV_RE.match(g) and DEV_RE.match(p)) or \
           (LAT_RE.match(g) and LAT_RE.match(p)):
            valid += 1

    return valid / total if total > 0 else 1.0


def slot_score(pred, gold):
    """
    Simple slot accuracy:
    correct_slot_keys / total_gold_slot_keys
    """
    pred_slots = pred.get("slots", {})
    gold_slots = gold.get("slots", {})

    if not gold_slots:
        return 1.0

    correct = 0
    for k, v in gold_slots.items():
        if k in pred_slots and pred_slots[k] == v:
            correct += 1

    return correct / len(gold_slots)


# -----------------------------
# Load model
# -----------------------------
def load_model(path):
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForCausalLM.from_pretrained(
        path,
        device_map="auto",
        torch_dtype="auto"
    )
    return model, tokenizer

# -----------------------------
# Evaluate
# -----------------------------
def evaluate(model, tokenizer, data):
    total = len(data)

    valid_json = 0
    slot_score_sum = 0.0
    script_score_sum = 0.0

    for ex in tqdm(data):
        prompt = build_prompt(ex)
        out = generate(model, tokenizer, prompt)

        if not is_valid_json(out):
            continue

        valid_json += 1
        pred = json.loads(out)
        gold = ex["output"]

        # Slot score (partial)
        slot_score_sum += slot_score(pred, gold)

        # Script compliance score (partial)
        script_score_sum += script_compliance_score(
            pred.get("normalized_text", ""),
            gold.get("normalized_text", "")
        )

    return {
        "json_validity_rate": valid_json / total,
        "slot_score": slot_score_sum / total,
        "script_compliance_rate": script_score_sum / total
    }

In [4]:
# -----------------------------
# Run
# -----------------------------
with open("/kaggle/input/dataset/test.jsonl") as f:
    test_data = [json.loads(x) for x in f]

# BEFORE
base_model, base_tok = load_model("Qwen/Qwen2.5-0.5B-Instruct")
before = evaluate(base_model, base_tok, test_data)

print("BEFORE:", before)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

2026-01-21 05:06:33.347079: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768971993.778171      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768971993.905100      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768971994.963468      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768971994.963499      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768971994.963502      24 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

  0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 100/100 [05:17<00:00,  3.17s/it]

BEFORE: {'json_validity_rate': 0.65, 'slot_score': 0.0, 'script_compliance_rate': 0.35393253968253974}





In [5]:
# AFTER
ft_model, ft_tok = load_model("/kaggle/input/lora-output-1/transformers/default/2/lora-hinglish")
after = evaluate(ft_model, ft_tok, test_data)

print("AFTER :", after)

100%|██████████| 100/100 [11:05<00:00,  6.66s/it]

AFTER : {'json_validity_rate': 0.72, 'slot_score': 0.395, 'script_compliance_rate': 0.6236150793650794}



