In [5]:
SYSTEM_PROMPT = """You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user's query.

Instructions:
- Only include expansions that are clearly and directly related to the query's context.
- If multiple meanings are relevant, include all of them.
- If no acronym is relevant, return an empty dictionary: `{}`.
- Acronyms must appear in the query to be considered.
- Preserve the acronym casing as it appears in the query.
- Output must be a valid **JSON dictionary**:
  - Keys: acronyms found in the query.
  - Values: lists of relevant expansions (as strings).

Output Format:
{
  "ACRONYM1": ["Relevant Expansion 1", "Relevant Expansion 2",...],
  "ACRONYM2": ["Relevant Expansion 1", "Relevant Expansion 2",...],
}
"""

def parse_raw_prompt(raw_prompt_string):
    """Parse raw prompt string into OpenAI-compatible message format for vLLM"""
    parts = [part.strip() for part in raw_prompt_string.split("###") if part.strip()]

    messages = []

    # First part is the system message with instructions
    if len(parts) > 0:
        # Use simple string content format for vLLM compatibility
        messages.append({"role": "system", "content": parts[0]})

    # Process examples in pairs (user query, assistant response)
    for i in range(1, len(parts), 2):
        user_example = parts[i]
        assistant_response = parts[i + 1] if i + 1 < len(parts) else ""

        # Add user example
        messages.append({"role": "user", "content": user_example})

        # Add assistant response if available
        if assistant_response:
            messages.append({"role": "assistant", "content": assistant_response})

    return messages

# Parse the system prompt into message format for the API
SYSTEM_PROMPT = parse_raw_prompt(SYSTEM_PROMPT)
print(SYSTEM_PROMPT[:])

[{'role': 'system', 'content': 'You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user\'s query.\n\nInstructions:\n- Only include expansions that are clearly and directly related to the query\'s context.\n- If multiple meanings are relevant, include all of them.\n- If no acronym is relevant, return an empty dictionary: `{}`.\n- Acronyms must appear in the query to be considered.\n- Preserve the acronym casing as it appears in the query.\n- Output must be a valid **JSON dictionary**:\n  - Keys: acronyms found in the query.\n  - Values: lists of relevant expansions (as strings).\n\nOutput Format:\n{\n  "ACRONYM1": ["Relevant Expansion 1", "Relevant Expansion 2",...],\n  "ACRONYM2": ["Relevant Expansion 1", "Relevant Expansion 2",...],\n}'}]


In [6]:
import json
import random
from sklearn.model_selection import train_test_split

# Load your JSON file
with open("data/best_output_25000.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Format dataset
formatted = []
for item in data:
    query = item["Query"]
    candidates = item["Candidate_Acronyms"]  # Assumed to be a list of strings
    output = item["Best_Output"]             # Assumed to be a dict

    # Format candidates into a readable string
    formatted_candidates = json.dumps(candidates, ensure_ascii=False)

    # Add each example
    formatted.append({
        "instruction": SYSTEM_PROMPT,
        "input": f'query: "{query}", candidate acronyms: {formatted_candidates}',
        "output": json.dumps(output, ensure_ascii=False)  # Ensure proper JSON formatting
    })

# Shuffle the dataset
random.shuffle(formatted)

# 60:40 train-eval split
train_data, eval_data = train_test_split(formatted, test_size=0.2, random_state=42)

# Save JSONL files
with open("data/unsloth_train_data.jsonl", "w", encoding="utf-8") as f:
    for item in train_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open("data/unsloth_eval_data.jsonl", "w", encoding="utf-8") as f:
    for item in eval_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"âœ… Saved {len(train_data)} training samples and {len(eval_data)} eval samples in Unsloth format.")


âœ… Saved 16000 training samples and 4000 eval samples in Unsloth format.


In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Paths to your JSONL files
train_file = "/home/ubuntu/acronyms_project/data/unsloth_train_data.jsonl"
eval_file = "/home/ubuntu/acronyms_project/data/unsloth_eval_data.jsonl"

# Load dataset from JSONL files
dataset = load_dataset("json", data_files={
    "train": train_file,
    "eval": eval_file
})

# Print basic info
print("âœ… Dataset loaded:")
print(dataset)
print("\nâœ… Sample training example:")
print(dataset["train"][0])

Generating train split: 16000 examples [00:00, 371356.04 examples/s]
Generating eval split: 4000 examples [00:00, 401234.42 examples/s]

âœ… Dataset loaded:
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 16000
    })
    eval: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 4000
    })
})

âœ… Sample training example:
{'instruction': [{'role': 'system', 'content': 'You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user\'s query.\n\nInstructions:\n- Only include expansions that are clearly and directly related to the query\'s context.\n- If multiple meanings are relevant, include all of them.\n- If no acronym is relevant, return an empty dictionary: `{}`.\n- Acronyms must appear in the query to be considered.\n- Preserve the acronym casing as it appears in the query.\n- Output must be a valid **JSON dictionary**:\n  - Keys: acronyms found in the query.\n  - Values: lists of relevant expansions (as strings).\n\nOutput Format:\n{\n  "ACRONYM1": 




In [None]:
from unsloth import FastLanguageModel
import torch

model_name = "Qwen/Qwen3-4B"

# Load model + tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=torch.float16,     # Explicit and recommended
    load_in_4bit=True,
)

# Set pad token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Apply LoRA with Unsloth helper
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,              
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 
)

# Diagnostics
model.print_trainable_parameters()
print("âœ… Model loaded on:", next(model.parameters()).device)



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-08 17:24:13 [__init__.py:216] Automatically detected platform cuda.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.1: Fast Qwen3 patching. Transformers: 4.56.2. vLLM: 0.11.0.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 21.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.10.1 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


trainable params: 5,898,240 || all params: 4,028,366,336 || trainable%: 0.1464
âœ… Model loaded on: cuda:0


In [None]:
import re
import json
from typing import Dict, List, Any, Callable

def _clean_output_to_json_str(raw_out: Any) -> str:
    if isinstance(raw_out, dict):
        return json.dumps(raw_out, ensure_ascii=False)
    if not isinstance(raw_out, str):
        return json.dumps(raw_out, ensure_ascii=False)
    s = raw_out.strip()
    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
        try:
            s = json.loads(s)
        except Exception:
            s = s[1:-1]
    try:
        obj = json.loads(s)
        if isinstance(obj, str):
            obj = json.loads(obj)
        if isinstance(obj, (dict, list)):
            return json.dumps(obj, ensure_ascii=False)
    except Exception:
        pass
    return s

def make_formatting_func(tokenizer) -> Callable[[Dict[str, Any]], List[str]]:
    def _strip_think(text: Any) -> Any:
        if isinstance(text, str):
            return re.sub(r"<think>[\s\S]*?</think>\s*", "", text)
        return text

    def _render_one(ex: Dict[str, Any]) -> str:
        messages: List[Dict[str, str]] = []

        inst = ex.get("instruction")
        if isinstance(inst, list):
            for m in inst:
                if isinstance(m, dict) and "role" in m and "content" in m:
                    messages.append({"role": m["role"], "content": m["content"]})
                elif isinstance(m, str):
                    messages.append({"role": "system", "content": m})
        elif isinstance(inst, str) and inst.strip():
            messages.append({"role": "system", "content": inst})

        user_content = ex.get("input", "")
        if isinstance(user_content, str):
            user_content = re.sub(r"\bcandidate\s+acronyms\b",
                                  "candidate_acronyms",
                                  user_content,
                                  flags=re.IGNORECASE)
        else:
            user_content = str(user_content)
        messages.append({"role": "user", "content": user_content})

        raw_out = _strip_think(ex.get("output", ""))
        out_clean = _clean_output_to_json_str(raw_out)
        messages.append({"role": "assistant", "content": out_clean})

        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )

    def formatting_func(example: Dict[str, Any]) -> List[str]:
        # If Unsloth/TRL passes a single example dict
        if not isinstance(example.get("input"), list):
            return [_render_one(example)]
        # If a batched dict is passed, render each row
        size = len(example["input"])
        rows = [{k: example[k][i] for k in example} for i in range(size)]
        return [_render_one(row) for row in rows]

    return formatting_func


# print(formatting_func(dataset["train"][0]))
# print(formatting_func(dataset["train"][1]))
# print(formatting_func(dataset["eval"][0]))
# formatted = formatting_func(dataset["train"][0])
# print("ðŸ§¾ Formatted sample:\n", formatted["text"])

# assume: tokenizer, dataset, and make_formatting_func from earlier
fmt = make_formatting_func(tokenizer)

row = dataset["train"][0]
batch = {k: [row[k]] for k in ["instruction", "input", "output"]}

texts = fmt(batch)
print("Rendered sample (first 800 chars):\n", texts[0][:])
print("\nFirst 20 lines:\n", "\n".join(texts[0].splitlines()[:]))





NameError: name 'tokenizer' is not defined

In [10]:
import json
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def make_compute_metrics(tokenizer):
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    def compute_metrics(eval_pred):
        preds_logits, labels = eval_pred

        # Unpack if tuple (logits, ...)
        if isinstance(preds_logits, tuple):
            preds_logits = preds_logits[0]

        # Cast to float32 for stable argmax regardless of fp16/bf16 at eval-time
        arr = np.asarray(preds_logits, dtype=np.float32)
        if arr.ndim == 3:  # (batch, seq_len, vocab)
            pred_ids = arr.argmax(axis=-1)
        else:
            pred_ids = np.asarray(preds_logits)

        # Replace ignore index (-100) before decoding labels
        labels = np.asarray(labels)
        labels_dec = np.where(labels == -100, pad_id, labels)

        # Decode to text
        pred_txt = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_txt = tokenizer.batch_decode(labels_dec, skip_special_tokens=True)

        # Normalize whitespace
        pred_txt = [t.strip() for t in pred_txt]
        label_txt = [t.strip() for t in label_txt]

        # Exact match (string-level)
        em = np.mean([p == l for p, l in zip(pred_txt, label_txt)]).item()

        # JSON parse rate (task expects JSON dicts)
        def is_json_obj(s):
            try:
                return isinstance(json.loads(s), dict)
            except Exception:
                return False
        json_ok = np.mean([is_json_obj(p) for p in pred_txt]).item()

        # Character-level micro P/R/F1 as a simple robustness check
        # (sklearn will compare strings element-wise)
        precision, recall, f1, _ = precision_recall_fscore_support(
            label_txt, pred_txt, average="micro", zero_division=0
        )

        return {
            "exact_match": float(em),
            "json_parse_rate": float(json_ok),
            "precision": float(precision),
            "recall": float(recall),
            "f1": float(f1),
        }

    return compute_metrics


In [11]:
# ============================================================
# âœ… Training Configuration & Trainer Setup
# ============================================================

from trl import SFTTrainer, SFTConfig
from torch.utils.data import Subset
import torch

print("Setting up trainer configuration...")

# Training configuration optimized for Unsloth + QLoRA
training_args = SFTConfig(
    # Output and logging
    output_dir="./qwen3-4b-qlora",
    logging_dir="./logs",
    logging_steps=25,
    
    # Training parameters
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    max_steps=-1,  # Let epochs control training length
    
    # Optimization
    learning_rate=3e-4,

    warmup_steps=100,
    max_grad_norm=1.0,
    weight_decay=0.01,
    
    # Mixed precision
    bf16=False,  # Use fp16 for A10G
    fp16=True,
    # bf16=True,  # Use fp16 for A10G
    # fp16=False,
    
    # Optimizer
    optim="paged_adamw_8bit",
    
    # Evaluation and saving
    eval_strategy="epoch",
    # eval_steps=100,
    save_steps=200,
    save_total_limit=3,

    dataloader_num_workers=0,  # Set to 0 to avoid multiprocessing issues
    dataloader_pin_memory=True,

    remove_unused_columns=False,
    group_by_length=False,

    ddp_find_unused_parameters=False,
    gradient_checkpointing=True,

    prediction_loss_only=True,  # Only compute loss, not logits
    
    # SFT specific parameters
    dataset_text_field="text",  # This will be handled by formatting_func
    max_seq_length=1024,
    packing=False,  # Disable packing to avoid length mismatch issues
)


# ============================================================
# âœ… Eval subset (quick validation)
# ============================================================
small_eval_dataset = dataset["eval"].select(range(min(1000, len(dataset["eval"]))))

# ============================================================
# âœ… Trainer setup
# ============================================================
compute_fn = make_compute_metrics(tokenizer)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=small_eval_dataset,
    formatting_func=make_formatting_func(tokenizer),  # <- bind tokenizer
    args=training_args,
    compute_metrics=compute_fn,
)


Setting up trainer configuration...


Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/16000 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=12): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16000/16000 [00:04<00:00, 3518.04 examples/s]
Unsloth: Tokenizing ["text"] (num_proc=12): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:02<00:00, 402.38 examples/s]


In [10]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 3 | Total steps = 12,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 5,898,240 of 4,028,366,336 (0.15% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.1613,0.165377
2,0.1341,0.148899
3,0.1211,0.146437


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=12000, training_loss=0.1583680603702863, metrics={'train_runtime': 10767.0397, 'train_samples_per_second': 4.458, 'train_steps_per_second': 1.115, 'total_flos': 2.6766797680944538e+17, 'train_loss': 0.1583680603702863, 'epoch': 3.0})

In [None]:
final_dir = "./checkpoints/final_2"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

('./checkpoints/final_2/tokenizer_config.json',
 './checkpoints/final_2/special_tokens_map.json',
 './checkpoints/final_2/chat_template.jinja',
 './checkpoints/final_2/vocab.json',
 './checkpoints/final_2/merges.txt',
 './checkpoints/final_2/added_tokens.json',
 './checkpoints/final_2/tokenizer.json')

In [12]:
metrics = trainer.evaluate()
print(metrics)

print("Evaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


{'eval_loss': 2.8362035751342773, 'eval_model_preparation_time': 0.0047, 'eval_runtime': 115.8871, 'eval_samples_per_second': 8.629, 'eval_steps_per_second': 8.629}
Evaluation Metrics:
eval_loss: 2.8362
eval_model_preparation_time: 0.0047
eval_runtime: 115.8871
eval_samples_per_second: 8.6290
eval_steps_per_second: 8.6290


In [None]:
SYSTEM_PROMPT = """You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user's query.

Instructions:
- Only include expansions that are clearly and directly related to the query's context.
- If multiple meanings are relevant, include all of them.
- If no acronym is relevant, return an empty dictionary: `{}`.
- Acronyms must appear in the query to be considered.
- Preserve the acronym casing as it appears in the query.
- Output must be a valid **JSON dictionary**:
  - Keys: acronyms found in the query.
  - Values: lists of relevant expansions (as strings).

Output Format:
{
  "ACRONYM": ["Relevant Expansion 1", "Relevant Expansion 2"]
}

Examples:
###
query: "Who leads the AI team", candidate acronyms: " (AI: artificial intelligence, Artificial Intelligence, Action Items)"
###
{"AI": ["artificial intelligence"]}
###
query: "update the okr", candidate acronyms: " (okr: Objectives and Key Results, Office of Knowledge Research)"
###
{"okr": ["Objectives and Key Results"]}
###
query: "follow up with hrbp and cpo", candidate acronyms: " (hrbp: Human Resources Business Partner) (cpo: Chief Product Officer, Chief People Officer)"
###
{"hrbp": ["Human Resources Business Partner"], "cpo": ["Chief People Officer"]}
###
query: "can you help me with this", candidate acronyms: " (can: Canada) (you: Young Outstanding Undergraduates)"
###
{}
###
"""
def prepare_inference_messages(base_messages_or_system, user_text):
    """
    Normalize to text-only messages for Qwen chat templates that expect strings.
    - If given a list of {role, content}, flatten any segment lists to a string.
    - If given a plain system string, wrap it as a single system message.
    - Append the live user turn as the last message.
    """
    def _to_text(c):
        if isinstance(c, list):
            parts = []
            for seg in c:
                if isinstance(seg, dict) and "text" in seg:
                    parts.append(seg["text"])
                else:
                    parts.append(str(seg))
            return "".join(parts)
        return str(c)

    if isinstance(base_messages_or_system, list):
        msgs = []
        for m in base_messages_or_system:
            role = m.get("role", "user")
            content = _to_text(m.get("content", ""))
            msgs.append({"role": role, "content": content})
    else:
        msgs = [{"role": "system", "content": str(base_messages_or_system)}]

    msgs.append({"role": "user", "content": user_text})
    return msgs


In [15]:
# minimal_infer_unsloth.py
import re
import os
import torch
from unsloth import FastLanguageModel

os.environ["UNSLOTH_DISABLE_PATCHES"] = "1" 

model_dir = "./checkpoints/final_2"  # set to your directory

model, tok = FastLanguageModel.from_pretrained(
    model_name=model_dir,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=False,
)
FastLanguageModel.for_inference(model)

if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token

messages = prepare_inference_messages(
    SYSTEM_PROMPT,  # <-- your list from training: [{'role':...,'content':...}, ...]
    'query: "pro athelete", candidate_acronyms: "(pro: professional, professor)"'
)

prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tok(prompt, return_tensors="pt").to(model.device)

bad = tok.encode("<think>", add_special_tokens=False)
gen_kwargs = dict(
    max_new_tokens=128,
    do_sample=False,
    eos_token_id=tok.eos_token_id,
    pad_token_id=tok.pad_token_id,
    bad_words_ids=[bad] if bad else None,
)

with torch.no_grad():
    out = model.generate(**inputs, **{k: v for k, v in gen_kwargs.items() if v is not None})

gen_ids = out[0][inputs["input_ids"].shape[-1]:]
text = tok.decode(gen_ids, skip_special_tokens=True)
text = re.sub(r"<think>[\s\S]*?</think>\s*", "", text).strip()
print(text)

==((====))==  Unsloth 2025.10.1: Fast Qwen3 patching. Transformers: 4.56.2. vLLM: 0.11.0.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 21.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:01<00:00,  1.54it/s]


</think>

{"PRO": ["professional"]}
