In [1]:
SYSTEM_PROMPT = """You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user's query.

Instructions:
- Only include expansions that are clearly and directly related to the query's context.
- If multiple meanings are relevant, include all of them.
- If no acronym is relevant, return an empty dictionary: `{}`.
- Acronyms must appear in the query to be considered.
- Preserve the acronym casing as it appears in the query.
- Output must be a valid **JSON dictionary**:
  - Keys: acronyms found in the query.
  - Values: lists of relevant expansions (as strings).

Output Format:
{
  "ACRONYM1": ["Relevant Expansion 1", "Relevant Expansion 2",...],
  "ACRONYM2": ["Relevant Expansion 1", "Relevant Expansion 2",...],
}
"""

def parse_raw_prompt(raw_prompt_string):
    """Parse raw prompt string into OpenAI-compatible message format for vLLM"""
    parts = [part.strip() for part in raw_prompt_string.split("###") if part.strip()]

    messages = []

    # First part is the system message with instructions
    if len(parts) > 0:
        # Use simple string content format for vLLM compatibility
        messages.append({"role": "system", "content": parts[0]})

    # Process examples in pairs (user query, assistant response)
    for i in range(1, len(parts), 2):
        user_example = parts[i]
        assistant_response = parts[i + 1] if i + 1 < len(parts) else ""

        # Add user example
        messages.append({"role": "user", "content": user_example})

        # Add assistant response if available
        if assistant_response:
            messages.append({"role": "assistant", "content": assistant_response})

    return messages

# Parse the system prompt into message format for the API
SYSTEM_PROMPT = parse_raw_prompt(SYSTEM_PROMPT)
print(SYSTEM_PROMPT[:])

[{'role': 'system', 'content': 'You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user\'s query.\n\nInstructions:\n- Only include expansions that are clearly and directly related to the query\'s context.\n- If multiple meanings are relevant, include all of them.\n- If no acronym is relevant, return an empty dictionary: `{}`.\n- Acronyms must appear in the query to be considered.\n- Preserve the acronym casing as it appears in the query.\n- Output must be a valid **JSON dictionary**:\n  - Keys: acronyms found in the query.\n  - Values: lists of relevant expansions (as strings).\n\nOutput Format:\n{\n  "ACRONYM1": ["Relevant Expansion 1", "Relevant Expansion 2",...],\n  "ACRONYM2": ["Relevant Expansion 1", "Relevant Expansion 2",...],\n}'}]


In [2]:
import json
import random
from sklearn.model_selection import train_test_split

# Load your JSON file
with open("data/best_output_25000.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Format dataset
formatted = []
for item in data:
    query = item["Query"]
    candidates = item["Candidate_Acronyms"]  # Assumed to be a list of strings
    output = item["Best_Output"]             # Assumed to be a dict

    # Format candidates into a readable string
    formatted_candidates = json.dumps(candidates, ensure_ascii=False)

    # Add each example
    formatted.append({
        "instruction": SYSTEM_PROMPT,
        "input": f'query: "{query}", candidate acronyms: {formatted_candidates}',
        "output": json.dumps(output, ensure_ascii=False)  # Ensure proper JSON formatting
    })

# Shuffle the dataset
random.shuffle(formatted)

# 60:40 train-eval split
train_data, eval_data = train_test_split(formatted, test_size=0.2, random_state=42)

# Save JSONL files
with open("data/unsloth_train_data.jsonl", "w", encoding="utf-8") as f:
    for item in train_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open("data/unsloth_eval_data.jsonl", "w", encoding="utf-8") as f:
    for item in eval_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"âœ… Saved {len(train_data)} training samples and {len(eval_data)} eval samples in Unsloth format.")


âœ… Saved 16000 training samples and 4000 eval samples in Unsloth format.


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Paths to your JSONL files
train_file = "/home/ubuntu/acronyms_project/data/unsloth_train_data.jsonl"
eval_file = "/home/ubuntu/acronyms_project/data/unsloth_eval_data.jsonl"

# Load dataset from JSONL files
dataset = load_dataset("json", data_files={
    "train": train_file,
    "eval": eval_file
})

# Print basic info
print("âœ… Dataset loaded:")
print(dataset)
print("\nâœ… Sample training example:")
print(dataset["train"][0])

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 16000 examples [00:00, 368553.85 examples/s]
Generating eval split: 4000 examples [00:00, 396708.90 examples/s]

âœ… Dataset loaded:
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 16000
    })
    eval: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 4000
    })
})

âœ… Sample training example:
{'instruction': [{'role': 'system', 'content': 'You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user\'s query.\n\nInstructions:\n- Only include expansions that are clearly and directly related to the query\'s context.\n- If multiple meanings are relevant, include all of them.\n- If no acronym is relevant, return an empty dictionary: `{}`.\n- Acronyms must appear in the query to be considered.\n- Preserve the acronym casing as it appears in the query.\n- Output must be a valid **JSON dictionary**:\n  - Keys: acronyms found in the query.\n  - Values: lists of relevant expansions (as strings).\n\nOutput Format:\n{\n  "ACRONYM1": 




In [4]:
from unsloth import FastLanguageModel
import torch

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Or any other HF model

# Load model and tokenizer in full FP16 precision (no quantization)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=torch.float16,       # Use 16-bit precision
    load_in_4bit=False         # Do NOT quantize
)

# Set pad token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Apply LoRA with Unsloth helper
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)

# Diagnostics
model.print_trainable_parameters()
print("âœ… Model loaded on:", next(model.parameters()).device)



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-11 17:50:56 [__init__.py:216] Automatically detected platform cuda.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.1: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.11.0.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 21.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.10.1 patched 22 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044
âœ… Model loaded on: cuda:0


In [5]:
import re
import json
from typing import Dict, List, Any, Callable

# Clean up the output field to ensure it's a valid JSON string or fallback to plain string
def _clean_output_to_json_str(raw_out: Any) -> str:
    if isinstance(raw_out, dict):
        return json.dumps(raw_out, ensure_ascii=False)
    if not isinstance(raw_out, str):
        return json.dumps(raw_out, ensure_ascii=False)

    s = raw_out.strip()
    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
        try:
            s = json.loads(s)
        except Exception:
            s = s[1:-1]

    try:
        obj = json.loads(s)
        if isinstance(obj, str):
            obj = json.loads(obj)
        if isinstance(obj, (dict, list)):
            return json.dumps(obj, ensure_ascii=False)
    except Exception:
        pass
    return s

# Main formatting function for TinyLlama
def make_formatting_func(tokenizer) -> Callable[[Dict[str, Any]], List[str]]:
    def _strip_think(text: Any) -> Any:
        if isinstance(text, str):
            return re.sub(r"<think>[\s\S]*?</think>\s*", "", text)
        return text

    def _render_one(ex: Dict[str, Any]) -> str:
        messages: List[Dict[str, str]] = []

        # System prompt (optional)
        system_prompt = None
        inst = ex.get("instruction")
        if isinstance(inst, str) and inst.strip():
            system_prompt = inst.strip()
        elif isinstance(inst, list):
            for m in inst:
                if isinstance(m, dict) and m.get("role") == "system":
                    system_prompt = m["content"]

        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})

        # User input
        user_content = ex.get("input", "")
        if isinstance(user_content, str):
            user_content = re.sub(r"\bcandidate\s+acronyms\b", "candidate_acronyms", user_content, flags=re.IGNORECASE)
        else:
            user_content = str(user_content)

        messages.append({"role": "user", "content": user_content})

        # Assistant output (label)
        raw_out = _strip_think(ex.get("output", ""))
        out_clean = _clean_output_to_json_str(raw_out)
        messages.append({"role": "assistant", "content": out_clean})

        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )

    def formatting_func(example: Dict[str, Any]) -> List[str]:
        if not isinstance(example.get("input"), list):
            return [_render_one(example)]

        # Batched format
        size = len(example["input"])
        rows = [{k: example[k][i] for k in example} for i in range(size)]
        return [_render_one(row) for row in rows]

    return formatting_func

fmt = make_formatting_func(tokenizer)

row = dataset["train"][0]
batch = {k: [row[k]] for k in ["instruction", "input", "output"]}

texts = fmt(batch)
print("Rendered sample:\n", texts[0][:])
# print("\nFirst 20 lines:\n", "\n".join(texts[0].splitlines()[:]))

Rendered sample:
 <|system|>
You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user's query.

Instructions:
- Only include expansions that are clearly and directly related to the query's context.
- If multiple meanings are relevant, include all of them.
- If no acronym is relevant, return an empty dictionary: `{}`.
- Acronyms must appear in the query to be considered.
- Preserve the acronym casing as it appears in the query.
- Output must be a valid **JSON dictionary**:
  - Keys: acronyms found in the query.
  - Values: lists of relevant expansions (as strings).

Output Format:
{
  "ACRONYM1": ["Relevant Expansion 1", "Relevant Expansion 2",...],
  "ACRONYM2": ["Relevant Expansion 1", "Relevant Expansion 2",...],
}</s>
<|user|>
query: "teams meeting recap", candidate_acronyms: "(teams: microsoft teams)"</s>
<|assistant|>
{"TEAMS": ["microsoft teams"]}</s>



In [None]:
# ============================================================
# âœ… Training Configuration & Trainer Setup for TinyLlama 1.1B
# ============================================================

from trl import SFTTrainer, SFTConfig
from torch.utils.data import Subset
import torch

print("Setting up trainer configuration for TinyLlama...")

# Training configuration optimized for TinyLlama + LoRA (FP16)
training_args = SFTConfig(
    # Output and logging
    output_dir="./tinyllama-1.1b-lora",
    logging_dir="./logs",
    logging_steps=25,

    # Training parameters
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    max_steps=-1,  # Let epochs control training length

    # Optimization
    learning_rate=2e-4,  # Slightly lower for stability on smaller models
    warmup_steps=100,
    max_grad_norm=1.0,
    weight_decay=0.01,

    # Mixed precision
    bf16=False,
    fp16=True,  # Use FP16 for A10G, RTX, or similar

    # Optimizer
    optim="adamw_torch",  # Better for full-precision / LoRA with small models

    # Evaluation and saving
    eval_strategy="epoch",
    save_steps=200,
    save_total_limit=2,

    dataloader_num_workers=0,
    dataloader_pin_memory=True,

    remove_unused_columns=False,
    group_by_length=False,

    ddp_find_unused_parameters=False,
    gradient_checkpointing=True,

    prediction_loss_only=True,

    # SFT specific
    dataset_text_field="text",  # Will be handled by formatting_func
    max_seq_length=1024,
    packing=False,
)

# ============================================================
# âœ… Trainer setup
# ============================================================
# compute_fn = make_compute_metrics(tokenizer)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["eval"],
    formatting_func=make_formatting_func(tokenizer),
    args=training_args,
    # compute_metrics=compute_fn,
)


Setting up trainer configuration for TinyLlama...


Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/16000 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=12): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16000/16000 [00:02<00:00, 6178.57 examples/s] 
Unsloth: Tokenizing ["text"] (num_proc=12): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4000/4000 [00:01<00:00, 3112.86 examples/s]


In [7]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 3 | Total steps = 12,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 2,252,800 of 1,102,301,184 (0.20% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.1459,0.141732
2,0.1377,0.130198
3,0.1168,0.127442


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=12000, training_loss=0.1411018192768097, metrics={'train_runtime': 4802.1369, 'train_samples_per_second': 9.996, 'train_steps_per_second': 2.499, 'total_flos': 9.632824195242394e+16, 'train_loss': 0.1411018192768097, 'epoch': 3.0})

In [8]:
final_dir = "./checkpoints/llama-1.1b-lora_2"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

('./checkpoints/llama-1.1b-lora_2/tokenizer_config.json',
 './checkpoints/llama-1.1b-lora_2/special_tokens_map.json',
 './checkpoints/llama-1.1b-lora_2/chat_template.jinja',
 './checkpoints/llama-1.1b-lora_2/tokenizer.model',
 './checkpoints/llama-1.1b-lora_2/added_tokens.json',
 './checkpoints/llama-1.1b-lora_2/tokenizer.json')

In [9]:
SYSTEM_PROMPT = """You are a precise assistant tasked with selecting only the **most relevant acronym expansions** from a given list, based strictly on the user's query.

Instructions:
- Only include expansions that are clearly and directly related to the query's context.
- If multiple meanings are relevant, include all of them.
- If no acronym is relevant, return an empty dictionary: `{}`.
- Acronyms must appear in the query to be considered.
- Preserve the acronym casing as it appears in the query.
- Output must be a valid **JSON dictionary**:
  - Keys: acronyms found in the query.
  - Values: lists of relevant expansions (as strings).

Output Format:
{
  "ACRONYM": ["Relevant Expansion 1", "Relevant Expansion 2"]
}

Examples:
###
query: "Who leads the AI team", candidate acronyms: " (AI: artificial intelligence, Artificial Intelligence, Action Items)"
###
{"AI": ["artificial intelligence"]}
###
query: "update the okr", candidate acronyms: " (okr: Objectives and Key Results, Office of Knowledge Research)"
###
{"okr": ["Objectives and Key Results"]}
###
query: "follow up with hrbp and cpo", candidate acronyms: " (hrbp: Human Resources Business Partner) (cpo: Chief Product Officer, Chief People Officer)"
###
{"hrbp": ["Human Resources Business Partner"], "cpo": ["Chief People Officer"]}
###
query: "can you help me with this", candidate acronyms: " (can: Canada) (you: Young Outstanding Undergraduates)"
###
{}
###
"""
def prepare_inference_messages(base_messages_or_system, user_text):
    """
    Normalize to text-only messages for Qwen chat templates that expect strings.
    - If given a list of {role, content}, flatten any segment lists to a string.
    - If given a plain system string, wrap it as a single system message.
    - Append the live user turn as the last message.
    """
    def _to_text(c):
        if isinstance(c, list):
            parts = []
            for seg in c:
                if isinstance(seg, dict) and "text" in seg:
                    parts.append(seg["text"])
                else:
                    parts.append(str(seg))
            return "".join(parts)
        return str(c)

    if isinstance(base_messages_or_system, list):
        msgs = []
        for m in base_messages_or_system:
            role = m.get("role", "user")
            content = _to_text(m.get("content", ""))
            msgs.append({"role": role, "content": content})
    else:
        msgs = [{"role": "system", "content": str(base_messages_or_system)}]

    msgs.append({"role": "user", "content": user_text})
    return msgs


In [None]:
import re
import os
import torch
from unsloth import FastLanguageModel

# Optional: Disable monkey patches if needed
os.environ["UNSLOTH_DISABLE_PATCHES"] = "1"

# âœ… Path to saved checkpoint (full model or LoRA merged)
model_dir = "./checkpoints/llama-1.1b-lora"

# âœ… Load model & tokenizer
model, tok = FastLanguageModel.from_pretrained(
    model_name=model_dir,
    max_seq_length=2048,
    dtype=torch.float16,         # Recommended for inference
    load_in_4bit=False           # Assuming you're using FP16 saved model
)

# âœ… Switch to inference mode (for Unsloth speedup)
FastLanguageModel.for_inference(model)

# âœ… Set pad token if missing
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token
model.config.pad_token_id = tok.pad_token_id

# âœ… Input: define your chat-style input messages
# This should follow the same format as during training
# Example SYSTEM_PROMPT and user query
SYSTEM_PROMPT = "You are a helpful acronym disambiguation assistant."
query_text = 'query: "pro athelete", candidate_acronyms: "(pro: professional, professor)"'

# âœ… Function to format chat input
def prepare_inference_messages(system_prompt, user_input):
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]

# âœ… Prepare prompt
messages = prepare_inference_messages(SYSTEM_PROMPT, query_text)
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# âœ… Tokenize input
inputs = tok(prompt, return_tensors="pt").to(model.device)

# âœ… Prevent <think> token from being generated (if needed)
bad = tok.encode("<think>", add_special_tokens=False)
bad_words_ids = [bad] if bad else None

# âœ… Generation arguments
gen_kwargs = dict(
    max_new_tokens=128,
    do_sample=False,
    eos_token_id=tok.eos_token_id,
    pad_token_id=tok.pad_token_id,
    bad_words_ids=bad_words_ids,
)

# âœ… Generate response
with torch.no_grad():
    out = model.generate(**inputs, **gen_kwargs)

# âœ… Decode and clean output
gen_ids = out[0][inputs["input_ids"].shape[-1]:]
text = tok.decode(gen_ids, skip_special_tokens=True)
text = re.sub(r"<think>[\s\S]*?</think>\s*", "", text).strip()

print("ðŸ§  Model response:\n", text)


==((====))==  Unsloth 2025.10.1: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.11.0.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 21.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


RuntimeError: CUDA driver error: invalid argument

: 