In [None]:
# Install trl, peft and bitsandbytes (unpinned) into the system environment with uv.
# The next cell pins transformers and trl to exact versions.
!uv pip install --system trl peft bitsandbytes --quiet

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the installer output.
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No environment variable name contains "COLAB_": use the plain install.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Keep only the leading numeric part of the torch version string
    # (e.g. "2.8.0" out of "2.8.0+cu126").
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    # Choose the xformers release according to the installed torch version.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !uv pip install --system --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !uv pip install --system sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !uv pip install --system --no-deps unsloth
# Runs in both branches: pin transformers and trl to exact versions after the installs above.
!uv pip install --system transformers==4.55.4
!uv pip install --system --no-deps trl==0.22.2

In [None]:
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
import os
from google.colab import userdata
from huggingface_hub import hf_hub_download
import pandas as pd
import re
import json
import ast
from textwrap import dedent
from tqdm.notebook import tqdm, trange

In [None]:
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import *

In [None]:
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth.chat_templates import train_on_responses_only

In [None]:
# Copy each Colab secret into the process environment under the same name.
for secret_name in ("KAGGLE_KEY", "KAGGLE_USERNAME", "WANDB_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
# Load Llama 3.1 8B Instruct through Unsloth with 4-bit quantized weights
# (8-bit loading and full fine-tuning are explicitly switched off).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    max_seq_length=3096,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
)

==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Display the loaded module tree (the linear projections show up as Linear4bit).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((409

In [None]:
# Attach rank-8 LoRA adapters to the attention (q/k/v/o) and MLP
# (gate/up/down) projections of the loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=42,
    use_rslora=False,   # rank-stabilized LoRA is available but not used here
    loftq_config=None,  # LoftQ is available but not used here
)

Unsloth 2025.9.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Fetch the NuNER CSV from the Hugging Face Hub and keep its local path.
csv_path = hf_hub_download(
    repo_id="numind/NuNER",
    repo_type="dataset",
    filename="data/full-00001-of-00001.csv",
)

In [None]:
# Read only the first 50,000 rows of the downloaded CSV.
df = pd.read_csv(csv_path, nrows=50_000)

In [None]:
import ast
from torch.utils.data import Dataset

class NERDataset(Dataset):
    """Map-style dataset that renders annotated rows as chat-formatted text.

    Rows are read positionally: column 0 holds the raw input text and column 1
    holds the string repr of a list of ``"extract <> entity <> description"``
    annotations.

    Args:
        df: DataFrame with the two columns described above.
        tokenizer: Object exposing ``apply_chat_template``; it renders the
            system/user/assistant messages into a single training string.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the chat-formatted training example for row ``idx``.

        Returns:
            ``{"texts": <rendered conversation>}`` where the conversation has
            a system prompt listing the entity types, the input text as the
            user turn, and the expected extraction lines as the assistant turn.
        """
        inp = self.df.iloc[idx, 0]
        ents = self._parse_output(self.df.iloc[idx, 1])

        sys_prompt = self._get_system_prompt(self._format_ents(ents))
        user_prompt = self._format_input(inp)
        completion = self._get_completion(ents)

        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": completion},
        ]
        # Use the tokenizer passed to the constructor rather than whatever
        # `tokenizer` happens to exist at notebook level.
        rendered = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        return {"texts": rendered}

    # -------- helpers ----------
    def _parse_output_entities(self, output_str):
        """Split one ``"extract <> entity <> description"`` annotation.

        Returns:
            A dict with keys ``extract``, ``entity`` and ``description``
            (whitespace-stripped), or ``None`` when the annotation does not
            have exactly three ``<>``-separated fields or cannot be processed.
        """
        try:
            sp = output_str.split("<>")
            if len(sp) != 3:
                return None
            extract, entity, desc = sp
            return {
                "extract": extract.strip(),
                "entity": entity.strip(),
                "description": desc.strip()
            }
        except Exception:
            # Deliberate best-effort: a malformed annotation is skipped, not fatal.
            return None

    def _parse_output(self, output_str):
        """Parse the stringified annotation list, dropping malformed entries."""
        return [
            e for out in ast.literal_eval(output_str)
            if (e := self._parse_output_entities(out))
        ]

    def _format_ents(self, ents):
        """Render the entity types as ``- 'name': description`` bullet lines."""
        return "\n".join(
            f"- \'{ent['entity']}\': {ent['description']}"
            for ent in ents
        )

    def _format_input(self, inp):
        """Return the input text converted to ``str`` for the user turn."""
        return f"{inp}"

    def _get_system_prompt(self, ents):
        """Return the system prompt with the formatted entity list appended."""
        # short and tight
        return f"""You are an NER model.
Extract entities from the user text.
For each entity, return lines in the format:
TEXT: <span> ENT: <entity>

Entities to recognize:
{ents}"""

    def _get_completion(self, ents):
        """Render the target answer: one ``TEXT: ... ENT: ...`` line per entity."""
        return "\n".join(
            f"TEXT: {ent['extract']} ENT: {ent['entity']}"
            for ent in ents
        )


In [None]:
from datasets import Dataset, DatasetDict

dataset = NERDataset(df, tokenizer)
# Seed the split so the 90/10 train/validation partition is identical on every
# run (42 matches the random_state used for the LoRA adapters).
train_dataset, val_dataset = random_split(
    dataset, [0.9, 0.1], generator=torch.Generator().manual_seed(42)
)

In [None]:
def gtrain():
    """Yield every example of the global ``train_dataset`` in index order."""
    total = len(train_dataset)
    yield from (train_dataset[pos] for pos in range(total))

def gval():
    """Yield every example of the global ``val_dataset`` in index order."""
    total = len(val_dataset)
    yield from (val_dataset[pos] for pos in range(total))

In [None]:
# Build Hugging Face datasets from the generators (`Dataset` here is the
# `datasets.Dataset` imported in the previous cell, which shadows the torch one).
# NOTE: this rebinds train_dataset / val_dataset, the same globals that
# gtrain / gval read, so re-running this cell iterates over the already
# converted datasets instead of the original random_split subsets.
train_dataset = Dataset.from_generator(gtrain)
val_dataset = Dataset.from_generator(gval)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Names used for the output directory and when pushing to the Hub.
username = "4sp1d3r2"
output_dir = "llama-3.1-8b-ner"

# Batch sizes and accumulation.
per_device_train_batch_size, per_device_eval_batch_size = 24, 12
gradient_accumulation_steps = 4

# Optimisation and logging cadence.
learning_rate, max_grad_norm = 2e-4, 2.0
logging_steps = 1
num_train_epochs = 1

In [None]:
# Training configuration: checkpoint every 25 steps, evaluate once per epoch,
# log to Weights & Biases and sync to a private Hub repository.
training_arguments = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="steps",
    save_steps=25,
    eval_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    optim="adamw_8bit",
    weight_decay=0.01,
    report_to="wandb",
    hub_private_repo=True,
    push_to_hub=True,
    # The destination repository is taken from the training arguments, so the
    # intended "<username>/<output_dir>" repo id has to be set here.
    hub_model_id=f"{username}/{output_dir}",
    num_train_epochs=num_train_epochs,
    dataset_text_field='texts'
)

In [None]:
# Sanity check: show the first rendered training example (a dict with a "texts" string).
train_dataset[0]

{'texts': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are an NER model.\nExtract entities from the user text.\nFor each entity, return lines in the format:\nTEXT: <span> ENT: <entity>\n\nEntities to recognize:\n- 'Street Name': A street in Kirkbymoorside, York\n- 'City Name': A city in North Yorkshire\n- 'Housing Type': A type of residential building characterized by two separate dwellings sharing a common wall.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThis page displays sold house prices for Slingsby Garth in York. Slingsby Garth in Kirkbymoorside, York consists predominantly of semi-detached houses.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nTEXT: Slingsby Garth ENT: Street Name\nTEXT: York ENT: City Name\nTEXT: semi-detached houses ENT: Housing Type<|eot_id|>"}

In [None]:
# Build the SFT trainer over the rendered chat texts, using the tokenizer as
# the processing class and a seq2seq collator for batching.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

Unsloth: Tokenizing ["texts"] (num_proc=6):   0%|          | 0/45000 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["texts"] (num_proc=6):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
# Wrap the trainer with Unsloth's train_on_responses_only, telling it which
# marker opens the instruction (the system header at the start of the text)
# and which marker opens the response (the assistant header).
trainer2 = train_on_responses_only(
    trainer,
    instruction_part="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>",
)

Map (num_proc=2):   0%|          | 0/45000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
import wandb

# Start the Weights & Biases run and attach the full training arguments as config.
# NOTE(review): the project is named "smollm-135m-ner" although this notebook
# fine-tunes Llama 3.1 8B; confirm this is the intended project.
wandb.init(
    entity="4spy1337",
    project="smollm-135m-ner",
    config={"args": training_arguments.to_dict()},
    sync_tensorboard=True,
    save_code=True,
)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33maspy1337[0m ([33m4spy1337[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [None]:
# Run the fine-tuning loop, then save the resulting model via the trainer.
trainer2.train()
trainer2.save_model()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 45,000 | Num Epochs = 1 | Total steps = 469
O^O/ \_/ \    Batch size per device = 24 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (24 x 4 x 1) = 96
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
# Trainer.push_to_hub takes the commit message as its first argument; the
# target repository comes from the training arguments. Pass a real commit
# message instead of the repo id.
trainer.push_to_hub(commit_message=f"End of training: {output_dir}")

In [None]:
# Close the Weights & Biases run started earlier in the notebook.
wandb.finish()