<a href="https://colab.research.google.com/github/spider2048/mlexperiments/blob/main/sft_ner_smollm_135m.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!uv pip install --system pytorch_lightning trl torchsummary peft bitsandbytes --quiet

In [None]:
!uv pip install --system flash-attn --no-build-isolation --quiet

In [None]:
import os
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from google.colab import userdata
import pytorch_lightning as pl
from huggingface_hub import hf_hub_download
import pandas as pd
import re
import json
import ast
from textwrap import dedent
from tqdm.notebook import tqdm, trange

In [None]:
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import *

In [None]:
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from peft import get_peft_model, LoraConfig, TaskType

In [None]:
# Mirror the Colab-stored secrets into process environment variables of the
# same name, in the same order as before (Kaggle key, Kaggle user, W&B key).
for _secret_name in ("KAGGLE_KEY", "KAGGLE_USERNAME", "WANDB_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

In [None]:
# Hub id of the base checkpoint; the same constant is used further down when
# the causal-LM weights are loaded.
MODEL_NAME = "HuggingfaceTB/SmolLM-135M-Instruct"
# Module-level tokenizer: NERDataset.__getitem__ reads this global to render
# chat templates, and it is also handed to the trainer and used for inference.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Download the CSV file from the numind/NuNER dataset repository; the returned
# value is passed to pd.read_csv in the next cell.
csv_path = hf_hub_download(repo_id="numind/NuNER", filename="data/full-00001-of-00001.csv", repo_type="dataset")

In [None]:
# Only the first 10,000 rows are read (nrows=10000), which keeps the run small.
# NERDataset accesses columns by position: column 0 is the input text and
# column 1 is the raw annotation string.
df = pd.read_csv(csv_path, nrows=10000)

In [None]:
def parse_output_entities(output_str):
    """Split one ``extract <> entity <> description`` annotation into a dict.

    Args:
        output_str: A single annotation string using ``<>`` as field separator.

    Returns:
        A dict with the stripped ``extract``, ``entity`` and ``description``
        fields, or ``None`` when the string does not contain exactly three
        fields (this includes the empty string) or cannot be split at all.
    """
    try:
        parts = output_str.split("<>")
        # Anything other than exactly three fields is treated as malformed.
        if len(parts) != 3:
            return None

        keys = ("extract", "entity", "description")
        return {key: value.strip() for key, value in zip(keys, parts)}
    except Exception as err:
        # Best effort: report the offending value and skip it.
        print("[parse_output_entities] unable to parse ents", output_str, err)
        return None

In [None]:
def parse_output(output_str):
    """Parse a serialized list of annotations into entity dicts.

    Args:
        output_str: Python-literal text (read with ``ast.literal_eval``) that
            evaluates to an iterable of annotation strings.

    Returns:
        A list with one dict per annotation that ``parse_output_entities``
        could parse; annotations it rejects are dropped.
    """
    parsed = []
    for raw_annotation in ast.literal_eval(output_str):
        entity = parse_output_entities(raw_annotation)
        if entity:
            parsed.append(entity)
    return parsed

In [None]:
def format_ents(ents):
    """Render entity definitions as an ``<entities>`` block of JSON lines.

    Args:
        ents: Iterable of dicts that each provide ``entity`` and
            ``description`` keys.

    Returns:
        A string with one JSON object per line (keys ``name`` and
        ``description``) wrapped in ``<entities>`` / ``</entities>`` tags.
    """
    template = dedent("""
    <entities>
    {entities}
    </entities>
    """)
    rows = (
        json.dumps({"name": ent["entity"], "description": ent["description"]})
        for ent in ents
    )
    return template.format(entities="\n".join(rows)).strip()

In [None]:
def format_input(inp):
    """Wrap the raw input text in ``<input>`` / ``</input>`` tags."""
    return "<input>" + format(inp) + "</input>"

In [None]:
def get_system_prompt(ents):
    """Build the system prompt that introduces the NER task.

    Args:
        ents: Pre-formatted description of the entities to recognize; it is
            inserted verbatim after the instructions.

    Returns:
        The instruction text followed by ``ents``, with surrounding whitespace
        stripped.
    """
    template = dedent("""
You are a named entity recognition (NER) model.
Your task is to identify entities in the input text and return them in structured JSON.

Entities to recognize:
{entities}
""")
    prompt = template.format(entities=ents)
    return prompt.strip()

In [None]:
def get_completion(ents):
    """Render the assistant answer: ``ents`` as indented JSON in a json code fence.

    Args:
        ents: JSON-serializable object (here, the list of parsed entity dicts).

    Returns:
        The fenced JSON text without leading or trailing whitespace.
    """
    payload = json.dumps(ents, indent=4)
    fenced = dedent(f"""
```json
{payload}
```
    """)
    return fenced.strip()

In [None]:
class NERDataset(Dataset):
    """Map-style dataset that turns annotation rows into chat-formatted text.

    Column 0 of the wrapped frame is read as the input text and column 1 as
    the serialized annotations. Rendering relies on the module-level
    ``tokenizer`` and on the prompt helpers defined earlier in the notebook.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"text": ...}`` holding the rendered system/user/assistant turns."""
        raw_text = self.df.iloc[idx, 0]
        entities = parse_output(self.df.iloc[idx, 1])

        # Same call order as the three chat turns: system, user, assistant.
        system_prompt = get_system_prompt(format_ents(entities))
        user_message = format_input(raw_text)
        completion = get_completion(entities)

        turns = (
            ("system", system_prompt),
            ("user", user_message),
            ("assistant", completion),
        )
        conversation = [{"role": role, "content": content} for role, content in turns]

        # The assistant turn is already present, so no generation prompt is appended.
        rendered = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        return {"text": rendered}

In [None]:
# Import the Hugging Face Dataset under an alias. A plain `Dataset` import
# would rebind the torch.utils.data.Dataset name that NERDataset subclasses,
# so re-running the NERDataset cell afterwards would silently pick up the
# wrong base class.
from datasets import Dataset as HFDataset, DatasetDict

# Fixed seed so the 80/20 split is the same on every Restart & Run All.
SPLIT_SEED = 42

dataset = NERDataset(df)
# Keep the torch subsets under their own names: the generators below must keep
# iterating them even after `train_dataset` / `val_dataset` are assigned.
train_split, val_split = random_split(
    dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

def gtrain():
    """Yield every rendered training sample (dicts with a "text" key)."""
    for i in range(len(train_split)):
        yield train_split[i]

def gval():
    """Yield every rendered validation sample (dicts with a "text" key)."""
    for i in range(len(val_split)):
        yield val_split[i]

# Materialize both splits as Hugging Face datasets; these are the objects the
# trainer cell consumes.
train_dataset = HFDataset.from_generator(gtrain)
val_dataset = HFDataset.from_generator(gval)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Adapter rank; lora_alpha below is set to twice this value.
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    # Bias terms are not trained.
    bias="none",
    # Adapters are attached to the four attention projections and the three
    # MLP projections listed here.
    target_modules=[
        "k_proj",
        "q_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # The adapter is created for training, not for inference only.
    inference_mode=False
)

In [None]:
# Hub namespace and local output directory; combined below into the explicit
# target repository id for the automatic upload.
username = "4sp1d3r2"
output_dir = "smollm-135m-ner"

# Batch sizing: 24 samples per device, accumulated over 4 steps per optimizer update.
per_device_train_batch_size = 24
per_device_eval_batch_size = 24
gradient_accumulation_steps = 4
logging_steps = 1
learning_rate = 2e-3

# Optimization schedule.
max_grad_norm = 1.0
num_train_epochs = 5
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
# Token budget per (packed) training sequence.
max_seq_length = 1500

training_arguments = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # No intermediate checkpoints; the model is saved once after training.
    save_strategy="no",
    eval_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb",
    bf16=True,
    hub_private_repo=True,
    push_to_hub=True,
    # Name the destination repo explicitly so `username` actually takes effect
    # instead of relying on the default derived from output_dir.
    hub_model_id=f"{username}/{output_dir}",
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    packing=True,
    # max_seq_length used to be defined but never handed to the config, so the
    # library default length was applied. Pass it through so the setting is honoured.
    max_length=max_seq_length,
)

In [None]:
# Load the base checkpoint named by MODEL_NAME in bfloat16, requesting the
# flash_attention_2 attention implementation (the flash-attn package is
# installed in the second cell of the notebook).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    dtype="bfloat16",
    attn_implementation="flash_attention_2"
)

# The trainer receives the base model together with the LoRA config and the
# two Hugging Face datasets built earlier. The recorded cell output shows it
# adding EOS, tokenizing and packing both datasets at construction time.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

Adding EOS to train dataset:   0%|          | 0/8000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/8000 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/8000 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
import wandb

# Start the Weights & Biases run before training and record both the LoRA and
# the trainer configuration as run config. The API key comes from the
# WANDB_API_KEY environment variable set near the top of the notebook.
wandb.init(
    entity="4spy1337",
    project="smollm-135m-ner",
    config={
        "peft": peft_config.to_dict(),
        "args": training_arguments.to_dict(),
    },
    sync_tensorboard=True,
    save_code=True
)

[34m[1mwandb[0m: Currently logged in as: [33maspy1337[0m ([33m4spy1337[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Run the full fine-tuning loop; evaluation happens once per epoch
# (eval_strategy="epoch" in the training arguments).
trainer.train()
# save_strategy is "no", so this is the only point where the model is saved.
# The recorded output shows the adapter weights being uploaded at this step.
trainer.save_model()

Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.7268,0.734077,0.747453,2982646.0,0.850978
2,0.9615,0.692925,0.690798,5965292.0,0.859963
3,0.7326,0.678404,0.653155,8947938.0,0.862304
4,0.54,0.673842,0.64185,11930584.0,0.863422
5,0.6647,0.67343,0.634298,14913230.0,0.86356


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/78.2M [00:00<?, ?B/s]

In [None]:
# The first positional argument of push_to_hub is the commit message, not the
# repository id: the recorded CommitInfo shows the old repo-id string ending
# up as commit_message. Pass a real message by keyword; the destination repo
# is taken from the trainer's own configuration.
trainer.push_to_hub(commit_message=f"Upload {output_dir} LoRA adapter")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/4sp1d3r2/smollm-135m-ner/commit/698207aeeb2ce4ea1f8def15c1d149d07fdd2e02', commit_message='4sp1d3r2/smollm-135m-ner', commit_description='', oid='698207aeeb2ce4ea1f8def15c1d149d07fdd2e02', pr_url=None, repo_url=RepoUrl('https://huggingface.co/4sp1d3r2/smollm-135m-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='4sp1d3r2/smollm-135m-ner'), pr_revision=None, pr_num=None)

In [None]:
# Close the Weights & Biases run opened by wandb.init above; the recorded
# output is the run history and summary printed at this point.
wandb.finish()

0,1
eval/entropy,█▄▂▁▁
eval/loss,█▃▂▁▁
eval/mean_token_accuracy,▁▆▇██
eval/num_tokens,▁▃▅▆█
eval/runtime,█▁▄▁▁
eval/samples_per_second,▁█▅██
eval/steps_per_second,▁█▅██
train/entropy,██▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██

0,1
eval/entropy,0.6343
eval/loss,0.67343
eval/mean_token_accuracy,0.86356
eval/num_tokens,14913230.0
eval/runtime,30.0624
eval/samples_per_second,24.748
eval/steps_per_second,1.031
total_flos,1.125126036482688e+16
train/entropy,0.72239
train/epoch,5


In [None]:
# Build the inference prompt with the same helpers that produce the training
# samples, so the format the model sees here cannot drift from the one it was
# fine-tuned on. The rendered text is identical to the previous hand-written prompt.
entity_types = [
    {"entity": "Name", "description": "The name of a person"},
    {"entity": "Institute Name", "description": "The name of an institution"},
    {"entity": "Bank Name", "description": "The name of a bank"},
    {"entity": "Tool Name", "description": "The name of a tool"},
]
sample_text = "Sujatha is a recent graduate from Keymar Institute, who has a bank account in HDFC Bank. MLFlow is a great tool."

messages = [
    {
        "role": "system",
        "content": get_system_prompt(format_ents(entity_types)),
    },
    {
        "role": "user",
        "content": format_input(sample_text),
    },
]
# No assistant turn is supplied: add_generation_prompt=True asks the template
# to end with the assistant header so generation continues from there.
token_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")

In [None]:
# Put the model in eval mode explicitly, so LoRA dropout is off regardless of
# the state the trainer left it in.
model.eval()
with torch.inference_mode():
    # Plain beam search (no sampling) keeps the structured output reproducible
    # across runs; the previous beam-sampling call was unseeded.
    outs = model.generate(inputs=token_ids.to(model.device), do_sample=False, num_beams=5, max_new_tokens=200)

In [None]:
# Show the first returned sequence (prompt plus generated answer), special
# tokens included.
first_sequence = outs[0]
print(tokenizer.decode(first_sequence))

<|im_start|>system
You are a named entity recognition (NER) model.
Your task is to identify entities in the input text and return them in structured JSON.

Entities to recognize:
<entities>
{"name": "Name", "description": "The name of a person"}
{"name": "Institute Name", "description": "The name of an institution"}
{"name": "Bank Name", "description": "The name of a bank"}
{"name": "Tool Name", "description": "The name of a tool"}
</entities><|im_end|>
<|im_start|>user
<input>Sujatha is a recent graduate from Keymar Institute, who has a bank account in HDFC Bank. MLFlow is a great tool.</input><|im_end|>
<|im_start|>assistant
```json
[
    {
        "extract": "Sujatha",
        "entity": "Name",
        "description": "The name of a person"
    },
    {
        "extract": "Keymar Institute",
        "entity": "Institute Name",
        "description": "The name of an institution"
    },
    {
        "extract": "HDFC Bank",
        "entity": "Bank Name",
        "description": "The nam