In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
from transformers import BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("wnut_17")

In [5]:
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
model = prepare_model_for_kbit_training(model)

Fetching 3 files: 100%|██████████| 3/3 [15:03<00:00, 301.04s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.50s/it]


In [17]:
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)



In [18]:
def format_ner_example(example):
    tokens = example['tokens']
    labels = example['ner_tags']
    label_names = dataset["train"].features["ner_tags"].feature.names

    text = " ".join(tokens)
    entity_output = ""
    for token, tag in zip(tokens, labels):
        if tag != 0:  # 'O' is 0
            entity_output += f"- {token}: {label_names[tag]}\n"
    if not entity_output:
        entity_output = "No named entities found."

    prompt = f"Extract named entities from the following text:\n\nText: {text}\n\nAnswer:\n{entity_output}"
    return {"text": prompt}

In [19]:
dataset = dataset.map(format_ner_example)
dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True)


Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map: 100%|██████████| 3394/3394 [00:00<00:00, 6847.78 examples/s]
Map: 100%|██████████| 1009/1009 [00:00<00:00, 6053.75 examples/s]
Map: 100%|██████████| 1287/1287 [00:00<00:00, 6813.95 examples/s]
Map: 100%|██████████| 3394/3394 [00:00<00:00, 9815.45 examples/s]
Map: 100%|██████████| 1009/1009 [00:00<00:00, 10251.32 examples/s]
Map: 100%|██████████| 1287/1287 [00:00<00:00, 9519.97 examples/s]


In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1287
    })
})

In [20]:
training_args = TrainingArguments(
    output_dir="./llama3-ner",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    save_steps=500,
    fp16=True,
    report_to="none",
)

In [21]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"].select(range(1000)),
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [23]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
10,3.7703
20,3.0765
30,2.1527
40,1.9592
50,1.8721
60,1.9104
70,2.0038
80,1.9056
90,1.8158
100,1.8758


TrainOutput(global_step=375, training_loss=1.785125826517741, metrics={'train_runtime': 5068.4984, 'train_samples_per_second': 0.592, 'train_steps_per_second': 0.074, 'total_flos': 6.557653794816e+16, 'train_loss': 1.785125826517741, 'epoch': 3.0})

In [24]:
trainer.save_model()

In [27]:
trainer.evaluate()

{'eval_loss': 1.659113883972168,
 'eval_runtime': 519.9819,
 'eval_samples_per_second': 1.94,
 'eval_steps_per_second': 0.244,
 'epoch': 3.0}