In [2]:
!pip install transformers datasets evaluate seqeval bitsandbytes trl



In [3]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer


In [4]:
dataset = load_dataset("ai4privacy/pii-masking-200k")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

english_pii_43k.jsonl:   0%|          | 0.00/73.8M [00:00<?, ?B/s]

french_pii_62k.jsonl:   0%|          | 0.00/116M [00:00<?, ?B/s]

german_pii_52k.jsonl:   0%|          | 0.00/97.8M [00:00<?, ?B/s]

italian_pii_50k.jsonl:   0%|          | 0.00/93.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/209261 [00:00<?, ? examples/s]

In [5]:
reduced_dataset = dataset['train'].select(range(1000))
# reduced_dataset_dict = {"train": reduced_dataset}

In [6]:
#   if "validation" in dataset:
#     reduced_val = dataset["validation"].select(range(min(300, len(dataset["validation"]))))
#     reduced_dataset_dict["validation"] = reduced_val
# else:
#     splits = reduced_dataset.train_test_split(test_size=0.1)
#     reduced_dataset_dict = {
#         "train": splits["train"],
#         "validation": splits["test"]
#     }

In [7]:
splits = reduced_dataset.train_test_split(test_size=0.1)
train_dataset = splits["train"]
val_dataset = splits["test"]

In [8]:
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Train dataset size: 900
Validation dataset size: 100


In [9]:
def format_instruction(example):
    """Format the data as an instruction for the model"""
    # Format: Input text followed by expected output
    instruction = f"""Anonymize the following text by replacing personally identifiable information (PII) with appropriate placeholders:

Text: {example['source_text']}

Anonymized text:"""

    # The target output is the anonymized text
    response = example['target_text']

    return {
        "instruction": instruction,
        "input": "",  # No additional input
        "output": response
    }

In [10]:
train_dataset = train_dataset.map(format_instruction)
val_dataset = val_dataset.map(format_instruction)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
print("\nSample instruction:")
print(train_dataset[0]["instruction"])
print("\nSample output:")
print(train_dataset[0]["output"])


Sample instruction:
Anonymize the following text by replacing personally identifiable information (PII) with appropriate placeholders:

Text: Hello Queenie, we will be hosting a preventive medicine seminar at building 74425 in the Southeast wing. Would love to have you there!

Anonymized text:

Sample output:
Hello [FIRSTNAME], we will be hosting a preventive medicine seminar at building [BUILDINGNUMBER] in the [ORDINALDIRECTION] wing. Would love to have you there!


In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [13]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [14]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [15]:
model = prepare_model_for_kbit_training(model)

In [16]:
peft_config = LoraConfig(
    r=16,                    # Rank
    lora_alpha=32,           # Alpha parameter for LoRA scaling
    lora_dropout=0.05,       # Dropout probability for LoRA layers
    bias="none",             # Bias type for LoRA
    task_type="CAUSAL_LM",   # Task type
    target_modules=[         # Target modules to apply LoRA
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [17]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898


In [18]:
training_args = TrainingArguments(
    output_dir="./results_llama2_pii",
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Keep this to maximize batch size if GPU memory allows
    per_device_eval_batch_size=4,   # Same for evaluation batch size
    gradient_accumulation_steps=1,   # Remove gradient accumulation for speed
    gradient_checkpointing=False,    # Disable gradient checkpointing for faster training
    logging_steps=25,               # Log less frequently to reduce overhead
    save_steps=0,                  # Save checkpoints less frequently
    weight_decay=0.001,              # Keep weight decay
    fp16=True,                       # Keep mixed precision for speed
    bf16=False,                      # Keep fp16 since you're not using bf16
    max_grad_norm=0.3,               # Increase max grad norm slightly for stability
    warmup_ratio=0.03,               # Keep this for warmup (usually improves stability)
    report_to="none",                # No reporting during training
    eval_strategy="no",              # Disable evaluation during training to speed up
    do_eval=False,                   # Disable evaluation during training
)

In [19]:
from trl import SFTTrainer

# Tokenization function
def tokenize_function(example):
    tokens = tokenizer(
        example["instruction"] + "\n" + example["output"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Tokenize datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, remove_columns=train_dataset.column_names)
tokenized_val_dataset = val_dataset.map(tokenize_function, remove_columns=val_dataset.column_names)

# Final trainer setup (only supported args)
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    peft_config=peft_config,
)


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
25,1.8735
50,0.4757
75,0.3835
100,0.3536
125,0.3271
150,0.3304
175,0.3156
200,0.3237
225,0.3137


TrainOutput(global_step=225, training_loss=0.5218610911899143, metrics={'train_runtime': 1539.3578, 'train_samples_per_second': 0.585, 'train_steps_per_second': 0.146, 'total_flos': 1.83785119285248e+16, 'train_loss': 0.5218610911899143})

In [21]:
# model.save_pretrained("/content/model")
# tokenizer.save_pretrained("/content/model_tokenized")

In [24]:
trainer.save_model("./results_llama2_pii")
tokenizer.save_pretrained("./results_llama2_pii")


('./results_llama2_pii/tokenizer_config.json',
 './results_llama2_pii/special_tokens_map.json',
 './results_llama2_pii/tokenizer.model',
 './results_llama2_pii/added_tokens.json',
 './results_llama2_pii/tokenizer.json')

In [27]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write

In [28]:
model.push_to_hub("ramgovindv/llama2-pii-finetuned")
tokenizer.push_to_hub("ramgovindv/llama2-pii-finetuned")

adapter_model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ramgovindv/llama2-pii-finetuned/commit/b38af49ad7b6675224425750ddfdb0e95d29a4c9', commit_message='Upload tokenizer', commit_description='', oid='b38af49ad7b6675224425750ddfdb0e95d29a4c9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ramgovindv/llama2-pii-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='ramgovindv/llama2-pii-finetuned'), pr_revision=None, pr_num=None)