In [None]:
!pip install datasets transformers torch accelerate peft sentencepiece

In [2]:
!pip install -U transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached transformers-4.52.4-py3-none-any.whl (10.5 MB)
Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.36.2
    Uninstalling transformers-4.36.2:
      Successfully uninstalled transformers-4.36.2
Successfully installed tokenizers-0.21.1 transformers-4.52.4


In [3]:
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq,AutoTokenizer,AutoModelForSeq2SeqLM
import torch
import random
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
en_tox_ds = load_dataset("textdetox/multilingual_toxicity_dataset",split="en")
ru_tox_ds = load_dataset("textdetox/multilingual_toxicity_dataset",split="ru")
uk_tox_ds = load_dataset("textdetox/multilingual_toxicity_dataset",split="uk")

Generating en split: 100%|██████████| 5000/5000 [00:00<00:00, 832170.15 examples/s]
Generating ru split: 100%|██████████| 5000/5000 [00:00<00:00, 1676246.50 examples/s]
Generating uk split: 100%|██████████| 5000/5000 [00:00<00:00, 1829975.57 examples/s]
Generating de split: 100%|██████████| 5000/5000 [00:00<00:00, 1177844.43 examples/s]
Generating es split: 100%|██████████| 5000/5000 [00:00<00:00, 1376806.72 examples/s]
Generating am split: 100%|██████████| 5000/5000 [00:00<00:00, 1364267.50 examples/s]
Generating zh split: 100%|██████████| 5000/5000 [00:00<00:00, 2239111.68 examples/s]
Generating ar split: 100%|██████████| 5000/5000 [00:00<00:00, 1500108.73 examples/s]
Generating hi split: 100%|██████████| 5000/5000 [00:00<00:00, 1008585.58 examples/s]
Generating it split: 100%|██████████| 5000/5000 [00:00<00:00, 1569607.07 examples/s]
Generating fr split: 100%|██████████| 5000/5000 [00:00<00:00, 1768554.56 examples/s]
Generating he split: 100%|██████████| 2011/2011 [00:00<00:00, 1025

In [5]:
en_non_toxic = [en for en in en_tox_ds if en['toxic']==0]
ru_non_toxic = [ru for ru in ru_tox_ds if ru['toxic']==0]
uk_non_toxic = [uk for uk in uk_tox_ds if uk['toxic']==0]

random.seed(42)
en_id = random.sample(en_non_toxic, 2000)
ru_id = random.sample(ru_non_toxic, 1000)
uk_id = random.sample(uk_non_toxic, 470)

In [6]:
def build_identity_pairs(samples, lang_code):
    return [{"toxic": s['text'], "clean": s['text'], "lang": lang_code} for s in samples]

all_identity_pairs = (
    build_identity_pairs(en_id, "en") +
    build_identity_pairs(ru_id, "ru") +
    build_identity_pairs(uk_id, "uk")
)
identity_dataset = Dataset.from_list(all_identity_pairs)

In [7]:
en_ds = load_dataset("s-nlp/paradetox", split="train")
ru_ds = load_dataset("s-nlp/ru_paradetox", split="train")
uk_ds = load_dataset("textdetox/uk_paradetox",split="train")

Generating train split: 100%|██████████| 19744/19744 [00:00<00:00, 670930.97 examples/s]
Generating train split: 100%|██████████| 11090/11090 [00:00<00:00, 281870.49 examples/s]
Generating validation split: 100%|██████████| 1116/1116 [00:00<00:00, 193559.25 examples/s]
Generating train split: 100%|██████████| 3893/3893 [00:00<00:00, 273700.52 examples/s]


In [8]:
COL_MAP = {
    "en": ("en_toxic_comment", "en_neutral_comment"),
    "ru": ("ru_toxic_comment", "ru_neutral_comment"),
    "uk": ("toxic_sentence",  "neutral_sentence"),
    "es": ("toxic_sentence",  "neutral_sentence"),
    "zh": ("toxic_sentence",  "neutral_sentence")
}

def make_strip(lang):
    s, t = COL_MAP[lang]
    return lambda ex: {
        "toxic": ex[s],
        "clean": ex[t],
        "lang": lang  
    }

In [9]:
en_ds = en_ds.map(make_strip("en"), remove_columns=en_ds.column_names)
ru_ds = ru_ds.map(make_strip("ru"), remove_columns=ru_ds.column_names)
uk_ds = uk_ds.map(make_strip("uk"), remove_columns=uk_ds.column_names)
toxic_dataset = concatenate_datasets([en_ds, ru_ds, uk_ds])

Map: 100%|██████████| 19744/19744 [00:00<00:00, 47841.74 examples/s]
Map: 100%|██████████| 11090/11090 [00:00<00:00, 46197.27 examples/s]
Map: 100%|██████████| 3893/3893 [00:00<00:00, 46604.57 examples/s]


In [10]:
full_ds = concatenate_datasets([identity_dataset,toxic_dataset]).shuffle(seed=42)
split = full_ds.train_test_split(test_size=0.05)
train_ds = split['train']
dev_ds = split['test']

In [29]:
base_model_name = "bigscience/mt0-large"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base = AutoModelForSeq2SeqLM.from_pretrained(base_model_name, device_map="auto",torch_dtype=torch.float32)

In [30]:
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=32, #64
    lora_alpha=64, #128
    target_modules=["q","v","k","o"],
    lora_dropout=0.05 #0.05
)
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()

trainable params: 18,874,368 || all params: 1,248,455,680 || trainable%: 1.5118


In [31]:
LANG_PROMPT_MAP = {
    "en": "Rewrite the sentence by replacing toxic or offensive words with neutral and polite expressions. Preserve the original meaning",
    "ru": "Перепишите предложение, заменив токсичные или грубые слова на нейтральные и вежливые. Смысл предложения должен сохраняться.",
    "uk": "Перепишіть речення, замінивши токсичні або образливі слова на нейтральні й ввічливі. Зміст має залишатися незмінним.",
    "zh": "请将句子中的粗俗或攻击性词语改写为中性、礼貌的表达，保留原句意思，避免不必要的删改。"
}

def add_lang_prefix(example):
    prefix = LANG_PROMPT_MAP.get(example["lang"], LANG_PROMPT_MAP["en"])
    model_inputs = tokenizer(
        prefix + example["toxic"],
        truncation=True,
        padding="longest",
        max_length=256
    )
    targets = tokenizer(
        text_target=example["clean"],
        truncation=True,
        padding="longest",
        max_length=256
    )
    input_ids = targets["input_ids"]
    labels = input_ids if isinstance(input_ids[0], int) else input_ids[0]
    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
    model_inputs["labels"] = labels
    return model_inputs

In [32]:
train_tok = train_ds.map(add_lang_prefix, remove_columns=train_ds.column_names)
train_tok.set_format("torch")

dev_tok = dev_ds.map(add_lang_prefix, remove_columns=dev_ds.column_names)
dev_tok.set_format("torch")

In [33]:
from transformers import Seq2SeqTrainer,EarlyStoppingCallback 
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

# ✅ 训练参数
args = Seq2SeqTrainingArguments(
    output_dir="ckpt_mt0_large_largelearning_Low",
    #batch size
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    #training epoch
    num_train_epochs=3,
    #优化器
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_steps=500,
    weight_decay=0.01,
    label_smoothing_factor=0.1, 
    # ---------- 日志 / 保存 ----------
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    #——————精度
    fp16=False,
    bf16=True,  
    half_precision_backend="auto", 
    #
    report_to="none",
    predict_with_generate=True,
    remove_unused_columns=False,
)

class MyTrainer(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs.loss  
        return (loss, outputs) if return_outputs else loss

trainer = MyTrainer( 
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=dev_tok,
    data_collator=collator,
    tokenizer=tokenizer,
)

trainer.train()

model.save_pretrained("mt0l-lora-adapter-largelearning_low")
tokenizer.save_pretrained("mt0l-lora-adapter-largelearning_low")

  trainer = MyTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
500,1.1424,0.946961
1000,1.0558,0.863434
1500,0.987,0.847739
2000,0.993,0.808196
2500,0.9642,0.797166
3000,0.9292,0.783389
3500,0.8867,0.779446
4000,0.8587,0.777512
4500,0.8985,0.77009
5000,0.8667,0.768276


('mt0l-lora-adapter-largelearning_low/tokenizer_config.json',
 'mt0l-lora-adapter-largelearning_low/special_tokens_map.json',
 'mt0l-lora-adapter-largelearning_low/spiece.model',
 'mt0l-lora-adapter-largelearning_low/added_tokens.json',
 'mt0l-lora-adapter-largelearning_low/tokenizer.json')