In [1]:
# Install the tutorial's dependencies.
# %pip (rather than !pip) is used for every package so that the installation
# always targets the environment of the running kernel.
%pip install ruprompts
%pip install -U accelerate
%pip install datasets

Collecting ruprompts
  Downloading ruprompts-0.1.4-py3-none-any.whl (29 kB)
Collecting torch<2.0.0,>=1.10.0 (from ruprompts)
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtyping<0.2.0,>=0.1.4 (from ruprompts)
  Downloading torchtyping-0.1.4-py3-none-any.whl (17 kB)
Collecting transformers<5.0.0,>=4.6.0 (from ruprompts)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typeguard<3.0.0,>=2.13.3 (from ruprompts)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch<2.0.0,>=1.10.0->ruprompts)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# ruPROMPTs tutorial

This [tutorial](https://github.com/ai-forever/ru-prompts/blob/main/notebooks/detox-russe-train-python.ipynb) presents an example of prompt-tuning with the ruPROMPTs framework for the detoxification task.

In [4]:
# Download the train and dev splits of the RUSSE 2022 detox data.
# -O pins the output file name, so re-running this cell overwrites the existing
# files instead of leaving them in place and saving the fresh copies as
# train.tsv.1 / dev.tsv.1.
!wget -O train.tsv https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/train.tsv
!wget -O dev.tsv https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/dev.tsv

--2023-07-02 12:05:40--  https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1902888 (1.8M) [text/plain]
Saving to: ‘train.tsv’


2023-07-02 12:05:40 (61.4 MB/s) - ‘train.tsv’ saved [1902888/1902888]

--2023-07-02 12:05:40--  https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/dev.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 200691 (196K) [text/plain]
Saving to: ‘dev.tsv’


2023-07-02 12:05:40 (18.4 MB/s) - ‘dev

In [5]:
import pandas as pd

# Remove the "index" column from the training TSV and write the file back in
# place, so the file read by later cells no longer contains that column.
df = pd.read_csv("train.tsv", sep="\t")
# errors="ignore" keeps the cell re-runnable: on a second run the column has
# already been removed from train.tsv and a plain drop would raise KeyError.
df = df.drop(columns=["index"], errors="ignore")
df.to_csv("train.tsv", index=False, sep="\t")

## Training

In [6]:
import os
# Make only the GPU with index 0 visible to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

Load the dataset:

In [7]:
from datasets import load_dataset

# Map each split name to the tab-separated file that backs it.
split_files = {"train": "train.tsv", "validation": "dev.tsv"}

# Parse both files with the generic "csv" loader, using a tab as the delimiter.
datasets = load_dataset("csv", data_files=split_files, sep="\t")
train_dataset, valid_dataset = datasets["train"], datasets["validation"]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-768c831ea1e345a6/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-768c831ea1e345a6/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Load the backbone:

In [8]:
from transformers import GPT2LMHeadModel, AutoTokenizer

# Identifier of the pretrained backbone checkpoint shared by model and tokenizer.
backbone_id = "sberbank-ai/rugpt3large_based_on_gpt2"

model = GPT2LMHeadModel.from_pretrained(backbone_id)

# "<pad>" is registered as both the padding token and the end-of-sequence token.
special_tokens = {"pad_token": "<pad>", "eos_token": "<pad>"}
tokenizer = AutoTokenizer.from_pretrained(backbone_id, **special_tokens)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Define the prompt format:

In [9]:
from ruprompts import PromptFormat

# Template of the model input: the dataset's `toxic_comment` field is placed
# between two <P*N> blocks.
# NOTE(review): <P*100> / <P*20> presumably expand to 100 and 20 trainable
# prompt tokens - confirm against the ruprompts documentation.
prompt_template = "<P*100>{toxic_comment}<P*20>"
prompt_format = PromptFormat(prompt_template)

Define the parametrization of trainable embeddings:

In [10]:
from ruprompts import TensorPromptProvider
from transformers import set_seed

# Seed the random number generators before the provider is created
# (presumably so that its initial state is reproducible - TODO confirm).
seed = 1
set_seed(seed)

# Parametrization of the trainable prompt embeddings.
prompt_provider = TensorPromptProvider()

Compose the prompt format and the prompt provider into a prompt object and apply it to the model and tokenizer, i.e. add the special tokens to the tokenizer and modify the model's input embedding layer:

In [11]:
from ruprompts import Prompt

# Bundle the prompt template and its trainable parametrization into one object.
prompt = Prompt(prompt_format, prompt_provider)
# Apply the prompt to the backbone. Per the tutorial text, this adds the special
# tokens to the tokenizer and modifies the model's input embedding layer, so
# `model` and `tokenizer` are changed by this call.
prompt.patch(model, tokenizer)

Preprocess the data:
1. format the data entries with the specified prompt format
2. tokenize the resulting sequences
3. truncate the `truncation_field` if sequence length exceeds `max_tokens`

In [12]:
from ruprompts import Text2TextPreprocessor

# Settings for turning a raw dataset row into a tokenized training example:
# the row is formatted with `prompt_format`, "neutral_comment1" is the target,
# and "toxic_comment" is the field that gets truncated when the sequence would
# exceed `max_tokens`.
preprocessor_options = {
    "prompt_format": prompt_format,
    "tokenizer": tokenizer,
    "target_field": "neutral_comment1",
    "max_tokens": 1792,
    "truncation_field": "toxic_comment",
}
preprocessor = Text2TextPreprocessor(**preprocessor_options)

# Preprocess both splits (train first, then validation) and rebind the names.
train_dataset, valid_dataset = (
    split.map(preprocessor) for split in (train_dataset, valid_dataset)
)

Map:   0%|          | 0/6948 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Define training arguments:

In [13]:
from transformers import TrainingArguments

# Evaluation, checkpointing and logging all run on fixed step intervals.
step_schedule = {
    "evaluation_strategy": "steps",
    "eval_steps": 1000,
    "save_strategy": "steps",
    "save_steps": 1000,
    "logging_strategy": "steps",
    "logging_steps": 100,
}

# Per-device batch sizes; gradients are applied after every batch.
batching = {
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 1,
}

training_args = TrainingArguments(
    output_dir=".",  # checkpoints are written to the working directory
    save_total_limit=2,
    metric_for_best_model="eval_loss",
    learning_rate=0.1,
    max_steps=100000,
    seed=1,
    logging_dir="logs",
    report_to="tensorboard",
    # report_to=["tensorboard", "wandb"],  # uncomment to log to WandB
    **batching,
    **step_schedule,
)

Choose optimization options:

In [14]:
# transformers' own AdamW is deprecated in favour of the PyTorch implementation,
# so the optimizer class is taken from torch.optim.
from torch.optim import AdamW
from transformers.optimization import get_linear_schedule_with_warmup

# Only the prompt provider's parameters are handed to the optimizer.
# eps and weight_decay are set explicitly to the defaults of the deprecated
# transformers.AdamW (1e-6 and 0.0); torch.optim.AdamW would otherwise use
# 1e-8 and 0.01.
optimizer = AdamW(
    prompt_provider.parameters(),
    lr=training_args.learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# Learning rate: linear warm-up for the first 2000 steps, then linear decay
# over the remaining training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=2000,
    num_training_steps=training_args.max_steps,
)



Define the callbacks and start training:

In [None]:
from transformers import Trainer
from ruprompts.callbacks import (
    FreezeTransformerUnfreezePrompt,
    ReduceCheckpoint,
    SavePretrainedPrompt,
)

# Callbacks provided by ruprompts. Judging by their names they freeze the
# backbone while leaving the prompt trainable, shrink the saved checkpoints and
# store the prompt alongside them - TODO confirm against the ruprompts docs.
prompt_tuning_callbacks = [
    FreezeTransformerUnfreezePrompt(),
    ReduceCheckpoint(),
    SavePretrainedPrompt(prompt),
]

# The optimizer/scheduler pair built above is passed in explicitly, and batches
# are assembled by the preprocessor's collate function.
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": valid_dataset,
    "data_collator": preprocessor.collate_fn(),
    "optimizers": (optimizer, scheduler),
    "callbacks": prompt_tuning_callbacks,
}
trainer = Trainer(**trainer_options)

trainer.train()

## Inference

Load prompt from the last checkpoint:

In [None]:
from transformers import pipeline

# Checkpoint directory named after the final training step.
last_checkpoint_dir = f"./checkpoint-{training_args.max_steps}"
prompt = Prompt.from_pretrained(last_checkpoint_dir)

# Generation pipeline that combines the loaded prompt with the backbone model
# and tokenizer; device=0 selects the first visible GPU.
ppln = pipeline(
    "text2text-generation-with-prompt",
    prompt=prompt,
    model=model,
    tokenizer=tokenizer,
    device=0,
)

In [None]:
# Smoke test: run one toxic sentence through the pipeline without sampling.
sample_input = {"toxic_comment": "Ублюдок, мать твою, а ну иди сюда"}
ppln(sample_input, do_sample=False)

Run inference:

In [None]:
from tqdm import tqdm
import transformers

# Only show transformers log messages of level ERROR while generating.
transformers.logging.set_verbosity_error()

# Beam width; every beam is also returned as a candidate rewrite.
beam_count = 10

predictions = []

for toxic_comment in tqdm(valid_dataset["toxic_comment"]):
    candidates = ppln(
        {"toxic_comment": toxic_comment},
        do_sample=False,
        num_beams=beam_count,
        num_return_sequences=beam_count,
    )

    # Strip the padding token text from every generated candidate.
    options = [candidate["generated_text"].replace("<pad>", "") for candidate in candidates]
    # Keep the longest candidate; sorted() is stable, so among equally long
    # candidates the one returned last by the pipeline is chosen.
    answer = sorted(options, key=len)[-1]
    predictions.append(answer)

# Write one prediction per line (embedded newlines become spaces).
# The encoding is fixed to UTF-8 because the predictions are Russian text and
# the platform's default encoding may not be able to represent them.
with open("subm.txt", "w", encoding="utf-8") as f:
    f.writelines(prediction.replace("\n", " ") + "\n" for prediction in predictions)