In [2]:
!pip install wandb
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch] -U

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import json

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import wandb
api = ""
wandb.login(key=api)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
run_name = "gpt2-ru"
wandb.init(project="detox", name=run_name, entity="speech_sanitizers")

[34m[1mwandb[0m: Currently logged in as: [33mchenxinwang[0m ([33mspeech_sanitizers[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
def load_and_preprocess_data(toxic_filepath, detox_filepath):
    with open(toxic_filepath, 'r', encoding='utf-8') as file:
        toxic_data = [json.loads(line) for line in file]
    with open(detox_filepath, 'r', encoding='utf-8') as file:
        detox_data = [json.loads(line) for line in file]

    data = {
        'toxic_text': [toxic['text'] for toxic in toxic_data],
        'detoxified_text': ["detoxify: " + detox['text'] for detox in detox_data]
    }

    return Dataset.from_dict(data)

In [None]:
# en_train_toxic = "/content/drive/MyDrive/colx531_project/data/en_train_input.jsonl"
# en_train_detox = "/content/drive/MyDrive/colx531_project/data/en_train_gold.jsonl"
# en_train = load_and_preprocess_data(en_train_toxic, en_train_detox)
# en_train

In [8]:
ru_train_toxic = "/content/drive/MyDrive/colx531_project/data/ru_train_input.jsonl"
ru_train_detox = "/content/drive/MyDrive/colx531_project/data/ru_train_gold.jsonl"
ru_train = load_and_preprocess_data(ru_train_toxic, ru_train_detox)
ru_train

Dataset({
    features: ['toxic_text', 'detoxified_text'],
    num_rows: 11090
})

In [9]:
from transformers import GPT2Tokenizer

def preprocess_function(examples):
    model_inputs = tokenizer(examples['toxic_text'], padding="max_length", truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['detoxified_text'], padding="max_length", truncation=True, max_length=128)["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs


In [10]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<PAD>', padding_side='left')
# tokenized_dataset = en_train.map(preprocess_function, batched=True)
tokenized_dataset = ru_train.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/11090 [00:00<?, ? examples/s]



In [11]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

def fine_tune_model(tokenized_dataset):
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))

    model.to(device)

    training_args = TrainingArguments(
        output_dir="/content/drive/MyDrive/colx531_project/output",
        num_train_epochs=1,
        per_device_train_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="/content/drive/MyDrive/colx531_project/models/model_logs",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    trainer.train()

    return model

In [12]:
# Run this code block when finetuning the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = fine_tune_model(tokenized_dataset)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,3.9281
1000,2.0372
1500,1.9571
2000,1.9076
2500,1.9295


In [None]:
# Run this code block when using an existing model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.load('/content/drive/MyDrive/colx531_project/models/en-gpt2-1.pth')
model.to(device)

In [13]:
def generate_detoxified_text(model, tokenizer, text, device):
    encoded_input = tokenizer.encode_plus(
        "detoxify: " + text,
        return_tensors="pt",
        add_special_tokens=True,
        max_length=100,
        padding='max_length',
        truncation=True
    )
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    output_ids = model.generate(
      input_ids,
      attention_mask=attention_mask,
      max_new_tokens=50,
      # temperature=0.9,
      # top_k=50,
      # top_p=0.95,
      # no_repeat_ngram_size=2,
      num_return_sequences=1,
      pad_token_id=tokenizer.pad_token_id
    ).squeeze()

    detoxified_text = tokenizer.decode(output_ids, skip_special_tokens=True)

    return detoxified_text


In [14]:
device = model.device
sample_toxic_text = "пиздеж! температуры горения хватит чтобы её расплавить к херам.."
detoxified_text = generate_detoxified_text(model, tokenizer, sample_toxic_text, device)
print("Example output:", detoxified_text)

Example output: detoxify: пиздеж! температуры горения хватит чтобы её расплавить к херам.. � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � �


## Load validation data and generate output

In [15]:
# file_path = "/content/drive/MyDrive/colx531_project/data/en_valid_gold.jsonl"
file_path = "/content/drive/MyDrive/colx531_project/data/ru_valid_gold.jsonl"
with open(file_path, 'r', encoding='utf-8') as f:
  validation_data = [json.loads(line) for line in f]

In [16]:
def detoxify_output(validation_data, model, tokenizer, device):
    detoxified_outputs = []
    for item in validation_data:
        detoxified_text = generate_detoxified_text(model, tokenizer, item['text'], device)
        detoxified_outputs.append({
            'id': item['id'],
            'original_text': item['text'],
            'detoxified_text': detoxified_text
        })
    return detoxified_outputs

In [17]:
detoxified_outputs = detoxify_output(validation_data, model, tokenizer, device)

for item in detoxified_outputs:
    item['detoxified_text'] = item['detoxified_text'].replace('detoxify: ', '')

output_path = "/content/drive/MyDrive/colx531_project/output/ru_valid_output_gpt2_1.jsonl"
with open(output_path, 'w', encoding='utf-8') as f:
    for item in detoxified_outputs:
        f.write(json.dumps({"id": item['id'], "text": item['detoxified_text']}, ensure_ascii=False) + '\n')

In [18]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▃▄▆▇█
train/global_step,▁▃▄▆▇█
train/grad_norm,▁▁▁█▃
train/learning_rate,█▆▅▃▁
train/loss,█▁▁▁▁

0,1
total_flos,724432158720000.0
train/epoch,1.0
train/global_step,2773.0
train/grad_norm,38.86938
train/learning_rate,1e-05
train/loss,1.9295
train_loss,2.30829
train_runtime,762.7373
train_samples_per_second,14.54
train_steps_per_second,3.636


In [None]:
# Generate output for dev data
file_path = "/content/drive/MyDrive/colx531_project/data/en_dev.jsonl"
with open(file_path, 'r', encoding='utf-8') as f:
  validation_data = [json.loads(line) for line in f]

detoxified_outputs = detoxify_output(validation_data, model, tokenizer, device)

for item in detoxified_outputs:
    item['detoxified_text'] = item['detoxified_text'].replace('detoxify: ', '')

output_path = "/content/drive/MyDrive/colx531_project/output/en_dev_output_finetuned-gpt2-1.jsonl"
with open(output_path, 'w', encoding='utf-8') as f:
    for item in detoxified_outputs:
        f.write(json.dumps({"id": item['id'], "text": item['detoxified_text']}, ensure_ascii=False) + '\n')

In [19]:
torch.save(model, '/content/drive/MyDrive/colx531_project/models/ru-gpt2-1.pth')

## Evaluation

In [None]:
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

'en_US.UTF-8'

In [26]:
import subprocess

subprocess.run(['pip', 'install', 'sacrebleu'], check=True)
subprocess.run(['pip', 'install', 'sentence_transformers'], check=True)

CompletedProcess(args=['pip', 'install', 'sentence_transformers'], returncode=0)

In [27]:
run_name = "gpt2-eval-ru"
wandb.init(project="detox", name=run_name, entity="speech_sanitizers")

In [28]:
import subprocess

cmd = [
    'python', '/content/drive/MyDrive/colx531_project/evaluation_script/evaluate.py',
    '--input', '/content/drive/MyDrive/colx531_project/data/ru_valid_input.jsonl',
    '--golden', '/content/drive/MyDrive/colx531_project/data/ru_valid_gold.jsonl',
    '--prediction', '/content/drive/MyDrive/colx531_project/output/ru_valid_output_gpt2_1.jsonl'
]

# result = subprocess.run(cmd, capture_output=True, text=True)
result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
if result.stderr:
    print(result.stderr)



modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]
modules.json: 100%|██████████| 461/461 [00:00<00:00, 1.55MB/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]
config_sentence_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 425kB/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]
README.md: 100%|██████████| 2.22k/2.22k [00:00<00:00, 8.94MB/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 189kB/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]
config.json: 100%|██████████| 804/804 [00:00<00:00, 3.27MB/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]
model.safetensors:   1%|          | 10.5M/1.88G [00:00<00:20, 92.3MB/s]
model.safetensors:   1%|          | 21.0M/1.88G [00:00<00:22, 81.3MB/s]
model.safetensors:   2%|▏         | 41.9M/1.88G [00:00<00:16, 113MB/s] 
model.safetensors:   4%|▍         | 73.4M/1.88G [00:0

In [30]:
if result.stdout:
    print(result.stdout)

measure{
  key: "STA"
  value: "0.8937522768974304"
}
measure{
  key: "SIM"
  value: "0.77951097183971"
}
measure{
  key: "CHRF"
  value: "0.8668465496664615"
}
measure{
  key: "J"
  value: "0.6000772794743386"
}



In [None]:
# !python /content/drive/MyDrive/colx531_project/evaluation_script/evaluate.py \
#       --input=/content/drive/MyDrive/colx531_project/data/en_valid_input.jsonl \
#       --golden=/content/drive/MyDrive/colx531_project/data/en_valid_gold.jsonl \
#       --prediction=/content/drive/MyDrive/colx531_project/output/en_valid_output_gpt2_1.jsonl

In [31]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))