In [None]:
!pip install transformers
!pip install torchtext torch

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.5/7.5 MB 17.3 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 8.4 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 29.3 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 34.2 MB/s eta 0:00:0

In [None]:
!unzip data

Archive:  data.zip
  inflating: data.csv                


In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Identifier of the pretrained checkpoint that is fine-tuned in this notebook.
model_name = "tinkoff-ai/ruDialoGPT-medium"

# Load the tokenizer and the causal language model for that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import pandas as pd

# Build the plain-text training corpus from the dialogue table.
# Each CSV row becomes one sample: the three context turns (oldest first) and
# the response, one per line, followed by an "<|endoftext|>" separator line.
df = pd.read_csv("data.csv")

# Column order defines the line order inside each sample.
DIALOG_COLUMNS = ["context_3", "context_2", "context_1", "response"]

# Collect samples in a list and join once at the end instead of growing a
# string with `+=` on every row; `itertuples` avoids building a Series per row.
samples = []
for raw_turns in df[DIALOG_COLUMNS].itertuples(index=False, name=None):
    # Missing cells (NaN) are written as empty lines.
    context_3, context_2, context_1, response = (
        "" if pd.isna(turn) else turn for turn in raw_turns
    )

    # Skip rows in which every turn is empty.
    if context_3 or context_2 or context_1 or response:
        samples.append(
            f"{context_3}\n{context_2}\n{context_1}\n{response}\n<|endoftext|>\n"
        )

text_data = "".join(samples)

with open("text_data.txt", "w", encoding='utf-8') as f:
    f.write(text_data)

In [None]:
# Replace whatever copy of accelerate is present with the pinned 0.22.0 release.
# NOTE(review): this runs after `transformers` was imported in an earlier cell;
# a kernel restart may be needed before the new install is picked up — confirm.
!pip uninstall -y accelerate
!pip install accelerate==0.22.0

[0mCollecting accelerate==0.22.0
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 251.2/251.2 kB 3.2 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.22.0


In [None]:
# Register the special tokens on the tokenizer and grow the model's embedding
# matrix to the new tokenizer length.
special_tokens_dict = {'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))  # Update the model's embeddings to account for the new tokens

# These values do not change while the corpus is filtered, so look them up once
# instead of on every token of every line.
base_vocab_size = tokenizer.vocab_size
unk_id = tokenizer.unk_token_id

# Round-trip every line through the tokenizer; any token id that is not below
# `tokenizer.vocab_size` is replaced by the unknown-token id before decoding.
# Lines are collected in a list and joined once rather than concatenated with
# `+=` inside the loop.
filtered_lines = []
for line in text_data.split('\n'):
    token_ids = tokenizer.encode(line)
    kept_ids = [tok if tok < base_vocab_size else unk_id for tok in token_ids]
    filtered_lines.append(tokenizer.decode(kept_ids) + '\n')
filtered_text_data = "".join(filtered_lines)

# Save the filtered text data
with open("filtered_text_data.txt", "w", encoding='utf-8') as f:
    f.write(filtered_text_data)


In [None]:
# Tokenize the filtered corpus and cut it into fixed-length blocks of 128 tokens.
# TextDataset keeps a cache of the tokenized features on disk and reuses it on
# later runs; because filtered_text_data.txt is regenerated by this notebook,
# `overwrite_cache=True` forces a rebuild so stale features are never reused.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="filtered_text_data.txt",
    block_size=128,
    overwrite_cache=True,
)

# Causal language modelling: mlm=False disables masked-LM style masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)



In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Optimisation: one epoch, 16 samples per device per step.
    num_train_epochs=1,
    per_device_train_batch_size=16,
    # Checkpointing: write to ./output (replacing earlier contents),
    # using a save interval of 10,000 steps and a limit of two checkpoints.
    output_dir="./output",
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    # Logging directory.
    logging_dir="./logs",
)

# Tie the model, the training data, its collator and the configuration together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning, then persist the result to training_args.output_dir.
trainer.train()
trainer.save_model()
# The Trainer was created without a tokenizer, so save_model() writes only the
# model files. The tokenizer was extended with extra special tokens (and the
# embeddings resized to match), so it has to be stored next to the model for
# the saved directory to be loadable as a consistent pair.
tokenizer.save_pretrained(training_args.output_dir)

Step,Training Loss
500,2.781
1000,2.5784
1500,2.5157


KeyboardInterrupt: ignored

In [None]:
# Manual save, for when training was interrupted before the save in the
# previous cell could run.
trainer.save_model()
# Store the modified tokenizer alongside the model: the Trainer was created
# without a tokenizer, so save_model() alone does not write it.
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
# Pack the output directory recursively into output.zip.
!zip -r output.zip output

  adding: output/ (stored 0%)
  adding: output/generation_config.json (deflated 24%)
  adding: output/training_args.bin (deflated 49%)
  adding: output/pytorch_model.bin (deflated 7%)
  adding: output/config.json (deflated 51%)
