In [1]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [2]:
!pip install accelerate -U



In [3]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from torch.nn.utils.rnn import pad_sequence

In [4]:

class ChatDataset(Dataset):
    def __init__(self, tokenizer, filepath, block_size=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        # Load the data
        df = pd.read_csv(filepath)
        for _, row in df.iterrows():
            # Encode with padding and truncation
            encoded_input = tokenizer(row['question'], max_length=block_size, padding='max_length', truncation=True)
            encoded_target = tokenizer(row['answer'], max_length=block_size, padding='max_length', truncation=True)

            self.inputs.append(encoded_input['input_ids'])
            self.targets.append(encoded_target['input_ids'])
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_ids = self.inputs[idx]
        target_ids = self.targets[idx]

        # Convert to tensor
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        target_ids = torch.tensor(target_ids, dtype=torch.long)

        return {"input_ids": input_ids, "labels": target_ids}



In [7]:
# Select a pre-trained model
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
model = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
tokenizer.pad_token = tokenizer.eos_token

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

In [17]:
# Load and preprocess the dataset
dataset = ChatDataset(tokenizer=tokenizer, filepath='dataset.csv')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [18]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Then, when initializing your Trainer, pass the data_collator:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    # eval_dataset=test_dataset if you have it
)

# Train the model
trainer.train()

Step,Training Loss
10,0.7943
20,1.0233
30,0.7712
40,0.693
50,0.9408
60,0.5386
70,1.0323
80,0.806
90,0.9807
100,0.9033


TrainOutput(global_step=1820, training_loss=0.672429284944639, metrics={'train_runtime': 1172.4927, 'train_samples_per_second': 6.192, 'train_steps_per_second': 1.552, 'total_flos': 1896980152320000.0, 'train_loss': 0.672429284944639, 'epoch': 20.0})

In [19]:
model_path = 'fine_tuned_gpt2_model'
# Save the fine-tuned model
trainer.save_model(model_path)

# You can now load this model from the /mnt/data/fine_tuned_model directory

In [28]:
from transformers import pipeline

# Load your trained model and tokenizer
# model_path = './results' # Replace with your model's directory
# model = GPT2LMHeadModel.from_pretrained(model_path)
# tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Create a text generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)

# Function to generate text based on a prompt
def generate_text(prompt, max_length=50):
    # Encode the initial prompt text
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")

    # Generate text
    outputs = generator(prompt, max_length=max_length, num_return_sequences=1)
    # Decode and print the output text
    return outputs[0]['generated_text']

# Test the model with a prompt
prompt = "Хочу записаться на"  # Replace with your prompt
generated_text = generate_text(prompt)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Хочу записаться на 👉 👉 👉 👉 👉 👉 👉 👉 👉 👍👉�
