### get data

In [1]:
# Load one training example per line, dropping the trailing newline/whitespace.
# The encoding is pinned explicitly: without it open() falls back to the
# platform locale, which can mis-decode the Cyrillic text on non-UTF-8 systems.
with open('specific.txt', encoding='utf-8') as file:
    texts = [line.rstrip() for line in file]

### load model

In [2]:
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Local checkpoint directory that holds both the tokenizer files and the weights.
model_name_or_path = "./small"

# Tokenizer and model are loaded from the same checkpoint so their vocabularies match.
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Load the weights first, then move the module to the selected device.
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
model = model.to(DEVICE)

In [5]:
# Register the markers used to frame every training example.
tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})

# Any token that was new to the vocabulary gets an id past the end of the
# existing embedding matrix, which would fail at lookup time. Grow the matrix
# when that happens; a checkpoint whose matrix is already large enough is left
# untouched (this never shrinks it).
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

In [6]:
class CustomDataset(Dataset):
    """Fixed-length token-id dataset for causal language-model fine-tuning.

    Every item is ``bos_token + prompt + text + eos_token`` encoded with the
    tokenizer, truncated to ``max_len`` ids and right-padded to ``max_len``.

    Args:
        texts: sequence of strings, one training example each.
        tokenizer: object exposing ``bos_token``, ``eos_token``,
            ``pad_token_id`` and ``encode(str)`` returning a list of ids.
        prompt: string inserted in front of every text.
        max_len: length of every returned tensor.
    """

    def __init__(self, texts, tokenizer, prompt, max_len=32):
        self.texts = texts
        self.tokenizer = tokenizer
        self.prompt = prompt
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return example ``idx`` as a ``torch.long`` tensor of shape ``(max_len,)``."""
        text = self.tokenizer.bos_token + self.prompt + self.texts[idx] + self.tokenizer.eos_token
        # Sequences longer than max_len are cut off (the trailing eos is lost then).
        tokens = self.tokenizer.encode(text)[:self.max_len]

        # Pad with the tokenizer's own pad id so downstream code that looks for
        # pad_token_id can recognise padding; a hard-coded 0 is only correct
        # when the pad id happens to be 0. Fall back to 0 if no pad id is set.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0

        out = torch.full((self.max_len,), pad_id, dtype=torch.long)
        out[:len(tokens)] = torch.tensor(tokens, dtype=torch.long)
        return out

In [7]:
# Fixed prefix placed in front of every training text and reused at generation time.
prompt = 'Вопрос про конкретные инвестиции: '

# Only the first 100 lines of the corpus are used for this fine-tuning run.
specific_dataset = CustomDataset(
    texts=texts[:100],
    tokenizer=tokenizer,
    prompt=prompt,
)

### finetuning

In [8]:
from transformers import DataCollatorForLanguageModeling

In [9]:
# mlm=False: batches are collated for causal (not masked) language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [10]:
from transformers import Trainer, TrainingArguments

# NOTE(review): with the 100-example dataset built in this notebook, a batch of
# 32 accumulated over 16 steps gives a single optimizer step per epoch (the
# training log reports "Total optimization steps = 1"), which is fewer than
# warmup_steps. Confirm this is intended; the learning rate may never leave
# the warmup phase.
training_args = TrainingArguments(
    output_dir="./finetuned",        # where checkpoints and outputs are written
    overwrite_output_dir=True,       # replace whatever is already in output_dir
    num_train_epochs=1,              # passes over the training set
    warmup_steps=10,                 # warmup steps for the learning-rate scheduler
    per_device_train_batch_size=32,  # per-device batch size for training
    per_device_eval_batch_size=32,   # per-device batch size for evaluation
    gradient_accumulation_steps=16,  # accumulate gradients for a larger "virtual" batch
)

In [11]:
# Assemble the Trainer around the already-loaded model and the fixed-length dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=specific_dataset,
    data_collator=data_collator,
    # (optimizer, lr_scheduler) pair: AdamW is supplied explicitly, the
    # scheduler slot is left as None.
    optimizers=(torch.optim.AdamW(model.parameters(), lr=1e-5), None),
)

In [12]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 16
  Total optimization steps = 1


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1, training_loss=1.6300575733184814, metrics={'train_runtime': 28.6428, 'train_samples_per_second': 3.491, 'train_steps_per_second': 0.035, 'total_flos': 1633075200000.0, 'train_loss': 1.6300575733184814, 'epoch': 1.0})

### generate

In [13]:
import re

def clean_text(text):
    """Reduce raw generated text to a single cleaned sentence fragment.

    Characters other than Cyrillic/Latin letters, digits, ``,.?!%`` and spaces
    are replaced by spaces, runs of spaces are collapsed, and the text is split
    on ``.``, ``!`` and ``?``. Only the first or the second piece is kept.

    Args:
        text: raw string to clean.

    Returns:
        str: the selected piece with surrounding whitespace stripped. Text
        without any sentence terminator is returned whole (cleaned).
    """
    text = re.sub('[^А-ЯЁа-яёA-Za-z0-9,.?!% ]', ' ', text)
    text = re.sub(r" +", " ", text)
    parts = re.split(r'[.!?]', text)

    # re.split always yields at least one element, so parts[0] is safe even
    # when the text contains no terminator at all.
    chosen = parts[0]
    # dirty hack for numbered list: in "1. Question ..." the first piece is just
    # the number, so prefer the second piece when it is the longer one.
    if len(parts) > 1 and len(parts[0]) < len(parts[1]):
        chosen = parts[1]
    return chosen.strip()

In [14]:
def sent_gen(model, tokenizer, prompt, device):
    """Sample one continuation of ``prompt`` and return it cleaned up.

    Args:
        model: causal language model exposing ``eval()`` and ``generate(...)``.
        tokenizer: tokenizer matching ``model``; its eos and pad token ids
            are passed to ``generate``.
        prompt: text the model should continue.
        device: torch device the input ids are moved to.

    Returns:
        str: the generated continuation after ``clean_text``.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    model.eval()  # note: the model is left in eval mode afterwards
    with torch.no_grad():
        out = model.generate(input_ids,
                             do_sample=True,
                             num_beams=2,
                             temperature=1.5,
                             top_p=0.9,
                             max_length=64,
                             no_repeat_ngram_size=2,
                             num_return_sequences=1,
                             early_stopping=True,
                             pad_token_id=tokenizer.eos_token_id,
                             repetition_penalty=10.,
                             bad_words_ids=[[tokenizer.pad_token_id]]
                             ).cpu()
    # The returned sequence starts with the prompt ids, so drop them by token
    # count instead of slicing the decoded string by character count.
    generated_ids = out[0][input_ids.shape[-1]:]
    # Skip special tokens while decoding: clean_text would otherwise strip the
    # angle brackets of "</s>" / "<pad>" and leave stray "s" / "pad" words.
    text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return clean_text(text)

In [16]:
# Generate one sample from the fine-tuned model using the training prompt.
sent_gen(model, tokenizer, prompt, DEVICE)

'Существуют ли какие то правила, при которых инвестор может не вкладываться в строительство'