In [1]:
import os
import re

import pandas as pd
from tqdm import trange

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline

## Загрузим pre-trained GPT2 и GPT2Tokenizer с Hugging Face, а также воспользуемся их пайплайном

In [2]:
# Pre-trained GPT-2 weights with the language-modelling head, loaded by checkpoint name.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Matching tokenizer from the same 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
# CSV splits of the corpus (presumably CNN/DailyMail, judging by the directory name).
train_path = r'/root/storage/3030/AkhmetzyanovD/projects/mai/cnn_dailymail/train.csv'
test_path = r'/root/storage/3030/AkhmetzyanovD/projects/mai/cnn_dailymail/test.csv'

# Plain-text dumps live next to the CSVs. splitext() strips only the final
# extension, so a dot elsewhere in the path cannot truncate the file name.
train_path_txt = os.path.splitext(train_path)[0] + '.txt'
test_path_txt = os.path.splitext(test_path)[0] + '.txt'

# Only the 'article' column is needed for language-model fine-tuning.
train_data = pd.read_csv(train_path, usecols=['article'])
test_data = pd.read_csv(test_path, usecols=['article'])

# Keep the first 1/1000 of each split. Each split is sized from its OWN length:
# sizing the test subset from the already-truncated train frame yields zero rows.
train_data = train_data.head(len(train_data) // 1000)
test_data = test_data.head(len(test_data) // 1000)

In [4]:
# Prompt handed to the text-generation pipeline (note that the string ends with a space).
prompt = 'This will create a '
# Used as the Trainer output directory and later as the model location for the pipeline.
model_path = r'/AkhmetzyanovD/projects/mai/results/gpt2'

In [5]:
def build_text_files(data, dest_path):
    """Flatten a column of articles into one plain-text file.

    Each value in the first column of ``data`` is stripped, every whitespace
    character in it is replaced by a space, and the results are written to
    ``dest_path`` back to back, each followed by two spaces.

    Args:
        data: DataFrame whose first column holds the article strings.
        dest_path: Path of the text file to write; existing content is replaced.
    """
    pieces = []
    for i in trange(len(data)):
        article = data.iloc[i, 0].strip()
        # r"\s" substitutes character by character, so runs of whitespace keep their length.
        pieces.append(re.sub(r"\s", " ", article) + "  ")

    # The context manager guarantees the handle is flushed and closed. UTF-8 is
    # spelled out because the default text encoding is platform-dependent.
    with open(dest_path, 'w', encoding='utf-8') as f:
        # A single join avoids repeatedly re-copying an ever-growing string.
        f.write(''.join(pieces))
 
# Dump BOTH splits: the dataset-loading cell reads train_path_txt and test_path_txt,
# so the test file has to be (re)generated here as well instead of relying on a
# leftover file from an earlier session.
build_text_files(train_data, train_path_txt)
build_text_files(test_data, test_path_txt)

100%|██████████| 287/287 [00:00<00:00, 2398.35it/s]


In [6]:
def load_dataset(train_path, test_path, tokenizer, block_size=128, overwrite_cache=False):
    """Build the train/test language-modelling datasets and their collator.

    Args:
        train_path: Plain-text file with the training corpus.
        test_path: Plain-text file with the evaluation corpus.
        tokenizer: Tokenizer passed to both datasets and to the collator.
        block_size: Block size handed to ``TextDataset`` for both splits.
            Defaults to 128, the value that used to be hard-coded.
        overwrite_cache: Forwarded unchanged to ``TextDataset``.
            NOTE(review): TextDataset appears to cache tokenised features on
            disk, so pass True after regenerating the text files; confirm
            against the Transformers docs.

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``. The collator
        is created with ``mlm=False``.
    """
    def _make_split(file_path):
        # Both splits share every setting except the file they read.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
            overwrite_cache=overwrite_cache,
        )

    train_dataset = _make_split(train_path)
    test_dataset = _make_split(test_path)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator
 
# Build both datasets and the collator from the plain-text dumps.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path_txt,
    test_path=test_path_txt,
    tokenizer=tokenizer,
)



In [7]:
# Hyper-parameters for the fine-tuning run; checkpoints are written under model_path.
# NOTE(review): no evaluation strategy is set here, so eval_steps may have no
# effect; confirm against the TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=15,
    per_device_train_batch_size=28,
    per_device_eval_batch_size=28,
    save_steps=500,
    eval_steps=1000,
)

# Wire the model, datasets and collator into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
# Run the fine-tuning loop configured in the Trainer.
trainer.train()
# No path is passed; presumably the model is written to training_args.output_dir
# (model_path), which is where the generation cell loads it from. TODO confirm.
trainer.save_model()

ClearML Task: created new task id=8a17655e03a849c6b32e95e9afda1fc0
2024-06-03 14:46:37,024 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: http://192.168.143.19:8080/projects/4489be9d2a4b4c68bb78aa62a8c43566/experiments/8a17655e03a849c6b32e95e9afda1fc0/output/log




Step,Training Loss
500,3.0946




ClearML results page: http://192.168.143.19:8080/projects/4489be9d2a4b4c68bb78aa62a8c43566/experiments/8a17655e03a849c6b32e95e9afda1fc0/output/log
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start




In [23]:
# Load the model from model_path; the tokenizer comes from the stock 'gpt2' checkpoint.
pipe = pipeline(
    task='text-generation',
    model=model_path,
    tokenizer='gpt2',
    max_new_tokens=20,
)

# The pipeline output is indexed as a list of dicts; keep the first candidate's text.
result = pipe(prompt)[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [24]:
# Bare last expression: the notebook displays the generated text.
result

"This will create a  massive pressure build on our economy,' Mr Bannatyne said. 'There will be massive increases"

: 