Импорты

In [1]:
import pandas as pd
import transformers
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
from transformers import AutoTokenizer
from datasets import load_dataset
import torch
import evaluate




In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Загрузка тренировочного датасета из файла, собранного в блокноте "bart.ipynb"

In [3]:
dataset_file = './data/train.csv'

dataset = load_dataset('csv', data_files=dataset_file, split='train')

dataset = dataset.train_test_split(test_size=0.1)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 15397
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 1711
    })
})

Создание токенизатора

In [5]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')

In [6]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["article"], max_length=1024, truncation=True)

    labels = tokenizer(
        text_target=examples["abstract"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
tokenized_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/15397 [00:00<?, ? examples/s]

Map:   0%|          | 0/1711 [00:00<?, ? examples/s]

Создание модели

In [8]:
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

In [9]:
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [10]:
output_dir = './exp_t5'

training_args = TrainingArguments(
    output_dir='./results_t5',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=2,
    prediction_loss_only=True,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=True,
    run_name='run_name',
    load_best_model_at_end=False,
    metric_for_best_model="loss",
    weight_decay=0.01,
    greater_is_better=False
)



In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Обучение и сохранение модели

In [12]:
trainer.train()
trainer.save_model(output_dir + '/model')

  0%|          | 0/7698 [00:00<?, ?it/s]

{'loss': 2.3126, 'grad_norm': 3.0708460807800293, 'learning_rate': 1.8700961288646405e-05, 'epoch': 0.06}
{'loss': 2.1003, 'grad_norm': 3.2630558013916016, 'learning_rate': 1.7401922577292805e-05, 'epoch': 0.13}
{'loss': 1.9854, 'grad_norm': 2.9487531185150146, 'learning_rate': 1.610288386593921e-05, 'epoch': 0.19}
{'loss': 1.9744, 'grad_norm': 5.752965450286865, 'learning_rate': 1.4803845154585607e-05, 'epoch': 0.26}
{'loss': 1.9703, 'grad_norm': 2.962507486343384, 'learning_rate': 1.350480644323201e-05, 'epoch': 0.32}
{'loss': 1.9735, 'grad_norm': 2.729011297225952, 'learning_rate': 1.220576773187841e-05, 'epoch': 0.39}
{'loss': 1.9198, 'grad_norm': 2.999840259552002, 'learning_rate': 1.0906729020524814e-05, 'epoch': 0.45}
{'loss': 1.9555, 'grad_norm': 2.32146954536438, 'learning_rate': 9.607690309171214e-06, 'epoch': 0.52}
{'loss': 1.9329, 'grad_norm': 3.156730890274048, 'learning_rate': 8.308651597817616e-06, 'epoch': 0.58}
{'loss': 1.9474, 'grad_norm': 2.8122854232788086, 'learnin

  0%|          | 0/1711 [00:00<?, ?it/s]

{'eval_loss': 1.7968981266021729, 'eval_runtime': 439.5508, 'eval_samples_per_second': 3.893, 'eval_steps_per_second': 3.893, 'epoch': 1.0}
{'train_runtime': 57734.252, 'train_samples_per_second': 0.267, 'train_steps_per_second': 0.133, 'train_loss': 1.9791567399304202, 'epoch': 1.0}


In [13]:
model.save_pretrained("./exp_t5/model")


tokenizer.save_pretrained('./exp_t5/tokenizer')

('./exp_t5/tokenizer\\tokenizer_config.json',
 './exp_t5/tokenizer\\special_tokens_map.json',
 './exp_t5/tokenizer\\tokenizer.json')

Оценка качества модели

In [14]:
rouge = evaluate.load('rouge')

Using the latest cached version of the module from C:\Users\piskarevaiv\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--rouge\b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Sat Oct 26 18:10:31 2024) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.


Загрузка данных

In [20]:
path = 'data/'

val_table = {'article': [], 'abstract': []}

table = pd.read_parquet(path + 'validation-00000-of-00001.parquet')
for i in table.sample(len(table) - 6000).values:
    if i[0] != '':
        val_table['article'].append(i[0])
        val_table['abstract'].append(i[1])

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = transformers.AutoModelForSeq2SeqLM.from_pretrained("./exp_t5/model").to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained("./exp_t5/tokenizer")

In [22]:
len(val_table['article'])

632

Разбиение текста на мелкие куски для обработки их моделью по отдельности

In [23]:
def chunk_text(text, tokenizer, max_length=512):
    tokens = tokenizer.encode(text)
    return [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

In [24]:
predictions = []

c = 0


for i in val_table['article']:
    all_chunks = []
    chunks = chunk_text(i, tokenizer)

    for chunk in chunks:
        inputs = {'input_ids': torch.tensor([chunk]).to(device)}
        outputs = model.generate(**inputs, max_length=len(val_table['abstract'][c]))
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        all_chunks.append(generated_text)

    result = " ".join(all_chunks)
    predictions.append(result)

    c += 1
    print(c)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


KeyboardInterrupt: 

Подсчет и вывод метрик

In [27]:
results = rouge.compute(predictions=predictions, references=val_table['abstract'][:61])

In [28]:
print(results)

{'rouge1': 0.2780041408837952, 'rouge2': 0.11295556072217279, 'rougeL': 0.1593453808268831, 'rougeLsum': 0.21973789008528022}
