Импорты

In [1]:
import pandas as pd
import transformers
from transformers import AutoTokenizer
from datasets import load_dataset
import torch

Загрузка тренировочного датасета

In [2]:
path = 'data/'

train_table = {'article': [], 'abstract': []}

for j in range(5):
    table = pd.read_parquet(path + f'train-0000{j}-of-00005.parquet')
    for i in table.values:
        if i[0] != '':
            train_table['article'].append(i[0])
            train_table['abstract'].append(i[1])


train = pd.DataFrame(train_table)
train = train.sample(len(train) - 100000)

In [3]:
train.to_csv('./data/train.csv', index=False)

In [4]:
dataset_file = './data/train.csv'

dataset = load_dataset('csv', data_files=dataset_file, split='train')

dataset = dataset.train_test_split(test_size=0.1)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'article', 'abstract'],
        num_rows: 15397
    })
    test: Dataset({
        features: ['Unnamed: 0', 'article', 'abstract'],
        num_rows: 1711
    })
})

Создание токенизатора

In [6]:
tokenizer = AutoTokenizer.from_pretrained("ainize/bart-base-cnn")

In [7]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["article"], max_length=1024, truncation=True)

    labels = tokenizer(
        text_target=examples["abstract"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
tokenized_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/15397 [00:00<?, ? examples/s]

Map:   0%|          | 0/1711 [00:00<?, ? examples/s]

Создание модели

In [9]:
model = transformers.AutoModelForSeq2SeqLM.from_pretrained("ainize/bart-base-cnn")

In [10]:
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
training_args = transformers.Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
)






In [12]:
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Обучение модели

In [13]:
trainer.train()

  0%|          | 0/7700 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 2.7504, 'grad_norm': 5.065532207489014, 'learning_rate': 1.8701298701298704e-05, 'epoch': 0.13}




{'loss': 2.5273, 'grad_norm': 4.990349769592285, 'learning_rate': 1.7402597402597403e-05, 'epoch': 0.26}
{'loss': 2.4318, 'grad_norm': 4.702807426452637, 'learning_rate': 1.6103896103896105e-05, 'epoch': 0.39}
{'loss': 2.4246, 'grad_norm': 4.0046234130859375, 'learning_rate': 1.4805194805194807e-05, 'epoch': 0.52}
{'loss': 2.3694, 'grad_norm': 4.181368827819824, 'learning_rate': 1.3506493506493508e-05, 'epoch': 0.65}
{'loss': 2.3377, 'grad_norm': 4.4541120529174805, 'learning_rate': 1.2207792207792208e-05, 'epoch': 0.78}
{'loss': 2.3383, 'grad_norm': 8.152591705322266, 'learning_rate': 1.0909090909090909e-05, 'epoch': 0.91}


  0%|          | 0/428 [00:00<?, ?it/s]

{'eval_loss': 2.183960437774658, 'eval_runtime': 208.8489, 'eval_samples_per_second': 8.193, 'eval_steps_per_second': 2.049, 'epoch': 1.0}
{'loss': 2.3282, 'grad_norm': 4.541385173797607, 'learning_rate': 9.610389610389611e-06, 'epoch': 1.04}
{'loss': 2.2043, 'grad_norm': 5.056827068328857, 'learning_rate': 8.311688311688313e-06, 'epoch': 1.17}
{'loss': 2.1978, 'grad_norm': 4.496827602386475, 'learning_rate': 7.012987012987014e-06, 'epoch': 1.3}
{'loss': 2.1957, 'grad_norm': 4.286740303039551, 'learning_rate': 5.7142857142857145e-06, 'epoch': 1.43}
{'loss': 2.2076, 'grad_norm': 4.784675121307373, 'learning_rate': 4.415584415584416e-06, 'epoch': 1.56}
{'loss': 2.2054, 'grad_norm': 6.215847015380859, 'learning_rate': 3.116883116883117e-06, 'epoch': 1.69}
{'loss': 2.1672, 'grad_norm': 5.3382768630981445, 'learning_rate': 1.8181818181818183e-06, 'epoch': 1.82}
{'loss': 2.1901, 'grad_norm': 4.563017845153809, 'learning_rate': 5.194805194805196e-07, 'epoch': 1.95}


  0%|          | 0/428 [00:00<?, ?it/s]

{'eval_loss': 2.1529319286346436, 'eval_runtime': 180.9242, 'eval_samples_per_second': 9.457, 'eval_steps_per_second': 2.366, 'epoch': 2.0}
{'train_runtime': 35237.8051, 'train_samples_per_second': 0.874, 'train_steps_per_second': 0.219, 'train_loss': 2.3215279041637076, 'epoch': 2.0}


TrainOutput(global_step=7700, training_loss=2.3215279041637076, metrics={'train_runtime': 35237.8051, 'train_samples_per_second': 0.874, 'train_steps_per_second': 0.219, 'total_flos': 1.877622449504256e+16, 'train_loss': 2.3215279041637076, 'epoch': 2.0})

In [7]:
model.save_pretrained("./exp_bart/model")

tokenizer.save_pretrained("./exp_bart/tokenizer")

NameError: name 'model' is not defined

In [2]:
%pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting nltk (from rouge_score)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 6.6 MB/s eta 0:00:00
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24972 sha256=b41e1f2d6a66366d00b68692d5b62546b4b8483cecd1259079033988492e660a
  Stored in directory: c:\users\piskarevaiv\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a

Оценка качества модели на тестовых данных

In [2]:
import evaluate

rouge = evaluate.load('rouge')

In [3]:
path = 'data/'

val_table = {'article': [], 'abstract': []}

table = pd.read_parquet(path + 'validation-00000-of-00001.parquet')
for i in table.sample(len(table) - 5500).values:
    if i[0] != '':
        val_table['article'].append(i[0])
        val_table['abstract'].append(i[1])

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = transformers.AutoModelForSeq2SeqLM.from_pretrained("./exp_bart/model").to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained("./exp_bart/tokenizer")

In [5]:
len(val_table['article'])

1132

In [6]:
def chunk_text(text, tokenizer, max_length=512):
    tokens = tokenizer.encode(text)
    return [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

In [7]:
predictions = []

c = 0


for i in val_table['article']:
    all_chunks = []
    chunks = chunk_text(i, tokenizer)

    for chunk in chunks:
        inputs = {'input_ids': torch.tensor([chunk]).to(device)}
        outputs = model.generate(**inputs, max_length=len(val_table['abstract'][c]))
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        all_chunks.append(generated_text)

    result = " ".join(all_chunks)
    predictions.append(result)

    c += 1
    print(c)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


1
2
3
4
5
6
7
8


KeyboardInterrupt: 

In [20]:
results = rouge.compute(predictions=predictions, references=val_table['abstract'])

In [21]:
print(results)

{'rouge1': 0.32069306260711017, 'rouge2': 0.14221256800742654, 'rougeL': 0.1804596682478011, 'rougeLsum': 0.2874759406992602}


In [8]:
import random

i = random.randint(0, 1132)

print(val_table['abstract'][i])
print()
print(predictions[i])

study design :  systematic review.clinical question :  do the rates and timing of adjacent segment disease ( asd ) differ between cervical total disc arthroplasty ( c - adr ) and anterior cervical discectomy and fusion ( acdf ) in patients treated for cervical degenerative disc disease?methods :  a systematic search of medline / pubmed and bibliographies of key articles was done to identify studies with long - term follow - up for symptomatic and/or radiographic asd comparing c - adr with fusion for degenerative disc disease of the cervical spine . 
 the focus was on studies with longer follow - up ( 4860 months ) of primary us food and drug administration trials of prestige st , prodisc - c , and bryan devices as available . 
 trials of other discs with a minimum of 24 months follow - up were considered for inclusion . 
 studies evaluating lordosis / angle changes at adjacent segments and case series were excluded.results :  from 14 citations identified , four reports from three rando

IndexError: list index out of range

In [9]:
path = 'data/'

val_table = {'article': [], 'abstract': []}

table = pd.read_parquet(path + 'train-00000-of-00005.parquet')
for i in table.sample(len(table) - 23500).values:
    if i[0] != '':
        val_table['article'].append(i[0])
        val_table['abstract'].append(i[1])

In [10]:
len(val_table['abstract'])

474

In [11]:
predictions = []

c = 0


for i in val_table['article']:
    all_chunks = []
    chunks = chunk_text(i, tokenizer)

    for chunk in chunks:
        inputs = {'input_ids': torch.tensor([chunk]).to(device)}
        outputs = model.generate(**inputs, max_length=len(val_table['abstract'][c]))
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        all_chunks.append(generated_text)

    result = " ".join(all_chunks)
    predictions.append(result)

    c += 1
    print(c)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [12]:
results = rouge.compute(predictions=predictions, references=val_table['abstract'])


In [13]:
print(results)


{'rouge1': 0.3232253895679683, 'rouge2': 0.14367403467879308, 'rougeL': 0.18194230240794224, 'rougeLsum': 0.2881419929752157}


: 