# Ноутбук для обучения модели энкодера на первоначальных синтетических данных и замера качества результатов

In [None]:
!git clone https://github.com/pavel-blinov/RuMedBench.git

In [None]:
!pip install seqeval
!pip install datasets
!pip install transformers -U
!pip install accelerate -U
!pip install evaluate

In [None]:
import pandas as pd

# Read the three RuMedTop3 splits (JSON Lines: one record per line) into
# DataFrames. The unpacking order matches the order of the split names.
rumedtop3_dir = '/kaggle/working/RuMedBench/data/RuMedTop3'

train, test, dev = (
    pd.read_json(f'{rumedtop3_dir}/{split}_v1.jsonl', lines=True)
    for split in ('train', 'test', 'dev')
)

In [None]:
# Load the synthetic (augmented) examples and append them to the training split.
# This cell was not executed for the baseline runs on the original data only.
synthetic_columns = ['idx', 'symptoms', 'code']
train_synth = pd.read_excel('/kaggle/input/gen-5-5-final/train_augmented_3')[synthetic_columns]
train = pd.concat((train, train_synth), ignore_index=True)

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Convert the pandas splits into a DatasetDict for the Trainer.

to_rename = {'symptoms': 'text', 'code': 'label'}

# Class ids follow the order in which codes first appear in the training split.
labels = train['code'].unique().tolist()
n_labels = len(labels)

id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

# Drop the record index and encode the codes as integer class ids, in place,
# because later cells keep using `test` with the encoded `code` column.
for split_name, frame in (('train', train), ('test', test), ('dev', dev)):
    frame.drop(columns='idx', inplace=True)
    # `map` yields NaN for codes missing from `label2id`; `replace` would have
    # silently left such codes as strings and produced a mixed-type column.
    encoded = frame['code'].map(label2id)
    missing = encoded.isna()
    if missing.any():
        unseen = sorted(frame.loc[missing, 'code'].astype(str).unique())
        raise ValueError(
            f"{split_name} split contains codes absent from the training "
            f"split: {unseen}"
        )
    frame['code'] = encoded.astype('int64')

data = DatasetDict({
    'train': Dataset.from_pandas(train.rename(columns=to_rename)),
    'dev': Dataset.from_pandas(dev.rename(columns=to_rename)),
    'test': Dataset.from_pandas(test.rename(columns=to_rename)),
})
data

In [None]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the classification model.
model_checkpoint = "alexyalunin/RuBioRoBERTa"

# NOTE(review): truncation / padding / max_length are normally arguments of the
# tokenizer *call*; confirm whether passing them to `from_pretrained` has any
# effect here (`model_max_length` may be the intended option).
tokenizer_kwargs = {'truncation': True, 'padding': True, 'max_length': 512}
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, **tokenizer_kwargs)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch, truncating to 512 tokens.

    Truncation is requested explicitly instead of relying on the tokenizer's
    fallback behaviour when only ``max_length`` is supplied.

    Args:
        examples: a batch from ``Dataset.map(..., batched=True)``; only its
            ``"text"`` entry is read.

    Returns:
        Whatever the tokenizer returns for the batch (unpadded encodings).
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)


# Tokenize every split of the DatasetDict in batches.
tokenized_data = data.map(preprocess_function, batched=True)

In [None]:
import numpy as np
import evaluate

# Accuracy metric object used by `compute_metrics`.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids.
    """
    scores, references = eval_pred
    predicted_ids = np.asarray(scores).argmax(axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a classification head sized to the label set and
# carrying both directions of the label <-> id mapping.
classifier_options = {
    'num_labels': n_labels,
    'id2label': id2label,
    'label2id': label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, **classifier_options
)

In [None]:
import os
# Set WANDB_DISABLED for this process (presumably read by the Weights & Biases
# integration so that training does not try to log there — verify).
os.environ.update({"WANDB_DISABLED": "True"})

In [None]:
# Training setup: build the arguments, create the Trainer and run fine-tuning.

# Optimisation hyper-parameters.
optimisation_args = dict(
    learning_rate=2e-6,
    num_train_epochs=4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# Evaluate on the dev split every 100 steps; checkpoints are saved on a step
# schedule, at most one is kept, and the best model is reloaded at the end.
# NOTE(review): recent transformers releases renamed `evaluation_strategy`
# (to `eval_strategy`) and the Trainer `tokenizer` argument (to
# `processing_class`) — confirm against the installed version.
checkpointing_args = dict(
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_total_limit=1,
    load_best_model_at_end=True,
)

training_args = TrainingArguments(
    output_dir="my_model_augmented",
    **optimisation_args,
    **checkpointing_args,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["dev"],
)

trainer.train()

In [None]:
import torch
# Use the GPU for inference when one is available, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# model = AutoModelForSequenceClassification.from_pretrained(
#     '/content/my_model/checkpoint-8252', num_labels=n_labels,
#     id2label=id2label, label2id=label2id
# ).to(device)

In [None]:
def predict_top_3(text):
    """Return the ids of the three highest-scoring classes for one text.

    The text is truncated to 512 tokens by the tokenizer itself, so any
    special tokens it appends are kept instead of being sliced off. The model
    is put in eval mode and run without gradient tracking, so dropout is off
    and no autograd graph is built during inference.

    Args:
        text: a single input string.

    Returns:
        A list of class ids ordered from the highest logit downwards (fewer
        than three if the model has fewer than three classes).
    """
    encoded = tokenizer(text, truncation=True, max_length=512,
                        return_tensors='pt')
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'])

    # Single-item batch: rank the classes of row 0 by descending logit.
    ranked = outputs.logits[0].argsort(descending=True)
    return ranked[:3].tolist()

In [None]:
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects, then predict the top-3
# class ids for every test record with a progress bar.
tqdm.pandas()
symptom_texts = test['symptoms']
top3_predictions = symptom_texts.progress_apply(predict_top_3)
test['prediction'] = top3_predictions

In [None]:
# Write the test split with its predictions as JSON Lines (one record per line).
predictions_path = 'test.jsonl'
test.to_json(predictions_path, lines=True, orient='records')

In [None]:
# -*- coding: utf-8 -*-

import os
import json
import argparse
import numpy as np
from sklearn.metrics import accuracy_score
from seqeval.metrics import f1_score
from seqeval.metrics import accuracy_score as seq_accuracy_score

def hit_at_3(y_true, y_pred):
    """Return the fraction of samples whose true label is among its candidates.

    Args:
        y_true: sequence of ground-truth labels.
        y_pred: sequence of candidate collections, aligned with ``y_true``;
            each element must support the ``in`` operator.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there are no samples.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` differ in length.
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(y_true) != len(y_pred):
        raise ValueError(
            'y_true and y_pred must have the same length, got '
            f'{len(y_true)} and {len(y_pred)}'
        )
    if len(y_true) == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    hit_count = sum(label in candidates
                    for label, candidates in zip(y_true, y_pred))
    return hit_count / float(len(y_true))

# Score the saved predictions: top-1 accuracy and hit@3, both in percent.
fname = 'test.jsonl'

metrics = {}
label_id = 'code'

# One JSON record per line; iterate the file lazily and skip blank lines so a
# trailing empty line cannot break `json.loads`.
with open(fname, encoding='utf-8') as f:
    result = [json.loads(line) for line in f if line.strip()]

gt = [d[label_id] for d in result]
top1 = [d['prediction'][0] for d in result]  # best-scoring class per record
top3 = [set(d['prediction']) for d in result]  # all predicted classes per record
acc = accuracy_score(gt, top1) * 100
hit = hit_at_3(gt, top3) * 100
metrics['acc'] = acc
metrics['hit3'] = hit

top3_acc, top3_hit = acc, hit

result_line = '| {:.2f} / {:.2f} |'.format(top3_acc, top3_hit)
print('| RuMedTop3\t  |')
print(result_line)