In [None]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import json

In [None]:
def _ok(s: str, max_len: int) -> bool:
    return bool(s) and len(s) <= max_len

In [None]:
def _extract_translation(ex):
    tr = ex.get("translation (translation)")
    if isinstance(tr, str):
        try:
            tr = json.loads(tr)
        except Exception:
            tr = {"en": ex.get("en", ""), "ru": ex.get("ru", "")}
    en = tr.get("en", "")
    ru = tr.get("ru", "")
    return {"src": en, "tgt": ru}

In [None]:
def load_wmt(max_len: int = 1000, test_size: float = 0.20, seed: int = 42):
    raw = load_dataset("yezhengli9/wmt20-en-ru")
    base = raw["train"].map(_extract_translation, remove_columns=raw["train"].column_names)
    base = base.filter(lambda ex: _ok(ex["src"], max_len) and _ok(ex["tgt"], max_len))

    tmp = base.shuffle(seed=seed).train_test_split(test_size=test_size, seed=seed)
    ds = DatasetDict({
        "train": tmp["train"],
        "test": tmp["test"],
    })
    return ds

In [None]:
ds = load_wmt()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/413 [00:00<?, ?B/s]

data/train-00000-of-00001-bdeca064a0bfda(…):   0%|          | 0.00/694k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2002 [00:00<?, ? examples/s]

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 1601
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 401
    })
})

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.6.0


In [None]:
import evaluate
chrf = evaluate.load('chrf')
src = [
    "No evidence of acute myocardial infarction.",
    "The patient was prescribed 5 mg of warfarin daily.",
    "CT scan revealed a 2 cm lesion in the left lobe of the liver."
]
ref = [
    "Признаков острого инфаркта миокарда не выявлено.",
    "Пациенту назначено 5 мг варфарина ежедневно.",
    "КТ выявила очаг размером 2 см в левой доле печени."
]
hyp = [
    "Нет признаков острого инфаркта миокарда.",
    "Пациенту назначено 5 мг варфарина каждый день.",
    "КТ показала очаг 2 мм в левой доле печени."
]

for i in range(len(src)):
    print(f"SRC: {src[i]}")
    print(f"REF: {ref[i]}")
    print(f"HYP: {hyp[i]}")
    print(f"chrF: {chrf.compute(predictions=[hyp[i]], references=[ref[i]])}")

SRC: No evidence of acute myocardial infarction.
REF: Признаков острого инфаркта миокарда не выявлено.
HYP: Нет признаков острого инфаркта миокарда.
chrF: {'score': 73.64024156814074, 'char_order': 6, 'word_order': 0, 'beta': 2}
SRC: The patient was prescribed 5 mg of warfarin daily.
REF: Пациенту назначено 5 мг варфарина ежедневно.
HYP: Пациенту назначено 5 мг варфарина каждый день.
chrF: {'score': 74.27254251887318, 'char_order': 6, 'word_order': 0, 'beta': 2}
SRC: CT scan revealed a 2 cm lesion in the left lobe of the liver.
REF: КТ выявила очаг размером 2 см в левой доле печени.
HYP: КТ показала очаг 2 мм в левой доле печени.
chrF: {'score': 54.70692668956404, 'char_order': 6, 'word_order': 0, 'beta': 2}


# Task 2

In [None]:
def compute_bleu_chrf(hyp: list[str], ref: list[str]):
    bleu = evaluate.load('bleu')
    chrf = evaluate.load('chrf')
    bleu = bleu.compute(predictions=hyp, references=ref)
    chrf = chrf.compute(predictions=hyp, references=ref)
    return {"bleu": bleu['bleu'], "chrf": chrf['score']}

# Task3

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [None]:
tok = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
mdl = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to("cuda")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
@torch.no_grad()
def translate_batch(texts: list[str], max_new_tokens: int = 128) -> list[str]:
    enc = tok(texts, return_tensors="pt", padding=True, truncation=True)
    enc = {k: v.to(mdl.device) for k, v in enc.items()}
    out = mdl.generate(**enc, max_new_tokens=max_new_tokens)
    return tok.batch_decode(out, skip_special_tokens=True)

In [None]:
N = 100 # можно изменять в зависимости от доступных ресурсов
val_src = [r["src"] for r in ds["test"]][:N]
val_ref = [r["tgt"] for r in ds["test"]][:N]

In [None]:
hyp = translate_batch(val_src, max_new_tokens=128)

In [None]:
val_src[:5]

['Mr Agbadua argued that the allegations against Mr Sowore attracted capital punishment and that bail in such circumstances where highly restricted.\n',
 "Kickstarting tourism is one of the centrepieces of Crown Prince Mohammed bin Salman's Vision 2030 reform programme to prepare the biggest Arab economy for a post-oil era.\n",
 "Regulatory filings showed that Scharf's total compensation for 2018 at BNY was $9.4 million.\n",
 'Labor leaders and politicians hammered the company for what appeared to be a hardball bargaining tactic two days into the strike.\n',
 'Jacques Chirac was known to have been suffering from ill health for a long time.\n']

In [None]:
hyp[:5]

['Г-н Агбадуа утверждал, что обвинения в адрес г-на Совора влекут за собой смертную казнь и что залог в таких обстоятельствах является весьма ограниченным.',
 'Одним из центральных элементов программы реформ &lt; &lt; Видение 2030 года &gt; &gt; наследного принца Мохаммеда бин Салмана, направленной на подготовку крупнейшей арабской экономики к постнефтяной эпохе, является &lt; &lt; Кикстарт &gt; &gt; .',
 'Согласно нормативным документам, общая сумма компенсации, выплаченной компании &quot; Шарф &quot; за 2018 год, составила 9,4 млн. долл. США.',
 'Лидеры и политики в сфере труда накачали компанию за то, что казалось тактикой жесткого торга за два дня до забастовки.',
 'Как известно, Жак Ширак уже давно страдает от болезней.']

In [None]:
metrics = compute_bleu_chrf(hyp, val_ref)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [None]:
print({k: round(v, 2) for k, v in metrics.items()})

{'bleu': 0.2, 'chrf': 49.47}


In [None]:
print(val_src[0])
print(val_ref[0])
print(hyp[0])

Mr Agbadua argued that the allegations against Mr Sowore attracted capital punishment and that bail in such circumstances where highly restricted.

Агбадуа утверждал, что обвинения против Соворе предполагают смертную казнь, и освобождение под залог в этих условиях должно быть крайне ограничено.

Г-н Агбадуа утверждал, что обвинения в адрес г-на Совора влекут за собой смертную казнь и что залог в таких обстоятельствах является весьма ограниченным.


# 4. Task 4 Дообучение T5

In [None]:
from transformers import DataCollatorForSeq2Seq, \
                            Seq2SeqTrainingArguments, \
                            Seq2SeqTrainer, \
                            AutoTokenizer, \
                            AutoModelForSeq2SeqLM

In [None]:
def make_tokenize_fn(tok, max_src=256, max_tgt=256):
    def _fn(batch):
        src = list(batch["src"])
        tgt = batch["tgt"]
        model_inputs = tok(src, max_length=max_src, truncation=True)
        with tok.as_target_tokenizer():
            labels = tok(tgt, max_length=max_tgt, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
    return _fn

In [None]:
tok = AutoTokenizer.from_pretrained("google/mt5-small")
mdl = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
tok_fn = make_tokenize_fn(tok)

In [None]:
train_ds  = ds["train"].map(tok_fn, batched=True, remove_columns=ds["train"].column_names)
val_ds  = ds["test"].map(tok_fn, batched=True, remove_columns=ds["test"].column_names)

Map:   0%|          | 0/1601 [00:00<?, ? examples/s]



Map:   0%|          | 0/401 [00:00<?, ? examples/s]

In [None]:
collate = DataCollatorForSeq2Seq(tokenizer=tok, model=mdl)

In [None]:
args = Seq2SeqTrainingArguments(
    'results',
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    lr_scheduler_type="cosine",
    eval_strategy="epoch",
    logging_strategy='steps',
    logging_steps=1,
    predict_with_generate=True,
    generation_max_length=128,
    report_to="none",
    bf16=True
    )

In [None]:
trainer = Seq2SeqTrainer(model=mdl,
                         args=args,
                         train_dataset=train_ds,
                         eval_dataset=val_ds,
                         tokenizer=tok,
                         data_collator=collate)

  trainer = Seq2SeqTrainer(model=mdl,


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.4529,2.940291


KeyboardInterrupt: 

# 5. Валидация решения

In [None]:
res = trainer.predict(val_ds)

In [None]:
preds = res.predictions
preds[preds < 0] = 0 # trainer в качестве пэддинга ставит -100

In [None]:
hyp = [text.replace('<extra_id_0>', '').strip()
                  for text in tok.batch_decode(preds, skip_special_tokens=True)]
val_ref = [r["tgt"] for r in ds["test"]]

In [None]:
print(compute_bleu_chrf(hyp, val_ref))

# LEsson 3  Ner

In [None]:
from pydantic import BaseModel
from typing import List

class Entity(BaseModel):
    text: str
    type: str

class NEROutput(BaseModel):
    entities: List[Entity]

In [None]:
parsed = NEROutput(**{
    "entities": [
        {"text": "Apple", "type": "ORG"},
        {"text": "Goldman Sachs", "type": "ORG"},
        {"text": "2023", "type": "DATE"}
    ]
})
print(parsed)

entities=[Entity(text='Apple', type='ORG'), Entity(text='Goldman Sachs', type='ORG'), Entity(text='2023', type='DATE')]


# Task 3-1

In [None]:
from pydantic import BaseModel
from enum import Enum
from typing import List, Optional
import json

In [None]:
class SeverityEnum(str, Enum):
    moderate = "moderate"
    mild = "mild"

class Finding(BaseModel):
    region: str
    observation: str
    severity: SeverityEnum

class ChestXrayReport(BaseModel):
    study_id: str
    findings: List[Finding]
    conclusion: str
    recommendations: List[str]

In [None]:
example = """
{
  "study_id": "XR12345",
  "findings": [
    {
      "region": "правое лёгкое",
      "observation": "инфильтрат",
      "severity": "moderate"
    },
    {
      "region": "левое лёгкое",
      "observation": "плевральный выпот",
      "severity": "mild"
    }
  ],
  "conclusion": "Данные за двусторонние воспалительные изменения, больше справа.",
  "recommendations": [
    "КТ грудной клетки для уточнения характера изменений",
    "Консультация пульмонолога"
  ]
}
"""



In [None]:
json.loads(example)

{'study_id': 'XR12345',
 'findings': [{'region': 'правое лёгкое',
   'observation': 'инфильтрат',
   'severity': 'moderate'},
  {'region': 'левое лёгкое',
   'observation': 'плевральный выпот',
   'severity': 'mild'}],
 'conclusion': 'Данные за двусторонние воспалительные изменения, больше справа.',
 'recommendations': ['КТ грудной клетки для уточнения характера изменений',
  'Консультация пульмонолога']}

In [None]:
a = ChestXrayReport.model_validate_json(example)

In [None]:
a.findings[0].observation

'инфильтрат'

# Task 2

In [None]:
from openai import OpenAI

In [None]:
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key='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'
)

In [None]:
protocol = """
  Протокол рентгенологического исследования органов грудной клетки
  Исследование: рентгенография органов грудной клетки, прямая и боковая проекции
  Дата исследования: 12.09.2025
  Идентификатор исследования: XR-2025-0912-001
  Описание:
  В правом лёгком, в нижней доле, определяется участок инфильтрации средней интенсивности размером до 4 см.
  В левом лёгком, в проекции нижней доли, отмечается небольшое количество плевральной жидкости.
  Сердце и корни лёгких без особенностей.
  Трахея расположена по средней линии.
  Заключение:
  Рентгенологическая картина соответствует очагово-инфильтративным изменениям в нижней доле правого лёгкого; слева — признаки минимального плеврального выпота.
  Рекомендации:
  Проведение компьютерной томографии грудной клетки для уточнения характера инфильтрации.
  Консультация пульмонолога.
  Контрольное исследование через 10–14 дней.
""" # вставьте пример протокола из условия

In [None]:
prompt_no_format_description = f"""Сформируй структурированное заключение по следующему протоколу.

{protocol}
""" # подготовьте первый вариант промпта, без описания ожидаемого формата ответа

In [None]:
prompt_format_description =  f"""Сформируй структурированное заключение по следующему протоколу.
Каждое заключение должно включать:

- идентификатор исследования (study_id);
- список находок (findings), где каждая находка описывается полями: region (например, «правое лёгкое»), observation (например, «инфильтрат»), severity (категория: mild / moderate / severe);
- заключение врача (conclusion), строка с кратким итогом;
- необязательные рекомендации (recommendation), список текстовых элементов.

Результат должен быть в формате json.
""" # подготовьте второй вариант промпта, с описанием ожидаемого формата ответа

In [None]:
for i, prompt in enumerate([prompt_no_format_description, prompt_format_description]):
    print(f' i = {i}')
    completion = client.chat.completions.parse(
      extra_body={},
      model="nvidia/nemotron-nano-9b-v2:free",  # на момент написания задания эта модель доступна бесплатно, может потребоваться найти аналог
      messages=[
        {
          "role": "user",
          "content": prompt,
        }
      ]
    )
    print(completion.choices[0].message.content)

    completion_pydantic = client.chat.completions.parse(
      extra_body={},
      model="nvidia/nemotron-nano-9b-v2:free",
      messages=[
        {
          "role": "user",
          "content": prompt,
        }
      ],
      response_format=ChestXrayReport
    )

    print(ChestXrayReport.model_validate_json(completion_pydantic.choices[0].message.content))

 i = 0
**Структурированное заключение**  

**1. Основные находки:**  
- **Правое легкое:** Обнаружены очагово-инфильтративные изменения в нижней доле с размером до 4 см.  
- **Левое легкое:** Выявлен минимальный плевральный выпот в проекции нижней доли.  
- **Сердце и корни легких:** Нет аномалий.  
- **Трахея:** Правильно расположена по средней линии.  

**2. Диагностическая оценка:**  
- Правый легкий характеризуется локализованными поражениями, подозреваемыми на инфекционную или воспалительную природу (требуется уточнение с помощью компьютерной томографии).  
- Слева присутствуют признаки плеврального выпота, не требующие немедленного вмешательства, но рекомендуется консультация специалиста для исключения патологических причин.  

**3. Рекомендации:**  
- Провести **компьютерную томографию грудной клетки** для оценки размера, локализации и природы инфильтрации в правом легком.  
- Обратиться к **пульмонологу** для дифференциальной диагностики (возможные причины: пневмония, интерстиц

# Lesson 4 Практика. NER как генерация — дообучение и промптинг

In [None]:
!pip install accelerate vllm==0.10.2 seqeval

Collecting vllm
  Downloading vllm-0.14.1-cp38-abi3-manylinux_2_31_x86_64.whl.metadata (8.9 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting blake3 (from vllm)
  Downloading blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting protobuf>=6.30.0 (from vllm)
  Downloading protobuf-6.33.4-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer==0.11.3 (from vllm)
  Downloading lm_format_enforcer-0.11.3-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<1.4.0,>=1.3.0 (from vllm)
  Downloading llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x8

In [None]:
# !pip install vllm==0.10.2

Collecting vllm==0.10.2
  Downloading vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl.metadata (16 kB)
Collecting llguidance<0.8.0,>=0.7.11 (from vllm==0.10.2)
  Downloading llguidance-0.7.30-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting xgrammar==0.1.23 (from vllm==0.10.2)
  Downloading xgrammar-0.1.23-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Collecting setuptools<80,>=77.0.3 (from vllm==0.10.2)
  Downloading setuptools-79.0.1-py3-none-any.whl.metadata (6.5 kB)
Collecting compressed-tensors==0.11.0 (from vllm==0.10.2)
  Downloading compressed_tensors-0.11.0-py3-none-any.whl.metadata (7.0 kB)
Collecting depyf==0.19.0 (from vllm==0.10.2)
  Downloading depyf-0.19.0-py3-none-any.whl.metadata (7.3 kB)
Collecting torch==2.8.0 (from vllm==0.10.2)
  Downloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchaudio==2.8.0 (from vllm==0.10.2)
  Downloading torchaudio-2.8.0-cp312-cp312-manyl

# Task 4-1

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
main_path = '/content/drive/MyDrive/PRACTICUM_DLE/sprint_7/'

In [None]:
data_path = main_path + 'medicine_dataset/'

In [None]:
from datasets import DatasetDict, Dataset
import json

In [None]:
with open(data_path + "train_v1.jsonl.txt", "r") as fp:
    train_ds = [json.loads(x) for x in fp.readlines()]
    train_ds = Dataset.from_list(train_ds)

with open(data_path + "dev_v1.jsonl.txt", "r") as fp:
    dev_ds = [json.loads(x) for x in fp.readlines()]
    dev_ds = Dataset.from_list(dev_ds)

with open(data_path + "test_v1.jsonl.txt", "r") as fp:
    test_ds = [json.loads(x) for x in fp.readlines()]
    test_ds = Dataset.from_list(test_ds)

In [None]:
ner_dataset = DatasetDict()
ner_dataset["train"] = train_ds
ner_dataset["dev"] = dev_ds
ner_dataset["test"] = test_ds

In [None]:
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'tokens', 'ner_tags'],
        num_rows: 3440
    })
    dev: Dataset({
        features: ['idx', 'tokens', 'ner_tags'],
        num_rows: 676
    })
    test: Dataset({
        features: ['idx', 'tokens', 'ner_tags'],
        num_rows: 693
    })
})

In [None]:
labels_list = set()

for x in ner_dataset["train"]:
    labels_list = labels_list.union(x["ner_tags"])

labels_list = list(labels_list)

In [None]:
from IPython.display import HTML, display
import html

In [None]:
def visualize_tokens(tokens, tags):
    """
    tokens: список токенов
    tags: BIO-теги для этих токенов
    """

    html_output = ""
    i = 0
    while i < len(tokens):
        tag = tags[i]

        if tag.startswith("B-"):  # начало сущности
            ent_type = tag[2:]
            ent_tokens = [tokens[i]]

            # собираем все I- токены этого же типа
            j = i + 1
            while j < len(tokens) and tags[j] == f"I-{ent_type}":
                ent_tokens.append(tokens[j])
                j += 1

            # объединяем в один span
            ent_text = " ".join(ent_tokens)
            html_output += f"<span style='background-color: #ffd54f; padding:2px; margin:1px; border-radius:4px;'>{html.escape(ent_text)} <sub>{ent_type}</sub></span> "
            i = j
        else:
            # токен вне сущности
            html_output += html.escape(tokens[i]) + " "
            i += 1

    display(HTML(html_output))

In [None]:
for i in range(3):
    ex = ner_dataset['train'][i]
    print('\nПример', i)
    visualize_tokens(ex['tokens'], ex['ner_tags'])


Пример 0



Пример 1



Пример 2


# Task 4-2

In [None]:
from typing import Any, Tuple, List, Dict
import evaluate
import numpy as np

In [None]:
seqeval = evaluate.load('seqeval')

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
def convert_preds_to_labels(predictions: np.ndarray, label_ids: np.ndarray, label_list: List[str]) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Преобразует predictions (логиты) и истинные label_ids в списки меток в строковом виде для seqeval.
    Ожидается: predictions.shape = (batch, seq_len, num_labels) или (batch, seq_len) если уже argmax.
    label_ids — числовые метки с -100 для игнорируемых токенов.
    """
    if predictions.ndim == 3:
        preds = np.argmax(predictions, axis=-1)
    else:
        preds = predictions
    true_labels = []
    pred_labels = []
    for pred_row, label_row in zip(preds, label_ids):
        tl_row = []
        pl_row = []
        for p, l in zip(pred_row, label_row):
            if l == -100:
                continue
            tl_row.append(label_list[l])
            pl_row.append(label_list[p])
        true_labels.append(tl_row)
        pred_labels.append(pl_row)
    return pred_labels, true_labels

In [None]:
def compute_metrics_trainer(eval_pred: Any) -> Dict[str, Any]:
    """Функция для Trainer.compute_metrics — возвращает dict с результатами seqeval."""
    predictions, label_ids = eval_pred
    pred_labels, true_labels = convert_preds_to_labels(predictions, label_ids, labels_list)
    result = seqeval.compute(predictions=pred_labels, references=true_labels)
    # Возвращаем основные поля (seqeval уже возвращает nested dict)
    return {
        'precision': result.get('precision', None) or result.get('overall_precision'),
        'recall': result.get('recall', None) or result.get('overall_recall'),
        'f1': result.get('f1', None) or result.get('overall_f1'),
    }

# Донастройка BERT

In [None]:
def tokenize_and_align_labels(batch, tokenizer, labels_list):
    # batch: dict с 'tokens' и 'ner_tags' (каждый — список примеров)
    tokenized_inputs = tokenizer(batch['tokens'],
                                 is_split_into_words=True,
                                 truncation=True,
                                 padding='max_length',
                                 max_length=128,
                                 return_tensors=None)
    all_labels = []
    for i, labels in enumerate(batch['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels_list.index(labels[word_idx]))
            else:
                if labels[word_idx] == "O":
                    label_ids.append(labels_list.index(labels[word_idx]))
                else:
                    label_ids.append(labels_list.index("I-"+labels[word_idx].split("-")[1]))
            previous_word_idx = word_idx
        all_labels.append(label_ids)
    tokenized_inputs['labels'] = all_labels
    return tokenized_inputs

# Task 4-3

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
tokenized_ds = ner_dataset.map(tokenize_and_align_labels, batched=True, fn_kwargs={"tokenizer": tokenizer, "labels_list": labels_list})

Map:   0%|          | 0/3440 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Map:   0%|          | 0/693 [00:00<?, ? examples/s]

In [None]:
num_labels = len(labels_list)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
args = TrainingArguments(
    output_dir='bert-ner',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['dev'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_trainer,
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.383067,0.212361,0.430177,0.28435
2,0.502900,0.271972,0.365237,0.532905,0.43342
3,0.282400,0.273127,0.302573,0.547352,0.389714


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1290, training_loss=0.35460195615310075, metrics={'train_runtime': 312.2532, 'train_samples_per_second': 33.05, 'train_steps_per_second': 4.131, 'total_flos': 674212676382720.0, 'train_loss': 0.35460195615310075, 'epoch': 3.0})

# Инференс

In [None]:
from transformers import pipeline

In [None]:
model.config.label2id = {l: i for i, l in enumerate(labels_list)}
model.config.id2label = {i: l for i, l in enumerate(labels_list)}

In [None]:
p = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cuda:0


In [None]:
sent = " ".join(ner_dataset['test'][0]["tokens"])
print(p(sent))

[{'entity_group': 'DI', 'score': np.float32(0.89082783), 'word': 'простуда на губах', 'start': 8, 'end': 25}, {'entity_group': 'Drugform', 'score': np.float32(0.925561), 'word': 'мазь', 'start': 55, 'end': 59}, {'entity_group': 'Drugname', 'score': np.float32(0.9933857), 'word': 'Ацикловир', 'start': 60, 'end': 69}, {'entity_group': 'DI', 'score': np.float32(0.6754481), 'word': 'название', 'start': 92, 'end': 100}]


In [None]:
from typing import List, Dict, Tuple

In [None]:
def convert_pipeline_output_to_bio(pipeline_output: List[Dict], tokens: List[str], label_list: List[str]) -> List[str]:
    tags = ["O"] * len(tokens)
    token_char_indices = []
    current_char_idx = 0
    for token in tokens:
        token_char_indices.append((current_char_idx, current_char_idx + len(token)))
        current_char_idx += len(token) + 1 # Account for space

    for entity in pipeline_output:
        start_char = entity['start']
        end_char = entity['end']
        entity_type = entity['entity_group']

        start_token_idx = -1
        end_token_idx = -1

        # Find the token indices corresponding to the entity character span
        for i, (token_start_char, token_end_char) in enumerate(token_char_indices):
            # Check if the entity span starts within or at the beginning of the token span
            if start_char >= token_start_char and start_char < token_end_char:
                 start_token_idx = i
            # Check if the entity span ends within or at the end of the token span
            if end_char > token_start_char and end_char <= token_end_char:
                 end_token_idx = i

        if start_token_idx != -1 and end_token_idx != -1:
            tags[start_token_idx] = f"B-{entity_type}"
            for i in range(start_token_idx + 1, end_token_idx + 1):
                tags[i] = f"I-{entity_type}"

    return tags

In [None]:
test_sentence_tokens = ner_dataset['test'][0]["tokens"]
test_sent = " ".join(test_sentence_tokens)
pipeline_result = p(test_sent)
bio_tags = convert_pipeline_output_to_bio(pipeline_result, test_sentence_tokens, labels_list)
print("Tokens:", test_sentence_tokens)
print("BIO Tags:", bio_tags)

visualize_tokens(test_sentence_tokens, bio_tags)

Tokens: ['Вылезла', 'простуда', 'на', 'губах', ',', 'заказала', 'мужу', ',', 'чтоб', 'купил', 'мазь', 'Ацикловир', ',', 'ну', 'вообщем', 'он', 'забыл', 'название', ',', 'и', 'купил', ',', 'то', ',', 'что', 'собственно', 'ему', 'предложили', 'в', 'аптеке', ',', 'прорекламировав', 'так', ',', 'что', 'мол', 'жена', 'оценит', 'некий', 'матирующий', 'эффект', 'от', 'данного', 'средства', '.']
BIO Tags: ['O', 'B-DI', 'I-DI', 'I-DI', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Drugform', 'B-Drugname', 'O', 'O', 'O', 'O', 'O', 'B-DI', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
eval_bert_finetune_metrics = trainer.evaluate(tokenized_ds['test'])

In [None]:
eval_bert_finetune_metrics

{'eval_loss': 0.29913195967674255,
 'eval_precision': 0.30718336483931946,
 'eval_recall': 0.540765391014975,
 'eval_f1': 0.39180229053646776,
 'eval_runtime': 6.3239,
 'eval_samples_per_second': 109.584,
 'eval_steps_per_second': 13.757,
 'epoch': 3.0}

# LLM + prompt

# Task 4-4

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

In [None]:
slug = "Qwen/Qwen3-0.6B"
qwen_tokenizer = AutoTokenizer.from_pretrained(slug)
qwen_model = AutoModelForCausalLM.from_pretrained(slug, device_map="auto")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
import torch

In [None]:
@torch.no_grad
def inference(model, tokenizer, sentence):
    prompt = 'Extract entities (ADR, DI, Drugclass, Drugform, Drugname, Finding) from the sentence. Return as JSON list of {"text":..., "type":...}.\n\n'+sentence+'\n\n ```json\n'
    messages = [
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # conduct text completion
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32768
    )
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()


    content = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
    return content

In [None]:
def prompt_output_to_bio(tokens: List[str], prompt_output: str, label_list: List[str]) -> List[int]:
    ner_tags = [label_list.index('O')] * len(tokens)
    # Попробуем найти JSON
    parsed = None
    pattern = r"```json\n([\s\S]*?)\n```"
    matches = re.findall(pattern, prompt_output)
    try:
        parsed = json.loads(matches[0])
    except Exception:
        # Попробуем найти шаблон 'entity: Type; entity2: Type'
        parts = [p.strip() for p in prompt_output.replace(';', '\n').split('\n') if p.strip()]
        parsed = []
        for p in parts:
            if ':' in p:
                left, right = p.split(':', 1)
                parsed.append({'text': left.strip().strip('"'), 'type': right.strip()})

    # parsed ожидается как список объектов {'text':..., 'type':...}
    if isinstance(parsed, list):
        for ent in parsed:
            text = ent.get('text') if isinstance(ent, dict) else None
            typ = ent.get('type') if isinstance(ent, dict) else None
            if not text or not typ:
                continue
            # простая стратегия: ищем последовательность токенов, равную text.split()
            ent_toks = text.split()
            # naive search
            for i in range(len(tokens) - len(ent_toks) + 1):
                window = tokens[i:i+len(ent_toks)]
                if [w.lower().strip('.,') for w in window] == [w.lower().strip('.,') for w in ent_toks]:
                    b_label = f'B-{typ}'
                    i_label = f'I-{typ}'
                    if b_label in label_list:
                        ner_tags[i] = label_list.index(b_label)
                        for j in range(1, len(ent_toks)):
                            ner_tags[i+j] = label_list.index(i_label) if i_label in label_list else ner_tags[i+j]
                    break
    return ner_tags

In [None]:
raw_output = inference(qwen_model, qwen_tokenizer, test_sent)
print('Выход LLM:', raw_output)

bio_tags = prompt_output_to_bio(test_sentence_tokens, raw_output, labels_list)
print('Сконвертированные теги:', [labels_list[tag] for tag in bio_tags])

Выход LLM: ```json
[
  {"text": "Простуда", "type": "ADR"},
  {"text": "Мужу", "type": "Drugform"},
  {"text": "Мазь Ацикловир", "type": "Drugname"},
  {"text": "Заказала", "type": "Finding"},
  {"text": "Забыла название", "type": "Finding"},
  {"text": "Купил", "type": "Finding"},
  {"text": "Мол жена оценит", "type": "Finding"},
  {"text": "Некий матирующий эффект", "type": "Finding"}
]
```
Сконвертированные теги: ['O', 'B-ADR', 'O', 'O', 'O', 'B-Finding', 'B-Drugform', 'O', 'O', 'B-Finding', 'B-Drugname', 'I-Drugname', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Finding', 'I-Finding', 'I-Finding', 'B-Finding', 'I-Finding', 'I-Finding', 'O', 'O', 'O', 'O']


In [None]:
visualize_tokens(test_sentence_tokens, [labels_list[tag] for tag in bio_tags])

# VLLM + Structured Outputs

# Task 4-5

In [None]:
from enum import StrEnum
import pydantic
from typing import List

class EntityType(StrEnum):
    adr = "ADR"
    di = "DI"
    drugclass = "Drugclass"
    drugform = "Drugform"
    drugname = "Drugname"
    finding = "Finding"

class Entity(pydantic.BaseModel):
    text: str
    type: EntityType

class Result(pydantic.BaseModel):
    entities: List[Entity]

In [None]:
from vllm import SamplingParams
from vllm.sampling_params import GuidedDecodingParams

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [None]:
json_schema = Result.model_json_schema()
guided = GuidedDecodingParams(json=json_schema)
sampling_params = SamplingParams(guided_decoding=guided, max_tokens=500)

In [None]:
from vllm import LLM

INFO 01-27 20:35:35 [__init__.py:216] Automatically detected platform cuda.


In [None]:
slug = "Qwen/Qwen3-1.7B"
llm = LLM(model=slug, guided_decoding_backend='xgrammar', max_num_batched_tokens=512, max_model_len=4096, gpu_memory_utilization=0.8)

INFO 01-27 20:35:37 [utils.py:328] non-default args: {'max_model_len': 4096, 'gpu_memory_utilization': 0.8, 'max_num_batched_tokens': 512, 'disable_log_stats': True, 'guided_decoding_backend': 'xgrammar', 'model': 'Qwen/Qwen3-1.7B'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

INFO 01-27 20:36:22 [__init__.py:742] Resolved architecture: Qwen3ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 01-27 20:36:22 [__init__.py:1815] Using max model len 4096
INFO 01-27 20:36:29 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=512.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

INFO 01-27 20:43:31 [llm.py:295] Supported_tasks: ['generate']
INFO 01-27 20:43:31 [__init__.py:36] No IOProcessor plugins requested by the model


In [None]:
def schema_inference(model, sentence, sampling_params):
    prompt = 'Extract entities of class ADR (adverse drug reaction), DI (drug interference), Drugclass, Drugform, Drugname or Finding from the sentence. Return as JSON list of {"text": quote_from_text, "type": assigned_class}. Skip non-mentioned classes\n' + sentence
    outputs = llm.generate(prompts=prompt, sampling_params=sampling_params, use_tqdm=False)
    return Result.model_validate_json(outputs[0].outputs[0].text).entities

In [None]:
def schema_to_bio(tokens, parsed, label_list):
    ner_tags = [label_list.index('O')] * len(tokens)
    # parsed - список объектов класса Entity
    for ent in parsed:
        text = ent.text
        typ = str(ent.type)
        if not text or not typ:
            continue
        # простая стратегия: ищем последовательность токенов в предложении, равную text.split()
        ent_toks = text.split()
        #
        for i in range(len(tokens) - len(ent_toks) + 1):
            window = tokens[i:i+len(ent_toks)]
            if [w.lower().strip('.,') for w in window] == [w.lower().strip('.,') for w in ent_toks]:
                b_label = f'B-{typ}'
                i_label = f'I-{typ}'
                if b_label in label_list: # проверяем, что такой тег действительно есть в нашей задаче, иначе мы не знаем, с чем его сравнивать.
                    ner_tags[i] = label_list.index(b_label)  # сначала добавим в список тегов открывающийся тег
                    for j in range(1, len(ent_toks)):
                        ner_tags[i+j] = label_list.index(i_label) if i_label in label_list else ner_tags[i+j] # а затем - все внутренние теги
                break
    return ner_tags


def evaluate_llm_schema_on_dataset(model, dataset, label_list, sampling_params):
    true_labels = []
    pred_labels = []

    for example in tqdm(dataset):
        tokens = example['tokens']
        true_tags = example['ner_tags']

        sentence = " ".join(tokens)
        try:
          raw_output = schema_inference(model, sentence, sampling_params=sampling_params)
        except pydantic.ValidationError:
          # если что-то пошло не по плану - например, генерация зациклилась, то распарсить ответ не получится - пропустим такие примеры
          raw_output = dict()

        pred_tags = schema_to_bio(tokens, raw_output, label_list)

        true_labels.append(true_tags)
        pred_labels.append([label_list[tag] for tag in pred_tags])


    results = seqeval.compute(predictions=pred_labels, references=true_labels)
    return results

In [None]:
num_samples = 100
eval_llm_schema_metrics = evaluate_llm_schema_on_dataset(llm, ner_dataset['test'].select(range(num_samples)), labels_list, sampling_params)

NameError: name 'ner_dataset' is not defined

# Дообучение в формате T5

# Task 4-6

In [None]:
from typing import List, Tuple

In [None]:
def extract_entities_intervals(tags: List[int]) -> List[Tuple[str, int, int]]:
    """
    Преобразует список тегов (в виде индексов) в интервалы сущностей.
    Возвращает список кортежей: (label, start_idx, end_idx).
    """
    entities = []
    start, end, ent_type = None, None, None

    for i, label in enumerate(tags):

        if label == "O":
            if ent_type is not None:
                entities.append((ent_type, start, end))
                ent_type, start, end = None, None, None
        elif label.startswith("B-"):
            if ent_type is not None:
                entities.append((ent_type, start, end))
            ent_type = label[2:]
            start, end = i, i
        elif label.startswith("I-") and ent_type == label[2:]:
            end = i
        else:
            # случай разметки не по правилам BIO
            if ent_type is not None:
                entities.append((ent_type, start, end))
            ent_type, start, end = None, None, None

    if ent_type is not None:
        entities.append((ent_type, start, end))

    return entities

In [None]:
def make_target_from_entities(tokens: List[str], tags: List[int]) -> str:
    ents = extract_entities_intervals(tags)
    parts = []
    for ty, s, e in ents:
        text = ' '.join(tokens[s:e+1])
        parts.append(f'"{text}": "{ty}"')
    return "{" + (', '.join(parts) if parts else '') + "}"

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
def prepare_seq2seq_dataset(dataset):
    # функция, которая преобразует каждый сплит
    def _convert_split(split):
        records = []
        for ex in dataset[split]:
            input_text = " ".join(ex["tokens"]) # склеенное входное предложение
            target_text = make_target_from_entities(ex["tokens"], ex["ner_tags"]) # строка с ожидаемым результатом генерации
            records.append({"input_text": input_text, "target_text": target_text})
        return Dataset.from_list(records)

    out = {}
    for split in dataset.keys():
        out[split] = _convert_split(split)
    return DatasetDict(out)

In [None]:
def tokenize_seq2seq(batch):
    # токенизация входной строки
    model_inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=256)
    # токенизация выхода
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=256)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
seq2seq_dataset = prepare_seq2seq_dataset(ner_dataset)
tokenized_dataset = seq2seq_dataset.map(tokenize_seq2seq, batched=True)

In [None]:
print(seq2seq_dataset['train'][0])

In [None]:
from peft import get_peft_model, PromptTuningConfig, TaskType
from typing import List
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
model_name = 'google/mt5-small'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
main_path = '/content/drive/MyDrive/PRACTICUM_DLE/sprint_7/'

In [None]:
data_path = main_path + 'medicine_dataset/'

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir=main_path + "t5_ner",
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,
    logging_dir=main_path + "logs",
    report_to="none"
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

# Task 4-7

In [None]:
def t5_output_to_bio(decoded_text, tokens, label_list):
    ner_tags = ["O"] * len(tokens)
    entities = {}

    try:
        entities = eval(decoded_text)
    except:
        decoded_text = decoded_text.replace("{", "").replace("}", "")
        parts = [p.strip() for p in decoded_text.split(',') if p.strip()]
        for p in parts:
            if ':' in p:
                text, typ = p.split(':', 1)
                entities[text.strip()] = typ.strip()

    for entity_text, entity_type in entities.items():
        ent_toks = entity_text.split()
        for i in range(len(tokens) - len(ent_toks) + 1):
            window = tokens[i:i+len(ent_toks)]

            if [w.lower().strip('.,') for w in window] == [w.lower().strip('.,') for w in ent_toks]:
                b_label = f"B-{entity_type}"
                i_label = f"I-{entity_type}"
                if b_label in label_list:
                    ner_tags[i] = b_label
                    for j in range(1, len(ent_toks)):
                        if i + j < len(tokens):
                            ner_tags[i+j] = i_label if i_label in label_list else ner_tags[i+j]
                break
    return ner_tags

In [None]:
raw_t5_output = '{"мазь": "Drugform"}'
tags = t5_output_to_bio(raw_t5_output, test_sentence_tokens, labels_list)

In [None]:
print("Сконвертированные теги: ", tags)

In [None]:
import numpy as np
import torch

In [None]:
def evaluate_t5_with_seqeval(trainer, dataset, raw_dataset, label_list):
    predictions = trainer.predict(dataset, max_length=256)

    preds = np.where(predictions.predictions != -100, predictions.predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, )
    decoded_labels = tokenizer.batch_decode(predictions.label_ids, skip_special_tokens=True)

    true_labels = []
    pred_labels = []

    for i in range(len(dataset)):
        original_tokens = raw_dataset[i]["tokens"]
        true_tags = raw_dataset[i]["ner_tags"]
        true_labels.append(true_tags)

        pred_bio_tags = t5_output_to_bio(decoded_preds[i], original_tokens, labels_list)
        pred_labels.append(pred_bio_tags)

    t5_seqeval_results = seqeval.compute(predictions=pred_labels, references=true_labels)
    return t5_seqeval_results



In [None]:
@torch.no_grad
def t5_inference(sentence: str, model, tokenizer) -> List[str]:
    input_ids = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=256).input_ids.to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=256)
    decoded_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return decoded_output

In [None]:
raw_t5_output = t5_inference(test_sent, model, tokenizer)
tags = t5_output_to_bio(raw_t5_output, test_sentence_tokens, labels_list)

In [None]:
print("Выход T5: ", raw_t5_output)
print("Сконвертированные теги: ", tags)

In [None]:
visualize_tokens(test_sentence_tokens, tags)

In [None]:
num_samples = 100  # если есть время, проведите замер на полном тестовом датасете
eval_t5_results = evaluate_t5_with_seqeval(trainer, tokenized_dataset['test'].select(range(num_samples)), ner_dataset['test'].select(range(num_samples)), labels_list)

# lesson 7 Практика. Дообучение T5 с помощью LoRA на датасете SQuAD

# Task 7-1

In [14]:
pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.27.3 rapidfuzz-3.14.3


In [21]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [2]:
MODEL_ID = "ai-forever/ruT5-base"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
MAX_SRC_LEN = 512
MAX_TGT_LEN = 64

In [5]:
raw = load_dataset("kuznetsoffandrey/sberquad")

README.md: 0.00B [00:00, ?B/s]

sberquad/train-00000-of-00001.parquet:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

sberquad/validation-00000-of-00001.parqu(…):   0%|          | 0.00/3.43M [00:00<?, ?B/s]

sberquad/test-00000-of-00001.parquet:   0%|          | 0.00/4.93M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45328 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5036 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/23936 [00:00<?, ? examples/s]

In [6]:
def format_example(ex):
    q = ex["question"].strip()
    c = ex["context"].strip()
    y = ex["answers"]["text"][0].strip() if ex["answers"]["text"] else ""
    src = f"context: {c} \nquestion: {q}"
    tgt = y
    return {"input_text": src, "labels_text": tgt}

In [7]:
raw['train'][0]

{'id': 62310,
 'title': 'SberChallenge',
 'context': 'В протерозойских отложениях органические остатки встречаются намного чаще, чем в архейских. Они представлены известковыми выделениями сине-зелёных водорослей, ходами червей, остатками кишечнополостных. Кроме известковых водорослей, к числу древнейших растительных остатков относятся скопления графито-углистого вещества, образовавшегося в результате разложения Corycium enigmaticum. В кремнистых сланцах железорудной формации Канады найдены нитевидные водоросли, грибные нити и формы, близкие современным кокколитофоридам. В железистых кварцитах Северной Америки и Сибири обнаружены железистые продукты жизнедеятельности бактерий.',
 'question': 'чем представлены органические остатки?',
 'answers': {'text': ['известковыми выделениями сине-зелёных водорослей'],
  'answer_start': [109]}}

In [8]:
def tokenize(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=MAX_SRC_LEN,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        batch["labels_text"],
        max_length=MAX_TGT_LEN,
        truncation=True,
        padding="max_length",
    )
    labels_ids = labels["input_ids"]
    labels_ids = [[(tid if tid != tokenizer.pad_token_id else -100) for tid in seq] for seq in labels_ids]
    model_inputs["labels"] = labels_ids
    return model_inputs

In [9]:
formatted_train = raw["train"].shard(num_shards=10, index=0).map(format_example)
formatted_val = raw["validation"].shard(num_shards=10, index=0).map(format_example)

Map:   0%|          | 0/4533 [00:00<?, ? examples/s]

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

In [10]:
cols_to_remove = [c for c in raw["train"].column_names if c not in ["input_text", "labels_text"]]
formatted_train = formatted_train.remove_columns(cols_to_remove)
formatted_val = formatted_val.remove_columns(cols_to_remove)

In [11]:
tokenized_train = formatted_train.map(tokenize, batched=True, remove_columns=["input_text", "labels_text"])
tokenized_val = formatted_val.map(tokenize, batched=True, remove_columns=["input_text", "labels_text"])

Map:   0%|          | 0/4533 [00:00<?, ? examples/s]

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

In [12]:
print(tokenized_train[0].keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


# MEtrics

In [17]:
import Levenshtein

In [18]:
def compute_cer(preds, refs):
    errors = 0
    lens = 0
    for p, r in zip(preds, refs):
        errors += Levenshtein.distance(p, r)
        lens += len(r)
    return round(errors / lens * 100, 2) # количество ошибок на длину правильных текстов


In [19]:
print(compute_cer(['Привт'], ['Привет']))

16.67


# task 7-2

In [22]:
import torch
import evaluate
import numpy as np

In [23]:
squad_metric = evaluate.load("squad")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [24]:
def compute_metrics_with_evaluate(preds, refs):
    ids = list(range(len(refs)))
    predictions = [{"id": str(i), "prediction_text": p} for i, p in zip(ids, preds)]
    references = [{"id": str(i), "answers": {"text": [r], "answer_start": [0]}} for i, r in zip(ids, refs)]
    squad_res = squad_metric.compute(predictions=predictions, references=references)
    cer = compute_cer(preds, refs)
    return {
        "EM": squad_res["exact_match"],
        "F1": squad_res["f1"],
        "CER": cer,
        "count": len(refs)
    }

# Task 7-3  Файнтюн полной модели

In [25]:
import torch
from transformers import AutoModelForSeq2SeqLM, \
                        DataCollatorForSeq2Seq, \
                        Seq2SeqTrainingArguments, \
                        Seq2SeqTrainer

In [26]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [27]:
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [28]:
args = Seq2SeqTrainingArguments(
    output_dir="rut5_base_full",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-4,
    num_train_epochs=3,
    logging_steps=1,
    eval_strategy="epoch",
    predict_with_generate=True,
    gradient_accumulation_steps=1,
    optim='adafactor'
)

In [29]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=collator,
)

In [30]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
res = trainer.predict(tokenized_val)
preds = res.predictions
preds[preds < 0] = 0
pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.array(tokenized_val['labels'])
labels[labels < 0] = 0
label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)
print(compute_metrics_with_evaluate(pred_texts, label_texts))

# Task 7-4 Файнтюн адаптера

In [31]:
import torch
from transformers import AutoModelForSeq2SeqLM, \
                        DataCollatorForSeq2Seq, \
                        Seq2SeqTrainingArguments, \
                        Seq2SeqTrainer
from peft import LoraConfig, get_peft_model

In [32]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

In [33]:
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q", "k", "v",],
    task_type="SEQ_2_SEQ_LM",
)

In [34]:
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainable params: 1,327,104 || all params: 224,230,656 || trainable%: 0.5918


In [35]:
args = Seq2SeqTrainingArguments(
    output_dir="rut5_base_full",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-4,
    num_train_epochs=3,
    logging_steps=1,
    eval_strategy="epoch",
    predict_with_generate=True,
    gradient_accumulation_steps=1,
    optim='adafactor'
)

In [36]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=collator,
)

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
res = trainer.predict(tokenized_val)
preds = res.predictions
preds[preds < 0] = 0
pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.array(tokenized_val['labels'])
labels[labels < 0] = 0
label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)
print(compute_metrics_with_evaluate(pred_texts, label_texts))