In [2]:
!nvidia-smi

Sun Mar 16 20:34:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [24]:
#!pip install accelerate
#!pip install transformers==4.45.2
#!pip install bitsandbytes
!pip install datasets
!pip install rouge-score
!pip install pymorphy3
!pip install seqeval
#!pip install peft
#!pip install flash_attn

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=cd27a73edda390d9732568bca01aed6cd53e7c69349d539486632fa5e5b724ac
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
!git clone https://github.com/RefalMachine/llmtf_open
%cd llmtf_open
!wget https://raw.githubusercontent.com/dialogue-evaluation/RuOpinionNE-2024/master/train.jsonl

Cloning into 'llmtf_open'...
remote: Enumerating objects: 639, done.[K
remote: Counting objects: 100% (219/219), done.[K
remote: Compressing objects: 100% (148/148), done.[K
remote: Total 639 (delta 150), reused 135 (delta 70), pack-reused 420 (from 1)[K
Receiving objects: 100% (639/639), 2.32 MiB | 7.59 MiB/s, done.
Resolving deltas: 100% (435/435), done.
/content/llmtf_open
--2025-03-16 20:35:06--  https://raw.githubusercontent.com/dialogue-evaluation/RuOpinionNE-2024/master/train.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1291979 (1.2M) [text/plain]
Saving to: ‘train.jsonl’


2025-03-16 20:35:06 (47.0 MB/s) - ‘train.jsonl’ saved [1291979/1291979]



In [4]:
!ls

conversation_configs  llm_as_a_judge_baselines		  run_evaluate_multinode_multigpu.sh
Dockerfile	      llmtf				  run_evaluate_singlenode_multigpu.sh
eval_grammar.py       README.md				  run_llm_as_a_judge.py
evaluate_model.py     requirements.txt			  todo.txt
examples	      run_evaluate_multinode_multigpu.py  train.jsonl


In [66]:
import codecs
import json
import copy
from collections import OrderedDict, defaultdict
import numpy as np
from tqdm import tqdm
import os
from datasets import load_dataset, Dataset
from typing import Dict, List, Tuple
from llmtf.metrics import mean, metric_max_over_ground_truths, f1_macro_score
import transformers.data.metrics.squad_metrics as squad_metrics
import re
from llmtf.base import Task, SimpleFewShotHFTask, LLM
from difflib import SequenceMatcher
import pandas as pd
import sys
import string

class MultiQ(SimpleFewShotHFTask):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.method = 'generate'
        self.dataset_name = 'multiq'
        self._max_new_tokens = 64

    @classmethod
    def name(cls):
        return 'darumeru/MultiQ'

    def dataset_args(self) -> Dict:
        return {'path': 'RefalMachine/darumeru', 'name': self.dataset_name}

    def aggregation(self) -> Dict:
        return {"f1": mean, "em": mean}

    def evaluate(self, sample, y_pred) -> Dict:
        y_true = [answer["segment"] for answer in sample['outputs']]
        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, y_pred, y_true)
        em = metric_max_over_ground_truths(squad_metrics.compute_exact, y_pred, y_true)

        return {
            "f1": f1,
            "em": em,
        }
    def test_split_name(self) -> str:
        return 'test'

    def prompt_split_name(self) -> str:
        return 'prompt'

    def create_messages(self, sample, with_answer=None) -> List[Dict]:
        # ignoring with_answer because it's already taken into account in the darumeru dataset
        messages = sample['messages']
        inputs = sample['inputs']
        for m in messages:
            m['content'] = m['content'].format(**inputs)
        return messages

class conll(SimpleFewShotHFTask):
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.method = 'generate'
        self.dataset_name = 'eriktks/conll2003'
        self._max_new_tokens = 1
    @classmethod
    def name(cls) -> str:
        return 'eriktks/conll2003'
    @property
    def choices(self) ->List:
        return ['0', '1', '2', '3', '4', '5', '6', '7', '8']
    def create_messages(self, sample, with_answer):
        messages = []
        instruction_user = 'Твоя задача решить задачу NER: извлечение именованных сущностей. Тебе нужно отнести каждый токен текста к одной из следующих категорий: 0-нет сущности, 1 - начало личности, 2 - продолжение личности, 3 - начало организации, 4 - продолжение организации, 5 - начало локации, 6 - продолжение локации, 7 - начало остального, 8 - продолжение остального. Ответом должна служить последовательность этих чисел, где каждое число относит соответствующий токен. \Токены: {tokens}'
        instruction_bot = 'Ответ: {ner_tags}'
        instruction_bot_incomplete = 'Ответ:'
        bot_content = instruction_bot.format(**sample) if with_answer else instruction_bot_incomplete
        messages.append({'role': 'user', 'content': instruction_user.format(**sample)})
        messages.append({'role': 'bot', 'content': bot_content})
        return messages
    def test_split_name(self) -> str:
        return 'validation'
    def prompt_split_name(self) -> str:
        return 'train'
    def dataset_args(self) -> Dict:
        return {'path': 'eriktks/conll2003'}
    def evaluate(self, sample, y_pred) -> Dict:
        y_true = str(sample['ner_tags'])
        return {"f1": metric_max_over_ground_truths(squad_metrics.compute_f1, y_pred, y_true)}
    def aggregation(self) -> Dict:
        return {"f1": mean}
    def prompt_dataset_start_idx(self) -> int:
        return 0


In [8]:
from llmtf.model import HFModel
from llmtf.evaluator import Evaluator


s_model = HFModel(attn_implementation='sdpa', device_map='cuda')
s_model.from_pretrained('RefalMachine/RuadaptQwen2.5-1.5B-instruct')
s_model.generation_config.max_new_tokens = 200
s_model.generation_config.repetition_penalty = 1.0
s_model.generation_config.do_sample = True
s_model.generation_config.temperature = 0.1
s_model.generation_config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/822 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.34M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

INFO: 2025-03-16 20:36:20,028: llmtf.base.hfmodel: Set eos_token_id in generation_config to [145111]
INFO:llmtf.base.hfmodel:Set eos_token_id in generation_config to [145111]
INFO: 2025-03-16 20:36:20,031: llmtf.base.hfmodel: Model id: RefalMachine/RuadaptQwen2.5-1.5B-instruct
INFO:llmtf.base.hfmodel:Model id: RefalMachine/RuadaptQwen2.5-1.5B-instruct
INFO: 2025-03-16 20:36:20,032: llmtf.base.hfmodel: Leading space: False
INFO:llmtf.base.hfmodel:Leading space: False


GenerationConfig {
  "bos_token_id": 145109,
  "do_sample": true,
  "eos_token_id": [
    145111
  ],
  "max_length": 32768,
  "max_new_tokens": 200,
  "pad_token_id": 145109,
  "stop_strings": [
    "<|im_end|>"
  ],
  "temperature": 0.1,
  "top_k": 40,
  "top_p": 0.9,
  "trust_remote_code": false
}

In [None]:
from llmtf.model import HFModel
from llmtf.evaluator import Evaluator


f_model = HFModel(attn_implementation='sdpa', device_map='cuda')
f_model.from_pretrained('openchat/openchat-3.5-0106')
f_model.generation_config.max_new_tokens = 200
f_model.generation_config.repetition_penalty = 1.0
f_model.generation_config.do_sample = False
f_model.generation_config.temperature = 0.0
f_model.generation_config

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 34.12 MiB is free. Process 27488 has 14.71 GiB memory in use. Of the allocated memory 14.31 GiB is allocated by PyTorch, and 300.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [67]:
s_task = conll()
f_task = conll()

In [68]:
s_evaluator = Evaluator()

s_evaluator.evaluate_dataset(
    task=s_task,
    model=s_model,
    max_len=4000,
    output_dir = './conll2003_s',
    few_shot_count=0,
    generation_config=None,
    batch_size=4,
    max_sample_per_dataset=20
)

f_evaluator = Evaluator()

f_evaluator.evaluate_dataset(
    task=f_task,
    model=f_model,
    max_len=4000,
    output_dir = './conll2003_f',
    few_shot_count=0,
    generation_config=None,
    batch_size=4,
    max_sample_per_dataset=88
)

INFO: 2025-03-16 21:34:50,998: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
INFO:llmtf.base.hfmodel:Updated generation_config.eos_token_id: [145111]
INFO: 2025-03-16 21:34:50,999: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
INFO:llmtf.base.hfmodel:Updated generation_config.stop_strings: ['<|im_end|>']
100%|██████████| 20/20 [00:00<00:00, 982.85it/s]
INFO: 2025-03-16 21:34:51,944: llmtf.base.eriktks/conll2003: Loading Dataset: 0.94s
INFO:llmtf.base.eriktks/conll2003:Loading Dataset: 0.94s
100%|██████████| 5/5 [02:07<00:00, 25.49s/it]
INFO: 2025-03-16 21:36:59,411: llmtf.base.eriktks/conll2003: Processing Dataset: 127.46s
INFO:llmtf.base.eriktks/conll2003:Processing Dataset: 127.46s
INFO: 2025-03-16 21:36:59,413: llmtf.base.eriktks/conll2003: Results for eriktks/conll2003:
INFO:llmtf.base.eriktks/conll2003:Results for eriktks/conll2003:
INFO: 2025-03-16 21:36:59,416: llmtf.base.eriktks/conll2003: {'f1': 0.11037128712871287}
INFO:llmt

"\nf_evaluator = Evaluator()\n\nf_evaluator.evaluate_dataset(\n    task=f_task,\n    model=f_model,\n    max_len=4000,\n    output_dir = './conll2003_f',\n    few_shot_count=0,\n    generation_config=None,\n    batch_size=4,\n    max_sample_per_dataset=88\n)"

In [46]:
task = MultiQ()

In [None]:
from llmtf.model import HFModel

model = HFModel(attn_implementation='sdpa', device_map='cuda')
model.from_pretrained('RefalMachine/RuadaptQwen2.5-1.5B-instruct')

model.generation_config.max_new_tokens = 200
model.generation_config.repetition_penalty = 1.0
model.generation_config.do_sample = False
model.generation_config.temperature = 0.0
model.generation_config

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.34M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

INFO: 2025-03-06 13:57:15,779: llmtf.base.hfmodel: Set eos_token_id in generation_config to [145111]
INFO:llmtf.base.hfmodel:Set eos_token_id in generation_config to [145111]
INFO: 2025-03-06 13:57:15,781: llmtf.base.hfmodel: Model id: RefalMachine/RuadaptQwen2.5-1.5B-instruct
INFO:llmtf.base.hfmodel:Model id: RefalMachine/RuadaptQwen2.5-1.5B-instruct
INFO: 2025-03-06 13:57:15,783: llmtf.base.hfmodel: Leading space: False
INFO:llmtf.base.hfmodel:Leading space: False


GenerationConfig {
  "bos_token_id": 145109,
  "eos_token_id": [
    145111
  ],
  "max_length": 32768,
  "max_new_tokens": 200,
  "pad_token_id": 145109,
  "stop_strings": [
    "<|im_end|>"
  ],
  "temperature": 0.0,
  "top_k": 40,
  "top_p": 0.9,
  "trust_remote_code": false
}

In [47]:
from llmtf.evaluator import Evaluator
evaluator = Evaluator()

evaluator.evaluate_dataset(
    task=task,
    model=s_model,
    output_dir='./multiq',
    max_len=4000,
    few_shot_count=0,
    generation_config=None, # will use model.generation_config by default
    batch_size=4,
    max_sample_per_dataset=200
)

INFO: 2025-03-16 21:13:50,509: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [145111]
INFO:llmtf.base.hfmodel:Updated generation_config.eos_token_id: [145111]
INFO: 2025-03-16 21:13:50,510: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
INFO:llmtf.base.hfmodel:Updated generation_config.stop_strings: ['<|im_end|>']
100%|██████████| 200/200 [00:00<00:00, 722.96it/s]
INFO: 2025-03-16 21:13:51,706: llmtf.base.darumeru/MultiQ: Loading Dataset: 1.19s
INFO:llmtf.base.darumeru/MultiQ:Loading Dataset: 1.19s
  0%|          | 0/50 [00:02<?, ?it/s]
INFO: 2025-03-16 21:13:54,653: llmtf.base.darumeru/MultiQ: Processing Dataset: 2.94s
INFO:llmtf.base.darumeru/MultiQ:Processing Dataset: 2.94s


<class 'list'>


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
!ls ./multiq/

darumeru_MultiQ.jsonl  darumeru_MultiQ_params.jsonl  darumeru_MultiQ_total.jsonl


In [None]:
!cat ./multiq/darumeru_MultiQ_total.jsonl

{
    "task_name": "darumeru/MultiQ",
    "results": {
        "f1": 0.3217560871948909,
        "em": 0.23
    },
    "leaderboard_result": 0.27587804359744544
}


In [None]:
!cat ./multiq/darumeru_MultiQ_params.jsonl

{
    "custom_generation_config": null,
    "model_params": {
        "model_name_or_path": "RefalMachine/RuadaptQwen2.5-1.5B-instruct",
        "generation_config": {
            "bos_token_id": 145109,
            "eos_token_id": [
                145111
            ],
            "max_length": 32768,
            "max_new_tokens": 200,
            "pad_token_id": 145109,
            "stop_strings": [
                "<|im_end|>"
            ],
            "temperature": 0.0,
            "top_k": 40,
            "top_p": 0.9,
            "transformers_version": "4.45.2",
            "trust_remote_code": false
        },
        "conversation_template": {
            "system_prompt": "",
            "system_message_template": "<|im_start|>system\n{content}<|im_end|>\n",
            "user_message_template": "<|im_start|>user\n{content}<|im_end|>\n",
            "bot_message_template": "<|im_start|>assistant\n{content}<|im_end|>\n",
            "bot_message_template_incomplete": "<|im_st