### Импорты ###

In [1]:
# %pip install vllm

import torch
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import destroy_model_parallel
import pandas as pd
from collections import defaultdict
import random
from random import choices
import ast
import os
import gc
import json

# One seed for the whole notebook: it seeds Python's `random` module (used by
# `choices` when few-shot examples are drawn) and is also passed to vLLM as
# SamplingParams(seed=SEED) in the inference cells.
SEED = 42
random.seed(SEED)



In [2]:
import requests, zipfile, io

# Download the zipped helper sources and unpack them next to the notebook.
# NOTE(review): the following cells import `src.utils` / `src.evaluation`, so the
# archive presumably unpacks into a `src/` package — confirm.
url = 'https://raw.githubusercontent.com/rossyaykin/RuOpinionNE/refs/heads/main/src/src.zip'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail with a clear HTTP error instead of an obscure BadZipFile when the server
# answers with an error page (404, 5xx, ...).
r.raise_for_status()
# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    # An empty path extracts relative to the current working directory.
    z.extractall('')

### Загрузка данных ###

In [3]:
from src.utils import load_jsonl

In [5]:
# Local file names handed to load_jsonl together with the matching remote URL.
# NOTE(review): load_jsonl presumably downloads the file to the given path and
# parses it — confirm in src.utils.
# Mind the naming: `val` is read from gold.jsonl and `test` from validation.jsonl.
train_path, val_path, test_path = "full.jsonl", "gold.jsonl", "validation.jsonl"

# Few-shot example pool.
url = 'https://raw.githubusercontent.com/rossyaykin/RuOpinionNE/refs/heads/main/data/full.jsonl'
train = load_jsonl(url, train_path)

# Split that is scored locally in the inference section.
url = 'https://raw.githubusercontent.com/rossyaykin/RuOpinionNE/refs/heads/main/data/gold.jsonl'
val = load_jsonl(url, val_path)

# Split whose predictions are saved to .jsonl for submission.
url = 'https://raw.githubusercontent.com/rossyaykin/RuOpinionNE/refs/heads/main/data/validation.jsonl'
test = load_jsonl(url, test_path)

# Sizes of the three splits, space-separated on one line.
print(*(len(split) for split in (train, val, test)))

2556 512 1316


### Определения ###

In [6]:
from src.utils import dict2tuple, extract_tuple, form_prompt, str2list, short_report, df2structure
from src.evaluation import do_eval_core

In [7]:
def run(model, sampling_params, train, test, n_shots = 5, chat_template = False):
    """Build one few-shot prompt per test entry and generate predictions.

    Args:
        model: engine used for generation; must expose ``generate(prompts,
            sampling_params)`` and, when ``chat_template`` is set,
            ``get_tokenizer()`` (the notebook passes a vLLM ``LLM``).
        sampling_params: forwarded unchanged to ``model.generate``.
        train: sequence of entries from which the few-shot examples are drawn.
        test: sequence of dict entries with at least 'sent_id' and 'text' keys.
        n_shots: number of examples per prompt; they are drawn independently for
            every test entry with ``random.choices``, i.e. with replacement.
        chat_template: if true, wrap each prompt as a single user message using
            the tokenizer's chat template.

    Returns:
        A list of ``(sent_id, text, target, pred)`` tuples, one per test entry,
        where ``target`` is ``dict2tuple(entry)[1]`` and ``pred`` is
        ``extract_tuple`` applied to the first generated completion.
    """
    # Use the engine passed in as `model`. The previous version silently read the
    # notebook-level global `llm`, so the argument was ignored.
    tokenizer = model.get_tokenizer() if chat_template else None

    prompts = list()
    for entry in test:
        examples = [dict2tuple(x) for x in choices(train, k = n_shots)]
        prompt = form_prompt(examples, entry['text'])
        if chat_template:
            prompt = tokenizer.apply_chat_template([{"role": "user", "content": prompt}],
                                                   tokenize=False,
                                                   add_generation_prompt=True)
        prompts.append(prompt)

    outputs = model.generate(prompts, sampling_params)
    # (sent_id, text, target, pred); outputs are paired with entries by position
    result = [(entry['sent_id'],
               entry['text'],
               dict2tuple(entry)[1],
               extract_tuple(generated.outputs[0].text))
              for entry, generated in zip(test, outputs)]
    return result

def get_path(model_name, sampling_params, n_shots, chat_template = False, short_path = False):
    """Build the extension-less output path for one experiment configuration.

    Args:
        model_name: model identifier, normally ``"<namespace>/<name>"``; a bare
            ``"<name>"`` without a slash is accepted as well.
        sampling_params: object with a ``temperature`` attribute.
        n_shots: number of few-shot examples used in the prompt.
        chat_template: append ``_chat`` to the path when true.
        short_path: use only the leading alphabetic characters of the model name
            as the tag (e.g. ``saiga_llama3_8b`` -> ``saiga``).

    Returns:
        ``results/<tag>/<tag>_bl_<n_shots>shot_<temperature>temp[_chat]``,
        without a file extension (``save`` appends ``.jsonl``).
    """
    parts = model_name.split('/')
    # Keep the historical choice (second component) but do not crash on names
    # that carry no namespace prefix.
    full_name = parts[1] if len(parts) > 1 else parts[0]

    if short_path:
        model_tag = ''
        for c in full_name:
            if not c.isalpha():
                break
            model_tag+=c
        # A name starting with a non-letter would give an empty tag and a broken
        # 'results//...' path; fall back to the full name in that case.
        model_tag = model_tag or full_name
    else:
        model_tag = full_name

    n_shots, temp = str(n_shots), str(sampling_params.temperature)
    path = f'results/{model_tag}/{model_tag}_bl_{n_shots}shot_{temp}temp'
    if chat_template:
        path+='_chat'
    # full path without the file extension
    return path

def to_jsonl(data, target):
    """Write an iterable of JSON-serialisable items to a JSON Lines file.

    Args:
        data: iterable of items (dicts in this notebook); each one becomes a line.
        target: path of the file to create or overwrite.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is opened explicitly as UTF-8 instead of relying on the platform's
    default encoding, which may not be able to represent Cyrillic text.
    """

    with open(target, "w", encoding="utf-8") as f:
        for item in data:
            f.write(f"{json.dumps(item, ensure_ascii=False)}\n")

def save(data, path):
    """Save ``data`` as ``<path>.jsonl``, creating the parent directories first.

    Args:
        data: iterable of JSON-serialisable items, forwarded to ``to_jsonl``.
        path: output path without extension (as returned by ``get_path``).
    """
    outdir = os.path.dirname(path)
    # makedirs creates every missing level (e.g. both 'results' and
    # 'results/<model>') and tolerates an existing directory; a bare file name
    # has no directory part, so nothing needs to be created.
    if outdir:
        os.makedirs(outdir, exist_ok=True)
    to_jsonl(data, f'{path}.jsonl')

### Инференс ###

In [5]:
# model_name = "Vikhrmodels/Vikhr-7B-instruct_0.4"
# model_name = "Vikhrmodels/Vikhr-Gemma-2B-instruct"
# model_name = "Vikhrmodels/it-5.4-fp16-orpo-v2"
# model_name = "IlyaGusev/saiga_llama3_8b"
# model_name = "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half"
# model_name = "lightblue/suzume-llama-3-8B-multilingual"

# model_names = list()
# for dirr in os.listdir('./modelcache/'):
#     if 'models' in dirr:
#         model_name = '/'.join(dirr.split('--')[1:3])
#         model_names.append(model_name)
# model_names

['ai-forever/ruBert-large',
 'ai-forever/ruElectra-large',
 'DeepPavlov/rubert-base-cased-conversational',
 'google-bert/bert-base-multilingual-cased',
 'IlyaGusev/saiga_llama3_8b']

При генерации предсказаний для val-датасета результаты работы модели можно оценить локально.

In [7]:
%%time
# Grid search on the `val` split: every model x temperature x number of shots
# combination is generated and scored locally with do_eval_core.
num_shots = [9, 11, 13, 15]
temperatures = [0.1, 0.2, 0.3, 0.4]
model_names = ["IlyaGusev/saiga_llama3_8b"]

for model_name in model_names:
    # The engine is created once per model and reused for all settings below.
    # NOTE(review): nothing visible here releases a previously created engine;
    # with several entries in model_names (or when re-running the cell) confirm
    # that GPU memory is freed between loads.
    # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
    llm = LLM(model=model_name, dtype='float16')
    
    for temperature in temperatures:
        # https://docs.vllm.ai/en/latest/getting_started/quickstart.html
        # https://github.com/vllm-project/vllm/blob/main/vllm/sampling_params.py
        sampling_params = SamplingParams(temperature=temperature,
                                         top_p=0.9,
                                         max_tokens = 256,
                                         seed = SEED)
        
        for n_shots in num_shots:
            # `path` is only consumed by the commented-out save(...) calls below.
            path = get_path(model_name, sampling_params, n_shots)
            # list of (sent_id, text, target, pred) tuples
            result = run(llm, sampling_params, train, val, n_shots)
            
            # output = pd.DataFrame(result, columns = ['sent_id', 'text', 'target', 'pred'])
            # save(output, path)
            # Same columns as in `result`, with the prediction passed through str2list.
            output = pd.DataFrame([(x[0], x[1], x[2], str2list(x[3])) for x in result],
                      columns = ['sent_id', 'text', 'target', 'pred'])
            print(model_name, temperature, n_shots)
            # Summary of the predictions — see src.utils.short_report for details.
            short_report(output)
            # save(output, path, raw = False)
            # Convert the frame into the structure that do_eval_core is called with.
            output = df2structure(output)
            print(f'f1: {do_eval_core(val, output)}')
            # save(output, path)

INFO 11-19 22:26:32 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='IlyaGusev/saiga_llama3_8b', speculative_config=None, tokenizer='IlyaGusev/saiga_llama3_8b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=IlyaGusev/saiga_llama3_8b, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 11-19 22:26:33 selector.py:217] Cannot use FlashAttention-2 bac

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 11-19 22:26:37 model_runner.py:879] Starting to load model IlyaGusev/saiga_llama3_8b...
INFO 11-19 22:26:38 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 11-19 22:26:38 selector.py:116] Using XFormers backend.
INFO 11-19 22:26:39 weight_utils.py:236] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:02<00:07,  2.65s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.52s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:06<00:02,  2.11s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:08<00:00,  2.36s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:08<00:00,  2.24s/it]



INFO 11-19 22:28:55 model_runner.py:890] Loading model weights took 14.9595 GB
INFO 11-19 22:28:59 gpu_executor.py:121] # GPU blocks: 5765, # CPU blocks: 2048
INFO 11-19 22:29:01 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-19 22:29:01 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-19 22:29:25 model_runner.py:1300] Graph capturing finished in 24 secs.


Processed prompts:   0%|          | 0/512 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  51%|█████     | 259/512 [02:37<01:50,  2.29it/s, est. speed input: 1492.45 toks/s, output: 420.02 toks/s]



Processed prompts: 100%|██████████| 512/512 [04:22<00:00,  1.95it/s, est. speed input: 1777.22 toks/s, output: 498.96 toks/s]

IlyaGusev/saiga_llama3_8b 0.1 9





Count: 52
Accuracy: 0.102
NaNs: 9
0.11352306090157463


Processed prompts:  30%|██▉       | 153/512 [01:49<02:02,  2.94it/s, est. speed input: 1480.24 toks/s, output: 357.42 toks/s]



Processed prompts: 100%|██████████| 512/512 [04:57<00:00,  1.72it/s, est. speed input: 1835.74 toks/s, output: 440.29 toks/s]

IlyaGusev/saiga_llama3_8b 0.1 11
Count: 68
Accuracy: 0.133
NaNs: 8





0.12291522862448095


Processed prompts:  12%|█▎        | 64/512 [01:07<07:12,  1.04it/s, est. speed input: 1186.45 toks/s, output: 242.14 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:28<00:00,  1.56it/s, est. speed input: 1934.01 toks/s, output: 398.63 toks/s]

IlyaGusev/saiga_llama3_8b 0.1 13
Count: 65
Accuracy: 0.127
NaNs: 3





0.12471659469773115


Processed prompts:   0%|          | 0/512 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:44<00:00,  1.49it/s, est. speed input: 2075.36 toks/s, output: 380.70 toks/s]

IlyaGusev/saiga_llama3_8b 0.1 15





Count: 59
Accuracy: 0.115
NaNs: 7
0.10470942643959763


Processed prompts:   0%|          | 0/512 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  53%|█████▎    | 271/512 [02:58<05:20,  1.33s/it, est. speed input: 1399.08 toks/s, output: 389.09 toks/s]



Processed prompts: 100%|██████████| 512/512 [04:23<00:00,  1.95it/s, est. speed input: 1788.57 toks/s, output: 497.98 toks/s]

IlyaGusev/saiga_llama3_8b 0.2 9





Count: 58
Accuracy: 0.113
NaNs: 3
0.10919280246307014


Processed prompts:  43%|████▎     | 222/512 [02:37<02:19,  2.08it/s, est. speed input: 1532.12 toks/s, output: 360.20 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:00<00:00,  1.70it/s, est. speed input: 1851.16 toks/s, output: 436.34 toks/s]

IlyaGusev/saiga_llama3_8b 0.2 11
Count: 66
Accuracy: 0.129
NaNs: 6





0.12289588396936653


Processed prompts:  26%|██▋       | 135/512 [01:46<02:19,  2.69it/s, est. speed input: 1542.93 toks/s, output: 324.25 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:31<00:00,  1.55it/s, est. speed input: 1886.30 toks/s, output: 395.99 toks/s]

IlyaGusev/saiga_llama3_8b 0.2 13
Count: 72
Accuracy: 0.141
NaNs: 5





0.133165613095629


Processed prompts:  13%|█▎        | 66/512 [01:19<11:28,  1.54s/it, est. speed input: 1149.95 toks/s, output: 212.25 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:41<00:00,  1.50it/s, est. speed input: 2072.54 toks/s, output: 383.28 toks/s]

IlyaGusev/saiga_llama3_8b 0.2 15
Count: 77
Accuracy: 0.150
NaNs: 2





0.11612862596589649


Processed prompts:   0%|          | 0/512 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  69%|██████▉   | 352/512 [03:24<01:02,  2.56it/s, est. speed input: 1554.98 toks/s, output: 440.02 toks/s]



Processed prompts: 100%|██████████| 512/512 [04:24<00:00,  1.93it/s, est. speed input: 1767.69 toks/s, output: 495.14 toks/s]

IlyaGusev/saiga_llama3_8b 0.3 9





Count: 65
Accuracy: 0.127
NaNs: 5
0.11914003895746596


Processed prompts:  58%|█████▊    | 297/512 [03:17<01:21,  2.65it/s, est. speed input: 1644.46 toks/s, output: 384.51 toks/s]



Processed prompts: 100%|██████████| 512/512 [04:58<00:00,  1.72it/s, est. speed input: 1857.90 toks/s, output: 439.77 toks/s]

IlyaGusev/saiga_llama3_8b 0.3 11





Count: 77
Accuracy: 0.150
NaNs: 3
0.11822936951656818


Processed prompts:  40%|███▉      | 203/512 [02:42<04:38,  1.11it/s, est. speed input: 1536.79 toks/s, output: 319.49 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:30<00:00,  1.55it/s, est. speed input: 1916.35 toks/s, output: 396.63 toks/s]

IlyaGusev/saiga_llama3_8b 0.3 13
Count: 77
Accuracy: 0.150
NaNs: 6





0.12846402998286416


Processed prompts:  24%|██▍       | 123/512 [01:59<05:51,  1.11it/s, est. speed input: 1418.01 toks/s, output: 264.10 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:44<00:00,  1.49it/s, est. speed input: 2057.68 toks/s, output: 380.79 toks/s]

IlyaGusev/saiga_llama3_8b 0.3 15





Count: 86
Accuracy: 0.168
NaNs: 4
0.1144956821889277


Processed prompts:   0%|          | 0/512 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  68%|██████▊   | 350/512 [03:30<01:25,  1.90it/s, est. speed input: 1516.75 toks/s, output: 424.69 toks/s]



Processed prompts: 100%|██████████| 512/512 [04:23<00:00,  1.94it/s, est. speed input: 1769.94 toks/s, output: 496.64 toks/s]

IlyaGusev/saiga_llama3_8b 0.4 9





Count: 68
Accuracy: 0.133
NaNs: 8
0.11560101782474515


Processed prompts:  59%|█████▉    | 302/512 [03:15<01:06,  3.18it/s, est. speed input: 1660.37 toks/s, output: 395.37 toks/s]



Processed prompts: 100%|██████████| 512/512 [04:59<00:00,  1.71it/s, est. speed input: 1844.72 toks/s, output: 437.59 toks/s]

IlyaGusev/saiga_llama3_8b 0.4 11
Count: 83
Accuracy: 0.162
NaNs: 6





0.12406213407401646


Processed prompts:  40%|███▉      | 203/512 [02:50<06:39,  1.29s/it, est. speed input: 1499.12 toks/s, output: 305.34 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:30<00:00,  1.55it/s, est. speed input: 1906.73 toks/s, output: 397.08 toks/s]

IlyaGusev/saiga_llama3_8b 0.4 13
Count: 70
Accuracy: 0.137
NaNs: 3





0.12015075044909788


Processed prompts:  24%|██▍       | 122/512 [01:55<05:14,  1.24it/s, est. speed input: 1449.41 toks/s, output: 269.96 toks/s]



Processed prompts: 100%|██████████| 512/512 [05:41<00:00,  1.50it/s, est. speed input: 2066.16 toks/s, output: 384.26 toks/s]

IlyaGusev/saiga_llama3_8b 0.4 15
Count: 95
Accuracy: 0.186
NaNs: 3





0.12673075957632834
CPU times: user 1h 25min 3s, sys: 1min 2s, total: 1h 26min 5s
Wall time: 1h 25min 51s


При генерации предсказаний для test-датасета полученный .jsonl-файл необходимо отправить на Codalab для оценки результатов.

In [8]:
%%time
# Final run on the `test` split with a single configuration; predictions are
# written to <path>.jsonl for submission rather than scored locally.
num_shots = [13]
temperatures = [0.2]
model_names = ["IlyaGusev/saiga_llama3_8b"]

for model_name in model_names:
    # NOTE(review): this builds a new engine even if an `llm` from an earlier cell
    # is still alive; nothing visible here releases it — confirm GPU memory
    # suffices when both cells are run in the same kernel session.
    # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
    llm = LLM(model=model_name, dtype='float16')
    
    for temperature in temperatures:
        # https://docs.vllm.ai/en/latest/getting_started/quickstart.html
        # https://github.com/vllm-project/vllm/blob/main/vllm/sampling_params.py
        sampling_params = SamplingParams(temperature=temperature,
                                         top_p=0.9,
                                         max_tokens = 256,
                                         seed = SEED)
        
        for n_shots in num_shots:
            # Extension-less output path; save() appends '.jsonl'.
            path = get_path(model_name, sampling_params, n_shots)
            # list of (sent_id, text, target, pred) tuples
            result = run(llm, sampling_params, train, test, n_shots)
            # Same columns as in `result`, with the prediction passed through str2list.
            output = pd.DataFrame([(x[0], x[1], x[2], str2list(x[3])) for x in result],
                      columns = ['sent_id', 'text', 'target', 'pred'])
            # Convert the frame into the structure that is written to disk.
            output = df2structure(output)
            print(model_name, temperature, n_shots)
            print(path)
            save(output, path)

INFO 11-20 00:10:57 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='IlyaGusev/saiga_llama3_8b', speculative_config=None, tokenizer='IlyaGusev/saiga_llama3_8b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=IlyaGusev/saiga_llama3_8b, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 11-20 00:10:59 selector.py:217] Cannot use FlashAttention-2 bac

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 11-20 00:11:01 model_runner.py:879] Starting to load model IlyaGusev/saiga_llama3_8b...
INFO 11-20 00:11:01 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 11-20 00:11:01 selector.py:116] Using XFormers backend.
INFO 11-20 00:11:02 weight_utils.py:236] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:02<00:07,  2.64s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.52s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:05<00:02,  2.01s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:08<00:00,  2.38s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:08<00:00,  2.23s/it]



INFO 11-20 00:13:22 model_runner.py:890] Loading model weights took 14.9595 GB
INFO 11-20 00:13:26 gpu_executor.py:121] # GPU blocks: 5765, # CPU blocks: 2048
INFO 11-20 00:13:28 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-20 00:13:28 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-20 00:13:52 model_runner.py:1300] Graph capturing finished in 24 secs.


Processed prompts:   0%|          | 0/1316 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  25%|██▌       | 335/1316 [04:05<13:46,  1.19it/s, est. speed input: 1687.98 toks/s, output: 349.80 toks/s]



Processed prompts:  61%|██████    | 806/1316 [08:49<03:04,  2.77it/s, est. speed input: 1864.42 toks/s, output: 389.41 toks/s]



Processed prompts:  91%|█████████▏| 1204/1316 [13:10<00:50,  2.23it/s, est. speed input: 1875.81 toks/s, output: 389.90 toks/s]



Processed prompts: 100%|██████████| 1316/1316 [14:00<00:00,  1.57it/s, est. speed input: 1927.63 toks/s, output: 400.85 toks/s]


IlyaGusev/saiga_llama3_8b 0.2 13
results/saiga_llama3_8b/saiga_llama3_8b_bl_13shot_0.2temp
CPU times: user 15min 34s, sys: 49.7 s, total: 16min 24s
Wall time: 17min 1s
