In [1]:
!nvidia-smi

Sun Dec 14 19:06:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 591.44                 Driver Version: 591.44         CUDA Version: 13.1     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   51C    P8             13W /  290W |    3789MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [2]:
import random
from typing import List, Dict

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from tqdm import tqdm


# Основная суть в том, чтобы подготовить вручную input_ids, labels так, чтобы все влезало в ограниченный контекст + labels были -100 на токенах пользователя
# Взято из репозитория saiga

class ChatDataset(Dataset):
    def __init__(
        self,
        original_records: List[Dict],
        tokenizer: AutoTokenizer,
        max_tokens_count: int,
        sample_rate: float = 1.0,
        only_target_loss: bool = True,
        add_global_bos: bool = True,
        add_global_eos: bool = True,
        labels_pad_token_id: int = -100,
        shuffle=False
    ):
        self.original_records = original_records
        self.sample_rate = sample_rate
        self.tokenizer = tokenizer
        self.max_tokens_count = max_tokens_count
        self.only_target_loss = only_target_loss
        self.labels_pad_token_id = labels_pad_token_id
        self.add_global_bos = add_global_bos
        self.add_global_eos = add_global_eos
        self.is_printed = False

        self.records = []
        for record in tqdm(original_records):
            record_str = self.tokenizer.apply_chat_template(record["messages"], tokenize=False, add_generation_prompt=False)
            if len(record_str) > 4 * self.max_tokens_count:
                continue

            if random.random() > self.sample_rate:
                continue
            tensors = self.convert_record(record)
            if tensors is None:
                continue
            self.records.append(tensors)

        if shuffle:
            random.shuffle(self.records)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, index):
        return self.records[index]

    def get_tokens(self, messages):
        #print(messages)
        tokens = self.tokenizer.apply_chat_template(
            messages,
            add_special_tokens=False,
            tokenize=True,
            add_generation_prompt=False,
        )
        if tokens[0] == self.tokenizer.bos_token_id:
            tokens = tokens[1:]
        return tokens

    def convert_record(self, record):
        input_ids, labels = [], []

        for i, message in enumerate(record["messages"]):
            if message['role'] == 'bot':
                message['role'] = 'assistant'
                record["messages"][i]['role'] = 'assistant'

            message_input_ids = self.get_tokens([message])
            message_labels = message_input_ids
            if len(input_ids) + len(message_input_ids) > self.max_tokens_count - 2:
                break

            labels_mask = [
                self.labels_pad_token_id for _ in range(len(message_input_ids))
            ]
            if (
                message["role"] not in ("assistant", "bot", "gpt")
                and self.only_target_loss
            ):
                message_labels = labels_mask

            input_ids.extend(message_input_ids)
            labels.extend(message_labels)

        if not input_ids:
            return None

        original_input_ids = self.get_tokens(record["messages"])
        if input_ids != original_input_ids[: len(input_ids)]:
            print(input_ids)
            print(original_input_ids[: len(input_ids)])
        assert input_ids == original_input_ids[: len(input_ids)]

        if self.add_global_bos and input_ids[0] != self.tokenizer.bos_token_id:
            input_ids.insert(0, self.tokenizer.bos_token_id)
            labels.insert(0, self.labels_pad_token_id)

        if input_ids[-2] == self.tokenizer.eos_token_id:
            input_ids = input_ids[:-1]
            labels = labels[:-1]

        if self.add_global_eos and input_ids[-1] != self.tokenizer.eos_token_id:
            input_ids.append(self.tokenizer.eos_token_id)
            labels.append(self.tokenizer.eos_token_id)

        if not self.is_printed:
            print(input_ids)
            print(labels)
            print(
                "Full prompt:" +
                self.tokenizer.decode(input_ids, skip_special_tokens=False)
            )
            #assert '\n' in self.tokenizer.decode(input_ids, skip_special_tokens=False)
            self.is_printed = True

        if len([i for i in labels if i != -100]) == 0:
            return None

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.new_ones(input_ids.size())
        assert (
            input_ids.size(0)
            == labels.size(0)
            == attention_mask.size(0)
            <= self.max_tokens_count
        )
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

In [25]:
import torch
import torch._dynamo
torch.set_float32_matmul_precision('high')
torch._dynamo.config.suppress_errors = True
torch._dynamo.disable()

from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    DataCollatorForTokenClassification,
    GenerationConfig, 
    TrainingArguments, 
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
import random
from typing import List, Dict
from tqdm import tqdm
warnings.filterwarnings("ignore")
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from peft import get_peft_model, LoraConfig
import re
from peft import prepare_model_for_kbit_training

In [4]:
dataset = load_dataset('Aniemore/cedr-m7')
dataset_train = dataset['train'].select(range(1000))
dataset_test = dataset['test'].select(range(100))
dataset_train[0]

{'text': 'Суровый гималайский медведь .',
 'labels': ['neutral'],
 'source': 'lj',
 'label2ids': [0]}

In [5]:
label_names = ['anger', 'disgust', 'enthusiasm', 'fear', 'happiness', 'neutral', 'sadness']

In [6]:
train_labels = [ex["labels"][0] for ex in dataset_train]
test_labels = [ex["labels"][0] for ex in dataset_test]

In [7]:
from collections import Counter
print("Train distribution:", Counter(train_labels))
print("Test distribution:", Counter(test_labels))

Train distribution: Counter({'neutral': 389, 'happiness': 205, 'sadness': 205, 'enthusiasm': 92, 'fear': 68, 'anger': 36, 'disgust': 5})
Test distribution: Counter({'neutral': 35, 'sadness': 22, 'enthusiasm': 15, 'happiness': 13, 'fear': 8, 'anger': 6, 'disgust': 1})


In [8]:
labels_str = ", ".join(label_names)

In [9]:
def sample2messages(sample):
    text = sample['text']
    prompt = f"""Определи эмоцию в следующем тексте.
Текст: "{text}"
Эмоция (выбери только одно из: {labels_str}):"""
    return [{'role': 'user', 'content': prompt}]

In [10]:
def sample2messages_with_label(sample):
    text = sample['text']
    
    if isinstance(sample['labels'], list):
        label = sample['labels'][0] if len(sample['labels']) > 0 else 'neutral'
    else:
        label = sample['labels']
    
    prompt = f"""Определи эмоцию в следующем тексте.
Текст: "{text}"
Эмоция (выбери только одно из: {labels_str}):"""
    
    return [
        {'role': 'user', 'content': prompt},
        {'role': 'assistant', 'content': label}
    ]

In [11]:
instruct_model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)
bos_token = tokenizer.bos_token
eos_token = tokenizer.eos_token
pad_token = tokenizer.pad_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
## чат темплейт у Qwen несколько странный в том плане, что он форсит системный промпт всегда, даже, когда мы этого не просим. Это осложняет некоторые процессы подготовки данных, когда требуется токенизация элементов диалога последовательно
## поэтому как вариант берем от RuadaptQwen2.5, который получен путем фикса этого случая (фикс не полный, с tool calling я не фиксил)
tokenizer = AutoTokenizer.from_pretrained('RefalMachine/RuadaptQwen2.5-1.5B-instruct')
chat_template = tokenizer.chat_template
chat_template

Exception: data did not match any variant of untagged enum ModelWrapper at line 724962 column 3

In [13]:
with open('ruadapt_chat_template.txt', 'r', encoding='utf-8') as f:
    chat_template = f.read()

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", trust_remote_code=True)
tokenizer.chat_template = chat_template
messages = [{"role": "user", "content": "Привет!"}]
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<|im_start|>user
Привет!<|im_end|>
<|im_start|>assistant



In [14]:
model_name = 'Qwen/Qwen2.5-1.5B'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token = bos_token
tokenizer.eos_token = eos_token
tokenizer.pad_token = pad_token
tokenizer.chat_template = chat_template
tokenizer.padding_side = 'left'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
def prepare_records(dataset):
    records = []
    for row in dataset:
        label_list = row['labels']
        
        messages = sample2messages_with_label({
            'text': row['text'],
            'labels': label_list
        })
        records.append({"messages": messages})
    return records

train_records = prepare_records(dataset_train)
test_records = prepare_records(dataset_test)

In [21]:
print(train_records)

[{'messages': [{'role': 'user', 'content': 'Определи эмоцию в следующем тексте.\nТекст: "Суровый гималайский медведь ."\nЭмоция (выбери только одно из: anger, disgust, enthusiasm, fear, happiness, neutral, sadness):'}, {'role': 'assistant', 'content': 'neutral'}]}, {'messages': [{'role': 'user', 'content': 'Определи эмоцию в следующем тексте.\nТекст: "Так, обнаружено несколько проблем с дисплеем (выгорание, странные зеленые полосы), дефекты динамика и некорректная работа Face ID."\nЭмоция (выбери только одно из: anger, disgust, enthusiasm, fear, happiness, neutral, sadness):'}, {'role': 'assistant', 'content': 'neutral'}]}, {'messages': [{'role': 'user', 'content': 'Определи эмоцию в следующем тексте.\nТекст: "У меня остается только один вопрос - является ли этот приступ отчаяния ( а точнее приступ удивления , почему мне не становится лучше , почему мне ничего не помогает ) еще одним испытанием ?"\nЭмоция (выбери только одно из: anger, disgust, enthusiasm, fear, happiness, neutral, sad

In [22]:
only_target_loss = True
max_tokens_count = 1024
datasets = []
for records in (train_records, test_records):
    datasets.append(
        ChatDataset(
            records,
            tokenizer,
            max_tokens_count=max_tokens_count,
            sample_rate=1.0,
            only_target_loss=only_target_loss,
            add_global_eos=False,
            add_global_bos=False
        )
    )
train_dataset, val_dataset = datasets

 18%|█████████████▌                                                               | 176/1000 [00:00<00:00, 1759.93it/s]

[151644, 872, 198, 20353, 8005, 42975, 60542, 20928, 126242, 133126, 5805, 92029, 71019, 10090, 70895, 1504, 624, 33995, 14949, 6597, 25, 330, 19311, 3780, 23862, 34623, 24725, 16104, 15952, 18530, 126302, 127620, 35650, 4824, 659, 698, 92211, 126242, 70729, 320, 126405, 9923, 125340, 73626, 131550, 23064, 25, 19234, 11, 67062, 11, 35132, 11, 8679, 11, 23009, 11, 20628, 11, 50878, 1648, 151645, 198, 151644, 77091, 198, 59568, 151645]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 151644, 77091, 198, 59568, 151645]
Full prompt:<|im_start|>user
Определи эмоцию в следующем тексте.
Текст: "Суровый гималайский медведь ."
Эмоция (выбери только одно из: anger, disgus

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2110.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1998.06it/s]

[151644, 872, 198, 20353, 8005, 42975, 60542, 20928, 126242, 133126, 5805, 92029, 71019, 10090, 70895, 1504, 624, 33995, 14949, 6597, 25, 330, 20195, 1456, 130699, 7665, 48807, 38180, 5805, 49845, 12228, 129402, 1959, 67879, 131991, 130699, 137200, 12150, 46195, 23064, 10885, 7819, 659, 698, 92211, 126242, 70729, 320, 126405, 9923, 125340, 73626, 131550, 23064, 25, 19234, 11, 67062, 11, 35132, 11, 8679, 11, 23009, 11, 20628, 11, 50878, 1648, 151645, 198, 151644, 77091, 198, 59568, 151645]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 151644, 77091, 198, 59568, 151645]
Full prompt:<|im_start|>user
Определи эмоцию




In [23]:
test_messages = sample2messages_with_label(dataset_train[0])
print(tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=False))

<|im_start|>user
Определи эмоцию в следующем тексте.
Текст: "Суровый гималайский медведь ."
Эмоция (выбери только одно из: anger, disgust, enthusiasm, fear, happiness, neutral, sadness):<|im_end|>
<|im_start|>assistant
neutral<|im_end|>



In [26]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=f"cuda:0",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
    use_cache=False
)
prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear4bit(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNo

In [27]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear4bit(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNo

In [28]:
from transformers import GenerationConfig

def generate(messages, model, tokenizer, generation_config):
    print(tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False, add_generation_prompt=True))
    input_ids = tokenizer.apply_chat_template(messages, return_tensors='pt', add_special_tokens=False, add_generation_prompt=True)
    input_ids = input_ids.to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            generation_config=generation_config
        )
    outputs = []
    for sample_output_ids, sample_input_ids in zip(output_ids, input_ids):
        sample_output_ids = sample_output_ids[len(sample_input_ids):]
        sample_output = tokenizer.decode(sample_output_ids, skip_special_tokens=True)
        outputs.append(sample_output)

    if len(outputs) == 1:
        outputs = outputs[0]
    return outputs



generation_config = GenerationConfig.from_dict(
    {
        'top_k': 40,
        'top_p': 0.9,
        'temperature': 0.2,
        'repetition_penalty': 1.0,
        'max_new_tokens': 64,
        'do_sample': True,
        'pad_token_id': tokenizer.pad_token_id,
        'bos_token_id': tokenizer.bos_token_id,
        'eos_token_id': tokenizer.eos_token_id
    }
)
generation_config

GenerationConfig {
  "do_sample": true,
  "eos_token_id": 151645,
  "max_new_tokens": 64,
  "pad_token_id": 151643,
  "temperature": 0.2,
  "top_k": 40,
  "top_p": 0.9
}

In [29]:
generate(messages, model, tokenizer, generation_config)

<|im_start|>user
Привет!<|im_end|>
<|im_start|>assistant



'Привет! остальному пользователю\nПривет! остальным пользователям\nПривет! остальным пользователям\nПривет! остальным пользователям\nПривет! остальным пользователям\nПривет! остальным пользователям\nПривет! остальным пользователям\n'

In [30]:
lora_config = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0.0,
    "bias": "none",
    "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"]
}
lora_config = LoraConfig(**lora_config)
lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False, r=16, target_modules={'v_proj', 'o_proj', 'q_proj', 'k_proj'}, lora_alpha=16, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [31]:
lora_config.modules_to_save

In [32]:
model = get_peft_model(model, lora_config)
if model.config.tie_word_embeddings and lora_config.modules_to_save is not None and 'lm_head' in lora_config.modules_to_save:
    print('Tie embeddings')
    assert 'embed_tokens' not in lora_config.modules_to_save
    model.base_model.model.model.embed_tokens.weight = model.base_model.model.lm_head.modules_to_save["default"].weight

In [33]:
model

PeftModel(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
              

In [34]:
training_args = {
    "eval_strategy": "steps",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "eval_steps": 16,
    "save_steps": 128,
    "logging_steps": 1,
    "learning_rate": 0.00005,
    "num_train_epochs": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 16,
    "bf16": False,
    "fp16": True,
    "optim": "paged_adamw_8bit", # тут отличие
    "save_total_limit": 1,
    "gradient_checkpointing": True,
    "seed": 1337,
    "max_grad_norm": 1.0,
    "weight_decay": 0.05
}
training_args = TrainingArguments(output_dir='./instruct', **training_args)

In [35]:
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
if len(trainer.label_names) == 0:
    trainer.label_names.append('labels')

In [36]:
trainer.train()

Step,Training Loss,Validation Loss
16,9.632,9.579678
32,5.9837,5.921026
48,4.1494,4.146464
64,3.7132,3.596321
80,3.3242,3.369171
96,3.2388,3.277082
112,3.205,3.261899


TrainOutput(global_step=125, training_loss=5.103595802307129, metrics={'train_runtime': 535.7811, 'train_samples_per_second': 1.866, 'train_steps_per_second': 0.233, 'total_flos': 721801167618048.0, 'train_loss': 5.103595802307129, 'epoch': 1.0})

In [37]:
model.save_pretrained("./emotion_lora_instruct")
tokenizer.save_pretrained("./emotion_lora_instruct")

('./emotion_lora_instruct\\tokenizer_config.json',
 './emotion_lora_instruct\\special_tokens_map.json',
 './emotion_lora_instruct\\vocab.json',
 './emotion_lora_instruct\\merges.txt',
 './emotion_lora_instruct\\added_tokens.json',
 './emotion_lora_instruct\\tokenizer.json')

In [51]:
import gc, torch
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel

label_names = ['anger', 'disgust', 'enthusiasm', 'fear', 'happiness', 'neutral', 'sadness']
labels_str = ", ".join(label_names)

In [53]:
def generate_prediction(model, tokenizer, sample):
    messages = sample2messages(sample)
    
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True,
        truncation=True,
        max_length=512
    ).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=10,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(
        outputs[0][inputs.shape[-1]:],
        skip_special_tokens=True
    )
    return response.strip()

In [54]:
def extract_emotion(pred_raw, label_names):
    pred_raw = pred_raw.lower().strip()
    for label in label_names:
        if label.lower() in pred_raw:
            return label
    return label_names[0]

In [57]:
for obj_name in list(globals().keys()):
    if obj_name.startswith("model"):
        del globals()[obj_name]
gc.collect()
torch.cuda.empty_cache()

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_baseline = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)
model_baseline.eval()

test_sample = dataset_test.select(range(100))

predictions_baseline = []
ground_truth = []

for sample in test_sample:
    pred_raw = generate_prediction(model_baseline, tokenizer, sample)
    pred = extract_emotion(pred_raw, label_names)
    predictions_baseline.append(pred)
    ground_truth.append(sample['labels'][0])

print("\nРезультаты базовой модели:")
acc_baseline = accuracy_score(ground_truth, predictions_baseline)
print(f"Accuracy: {acc_baseline:.4f}\n")
print(classification_report(
    ground_truth,
    predictions_baseline,
    labels=label_names,
    zero_division=0
))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Результаты базовой модели:
Accuracy: 0.3700

              precision    recall  f1-score   support

       anger       0.19      0.67      0.30         6
     disgust       0.00      0.00      0.00         1
  enthusiasm       0.33      0.13      0.19        15
        fear       0.23      1.00      0.37         8
   happiness       0.88      0.54      0.67        13
     neutral       0.48      0.31      0.38        35
     sadness       0.71      0.23      0.34        22

    accuracy                           0.37       100
   macro avg       0.40      0.41      0.32       100
weighted avg       0.52      0.37      0.37       100



In [58]:
del model_baseline
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_name)

model_fine_tuned_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)

model_fine_tuned = PeftModel.from_pretrained(
    model_fine_tuned_base,
    "./emotion_lora_instruct"
)
model_fine_tuned.eval()

predictions_fine_tuned = []
ground_truth_ft = []

for sample in test_sample:
    pred_raw = generate_prediction(model_fine_tuned, tokenizer, sample)
    pred = extract_emotion(pred_raw, label_names)
    predictions_fine_tuned.append(pred)
    ground_truth_ft.append(sample['labels'][0])

print("\nРезультаты fine-tuned модели:")
acc_fine_tuned = accuracy_score(ground_truth_ft, predictions_fine_tuned)
print(f"Accuracy: {acc_fine_tuned:.4f}\n")
print(classification_report(
    ground_truth_ft,
    predictions_fine_tuned,
    labels=label_names,
    zero_division=0
))

print(f"Baseline Accuracy:    {acc_baseline:.4f}")
print(f"Fine-tuned Accuracy:  {acc_fine_tuned:.4f}")
print(f"Улучшение:            {(acc_fine_tuned - acc_baseline)*100:+.2f}%")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Результаты fine-tuned модели:
Accuracy: 0.5100

              precision    recall  f1-score   support

       anger       1.00      0.17      0.29         6
     disgust       0.00      0.00      0.00         1
  enthusiasm       0.00      0.00      0.00        15
        fear       0.29      0.50      0.36         8
   happiness       0.88      0.54      0.67        13
     neutral       0.51      0.60      0.55        35
     sadness       0.58      0.82      0.68        22

    accuracy                           0.51       100
   macro avg       0.46      0.37      0.36       100
weighted avg       0.50      0.51      0.48       100

Baseline Accuracy:    0.3700
Fine-tuned Accuracy:  0.5100
Улучшение:            +14.00%
