In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [3]:
import pandas as pd
from transformers import (set_seed, AutoConfig, AutoModelForCausalLM,
                          AutoTokenizer, 
                            BitsAndBytesConfig)
import torch
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [4]:
device = "cuda"
model_name = "yandex/YandexGPT-5-Lite-8B-instruct"

In [5]:
train = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="train")

In [6]:
emotion_cols = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
emotion_map = {
    'anger': 'гнев',
    'disgust': 'отвращение', 
    'fear': 'страх',
    'joy': 'радость',
    'sadness': 'грусть',
    'surprise': 'удивление'
}

In [7]:
def create_labels(examples):
    labels = []
    for i in range(len(examples['text'])):
        label = [examples[col][i] for col in emotion_cols]
        labels.append(label)
    examples['labels'] = labels
    return examples

train = train.map(create_labels, batched=True)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_cache=False,
    trust_remote_code=True,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
tokenizer.pad_token = tokenizer.eos_token

In [10]:
test = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="test")

In [11]:
seed = 42
set_seed(seed)

In [12]:
def create_prompt(query_text):
    prompt = f"""Ты эксперт по анализу эмоций в тексте. 
    Определи, какие эмоции выражены в тексте из списка [гнев, отвращение, страх, радость, грусть, удивление.]
    Эмоций может быть несколько, а может и вовсе не быть. Формат вывода: только названия эмоций. Если нет эмоций, то оставь пустой список.

    ВАЖНЫЕ ПРАВИЛА:
    - выбирай эмоцию, ТОЛЬКО если она выражена ЯВНО через конкретные слова, фразы или контекст
    - НЕ додумывай скрытые эмоции - только то, что написано прямо
    - При сомнениях не выбирай эмоцию
    
    Пример вывода: [гнев]
    """
    prompt += f"Проанализируй этот текст по этим критериям:\n"
    prompt += f"Текст: {query_text}\n"
    prompt += f"Ответ:"
    
    return prompt


In [13]:
responses = []
prompts = []

for row in tqdm(test):
    labels = []
    
    prompt = create_prompt(
        row['text'], 
    )
    
    prompts.append(prompt)
    
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    
    input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(
        input_ids=input_ids,
        temperature=0.1,
        do_sample=True,
        top_k=1,
        top_p=0.9,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_text = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    generated_ids = outputs[0][input_ids.shape[1]:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    responses.append(response)

  0%|          | 0/2000 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 2000/2000 [09:09<00:00,  3.64it/s]


In [14]:
true_emotions = []

In [15]:
for i, row in tqdm(enumerate(test)):
    true_emotion = [row[col] for col in emotion_cols]
    true_emotions.append(true_emotion)

2000it [00:00, 12765.83it/s]


In [16]:
emotion_values = list(emotion_map.values())

def parse_response_to_binary(response: str) -> list[int]:
    response = response.lower()
    return [1 if emo in response else 0 for emo in emotion_values]

In [17]:
pred_emotions = [parse_response_to_binary(response) for response in responses]

In [18]:
for average in ['micro', 'macro']:
    recall = recall_score(true_emotions, pred_emotions, average=average, zero_division=0)
    precision = precision_score(true_emotions, pred_emotions, average=average, zero_division=0)
    f1 = f1_score(true_emotions, pred_emotions, average=average, zero_division=0)
    print(f'{average.upper()} recall: {recall:.4f}, precision: {precision:.4f}, f1: {f1:.4f}')

MICRO recall: 0.7996, precision: 0.8902, f1: 0.8425
MACRO recall: 0.7794, precision: 0.9096, f1: 0.8281


In [19]:
class_recall = recall_score(true_emotions, pred_emotions, average=None, zero_division=0)
class_precision = precision_score(true_emotions, pred_emotions, average=None, zero_division=0)
class_f1 = f1_score(true_emotions, pred_emotions, average=None, zero_division=0)

for i, (eng_emotion, rus_emotion) in enumerate(emotion_map.items()):
    print(f'{rus_emotion}: recall: {class_recall[i]:.4f}, precision: {class_precision[i]:.4f}, f1: {class_f1[i]:.4f}')

гнев: recall: 0.8982, precision: 0.8712, f1: 0.8845
отвращение: recall: 0.6967, precision: 1.0000, f1: 0.8213
страх: recall: 0.8889, precision: 0.9412, f1: 0.9143
радость: recall: 0.8860, precision: 0.8814, f1: 0.8837
грусть: recall: 0.7943, precision: 0.7943, f1: 0.7943
удивление: recall: 0.5122, precision: 0.9692, f1: 0.6702
