In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

In [None]:
import re

def super_clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    
    # 1. –¢—Ä–∞–Ω—Å–ª–∏—Ç, –∫–∞–∫ –∏ –±—ã–ª–æ
    cyrillic_map = str.maketrans("abvgdeziklmnoprstufhcy", "–∞–±–≤–≥–¥–µ–∑–∏–∫–ª–º–Ω–æ–ø—Ä—Å—Ç—É—Ñ—Ö—Ü—ã")
    text = text.translate(cyrillic_map)
    text = text.replace('blia', '–±–ª—è').replace('blya', '–±–ª—è').replace('ebat', '–µ–±–∞—Ç—å')

    # 2. –ê–≥—Ä–µ—Å—Å–∏–≤–Ω–æ–µ —É–¥–∞–ª–µ–Ω–∏–µ —Ä–∞–∑–¥–µ–ª–∏—Ç–µ–ª–µ–π –º–µ–∂–¥—É –±—É–∫–≤–∞–º–∏
    # –≠—Ç–æ –Ω–∞–π–¥–µ—Ç "–± –ª —è —Ç —å", "–±.–ª.—è.—Ç.—å", "–±-–ª.—è —Ç.—å" –∏ —Ç.–¥.
    # –ú—ã –∏—â–µ–º –±—É–∫–≤—É, –∑–∞ –∫–æ—Ç–æ—Ä–æ–π —Å–ª–µ–¥—É–µ—Ç –¥–æ 3-—Ö —Å–∏–º–≤–æ–ª–æ–≤-—Ä–∞–∑–¥–µ–ª–∏—Ç–µ–ª–µ–π, –∏ –ø–æ—Ç–æ–º —Å–Ω–æ–≤–∞ –±—É–∫–≤–∞.
    # –ò —Ç–∞–∫ –ø–æ–≤—Ç–æ—Ä—è–µ–º.
    def merge_separated_letters(match):
        return match.group(0).replace('.', '').replace(' ', '').replace('-', '')

    text = re.sub(r'([–∞-—è—ë])([\s\.\-]+[–∞-—è—ë]){2,}', merge_separated_letters, text)

    # 3. –ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è "–µ" –∏ "—ë"
    text = text.replace('—ë', '–µ')
    
    # 4. –£–¥–∞–ª–µ–Ω–∏–µ –ø–æ–≤—Ç–æ—Ä—è—é—â–∏—Ö—Å—è –±—É–∫–≤ (—Ä–∞—Å—Ç—è–Ω—É—Ç–æ—Å—Ç—å)
    text = re.sub(r'([–∞-—è])\1+', r'\1', text)
    
    # 5. –ë–æ–ª–µ–µ —É–º–Ω—ã–µ –∑–∞–º–µ–Ω—ã –∑–≤–µ–∑–¥–æ—á–µ–∫ –∏ –ø—Ä–æ–ø—É—Å–∫–æ–≤.
    # –ó–∞–º–µ–Ω—è–µ–º –≤—Å–µ, —á—Ç–æ –ø–æ—Ö–æ–∂–µ –Ω–∞ –º–∞—Ç, –Ω–∞ –µ–≥–æ –∫–æ—Ä–µ–Ω—å.
    text = re.sub(r'—Ö[—É–µ—ë*@#$ ]{1,5}[–π—è–∏—é–µ]', '—Ö—É–π', text)
    text = re.sub(r'–ø[–∏–µ—ë*@#$ ]{1,5}[–∑—Å][–¥]', '–ø–∏–∑–¥', text)
    text = re.sub(r'[–µ—ë][–±*@#$ ]{1,5}[–∞–æ—É—è]', '–µ–±', text)
    text = re.sub(r'–±[–ª*@#$ ]{1,5}[—è]', '–±–ª—è', text)
    text = re.sub(r'–º[—É*@#$ ]{1,5}[–¥][–∞–µ–∏–æ]', '–º—É–¥', text)
    
    return text


In [None]:
import random

def augment_profanity(df):
    profanity_samples = df[(df['label'] == 1) & (df['text'].str.contains(r'—Ö—É–π|–ø–∏–∑–¥|–µ–±|–±–ª—è|–º—É–¥'))].copy()
    if profanity_samples.empty:
        return df

    new_rows = []
    
    for _, row in profanity_samples.iterrows():
        text = row['text']
        
        # –í–∞—Ä–∏–∞–Ω—Ç 1: –†–∞—Å—Ç—è–≥–∏–≤–∞–Ω–∏–µ –±—É–∫–≤
        if random.random() < 0.5:
            words = text.split()
            if not words: continue
            word_to_stretch = random.choice(words)
            if len(word_to_stretch) > 2:
                char_to_stretch = random.choice(word_to_stretch)
                stretched_word = word_to_stretch.replace(char_to_stretch, char_to_stretch * random.randint(3, 7))
                new_text = text.replace(word_to_stretch, stretched_word)
                new_rows.append({'text': new_text, 'label': 1})

        # –í–∞—Ä–∏–∞–Ω—Ç 2: –í—Å—Ç–∞–≤–∫–∞ —Ä–∞–∑–¥–µ–ª–∏—Ç–µ–ª–µ–π
        if random.random() < 0.5:
            words = text.split()
            if not words: continue
            word_to_separate = random.choice(words)
            if len(word_to_separate) > 2:
                separator = random.choice(['.', ' ', '-', '*'])
                separated_word = separator.join(list(word_to_separate))
                new_text = text.replace(word_to_separate, separated_word)
                new_rows.append({'text': new_text, 'label': 1})

    if not new_rows:
        return df
        
    augmented_df = pd.DataFrame(new_rows)
    return pd.concat([df, augmented_df], ignore_index=True)


In [None]:
# --- 1. –ù–∞—Å—Ç—Ä–æ–π–∫–∏ –∏ –∑–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö -1--
MODEL_NAME = "cointegrated/rubert-tiny2" # –û–±—â–∞—è –º–æ–¥–µ–ª—å –¥–ª—è –≤—Å–µ—Ö, –±—ã—Å—Ç—Ä–∞—è –∏ –∫–∞—á–µ—Å—Ç–≤–µ–Ω–Ω–∞—è
MAX_LENGTH = 256 # –î–ª–∏–Ω–∞ —Ç–µ–∫—Å—Ç–∞, –º–æ–∂–Ω–æ —É–≤–µ–ª–∏—á–∏—Ç—å –¥–æ 512, –µ—Å–ª–∏ –æ—Ç–∑—ã–≤—ã –¥–ª–∏–Ω–Ω—ã–µ
BATCH_SIZE = 16 # –£–º–µ–Ω—å—à–∞–π—Ç–µ –¥–æ 8, –µ—Å–ª–∏ –Ω–µ —Ö–≤–∞—Ç–∞–µ—Ç –≤–∏–¥–µ–æ–ø–∞–º—è—Ç–∏
EPOCHS = 1 # 1 —ç–ø–æ—Ö–∞ - –¥–ª—è —Ñ–∞–π–Ω-—Ç—å—é–Ω–∞

print("–ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")
train_df = pd.read_csv('train.csv')
#test_df = pd.read_csv('test.csv')

# –®–ê–ì 1: –ê–£–ì–ú–ï–ù–¢–ê–¶–ò–Ø
# –°–Ω–∞—á–∞–ª–∞ –∞—É–≥–º–µ–Ω—Ç–∏—Ä—É–µ–º –Ω–∞ "—á–∏—Å—Ç—ã—Ö" –¥–∞–Ω–Ω—ã—Ö, —á—Ç–æ–±—ã —Å–æ–∑–¥–∞—Ç—å "–≥—Ä—è–∑–Ω—ã–µ" –ø—Ä–∏–º–µ—Ä—ã
print("–ê—É–≥–º–µ–Ω—Ç–∞—Ü–∏—è –¥–∞–Ω–Ω—ã—Ö...")
# –ó–∞–ø—É—Å–∫–∞–µ–º –∞—É–≥–º–µ–Ω—Ç–∞—Ü–∏—é –Ω–µ—Å–∫–æ–ª—å–∫–æ —Ä–∞–∑, —á—Ç–æ–±—ã —Å–æ–∑–¥–∞—Ç—å –±–æ–ª—å—à–µ –ø—Ä–∏–º–µ—Ä–æ–≤
augmented_train_df = train_df.copy()
for i in range(3): # 3 –ø—Ä–æ—Ö–æ–¥–∞ –∞—É–≥–º–µ–Ω—Ç–∞—Ü–∏–∏
    print(f"–ü—Ä–æ—Ö–æ–¥ –∞—É–≥–º–µ–Ω—Ç–∞—Ü–∏–∏ ‚Ññ{i+1}")
    augmented_train_df = augment_profanity(augmented_train_df)
print(f"–†–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞ –ø–æ—Å–ª–µ –∞—É–≥–º–µ–Ω—Ç–∞—Ü–∏–∏: {len(augmented_train_df)}")

print("–ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ —Å—É–ø–µ—Ä-–æ—á–∏—Å—Ç–∫–∏ –∫ –¥–∞–Ω–Ω—ã–º...")
augmented_train_df['text'] = augmented_train_df['text'].apply(super_clean_text)

train_df = augmented_train_df

# –û—á–∏—Å—Ç–∫–∞ –æ—Ç –ø—É—Å—Ç—ã—Ö —Å—Ç—Ä–æ–∫ –Ω–∞ –≤—Å—è–∫–∏–π —Å–ª—É—á–∞–π
train_df.dropna(subset=['text'], inplace=True)
train_df['text'] = train_df['text'].astype(str)
#test_df['text'] = test_df['text'].astype(str)

print("–†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –Ω–∞ train/eval...")
# –°—Ç—Ä–∞—Ç–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω–æ–µ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –±–∞–ª–∞–Ω—Å–∞ –∫–ª–∞—Å—Å–æ–≤
train_subset_df, eval_df = train_test_split(
    train_df,
    test_size=0.15, # 15% –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏—é
    random_state=42,
    stratify=train_df['label']
)

# –û—Å—Ç–∞–≤–∏–º –ø–æ–ª–Ω—ã–π train_df –¥–ª—è —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è
# train_subset_df - –¥–ª—è –±—ã—Å—Ç—Ä–æ–π –ø—Ä–æ–≤–µ—Ä–∫–∏ –≥–∏–ø–æ—Ç–µ–∑

train_dataset = Dataset.from_pandas(train_subset_df)
eval_dataset = Dataset.from_pandas(eval_df)
full_train_dataset = Dataset.from_pandas(train_df) # –î–ª—è —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è
#test_dataset = Dataset.from_pan     das(test_df)

print("–ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH
    )

print("–¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è –¥–∞–Ω–Ω—ã—Ö...")
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
tokenized_full_train = full_train_dataset.map(tokenize_function, batched=True)
#tokenized_test = test_dataset.map(tokenize_function, batched=True)


–ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...
–ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ —Å—É–ø–µ—Ä-–æ—á–∏—Å—Ç–∫–∏ –∫ –¥–∞–Ω–Ω—ã–º...
–†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –Ω–∞ train/eval...
–ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞...
–¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è –¥–∞–Ω–Ω—ã—Ö...


Map:   0%|          | 0/204135 [00:00<?, ? examples/s]

Map:   0%|          | 0/36024 [00:00<?, ? examples/s]

Map:   0%|          | 0/240159 [00:00<?, ? examples/s]

In [4]:
# --- 2. –§—É–Ω–∫—Ü–∏—è –¥–ª—è –º–µ—Ç—Ä–∏–∫–∏ (–æ–±—â–∞—è –¥–ª—è –≤—Å–µ—Ö) ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average='binary') # 'binary' –¥–ª—è F1 –ø–æ –∫–ª–∞—Å—Å—É 1
    acc = accuracy_score(labels, predictions)
    return {"f1": f1, "accuracy": acc}

In [29]:
# --- 3. –û–±—â–∏–µ –∞—Ä–≥—É–º–µ–Ω—Ç—ã –¥–ª—è –æ–±—É—á–µ–Ω–∏—è ---
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True, # –û—á–µ–Ω—å –≤–∞–∂–Ω–æ!
    metric_for_best_model="f1", # –û–ø—Ç–∏–º–∏–∑–∏—Ä—É–µ–º—Å—è –ø–æ F1
    greater_is_better=True,
    fp16=torch.cuda.is_available(), # –ê–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–æ–µ –≤–∫–ª. —Å–º–µ—à–∞–Ω–Ω–æ–π —Ç–æ—á–Ω–æ—Å—Ç–∏, –µ—Å–ª–∏ –µ—Å—Ç—å GPU
)

In [30]:
# --- –ü–∞–π–ø–ª–∞–π–Ω 2: BERT —Å Weighted Loss ---
print("\n--- –ó–∞–ø—É—Å–∫ –ü–∞–π–ø–ª–∞–π–Ω–∞ 2: BERT —Å Weighted Loss ---")
import torch.nn as nn

# --- –†–∞—Å—á–µ—Ç –≤–µ—Å–æ–≤ –¥–ª—è –∫–ª–∞—Å—Å–æ–≤ ---
print("–†–∞—Å—á–µ—Ç –≤–µ—Å–æ–≤ –¥–ª—è –∫–ª–∞—Å—Å–æ–≤...")
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")
print(f"–í–µ—Å–∞ –¥–ª—è –∫–ª–∞—Å—Å–æ–≤: {class_weights_tensor}")


--- –ó–∞–ø—É—Å–∫ –ü–∞–π–ø–ª–∞–π–Ω–∞ 2: BERT —Å Weighted Loss ---
–†–∞—Å—á–µ—Ç –≤–µ—Å–æ–≤ –¥–ª—è –∫–ª–∞—Å—Å–æ–≤...
–í–µ—Å–∞ –¥–ª—è –∫–ª–∞—Å—Å–æ–≤: tensor([0.5706, 4.0434], device='cuda:0')


In [31]:
class WeightedLossTrainer(Trainer):
    # –î–æ–±–∞–≤–ª–µ–Ω **kwargs –¥–ª—è —Å–æ–≤–º–µ—Å—Ç–∏–º–æ—Å—Ç–∏ —Å –Ω–æ–≤—ã–º–∏ –≤–µ—Ä—Å–∏—è–º–∏ Trainer
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # –ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –º–µ—Ç–æ–∫
        labels = inputs.pop("labels")

        # –ü—Ä—è–º–æ–π –ø—Ä–æ—Ö–æ–¥ –º–æ–¥–µ–ª–∏
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ –≤–µ—Å–æ–≤ –∫–ª–∞—Å—Å–æ–≤ (–ø—Ä–µ–¥–ø–æ–ª–∞–≥–∞–µ—Ç—Å—è, —á—Ç–æ class_weights_tensor –æ–ø—Ä–µ–¥–µ–ª–µ–Ω)
        # –£–±–µ–¥–∏—Ç–µ—Å—å, —á—Ç–æ class_weights_tensor –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –Ω–∞ —Ç–æ–º –∂–µ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–µ (CPU/CUDA), —á—Ç–æ –∏ logits
        # class_weights_tensor = class_weights_tensor.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)

        # –í—ã—á–∏—Å–ª–µ–Ω–∏–µ –≤–∑–≤–µ—à–µ–Ω–Ω—ã—Ö –ø–æ—Ç–µ—Ä—å
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1)
        )

        return (loss, outputs) if return_outputs else loss

In [32]:
# --- –ö–∞—Å—Ç–æ–º–Ω—ã–π Trainer ---
import os


# –°–¥–µ–ª–∞–π—Ç–µ —Ç–∞–∫:
# --- –ò–ó–ú–ï–ù–ï–ù–ò–Ø –ó–î–ï–°–¨ ---
# –£–∫–∞–∑—ã–≤–∞–µ–º –æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω—ã–π –ø—É—Ç—å –∫–∞–∫ –∏ —Ä–∞–Ω—å—à–µ
relative_path = "./final_weighted_model-1"
# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –µ–≥–æ –≤ –ø–æ–ª–Ω—ã–π (–∞–±—Å–æ–ª—é—Ç–Ω—ã–π) –ø—É—Ç—å, –ø–æ–Ω—è—Ç–Ω—ã–π —Å–∏—Å—Ç–µ–º–µ
SAVED_MODEL_PATH = os.path.abspath(relative_path)

print(f"–ó–∞–≥—Ä—É–∑–∫–∞ —É–∂–µ –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏ –∏–∑ {SAVED_MODEL_PATH} –¥–ª—è –ø—Ä–æ–¥–æ–ª–∂–µ–Ω–∏—è –æ–±—É—á–µ–Ω–∏—è...")
# –¢–µ–ø–µ—Ä—å from_pretrained –ø–æ–π–º–µ—Ç, —á—Ç–æ —ç—Ç–æ –ª–æ–∫–∞–ª—å–Ω–∞—è –ø–∞–ø–∫–∞, –∞ –Ω–µ —Ä–µ–ø–æ–∑–∏—Ç–æ—Ä–∏–π –Ω–∞ —Ö–∞–±–µ
model_weighted = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL_PATH, num_labels=2, local_files_only=True)
trainer_weighted = WeightedLossTrainer(
    model=model_weighted,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

print("–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏...")
trainer_weighted.train()

print("–û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏ –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–∏...")


–ó–∞–≥—Ä—É–∑–∫–∞ —É–∂–µ –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏ –∏–∑ C:\Users\malan\PycharmProjects\PythonProject\final_weighted_model-1 –¥–ª—è –ø—Ä–æ–¥–æ–ª–∂–µ–Ω–∏—è –æ–±—É—á–µ–Ω–∏—è...
–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏...


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0493,0.076077,0.962342,0.990645
2,0.0144,0.101713,0.967495,0.991978


–û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏ –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–∏...


In [33]:
eval_results_weighted = trainer_weighted.evaluate()
print(f"–†–µ–∑—É–ª—å—Ç–∞—Ç—ã –≤–∞–ª–∏–¥–∞—Ü–∏–∏ Weighted Loss: {eval_results_weighted}")


–†–µ–∑—É–ª—å—Ç–∞—Ç—ã –≤–∞–ª–∏–¥–∞—Ü–∏–∏ Weighted Loss: {'eval_loss': 0.1017126590013504, 'eval_f1': 0.9674952198852772, 'eval_accuracy': 0.9919775705085498, 'eval_runtime': 60.5817, 'eval_samples_per_second': 594.635, 'eval_steps_per_second': 37.173, 'epoch': 2.0}


In [34]:
# --- –û–±—É—á–µ–Ω–∏–µ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ ---
print("–ü–µ—Ä–µ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö...")

# –°–æ–∑–¥–∞–µ–º –∫–æ–ø–∏—é –∞—Ä–≥—É–º–µ–Ω—Ç–æ–≤ –¥–ª—è —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è
final_training_args_dict = training_args.to_dict()

# --- –í–ù–û–°–ò–ú –ò–ó–ú–ï–ù–ï–ù–ò–Ø –î–õ–Ø –§–ò–ù–ê–õ–¨–ù–û–ì–û –û–ë–£–ß–ï–ù–ò–Ø ---
# –û—Ç–∫–ª—é—á–∞–µ–º –æ—Ü–µ–Ω–∫—É, —Ç.–∫. eval_dataset –Ω–µ –±—É–¥–µ—Ç
final_training_args_dict['eval_strategy'] = 'no'
# –¢–∞–∫–∂–µ –æ—Ç–∫–ª—é—á–∞–µ–º —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –ø–æ —ç–ø–æ—Ö–∞–º (—Å–æ—Ö—Ä–∞–Ω–∏–º –æ–¥–∏–Ω —Ä–∞–∑ –≤ –∫–æ–Ω—Ü–µ)
final_training_args_dict['save_strategy'] = 'no'
# –ò –∑–∞–≥—Ä—É–∑–∫—É –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏, —Ç.–∫. –Ω–µ—Ç –º–µ—Ç—Ä–∏–∫–∏ –¥–ª—è –≤—ã–±–æ—Ä–∞
final_training_args_dict['load_best_model_at_end'] = False

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º —Å–ª–æ–≤–∞—Ä—å –æ–±—Ä–∞—Ç–Ω–æ –≤ –æ–±—ä–µ–∫—Ç TrainingArguments
final_args = TrainingArguments(**final_training_args_dict)

# –°–æ–∑–¥–∞–µ–º —Ñ–∏–Ω–∞–ª—å–Ω—ã–π —Ç—Ä–µ–Ω–µ—Ä —Å –ù–û–í–´–ú–ò –∞—Ä–≥—É–º–µ–Ω—Ç–∞–º–∏
final_trainer_weighted = WeightedLossTrainer(
    model=model_weighted,   # –ò—Å–ø–æ–ª—å–∑—É–µ–º —É–∂–µ –¥–æ–æ–±—É—á–µ–Ω–Ω—É—é –º–æ–¥–µ–ª—å —Å –ø–µ—Ä–≤–æ–≥–æ —ç—Ç–∞–ø–∞!
    args=final_args,        # –ò—Å–ø–æ–ª—å–∑—É–µ–º –Ω–æ–≤—ã–µ –∞—Ä–≥—É–º–µ–Ω—Ç—ã –±–µ–∑ –≤–∞–ª–∏–¥–∞—Ü–∏–∏
    train_dataset=tokenized_full_train,
    # eval_dataset –∏ compute_metrics –∑–¥–µ—Å—å —É–∂–µ –Ω–µ –Ω—É–∂–Ω—ã
)

# –ó–∞–ø—É—Å–∫–∞–µ–º —Ñ–∏–Ω–∞–ª—å–Ω–æ–µ –¥–æ–æ–±—É—á–µ–Ω–∏–µ
print("–ó–∞–ø—É—Å–∫ —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –¥–æ–æ–±—É—á–µ–Ω–∏—è...")
final_trainer_weighted.train()

print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏...")
# –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –≤ –æ–¥–Ω—É –ø–∞–ø–∫—É
SAVE_PATH = "./final_model_after_cleaning-2"
final_trainer_weighted.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f"–§–∏–Ω–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ '{SAVE_PATH}'")
# --- –û–±—É—á–µ–Ω–∏–µ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ ---
# print("–ü–µ—Ä–µ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö...")
# final_trainer_weighted = WeightedLossTrainer(
#     model=model_weighted,
#     args=training_args,
#     train_dataset=tokenized_full_train,
# )
# final_trainer_weighted.train()

#print("–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –¥–ª—è test.csv...")
#predictions_w, _, _ = final_trainer_weighted.predict(tokenized_test)
#predicted_labels_w = np.argmax(predictions_w, axis=1)

#submission_weighted = pd.DataFrame({'id': test_df['id'], 'label': predicted_labels_w})
#submission_weighted.to_csv('submission_weighted.csv', index=False)
#print("–§–∞–π–ª submission_weighted.csv –≥–æ—Ç–æ–≤!")


–ü–µ—Ä–µ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö...
–ó–∞–ø—É—Å–∫ —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –¥–æ–æ–±—É—á–µ–Ω–∏—è...




Step,Training Loss
100,0.0274
200,0.0786
300,0.0512
400,0.0668
500,0.0174
600,0.0995
700,0.03
800,0.0225
900,0.0282
1000,0.0321


–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏...
–§–∏–Ω–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ './final_model_after_cleaning-2'


In [35]:
# # --- –û–±—É—á–µ–Ω–∏–µ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ ---
# print("–ü–µ—Ä–µ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å –≤–µ—Å–∞–º–∏ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö...")
#
# # –°–æ–∑–¥–∞–µ–º –∫–æ–ø–∏—é –∞—Ä–≥—É–º–µ–Ω—Ç–æ–≤ –¥–ª—è —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è
# final_training_args = training_args.to_dict()
#
# # –û—Ç–∫–ª—é—á–∞–µ–º –æ—Ü–µ–Ω–∫—É –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –ø–æ —à–∞–≥–∞–º, —Ç.–∫. eval_dataset –Ω–µ –±—É–¥–µ—Ç
# final_training_args['eval_strategy'] = 'no'
# final_training_args['save_strategy'] = 'no'
# # –¢–∞–∫–∂–µ –æ—Ç–∫–ª—é—á–∞–µ–º –∑–∞–≥—Ä—É–∑–∫—É –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏, —Ç.–∫. –Ω–µ—Ç –º–µ—Ç—Ä–∏–∫–∏ –¥–ª—è –≤—ã–±–æ—Ä–∞
# final_training_args['load_best_model_at_end'] = False
#
# # –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º —Å–ª–æ–≤–∞—Ä—å –æ–±—Ä–∞—Ç–Ω–æ –≤ TrainingArguments
# final_args = TrainingArguments(**final_training_args)
#
#
# # –°–æ–∑–¥–∞–µ–º —Ñ–∏–Ω–∞–ª—å–Ω—ã–π —Ç—Ä–µ–Ω–µ—Ä —Å –Ω–æ–≤—ã–º–∏ –∞—Ä–≥—É–º–µ–Ω—Ç–∞–º–∏
# final_trainer_weighted = WeightedLossTrainer(
#     model=model_weighted,   # –ò—Å–ø–æ–ª—å–∑—É–µ–º —É–∂–µ –¥–æ–æ–±—É—á–µ–Ω–Ω—É—é –∏ –ª—É—á—à—É—é –º–æ–¥–µ–ª—å —Å –ø–µ—Ä–≤–æ–≥–æ —ç—Ç–∞–ø–∞!
#     args=final_args,        # –ò—Å–ø–æ–ª—å–∑—É–µ–º –Ω–æ–≤—ã–µ –∞—Ä–≥—É–º–µ–Ω—Ç—ã –±–µ–∑ –≤–∞–ª–∏–¥–∞—Ü–∏–∏
#     train_dataset=tokenized_full_train,
#     # eval_dataset –∏ compute_metrics –∑–¥–µ—Å—å –Ω–µ –Ω—É–∂–Ω—ã
# )
#
# # –ó–∞–ø—É—Å–∫–∞–µ–º —Ñ–∏–Ω–∞–ª—å–Ω–æ–µ –¥–æ–æ–±—É—á–µ–Ω–∏–µ
# final_trainer_weighted.train()
#
# print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏...")
# # –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω—è–µ—Ç—Å—è –≤ –ø–∞–ø–∫—É, —É–∫–∞–∑–∞–Ω–Ω—É—é –≤ output_dir
# final_trainer_weighted.save_model("./final_weighted_model")
# tokenizer.save_pretrained("./final_weighted_model") # –°–æ—Ö—Ä–∞–Ω—è–µ–º –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —Ä—è–¥–æ–º
#
# print("–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –¥–ª—è test.csv...")
# predictions_w = final_trainer_weighted.predict(tokenized_test)
# predicted_labels_w = np.argmax(predictions_w.predictions, axis=1)
#
# submission_weighted = pd.DataFrame({'id': test_df['id'], 'label': predicted_labels_w})
# submission_weighted.to_csv('submission_weighted.csv', index=False)
# print("–§–∞–π–ª submission_weighted.csv –∏ –º–æ–¥–µ–ª—å –≤ ./final_weighted_model –≥–æ—Ç–æ–≤—ã!")