In [1]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, model.num_classes), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [4]:
import torch
import torch.nn as nn
    
from transformers import AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput


class AttentionPoolerV1(nn.Module):
    """
    самая простая реализация
    веса токенов одни и те же для всех лейблов.
    по сути только помогает игнорить какой-то общий слабый сигнал для всех классов: например, тишина.
    """
    def __init__(self, d):
        super().__init__()
        self.pooler = nn.Linear(d, 1)

    def forward(self, x, mask):
        d = x.shape[-1]
        w = self.pooler(x) / d ** 0.5  # [N, T, 1]
        w = w + (1.0 - mask[:, :, None].float()) * -10000.0
        w = torch.softmax(w, dim=1)  # [N, T, 1]
        x = (x * w).sum(1)  # [N, D]
        return x

class AttentionPoolerV2(nn.Module):
    """
    v1 + только в качестве mlp взята линейная модель с нелинейностью
    """
    def __init__(self, d):
        super().__init__()
        self.pooler = nn.Sequential(
            nn.Linear(d, d),
            nn.Tanh(),
            nn.Linear(d, 1)
        )

    def forward(self, x, mask):
        w = self.pooler(x)  # [N, T, 1]
        w = w + (1.0 - mask[:, :, None].float()) * -10000.0
        w = torch.softmax(w, dim=1)  # [N, T, 1]
        x = (x * w).sum(1)  # [N, D]
        return x
    
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class BertMPModel(nn.Module):
    def __init__(self, model_name, num_classes, inp_dim=1024, device='cuda'):
        super(BertMPModel, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.config = AutoConfig.from_pretrained(model_name)
        
        self.base_model = AutoModel.from_pretrained(model_name, config=self.config)
        self.bn = nn.BatchNorm1d(inp_dim)
        self.poooling = MeanPooling()
        # self.poooling = AttentionPooling(inp_dim)
        # self.poooling = AttentionPoolerV2(inp_dim)
        self.fc = nn.Linear(inp_dim, num_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        out = self.poooling(outputs.last_hidden_state, attention_mask)
        out = self.bn(out)
        outputs = self.fc(out)
        return SequenceClassifierOutput(logits=outputs)

In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoConfig
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import torch.nn as nn
import evaluate
import numpy as np
import torch


torch.manual_seed(42)


accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels),
        "f1_weighted": f1_metric.compute(predictions=predictions, references=labels, average="weighted")
        }

def preprocess_logits_for_metrics(logits, labels):
    return logits[0]


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512, padding=True)


train = pd.read_csv("train_dataset_with_synt.csv")
test = pd.read_csv("test_dataset.csv")
train = train.rename(columns={"subject": "labels"})
test = test.rename(columns={"subject": "labels"})

le = LabelEncoder()
train['labels'] = le.fit_transform(train['labels'])
test['labels'] = le.transform(test['labels'])


model_name = "ai-forever/sbert_large_nlu_ru"
batch_size = 4

train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

ds = DatasetDict()

ds['train'] = train_dataset
ds['test'] = test_dataset

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertMPModel(model_name, len(le.classes_))

tokenized_ds = ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="subject_model_on_my_clear_data",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=1,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    lr_scheduler_type="cosine",  
    load_best_model_at_end=True,
    push_to_hub=False,
    label_names=["labels"],
    report_to="none",
    label_smoothing_factor=0.01,
    fp16=True
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

trainer.train()
trainer.evaluate()

Map:   0%|          | 0/19038 [00:00<?, ? examples/s]

Map:   0%|          | 0/4509 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,2.2412,2.083617,{'accuracy': 0.5043246839654025},{'f1': 0.4432349196885461}
2,1.7124,2.090804,{'accuracy': 0.5311599024173874},{'f1': 0.4850894508035282}
3,1.2117,2.239671,{'accuracy': 0.5559991128853404},{'f1': 0.5270725278279339}
4,0.8655,2.620506,{'accuracy': 0.5422488356620093},{'f1': 0.5266269217048666}
5,0.5575,3.106532,{'accuracy': 0.5602129075182968},{'f1': 0.5471032500670828}
6,0.259,4.071661,{'accuracy': 0.5553337768906631},{'f1': 0.5457053870888638}
7,0.188,4.420753,{'accuracy': 0.5533377689066312},{'f1': 0.548229795952305}
8,0.0824,4.644414,{'accuracy': 0.5582168995342648},{'f1': 0.5520268838214742}
9,0.0177,4.97661,{'accuracy': 0.5642049234863606},{'f1': 0.5584830709352084}
10,0.012,4.898055,{'accuracy': 0.5646484808161455},{'f1': 0.5570950851348362}


{'eval_loss': 2.0836169719696045,
 'eval_accuracy': {'accuracy': 0.5043246839654025},
 'eval_f1_weighted': {'f1': 0.4432349196885461},
 'eval_runtime': 104.9694,
 'eval_samples_per_second': 42.955,
 'eval_steps_per_second': 10.746,
 'epoch': 10.0}

In [None]:
import pickle
with open('model/label_encoder_mp.pkl', 'wb') as f:
    pickle.dump(le, f)

In [24]:
torch.save(trainer.model.state_dict(), "model/bert_mp.pth")

In [25]:
model1 = BertMPModel(model_name, len(le.classes_))

In [26]:
model1.load_state_dict(torch.load("model/bert_mp.pth"))

<All keys matched successfully>

In [27]:
training_args = TrainingArguments(
    output_dir="subject_model_on_my_clear_data",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=1,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    lr_scheduler_type="cosine",  
    load_best_model_at_end=True,
    push_to_hub=False,
    label_names=["labels"],
    report_to="none",
    label_smoothing_factor=0.01,
    fp16=True
)

trainer1 = CustomTrainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [28]:
trainer1.evaluate()

{'eval_loss': 4.757267475128174,
 'eval_accuracy': {'accuracy': 0.5657573741406077},
 'eval_f1_weighted': {'f1': 0.5604479363245909},
 'eval_runtime': 103.4901,
 'eval_samples_per_second': 43.569,
 'eval_steps_per_second': 10.9}

In [None]:
model = AutoModel.from_pretrained("<path_to_saved_pretrained_model>")