In [85]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, model.num_classes), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [86]:
import torch
import torch.nn as nn
    
from transformers import AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput


class AttentionPoolerV1(nn.Module):
    """
    самая простая реализация
    веса токенов одни и те же для всех лейблов.
    по сути только помогает игнорить какой-то общий слабый сигнал для всех классов: например, тишина.
    """
    def __init__(self, d):
        super().__init__()
        self.pooler = nn.Linear(d, 1)

    def forward(self, x, mask):
        d = x.shape[-1]
        w = self.pooler(x) / d ** 0.5  # [N, T, 1]
        w = w + (1.0 - mask[:, :, None].float()) * -10000.0
        w = torch.softmax(w, dim=1)  # [N, T, 1]
        x = (x * w).sum(1)  # [N, D]
        return x

class AttentionPoolerV2(nn.Module):
    """
    v1 + только в качестве mlp взята линейная модель с нелинейностью
    """
    def __init__(self, d):
        super().__init__()
        self.pooler = nn.Sequential(
            nn.Linear(d, d),
            nn.Tanh(),
            nn.Linear(d, 1)
        )

    def forward(self, x, mask):
        w = self.pooler(x)  # [N, T, 1]
        w = w + (1.0 - mask[:, :, None].float()) * -10000.0
        w = torch.softmax(w, dim=1)  # [N, T, 1]
        x = (x * w).sum(1)  # [N, D]
        return x
    
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class BertMPModel(nn.Module):
    def __init__(self, model_name, num_classes, inp_dim=1024, device='cuda'):
        super(BertMPModel, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.config = AutoConfig.from_pretrained(model_name)
        
        self.base_model = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(inp_dim, 2) # output features from bert is 768 and 2 is ur number of labels
        self.bn = nn.BatchNorm1d(inp_dim)
        self.poooling = MeanPooling()
        # self.poooling = AttentionPooling(inp_dim)
        # self.poooling = AttentionPoolerV2(inp_dim)
        self.fc = nn.Linear(inp_dim, num_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        out = self.poooling(outputs.last_hidden_state, attention_mask)
        out = self.bn(out)
        outputs = self.fc(out)
        return SequenceClassifierOutput(logits=outputs)

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoConfig
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import torch.nn as nn
import evaluate
import numpy as np
import torch


torch.manual_seed(42)


accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels),
        "f1_weighted": f1_metric.compute(predictions=predictions, references=labels, average="weighted")
        }

def preprocess_logits_for_metrics(logits, labels):
    return logits[0]


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512, padding=True)


train = pd.read_csv("train_dataset.csv")
# train, X_test = train_test_split(train, stratify=train['subject'], train_size=0.1)
test = pd.read_csv("test_dataset.csv")
# train = train.rename(columns={"subject": "labels"})
# test = test.rename(columns={"subject": "labels"})

train = train.rename(columns={"group_subject": "labels"})
test = test.rename(columns={"group_subject": "labels"})

le = LabelEncoder()
train['labels'] = le.fit_transform(train['labels'])
test['labels'] = le.transform(test['labels'])


model_name = "ai-forever/sbert_large_nlu_ru"
batch_size = 4

train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

ds = DatasetDict()

ds['train'] = train_dataset
ds['test'] = test_dataset

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertMPModel(model_name, len(le.classes_))

tokenized_ds = ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="subject_model_on_my_clear_data/bert_mp_group",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=1,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    lr_scheduler_type="cosine",  
    load_best_model_at_end=True,
    push_to_hub=False,
    label_names=["labels"],
    report_to="none",
    label_smoothing_factor=0.01,
    fp16=True
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

trainer.train()
trainer.evaluate()

Map:   0%|          | 0/19038 [00:00<?, ? examples/s]

Map:   0%|          | 0/4509 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [87]:
train = pd.read_csv("train_dataset_with_synt.csv")

In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoConfig
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import torch.nn as nn
import evaluate
import numpy as np
import torch


torch.manual_seed(42)


<torch._C.Generator at 0x2c16e904050>

In [21]:
# trainer.train("subject_model_on_my_clear_data/checkpoint-40581")

In [5]:
# import pickle
# with open('models/label_encoder_mp.pkl', 'wb') as f:
#     pickle.dump(le, f)

In [6]:
torch.save(trainer.model.state_dict(), "model/bert_ap.pth")

In [7]:
pd.read_csv("submission.csv", delimiter=';')

Unnamed: 0,id,Группа тем,Тема
0,0,Физическая культура и спорт,Строительство спортивной инфраструктуры
1,1,Физическая культура и спорт,Строительство спортивной инфраструктуры
2,2,Физическая культура и спорт,Строительство спортивной инфраструктуры
3,3,Физическая культура и спорт,Строительство спортивной инфраструктуры
4,4,Физическая культура и спорт,Строительство спортивной инфраструктуры
...,...,...,...
9738,9738,Физическая культура и спорт,Строительство спортивной инфраструктуры
9739,9739,Физическая культура и спорт,Строительство спортивной инфраструктуры
9740,9740,Физическая культура и спорт,Строительство спортивной инфраструктуры
9741,9741,Физическая культура и спорт,Строительство спортивной инфраструктуры


In [44]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import re

In [45]:
test = pd.read_csv("test.csv", delimiter=';')

In [46]:
rename_columns = {
    "Текст инцидента": "text",
}
test = test.rename(columns=rename_columns)

In [47]:
test['text'] = test['text'].apply(lambda x: re.sub(r'http\S+', '', x)) 
test['text'] = test['text'].apply(lambda x: re.sub('<[^<]+?>', '', x))

In [48]:
import pickle
with open('models/label_encoder_mp.pkl', 'rb') as pkl_file:
    le = pickle.load(pkl_file)

In [49]:
model_name = "ai-forever/sbert_large_nlu_ru"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertMPModel(model_name, len(le.classes_))
model.load_state_dict(torch.load("models/bert_mp.pth"))
model.cuda();

In [52]:
le.classes_[[1, 1]]

array(['Архитектура города', 'Архитектура города'], dtype=object)

In [56]:
model.eval()
all_preds = []
for batch in tqdm(DataLoader(test['text'].to_list(), batch_size=16, shuffle=False)):
    tokenized = tokenizer(batch, truncation=True, max_length=512, padding=True, return_tensors='pt')
    tokenized.pop("token_type_ids")
    tokenized = {k: v.cuda() for k,v in tokenized.items()}
    with torch.no_grad():
        output = model(**tokenized)
        predictions = np.argmax(output.logits.cpu(), axis=1)
    all_preds += [le.classes_[predictions]]

100%|████████████████████████████████████████████████████████████████████████████████| 609/609 [02:34<00:00,  3.95it/s]


In [74]:
gg = pd.read_csv('first_submit.csv', sep = ';')

In [81]:
gg['Тема'] = np.concatenate(all_preds)

In [76]:
pd.DataFrame(np.concatenate(all_preds), columns=['Тема'])

Unnamed: 0,Тема
0,Ремонт спортивных учреждений
1,Подтопление автомобильных дорог
2,Содержание больниц
3,Дети и многодетные семьи
4,Оказание гос. соц. помощи
...,...
9738,★ Неисправные фонари освещения
9739,Оказание гос. соц. помощи
9740,Безопасность общественных пространств
9741,Оказание гос. соц. помощи


In [83]:
gg.to_csv('second_submit.csv', sep = ';', index = False, encoding = 'utf-8')

In [12]:
training_args = TrainingArguments(
    output_dir="subject_model_on_my_clear_data",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=1,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    lr_scheduler_type="cosine",  
    load_best_model_at_end=True,
    push_to_hub=False,
    label_names=["labels"],
    report_to="none",
    label_smoothing_factor=0.01,
    fp16=True
)

trainer1 = CustomTrainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [13]:
trainer1.evaluate()

{'eval_loss': 4.757267475128174,
 'eval_accuracy': {'accuracy': 0.5657573741406077},
 'eval_f1_weighted': {'f1': 0.5604479363245909},
 'eval_runtime': 99.5047,
 'eval_samples_per_second': 45.314,
 'eval_steps_per_second': 11.336}

In [None]:
df.to_csv(filename, sep = ';', index = False, encoding = 'utf-8')