In [1]:
import gc
import os
import pickle
from typing import List, Dict, Optional, Literal, Tuple
from dataclasses import dataclass

import pandas as pd
import numpy as np

import plotly.express as px
from tqdm.notebook import tqdm, trange

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, precision_score, recall_score
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.filterwarnings("ignore")

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

from pytorch_tabular.tabular_datamodule import TabularDataset
from pytorch_tabular import TabularModel
from pytorch_tabular.models import FTTransformerConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.utils import get_balanced_sampler, get_class_weighted_cross_entropy

In [2]:
random_state = 47

torch.manual_seed(random_state)
np.random.seed(random_state)
torch.cuda.manual_seed(random_state)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "cointegrated/rubert-tiny2"
batch_size = 64 * 2

# Read data

In [3]:
df = pd.read_csv("data_with_text.csv")

In [5]:
df["search_by_name"] = df["search_by_name"].apply(lambda x: x.split("_")[0])

In [6]:
df.head(2)

Unnamed: 0,vacancy_name,vacancy_type,vacancy_experience,vacancy_id,created_date,salary_gross,salary_to,salary_from,salary_currency,company_name,...,created_quarter,created_month,vacancy_name_score,vacancy_name_len,salary_from_rub_rounded,salary_to_rub_rounded,target_rounded,target,vacancy_description,vacancy_keys
0,"Frontend-разработчик (Angular, TS)",open,between1And3,92688571,2024-02-05 12:10:09.484000+03:00,True,240000.0,180000.0,RUR,Timetta,...,1,2,198.5,34,180000,240000,210000.0,1,Требуемый опыт работы: от 3 лет.Уровень: Middl...,"HTML5, Bootstrap, TypeScript, CSS3, REST API, ..."
1,Младший бэкенд-разработчик,open,noExperience,92755320,2024-02-06 11:55:19.754000+03:00,False,70000.0,50000.0,RUR,Евромобайл,...,1,2,22.0,26,50000,70000,60000.0,0,Требуется младший бэкэнд разработчик (Back-end...,"Git, Базы данных, C#, MS SQL"


# Custom dataset

In [4]:
class CustomVacancyDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer,
        device: torch.device,
        text: str,
        target: str,
        chunk_size: int = 512,
        mode: Optional[Literal["train", "valid", "test"]] = None
    ) -> None:
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.device = device
        self.text = text
        self.target = target
        self.chunk_size = chunk_size
        self.mode = mode

    def __len__(self):
        return len(self.data)

    @staticmethod
    def chunker(tokens: Dict[str, torch.Tensor], chunk_size: int = 512) -> List[str]:
        input_ids = list(tokens["input_ids"][0].split(chunk_size - 2))
        attention_mask = list(tokens["attention_mask"][0].split(chunk_size - 2))

        cls_token_id = 2
        eos_token_id = 3

        for i in range(len(input_ids)):
            input_ids[i] = torch.cat([torch.Tensor([cls_token_id]), input_ids[i], torch.Tensor([eos_token_id])])
            attention_mask[i] = torch.cat([torch.Tensor([1]), attention_mask[i], torch.Tensor([1])])

            pad_len = chunk_size - len(input_ids[i])
            if pad_len > 0:
                input_ids[i]= torch.cat([input_ids[i], torch.Tensor([0]*pad_len)])
                attention_mask[i] = torch.cat([attention_mask[i], torch.Tensor([0]*pad_len)])
                
        tokens["input_ids"] = torch.stack(input_ids)
        tokens["attention_mask"] = torch.stack(attention_mask)
        
        return tokens
    
    def __getitem__(self, idx: int):
        tokens = self.tokenizer(self.data.iloc[idx][self.text], return_tensors="pt", add_special_tokens=False, return_token_type_ids=False)
        tokens = self.chunker(tokens, self.chunk_size)
        
        tokens["input_ids"] = tokens["input_ids"].to(device).long()
        tokens["attention_mask"] = tokens["attention_mask"].to(device).int()

        if self.mode == "test":
            return tokens

        label = self.data.iloc[idx][self.target]
        return tokens, torch.tensor(label)

# Custom model

In [5]:
class BertClassification(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert

        self.seq_0 = nn.Sequential(
            nn.Linear(312, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 7)
        )
        
    def forward(self, inputs):
        x = self.bert(**inputs)
        x = (x["last_hidden_state"] * inputs["attention_mask"][:, :, None]).sum(dim=1) / inputs["attention_mask"][:, :, None].sum(dim=1)
        x = self.seq_0(torch.mean(x, dim=0))

        return x

# Define train\eval func

In [9]:
def get_score(y_true, y_pred):
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    macro_prec, micro_prec = precision_score(y_true, y_pred, average="macro"), precision_score(y_true, y_pred, average="micro")
    macro_rec = recall_score(y_true, y_pred, average="macro")
    
    return macro_f1, macro_prec, macro_rec, micro_prec

In [10]:
def train_fn(model, loss, data, optimizer):
    model.train()
    metrics = 0.0
    y_true, y_pred = [], []
    

    for i, batch in enumerate(tqdm(data, leave=False, desc="Train")):
        tokens_list, target = batch
        target = target.to(device)
        temp_outputs = []

        optimizer.zero_grad()
        for tokens in tokens_list:
            output = model(tokens)
            temp_outputs.append(output)
        temp_outputs = torch.stack(temp_outputs)
        
        loss_remains = loss(temp_outputs, target)
        loss_remains.backward()
        optimizer.step()

        metrics += loss_remains.item()
        y_pred.append(temp_outputs.to("cpu").detach().numpy().argmax(1))
        y_true.append(target.to("cpu").detach().numpy())
        
    metrics /= len(data)

    return metrics, np.hstack(y_pred), np.hstack(y_true)

In [11]:
def eval_fn(model, loss, data):
    model.eval()
    metrics = 0.0
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in tqdm(data, leave=False, desc="Eval"):
            tokens_list, target = batch
            target = target.to(device)
            temp_outputs = []
    
            for tokens in tokens_list:
                output = model(tokens)
                temp_outputs.append(output)
            temp_outputs = torch.stack(temp_outputs)
            
            loss_remains = loss(temp_outputs, target)

            metrics += loss_remains.item()
            y_pred.append(temp_outputs.to("cpu").detach().numpy().argmax(1))
            y_true.append(target.to("cpu").detach().numpy())
            
    metrics /= len(data)
    
    return metrics, np.hstack(y_pred), np.hstack(y_true)

In [12]:
def fit(model, loss, optimizer, scheduler, train_loader, valid_loader, device, epochs):
    info = "Epoch: %s Train loss: %.3f Valid loss: %.3f"
    best_macro = 0.0
    best_micro_precision_score = 0.0
    info_metrics = "Macro f1: %.3f | Macro precision: %.3f | Macro recall: %.3f | Micro precision: %.3f"
    
    for epoch in trange(epochs):
        train_loss, y_train_pred, y_train_true = train_fn(model, loss, train_loader, optimizer)
        eval_loss, y_eval_pred, y_eval_true = eval_fn(model, loss, valid_loader)
        
        train_metrics = get_score(y_train_true, y_train_pred)
        eval_metrics = get_score(y_eval_true, y_eval_pred)
        print(info_metrics % train_metrics , "-- train")
        print(info_metrics % eval_metrics, "-- eval")
        
        if best_macro < sum(eval_metrics[:3]) / 3:
            best_macro = sum(eval_metrics[:3]) / 3
            torch.save(model.state_dict(), f"./models/model_best_val_macro_{best_macro:.4f}.pt")

        if best_micro_precision_score < eval_metrics[-1]:
            best_micro_precision_score = eval_metrics[-1]
            torch.save(model.state_dict(), f"./models/model_best_val_micro_{best_micro_precision_score:.4f}.pt")
            
        
        print(info % (epoch + 1, train_loss, eval_loss), "\n")
        scheduler.step()
        gc.collect()
        torch.cuda.empty_cache()
    print(best_macro, best_micro_precision_score)

# Define train\test split

In [13]:
df["vacancy_description"] = df["search_by_name"] + " " + df["vacancy_description"]

In [6]:
train, valid = train_test_split(df, shuffle=True, train_size=0.8, stratify=df["target"], random_state=random_state)

In [7]:
valid, test = train_test_split(valid, shuffle=False, test_size=0.2, random_state=random_state)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
def collate_fn(data):
    data, label = list(zip(*data))
    return data, torch.stack(label)

In [41]:
text_columns = "vacancy_description"
target_columns = "target"
chunk_size = 1024

train_dataset = CustomVacancyDataset(train, tokenizer, device, text_columns, target_columns, chunk_size, mode="train")
valid_dataset = CustomVacancyDataset(valid, tokenizer, device, text_columns, target_columns, chunk_size, mode="valid")
test_dataset = CustomVacancyDataset(test, tokenizer, device, text_columns, target_columns, chunk_size, mode="valid")

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [23]:
batch_example = next(iter(train_loader))[0]

In [45]:
for batch in train_loader:
    token_list, target = batch
    for tokens in token_list:
        if tokens["input_ids"].shape[0] != 1:
            print(tokens["input_ids"][0][0])
            print(tokens["input_ids"][1][tokens["input_ids"][1] != 0])
    # break

tensor(2, device='cuda:0')
tensor([    2, 37975, 17560,   733,   762,  3325,   677,   735,   422,  2274,
           17, 26538,   105,   294, 11094, 22023, 20727,  3131,  3618,   117,
           18,     3], device='cuda:0')
tensor(2, device='cuda:0')
tensor([    2, 32803, 15671,  1512,    16,  1034,   533,  7664,    16, 42800,
         1172,  1276,    16, 59692,  4845, 13778,   626,    16,  1492,   666,
           16,  8008, 12419,   555,    16, 22996,   699,    16, 22996,   542,
           16,  8426,  2920,    16,  1407,    10,  7243,    16,  7243, 20968,
           16,  1727, 10304,    16,  8472,  6970,    16,  1777, 17341, 17856,
           16, 53753, 12810, 19607,  4960,    16, 44093, 51042,   872,    16,
        14035,  4959,    16,  2172, 14536,    16,  1744, 17181,  3155,    16,
         5793,  5772, 11320,  6098,    16,    22,  1521, 12555,    16, 15688,
         1770,  1828,  2859,    16, 42134, 56649,    16,  7324,  8856,    16,
         6069,  1235,    16,  2781,  1388,    16

KeyboardInterrupt: 

# Start train

In [22]:
bert_model = AutoModel.from_pretrained(model_name)
model = BertClassification(bert_model).to(device)

In [23]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr= 2e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [31]:
fit(model, loss, optimizer, scheduler, train_loader, valid_loader, device, 1)

  0%|          | 0/1 [00:00<?, ?it/s]

Train:   0%|          | 0/2331 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2129 > 2048). Running this sequence through the model will result in indexing errors


Eval:   0%|          | 0/467 [00:00<?, ?it/s]

Macro f1: 0.204 | Macro precision: 0.214 | Macro recall: 0.200 | Micro precision: 0.849 -- train
Macro f1: 0.220 | Macro precision: 0.241 | Macro recall: 0.216 | Micro precision: 0.872 -- eval
Epoch: 1 Train loss: 0.421 Valid loss: 0.333 

0.2255302638219592 0.8723140357346385


In [32]:
eval_loss, y_eval_pred, y_eval_true = eval_fn(model, loss, test_loader)
eval_metrics = get_score(y_eval_true, y_eval_pred)
eval_metrics

Eval:   0%|          | 0/117 [00:00<?, ?it/s]

(0.21921668758900337,
 0.2255554105281477,
 0.21537916912758062,
 0.8745642263341379)

In [37]:
n_epoch = 5
info_metrics = "Macro f1: %.3f | Macro precision: %.3f | Macro recall: %.3f | Micro precision: %.3f"

In [38]:
train, test = train_test_split(df, shuffle=True, train_size=0.85, stratify=df[target_columns], random_state=random_state)

test_dataset = CustomVacancyDataset(test, tokenizer, device, text_columns, target_columns, chunk_size, mode="valid")
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
best_test_macro = 0.0
best_test_micro_precision_score = 0.0
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

for train_index, val_index in skf.split(train[text_columns], train[target_columns]):
    bert_model = AutoModel.from_pretrained(model_name)
    model = BertClassification(bert_model).to(device)

    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr= 2e-5)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    train_dataset = CustomVacancyDataset(train.iloc[train_index], tokenizer, device, text_columns, target_columns, chunk_size, mode="train")
    valid_dataset = CustomVacancyDataset(train.iloc[val_index], tokenizer, device, text_columns, target_columns, chunk_size, mode="valid")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=4)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4)
    
    fit(model, loss, optimizer, scheduler, train_loader, valid_loader, device, n_epoch)
    
    eval_loss, y_eval_pred, y_eval_true = eval_fn(model, loss, test_loader)
    eval_metrics = get_score(y_eval_true, y_eval_pred)
    print(info_metrics % eval_metrics, "-- test")

    if best_test_macro < sum(eval_metrics[:3]) / 3:
            best_macro = sum(eval_metrics[:3]) / 3
            torch.save(model.state_dict(), f"./models/model_best_test_macro_{best_macro:.4f}.pt")

    if best_test_micro_precision_score < eval_metrics[-1]:
        best_micro_precision_score = eval_metrics[-1]
        torch.save(model.state_dict(), f"./models/model_best_test_micro_{best_micro_precision_score:.4f}.pt")

  0%|          | 0/5 [00:00<?, ?it/s]

Train:   0%|          | 0/1981 [00:00<?, ?it/s]

In [None]:
"s"