In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Read all the csv files for that task
for i in range(1, 6):
    df_res = pd.read_csv(f'data/TRAIN_RES_{i}.csv')
    # remove the job titles that appear less than 2 times to be able to split dataset with stratify
    common_titles = df_res.job_title.value_counts()[df_res.job_title.value_counts() < 2].index
    df_res.drop(df_res[df_res.job_title.isin(common_titles)].index, inplace = True)

    # Cleanup: drop the 'achievements' column because we have 'achievements_modified'
    # Also replace 'Нет' in 'achievements_modified' with None
    df_res.drop('achievements', axis = 1, inplace = True)
    df_res['achievements_modified'] = df_res['achievements_modified'].replace('Нет', None)

    # train test split
    df_res_train, df_res_test = train_test_split(df_res, test_size = 0.2, random_state = 42, stratify = df_res['job_title'], shuffle = True)

    # sample 20% each part of dataset for the test set
    df_res_test = df_res_test.sample(frac = 0.1, random_state = 42)

    # remove nan values
    df_res_train.dropna(subset = ['achievements_modified', 'demands'], how = 'all', inplace = True)
    
    # If some job appears more than 200 times, sample 200 times. Else if 
    # some job appears less than 200 times, take all of them
    df_res_train = df_res_train.groupby('job_title', group_keys=False).apply(
        lambda x: x.sample(n=min(len(x), 200), random_state=42)
    )

    print(df_res_train.shape[0], df_res_test.shape[0])
    
    # Concat to the global dataset
    df_train = pd.concat([df_train, df_res_train])
    df_test = pd.concat([df_test, df_res_test])


  df_res_train = df_res_train.groupby('job_title', group_keys=False).apply(


94208 39997


  df_res_train = df_res_train.groupby('job_title', group_keys=False).apply(


92680 39996


  df_res_train = df_res_train.groupby('job_title', group_keys=False).apply(


92903 39997


  df_res_train = df_res_train.groupby('job_title', group_keys=False).apply(


94054 39996
78625 25745


  df_res_train = df_res_train.groupby('job_title', group_keys=False).apply(


In [3]:
df_train.shape, df_test.shape

((452470, 5), (185731, 5))

In [4]:
# remove the job titles that appear less than 1000 times in the train set
common_titles = df_train.job_title.value_counts()[df_train.job_title.value_counts() < 1000].index

df_train = df_train[~df_train.job_title.isin(common_titles)]

In [5]:
df_train.job_title.value_counts()

job_title
эколог           1000
экономист        1000
экскурсовод      1000
экспедитор       1000
эксперт          1000
                 ... 
агроном          1000
адвокат          1000
администратор    1000
акушерка         1000
аналитик         1000
Name: count, Length: 248, dtype: int64

In [6]:
X_train = df_train.drop('job_title', axis = 1)
X_test = df_test.drop('job_title', axis = 1)
y_train = df_train['job_title']
y_test = df_test['job_title']

In [7]:
# Impute nans with empty string for simplicity
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(fill_value='', strategy='constant')

X_train = pd.DataFrame(imputer.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(imputer.fit_transform(X_test), columns = X_test.columns)

In [8]:
# Concatenate the text features together
X_train_concat = X_train.demands + ' ' + X_train.company_name + ' ' + X_train.achievements_modified

X_test_concat = X_test.demands + ' ' + X_test.company_name + ' ' +  X_test.achievements_modified

In [10]:
X_train_concat.head()

0    Ремонт и техническое обслуживание автортранспо...
1    Капитальный ремонт техники, разборка-сборка ав...
2    Ремонт, обслуживание, ТО легковых автомобилей ...
3                       Ремонт автомобилей АО РСК МиГ 
4    Официальный дилер. Ремонт и обслуживание автом...
dtype: object

In [11]:
# preprocess the labels with one-hot encoding (will be used in the model training)
from sklearn.preprocessing import OneHotEncoder
import pickle
encoder = OneHotEncoder(handle_unknown='ignore')
y_train_enc = encoder.fit_transform(y_train.values.reshape(-1, 1)).toarray()
y_test_enc = encoder.transform(y_test.values.reshape(-1, 1)).toarray()

# save the encoder
with open('models/classification/one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [12]:
target_cols = list(encoder.categories_[0])
len(target_cols)

248

In [25]:
# preprocess the labels with ordinal encoding (will be used in the model evaluation)
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

y_train_label = oe.fit_transform(y_train.values.reshape(-1, 1))
y_test_label = oe.transform(y_test.values.reshape(-1, 1))

# save the encoder
with open('models/classification/ordinal_encoder.pkl', 'wb') as f:
    pickle.dump(oe, f)

## Model

In [13]:
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("models/rubert-tiny2")

In [14]:
import torch
from torch import nn
from torch.nn import functional as F

# Take a rubert-tiny2 and add a linear layer after. No freezing = train all the weights
class BERTClass(nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained("models/rubert-tiny2")
        self.fc = nn.Linear(312, len(target_cols))

    def forward(self, ids, mask, token_type_ids):
        _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output = self.fc(features)
        return output

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BERTClass()
model.to(device)

In [16]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 3

In [17]:
from torch.utils.data import Dataset, DataLoader
import torch
class BERTDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_len):
        self.len = len(X)
        self.X = X.reset_index(drop=True)
        self.y = y
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        text = self.X.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.y[index], dtype=torch.float)
        }

In [18]:
train_dataset = BERTDataset(X_train_concat, y_train_enc, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(X_test_concat, y_test_enc, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=4, shuffle=False, pin_memory=True)

In [19]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [20]:
def train(epoch):
    model.train()
    for _, data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _%100 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}, processed: {_/len(train_loader)*100}%')


        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [21]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  5.512273788452148, processed: 0.0%
Epoch: 0, Loss:  4.937572479248047, processed: 2.5806451612903225%
Epoch: 0, Loss:  4.456879615783691, processed: 5.161290322580645%
Epoch: 0, Loss:  3.9377903938293457, processed: 7.741935483870968%
Epoch: 0, Loss:  3.5537214279174805, processed: 10.32258064516129%
Epoch: 0, Loss:  3.3198866844177246, processed: 12.903225806451612%
Epoch: 0, Loss:  2.8302321434020996, processed: 15.483870967741936%
Epoch: 0, Loss:  2.8316054344177246, processed: 18.064516129032256%
Epoch: 0, Loss:  2.4199843406677246, processed: 20.64516129032258%
Epoch: 0, Loss:  2.3450231552124023, processed: 23.225806451612904%
Epoch: 0, Loss:  2.5673389434814453, processed: 25.806451612903224%
Epoch: 0, Loss:  2.6115872859954834, processed: 28.387096774193548%
Epoch: 0, Loss:  2.113712787628174, processed: 30.967741935483872%
Epoch: 0, Loss:  1.9976942539215088, processed: 33.5483870967742%
Epoch: 0, Loss:  2.1383304595947266, processed: 36.12903225806451%
Epoch:

In [22]:
# save the model
torch.save(model.state_dict(), 'models/job_name.pt')

In [23]:
# # load the model
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = BERTClass()
# model.load_state_dict(torch.load('models/job_name.pt'))
# model.to(device)

BERTClass(
  (roberta): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_a

## Evaluation

In [24]:
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
model.eval()

y_preds = []
bar = tqdm(valid_loader)
with torch.no_grad():
    for _, data in enumerate(valid_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets']
        bar.update(1)
        outputs = model(ids, mask, token_type_ids)
        _, preds = torch.max(outputs, dim = 1)
        y_preds.extend(preds)

100%|█████████▉| 2902/2903 [02:14<00:00, 21.55it/s]

In [26]:
y_preds_processed = np.array([])

for tens in y_preds:
    tens = tens.cpu().numpy()
    y_preds_processed = np.append(y_preds_processed, tens)

In [27]:
f1_score(y_test_label, y_preds_processed, average='macro')

0.37442773399930884