In [1]:
import gc
import os
import pickle
from typing import List, Dict, Optional, Literal, Tuple
from dataclasses import dataclass

import pandas as pd
import numpy as np

import plotly.express as px
from tqdm.notebook import tqdm, trange

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, precision_score, recall_score
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.filterwarnings("ignore")

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

from pytorch_tabular.tabular_datamodule import TabularDataset
from pytorch_tabular import TabularModel
from pytorch_tabular.models import FTTransformerConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.utils import get_balanced_sampler, get_class_weighted_cross_entropy

In [2]:
torch.manual_seed(47)
np.random.seed(47)
torch.cuda.manual_seed(47)

In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [13]:
model_name = "cointegrated/rubert-tiny2"

In [2]:
import pickle
with open("./saved_models/tabular_class.pkl", "rb") as f:
    tabular_model = pickle.load(f)

In [3]:
tabular_model.datamodule.config.categorical_cols

['work_schedule', 'city', 'accredit_it', 'state', 'salary_currency', 'salary_gross', 'vacancy_experience', 'vacancy_type', 'created_quarter', 'created_month', 'trusted']

In [31]:
TabularDataset(
            task=tabular_model.datamodule.config.task,
            data=tabular_model.datamodule._prepare_inference_data(df),
            categorical_cols=tabular_model.datamodule.config.categorical_cols,
            continuous_cols=tabular_model.datamodule.config.continuous_cols,
            target="target"
)[0]

{'target': array([1], dtype=int64),
 'continuous': array([-0.5090045 ,  0.14593425], dtype=float32),
 'categorical': array([1, 3, 2, 1, 1, 2, 2, 1, 1, 1, 1], dtype=int64)}

In [None]:
{'vacancy_name_score': {0: 17.0}, 'vacancy_name_len': {0: 22}, 'work_schedule': {0: 'flexible'}, 'city': {0: 'Москва'}, 'accredit_it': {0: False}, 'state': {0: 'APPROVED'}, 'salary_currency': {0: 'RUR'}, 'salary_gross': {0: False}, 'vacancy_experience': {0: 'noExperience'}, 'vacancy_type': {0: 'open'}, 'created_quarter': {0: 1}, 'created_month': {0: 3}, 'trusted': {0: True}}

In [30]:
example = {'vacancy_name_score': [17.0], 'vacancy_name_len': [22], 'work_schedule': ['flexible'], 'city': ['Москва'], 'accredit_it': [False], 'state': ['APPROVED'], 'salary_currency': ['RUR'], 'salary_gross': [False], 'vacancy_experience': ['noExperience'], 'vacancy_type': ['open'], 'created_quarter': [None], 'created_month': [None], 'trusted': [True], 'vacancy_name': ['Видеооператор-монтажер'], 'created_date': [3]}

In [6]:
df = pd.DataFrame(example)

In [4]:
tabular_model.datamodule.config.continuous_cols

['vacancy_name_score', 'vacancy_name_len']

# Read data

In [9]:
df = pd.read_csv("data_with_text.csv")

In [10]:
numeric_features = ["vacancy_name_score", "vacancy_name_len"]
categorical_features = [
                        "work_schedule", "city", "accredit_it", "state",
                        "salary_currency", "salary_gross", "vacancy_experience", "vacancy_type", "created_quarter",
                        "created_month", "trusted"
]

In [11]:
# df["text"] = [str(i).replace("{", "").replace("}", "") for i in df[numeric_features + categorical_features].to_dict("records")]

In [7]:
# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [8]:
# df["len_description"] = df["vacancy_description"].apply(lambda x: len(x))

In [9]:
# descriptions = df[["len_description", "vacancy_description"]].sort_values("len_description", ascending=False)["vacancy_description"].tolist()

In [10]:
# file_exists = os.path.isfile("tokens_desc.pkl")

In [11]:
# if not file_exists:
#     tokens_desc = []

#     for i, desc in enumerate(tqdm(descriptions)):
#         temp_result = tokenizer(desc, padding="max_length", return_tensors="pt")
#         tokens_desc.append(truncer(temp_result))
        
#     with open("tokens_desc.pkl", "wb") as f:
#         pickle.dump(tokens_desc, f)
# else:
#     with open("tokens_desc.pkl", "rb") as f:
#         tokens_desc = pickle.load(f)

# Define custom data

In [20]:
class CustomVacancyDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tabular_dataset: TabularDataset,
        tokenizer,
        device: torch.device,
        text: str,
        target: str,
        mode: Optional[Literal["train", "valid", "test"]] = None,
        chunk_size: int = 1024
    ) -> None:
        super().__init__()
        self.data = data
        self.tabular_dataset = tabular_dataset
        self.tokenizer = tokenizer
        self.device = device
        self.text = text
        self.target = target
        self.mode = mode
        self.chunk_size = chunk_size

    def __len__(self):
        return len(self.data)

    @staticmethod
    def chunker(tokens: Dict[str, torch.Tensor], chunk_size: int = 512) -> List[str]:
        input_ids = list(tokens["input_ids"][0].split(chunk_size - 2))
        attention_mask = list(tokens["attention_mask"][0].split(chunk_size - 2))

        cls_token_id = 2
        eos_token_id = 3

        for i in range(len(input_ids)):
            input_ids[i] = torch.cat([torch.Tensor([cls_token_id]), input_ids[i], torch.Tensor([eos_token_id])])
            attention_mask[i] = torch.cat([torch.Tensor([1]), attention_mask[i], torch.Tensor([1])])

            pad_len = chunk_size - len(input_ids[i])
            if pad_len > 0:
                input_ids[i]= torch.cat([input_ids[i], torch.Tensor([0]*pad_len)])
                attention_mask[i] = torch.cat([attention_mask[i], torch.Tensor([0]*pad_len)])
                
        tokens["input_ids"] = torch.stack(input_ids)
        tokens["attention_mask"] = torch.stack(attention_mask)
        
        return tokens

    def __getitem__(self, idx: int):
        tokens = self.tokenizer(self.data.iloc[idx][self.text], return_tensors="pt", add_special_tokens=False, return_token_type_ids=False)
        tokens = self.chunker(tokens, self.chunk_size)
        
        tokens["input_ids"] = tokens["input_ids"].to(device).long()
        tokens["attention_mask"] = tokens["attention_mask"].to(device).int()
        
        if self.mode == "test":
            return tokens, self.tabular_dataset[idx]

        label = self.data.iloc[idx][self.target]
        return tokens, torch.tensor(label), self.tabular_dataset[idx]

# Define train, valid, test data

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
train, valid = train_test_split(df, shuffle=True, train_size=0.8)

In [16]:
valid, test = train_test_split(valid, shuffle=False, test_size=0.2)

In [17]:
train_tabular_dataset = TabularDataset(
            task=tabular_model.datamodule.config.task,
            data=tabular_model.datamodule._prepare_inference_data(train),
            categorical_cols=tabular_model.datamodule.config.categorical_cols,
            continuous_cols=tabular_model.datamodule.config.continuous_cols,
            target="target"
    )

valid_tabular_dataset = TabularDataset(
            task=tabular_model.datamodule.config.task,
            data=tabular_model.datamodule._prepare_inference_data(valid),
            categorical_cols=tabular_model.datamodule.config.categorical_cols,
            continuous_cols=tabular_model.datamodule.config.continuous_cols,
            target="target"
)

test_tabular_dataset = TabularDataset(
            task=tabular_model.datamodule.config.task,
            data=tabular_model.datamodule._prepare_inference_data(test),
            categorical_cols=tabular_model.datamodule.config.categorical_cols,
            continuous_cols=tabular_model.datamodule.config.continuous_cols,
            target="target"
)

In [18]:
next(iter(test_tabular_dataset))

{'target': array([0], dtype=int64),
 'continuous': array([ 4.432042 , -0.6699225], dtype=float32),
 'categorical': array([ 1, 67,  1,  1,  1,  2,  2,  1,  1,  2,  1], dtype=int64)}

In [23]:
text = "vacancy_description"
target  = "target"
train_data = CustomVacancyDataset(train.reset_index(drop=True), train_tabular_dataset, tokenizer, device, text, target, "train")
valid_data = CustomVacancyDataset(valid.reset_index(drop=True), valid_tabular_dataset, tokenizer, device, text, target, "valid")
test_data = CustomVacancyDataset(test.reset_index(drop=True), test_tabular_dataset, tokenizer, device, text, target, "valid")

In [24]:
def collate_fn(data):
    data, label, tabular_data = list(zip(*data))
    tabular_keys = list(tabular_data[0].keys())
    tabular_res = {key: [] for key in tabular_keys}
    # print(tabular_data)
    for tabular in tabular_data:
        for key in tabular_keys:
            tabular_res[key].append(tabular[key])
    
    for key in tabular_keys:
        if key == "target" or key == "categorical":
            tabular_res[key] = torch.Tensor(tabular_res[key]).long()
        else:
            tabular_res[key] = torch.Tensor(tabular_res[key])
    return data, torch.stack(label), tabular_res

In [25]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Define models

In [15]:
class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x

In [16]:
class BertEmbedder(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert

        self.seq_0 = nn.Sequential(
            nn.Linear(312, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 7)
        )
        
    def forward(self, inputs):
        x = self.bert(**inputs)
        x = (x["last_hidden_state"] * inputs["attention_mask"][:, :, None]).sum(dim=1) / inputs["attention_mask"][:, :, None].sum(dim=1)
        x = self.seq_0(torch.mean(x, dim=0))

        return x

In [17]:
class BertClassification(nn.Module):
    def __init__(self, embedder, path2model: str):
        super().__init__()
        self.embedder = embedder
        self.tabular_model = torch.load(path2model)

        self.seq_0 = nn.Sequential(
            nn.Linear(344, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 7)
        )
        
    def forward(self, inputs, tabular_data: Dict):
        temp_x = []
        for inp in inputs:
            # print(inp.sha)
            temp_x.append(self.embedder(inp))
        x = torch.stack(temp_x)
        # print(x.shape)
        tabular_x = self.tabular_model(tabular_data)["backbone_features"]
        
        x = torch.cat((x, tabular_x), dim=1)
        x = self.seq_0(x)

        return x

# Define train and and eval function

## Function

In [18]:
def get_score(y_true, y_pred):
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    macro_prec, micro_prec = precision_score(y_true, y_pred, average="macro"), precision_score(y_true, y_pred, average="micro")
    macro_rec = recall_score(y_true, y_pred, average="macro")
    
    return macro_f1, macro_prec, macro_rec, micro_prec

In [19]:
def train_fn(model, loss, data, optimizer):
    model.train()
    metrics = 0.0
    y_true, y_pred = [], []
    

    for i, batch in enumerate(tqdm(data, leave=False, desc="Train")):
        tokens_list, target, tabular_data = batch
        for key in tabular_data.keys():
            tabular_data[key] = tabular_data[key].to(device)
        target = target.to(device)
        temp_outputs = []

        optimizer.zero_grad()
        outputs = model(tokens_list, tabular_data)
        loss_remains = loss(outputs, target)
        loss_remains.backward()
        optimizer.step()

        metrics += loss_remains.item()
        y_pred.append(outputs.to("cpu").detach().numpy().argmax(1))
        y_true.append(target.to("cpu").detach().numpy())

    metrics /= len(data)

    return metrics, np.hstack(y_pred), np.hstack(y_true)

In [20]:
def eval_fn(model, loss, data):
    model.eval()
    metrics = 0.0
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in tqdm(data, leave=False, desc="Eval"):
            tokens_list, target, tabular_data = batch
            for key in tabular_data.keys():
                tabular_data[key] = tabular_data[key].to(device)
            target = target.to(device)
    
            outputs = model(tokens_list, tabular_data)
            loss_remains = loss(outputs, target)

            metrics += loss_remains.item()
            y_pred.append(outputs.to("cpu").detach().numpy().argmax(1))
            y_true.append(target.to("cpu").detach().numpy())
            
    metrics /= len(data)
    
    return metrics, np.hstack(y_pred), np.hstack(y_true)

In [21]:
def fit(model, loss, optimizer, scheduler, train_loader, valid_loader, device, epochs):
    info = "Epoch: %s Train loss: %.3f Valid loss: %.3f"
    best_macro = 0.0
    best_micro_precision_score = 0.0
    info_metrics = "Macro f1: %.3f | Macro precision: %.3f | Macro recall: %.3f | Micro precision: %.3f"
    
    for epoch in trange(epochs):
        train_loss, y_train_pred, y_train_true = train_fn(model, loss, train_loader, optimizer)
        eval_loss, y_eval_pred, y_eval_true = eval_fn(model, loss, valid_loader)
        
        train_metrics = get_score(y_train_true, y_train_pred)
        eval_metrics = get_score(y_eval_true, y_eval_pred)
        print(info_metrics % train_metrics , "-- train")
        print(info_metrics % eval_metrics, "-- eval")
        
        if best_macro < sum(eval_metrics[:3]) / 3:
            best_macro = sum(eval_metrics[:3]) / 3
            torch.save(model.state_dict(), f"./models/model_best_macro_{best_macro:.4f}.pt")

        if best_micro_precision_score < eval_metrics[-1]:
            best_micro_precision_score = eval_metrics[-1]
            torch.save(model.state_dict(), f"./models/model_best_micro_{best_micro_precision_score:.4f}.pt")
            
        
        print(info % (epoch + 1, train_loss, eval_loss), "\n")
        scheduler.step()
        gc.collect()
        torch.cuda.empty_cache()
    print(best_macro, best_micro_precision_score)

## Start train

In [22]:
bert_model =  AutoModel.from_pretrained(model_name)
embedder = BertEmbedder(bert_model)
embedder.load_state_dict(torch.load("models/model_best_val_macro_0.4499.pt"))
embedder.seq_0 = Identity()

model = BertClassification(embedder, path2model="./saved_models/tabular_tf.pt")
model = model.to(device)

In [23]:
# loss = nn.MSELoss()
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr= 2e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [24]:
fit(model, loss, optimizer, scheduler, train_loader, valid_loader, device, 5)

  0%|          | 0/5 [00:00<?, ?it/s]

Train:   0%|          | 0/9323 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2123 > 2048). Running this sequence through the model will result in indexing errors


Eval:   0%|          | 0/1865 [00:00<?, ?it/s]

Macro f1: 0.389 | Macro precision: 0.441 | Macro recall: 0.362 | Micro precision: 0.921 -- train
Macro f1: 0.496 | Macro precision: 0.493 | Macro recall: 0.501 | Micro precision: 0.936 -- eval
Epoch: 1 Train loss: 0.223 Valid loss: 0.177 



Train:   0%|          | 0/9323 [00:00<?, ?it/s]

Eval:   0%|          | 0/1865 [00:00<?, ?it/s]

Macro f1: 0.521 | Macro precision: 0.651 | Macro recall: 0.488 | Micro precision: 0.942 -- train
Macro f1: 0.559 | Macro precision: 0.631 | Macro recall: 0.538 | Micro precision: 0.935 -- eval
Epoch: 2 Train loss: 0.160 Valid loss: 0.185 



Train:   0%|          | 0/9323 [00:00<?, ?it/s]

Eval:   0%|          | 0/1865 [00:00<?, ?it/s]

Macro f1: 0.585 | Macro precision: 0.650 | Macro recall: 0.551 | Micro precision: 0.954 -- train
Macro f1: 0.555 | Macro precision: 0.602 | Macro recall: 0.521 | Micro precision: 0.934 -- eval
Epoch: 3 Train loss: 0.130 Valid loss: 0.200 



Train:   0%|          | 0/9323 [00:00<?, ?it/s]

Eval:   0%|          | 0/1865 [00:00<?, ?it/s]

Macro f1: 0.619 | Macro precision: 0.660 | Macro recall: 0.593 | Micro precision: 0.963 -- train
Macro f1: 0.535 | Macro precision: 0.622 | Macro recall: 0.480 | Micro precision: 0.932 -- eval
Epoch: 4 Train loss: 0.106 Valid loss: 0.224 



Train:   0%|          | 0/9323 [00:00<?, ?it/s]

Eval:   0%|          | 0/1865 [00:00<?, ?it/s]

Macro f1: 0.669 | Macro precision: 0.797 | Macro recall: 0.648 | Micro precision: 0.970 -- train
Macro f1: 0.554 | Macro precision: 0.619 | Macro recall: 0.507 | Micro precision: 0.932 -- eval
Epoch: 5 Train loss: 0.088 Valid loss: 0.237 

0.5758899987715654 0.9362743454795347


In [25]:
eval_loss, y_eval_pred, y_eval_true = eval_fn(model, loss, test_loader)
eval_metrics = get_score(y_eval_true, y_eval_pred)
eval_metrics

Eval:   0%|          | 0/467 [00:00<?, ?it/s]

(0.5250401724561375, 0.599586736641757, 0.4777998630010968, 0.9340305711987128)

## Other expirements

In [10]:
def rounder(value: int):
    value = int(value)
    value_len = len(str(int(value)))
    if value_len % 3 == 0:
        value_part = value_len // 3 + 1
    else:
        value_part = value_len // 3 + 2
    
    right_part = int(value) % 10**value_part
    if right_part == 0:
        return value
    result = (value // 10**value_part + 1) * 10**value_part
    return result

In [11]:
df.loc[df[df["salary_from_rub"].isna()].index, "salary_from_rub"] = df[df["salary_from_rub"].isna()]["salary_to_rub"]

In [12]:
df.loc[df[df["salary_to_rub"].isna()].index, "salary_to_rub"] = df[df["salary_to_rub"].isna()]["salary_from_rub"]

In [13]:
df["salary_from_rub_rounded"] = df["salary_from_rub"].apply(lambda x: rounder(x))
df["salary_to_rub_rounded"] = df["salary_to_rub"].apply(lambda x: rounder(x))

In [14]:
df["target_rounded"] = (df["salary_from_rub_rounded"] + df["salary_to_rub_rounded"]) / 2

In [15]:
salary_from = df["target_rounded"].dropna().sort_values()

In [16]:
salary_mapped = {}
step = 100_000
salary_max = 600_000

salary_list = sorted(list(set(salary_from.tolist())))
last_right_salary = salary_list[0] + step
salary_mapped[(salary_list[0], last_right_salary)] = 0
last_idx = 0

temp_list = []
while last_idx < len(salary_list):
    salary_part_list = salary_list[last_idx:]
    for i, value in enumerate(salary_part_list):
        if value >= salary_max:
            salary_mapped[(temp_list[-1], value)] = len(salary_mapped)
            break
        if last_right_salary <= value:
            if temp_list:
                salary_mapped[(temp_list[-1], value)] = len(salary_mapped)
            temp_list.append(value)
            break
    if value >= salary_max:
        break
    last_right_salary += step
    last_idx += i + 1
    

In [17]:
def salary_mapper(x):
    for (left_salary, right_salary), idx in salary_mapped.items():
        if left_salary <= x and x <= right_salary:
            return idx
    return idx + 1

In [18]:
df["target"] = df["target_rounded"].apply(lambda x: salary_mapper(x))

In [19]:
df["target"].value_counts()

target
0    300591
1     63054
2      6933
3      1396
4       548
6       184
5       184
Name: count, dtype: int64

In [16]:
numeric_features = ["vacancy_name_score", "vacancy_name_len"]
categorical_features = [
                        "work_schedule", "city", "accredit_it", "state",
                        "salary_currency", "salary_gross", "vacancy_experience", "vacancy_type", "created_quarter",
                        "created_month", "trusted"
]

In [19]:


data_config = DataConfig(
    target=[
        "target"
    ],  # target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=numeric_features,
    categorical_cols=categorical_features,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
)
optimizer_config = OptimizerConfig()

head_config = LinearHeadConfig(
    layers="",  # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming",
).__dict__

model_config = FTTransformerConfig(
    task="classification",
    num_attn_blocks=3,
    num_heads=4,
    learning_rate=1e-3,
    head="LinearHead",
    head_config=head_config,
    metrics=["f1_score","accuracy"], 
    metrics_params=[{"num_classes":7},{}]
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
# sampler = get_balanced_sampler(train['target'].values.ravel())
loss = weighted_loss = get_class_weighted_cross_entropy(train["target"].values.ravel(), mu=0.1)

tabular_model.fit(train=train, validation=valid, loss=loss)
result = tabular_model.evaluate(test)
pred_df = tabular_model.predict(test)

Seed set to 42


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


D:\pd\ML_part\venv\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
D:\pd\ML_part\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
D:\pd\ML_part\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.0005248074602497723
Restoring states from the checkpoint path at D:\pd\ML_part\.lr_find_5dabd5a0-eba2-4985-ad80-654f7130c5de.ckpt
Restored all states from the checkpoint at D:\pd\ML_part\.lr_find_5dabd5a0-eba2-4985-ad80-654f7130c5de.ckpt


Output()

Output()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [56]:
example_loader = DataLoader(dataset, batch_size=32)

In [58]:
len(torch.load("./saved_models/tabular_tf.pt")(next(iter(example_loader))))

2

In [72]:
pred_df.iloc[0].shape

(8,)

In [69]:
tabular_model.model(next(iter(example_loader)))["logits"].shape

torch.Size([32, 7])

In [None]:
tabular_model.model(next(iter(example_loader)))

In [67]:
[(key, values.shape) for key, values in tabular_model.model(next(iter(example_loader))).items()]

[('logits', torch.Size([32, 7])), ('backbone_features', torch.Size([32, 32]))]

In [25]:
tabular_model.save_model_for_inference(path="./saved_models/tabular_tf.pt")

True

In [25]:
get_score(pred_df["prediction"], test["target"])

(0.1986850779206854,
 0.1897519092581868,
 0.2721189454291428,
 0.8427192276749799)

In [81]:
get_score(pred_df["prediction"], test["target"])

(0.1941162831619823,
 0.3911969928098954,
 0.20373587603129048,
 0.6652587825154197)

In [83]:
get_score(pred_df["prediction"], test["target"])

(0.16070326331854282,
 0.3371015636209266,
 0.1962906118179383,
 0.5799141861088764)

In [22]:
get_score(pred_df["prediction"], test["target"])

(0.29390811612969536,
 0.28408295060416433,
 0.35327816163110254,
 0.8415795119334942)

In [23]:
pred_df[pred_df["prediction"] == test["target"]]["prediction"].value_counts()

prediction
0    11643
1      889
2       13
3        4
4        2
6        1
5        1
Name: count, dtype: int64

In [24]:
pred_df[pred_df["prediction"] != test["target"]]["prediction"].value_counts()

prediction
0    1733
1     557
2      28
3      20
4      16
5       6
6       3
Name: count, dtype: int64

In [32]:
import pickle

with open("./saved_models/tabular_class.pkl", "wb") as f:
    pickle.dump(tabular_model, f)

In [66]:
optimizer = torch.optim.SGD(bert_model.parameters(), lr=1e-6)

train_dataset = DatasetWithLabels(df=train[numeric_features + categorical_features + ["target"]], dataset_root="./")
criterion = TripletLossWithMiner(margin=0.1, miner=AllTripletsMiner(), need_logs=True)
sampler = BalanceSampler(train_dataset.get_labels(), n_labels=2, n_instances=2)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_sampler=sampler)

for batch in tqdm(train_loader):
    embeddings = extractor(batch["input_tensors"])
    loss = criterion(embeddings, batch["labels"])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # info for logging: positive/negative distances, number of active triplets
    print(criterion.last_logs)

AssertionError: 

In [58]:
train["target"]

151428    0
66476     0
26240     1
203706    0
28384     0
         ..
117584    0
287411    0
11528     0
313222    2
365703    1
Name: target, Length: 298312, dtype: int64