In [22]:
! nvidia-smi

Wed Jan 11 07:13:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
|  0%   26C    P8    17W / 480W |  24006MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [23]:
import os
import re
import gc
import pdb
import sys
import json
import math
import time
import wandb
import pickle
import shutil
import joblib
import random
import requests
import warnings
from glob import glob
from typing import List
from pathlib import Path
from tqdm.auto import tqdm
from pandarallel import pandarallel

import scipy
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error, f1_score, fbeta_score, recall_score, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb
import lightgbm as lgb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

import tokenizers
import sentencepiece
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW
from transformers import logging
from transformers import DataCollatorWithPadding


from cuml import NearestNeighbors
from kaggle.api.kaggle_api_extended import KaggleApi

sys.path.append("/home/working/")
from kagglib.utils.utils import  Timer, reduce_mem_usage, get_logger, decorate, setup, dataset_create_new
from kagglib.utils.exp_manage import set_wandb
from kagglib.tabular.blocks import AbstractBaseBlock, IdentityBlock, LabelEncodingBlock, SVDBlock, run_blocks
from kagglib.tabular.model_selection import train_cv, predict_cv
from kagglib.nlp.preprocessing import resolve_encodings_and_normalize
from kagglib.nlp.model import (
    AttentionPooling,
    MeanPooling,
    WeightedLayerPooling,
    freeze,
    replace_mixout,
    reinit_bert,
)
from kagglib.nlp.activation import softmax, sigmoid
from kagglib.nlp.optimizer import (
    get_scheduler,
    get_optimizer_grouped_parameters,
)

%load_ext autoreload
%autoreload 2
%env TOKENIZERS_PARALLELISM=true

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 300)
pandarallel.initialize(progress_bar=True)
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')

logging.set_verbosity_error()
logging.set_verbosity_warning()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: TOKENIZERS_PARALLELISM=true
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Setup & data load

In [24]:
def fbeta_wrapper(y_true, y_pred):
    beta = 2
    return fbeta_score(y_true, y_pred, beta)

class Config:
    AUTHOR = "shu421"

    EXP = "exp008"
    EMB_MODEL_PATH = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    MODEL_PATH = "xlm-roberta-base"
    COMPETITION = "learning-equality-curriculum-recommendations"
    DATASET_PATH = []
    BASE_PATH = "/home/working/"
    api_path = "/root/.kaggle/kaggle.json"


    # Language Model Config
    MODEL_PATH = "xlm-roberta-base"

    # train
    apex=True
    seed = 42
    num_fold = 5
    train_fold = [0, 1, 2, 3, 4,]
    batch_size = 512
    n_epoch = 5
    max_len = 512
    num_classes = 1

    # optimizer
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    betas = (0.9, 0.999)
    lr_weight_decay = 0.95

    # scheduler
    scheduler="cosine"
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 500
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000

    # gradient accumulation
    gradient_accumulation_steps = 1

    # weight and bias
    wandb = False

    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }


    upload_from_colab = True

    # GBDT
    gbdt_model = "XGBoost"
    stopping_rounds = 50
    log_evaluation = 500
    model_params = {
        "objective": "binary:logistic",
        # "eval_metric": fbeta_wrapper,
        "eval_metric": "logloss",
        "learning_rate": 0.3,
        "tree_method": "gpu_hist",
        "random_state": seed,
        "n_estimators": 99999,
    }
    train_params = {
        "verbose": log_evaluation,
    }

# setup
cfg = setup(Config)

In [25]:
class Metrics_Config(Config):
    AUTHOR = "shu421"

    MODEL_PATH = "xlm-roberta-large"

    # train
    apex=True
    seed = 42
    num_fold = 5
    train_fold = [0, 1, 2, 3, 4,]
    batch_size = 512
    n_epoch = 5
    max_len = 512
    num_classes = 1

    # AdaCos
    margin = 0.30

    # optimizer
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    betas = (0.9, 0.999)
    lr_weight_decay = 0.95

    # scheduler
    scheduler="cosine"
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 1000
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000

    # gradient accumulation
    gradient_accumulation_steps = 1

    # weight and bias
    wandb = True

    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

metric_cfg = setup(Metrics_Config)

In [26]:
# set log functions
LOGGER = get_logger(cfg.OUTPUT_EXP)
log_filepath = os.path.join(cfg.OUTPUT, f"{cfg.EXP}.log")
# if os.path.isfile(log_filepath):
#     with open(log_filepath, "w") as f:
#         pass
#     f.close()
if cfg.wandb:
    run = set_wandb(cfg, name=cfg.EXP, group=cfg.MODEL_PATH)
if metric_cfg.wandb:
    run = set_wandb(metric_cfg, name=metric_cfg.EXP + "_metrics", group=metric_cfg.MODEL_PATH, config_path="/root/.kaggle/wandb.json")



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▅▃▂▁
[fold0] avg_val_loss,█▅▃▁▁
[fold0] epoch,▁▃▅▆█
[fold0] lr,▂▃▅▇██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁
[fold0] score,▁▂▄██
[fold0] train_loss,█▇▇█▇▆▆▆▆▅▅▆▅▅▄▄▅▃▃▃▄▃▃▃▃▂▃▄▂▃▃▂▁▂▂▁▂▂▂▁
[fold1] lr,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇█████████████████████
[fold1] train_loss,▇▆▇▆█▄▆▇▄▄▆▁█▅▇▅▅▇▆█▃▅▃▅▄▂▂▁▃▃▂▄▆▄▄▃▁▃▁▁

0,1
[fold0] avg_train_loss,10.81912
[fold0] avg_val_loss,10.86653
[fold0] epoch,4.0
[fold0] lr,0.0
[fold0] score,0.0003
[fold0] train_loss,10.80925
[fold1] lr,2e-05
[fold1] train_loss,11.10043


In [27]:
def get_whole_df():
    content_df = pd.read_csv(os.path.join(cfg.INPUT, 'content.csv'))
    topic_df = pd.read_csv(os.path.join(cfg.INPUT, 'topics.csv'))
    correlation_df = pd.read_csv(os.path.join(cfg.INPUT, 'correlations.csv'))
    sub_df = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))
    return content_df, topic_df, correlation_df, sub_df

def preprocess_df(content_df, topic_df, correlation_df):
    content_df = content_df.add_prefix("content_")
    topic_df = topic_df.add_prefix("topic_")
    correlation_df = correlation_df.rename(columns={"content_ids":"content_id"})
    return content_df, topic_df, correlation_df

def get_processed_df():
    content_df, topic_df, correlation_df, sub_df = get_whole_df()
    content_df, topic_df, correlation_df = preprocess_df(content_df, topic_df, correlation_df)
    return content_df, topic_df, correlation_df, sub_df

# Metrics

In [28]:
def comp_fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        precision = len(TP) / (len(pred))
        recall = len(TP) / len(true)
        f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        score_list.append(f2)
    score = sum(score_list) / len(score_list)
    return score

def comp_recall_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        recall = len(TP) / len(true)

        score_list.append(recall)
    score = sum(score_list) / len(score_list)
    return score

def calc_comp_score(train_df, oof, correlation_df, thr=0.1):
    """2値分類の予測からcvを計算する"""
    oof_preds = np.where(oof>=thr, 1, 0)
    pred_df = train_df[oof_preds==1]
    pred_df = pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
    pred_df = pd.merge(correlation_df[["topic_id"]], pred_df, on="topic_id", how="left")
    pred_df = pred_df.fillna("nan")

    cv_score = comp_fbeta_score(correlation_df["content_id"], pred_df["content_id"])
    return cv_score

def get_StratifiedGroupKFold_list(X, y=None, groups=None, n_splits=5, seed=42):
    """cv_listを取得"""
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_list = list(cv.split(X, y, groups))
    return cv_list

def get_StratifiedKFold_list(X, y=None, groups=None, n_splits=5, seed=42):
    """cv_listを取得"""
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_list = list(cv.split(X, y))
    return cv_list


# 1st Stage: Candidate generation by Metric Learning

In [29]:
def create_cand_feature_df(input_df, id_col, title_col, lang_col):
    output_df = input_df.copy()
    output_df[title_col] = output_df[title_col].fillna("")
    output_df[lang_col] = output_df[lang_col].fillna("")
    output_df["text"] = output_df[title_col] + "</s></s>" + output_df[lang_col]
    return output_df[[id_col, "text"]]

def create_cand_target_df(target_df):
    target_df_ = target_df.copy()
    target_df_["content_id"] = target_df_["content_id"].apply(lambda x: x.split(" "))
    target_df_ = target_df_.explode("content_id")

    le = LabelEncoder()
    target_df_["target"] = le.fit_transform(target_df_["topic_id"])
    return target_df_[["content_id", "target"]]

def get_cand_train_df(input_df, target_df, id_col, title_col, lang_col):
    cand_feature_df = create_cand_feature_df(input_df, id_col, title_col, lang_col)
    cand_target_df = create_cand_target_df(target_df)
    cand_df = pd.merge(cand_target_df, cand_feature_df, on="content_id", how="left")
    metric_cfg.num_classes = len(np.unique(cand_df["target"]))
    return cand_df

def get_cand_test_df(input_df, id_col, title_col, lang_col):
    cand_feature_df = create_cand_feature_df(input_df, id_col, title_col, lang_col)
    return cand_feature_df

In [30]:
class MetricDataset(Dataset):
    def __init__(self, cfg, df, col):
        self.cfg = cfg
        df[col] = df[col].fillna("no text</s>na")
        self.text = df[col].to_numpy()
        self.label = df["target"].to_numpy()

    def __len__(self):
        return len(self.label)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        label = self.label[index]
        return text, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

class MetricTestDataset(Dataset):
    def __init__(self, cfg, df, col):
        self.cfg = cfg
        df[col] = df[col].fillna("no text</s>na")
        self.text = df[col].to_numpy()

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        return text

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [31]:
class AdaCos(nn.Module):
    def __init__(self, num_features, num_classes, m=0.50):
        super(AdaCos, self).__init__()
        self.num_features = num_features
        self.n_classes = num_classes
        self.s = math.sqrt(2) * math.log(num_classes - 1)
        self.m = m
        self.W = nn.Parameter(torch.FloatTensor(num_classes, num_features))
        nn.init.xavier_uniform_(self.W)

    def forward(self, input, label=None):
        # normalize features
        x = F.normalize(input)
        # normalize weights
        W = F.normalize(self.W)
        # dot product
        logits = F.linear(x, W)
        if label is None:
            return logits
        # feature re-scale
        theta = torch.acos(torch.clamp(logits, -1.0 + 1e-7, 1.0 - 1e-7))
        one_hot = torch.zeros_like(logits)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        with torch.no_grad():
            B_avg = torch.where(one_hot < 1, torch.exp(self.s * logits), torch.zeros_like(logits))
            B_avg = torch.sum(B_avg) / input.size(0)
            theta_med = torch.median(theta[one_hot == 1])
            self.s = torch.log(B_avg) / torch.cos(torch.min(math.pi/4 * torch.ones_like(theta_med), theta_med))
        output = self.s * logits

        return output

In [32]:
class MetricModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        cfg.hidden_size = self.config.hidden_size
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        # self.fc = nn.Linear(self.config.hidden_size, cfg.num_classes)
        # self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        # all_layer_embeddings = outputs[1]
        # feature = self.weighted_layer_pool(all_layer_embeddings)
        # feature = self.pool(feature, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        # batch, hidden_size
        feature = self.feature(inputs)
        feature = self.ln(feature)
        # feature1 = self.drop1(feature)
        # feature2 = self.drop2(feature)
        # feature3 = self.drop3(feature)
        # feature4 = self.drop4(feature)
        # feature5 = self.drop5(feature)
        # feature = (feature1 + feature2 + feature3 + feature4 + feature5) / 5
        # output = self.fc(feature)
        # return output.squeeze()
        return feature

In [33]:
def get_cand_train_fn(cfg, train_loader, valid_loader, train_df, valid_df, metric, criterion, optimizer, scheduler, model, fold, epoch, best_val_embs, best_val_preds, best_val_score):
    LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
    train_losses = []
    train_nums = []
    model.train()
    scaler = GradScaler(enabled=cfg.apex)
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for step, (inputs, labels) in enumerate(pbar):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast(enabled=cfg.apex):
                emb = model(inputs)
            output = metric(emb, labels)
            loss = criterion(output, labels)

            pbar.set_postfix({
                'loss': loss.item(),
                'lr': scheduler.get_lr()[0]
            })
            train_losses.append(loss.item() * len(labels))
            train_nums.append(len(labels))

            if cfg.gradient_accumulation_steps > 1:
                loss = loss / cfg.gradient_accumulation_steps

            scaler.scale(loss).backward()

            if cfg.clip_grad_norm is not None:
                # scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    cfg.clip_grad_norm
                )

            if (step+1) % cfg.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            if step % cfg.eval_step == 0 and step != 0:
                best_val_preds, best_val_score, val_embs, val_loss = get_cand_valid_fn(
                    cfg,
                    valid_loader,
                    valid_df,
                    metric,
                    criterion,
                    model,
                    fold,
                    epoch,
                    step,
                    best_val_embs,
                    best_val_preds,
                    best_val_score,
                )
                model.train()

            if cfg.wandb:
                wandb.log({f"[fold{fold}] train_loss": loss.item(),
                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
    train_loss = sum(train_losses)/sum(train_nums)
    return train_loss



def get_cand_valid_fn(cfg, valid_loader, valid_df, metric, criterion, model, fold, epoch, step, best_val_embs, best_val_preds, best_val_score):
    val_embs = []
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast(enabled=cfg.apex):
                    emb = model(inputs)
                output = metric(emb, labels)
                loss = criterion(output, labels)
                output = output.detach().cpu().numpy()
                output = softmax(output)
                val_embs.append(emb.detach().cpu())
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))

    val_embs = torch.concat(val_embs)
    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    y_preds = np.argmax(val_preds, axis=1)
    macro_f1 = f1_score(valid_df["target"], y_preds, average="macro")
    micro_f1 = f1_score(valid_df["target"], y_preds, average="micro")

    LOGGER.info(f"Fold: {fold}, Epoch: {epoch}/{cfg.n_epoch}, Step: {step} | val_loss: {np.round(val_loss, 5)}, macro_f1: {np.round(macro_f1, 5)}, micro_f1: {np.round(micro_f1, 5)}")

    if macro_f1 > best_val_score:
        best_val_preds = val_preds
        best_val_score = macro_f1
        torch.save(
            model.state_dict(),
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )

    return best_val_preds, best_val_score, val_embs, val_loss


def get_cand_train_loop(cfg, train_data: pd.DataFrame, cv_list: List, correlation_df: pd.DataFrame):
    """_summary_

    Args:
        cfg (_type_): _description_
        train_data (pd.DataFrame): textとtargetが格納されたデータフレーム
        cv_list (List): _description_
        correlation_df (pd.DataFrame): _description_

    Returns:
        _type_: _description_
    """
    oof_embs = torch.zeros((len(train_data), 1024), dtype=torch.float32)
    oof_pred = np.zeros((len(train_data), cfg.num_classes), dtype=np.float32)
    fold_score = []

    for fold in cfg.train_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')

        train_idx, valid_idx = cv_list[fold]
        train_df = train_data.iloc[train_idx].reset_index(drop=True)
        valid_df = train_data.iloc[valid_idx].reset_index(drop=True)

        # Datasetの設定
        train_dataset = MetricDataset(cfg, train_df, "text")
        valid_dataset = MetricDataset(cfg, valid_df, "text")
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size * 2,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = MetricModel(metric_cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        # model = reinit_bert(model)
        # model = replace_mixout(model)
        model = model.to(cfg.device)

        criterion = nn.CrossEntropyLoss()
        metric = AdaCos(num_features=model.config.hidden_size, num_classes=cfg.num_classes, m=cfg.margin).to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epoch)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)


        # model-training
        best_val_embs = None
        best_val_preds = None
        best_val_score = -np.inf

        for epoch in range(cfg.n_epoch):
            train_loss = get_cand_train_fn(
                cfg,
                train_loader,
                valid_loader,
                train_df,
                valid_df,
                metric,
                criterion,
                optimizer,
                scheduler,
                model,
                fold,
                epoch,
                best_val_embs,
                best_val_preds,
                best_val_score
                )
            best_val_preds, best_val_score, val_embs, val_loss = get_cand_valid_fn(
                cfg,
                valid_loader,
                valid_df,
                metric,
                criterion,
                model,
                fold,
                epoch,
                "fold",
                best_val_embs,
                best_val_preds,
                best_val_score,
            )

            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch,
                        f"[fold{fold}] avg_train_loss": train_loss,
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        # save embeddings
        oof_embs[valid_idx] = val_embs
        torch.save(oof_embs, os.path.join(cfg.EXP_PREDS, f"oof_embs_fold{fold}.pt"))
        # save oof preds
        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)

        fold_score.append(best_val_score)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    torch.save(oof_embs, os.path.join(cfg.EXP_PREDS, f"oof_embs.pt"))
    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    # score = calc_comp_score(train_data, oof_pred, correlation_df)
    # LOGGER.info(f'fold score: {fold_score}')
    # LOGGER.info(f'CV: {round(score, 4)}')
    # return score

In [34]:
def inference_loop(cfg, test_data: pd.DataFrame):
    test_embs = torch.zeros((cfg.num_fold, len(test_data), 1024), dtype=torch.float32)
    test_embs = []

    for fold in cfg.train_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')

        test_embs_ = []
        # Datasetの設定
        test_dataset = MetricTestDataset(cfg, test_data, "text")
        test_loader = DataLoader(
            dataset=test_dataset,
            batch_size=cfg.batch_size * 2,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
            collate_fn = DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='longest', max_length = cfg.max_len),
        )


        # model
        model = MetricModel(metric_cfg)
        model.load_state_dict(torch.load(os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")))
        model.to(cfg.device)
        model.eval()
        with torch.no_grad():
            with tqdm(test_loader, total=len(test_loader)) as pbar:
                for inputs in pbar:
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    with autocast(enabled=cfg.apex):
                        emb = model(inputs)
                    test_embs_.append(emb.detach().cpu())
        test_embs_ = torch.concat(test_embs_)
        test_embs.append(test_embs_)
    test_embs = torch.stack(test_embs)
    return test_embs

# GBDT

In [35]:
def get_cand_df(content_df, topic_df, correlation_df, content_title_vec, topic_title_vec, content2topic_thr, topic2content_thr):
    """
    knnを使用してtopicに対するcontentの候補を出力する関数
    """
    id2content_dict = dict(content_df["content_id"])
    id2topics_dict = dict(topic_df["topic_id"])

    # contentがどのtopicにマッチするかを予測
    knn_model = NearestNeighbors(n_neighbors=content2topic_thr, metric="cosine")
    knn_model.fit(topic_title_vec)
    distances, indices = knn_model.kneighbors(content_title_vec)
    knn_pred_c2t_dict = {k:[] for k in topic_df["topic_id"]}
    for idx, i in enumerate(indices):
        for j in i:
            knn_pred_c2t_dict[id2topics_dict[j]].append(id2content_dict[idx])

    # topicがどのcontentにマッチするかを予測
    knn_model = NearestNeighbors(n_neighbors=topic2content_thr, metric="cosine")
    knn_model.fit(content_title_vec)
    distances, indices = knn_model.kneighbors(topic_title_vec)
    knn_pred_t2c_dict = {k:[] for k in topic_df["topic_id"]}
    for idx, i in enumerate(indices):
        for j in i:
            knn_pred_t2c_dict[id2topics_dict[idx]].append(id2content_dict[j])

    # idの割り当て
    knn_pred_dict = {k:np.unique(v_c2t+v_t2c) for k,v_c2t, v_t2c in zip(knn_pred_c2t_dict.keys(), knn_pred_c2t_dict.values(), knn_pred_t2c_dict.values())}
    knn_pred_id = {k:[" ".join(v)] for k,v in knn_pred_dict.items()}
    knn_pred_df = pd.DataFrame(knn_pred_id).T.reset_index()
    knn_pred_df.columns = ["topic_id", "content_id"]
    knn_pred_df = knn_pred_df.reset_index(drop=True)

    # candidateの集計結果を出力
    recall = comp_recall_score(correlation_df["content_id"], knn_pred_df["content_id"])
    LOGGER.info(f"recall = {round(recall, 5)}")
    # candidateの集計結果を出力
    f2 = comp_fbeta_score(correlation_df["content_id"], knn_pred_df["content_id"])
    LOGGER.info(f"f2 = {round(f2, 5)}")
    n_bin_data = knn_pred_df["content_id"].apply(lambda x: len(x.split())).sum()
    LOGGER.info(f"n_data = {n_bin_data}")

    # 文字列の候補をlistに変換
    knn_pred_df["content_id"] = knn_pred_df["content_id"].apply(lambda x: x.split(" "))
    cand_df = knn_pred_df.explode("content_id")

    # target作成
    correlation_df_ = correlation_df.copy()
    correlation_df_["content_id"] = correlation_df_["content_id"].apply(lambda x: x.split(" "))
    correlation_df_ = correlation_df_.explode("content_id")
    correlation_df_["target"] = 1
    target_df = pd.DataFrame()
    target_df = pd.merge(cand_df, correlation_df_, on=["topic_id", "content_id"], how="left")
    target_df = target_df["target"].fillna(0).astype(int)

    return cand_df, target_df

def get_feature_df(cand_df, target_df, content_df, topic_df, cv_list, content_title_vec, topic_title_vec):

    # content features
    content_svd_cols = [
        "content_title_vec",
        # "content_desc_vec",
    ]
    content_cat_cols = [
        "content_id",
        "content_kind",
        "content_language",
    ]

    # topic features
    topic_svd_cols = [
        "topic_title_vec",
        # "topic_desc_vec",
    ]
    topic_cat_cols = [
        "topic_id",
        "topic_category",
        "topic_language",
    ]

    content_blocks = [
        # IdentityBlock(use_cols=content_num_cols), 
        # *[TargetEncodingBlock(col=col, 
        #                       func=func, 
        #                       cv_list=cv_list) for col in ["company_id"] for func in ["mean"]], 
        LabelEncodingBlock(cols=content_cat_cols, cfg=cfg), 
        SVDBlock(cols=content_svd_cols, cfg=cfg, dim=32, title_vec=content_title_vec),
        # *[AggBlock(key=key, 
        #             values=numeric_cols, 
        #             funcs=["min", "max", "mean", "sum", "std"]) for key in cat_cols], 
        # *[WrapperBlock(func=func) for func in funcs], 
        ]
    topic_blocks = [
        # IdentityBlock(use_cols=topic_num_cols), 
        # *[TargetEncodingBlock(col=col, 
        #                       func=func, 
        #                       cv_list=cv_list) for col in ["company_id"] for func in ["mean"]], 
        LabelEncodingBlock(cols=topic_cat_cols, cfg=cfg), 
        SVDBlock(cols=topic_svd_cols, cfg=cfg, dim=32, title_vec=topic_title_vec),
        # *[AggBlock(key=key, 
        #             values=numeric_cols, 
        #             funcs=["min", "max", "mean", "sum", "std"]) for key in cat_cols], 
        # *[WrapperBlock(func=func) for func in funcs], 
        ]
    content_feat_df = run_blocks(content_df, blocks=content_blocks, cfg=cfg, test=False)
    topic_feat_df = run_blocks(topic_df, blocks=topic_blocks, cfg=cfg, test=False)

    # content_idとfeatを対応付ける
    content_feat_df = pd.concat([content_df[["content_id"]], content_feat_df], axis=1)
    # topic_idとfeatを対応付ける
    topic_feat_df = pd.concat([topic_df[["topic_id"]], topic_feat_df], axis=1)

    # topicとcontentのfeatをマージ
    train_feat_df = pd.merge(cand_df, content_feat_df, on="content_id", how="left")
    train_feat_df = pd.merge(train_feat_df, topic_feat_df, on="topic_id", how="left")
    train_feat_df = train_feat_df.drop(columns=["topic_id", "content_id"])

    display(train_feat_df.head())
    LOGGER.info(f"n_features: {len(train_feat_df.columns)}")

    return train_feat_df

# Language Model

In [36]:
def create_text_df(input_df):
    output_df = pd.DataFrame({"text":input_df["content_id"] + "</s>" + input_df["content_title"] + "</s>" + input_df["topic_id"] + "</s>" + input_df["topic_title"]}).reset_index(drop=True)
    output_df["target"] = input_df["target"]
    return output_df

In [37]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

class BiEncoderDataset(Dataset):
    def __init__(self, cfg, df, col):
        self.cfg = cfg
        self.text = df[col].to_numpy()
        self.label = df["target"].to_numpy()

    def __len__(self):
        return len(self.label)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        label = self.label[index].astype(np.float32)
        return text, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [38]:
class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.num_classes)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        # all_layer_embeddings = outputs[1]
        # feature = self.weighted_layer_pool(all_layer_embeddings)
        # feature = self.pool(feature, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        # batch, hidden_size
        feature = self.feature(inputs)
        feature = self.ln(feature)
        # feature1 = self.drop1(feature)
        # feature2 = self.drop2(feature)
        # feature3 = self.drop3(feature)
        # feature4 = self.drop4(feature)
        # feature5 = self.drop5(feature)
        # feature = (feature1 + feature2 + feature3 + feature4 + feature5) / 5
        output = self.fc(feature)
        return output.squeeze()



In [39]:
def train_fn(cfg, train_loader, valid_loader, train_df, valid_df, criterion, optimizer, scheduler, model, fold, epoch, best_val_preds, best_val_score):
    LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
    train_losses = []
    train_nums = []
    model.train()
    scaler = GradScaler(enabled=cfg.apex)
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for step, (inputs, labels) in enumerate(pbar):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast(enabled=cfg.apex):
                output = model(inputs)
            loss = criterion(output, labels)

            pbar.set_postfix({
                'loss': loss.item(),
                'lr': scheduler.get_lr()[0]
            })
            train_losses.append(loss.item() * len(labels))
            train_nums.append(len(labels))

            if cfg.gradient_accumulation_steps > 1:
                loss = loss / cfg.gradient_accumulation_steps

            scaler.scale(loss).backward()

            if cfg.clip_grad_norm is not None:
                # scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    cfg.clip_grad_norm
                )

            if (step+1) % cfg.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            if step % cfg.eval_step == 0 and step != 0:
                best_val_preds, best_val_score, val_loss = valid_fn(
                    cfg,
                    valid_loader,
                    valid_df,
                    criterion,
                    model,
                    fold,
                    epoch,
                    step,
                    best_val_preds,
                    best_val_score,
                )
                model.train()

            if cfg.wandb:
                wandb.log({f"[fold{fold}] train_loss": loss.item(),
                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
    train_loss = sum(train_losses)/sum(train_nums)
    return train_loss



def valid_fn(cfg, valid_loader, valid_df, criterion, model, fold, epoch, step, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
            # for (inputs, labels) in valid_loader:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast(enabled=cfg.apex):
                    output = model(inputs)

                loss = criterion(output, labels)
                output = output.detach().cpu().numpy()
                output = sigmoid(output)
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    y_preds = np.where(val_preds>=0.5, 1, 0)
    score = fbeta_score(valid_df["target"], y_preds, beta=2)

    LOGGER.info(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epoch}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if score > best_val_score:
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(),
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )

    return best_val_preds, best_val_score, val_loss


def train_loop(cfg, train_data: pd.DataFrame, cv_list: List, correlation_df: pd.DataFrame):
    """_summary_

    Args:
        cfg (_type_): _description_
        train_data (pd.DataFrame): textとtargetが格納されたデータフレーム
        cv_list (List): _description_
        correlation_df (pd.DataFrame): _description_

    Returns:
        _type_: _description_
    """
    oof_pred = np.zeros((len(train_data)), dtype=np.float32)
    fold_score = []

    for fold in cfg.train_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')

        train_idx, valid_idx = cv_list[fold]
        train_df = train_data.iloc[train_idx].reset_index(drop=True)
        valid_df = train_data.iloc[valid_idx].reset_index(drop=True)

        # Datasetの設定
        train_dataset = BiEncoderDataset(cfg, train_df, "text")
        valid_dataset = BiEncoderDataset(cfg, valid_df, "text")
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        # model = reinit_bert(model)
        # model = replace_mixout(model)
        model = model.to(cfg.device)

        criterion = nn.BCEWithLogitsLoss()

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epoch)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)


        # model-training
        best_val_preds = None
        best_val_score = -np.inf

        for epoch in range(cfg.n_epoch):
            train_loss = train_fn(
                cfg,
                train_loader,
                valid_loader,
                train_df,
                valid_df,
                criterion,
                optimizer,
                scheduler,
                model,
                fold,
                epoch,
                best_val_preds,
                best_val_score
                )

            LOGGER.info(f'Fold{fold}, Epoch{epoch}/{cfg.n_epoch} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss = valid_fn(
                cfg,
                valid_loader,
                valid_df,
                criterion,
                model,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch,
                        f"[fold{fold}] avg_train_loss": train_loss,
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = calc_comp_score(train_data, oof_pred, correlation_df)
    LOGGER.info(f'fold score: {fold_score}')
    LOGGER.info(f'CV: {round(score, 4)}')
    return score

# Setup & Preprocessing

In [40]:
content_df, topic_df, correlation_df, sub_df = get_processed_df()
topic_df = topic_df[topic_df["topic_id"].isin(correlation_df["topic_id"])].reset_index(drop=True)

# 1st stage: Candidate generation by metric learning model

In [41]:
content_df

Unnamed: 0,content_id,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


In [42]:
topic_df

Unnamed: 0,topic_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,topic_has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
2,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
4,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True
...,...,...,...,...,...,...,...,...,...
61512,t_fff830472691,Scalar Projections,,fef095,source,4,en,t_c75d6acecf78,True
61513,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True
61514,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
61515,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True


In [44]:
metric_cfg.tokenizer = AutoTokenizer.from_pretrained(metric_cfg.MODEL_PATH)
metric_cfg.tokenizer.save_pretrained(os.path.join(metric_cfg.OUTPUT_EXP, 'tokenizer'))

cand_df = get_cand_train_df(content_df, correlation_df, "content_id", "content_title", "content_language")
cand_df

Unnamed: 0,content_id,target,text
0,c_1108dd0c7a5d,0,Молив като резистор</s></s>bg
1,c_376c5a8eb028,0,Да чуем променливото съпротивление</s></s>bg
2,c_5bc0e1e2cba0,0,Променлив резистор (реостат) с графит от молив...
3,c_76231f9d0b5e,0,Последователно свързване на галваничен елемент...
4,c_639ea2ef9c95,1,Dados e resultados de funções: gráficos</s></s>pt
...,...,...,...
279914,c_d64037a72376,61513,Introducción: El periódico</s></s>es
279915,c_46f852a49c08,61514,Proof: Right triangles inscribed in circles -d...
279916,c_6659207b25d5,61514,Area of inscribed equilateral triangle -dubbed...
279917,c_cece166bad6a,61515,Juego con las palabras</s></s>es


In [45]:
cand_df

Unnamed: 0,content_id,target,text
0,c_1108dd0c7a5d,0,Молив като резистор</s></s>bg
1,c_376c5a8eb028,0,Да чуем променливото съпротивление</s></s>bg
2,c_5bc0e1e2cba0,0,Променлив резистор (реостат) с графит от молив...
3,c_76231f9d0b5e,0,Последователно свързване на галваничен елемент...
4,c_639ea2ef9c95,1,Dados e resultados de funções: gráficos</s></s>pt
...,...,...,...
279914,c_d64037a72376,61513,Introducción: El periódico</s></s>es
279915,c_46f852a49c08,61514,Proof: Right triangles inscribed in circles -d...
279916,c_6659207b25d5,61514,Area of inscribed equilateral triangle -dubbed...
279917,c_cece166bad6a,61515,Juego con las palabras</s></s>es


In [50]:
count_target_df = cand_df.groupby("target").count().reset_index(drop=True)
count_target_df

Unnamed: 0,content_id,text
0,4,4
1,4,4
2,1,1
3,5,5
4,3,3
...,...,...
61512,2,2
61513,10,10
61514,2,2
61515,1,1


In [54]:
count_target_df[count_target_df["content_id"]<5]

Unnamed: 0,content_id,text
0,4,4
1,4,4
2,1,1
4,3,3
5,2,2
...,...,...
61510,1,1
61512,2,2
61514,2,2
61515,1,1


In [21]:
cand_cv_list = get_StratifiedKFold_list(X=cand_df, y=cand_df["target"], n_splits=metric_cfg.num_fold, seed=metric_cfg.seed)

get_cand_train_loop(metric_cfg, cand_df, cand_cv_list, correlation_df)
# if cfg.upload_from_colab:
#     dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/437 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: fold | val_loss: 11.04667, macro_f1: 1e-05, micro_f1: 0.00182


  0%|          | 0/437 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: fold | val_loss: 10.96144, macro_f1: 7e-05, micro_f1: 0.00552


  0%|          | 0/437 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: fold | val_loss: 10.91538, macro_f1: 0.00013, micro_f1: 0.00736


  0%|          | 0/437 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: fold | val_loss: 10.87201, macro_f1: 0.00029, micro_f1: 0.00881


  0%|          | 0/437 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: fold | val_loss: 10.86653, macro_f1: 0.0003, micro_f1: 0.00897
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/437 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 496.00 MiB (GPU 0; 23.65 GiB total capacity; 18.36 GiB already allocated; 216.06 MiB free; 20.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [25]:
oof_embs = torch.load(os.path.join(cfg.EXP_PREDS, f"oof_embs.pt"))

In [26]:
topic_cand_df = get_cand_test_df(topic_df, "topic_id", "topic_title", "topic_language")

In [27]:
topic_cand_df

Unnamed: 0,topic_id,text
0,t_00004da3a1b2,Откриването на резисторите</s></s>bg
1,t_00068291e9a4,Entradas e saídas de uma função</s></s>pt
2,t_00069b63a70a,Transcripts</s></s>en
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...
4,t_0008768bdee6,100 સુધીનો સરવાળો</s></s>gu
...,...,...
61512,t_fff830472691,Scalar Projections</s></s>en
61513,t_fff9e5407d13,NA_U06 - El periódico</s></s>es
61514,t_fffbe1d5d43c,Inscribed shapes problem solving</s></s>sw
61515,t_fffe14f1be1e,Lección 7</s></s>es


In [28]:
test_dataset = MetricTestDataset(metric_cfg, topic_cand_df, "text")
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=metric_cfg.batch_size * 2,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
    collate_fn = DataCollatorWithPadding(tokenizer=metric_cfg.tokenizer, padding='longest', max_length = metric_cfg.max_len),
)

In [29]:
topic_embs = inference_loop(metric_cfg, topic_cand_df)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/61 [00:00<?, ?it/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/61 [00:00<?, ?it/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/61 [00:00<?, ?it/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/61 [00:00<?, ?it/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/61 [00:00<?, ?it/s]

In [30]:
topic_embs_mean = torch.mean(topic_embs, axis=0)
topic_embs_mean

tensor([[ 0.8846,  0.4066,  0.1353,  ..., -0.1961,  0.0577,  0.8059],
        [-0.0396, -0.1805, -0.6456,  ..., -0.4180,  0.2658,  1.2716],
        [-0.2154, -0.2629, -1.1299,  ..., -0.0743,  0.5674,  0.2498],
        ...,
        [-0.3028,  0.0772,  0.0930,  ...,  0.6486, -0.0965,  1.3615],
        [-0.5364, -0.7496, -0.3863,  ..., -0.3109,  0.5653,  0.6647],
        [ 0.2283,  0.3086, -0.4741,  ...,  0.5658, -0.2876,  0.7634]])

In [31]:
def normalize(x):
    """cos類似度を出すために距離で正規化する"""
    norm = x / torch.norm(torch.tensor(x, dtype=torch.float64), dim=0)
    return norm

In [32]:
oof_embs = normalize(oof_embs)
topic_embs_mean = normalize(topic_embs_mean)

In [50]:
batch_size = 512
mini_batch_embs = oof_embs.split(batch_size)
content_cand_idx = []
topic_cand_idx = []

for i_batch, idx in enumerate(tqdm(range(len(mini_batch_embs)))):
    cos_sim_batch = torch.mm(mini_batch_embs[idx], topic_embs_mean.transpose(0, 1))
    cos_sim_idx =  torch.where(cos_sim_batch>=0.0035)
    print(len(cos_sim_idx[0]))
    break

  0%|          | 0/547 [00:00<?, ?it/s]

11448


In [51]:
batch_size = 512
mini_batch_embs = oof_embs.split(batch_size)
content_cand_idx = []
topic_cand_idx = []

for i_batch, idx in enumerate(tqdm(range(len(mini_batch_embs)))):
    cos_sim_batch = torch.mm(mini_batch_embs[idx], topic_embs_mean.transpose(0, 1))
    cos_sim_idx =  torch.where(cos_sim_batch>=0.0035)
    # print(len(cos_sim_idx[0]))
    # break
    content_cand_idx.append(i_batch * cos_sim_idx[0])
    topic_cand_idx.append(cos_sim_idx[1])

content_cand_idx = torch.concat(content_cand_idx)
topic_cand_idx = torch.concat(topic_cand_idx)

content_cand_idx = content_cand_idx.numpy()
topic_cand_idx = topic_cand_idx.numpy()

del oof_embs, topic_embs_mean
gc.collect()

  0%|          | 0/547 [00:00<?, ?it/s]

18

In [75]:
cand_pred_df = cand_df.iloc[content_cand_idx].copy()
cand_pred_df["pred"] = topic_cand_idx
cand_pred_df = cand_pred_df.reset_index(drop=True)
cand_pred_df

Unnamed: 0,content_id,target,text,pred
0,c_1108dd0c7a5d,0,Молив като резистор</s></s>bg,36938
1,c_1108dd0c7a5d,0,Молив като резистор</s></s>bg,160
2,c_1108dd0c7a5d,0,Молив като резистор</s></s>bg,1047
3,c_1108dd0c7a5d,0,Молив като резистор</s></s>bg,1356
4,c_1108dd0c7a5d,0,Молив като резистор</s></s>bg,2507
...,...,...,...,...
6968673,c_dd739e116435,43479,বই</s></s>bn,57981
6968674,c_dd739e116435,43479,বই</s></s>bn,58099
6968675,c_dd739e116435,43479,বই</s></s>bn,58664
6968676,c_dd739e116435,43479,বই</s></s>bn,60329


In [81]:
tmp = pd.DataFrame(cand_pred_df.groupby("pred")["content_id"].apply(list).apply(" ".join))
tmp

Unnamed: 0_level_0,content_id
pred,Unnamed: 1_level_1
0,c_4b0997fcccec c_afde5bb12ed6 c_45933bf7e6f4
3,c_1108dd0c7a5d c_1108dd0c7a5d c_1108dd0c7a5d c...
5,c_60fb72557c47 c_3edbb195f2b6 c_7f901356eb2d c...
6,c_266ecc5869d1 c_91db720763f8 c_84322f93463f c...
9,c_ed535db13d82 c_91007b8dc12c c_d0ee58f9bfe8
...,...
61506,c_4ae0c554d49a c_fc31afa0b56b
61507,c_a041a78c851c c_01be5ca99fc8
61509,c_6b8c26d055cf c_5fa3d695d16d c_523d541b7cc5 c...
61510,c_f6cd64dcaf18 c_0956a4742b89 c_0236ef635b3d c...


In [85]:
tmp1 = pd.merge(topic_df[["topic_id"]], tmp, left_index=True, right_index=True, how="left")
tmp1 = tmp1.fillna("c_4b0997fcccec")
tmp1

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_4b0997fcccec c_afde5bb12ed6 c_45933bf7e6f4
1,t_00068291e9a4,c_4b0997fcccec
2,t_00069b63a70a,c_4b0997fcccec
3,t_0006d41a73a8,c_1108dd0c7a5d c_1108dd0c7a5d c_1108dd0c7a5d c...
4,t_0008768bdee6,c_4b0997fcccec
...,...,...
61512,t_fff830472691,c_4b0997fcccec
61513,t_fff9e5407d13,c_4b0997fcccec
61514,t_fffbe1d5d43c,c_4b0997fcccec
61515,t_fffe14f1be1e,c_4b0997fcccec


In [86]:
comp_fbeta_score(correlation_df["content_id"], tmp1["content_id"])

7.179417183023556e-05

In [87]:
comp_recall_score(correlation_df["content_id"], tmp1["content_id"])

0.001622451569734322

# 2nd stage: Filtering candidate by GBDT

In [52]:
cv_list = get_StratifiedGroupKFold_list(X=cand_df, y=target_df, groups=cand_df["topic_id"], n_splits=cfg.num_fold, seed=cfg.seed)

train_feat_df = get_feature_df(
    cand_df,
    target_df,
    content_df,
    topic_df,
    cv_list,
    content_title_vec,
    topic_title_vec,
    )

******************** start run blocks... ********************
	- <kaggle_utils.blocks.LabelEncodingBlock object at 0x7fb14c992250> 0.407[s]
	- <kaggle_utils.blocks.SVDBlock object at 0x7fb14c992cd0> 3.172[s]
run test=False 3.603[s]
******************** start run blocks... ********************
	- <kaggle_utils.blocks.LabelEncodingBlock object at 0x7fb14c992a50> 0.154[s]
	- <kaggle_utils.blocks.SVDBlock object at 0x7fb14c9927d0> 1.568[s]
run test=False 1.735[s]


Unnamed: 0,content_id@LabelEncodingBlock,content_kind@LabelEncodingBlock,content_language@LabelEncodingBlock,content_title_vec_0@SVDBlock,content_title_vec_1@SVDBlock,content_title_vec_2@SVDBlock,content_title_vec_3@SVDBlock,content_title_vec_4@SVDBlock,content_title_vec_5@SVDBlock,content_title_vec_6@SVDBlock,content_title_vec_7@SVDBlock,content_title_vec_8@SVDBlock,content_title_vec_9@SVDBlock,content_title_vec_10@SVDBlock,content_title_vec_11@SVDBlock,content_title_vec_12@SVDBlock,content_title_vec_13@SVDBlock,content_title_vec_14@SVDBlock,content_title_vec_15@SVDBlock,content_title_vec_16@SVDBlock,content_title_vec_17@SVDBlock,content_title_vec_18@SVDBlock,content_title_vec_19@SVDBlock,content_title_vec_20@SVDBlock,content_title_vec_21@SVDBlock,content_title_vec_22@SVDBlock,content_title_vec_23@SVDBlock,content_title_vec_24@SVDBlock,content_title_vec_25@SVDBlock,content_title_vec_26@SVDBlock,content_title_vec_27@SVDBlock,content_title_vec_28@SVDBlock,content_title_vec_29@SVDBlock,content_title_vec_30@SVDBlock,content_title_vec_31@SVDBlock,topic_id@LabelEncodingBlock,topic_category@LabelEncodingBlock,topic_language@LabelEncodingBlock,topic_title_vec_0@SVDBlock,topic_title_vec_1@SVDBlock,topic_title_vec_2@SVDBlock,topic_title_vec_3@SVDBlock,topic_title_vec_4@SVDBlock,topic_title_vec_5@SVDBlock,topic_title_vec_6@SVDBlock,topic_title_vec_7@SVDBlock,topic_title_vec_8@SVDBlock,topic_title_vec_9@SVDBlock,topic_title_vec_10@SVDBlock,topic_title_vec_11@SVDBlock,topic_title_vec_12@SVDBlock,topic_title_vec_13@SVDBlock,topic_title_vec_14@SVDBlock,topic_title_vec_15@SVDBlock,topic_title_vec_16@SVDBlock,topic_title_vec_17@SVDBlock,topic_title_vec_18@SVDBlock,topic_title_vec_19@SVDBlock,topic_title_vec_20@SVDBlock,topic_title_vec_21@SVDBlock,topic_title_vec_22@SVDBlock,topic_title_vec_23@SVDBlock,topic_title_vec_24@SVDBlock,topic_title_vec_25@SVDBlock,topic_title_vec_26@SVDBlock,topic_title_vec_27@SVDBlock,topic_title_vec_28@SVDBlock,topic_title_vec_29@SVDBlock,topic_title_vec_30@SVDBlock,topic_title_vec_31@SVDBlock
0,193.0,4.0,5.0,1.389316,0.047603,0.247973,-0.148177,-0.078973,-0.164957,0.089955,-0.093493,-0.196,-0.182812,-0.007289,0.076333,0.163739,-0.035932,-0.112299,0.144413,-0.088069,-0.088248,0.146181,0.07606,0.06989,-0.009008,-0.225929,-0.241469,0.077414,0.052856,-0.055299,-0.031155,-0.000192,-0.226315,-0.275052,-0.052144,0.0,1.0,2.0,1.493977,0.069178,-0.617215,0.026235,0.006523,0.006687,-0.01393,0.061358,-0.088395,-0.100272,0.076393,-0.220994,0.11979,-0.010235,0.064231,-0.039353,-0.039369,-0.1556,0.197728,-0.208907,0.024392,0.151601,-0.265433,-0.161032,0.131595,-0.013024,0.061113,0.013482,-0.066214,-0.006035,0.011505,0.035534
1,393.0,2.0,4.0,1.361919,0.173129,0.384733,0.018061,0.202741,-0.109553,0.050721,0.282141,0.169805,-0.297116,-0.008145,0.435619,-0.05177,-0.248474,0.459221,0.283747,-0.132196,0.254162,-0.125124,0.07867,0.169688,0.350835,0.207834,-0.520057,0.031558,0.377358,-0.205858,-0.016165,-0.251478,-0.090493,0.141762,-0.05692,0.0,1.0,2.0,1.493977,0.069178,-0.617215,0.026235,0.006523,0.006687,-0.01393,0.061358,-0.088395,-0.100272,0.076393,-0.220994,0.11979,-0.010235,0.064231,-0.039353,-0.039369,-0.1556,0.197728,-0.208907,0.024392,0.151601,-0.265433,-0.161032,0.131595,-0.013024,0.061113,0.013482,-0.066214,-0.006035,0.011505,0.035534
2,559.0,4.0,9.0,1.20502,-0.427095,0.267573,0.093998,-0.20621,-0.173879,-0.26845,-0.073673,-0.171081,-0.123754,-0.346732,-0.017599,0.136532,0.028194,-0.137027,0.224937,0.071747,-0.089672,-0.109054,0.162518,0.218402,0.07755,-0.086101,-0.006071,0.102856,-0.067164,-0.158475,-0.109851,0.145781,-0.20713,-0.025419,0.264276,0.0,1.0,2.0,1.493977,0.069178,-0.617215,0.026235,0.006523,0.006687,-0.01393,0.061358,-0.088395,-0.100272,0.076393,-0.220994,0.11979,-0.010235,0.064231,-0.039353,-0.039369,-0.1556,0.197728,-0.208907,0.024392,0.151601,-0.265433,-0.161032,0.131595,-0.013024,0.061113,0.013482,-0.066214,-0.006035,0.011505,0.035534
3,650.0,2.0,2.0,1.521845,-0.379353,0.372153,0.378563,-0.124134,0.017656,0.15766,0.071572,0.023798,-0.048916,-0.123617,0.125467,-0.055241,-0.017257,0.078077,0.152097,-0.002196,0.033948,-0.043684,-0.103326,0.029483,-0.0269,-0.048845,-0.002305,-0.128888,0.110783,0.018756,0.010925,-0.033088,0.050633,-0.017134,0.123261,0.0,1.0,2.0,1.493977,0.069178,-0.617215,0.026235,0.006523,0.006687,-0.01393,0.061358,-0.088395,-0.100272,0.076393,-0.220994,0.11979,-0.010235,0.064231,-0.039353,-0.039369,-0.1556,0.197728,-0.208907,0.024392,0.151601,-0.265433,-0.161032,0.131595,-0.013024,0.061113,0.013482,-0.066214,-0.006035,0.011505,0.035534
4,1725.0,1.0,4.0,1.40068,-0.060753,0.252589,-0.257426,0.033496,-0.526633,-0.325087,0.389998,-0.154675,-0.337931,-0.670461,-0.271031,0.065209,0.17476,0.272669,-0.069219,-0.283027,-0.025453,-0.417692,-0.295723,0.183125,0.027462,-0.098067,-0.031495,0.097409,0.036053,-0.03467,-0.11358,0.047912,0.05361,-0.434853,0.092321,0.0,1.0,2.0,1.493977,0.069178,-0.617215,0.026235,0.006523,0.006687,-0.01393,0.061358,-0.088395,-0.100272,0.076393,-0.220994,0.11979,-0.010235,0.064231,-0.039353,-0.039369,-0.1556,0.197728,-0.208907,0.024392,0.151601,-0.265433,-0.161032,0.131595,-0.013024,0.061113,0.013482,-0.066214,-0.006035,0.011505,0.035534


n_features: 70


In [53]:
filepath = os.path.join(cfg.EXP_PREDS, "oof_2nd.pkl")
if os.path.isfile(filepath):
    oof = pickle.load(open(filepath, "rb"))
else:
    oof, models = train_cv(cfg, train_feat_df, target_df, cv_list, metrics_dict=metrics_dict)
    pickle.dump(oof, open(filepath, "wb"))



[0]	validation_0-logloss:0.44616
[500]	validation_0-logloss:0.02809
[1000]	validation_0-logloss:0.02580
[1500]	validation_0-logloss:0.02499
[1998]	validation_0-logloss:0.02478


f2_score: 0.3036


[0]	validation_0-logloss:0.44604
[500]	validation_0-logloss:0.02745
[1000]	validation_0-logloss:0.02528
[1500]	validation_0-logloss:0.02445
[2000]	validation_0-logloss:0.02422
[2004]	validation_0-logloss:0.02421


f2_score: 0.30807


[0]	validation_0-logloss:0.44621
[500]	validation_0-logloss:0.02884
[1000]	validation_0-logloss:0.02665
[1500]	validation_0-logloss:0.02595
[2000]	validation_0-logloss:0.02565
[2080]	validation_0-logloss:0.02567


f2_score: 0.3103


[0]	validation_0-logloss:0.44613
[500]	validation_0-logloss:0.02821
[1000]	validation_0-logloss:0.02624
[1500]	validation_0-logloss:0.02542
[2000]	validation_0-logloss:0.02511
[2294]	validation_0-logloss:0.02509


f2_score: 0.32437


[0]	validation_0-logloss:0.44608
[500]	validation_0-logloss:0.02814
[1000]	validation_0-logloss:0.02611
[1500]	validation_0-logloss:0.02539
[1996]	validation_0-logloss:0.02518


f2_score: 0.30403
Fold4 f2_score: 0.31013


In [70]:
oof_preds = np.where(oof>=1e-3, 1, 0)
reduced_pred_df = cand_df[oof_preds==1]
calc_pred_df = reduced_pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
calc_pred_df = pd.merge(correlation_df[["topic_id"]], calc_pred_df, on="topic_id", how="left")
calc_pred_df = calc_pred_df.fillna("nan")

reduced_recall_score = comp_recall_score(correlation_df["content_id"], calc_pred_df["content_id"])
LOGGER.info(f"Filtering by GBDT: \n recall = {np.round(reduced_recall_score, 5)} \n n_data = {len(reduced_pred_df)}")

Filtering by GBDT: 
 recall = 0.50745 
 n_data = 4000258


# 3rd stage: Matching by Transformer

In [23]:
oof_preds = np.where(oof>=1e-2, 1, 0)
train_filtered_df = cand_df.copy()
train_filtered_df["target"] = target_df.to_numpy()
train_filtered_df = train_filtered_df[oof_preds==1].reset_index(drop=True)

train_filtered_df = pd.merge(train_filtered_df, content_df, on="content_id", how="left")
train_filtered_df = pd.merge(train_filtered_df, topic_df, on="topic_id", how="left")

In [32]:
train_text_df = create_text_df(train_filtered_df)
train_text_df = pd.concat([train_text_df, train_filtered_df[["topic_id", "content_id"]]], axis=1)
train_text_df

Unnamed: 0,text,target,topic_id,content_id
0,c_0122f3ff5d19</s>Връхни ъгли</s>t_00004da3a1b...,0,t_00004da3a1b2,c_0122f3ff5d19
1,c_21f75cfb89da</s>Развивки на многостен</s>t_0...,0,t_00004da3a1b2,c_21f75cfb89da
2,c_247ee1c26c75</s>Синтез на заместени бензенов...,0,t_00004da3a1b2,c_247ee1c26c75
3,c_431a13312468</s>Успоредни резистори (част 2)...,0,t_00004da3a1b2,c_431a13312468
4,c_6334607e3816</s>Намиране на липсващи ъгли</s...,0,t_00004da3a1b2,c_6334607e3816
...,...,...,...,...
850896,c_e40fd6243440</s>Texto informativo: la notici...,0,t_fff9e5407d13,c_e40fd6243440
850897,c_14aa105dc884</s>يحسب الإحداثيّات القطبية لنق...,0,t_fffe811a6da9,c_14aa105dc884
850898,c_157e2611928f</s>يحسب الإحداثيّات الديكارتيّة...,0,t_fffe811a6da9,c_157e2611928f
850899,c_384c6789d404</s>Производни на полярни функци...,0,t_fffe811a6da9,c_384c6789d404


In [28]:
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))

bi_cv_list = get_StratifiedGroupKFold_list(X=train_text_df, y=train_text_df["target"], groups=train_filtered_df["topic_id"], n_splits=cfg.num_fold, seed=cfg.seed)

score = train_loop(cfg, train_text_df, bi_cv_list, correlation_df)

# if cfg.upload_from_colab:
#     dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1331 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
oof_pred = np.load(os.path.join(cfg.EXP_PREDS, "oof_pred.npy"))

In [None]:
score = calc_comp_score(train_filtered_df, sigmoid(oof_pred), correlation_df, thr=0.05)
# LOGGER.info(f'fold score: {fold_score}')
print(f'CV: {round(score, 4)}')

CV: 0.3795


In [None]:
def calc_comp_recall(train_df, oof, correlation_df, thr=0.1):
    """2値分類の予測からcvを計算する"""
    oof_preds = np.where(oof>=thr, 1, 0)
    pred_df = train_df[oof_preds==1]
    pred_df = pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
    pred_df = pd.merge(correlation_df[["topic_id"]], pred_df, on="topic_id", how="left")
    pred_df = pred_df.fillna("nan")

    cv_score = comp_recall_score(correlation_df["content_id"], pred_df["content_id"])
    return cv_score

In [None]:
score = calc_comp_recall(train_filtered_df, sigmoid(oof_pred), correlation_df, thr=0.05)
# LOGGER.info(f'fold score: {fold_score}')
print(f'CV: {round(score, 4)}')

CV: 0.4015


In [7]:
if cfg.upload_from_colab:
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

Starting upload for file XGBoost_fold_4.pkl


100%|██████████| 6.82M/6.82M [00:05<00:00, 1.29MB/s]


Upload successful: XGBoost_fold_4.pkl (7MB)
Starting upload for file XGBoost_fold_2.pkl


100%|██████████| 8.46M/8.46M [00:04<00:00, 1.87MB/s]


Upload successful: XGBoost_fold_2.pkl (8MB)
Starting upload for file tokenizer.tar


100%|██████████| 21.1M/21.1M [00:06<00:00, 3.43MB/s]


Upload successful: tokenizer.tar (21MB)
Starting upload for file topic_title_vec_svd_dict.pkl


100%|██████████| 97.0k/97.0k [00:02<00:00, 38.9kB/s]


Upload successful: topic_title_vec_svd_dict.pkl (97KB)
Starting upload for file XGBoost_fold_1.pkl


100%|██████████| 7.84M/7.84M [00:04<00:00, 1.72MB/s]


Upload successful: XGBoost_fold_1.pkl (8MB)
Starting upload for file preds.tar


100%|██████████| 103M/103M [00:20<00:00, 5.36MB/s] 


Upload successful: preds.tar (103MB)
Starting upload for file XGBoost_fold_0.pkl


100%|██████████| 7.91M/7.91M [00:05<00:00, 1.59MB/s]


Upload successful: XGBoost_fold_0.pkl (8MB)
Starting upload for file model.tar


100%|██████████| 5.19G/5.19G [08:50<00:00, 10.5MB/s]   


Upload successful: model.tar (5GB)
Starting upload for file content_id_content_kind_content_language_oe.pkl
