In [1]:
! nvidia-smi

Mon Jan  2 11:03:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
| 32%   50C    P5   115W / 480W |   2435MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import re
import gc
import pdb
import sys
import json
import time
import wandb
import pickle
import shutil
import joblib
import random
import requests
import warnings
from glob import glob
from typing import List
from pathlib import Path
from tqdm.auto import tqdm
from pandarallel import pandarallel

import scipy
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error, f1_score, fbeta_score, recall_score, precision_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb
import lightgbm as lgb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import tokenizers
import sentencepiece
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW
from transformers import logging

from cuml import NearestNeighbors
from kaggle.api.kaggle_api_extended import KaggleApi

sys.path.append("/home/working/")
from kaggle_utils.utils import Timer, reduce_mem_usage, get_logger, decorate, setup, dataset_create_new
from kaggle_utils.blocks import AbstractBaseBlock, IdentityBlock, LabelEncodingBlock, SVDBlock, run_blocks
from kaggle_utils.exp_manage import set_wandb
from kaggle_nlp_utils.preprocessing import resolve_encodings_and_normalize
from kaggle_nlp_utils.model import (
    freeze, 
    AttentionPooling, 
    MeanPooling, 
    WeightedLayerPooling, 
    replace_mixout, 
    reinit_bert,
)
from kaggle_nlp_utils.optimizer import (
    get_scheduler,
    get_optimizer_grouped_parameters,
)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 300)
pandarallel.initialize(progress_bar=True)
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')

logging.set_verbosity_error()
logging.set_verbosity_warning()

%load_ext autoreload
%autoreload 2
%env TOKENIZERS_PARALLELISM=true

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


env: TOKENIZERS_PARALLELISM=true


In [3]:
def fbeta_wrapper(y_true, y_pred):
    beta = 2
    return fbeta_score(y_true, y_pred, beta)
    
class Config:
    AUTHOR = "shu421"

    EXP = "exp002"
    EMB_MODEL_PATH = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    MODEL_PATH = "xlm-roberta-base"
    COMPETITION = "learning-equality-curriculum-recommendations"
    BASE_PATH = "/home/working/"
    api_path = "/.kaggle/kaggle.json"

    # Language Model Config
    # train
    apex=True
    seed = 42
    num_fold = 5
    train_fold = [0, 1, 2, 3, 4,]
    batch_size = 512
    n_epoch = 5
    max_len = 512
    n_class = 1
    
    # optimizer, scheduler
    weight_decay = 0.01
    scheduler="cosine"
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    lr_weight_decay = 0.95
    
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 500
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1

    # weight and bias
    wandb = False
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

    upload_from_colab = True

    # GBDT
    gbdt_model = "XGBoost"
    stopping_rounds = 50
    log_evaluation = 500
    model_params = {
        "objective": "binary:logistic",
        # "eval_metric": fbeta_wrapper,
        "eval_metric": "logloss",
        "learning_rate": 0.3,
        "tree_method": "gpu_hist",
        "random_state": seed,
        "n_estimators": 99999,
    }
    train_params = {
        "verbose": log_evaluation,
    }

# setup
cfg = setup(Config)
LOGGER = get_logger(cfg.OUTPUT_EXP)
log_filepath = os.path.join(cfg.OUTPUT_EXP, f"{cfg.EXP}.log")
if os.path.isfile():
    with open(log_filepath) as f:
        pass
    f.close()
if cfg.wandb:
    run = set_wandb(cfg)

# Metrics

In [4]:
def comp_fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        precision = len(TP) / (len(pred))
        recall = len(TP) / len(true)
        f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        score_list.append(f2)
    score = sum(score_list) / len(score_list)
    return score

def comp_recall_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        recall = len(TP) / len(true)
        
        score_list.append(recall)
    score = sum(score_list) / len(score_list)
    return score

def calc_cv(train_df, oof, correlation_df, thr=0.1):
    """2値分類の予測からcvを計算する"""
    oof_preds = np.where(oof>=thr, 1, 0)
    pred_df = train_df[oof_preds==1]
    pred_df = pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
    pred_df = pd.merge(correlation_df[["topic_id"]], pred_df, on="topic_id", how="left")
    pred_df = pred_df.fillna("nan")

    cv_score = comp_fbeta_score(correlation_df["content_id"], pred_df["content_id"])
    return cv_score

In [7]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

class BiEncoderDataset(Dataset):
    def __init__(self, cfg, df, col):
        self.cfg = cfg
        self.text = df[col].to_numpy()
        self.label = df["target"].to_numpy()
    
    def __len__(self):
        return len(self.label)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        label = self.label[index].astype(np.float32)
        return text, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Language Model

In [8]:
class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.n_class)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        # all_layer_embeddings = outputs[1]
        # feature = self.weighted_layer_pool(all_layer_embeddings)
        # feature = self.pool(feature, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        # batch, hidden_size
        feature = self.feature(inputs)
        feature = self.ln(feature)
        # feature1 = self.drop1(feature)
        # feature2 = self.drop2(feature)
        # feature3 = self.drop3(feature)
        # feature4 = self.drop4(feature)
        # feature5 = self.drop5(feature)
        # feature = (feature1 + feature2 + feature3 + feature4 + feature5) / 5
        output = self.fc(feature)
        return output.squeeze()



In [10]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [11]:
def train_fn(cfg, train_loader, valid_loader, train_df, valid_df, criterion, optimizer, scheduler, model, fold, epoch, best_val_preds, best_val_score):
    LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
    train_losses = []
    train_nums = []
    model.train()
    scaler = GradScaler(enabled=cfg.apex)
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for step, (inputs, labels) in enumerate(pbar):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast(enabled=cfg.apex):
                output = model(inputs)
            loss = criterion(output, labels)

            pbar.set_postfix({
                'loss': loss.item(),
                'lr': scheduler.get_lr()[0]
            })
            train_losses.append(loss.item() * len(labels))
            train_nums.append(len(labels))

            if cfg.gradient_accumulation_steps > 1:
                loss = loss / cfg.gradient_accumulation_steps

            scaler.scale(loss).backward()

            #  # FGM attack
            # fgm.attack()
            # with autocast(enabled=cfg.apex):
            #     loss_adv, _ = model(inputs, labels)
            # scaler.scale(loss_adv).backward()
            # fgm.restore()

            if cfg.clip_grad_norm is not None:
                # scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    cfg.clip_grad_norm
                )

            if (step+1) % cfg.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            if step % cfg.eval_step == 0 and step != 0:
                best_val_preds, best_val_score, val_loss = valid_fn(
                    cfg,
                    valid_loader,
                    valid_df,
                    criterion,
                    model,
                    fold,
                    epoch,
                    step,
                    best_val_preds,
                    best_val_score,
                )
                model.train()

            if cfg.wandb:
                wandb.log({f"[fold{fold}] train_loss": loss.item(),
                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
    train_loss = sum(train_losses)/sum(train_nums)
    return train_loss



def valid_fn(cfg, valid_loader, valid_df, criterion, model, fold, epoch, step, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
            # for (inputs, labels) in valid_loader:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast(enabled=cfg.apex):
                    output = model(inputs)

                loss = criterion(output, labels)
                output = output.detach().cpu().numpy()
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    y_preds = np.where(val_preds>=0.5, 1, 0)
    score = fbeta_score(valid_df["target"], y_preds, beta=2)

    LOGGER.info(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epoch}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if best_val_score > score:
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(),
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )

    return best_val_preds, best_val_score, val_loss


def train_loop(cfg, train_data: pd.DataFrame, cv_list: List, correlation_df: pd.DataFrame):
    """_summary_

    Args:
        cfg (_type_): _description_
        train_data (pd.DataFrame): textとtargetが格納されたデータフレーム
        cv_list (List): _description_
        correlation_df (pd.DataFrame): _description_

    Returns:
        _type_: _description_
    """
    oof_pred = np.zeros((len(train_data)), dtype=np.float32)
    fold_score = []

    for fold in cfg.train_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')

        train_idx, valid_idx = cv_list[fold]
        train_df = train_data.iloc[train_idx].reset_index(drop=True)
        valid_df = train_data.iloc[valid_idx].reset_index(drop=True)

        # Datasetの設定
        train_dataset = BiEncoderDataset(cfg, train_df, "text")
        valid_dataset = BiEncoderDataset(cfg, valid_df, "text")
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        # model = reinit_bert(model)
        # model = replace_mixout(model)
        model = model.to(cfg.device)

        criterion = nn.BCEWithLogitsLoss()

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epoch)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)


        # # enable FGM
        # fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999

        for epoch in range(cfg.n_epoch):
            # # training
            # LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
            # train_losses = []
            # train_nums = []
            # model.train() 
            # scaler = GradScaler(enabled=cfg.apex)
            # with tqdm(train_loader, total=len(train_loader)) as pbar:
            #     for step, (inputs, labels) in enumerate(pbar):
            #         inputs = collate(inputs)
            #         for k, v in inputs.items():
            #             inputs[k] = v.to(cfg.device)
            #         labels = labels.to(cfg.device)
            #         with autocast(enabled=cfg.apex):
            #             output = model(inputs)
            #         loss = criterion(output, labels)

            #         pbar.set_postfix({
            #             'loss': loss.item(),
            #             'lr': scheduler.get_lr()[0]
            #         })
            #         train_losses.append(loss.item() * len(labels))
            #         train_nums.append(len(labels))

            #         if cfg.gradient_accumulation_steps > 1:
            #             loss = loss / cfg.gradient_accumulation_steps

            #         scaler.scale(loss).backward()

            #         #  # FGM attack
            #         # fgm.attack()
            #         # with autocast(enabled=cfg.apex):
            #         #     loss_adv, _ = model(inputs, labels)
            #         # scaler.scale(loss_adv).backward()
            #         # fgm.restore()

            #         if cfg.clip_grad_norm is not None:
            #             # scaler.unscale_(optimizer)
            #             torch.nn.utils.clip_grad_norm_(
            #                 model.parameters(),
            #                 cfg.clip_grad_norm
            #             )

            #         if (step+1) % cfg.gradient_accumulation_steps == 0:
            #             scaler.step(optimizer)
            #             scaler.update()
            #             optimizer.zero_grad()
            #             scheduler.step()

            #         if step % cfg.eval_step == 0 and step != 0:
            #             best_val_preds, best_val_score, val_loss = valid_fn(
            #                 cfg,
            #                 valid_loader,
            #                 valid_df,
            #                 criterion,
            #                 model,
            #                 fold,
            #                 epoch,
            #                 step,
            #                 best_val_preds,
            #                 best_val_score,
            #             )
            #             model.train()

            #         if cfg.wandb:
            #             wandb.log({f"[fold{fold}] train_loss": loss.item(),
            #                     f"[fold{fold}] lr": scheduler.get_lr()[0]})

            # train_loss = sum(train_losses)/sum(train_nums)
            train_loss = train_fn(
                cfg,
                train_loader,
                valid_loader,
                train_df,
                valid_df,
                criterion,
                optimizer,
                scheduler,
                model,
                fold,
                epoch,
                best_val_preds,
                best_val_score
                )

            LOGGER.info(f'Fold{fold}, Epoch{epoch}/{cfg.n_epoch} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss = valid_fn(
                cfg,
                valid_loader,
                valid_df,
                criterion,
                model,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch,
                        f"[fold{fold}] avg_train_loss": train_loss,
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = calc_cv(train_data, oof_pred, correlation_df)
    LOGGER.info(f'fold score: {fold_score}')
    LOGGER.info(f'CV: {round(score, 4)}')
    return score

# Embedding

In [12]:
class EmbDataSet(Dataset):
    def __init__(self, cfg, df, col, tokenizer):
        self.cfg = cfg
        df[col] = df[col].fillna("no text")
        self.text = df[col].to_numpy()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        return text

    @staticmethod
    def prepare_input(self, cfg, text):
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

In [13]:
def get_emb_vec(content_df, topic_df, col):
    # prepare model for getting embeddings
    model = AutoModel.from_pretrained(cfg.EMB_MODEL_PATH)
    model.eval()
    model.to(cfg.device)
    tokenizer = AutoTokenizer.from_pretrained(cfg.EMB_MODEL_PATH)

    content_dataset = EmbDataSet(cfg, content_df, "content_"+col, tokenizer)
    topics_dataset = EmbDataSet(cfg, topic_df, "topic_"+col, tokenizer)

    content_loader = DataLoader(
        dataset=content_dataset,
        batch_size=256,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    topics_loader = DataLoader(
        dataset=topics_dataset,
        batch_size=256,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    if not os.path.isfile(os.path.join(cfg.OUTPUT_EXP, f"content_{col}_vec.pkl")):
        content_vec = []
        for step, contents in enumerate(tqdm(content_loader)):
            contents = collate(contents)
            for k, v in contents.items():
                contents[k] = v.to(cfg.device)
            with torch.no_grad():
                output = model(**contents)
            vec = output.last_hidden_state.mean(1).cpu().numpy()
            content_vec.append(vec)
        content_vec = np.concatenate(content_vec)
        pickle.dump(content_vec, open(os.path.join(cfg.OUTPUT_EXP, f"content_{col}_vec.pkl"), "wb"))
    else:
        content_vec = pickle.load(open(os.path.join(cfg.OUTPUT_EXP, f"content_{col}_vec.pkl"), "rb"))

    if not os.path.isfile(os.path.join(cfg.OUTPUT_EXP, f"topic_{col}_vec.pkl")):
        topic_vec = []
        for step, topics in enumerate(tqdm(topics_loader)):
            topics = collate(topics)
            for k, v in topics.items():
                topics[k] = v.to(cfg.device)
            with torch.no_grad():
                output = model(**topics)
            vec = output.last_hidden_state.mean(1).cpu().numpy()
            topic_vec.append(vec)
        topic_vec = np.concatenate(topic_vec)
        pickle.dump(topic_vec, open(os.path.join(cfg.OUTPUT_EXP, f"topic_{col}_vec.pkl"), "wb"))
    else:
        topic_vec = pickle.load(open(os.path.join(cfg.OUTPUT_EXP, f"topic_{col}_vec.pkl"), "rb"))

    del model
    torch.cuda.empty_cache()
    gc.collect()

    return content_vec, topic_vec

# GBDT

In [14]:
class XGBoost:
    def __init__(self, model_params: dict, train_params: dict):
        self.model_params = model_params
        self.train_params = train_params
        
    def fit(self, X_train, y_train, X_valid, y_valid):
        self.model = xgb.XGBClassifier(
            **self.model_params,
            callbacks=[
                xgb.callback.EarlyStopping(
                    rounds=cfg.stopping_rounds,
                    save_best=True,
                    maximize=False,
                    ),
                # xgb.callback.EvaluationMonitor(
                #     period=cfg.log_evaluation
                #     ),
            ],
            )

        self.model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            **self.train_params
            )

    def predict(self, features):
        return self.model.predict_proba(features)

    

class LightGBM:
    def __init__(self, model_params: dict, train_params: dict):
        self.model_params = model_params
        self.train_params = train_params
        
    def fit(self, X_train, y_train, X_valid, y_valid):
        d_train = lgb.Dataset(
            X_train, 
            label=y_train
            )

        d_valid = lgb.Dataset(
            X_valid, 
            label=y_valid
            )
        self.model = lgb.train(
            params=self.model_params,
            train_set=d_train,
            valid_sets=[d_train, d_valid],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=True),
                lgb.log_evaluation(1000)],
            **self.train_params)

    def predict(self, features):
        return self.model.predict(features)
    
def get_model(gbdt_model):
    if gbdt_model == 'LightGBM':
        model = LightGBM(
            model_params=Config.model_params, 
            train_params=Config.train_params
            )
    elif gbdt_model == 'XGBoost':
        model = XGBoost(
            model_params=Config.model_params, 
            train_params=Config.train_params
            )
    return model


def save_model(filepath, model):
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)    

def load_model(filepath):
    with open(filepath, 'rb') as f:
        model = pickle.load(f)
    return model  

In [15]:
def get_whole_df():
    content_df = pd.read_csv(os.path.join(cfg.INPUT, 'content.csv'))
    topic_df = pd.read_csv(os.path.join(cfg.INPUT, 'topics.csv'))
    correlation_df = pd.read_csv(os.path.join(cfg.INPUT, 'correlations.csv'))
    sub_df = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))
    return content_df, topic_df, correlation_df, sub_df

def preprocess_df(content_df, topic_df, correlation_df):
    content_df = content_df.drop(columns=["license", "copyright_holder"])
    content_df = content_df.add_prefix("content_")
    topic_df = topic_df.add_prefix("topic_")
    correlation_df = correlation_df.rename(columns={"content_ids":"content_id"})
    return content_df, topic_df, correlation_df

def get_cv_list(X, y=None, groups=None, n_splits=5, seed=42):
    """cv_listを取得"""
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # cv = GroupKFold(n_splits=n_splits)
    cv_list = list(cv.split(X, y, groups))
    return cv_list

def get_processed_df():
    content_df, topic_df, correlation_df, sub_df = get_whole_df()
    content_df, topic_df, correlation_df = preprocess_df(content_df, topic_df, correlation_df)
    return content_df, topic_df, correlation_df, sub_df

In [16]:
def get_cand_df(content_df, topic_df, correlation_df, content_title_vec, topics_title_vec):
    """
    knnを使用してtopicに対するcontentの候補を出力する関数
    """
    id2content_dict = dict(content_df["content_id"])
    id2topics_dict = dict(topic_df["topic_id"])

    # contentがどのtopicにマッチするかを予測
    knn_model = NearestNeighbors(n_neighbors=40, metric="cosine")
    knn_model.fit(topics_title_vec)
    distances, indices = knn_model.kneighbors(content_title_vec)
    knn_pred_c2t_dict = {k:[] for k in topic_df["topic_id"]}
    for idx, i in enumerate(indices):
        for j in i:
            knn_pred_c2t_dict[id2topics_dict[j]].append(id2content_dict[idx])

    # topicがどのcontentにマッチするかを予測
    knn_model = NearestNeighbors(n_neighbors=20, metric="cosine")
    knn_model.fit(content_title_vec)
    distances, indices = knn_model.kneighbors(topics_title_vec)
    knn_pred_t2c_dict = {k:[] for k in topic_df["topic_id"]}
    for idx, i in enumerate(indices):
        for j in i:
            knn_pred_t2c_dict[id2topics_dict[idx]].append(id2content_dict[j])

    # idの割り当て
    knn_pred_dict = {k:np.unique(v_c2t+v_t2c) for k,v_c2t, v_t2c in zip(knn_pred_c2t_dict.keys(), knn_pred_c2t_dict.values(), knn_pred_t2c_dict.values())}
    knn_pred_id = {k:[" ".join(v)] for k,v in knn_pred_dict.items()}
    knn_pred_df = pd.DataFrame(knn_pred_id).T.reset_index()
    knn_pred_df.columns = ["topic_id", "content_id"]
    knn_pred_df = knn_pred_df[knn_pred_df["topic_id"].isin(correlation_df["topic_id"].to_list())] # trainに入っているtopicを抽出
    knn_pred_df = knn_pred_df.reset_index(drop=True)

    # candidateの集計結果を出力
    recall = comp_recall_score(correlation_df["content_id"], knn_pred_df["content_id"])
    LOGGER.info(f"recall = {round(recall, 5)}")
    # candidateの集計結果を出力
    f2 = comp_fbeta_score(correlation_df["content_id"], knn_pred_df["content_id"])
    LOGGER.info(f"f2 = {round(f2, 5)}")
    n_bin_data = knn_pred_df["content_id"].apply(lambda x: len(x.split())).sum()
    LOGGER.info(f"n_data = {n_bin_data}")

    # 文字列の候補をlistに変換
    knn_pred_df["content_id"] = knn_pred_df["content_id"].apply(lambda x: x.split(" "))
    cand_df = knn_pred_df.explode("content_id")

    # target作成
    correlation_df_ = correlation_df.copy()
    correlation_df_["content_id"] = correlation_df_["content_id"].apply(lambda x: x.split(" "))
    correlation_df_ = correlation_df_.explode("content_id")
    correlation_df_["target"] = 1
    target_df = pd.DataFrame()
    target_df = pd.merge(cand_df, correlation_df_, on=["topic_id", "content_id"], how="left")
    target_df = target_df["target"].fillna(0).astype(int)

    return cand_df, target_df

def get_feature_df(cand_df, target_df, correlation_df, content_df, topic_df, cv_list, vecs):
    content_title_vec, topic_title_vec, content_desc_vec, topic_desc_vec = vecs

    # content features
    content_svd_cols = [
        "content_title_vec",
        "content_desc_vec",
    ]
    content_cat_cols = [
        "content_id",
        "content_kind",
        "content_language",
    ]

    # topic features
    topic_svd_cols = [
        "topic_title_vec",
        "topic_desc_vec",
    ]
    topic_cat_cols = [
        "topic_id",
        "topic_category",
        "topic_language",
    ]

    content_blocks = [
        # IdentityBlock(use_cols=content_num_cols), 
        # *[TargetEncodingBlock(col=col, 
        #                       func=func, 
        #                       cv_list=cv_list) for col in ["company_id"] for func in ["mean"]], 
        LabelEncodingBlock(cols=content_cat_cols, cfg=cfg), 
        SVDBlock(cols=content_svd_cols, cfg=cfg, dim=32, title_vec=content_title_vec, desc_vec=content_desc_vec),
        # *[AggBlock(key=key, 
        #             values=numeric_cols, 
        #             funcs=["min", "max", "mean", "sum", "std"]) for key in cat_cols], 
        # *[WrapperBlock(func=func) for func in funcs], 
        ]
    topic_blocks = [
        # IdentityBlock(use_cols=topic_num_cols), 
        # *[TargetEncodingBlock(col=col, 
        #                       func=func, 
        #                       cv_list=cv_list) for col in ["company_id"] for func in ["mean"]], 
        LabelEncodingBlock(cols=topic_cat_cols, cfg=cfg), 
        SVDBlock(cols=topic_svd_cols, cfg=cfg, dim=32, title_vec=topic_title_vec, desc_vec=topic_desc_vec),
        # *[AggBlock(key=key, 
        #             values=numeric_cols, 
        #             funcs=["min", "max", "mean", "sum", "std"]) for key in cat_cols], 
        # *[WrapperBlock(func=func) for func in funcs], 
        ]
    content_feat_df = run_blocks(content_df, blocks=content_blocks, cfg=cfg, test=False)
    topic_feat_df = run_blocks(topic_df, blocks=topic_blocks, cfg=cfg, test=False)

    # content_idとfeatを対応付ける
    content_feat_df = pd.concat([content_df[["content_id"]], content_feat_df], axis=1)
    # topic_idとfeatを対応付ける
    topic_feat_df = pd.concat([topic_df[["topic_id"]], topic_feat_df], axis=1)

    # topicとcontentのfeatをマージ
    train_feat_df = pd.merge(cand_df, content_feat_df, on="content_id", how="left")
    train_feat_df = pd.merge(train_feat_df, topic_feat_df, on="topic_id", how="left")
    train_feat_df = train_feat_df.drop(columns=["topic_id", "content_id"])

    display(train_feat_df.head())
    LOGGER.info(f"n_features: {len(train_feat_df.columns)}")

    return train_feat_df

In [17]:
def metrics_dict(y_true, y_pred):
    metrics_dict_ = {}
    # metrics_dict_['MAPE'] = mean_absolute_percentage_error(y_true, y_pred) 
    # metrics_dict_['MAE'] = mean_absolute_error(y_true, y_pred)
    # metrics_dict_['RMSE'] = mean_squared_error(y_true, y_pred, squared=False)
    # metrics_dict_['f1_micro'] = f1_score(y_true, y_pred, average='micro')
    # metrics_dict_['auc'] = roc_auc_score(y_true, y_pred)
    # metrics_dict_["f1_score"] = f1_score(y_true, y_pred)
    metrics_dict_["f2_score"] = fbeta_score(y_true, y_pred, beta=2)
    # metrics_dict_["recall"] = recall_score(y_true, y_pred)

    return metrics_dict_

def train_cv(train_feat_df, target_df, cv_list, metrics_dict):
    """交差検証を実行"""
    oof = np.zeros(len(train_feat_df))
    valid_idxes = []
    models = []

    for i_fold, (train_idx, valid_idx) in enumerate(cv_list):
        LOGGER.info(decorate(f'Fold{i_fold}', decoration='=='))
        filepath = os.path.join(cfg.OUTPUT_EXP, f"{cfg.gbdt_model}_fold_{i_fold}.pkl")

        X_train = train_feat_df.iloc[train_idx].to_numpy()
        X_valid = train_feat_df.iloc[valid_idx].to_numpy()
        y_train = target_df.iloc[train_idx].to_numpy()
        y_valid = target_df.iloc[valid_idx].to_numpy()

        model = get_model(cfg.gbdt_model)
        model.fit(X_train, y_train, X_valid, y_valid)
        save_model(filepath, model)

        model = load_model(filepath)
        models.append(model)
        y_prob = model.predict(X_valid)
        y_prob = y_prob[:, 1].squeeze()
        # y_preds_ = np.where(y_prob>=0.5, 1, 0)
        y_preds_ = np.where(y_prob>=0.5, 1, 0)
        metrics_dict_scored = metrics_dict(y_valid, y_preds_)

        for key in metrics_dict_scored.keys():
            LOGGER.info(f"{key}: {np.round(metrics_dict_scored[key], 5)}")
        oof[valid_idx] = y_prob

    LOGGER.info(decorate('OOF'))
    oof_ = np.where(oof>=0.5, 1, 0)
    metrics_dict_scored = metrics_dict(target_df, oof_)
    for key in metrics_dict_scored.keys():
        LOGGER.info(f"Fold{i_fold} {key}: {np.round(metrics_dict_scored[key], 5)}")

    pickle.dump(oof, open(os.path.join(cfg.EXP_PREDS, "oof.pkl"), "wb"))
    return oof, models


def predict_cv(test_feat_df):
    """Inference"""
    prob_folds = []
    
    for i_fold in range(Config.n_fold):
        filepath = os.path.join(cfg.OUTPUT_EXP, f"{cfg.gbdt_model}_fold_{i_fold}.pkl")
        model = load_model(filepath)
        y_prob = model.predict(test_feat_df)
        preds = np.where(y_prob>=0.5, 1, 0)
        prob_folds.append(y_prob)
    
    pickle.dump(prob_folds, open(os.path.join(cfg.EXP_PREDS, "prob_folds.pkl"), "wb"))
    return prob_folds

In [18]:
def create_text_df(input_df):
    output_df = pd.DataFrame({"text":input_df["content_id"] + "</s>" + input_df["content_title"] + "</s>" + input_df["topic_id"] + "</s>" + input_df["topic_title"]}).reset_index(drop=True)
    output_df["target"] = input_df["target"]
    return output_df

# Setup & Preprocessing

In [19]:
content_df, topic_df, correlation_df, sub_df = get_processed_df()

# 1st stage: Candidate generation

In [20]:
content_title_vec, topics_title_vec = get_emb_vec(content_df, topic_df, 'title')
content_desc_vec, topics_desc_vec = get_emb_vec(content_df, topic_df, 'description')

vecs = [
    content_title_vec,
    topics_title_vec,
    content_desc_vec,
    topics_desc_vec,
]

cand_df, target_df = get_cand_df(content_df, topic_df, correlation_df, content_title_vec, topics_title_vec)

recall = 0.46371
f2 = 0.0989
n_data = 5304057


# 2nd stage: Filtering candidate by GBDT

In [21]:
cv_list = get_cv_list(X=cand_df, y=target_df, groups=cand_df["topic_id"], n_splits=cfg.num_fold, seed=cfg.seed)
pickle.dump(cv_list, open(os.path.join(cfg.OUTPUT_EXP, "cv_list.pkl"), "wb"))

train_feat_df = get_feature_df(cand_df, target_df, correlation_df, content_df, topic_df, cv_list, vecs)

******************** start run blocks... ********************
	- <kaggle_utils.blocks.LabelEncodingBlock object at 0x7fdfaae39850> 0.410[s]
	- <kaggle_utils.blocks.SVDBlock object at 0x7fdfaadb7850> 6.280[s]
run test=False 6.736[s]
******************** start run blocks... ********************
	- <kaggle_utils.blocks.LabelEncodingBlock object at 0x7fdfaadb7890> 0.191[s]
	- <kaggle_utils.blocks.SVDBlock object at 0x7fdfaadf9550> 3.048[s]
run test=False 3.269[s]


Unnamed: 0,content_id@LabelEncodingBlock,content_kind@LabelEncodingBlock,content_language@LabelEncodingBlock,content_title_vec_0@SVDBlock,content_title_vec_1@SVDBlock,content_title_vec_2@SVDBlock,content_title_vec_3@SVDBlock,content_title_vec_4@SVDBlock,content_title_vec_5@SVDBlock,content_title_vec_6@SVDBlock,content_title_vec_7@SVDBlock,content_title_vec_8@SVDBlock,content_title_vec_9@SVDBlock,content_title_vec_10@SVDBlock,content_title_vec_11@SVDBlock,content_title_vec_12@SVDBlock,content_title_vec_13@SVDBlock,content_title_vec_14@SVDBlock,content_title_vec_15@SVDBlock,content_title_vec_16@SVDBlock,content_title_vec_17@SVDBlock,content_title_vec_18@SVDBlock,content_title_vec_19@SVDBlock,content_title_vec_20@SVDBlock,content_title_vec_21@SVDBlock,content_title_vec_22@SVDBlock,content_title_vec_23@SVDBlock,content_title_vec_24@SVDBlock,content_title_vec_25@SVDBlock,content_title_vec_26@SVDBlock,content_title_vec_27@SVDBlock,content_title_vec_28@SVDBlock,content_title_vec_29@SVDBlock,content_title_vec_30@SVDBlock,content_title_vec_31@SVDBlock,content_desc_vec_0@SVDBlock,content_desc_vec_1@SVDBlock,content_desc_vec_2@SVDBlock,content_desc_vec_3@SVDBlock,content_desc_vec_4@SVDBlock,content_desc_vec_5@SVDBlock,content_desc_vec_6@SVDBlock,content_desc_vec_7@SVDBlock,content_desc_vec_8@SVDBlock,content_desc_vec_9@SVDBlock,content_desc_vec_10@SVDBlock,content_desc_vec_11@SVDBlock,content_desc_vec_12@SVDBlock,content_desc_vec_13@SVDBlock,content_desc_vec_14@SVDBlock,content_desc_vec_15@SVDBlock,content_desc_vec_16@SVDBlock,content_desc_vec_17@SVDBlock,content_desc_vec_18@SVDBlock,content_desc_vec_19@SVDBlock,content_desc_vec_20@SVDBlock,content_desc_vec_21@SVDBlock,content_desc_vec_22@SVDBlock,content_desc_vec_23@SVDBlock,content_desc_vec_24@SVDBlock,content_desc_vec_25@SVDBlock,content_desc_vec_26@SVDBlock,content_desc_vec_27@SVDBlock,content_desc_vec_28@SVDBlock,content_desc_vec_29@SVDBlock,content_desc_vec_30@SVDBlock,content_desc_vec_31@SVDBlock,topic_id@LabelEncodingBlock,topic_category@LabelEncodingBlock,topic_language@LabelEncodingBlock,topic_title_vec_0@SVDBlock,topic_title_vec_1@SVDBlock,topic_title_vec_2@SVDBlock,topic_title_vec_3@SVDBlock,topic_title_vec_4@SVDBlock,topic_title_vec_5@SVDBlock,topic_title_vec_6@SVDBlock,topic_title_vec_7@SVDBlock,topic_title_vec_8@SVDBlock,topic_title_vec_9@SVDBlock,topic_title_vec_10@SVDBlock,topic_title_vec_11@SVDBlock,topic_title_vec_12@SVDBlock,topic_title_vec_13@SVDBlock,topic_title_vec_14@SVDBlock,topic_title_vec_15@SVDBlock,topic_title_vec_16@SVDBlock,topic_title_vec_17@SVDBlock,topic_title_vec_18@SVDBlock,topic_title_vec_19@SVDBlock,topic_title_vec_20@SVDBlock,topic_title_vec_21@SVDBlock,topic_title_vec_22@SVDBlock,topic_title_vec_23@SVDBlock,topic_title_vec_24@SVDBlock,topic_title_vec_25@SVDBlock,topic_title_vec_26@SVDBlock,topic_title_vec_27@SVDBlock,topic_title_vec_28@SVDBlock,topic_title_vec_29@SVDBlock,topic_title_vec_30@SVDBlock,topic_title_vec_31@SVDBlock,topic_desc_vec_0@SVDBlock,topic_desc_vec_1@SVDBlock,topic_desc_vec_2@SVDBlock,topic_desc_vec_3@SVDBlock,topic_desc_vec_4@SVDBlock,topic_desc_vec_5@SVDBlock,topic_desc_vec_6@SVDBlock,topic_desc_vec_7@SVDBlock,topic_desc_vec_8@SVDBlock,topic_desc_vec_9@SVDBlock,topic_desc_vec_10@SVDBlock,topic_desc_vec_11@SVDBlock,topic_desc_vec_12@SVDBlock,topic_desc_vec_13@SVDBlock,topic_desc_vec_14@SVDBlock,topic_desc_vec_15@SVDBlock,topic_desc_vec_16@SVDBlock,topic_desc_vec_17@SVDBlock,topic_desc_vec_18@SVDBlock,topic_desc_vec_19@SVDBlock,topic_desc_vec_20@SVDBlock,topic_desc_vec_21@SVDBlock,topic_desc_vec_22@SVDBlock,topic_desc_vec_23@SVDBlock,topic_desc_vec_24@SVDBlock,topic_desc_vec_25@SVDBlock,topic_desc_vec_26@SVDBlock,topic_desc_vec_27@SVDBlock,topic_desc_vec_28@SVDBlock,topic_desc_vec_29@SVDBlock,topic_desc_vec_30@SVDBlock,topic_desc_vec_31@SVDBlock
0,193.0,4.0,5.0,1.389316,0.047603,0.247973,-0.148177,-0.078973,-0.164957,0.089955,-0.093493,-0.196,-0.182812,-0.007289,0.076333,0.163739,-0.035932,-0.112299,0.144413,-0.088069,-0.088248,0.146181,0.07606,0.06989,-0.009008,-0.225929,-0.241469,0.077414,0.052856,-0.055299,-0.031155,-0.000192,-0.226315,-0.275052,-0.052144,1.224498,1.088314,-0.712278,0.026894,0.135761,-0.155991,-0.008227,-0.113077,-0.120182,0.720173,-0.191181,-0.156748,0.145742,0.225105,0.084981,0.060606,-0.090413,-0.067793,0.020968,0.090605,-0.108054,-0.036625,0.045687,-0.009751,-0.034569,0.307761,-0.002164,0.278291,0.099777,0.264182,0.127014,-0.2221,0.0,1.0,2.0,1.495721,-0.126212,-0.585237,0.052639,-0.031097,0.003403,-0.022661,0.060196,-0.111865,-0.105856,0.071359,-0.220799,0.13631,-0.062322,0.053268,-0.031592,0.043254,-0.179094,-0.131098,-0.140867,0.177363,0.067326,-0.234173,-0.220354,0.100311,-0.010046,0.07578,-0.071466,0.133241,0.017762,-0.033576,-0.036103,0.840887,1.110407,-0.025474,-0.503556,-0.361773,-0.19853,0.146769,-0.01689,0.061173,0.303312,-0.132449,0.093747,0.006614,-0.037706,-0.356385,-0.042516,-0.042583,0.152235,-0.261989,-0.113987,0.0944,-0.143197,-0.009192,0.047209,0.059777,0.133004,0.01151,-0.051747,-0.091751,0.175704,0.021002,0.038716
1,650.0,2.0,2.0,1.521845,-0.379353,0.372153,0.378563,-0.124134,0.017656,0.15766,0.071572,0.023798,-0.048916,-0.123617,0.125467,-0.055241,-0.017257,0.078077,0.152097,-0.002196,0.033948,-0.043684,-0.103326,0.029483,-0.0269,-0.048845,-0.002305,-0.128888,0.110783,0.018756,0.010925,-0.033088,0.050633,-0.017134,0.123261,1.185677,0.932202,-0.175497,-0.53569,-0.22472,0.117852,0.028884,0.356231,0.552621,-0.15618,0.164619,0.509339,0.129331,0.031341,-0.183442,0.177784,0.050467,-0.128946,-0.00038,-0.105007,-0.016047,-0.11998,0.027262,0.151006,-0.171621,0.115587,0.012108,0.002377,0.140181,-0.113144,0.098507,0.078554,0.0,1.0,2.0,1.495721,-0.126212,-0.585237,0.052639,-0.031097,0.003403,-0.022661,0.060196,-0.111865,-0.105856,0.071359,-0.220799,0.13631,-0.062322,0.053268,-0.031592,0.043254,-0.179094,-0.131098,-0.140867,0.177363,0.067326,-0.234173,-0.220354,0.100311,-0.010046,0.07578,-0.071466,0.133241,0.017762,-0.033576,-0.036103,0.840887,1.110407,-0.025474,-0.503556,-0.361773,-0.19853,0.146769,-0.01689,0.061173,0.303312,-0.132449,0.093747,0.006614,-0.037706,-0.356385,-0.042516,-0.042583,0.152235,-0.261989,-0.113987,0.0944,-0.143197,-0.009192,0.047209,0.059777,0.133004,0.01151,-0.051747,-0.091751,0.175704,0.021002,0.038716
2,4302.0,3.0,4.0,1.464419,0.096453,1.086639,0.110974,0.276246,-0.04034,0.119056,0.14949,0.241033,-0.360595,-0.450255,0.168323,-0.025984,-0.07301,0.147094,0.06764,0.057095,-0.231207,-0.12137,-0.141377,-0.081554,-0.145352,0.177816,-0.038786,-0.000346,-0.007061,-0.042277,0.040645,0.047511,0.06008,0.104918,0.030732,2.068604,-0.735934,-0.026581,-0.026877,0.00488,0.001385,0.007251,0.014687,-0.009832,0.00106,-0.000874,0.007052,0.008331,-0.001575,0.004281,-0.000732,-0.002477,-0.001891,-0.00042,0.004625,0.002431,6.3e-05,-0.000141,-0.000747,0.000644,0.001636,0.000426,0.001422,0.00267,-0.001283,0.001609,-1.9e-05,0.0,1.0,2.0,1.495721,-0.126212,-0.585237,0.052639,-0.031097,0.003403,-0.022661,0.060196,-0.111865,-0.105856,0.071359,-0.220799,0.13631,-0.062322,0.053268,-0.031592,0.043254,-0.179094,-0.131098,-0.140867,0.177363,0.067326,-0.234173,-0.220354,0.100311,-0.010046,0.07578,-0.071466,0.133241,0.017762,-0.033576,-0.036103,0.840887,1.110407,-0.025474,-0.503556,-0.361773,-0.19853,0.146769,-0.01689,0.061173,0.303312,-0.132449,0.093747,0.006614,-0.037706,-0.356385,-0.042516,-0.042583,0.152235,-0.261989,-0.113987,0.0944,-0.143197,-0.009192,0.047209,0.059777,0.133004,0.01151,-0.051747,-0.091751,0.175704,0.021002,0.038716
3,4623.0,4.0,4.0,1.109421,-0.388845,0.749188,-0.101258,-0.196234,-0.171714,0.068884,-0.096163,-0.228486,-0.088837,-0.228714,0.307487,0.324368,-0.303841,-0.258865,0.296589,-0.353061,0.039046,-0.083801,0.101106,0.153182,-0.2184,-0.189719,-0.270605,0.133237,0.145588,0.021064,0.043974,0.052947,-0.261351,0.211863,0.538147,0.944939,0.72863,0.481107,-0.184826,0.34662,0.14477,-0.061005,0.022276,-0.048956,-0.153752,0.10622,0.01549,-0.203652,-0.000239,-0.230092,0.29696,0.111356,-0.109224,0.044242,0.639787,0.157941,0.418411,0.045835,0.468703,0.041071,0.227346,-0.220521,-0.214487,0.066082,0.42359,0.345238,0.113283,0.0,1.0,2.0,1.495721,-0.126212,-0.585237,0.052639,-0.031097,0.003403,-0.022661,0.060196,-0.111865,-0.105856,0.071359,-0.220799,0.13631,-0.062322,0.053268,-0.031592,0.043254,-0.179094,-0.131098,-0.140867,0.177363,0.067326,-0.234173,-0.220354,0.100311,-0.010046,0.07578,-0.071466,0.133241,0.017762,-0.033576,-0.036103,0.840887,1.110407,-0.025474,-0.503556,-0.361773,-0.19853,0.146769,-0.01689,0.061173,0.303312,-0.132449,0.093747,0.006614,-0.037706,-0.356385,-0.042516,-0.042583,0.152235,-0.261989,-0.113987,0.0944,-0.143197,-0.009192,0.047209,0.059777,0.133004,0.01151,-0.051747,-0.091751,0.175704,0.021002,0.038716
4,4954.0,1.0,4.0,1.216919,-0.09996,0.405897,-0.230196,0.085446,0.082817,-0.240677,0.433301,0.02741,-0.029796,-0.491905,-0.090658,-0.143411,-0.032434,-0.221956,0.298821,-0.214527,-0.027405,0.382973,-0.283588,0.44325,0.17943,-0.131553,-0.300024,-0.059132,0.366554,0.132206,0.106953,0.196361,0.160209,-0.165322,-0.084631,1.100724,0.617271,-0.131174,-0.045971,-0.371611,-0.144793,-0.357518,-0.439168,-0.036699,0.115138,0.001618,-0.021546,0.075126,0.281117,-0.021148,0.25053,0.073217,0.039869,0.227269,0.077603,0.015485,0.171069,0.124835,0.14414,-0.219136,-0.035226,-0.005552,-0.012889,-0.072879,-0.121618,0.084108,0.105099,0.0,1.0,2.0,1.495721,-0.126212,-0.585237,0.052639,-0.031097,0.003403,-0.022661,0.060196,-0.111865,-0.105856,0.071359,-0.220799,0.13631,-0.062322,0.053268,-0.031592,0.043254,-0.179094,-0.131098,-0.140867,0.177363,0.067326,-0.234173,-0.220354,0.100311,-0.010046,0.07578,-0.071466,0.133241,0.017762,-0.033576,-0.036103,0.840887,1.110407,-0.025474,-0.503556,-0.361773,-0.19853,0.146769,-0.01689,0.061173,0.303312,-0.132449,0.093747,0.006614,-0.037706,-0.356385,-0.042516,-0.042583,0.152235,-0.261989,-0.113987,0.0944,-0.143197,-0.009192,0.047209,0.059777,0.133004,0.01151,-0.051747,-0.091751,0.175704,0.021002,0.038716


n_features: 134


In [22]:
filepath = os.path.join(cfg.EXP_PREDS, "oof_2nd.pkl")
if os.path.isfile(filepath):
    oof = pickle.load(open(filepath, "rb"))
else:
    oof, models = train_cv(train_feat_df, target_df, cv_list, metrics_dict=metrics_dict)
    pickle.dump(oof, open(filepath), "wb")

In [23]:
cv_score = calc_cv(cand_df, oof, correlation_df)
LOGGER.info(f"cv: {np.round(cv_score, 5)}")

cv: 0.29606


In [24]:
oof_preds = np.where(oof>=1e-3, 1, 0)
reduced_pred_df = cand_df[oof_preds==1]
calc_pred_df = reduced_pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
calc_pred_df = pd.merge(correlation_df[["topic_id"]], calc_pred_df, on="topic_id", how="left")
calc_pred_df = calc_pred_df.fillna("nan")

reduced_recall_score = comp_recall_score(correlation_df["content_id"], calc_pred_df["content_id"])
LOGGER.info(f"Filtering by GBDT: \n recall = {np.round(reduced_recall_score, 5)} \n n_data = {len(reduced_pred_df)}")

Filtering by GBDT: 
 recall = 0.45899 
 n_data = 1485542


# 3rd stage: Matching by Transformer

In [25]:
oof_preds = np.where(oof>=1e-3, 1, 0)
train_filterd_df = cand_df.copy()
train_filterd_df["target"] = target_df.to_numpy()
train_filterd_df = train_filterd_df[oof_preds==1].reset_index(drop=True)

train_filterd_df = pd.merge(train_filterd_df, content_df, on="content_id", how="left")
train_filterd_df = pd.merge(train_filterd_df, topic_df, on="topic_id", how="left")

In [26]:
train_text_df = create_text_df(train_filterd_df)
train_text_df

Unnamed: 0,text,target
0,c_0122f3ff5d19</s>Връхни ъгли</s>t_00004da3a1b...,0
1,c_0b4a3ea959ba</s>Последователни и успоредни р...,0
2,c_0feaaa5dc39d</s>Успоредно свързани резистори...,0
3,c_21f75cfb89da</s>Развивки на многостен</s>t_0...,0
4,c_247ee1c26c75</s>Синтез на заместени бензенов...,0
...,...,...
1485537,c_d5043be1495d</s>Le journal</s>t_fff9e5407d13...,0
1485538,c_61826808000c</s>(Versión Extendida) Lección ...,0
1485539,c_14aa105dc884</s>يحسب الإحداثيّات القطبية لنق...,0
1485540,c_157e2611928f</s>يحسب الإحداثيّات الديكارتيّة...,0


In [27]:
# train_text_df = processing_features(train_text_df)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
# cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
# cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

('/home/working/output/exp002/tokenizer/tokenizer_config.json',
 '/home/working/output/exp002/tokenizer/special_tokens_map.json',
 '/home/working/output/exp002/tokenizer/sentencepiece.bpe.model',
 '/home/working/output/exp002/tokenizer/added_tokens.json',
 '/home/working/output/exp002/tokenizer/tokenizer.json')

In [28]:
bi_cv_list = get_cv_list(X=train_text_df, y=train_text_df["target"], groups=train_filterd_df["topic_id"], n_splits=cfg.num_fold, seed=cfg.seed)

score = train_loop(cfg, train_text_df, bi_cv_list, correlation_df)

# if cfg.upload_from_colab:
#     dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)



  0%|          | 0/2308 [00:00<?, ?it/s]

  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 500 | val_loss: 0.16052, score: 0.04231


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 1000 | val_loss: 0.13735, score: 0.42457


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 1500 | val_loss: 0.13007, score: 0.47722


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 2000 | val_loss: 0.12231, score: 0.4643
Fold0, Epoch0/5 | train_loss: 0.18391


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: end | val_loss: 0.12158, score: 0.48045


  0%|          | 0/2308 [00:00<?, ?it/s]

  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 500 | val_loss: 0.11715, score: 0.53496


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 1000 | val_loss: 0.11724, score: 0.55134


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 1500 | val_loss: 0.12007, score: 0.52564


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 2000 | val_loss: 0.11534, score: 0.52298
Fold0, Epoch1/5 | train_loss: 0.09415


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: end | val_loss: 0.11446, score: 0.56999


  0%|          | 0/2308 [00:00<?, ?it/s]

  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 500 | val_loss: 0.11706, score: 0.58523


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 1000 | val_loss: 0.11658, score: 0.60321


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 1500 | val_loss: 0.1157, score: 0.58443


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 2000 | val_loss: 0.11515, score: 0.56711
Fold0, Epoch2/5 | train_loss: 0.08275


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: end | val_loss: 0.11427, score: 0.59105


  0%|          | 0/2308 [00:00<?, ?it/s]

  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 500 | val_loss: 0.11701, score: 0.58788


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 1000 | val_loss: 0.1199, score: 0.60252


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 1500 | val_loss: 0.11763, score: 0.61758


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 2000 | val_loss: 0.11642, score: 0.60346
Fold0, Epoch3/5 | train_loss: 0.07209


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: end | val_loss: 0.11747, score: 0.60113


  0%|          | 0/2308 [00:00<?, ?it/s]

  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 500 | val_loss: 0.12156, score: 0.59864


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 1000 | val_loss: 0.12226, score: 0.6172


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 1500 | val_loss: 0.12236, score: 0.6171


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 2000 | val_loss: 0.12168, score: 0.61656
Fold0, Epoch4/5 | train_loss: 0.06217


  0%|          | 0/594 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: end | val_loss: 0.12172, score: 0.61636


  0%|          | 0/2329 [00:00<?, ?it/s]

In [None]:
train_text_df

Unnamed: 0,text,target
0,c_0122f3ff5d19</s>Връхни ъгли</s>t_00004da3a1b...,0
1,c_0b4a3ea959ba</s>Последователни и успоредни р...,0
2,c_0feaaa5dc39d</s>Успоредно свързани резистори...,0
3,c_21f75cfb89da</s>Развивки на многостен</s>t_0...,0
4,c_247ee1c26c75</s>Синтез на заместени бензенов...,0
...,...,...
1485537,c_d5043be1495d</s>Le journal</s>t_fff9e5407d13...,0
1485538,c_61826808000c</s>(Versión Extendida) Lección ...,0
1485539,c_14aa105dc884</s>يحسب الإحداثيّات القطبية لنق...,0
1485540,c_157e2611928f</s>يحسب الإحداثيّات الديكارتيّة...,0
