In [1]:
! nvidia-smi

Wed Jan 11 14:50:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
| 32%   50C    P8    23W / 480W |   1657MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import re
import gc
import pdb
import sys
import json
import math
import time
import wandb
import pickle
import shutil
import joblib
import random
import requests
import warnings
from glob import glob
from typing import List
from pathlib import Path
from tqdm.auto import tqdm
from pandarallel import pandarallel

import scipy
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error, f1_score, fbeta_score, recall_score, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb
import lightgbm as lgb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

import tokenizers
import sentencepiece
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW
from transformers import logging
from transformers import DataCollatorWithPadding


from cuml import NearestNeighbors
from kaggle.api.kaggle_api_extended import KaggleApi

sys.path.append("/home/working/")
from kagglib.utils.utils import  Timer, reduce_mem_usage, get_logger, decorate, setup, dataset_create_new
from kagglib.utils.exp_manage import set_wandb
from kagglib.tabular.blocks import AbstractBaseBlock, IdentityBlock, LabelEncodingBlock, SVDBlock, run_blocks
from kagglib.tabular.model_selection import train_cv, predict_cv
from kagglib.nlp.preprocessing import resolve_encodings_and_normalize
from kagglib.nlp.model import (
    AttentionPooling,
    MeanPooling,
    WeightedLayerPooling,
    freeze,
    replace_mixout,
    reinit_bert,
)
from kagglib.nlp.activation import softmax, sigmoid
from kagglib.nlp.optimizer import (
    get_scheduler,
    get_optimizer_grouped_parameters,
)

%load_ext autoreload
%autoreload 2
%env TOKENIZERS_PARALLELISM=true

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 300)
pandarallel.initialize(progress_bar=True)
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')

logging.set_verbosity_error()
logging.set_verbosity_warning()

env: TOKENIZERS_PARALLELISM=true
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Setup & data load

In [3]:
def fbeta_wrapper(y_true, y_pred):
    beta = 2
    return fbeta_score(y_true, y_pred, beta)

class Config:
    AUTHOR = "shu421"

    EXP = "exp009"
    COMPETITION = "learning-equality-curriculum-recommendations"
    DATASET_PATH = []
    BASE_PATH = "/home/working/"
    api_path = "/root/.kaggle/kaggle.json"


    EMB_MODEL_PATH = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    # Language Model Config
    MODEL_PATH = "xlm-roberta-base"

    # train
    apex=True
    seed = 42
    num_fold = 5
    train_fold = [0, 1, 2, 3, 4,]
    batch_size = 512
    n_epoch = 5
    max_len = 128
    num_classes = 1

    # optimizer
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    betas = (0.9, 0.999)
    lr_weight_decay = 0.95

    # scheduler
    scheduler="cosine"
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 500
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000

    # gradient accumulation
    gradient_accumulation_steps = 1

    # weight and bias
    wandb = True

    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }


    upload_from_colab = True

    # GBDT
    gbdt_model = "XGBoost"
    stopping_rounds = 50
    log_evaluation = 500
    model_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "learning_rate": 0.3,
        "tree_method": "gpu_hist",
        "random_state": seed,
        "n_estimators": 99999,
    }
    train_params = {
        "verbose": log_evaluation,
    }

    svd_dim = 32

# setup
cfg = setup(Config)

In [4]:
# set log functions
LOGGER = get_logger(cfg.OUTPUT_EXP)
log_filepath = os.path.join(cfg.OUTPUT, f"{cfg.EXP}.log")
# if os.path.isfile(log_filepath):
#     with open(log_filepath, "w") as f:
#         pass
#     f.close()
if cfg.wandb:
    run = set_wandb(cfg, name=cfg.EXP, group=cfg.MODEL_PATH, config_path="/root/.kaggle/wandb.json")

[34m[1mwandb[0m: Currently logged in as: [33mshu421[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
def get_whole_df():
    content_df = pd.read_csv(os.path.join(cfg.INPUT, 'content.csv'))
    topic_df = pd.read_csv(os.path.join(cfg.INPUT, 'topics.csv'))
    correlation_df = pd.read_csv(os.path.join(cfg.INPUT, 'correlations.csv'))
    sub_df = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))
    return content_df, topic_df, correlation_df, sub_df

def preprocess_df(content_df, topic_df, correlation_df):
    content_df = content_df.add_prefix("content_")
    topic_df = topic_df.add_prefix("topic_")
    correlation_df = correlation_df.rename(columns={"content_ids":"content_id"})
    content_df = content_df.fillna("")
    topic_df = topic_df.fillna("")
    return content_df, topic_df, correlation_df

def get_processed_df():
    content_df, topic_df, correlation_df, sub_df = get_whole_df()
    content_df, topic_df, correlation_df = preprocess_df(content_df, topic_df, correlation_df)
    return content_df, topic_df, correlation_df, sub_df

# Metrics

In [6]:
def comp_fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        precision = len(TP) / (len(pred))
        recall = len(TP) / len(true)
        f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        score_list.append(f2)
    score = sum(score_list) / len(score_list)
    return score

def comp_recall_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        recall = len(TP) / len(true)

        score_list.append(recall)
    score = sum(score_list) / len(score_list)
    return score

def calc_comp_score(train_df, oof, correlation_df, thr=0.1):
    """2値分類の予測からcvを計算する"""
    oof_preds = np.where(oof>=thr, 1, 0)
    pred_df = train_df[oof_preds==1]
    pred_df = pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
    pred_df = pd.merge(correlation_df[["topic_id"]], pred_df, on="topic_id", how="left")
    pred_df = pred_df.fillna("nan")

    cv_score = comp_fbeta_score(correlation_df["content_id"], pred_df["content_id"])
    return cv_score

def get_StratifiedGroupKFold_list(X, y=None, groups=None, n_splits=5, seed=42):
    """cv_listを取得"""
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_list = list(cv.split(X, y, groups))
    return cv_list

def get_StratifiedKFold_list(X, y=None, groups=None, n_splits=5, seed=42):
    """cv_listを取得"""
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_list = list(cv.split(X, y))
    return cv_list


# 1st Stage: Candidate generation by Metric Learning

In [7]:
class EmbDataSet(Dataset):
    def __init__(self, cfg, df, col, tokenizer):
        self.cfg = cfg
        df[col] = df[col].fillna("")
        self.text = df[col].to_numpy()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.prepare_input(self.text[index])
        return text

    def prepare_input(self, text):
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

In [8]:
def get_emb_vec(content_df, topic_df, col, is_overwrite=False):
    # prepare model for getting embeddings
    model = AutoModel.from_pretrained(cfg.EMB_MODEL_PATH)
    model.eval()
    model.to(cfg.device)
    tokenizer = AutoTokenizer.from_pretrained(cfg.EMB_MODEL_PATH)

    content_dataset = EmbDataSet(cfg, content_df, col, tokenizer)
    topic_dataset = EmbDataSet(cfg, topic_df, col, tokenizer)

    content_loader = DataLoader(
        dataset=content_dataset,
        batch_size=cfg.batch_size,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    topic_loader = DataLoader(
        dataset=topic_dataset,
        batch_size=cfg.batch_size,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    if not os.path.isfile(os.path.join(cfg.OUTPUT_EXP, f"content_{col}_vec.pkl")) or is_overwrite:
        content_vec = []
        for step, contents in enumerate(tqdm(content_loader)):
            contents = collate(contents)
            for k, v in contents.items():
                contents[k] = v.to(cfg.device)
            with torch.no_grad():
                output = model(**contents)
            vec = output.last_hidden_state.mean(1).cpu().numpy()
            content_vec.append(vec)
        content_vec = np.concatenate(content_vec)
        pickle.dump(content_vec, open(os.path.join(cfg.OUTPUT_EXP, f"content_{col}_vec.pkl"), "wb"))
    else:
        content_vec = pickle.load(open(os.path.join(cfg.OUTPUT_EXP, f"content_{col}_vec.pkl"), "rb"))

    if not os.path.isfile(os.path.join(cfg.OUTPUT_EXP, f"topic_{col}_vec.pkl")) or is_overwrite:
        topic_vec = []
        for step, topics in enumerate(tqdm(topic_loader)):
            topics = collate(topics)
            for k, v in topics.items():
                topics[k] = v.to(cfg.device)
            with torch.no_grad():
                output = model(**topics)
            vec = output.last_hidden_state.mean(1).cpu().numpy()
            topic_vec.append(vec)
        topic_vec = np.concatenate(topic_vec)
        pickle.dump(topic_vec, open(os.path.join(cfg.OUTPUT_EXP, f"topic_{col}_vec.pkl"), "wb"))
    else:
        topic_vec = pickle.load(open(os.path.join(cfg.OUTPUT_EXP, f"topic_{col}_vec.pkl"), "rb"))

    del model
    torch.cuda.empty_cache()
    gc.collect()

    return content_vec, topic_vec

# GBDT

In [9]:
def metrics_dict(y_true, y_pred):
    metrics_dict_ = {}
    # metrics_dict_['MAPE'] = mean_absolute_percentage_error(y_true, y_pred) 
    # metrics_dict_['MAE'] = mean_absolute_error(y_true, y_pred)
    # metrics_dict_['RMSE'] = mean_squared_error(y_true, y_pred, squared=False)
    # metrics_dict_['f1_micro'] = f1_score(y_true, y_pred, average='micro')
    # metrics_dict_['auc'] = roc_auc_score(y_true, y_pred)
    # metrics_dict_["f1_score"] = f1_score(y_true, y_pred)
    metrics_dict_["f2_score"] = fbeta_score(y_true, y_pred, beta=2)
    metrics_dict_["recall"] = recall_score(y_true, y_pred)

    return metrics_dict_

In [10]:
def get_cand_df(content_df, topic_df, correlation_df, content_vec, topic_vec, content2topic_thr=90, topic2content_thr=10):
    """
    knnを使用してtopicに対するcontentの候補を出力する関数
    """
    id2content_dict = dict(content_df["content_id"])
    id2topics_dict = dict(topic_df["topic_id"])

    # contentがどのtopicにマッチするかを予測
    knn_model = NearestNeighbors(n_neighbors=content2topic_thr, metric="cosine")
    knn_model.fit(topic_vec)
    distances, indices = knn_model.kneighbors(content_vec)
    knn_pred_c2t_dict = {k:[] for k in topic_df["topic_id"]}
    for idx, i in enumerate(indices):
        for j in i:
            knn_pred_c2t_dict[id2topics_dict[j]].append(id2content_dict[idx])

    # topicがどのcontentにマッチするかを予測
    knn_model = NearestNeighbors(n_neighbors=topic2content_thr, metric="cosine")
    knn_model.fit(content_vec)
    distances, indices = knn_model.kneighbors(topic_vec)
    knn_pred_t2c_dict = {k:[] for k in topic_df["topic_id"]}
    for idx, i in enumerate(indices):
        for j in i:
            knn_pred_t2c_dict[id2topics_dict[idx]].append(id2content_dict[j])

    # idの割り当て
    knn_pred_dict = {k:np.unique(v_c2t+v_t2c) for k,v_c2t, v_t2c in zip(knn_pred_c2t_dict.keys(), knn_pred_c2t_dict.values(), knn_pred_t2c_dict.values())}
    knn_pred_id = {k:[" ".join(v)] for k,v in knn_pred_dict.items()}
    knn_pred_df = pd.DataFrame(knn_pred_id).T.reset_index()
    knn_pred_df.columns = ["topic_id", "content_id"]
    knn_pred_df = knn_pred_df.reset_index(drop=True)

    # candidateの集計結果を出力
    recall = comp_recall_score(correlation_df["content_id"], knn_pred_df["content_id"])
    LOGGER.info(f"recall = {round(recall, 5)}")
    # candidateの集計結果を出力
    f2 = comp_fbeta_score(correlation_df["content_id"], knn_pred_df["content_id"])
    LOGGER.info(f"f2 = {round(f2, 5)}")
    n_bin_data = knn_pred_df["content_id"].apply(lambda x: len(x.split())).sum()
    LOGGER.info(f"n_data = {n_bin_data}")

    # 文字列の候補をlistに変換
    knn_pred_df["content_id"] = knn_pred_df["content_id"].apply(lambda x: x.split(" "))
    cand_df = knn_pred_df.explode("content_id")
    cand_df = cand_df.reset_index(drop=True)

    # target作成
    correlation_df_ = correlation_df.copy()
    correlation_df_["content_id"] = correlation_df_["content_id"].apply(lambda x: x.split(" "))
    correlation_df_ = correlation_df_.explode("content_id")
    correlation_df_["target"] = 1
    target_df = pd.DataFrame()
    target_df = pd.merge(cand_df, correlation_df_, on=["topic_id", "content_id"], how="left")
    target_df = target_df["target"].fillna(0).astype(int)

    return cand_df, target_df

def get_feature_df(cand_df, target_df, content_df, topic_df, cv_list, content_vec=None, topic_vec=None):

    content_df_ = content_df.rename(columns={"content_language": "language"}).copy()
    topic_df_ = topic_df.rename(columns={"topic_language": "language"}).copy()
    # content features
    content_svd_cols = [
        "emb",
    ]
    content_cat_cols = [
        # "content_id",
        # "content_kind",
        "language",
    ]

    # topic features
    topic_svd_cols = [
        "emb",
    ]
    topic_cat_cols = [
        # "topic_id",
        # "topic_category",
        "language",
    ]

    content_blocks = [
        # IdentityBlock(use_cols=content_num_cols), 
        # *[TargetEncodingBlock(col=col, 
        #                       func=func, 
        #                       cv_list=cv_list) for col in ["company_id"] for func in ["mean"]], 
        LabelEncodingBlock(cols=content_cat_cols, cfg=cfg, path=os.path.join(cfg.OUTPUT_EXP,  "_".join(content_cat_cols) + "_oe.pkl")), 
        SVDBlock(cols=content_svd_cols, cfg=cfg, dim=cfg.svd_dim, title_vec=content_vec, path=os.path.join(cfg.OUTPUT_EXP, "_".join(content_svd_cols) + "_svd_dict.pkl")),
        # *[AggBlock(key=key, 
        #             values=numeric_cols, 
        #             funcs=["min", "max", "mean", "sum", "std"]) for key in cat_cols], 
        # *[WrapperBlock(func=func) for func in funcs], 
        ]
    topic_blocks = [
        # IdentityBlock(use_cols=topic_num_cols), 
        # *[TargetEncodingBlock(col=col, 
        #                       func=func, 
        #                       cv_list=cv_list) for col in ["company_id"] for func in ["mean"]], 
        LabelEncodingBlock(cols=topic_cat_cols, cfg=cfg, path=os.path.join(cfg.OUTPUT_EXP, "_".join(topic_cat_cols) + "_oe.pkl")), 
        SVDBlock(cols=topic_svd_cols, cfg=cfg, dim=cfg.svd_dim, title_vec=topic_vec, path=os.path.join(cfg.OUTPUT_EXP, "_".join(content_svd_cols) + "_svd_dict.pkl")),
        # *[AggBlock(key=key, 
        #             values=numeric_cols, 
        #             funcs=["min", "max", "mean", "sum", "std"]) for key in cat_cols], 
        # *[WrapperBlock(func=func) for func in funcs], 
        ]
    content_feat_df = run_blocks(content_df_, blocks=content_blocks, is_test=False)
    topic_feat_df = run_blocks(topic_df_, blocks=topic_blocks, is_test=True)

    # content_idとfeatを対応付ける
    content_feat_df = pd.concat([content_df[["content_id"]], content_feat_df], axis=1)
    # topic_idとfeatを対応付ける
    topic_feat_df = pd.concat([topic_df[["topic_id"]], topic_feat_df], axis=1)

    # topicとcontentのfeatをマージ
    # train_feat_df = pd.merge(cand_df, content_feat_df, on="content_id", how="left")
    # train_feat_df = pd.merge(train_feat_df, topic_feat_df, on="topic_id", how="left")
    train_feat_df = pd.concat([cand_df, content_feat_df.set_index("content_id").reindex(cand_df["content_id"].to_numpy()).reset_index(drop=True)], axis=1)
    train_feat_df = pd.concat([train_feat_df, topic_feat_df.set_index("topic_id").reindex(train_feat_df["topic_id"].to_numpy()).reset_index(drop=True)], axis=1)
    train_feat_df = train_feat_df.drop(columns=["topic_id", "content_id"])

    display(train_feat_df.head())
    LOGGER.info(f"n_features: {len(train_feat_df.columns)}")

    return train_feat_df

# Language Model

In [11]:
def create_text_df(input_df):
    output_df = pd.DataFrame({"text":input_df["content_title"] + " </s> " + input_df["content_language"] + " </s> "  + input_df["topic_title"] + " </s> " + input_df["topic_language"]}).reset_index(drop=True)
    output_df["target"] = input_df["target"]
    return output_df

In [12]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

class BiEncoderDataset(Dataset):
    def __init__(self, cfg, df, col):
        self.cfg = cfg
        self.text = df[col].to_numpy()
        self.label = df["target"].to_numpy()

    def __len__(self):
        return len(self.label)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        label = self.label[index].astype(np.float32)
        return text, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [13]:
class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.num_classes)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        # all_layer_embeddings = outputs[1]
        # feature = self.weighted_layer_pool(all_layer_embeddings)
        # feature = self.pool(feature, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        # batch, hidden_size
        feature = self.feature(inputs)
        feature = self.ln(feature)
        # feature1 = self.drop1(feature)
        # feature2 = self.drop2(feature)
        # feature3 = self.drop3(feature)
        # feature4 = self.drop4(feature)
        # feature5 = self.drop5(feature)
        # feature = (feature1 + feature2 + feature3 + feature4 + feature5) / 5
        output = self.fc(feature)
        return output.squeeze()



In [14]:
def train_fn(cfg, train_loader, valid_loader, train_df, valid_df, criterion, optimizer, scheduler, model, fold, epoch, best_val_preds, best_val_score):
    LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
    train_losses = []
    train_nums = []
    model.train()
    scaler = GradScaler(enabled=cfg.apex)
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for step, (inputs, labels) in enumerate(pbar):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast(enabled=cfg.apex):
                output = model(inputs)
            loss = criterion(output, labels)

            pbar.set_postfix({
                'loss': loss.item(),
                'lr': scheduler.get_lr()[0]
            })
            train_losses.append(loss.item() * len(labels))
            train_nums.append(len(labels))

            if cfg.gradient_accumulation_steps > 1:
                loss = loss / cfg.gradient_accumulation_steps

            scaler.scale(loss).backward()

            if cfg.clip_grad_norm is not None:
                # scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    cfg.clip_grad_norm
                )

            if (step+1) % cfg.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            if step % cfg.eval_step == 0 and step != 0:
                best_val_preds, best_val_score, val_loss = valid_fn(
                    cfg,
                    valid_loader,
                    valid_df,
                    criterion,
                    model,
                    fold,
                    epoch,
                    step,
                    best_val_preds,
                    best_val_score,
                )
                model.train()

            if cfg.wandb:
                wandb.log({f"[fold{fold}] train_loss": loss.item(),
                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
    train_loss = sum(train_losses)/sum(train_nums)
    return train_loss



def valid_fn(cfg, valid_loader, valid_df, criterion, model, fold, epoch, step, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
            # for (inputs, labels) in valid_loader:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast(enabled=cfg.apex):
                    output = model(inputs)

                loss = criterion(output, labels)
                output = output.detach().cpu().numpy()
                output = sigmoid(output)
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    y_preds = np.where(val_preds>=0.5, 1, 0)
    score = fbeta_score(valid_df["target"], y_preds, beta=2)
    recall = recall_score(valid_df["target"], y_preds)

    LOGGER.info(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epoch}, Step: {step} | val_loss: {np.round(val_loss, 5)}, f2: {np.round(score, 5)}, recall: {np.round(recall, 5)}')

    if score > best_val_score:
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(),
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )

    return best_val_preds, best_val_score, val_loss


def train_loop(cfg, train_data: pd.DataFrame, cv_list: List, correlation_df: pd.DataFrame):
    """_summary_

    Args:
        cfg (_type_): _description_
        train_data (pd.DataFrame): textとtargetが格納されたデータフレーム
        cv_list (List): _description_
        correlation_df (pd.DataFrame): _description_

    Returns:
        _type_: _description_
    """
    oof_pred = np.zeros((len(train_data)), dtype=np.float32)
    fold_score = []

    for fold in cfg.train_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')

        train_idx, valid_idx = cv_list[fold]
        train_df = train_data.iloc[train_idx].reset_index(drop=True)
        valid_df = train_data.iloc[valid_idx].reset_index(drop=True)

        # Datasetの設定
        train_dataset = BiEncoderDataset(cfg, train_df, "text")
        valid_dataset = BiEncoderDataset(cfg, valid_df, "text")
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        # model = reinit_bert(model)
        # model = replace_mixout(model)
        model = model.to(cfg.device)

        criterion = nn.BCEWithLogitsLoss()

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epoch)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)


        # model-training
        best_val_preds = None
        best_val_score = -np.inf

        for epoch in range(cfg.n_epoch):
            train_loss = train_fn(
                cfg,
                train_loader,
                valid_loader,
                train_df,
                valid_df,
                criterion,
                optimizer,
                scheduler,
                model,
                fold,
                epoch,
                best_val_preds,
                best_val_score
                )

            LOGGER.info(f'Fold{fold}, Epoch{epoch}/{cfg.n_epoch} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss = valid_fn(
                cfg,
                valid_loader,
                valid_df,
                criterion,
                model,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch,
                        f"[fold{fold}] avg_train_loss": train_loss,
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = calc_comp_score(train_data, oof_pred, correlation_df)
    LOGGER.info(f'fold score: {fold_score}')
    LOGGER.info(f'CV: {round(score, 4)}')
    return score

# Setup & Preprocessing

In [15]:
content_df, topic_df, correlation_df, sub_df = get_processed_df()
topic_df = topic_df[topic_df["topic_id"].isin(correlation_df["topic_id"])].reset_index(drop=True)

In [16]:
topic_df

Unnamed: 0,topic_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,topic_has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
2,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
4,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True
...,...,...,...,...,...,...,...,...,...
61512,t_fff830472691,Scalar Projections,,fef095,source,4,en,t_c75d6acecf78,True
61513,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True
61514,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
61515,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True


In [17]:
content_df

Unnamed: 0,content_id,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


In [18]:
def create_emb_feature_df(input_df, id_col, title_col, lang_col):
    output_df = input_df.copy()
    output_df["text"] = output_df[title_col] + " </s> " + output_df[lang_col]
    return output_df[[id_col, "text"]]

In [19]:
content_emb_feature_df = create_emb_feature_df(content_df, "content_id", "content_title", "content_language")
topic_emb_feature_df = create_emb_feature_df(topic_df, "topic_id", "topic_title", "topic_language")

content_vec, topic_vec = get_emb_vec(content_emb_feature_df, topic_emb_feature_df, 'text', is_overwrite=False)

cand_df, target_df = get_cand_df(content_df, topic_df, correlation_df, content_vec, topic_vec, content2topic_thr=90, topic2content_thr=10)

del content_emb_feature_df, topic_emb_feature_df
gc.collect()

recall = 0.55235
f2 = 0.06583
n_data = 13943152


0

In [20]:
cand_df = pd.merge(cand_df, content_df[["content_id", "content_language"]], on="content_id", how="left")
cand_df = pd.merge(cand_df, topic_df[["topic_id", "topic_language"]], on="topic_id", how="left")
lang_matched_idx = cand_df["content_language"]==cand_df["topic_language"]
cand_df = cand_df.drop(columns=["content_language", "topic_language"])
cand_df = cand_df[lang_matched_idx].reset_index(drop=True)
target_df = target_df[lang_matched_idx].reset_index(drop=True)

cand_df_ = cand_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
cand_df_ = pd.merge(correlation_df[["topic_id"]], cand_df_, on="topic_id", how="left")
cand_df_ = cand_df_.fillna("")

lang_recall = comp_recall_score(correlation_df["content_id"], cand_df_["content_id"])
LOGGER.info(f"recall: after lang match: {lang_recall}")

recall: after lang match: 0.5487967670105008


# 2nd stage: Filtering candidate by GBDT

In [21]:
cv_list = get_StratifiedGroupKFold_list(X=cand_df, y=target_df, groups=cand_df["topic_id"], n_splits=cfg.num_fold, seed=cfg.seed)

train_feat_df = get_feature_df(
    cand_df,
    target_df,
    content_df,
    topic_df,
    cv_list,
    content_vec,
    topic_vec,
    )

******************** start run blocks... ********************
	- <kagglib.tabular.blocks.LabelEncodingBlock object at 0x7f0ed8aaab50> 0.022[s]
	- <kagglib.tabular.blocks.SVDBlock object at 0x7f0ef938d9d0> 2.983[s]
run is_test=False 3.025[s]
******************** start run blocks... ********************
	- <kagglib.tabular.blocks.LabelEncodingBlock object at 0x7f0ef938ea90> 0.008[s]
	- <kagglib.tabular.blocks.SVDBlock object at 0x7f0ef938e390> 0.066[s]
run is_test=True 0.088[s]


Unnamed: 0,language@LabelEncodingBlock,emb_0@SVDBlock,emb_1@SVDBlock,emb_2@SVDBlock,emb_3@SVDBlock,emb_4@SVDBlock,emb_5@SVDBlock,emb_6@SVDBlock,emb_7@SVDBlock,emb_8@SVDBlock,emb_9@SVDBlock,emb_10@SVDBlock,emb_11@SVDBlock,emb_12@SVDBlock,emb_13@SVDBlock,emb_14@SVDBlock,emb_15@SVDBlock,emb_16@SVDBlock,emb_17@SVDBlock,emb_18@SVDBlock,emb_19@SVDBlock,emb_20@SVDBlock,emb_21@SVDBlock,emb_22@SVDBlock,emb_23@SVDBlock,emb_24@SVDBlock,emb_25@SVDBlock,emb_26@SVDBlock,emb_27@SVDBlock,emb_28@SVDBlock,emb_29@SVDBlock,emb_30@SVDBlock,emb_31@SVDBlock,language@LabelEncodingBlock.1,emb_0@SVDBlock.1,emb_1@SVDBlock.1,emb_2@SVDBlock.1,emb_3@SVDBlock.1,emb_4@SVDBlock.1,emb_5@SVDBlock.1,emb_6@SVDBlock.1,emb_7@SVDBlock.1,emb_8@SVDBlock.1,emb_9@SVDBlock.1,emb_10@SVDBlock.1,emb_11@SVDBlock.1,emb_12@SVDBlock.1,emb_13@SVDBlock.1,emb_14@SVDBlock.1,emb_15@SVDBlock.1,emb_16@SVDBlock.1,emb_17@SVDBlock.1,emb_18@SVDBlock.1,emb_19@SVDBlock.1,emb_20@SVDBlock.1,emb_21@SVDBlock.1,emb_22@SVDBlock.1,emb_23@SVDBlock.1,emb_24@SVDBlock.1,emb_25@SVDBlock.1,emb_26@SVDBlock.1,emb_27@SVDBlock.1,emb_28@SVDBlock.1,emb_29@SVDBlock.1,emb_30@SVDBlock.1,emb_31@SVDBlock.1
0,2.0,1.392505,-0.088146,-0.094533,-0.071988,0.218676,-0.125377,0.590902,-0.46616,-0.029396,-0.303019,-0.394904,0.102303,0.2826,0.195473,0.319366,-0.540984,-0.003061,0.170554,-0.242842,-0.26448,-0.018083,0.014701,0.004886,-0.104525,-0.240399,-0.118871,0.136802,0.208798,0.063542,0.193355,0.045901,0.020247,2.0,1.478283,-0.08377,-0.579379,-0.201401,0.403313,-0.018492,0.610767,-0.39056,-0.087408,-0.377274,-0.497713,-0.075335,0.089051,0.033133,0.186986,-0.354582,-0.292326,0.391412,-0.082472,-0.217101,-0.000822,-0.012888,0.11367,-0.115742,-0.333024,-0.175993,0.093047,0.303772,-0.142421,0.073766,-0.075833,-0.090284
1,2.0,1.177013,0.066715,-0.364118,-0.277566,-0.045339,-0.370854,0.590163,-0.044096,-0.334652,-0.40537,-0.149182,0.212496,0.050144,-0.085117,0.443345,-0.173174,-0.079575,0.314678,-0.07486,-0.256687,-0.026934,0.078386,0.180344,0.23302,-0.353018,-0.141738,0.081516,0.202715,-0.320821,-0.053193,0.130788,-0.080336,2.0,1.478283,-0.08377,-0.579379,-0.201401,0.403313,-0.018492,0.610767,-0.39056,-0.087408,-0.377274,-0.497713,-0.075335,0.089051,0.033133,0.186986,-0.354582,-0.292326,0.391412,-0.082472,-0.217101,-0.000822,-0.012888,0.11367,-0.115742,-0.333024,-0.175993,0.093047,0.303772,-0.142421,0.073766,-0.075833,-0.090284
2,2.0,1.306407,0.031054,-0.749033,-0.076588,0.19445,0.112504,0.371961,-0.29374,0.204364,-0.55766,0.059043,0.309948,-0.208646,-0.254143,0.227654,-0.191463,-0.226473,0.520958,-0.257983,-0.065444,0.084508,0.166705,-0.050035,-0.028739,-0.210576,-0.215253,0.143982,0.047219,0.078001,0.126202,-0.190933,0.041303,2.0,1.478283,-0.08377,-0.579379,-0.201401,0.403313,-0.018492,0.610767,-0.39056,-0.087408,-0.377274,-0.497713,-0.075335,0.089051,0.033133,0.186986,-0.354582,-0.292326,0.391412,-0.082472,-0.217101,-0.000822,-0.012888,0.11367,-0.115742,-0.333024,-0.175993,0.093047,0.303772,-0.142421,0.073766,-0.075833,-0.090284
3,2.0,1.183004,0.198797,-0.655253,-0.584558,0.068145,-0.011709,0.436433,-0.047452,-0.150072,-0.353919,0.182199,-0.114566,0.063491,-0.35342,0.250083,-0.291626,0.068923,0.2673,-0.198988,-0.429165,0.029803,-0.038025,0.234472,0.082099,-0.569052,-0.130297,0.03713,0.147347,-0.125632,0.090478,0.187632,0.031106,2.0,1.478283,-0.08377,-0.579379,-0.201401,0.403313,-0.018492,0.610767,-0.39056,-0.087408,-0.377274,-0.497713,-0.075335,0.089051,0.033133,0.186986,-0.354582,-0.292326,0.391412,-0.082472,-0.217101,-0.000822,-0.012888,0.11367,-0.115742,-0.333024,-0.175993,0.093047,0.303772,-0.142421,0.073766,-0.075833,-0.090284
4,2.0,1.446952,0.134551,-0.38933,-0.195758,0.466809,-0.081128,0.553684,-0.408727,-0.132846,-0.405788,-0.561649,-0.070295,0.223803,0.042359,0.274968,-0.445697,0.086563,0.377104,-0.064624,-0.282246,-0.008445,-0.044329,0.112839,-0.12024,-0.432349,-0.011237,-0.047979,0.090192,-0.30617,0.056379,-0.146495,0.027441,2.0,1.478283,-0.08377,-0.579379,-0.201401,0.403313,-0.018492,0.610767,-0.39056,-0.087408,-0.377274,-0.497713,-0.075335,0.089051,0.033133,0.186986,-0.354582,-0.292326,0.391412,-0.082472,-0.217101,-0.000822,-0.012888,0.11367,-0.115742,-0.333024,-0.175993,0.093047,0.303772,-0.142421,0.073766,-0.075833,-0.090284


n_features: 66


In [22]:
is_overwrite = False
filepath = os.path.join(cfg.EXP_PREDS, "oof_2nd.pkl")
if not is_overwrite and os.path.isfile(filepath):
    oof = pickle.load(open(filepath, "rb"))
else:
    oof, models = train_cv(cfg, train_feat_df, target_df, cv_list, metrics_dict=metrics_dict, LOGGER=LOGGER)
    pickle.dump(oof, open(filepath, "wb"))

In [23]:
oof_preds = np.where(oof>=1.5e-2, 1, 0)
reduced_pred_df = cand_df[oof_preds==1]
calc_pred_df = reduced_pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
calc_pred_df = pd.merge(correlation_df[["topic_id"]], calc_pred_df, on="topic_id", how="left")
calc_pred_df = calc_pred_df.fillna("nan")

reduced_recall_score = comp_recall_score(correlation_df["content_id"], calc_pred_df["content_id"])
LOGGER.info(f"Filtering by GBDT: \n recall = {np.round(reduced_recall_score, 5)} \n n_data = {len(reduced_pred_df)}")

Filtering by GBDT: 
 recall = 0.45187 
 n_data = 883136


# 3rd stage: Matching by Transformer

In [24]:
oof_preds = np.where(oof>=1.5e-2, 1, 0)
train_filtered_df = cand_df.copy()
train_filtered_df["target"] = target_df.to_numpy()
train_filtered_df = train_filtered_df[oof_preds==1].reset_index(drop=True)

train_filtered_df = pd.merge(train_filtered_df, content_df, on="content_id", how="left")
train_filtered_df = pd.merge(train_filtered_df, topic_df, on="topic_id", how="left")

In [25]:
train_text_df = create_text_df(train_filtered_df)
train_text_df = pd.concat([train_text_df, train_filtered_df[["topic_id", "content_id"]]], axis=1)
train_text_df

Unnamed: 0,text,target,topic_id,content_id
0,Характеристики на сигнала – форма </s> bg </s>...,0,t_00004da3a1b2,c_04fcf98b4b3f
1,Последователни и успоредни резистори </s> bg <...,0,t_00004da3a1b2,c_0b4a3ea959ba
2,Успоредно свързани резистори </s> bg </s> Откр...,0,t_00004da3a1b2,c_0feaaa5dc39d
3,Разпознаване на ъгли </s> bg </s> Откриването ...,0,t_00004da3a1b2,c_159822deca89
4,Опростяване на мрежи от резистори </s> bg </s>...,0,t_00004da3a1b2,c_2b6301d9ada5
...,...,...,...,...
883131,Area of inscribed equilateral triangle -dubbed...,1,t_fffbe1d5d43c,c_6659207b25d5
883132,Solving similar triangles -dubbed(KY) </s> sw ...,0,t_fffbe1d5d43c,c_b138475787c2
883133,(Versión Extendida) Lección 7: Me informo y op...,0,t_fffe14f1be1e,c_61826808000c
883134,يحسب الإحداثيّات القطبية لنقطة معطاة بالاحداثي...,0,t_fffe811a6da9,c_14aa105dc884


In [26]:
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))

('/home/working/output/exp009/tokenizer/tokenizer_config.json',
 '/home/working/output/exp009/tokenizer/special_tokens_map.json',
 '/home/working/output/exp009/tokenizer/sentencepiece.bpe.model',
 '/home/working/output/exp009/tokenizer/added_tokens.json',
 '/home/working/output/exp009/tokenizer/tokenizer.json')

In [27]:
bi_cv_list = get_StratifiedGroupKFold_list(X=train_text_df, y=train_text_df["target"], groups=train_filtered_df["topic_id"], n_splits=cfg.num_fold, seed=cfg.seed)

score = train_loop(cfg, train_text_df, bi_cv_list, correlation_df)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1375 [00:00<?, ?it/s]

  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 500 | val_loss: 0.22784, f2: 0.35954, recall: 0.31715


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 1000 | val_loss: 0.19975, f2: 0.54586, recall: 0.51345
Fold0, Epoch0/5 | train_loss: 0.24366


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: end | val_loss: 0.19456, f2: 0.52753, recall: 0.48786


  0%|          | 0/1375 [00:00<?, ?it/s]

  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 500 | val_loss: 0.18479, f2: 0.62164, recall: 0.59791


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 1000 | val_loss: 0.17918, f2: 0.61096, recall: 0.58163
Fold0, Epoch1/5 | train_loss: 0.15563


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: end | val_loss: 0.17937, f2: 0.64282, recall: 0.62097


  0%|          | 0/1375 [00:00<?, ?it/s]

  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 500 | val_loss: 0.17803, f2: 0.63665, recall: 0.61084


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 1000 | val_loss: 0.17434, f2: 0.65024, recall: 0.62601
Fold0, Epoch2/5 | train_loss: 0.12694


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: end | val_loss: 0.17638, f2: 0.69112, recall: 0.68053


  0%|          | 0/1375 [00:00<?, ?it/s]

  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 500 | val_loss: 0.17885, f2: 0.67002, recall: 0.65


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 1000 | val_loss: 0.17888, f2: 0.68938, recall: 0.67612
Fold0, Epoch3/5 | train_loss: 0.1085


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: end | val_loss: 0.17818, f2: 0.70237, recall: 0.69211


  0%|          | 0/1375 [00:00<?, ?it/s]

  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 500 | val_loss: 0.18074, f2: 0.69953, recall: 0.68707


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 1000 | val_loss: 0.18331, f2: 0.70407, recall: 0.69249
Fold0, Epoch4/5 | train_loss: 0.08984


  0%|          | 0/350 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: end | val_loss: 0.18334, f2: 0.70662, recall: 0.69656
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1378 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: 500 | val_loss: 0.22932, f2: 0.42336, recall: 0.38451


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: 1000 | val_loss: 0.21055, f2: 0.62239, recall: 0.61896
Fold1, Epoch0/5 | train_loss: 0.24904


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: end | val_loss: 0.19216, f2: 0.59509, recall: 0.57112


  0%|          | 0/1378 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: 500 | val_loss: 0.18765, f2: 0.60734, recall: 0.58329


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: 1000 | val_loss: 0.18518, f2: 0.62, recall: 0.59764
Fold1, Epoch1/5 | train_loss: 0.15831


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: end | val_loss: 0.1844, f2: 0.64215, recall: 0.62574


  0%|          | 0/1378 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: 500 | val_loss: 0.18456, f2: 0.57944, recall: 0.5449


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: 1000 | val_loss: 0.1805, f2: 0.63869, recall: 0.61698
Fold1, Epoch2/5 | train_loss: 0.14328


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: end | val_loss: 0.17895, f2: 0.61884, recall: 0.58937


  0%|          | 0/1378 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: 500 | val_loss: 0.18248, f2: 0.66682, recall: 0.65062


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: 1000 | val_loss: 0.18239, f2: 0.67409, recall: 0.65962
Fold1, Epoch3/5 | train_loss: 0.12204


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: end | val_loss: 0.17983, f2: 0.66276, recall: 0.64261


  0%|          | 0/1378 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: 500 | val_loss: 0.18437, f2: 0.67237, recall: 0.65601


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: 1000 | val_loss: 0.18478, f2: 0.67518, recall: 0.65977
Fold1, Epoch4/5 | train_loss: 0.10872


  0%|          | 0/347 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: end | val_loss: 0.18519, f2: 0.67641, recall: 0.66146
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1381 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: 500 | val_loss: 0.22838, f2: 0.39676, recall: 0.35483


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: 1000 | val_loss: 0.2021, f2: 0.50409, recall: 0.46385
Fold2, Epoch0/5 | train_loss: 0.25156


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: end | val_loss: 0.19275, f2: 0.57447, recall: 0.54178


  0%|          | 0/1381 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: 500 | val_loss: 0.1876, f2: 0.60751, recall: 0.58045


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: 1000 | val_loss: 0.18592, f2: 0.6149, recall: 0.58797
Fold2, Epoch1/5 | train_loss: 0.15731


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: end | val_loss: 0.18495, f2: 0.6476, recall: 0.63008


  0%|          | 0/1381 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: 500 | val_loss: 0.1855, f2: 0.65309, recall: 0.63595


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: 1000 | val_loss: 0.18402, f2: 0.6299, recall: 0.60306
Fold2, Epoch2/5 | train_loss: 0.14239


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: end | val_loss: 0.18052, f2: 0.65746, recall: 0.63901


  0%|          | 0/1381 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: 500 | val_loss: 0.18242, f2: 0.66787, recall: 0.65177


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: 1000 | val_loss: 0.18209, f2: 0.66532, recall: 0.64726
Fold2, Epoch3/5 | train_loss: 0.12856


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: end | val_loss: 0.18092, f2: 0.66359, recall: 0.64377


  0%|          | 0/1381 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: 500 | val_loss: 0.18346, f2: 0.67133, recall: 0.65425


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: 1000 | val_loss: 0.18383, f2: 0.67167, recall: 0.65488
Fold2, Epoch4/5 | train_loss: 0.11866


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: end | val_loss: 0.18362, f2: 0.67483, recall: 0.65924
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1380 [00:00<?, ?it/s]

  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: 500 | val_loss: 0.23199, f2: 0.43061, recall: 0.39172


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: 1000 | val_loss: 0.20069, f2: 0.49049, recall: 0.44986
Fold3, Epoch0/5 | train_loss: 0.25004


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: end | val_loss: 0.19261, f2: 0.50544, recall: 0.46343


  0%|          | 0/1380 [00:00<?, ?it/s]

  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: 500 | val_loss: 0.18702, f2: 0.54947, recall: 0.51085


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: 1000 | val_loss: 0.17928, f2: 0.64263, recall: 0.62488
Fold3, Epoch1/5 | train_loss: 0.16175


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: end | val_loss: 0.17564, f2: 0.63183, recall: 0.60793


  0%|          | 0/1380 [00:00<?, ?it/s]

  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: 500 | val_loss: 0.1765, f2: 0.61931, recall: 0.58858


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: 1000 | val_loss: 0.17767, f2: 0.68086, recall: 0.67145
Fold3, Epoch2/5 | train_loss: 0.13637


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: end | val_loss: 0.17659, f2: 0.63252, recall: 0.60194


  0%|          | 0/1380 [00:00<?, ?it/s]

  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: 500 | val_loss: 0.17429, f2: 0.70455, recall: 0.69813


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: 1000 | val_loss: 0.17385, f2: 0.70372, recall: 0.69429
Fold3, Epoch3/5 | train_loss: 0.10893


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: end | val_loss: 0.17437, f2: 0.70069, recall: 0.68866


  0%|          | 0/1380 [00:00<?, ?it/s]

  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: 500 | val_loss: 0.17649, f2: 0.69566, recall: 0.68048


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: 1000 | val_loss: 0.17648, f2: 0.70037, recall: 0.68706
Fold3, Epoch4/5 | train_loss: 0.09247


  0%|          | 0/345 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: end | val_loss: 0.17683, f2: 0.70342, recall: 0.69165
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1382 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: 500 | val_loss: 0.23263, f2: 0.4774, recall: 0.4459


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: 1000 | val_loss: 0.20396, f2: 0.51906, recall: 0.48295
Fold4, Epoch0/5 | train_loss: 0.26018


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: end | val_loss: 0.1919, f2: 0.50962, recall: 0.4675


  0%|          | 0/1382 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: 500 | val_loss: 0.18806, f2: 0.60874, recall: 0.58499


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: 1000 | val_loss: 0.18281, f2: 0.59621, recall: 0.56664
Fold4, Epoch1/5 | train_loss: 0.15907


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: end | val_loss: 0.17937, f2: 0.61832, recall: 0.59252


  0%|          | 0/1382 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: 500 | val_loss: 0.18124, f2: 0.64774, recall: 0.63026


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: 1000 | val_loss: 0.1786, f2: 0.64542, recall: 0.6247
Fold4, Epoch2/5 | train_loss: 0.13879


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: end | val_loss: 0.17982, f2: 0.66538, recall: 0.65048


  0%|          | 0/1382 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: 500 | val_loss: 0.18036, f2: 0.65025, recall: 0.62903


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: 1000 | val_loss: 0.17797, f2: 0.67672, recall: 0.66465
Fold4, Epoch3/5 | train_loss: 0.12355


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: end | val_loss: 0.178, f2: 0.67636, recall: 0.66253


  0%|          | 0/1382 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: 500 | val_loss: 0.18073, f2: 0.66818, recall: 0.65107


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: 1000 | val_loss: 0.17983, f2: 0.67683, recall: 0.66224
Fold4, Epoch4/5 | train_loss: 0.11225


  0%|          | 0/343 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: end | val_loss: 0.17993, f2: 0.6755, recall: 0.66022
fold score: [0.7066243265562939, 0.6764140443185268, 0.6748286480580115, 0.7034189681227624, 0.6763571342467953]
CV: 0.3809


In [28]:
oof_pred = np.load(os.path.join(cfg.EXP_PREDS, "oof_pred.npy"))

In [31]:
oof_pred

array([0.00186062, 0.03253174, 0.08526611, ..., 0.00108147, 0.00169373,
       0.00108147], dtype=float32)

In [46]:
score = calc_comp_score(train_filtered_df, oof_pred, correlation_df, thr=0.07)
# LOGGER.info(f'fold score: {fold_score}')
print(f'CV: {round(score, 4)}')

CV: 0.3821


In [47]:
def calc_comp_recall(train_df, oof, correlation_df, thr=0.1):
    """2値分類の予測からcvを計算する"""
    oof_preds = np.where(oof>=thr, 1, 0)
    pred_df = train_df[oof_preds==1]
    pred_df = pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
    pred_df = pd.merge(correlation_df[["topic_id"]], pred_df, on="topic_id", how="left")
    pred_df = pred_df.fillna("nan")

    cv_score = comp_recall_score(correlation_df["content_id"], pred_df["content_id"])
    return cv_score

In [54]:
score = calc_comp_recall(train_filtered_df, oof_pred, correlation_df, thr=0.07)
# LOGGER.info(f'fold score: {fold_score}')
print(f'CV: {round(score, 4)}')

CV: 0.4043


In [55]:
if cfg.upload_from_colab:
    dataset_create_new(dataset_name=cfg.EXP, upload_dir=cfg.OUTPUT_EXP)

Starting upload for file XGBoost_fold_4.pkl


100%|██████████| 9.57M/9.57M [00:03<00:00, 2.80MB/s]


Upload successful: XGBoost_fold_4.pkl (10MB)
Starting upload for file XGBoost_fold_2.pkl


100%|██████████| 9.96M/9.96M [00:03<00:00, 2.88MB/s]


Upload successful: XGBoost_fold_2.pkl (10MB)
Starting upload for file tokenizer.tar


100%|██████████| 21.1M/21.1M [00:05<00:00, 4.06MB/s]


Upload successful: tokenizer.tar (21MB)
Starting upload for file language_oe.pkl


100%|██████████| 768/768 [00:02<00:00, 265B/s]  


Upload successful: language_oe.pkl (768B)
Starting upload for file ['topic_category', 'topic_language']_oe.pkl


100%|██████████| 864/864 [00:02<00:00, 340B/s]


Upload successful: ['topic_category', 'topic_language']_oe.pkl (864B)
Starting upload for file XGBoost_fold_1.pkl


100%|██████████| 8.89M/8.89M [00:05<00:00, 1.79MB/s]


Upload successful: XGBoost_fold_1.pkl (9MB)
Starting upload for file preds.tar


100%|██████████| 142M/142M [00:12<00:00, 12.1MB/s] 


Upload successful: preds.tar (142MB)
Starting upload for file XGBoost_fold_0.pkl


100%|██████████| 10.4M/10.4M [00:03<00:00, 2.97MB/s]


Upload successful: XGBoost_fold_0.pkl (10MB)
Starting upload for file model.tar


100%|██████████| 5.19G/5.19G [05:52<00:00, 15.8MB/s]  


Upload successful: model.tar (5GB)
Starting upload for file topic_vec_svd_dict.pkl


ReadTimeoutError: HTTPSConnectionPool(host='www.kaggle.com', port=443): Read timed out. (read timeout=None)

In [56]:
if cfg.upload_from_colab:
    dataset_create_new(dataset_name=cfg.EXP, upload_dir=os.path.join(cfg.OUTPUT, f"{cfg.EXP}.zip"))

NotADirectoryError: [Errno 20] Not a directory: '/home/working/output/exp009.zip/dataset-metadata.json'