In [158]:
! nvidia-smi

Sat Dec 31 13:36:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
|  0%   21C    P8    16W / 480W |   2965MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [245]:
import os

class Config:
    """Experiment-wide settings, used directly as a class.

    ``setup(Config)`` attaches further attributes (device, directory paths)
    to this class at runtime, and the notebook stores the tokenizer on it too.
    """
    AUTHOR = "shu421"

    # Experiment name; setup() uses it as the sub-directory under the output dir.
    EXP = "exp002"
    # Hugging Face model id loaded via AutoModel / AutoTokenizer / AutoConfig.
    MODEL_PATH = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    # Extra Kaggle datasets ("owner/name") that setup() downloads; empty means none.
    DATASET_PATH = []

    # Kaggle competition slug passed to `kaggle competitions download`.
    COMPETITION = "learning-equality-curriculum-recommendations"

    # Root directory under which setup() creates input/output/submission/dataset.
    BASE_PATH = "/home/working/"

    # Location of the Kaggle API token file read by setup().
    api_path = "/.kaggle/kaggle.json"

    # Toggles autocast / GradScaler in training().
    apex=True
    seed = 42
    num_fold = 5
    # Folds that training() actually iterates over.
    trn_fold = [0, 1, 2, 3, 4,]
    batch_size = 8
    n_epochs = 5
    # Tokenizer max_length (inputs are padded/truncated to this length).
    max_len = 512
    # Label columns read by TrainDataset and scored column-by-column in mcrmse().
    target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    
    weight_decay = 0.01
    # Selects the branch in get_scheduler(): "linear" or "cosine".
    scheduler="cosine"
    betas = (0.9, 0.999)
    # Starting LR for the backbone layers and LR for the head parameters, respectively.
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    # Multiplicative factor applied to the LR per layer in get_optimizer_grouped_parameters().
    lr_weight_decay = 0.95
    
    # NOTE(review): min_lr is not referenced in the visible part of the notebook.
    min_lr = 1e-6
    eps = 1e-6
    # training() runs validation every `eval_step` steps (skipping step 0).
    eval_step = 40
    num_cycles=0.5
    # Fraction of the total training steps used as warmup.
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1

    # weight and bias
    wandb = False
    
    # GPU Optimize Settings
    # Read by CustomModel: "freezing" freezes the first encoder layers,
    # "gradient_checkpoint" enables gradient checkpointing on the backbone.
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

    # NOTE(review): not referenced in the visible part of the notebook.
    upload_from_colab = True

    # Gradient-boosting settings; presumably consumed by a later stage of the
    # notebook that is not visible here -- TODO confirm.
    gbdt_model = "XGBoost"
    model_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "learning_rate": 0.1,
        "tree_method": "gpu_hist",
        "seed": seed,
        
    }

    train_params = {
        "early_stopping_rounds": 50,
        "num_boost_round": 99999,
        "verbose_eval": 100
    }

In [160]:
import os
import re
import gc
import pdb
import sys
import json
import time
import pickle
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')
# sns.set()
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error, f1_score, fbeta_score, recall_score, precision_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import TruncatedSVD

# ! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

from kaggle.api.kaggle_api_extended import KaggleApi

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
from transformers import logging
logging.set_verbosity_warning()
logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true

from cuml import NearestNeighbors


INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
env: TOKENIZERS_PARALLELISM=true


In [161]:
# ====================================================
# wandb
# ====================================================
if Config.wandb:

    import wandb
    import json

    try:
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")
        # Read the API key through a context manager so the file handle is
        # always closed, even when parsing fails.
        with open('/root/.kaggle/wandb.json', 'rb') as wandb_file:
            wandb_config = json.load(wandb_file)
        secret_value_0 = wandb_config['key']
        wandb.login(key=secret_value_0)
        anony = None
    except Exception:
        # Best effort: fall back to an anonymous run when the key file is
        # missing/invalid or the login fails. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate.
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        """Return the public (non-dunder) attributes of ``f`` as a dict."""
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    # One W&B run per experiment, grouped by backbone model.
    run = wandb.init(project=Config.COMPETITION, 
                     name=Config.EXP,
                     config=class2dict(Config),
                     group=Config.MODEL_PATH,
                     job_type="train",
                     anonymous=anony)

In [195]:
def decorate(s: str, decoration=None):
    """Wrap ``s`` in a banner for log output.

    Args:
        s: Value to display; converted with ``str``.
        decoration: String repeated 20 times on each side. Defaults to ``'='``.

    Returns:
        ``"<banner> <s> <banner>"`` as a single string.
    """
    unit = '=' if decoration is None else decoration
    banner = unit * 20
    return ' '.join((banner, str(s), banner))

In [163]:
def setup(cfg):
    # cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']

    # set dirs
    # cfg.DRIVE = cfg.DRIVE_PATH
    # cfg.EXP = (cfg.NAME if cfg.NAME is not None 
    #     else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
    # )
    cfg.INPUT = os.path.join(cfg.BASE_PATH, 'input')
    cfg.OUTPUT = os.path.join(cfg.BASE_PATH, 'output')
    cfg.SUBMISSION = os.path.join(cfg.BASE_PATH, 'submission')
    cfg.DATASET = os.path.join(cfg.BASE_PATH, 'dataset')

    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # make dirs
    for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    
    # if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
    if len(os.listdir(cfg.INPUT))==0:
        # load dataset
        !kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
        filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
        !unzip -d $cfg.INPUT $filepath
        
    
    for path in cfg.DATASET_PATH:
        datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
        if not os.path.exists(datasetpath):
            os.makedirs(datasetpath, exist_ok=True)
            !kaggle datasets download $path -p $datasetpath
            filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
            !unzip -d $datasetpath $filepath
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    """Publish ``upload_dir`` as a new Kaggle dataset.

    Writes ``dataset-metadata.json`` into ``upload_dir`` (id is
    ``<KAGGLE_USERNAME>/<dataset_name>``, license CC0-1.0) and then creates
    the dataset through the Kaggle API, uploading directories as tar archives.

    Args:
        dataset_name: Used both as the dataset slug and as its title.
        upload_dir: Directory whose contents are uploaded.
    """
    metadata = {
        'id': f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=4)

    client = KaggleApi()
    client.authenticate()
    client.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [272]:
# setup
# Prepare directories/credentials/data, then open the experiment log file.
# NOTE(review): get_logger is defined in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run unless that cell has been executed first.
cfg = setup(Config)
LOGGER = get_logger(Config.OUTPUT_EXP)

# Load the pretrained backbone, switch it to eval mode and move it to cfg.device.
model = AutoModel.from_pretrained(cfg.MODEL_PATH)
model.eval()
model.to(cfg.device)

# The tokenizer is stored on cfg because the Dataset classes read cfg.tokenizer.
tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer = tokenizer

In [164]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible kernel
    selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different algorithms from run to run, so it
    # is disabled alongside `deterministic`.
    torch.backends.cudnn.benchmark = False

# KFold
def _fold_series_from_splits(splits):
    """Turn an iterable of ``(train_idx, valid_idx)`` pairs into a fold Series.

    The returned Series is indexed by row position and holds, for each row,
    the number of the fold in which that row is part of the validation split.
    """
    fold_series = [
        pd.Series(fold, index=idx_valid)
        for fold, (_, idx_valid) in enumerate(splits)
    ]
    return pd.concat(fold_series).sort_index()


def get_kfold(train, n_splits, seed):
    """Assign each row of ``train`` to a fold with shuffled ``KFold``.

    Returns:
        pd.Series of fold numbers indexed by row position.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return _fold_series_from_splits(kf.split(train))


def get_stratifiedkfold(train, target_col, n_splits, seed):
    """Assign folds with shuffled ``StratifiedKFold`` on ``train[target_col]``.

    Returns:
        pd.Series of fold numbers indexed by row position.
    """
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return _fold_series_from_splits(kf.split(train, train[target_col]))


def get_groupkfold(train, target_col, group_col, n_splits):
    """Assign folds with ``GroupKFold`` using ``train[group_col]`` as groups.

    Returns:
        pd.Series of fold numbers indexed by row position.
    """
    kf = GroupKFold(n_splits=n_splits)
    return _fold_series_from_splits(
        kf.split(train, train[target_col], train[group_col])
    )


def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    """Assign folds with shuffled ``StratifiedGroupKFold``.

    Stratifies on ``train[target_col]`` and groups by ``train[group_col]``.

    Returns:
        pd.Series of fold numbers indexed by row position.
    """
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return _fold_series_from_splits(
        kf.split(train, train[target_col], train[group_col])
    )


def get_multilabelstratifiedkfold(train, target_col, n_splits, seed):
    """Assign folds with shuffled ``MultilabelStratifiedKFold``.

    ``train[target_col]`` is passed as the label argument of the splitter.

    Returns:
        pd.Series of fold numbers indexed by row position.
    """
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return _fold_series_from_splits(kf.split(train, train[target_col]))

In [275]:
def mcrmse(cfg, preds, df):
    """Average the per-target RMSE over ``cfg.target_list``.

    Args:
        cfg: Config providing ``target_list`` (column names, in prediction order).
        preds: 2-D array; column ``i`` is compared with ``df[cfg.target_list[i]]``.
        df: DataFrame holding the ground-truth columns.

    Returns:
        Sum of ``RMSE_i / len(target_list)`` over all targets.
    """
    n_targets = len(cfg.target_list)
    return sum(
        np.sqrt(mean_squared_error(preds[:, position], df[column])) / n_targets
        for position, column in enumerate(cfg.target_list)
    )

def get_logger(filename):
    """Return this module's logger writing to the console and ``<filename>.log``.

    Handlers attached by a previous call are removed and closed first, so
    re-running the cell does not duplicate every log line.

    Args:
        filename: Path without extension; ``.log`` is appended.

    Returns:
        The configured ``logging.Logger`` (level INFO, message-only format).
    """
    # Imported locally: the notebook-level name `logging` refers to
    # `transformers.logging`, not the standard library module.
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    console_handler = StreamHandler()
    console_handler.setFormatter(Formatter("%(message)s"))
    file_handler = FileHandler(filename=f"{filename}.log")
    file_handler.setFormatter(Formatter("%(message)s"))

    # Iterate over a copy because removeHandler mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    for fresh_handler in (console_handler, file_handler):
        logger.addHandler(fresh_handler)

    return logger

In [276]:
# Fix broken text encodings in the input text
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 instead.

    Returns the replacement bytes and the position at which to resume.
    """
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending slice as cp1252 instead.

    Returns the replacement text and the position at which to resume.
    """
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
# The registered names are what `resolve_encodings_and_normalize` passes as
# `errors=` to `str.encode` / `bytes.decode`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters.

    The text goes through an encode/decode round trip between
    ``raw_unicode_escape``, ``utf-8`` and ``cp1252`` using the custom error
    handlers registered under the names ``replace_decoding_with_cp1252`` and
    ``replace_encoding_with_utf8``, and is finally passed through ``unidecode``.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    second_pass = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(second_pass)

In [277]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    """Add a cleaned ``text`` column derived from ``full_text``.

    Every value of ``df['full_text']`` is passed through
    ``resolve_encodings_and_normalize``. ``df`` is modified in place and
    also returned.
    """
    df['text'] = df['full_text'].apply(resolve_encodings_and_normalize)
    return df

# dataset
class TrainDataset(Dataset):
    """Dataset yielding ``(tokenized_inputs, label)`` pairs for training.

    Texts come from ``df['text']`` and labels from the ``cfg.target_list``
    columns; tokenization happens lazily in ``__getitem__``.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        encoded = self.prepare_input(self.cfg, self.text[index])
        target = torch.tensor(self.labels[index], dtype=torch.float)
        return encoded, target

    @staticmethod
    def prepare_input(cfg, text):
        """Tokenize ``text`` to ``cfg.max_len`` and return long tensors.

        Only ``input_ids`` and ``attention_mask`` are kept.
        """
        encoded = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False,
        )
        return {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }

def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    The longest sequence is found from the row sums of
    ``inputs["attention_mask"]``; every tensor in ``inputs`` is then cut to
    that many columns. The dict is updated in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [278]:
def freeze(module):
    """
    Disable gradient computation for every parameter of ``module``.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)

class AttentionPooling(nn.Module):
    """Pool token embeddings with a learned attention score per token.

    A small MLP scores every token, padded positions are masked out, the
    scores are soft-maxed over the sequence axis and used as weights for a
    sum of the token embeddings.

    Args:
        in_dim: Size of the token embeddings.
        initializer_range: Std of the normal distribution used to initialize
            the linear layers of the scoring MLP.
    """

    def __init__(self, in_dim, initializer_range=0.02):
        super().__init__()
        self.initializer_range = initializer_range
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        )

        # `apply` visits every sub-module. Calling `_init_weights` on the
        # Sequential container itself matches none of the isinstance checks
        # and therefore initializes nothing.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize one sub-module (Linear / Embedding / LayerNorm)."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-weighted sum over the sequence axis.

        Args:
            last_hidden_state: Token embeddings, indexed (batch, token, feature).
            attention_mask: (batch, token) mask; positions equal to 0 are ignored.

        Returns:
            Tensor with the token axis reduced away.
        """
        w = self.attention(last_hidden_state).float()
        # Masked positions get -inf so that softmax assigns them zero weight.
        # NOTE: a row whose mask is all zeros yields NaN weights.
        w = w.masked_fill(attention_mask.unsqueeze(-1) == 0, float('-inf'))
        w = torch.softmax(w, 1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

class MeanPooling(nn.Module):
    """Average token embeddings over the positions enabled by the mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the masked mean of ``last_hidden_state`` along axis 1.

        The token count is clamped to at least 1e-9 to avoid division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_total / token_count
        

class WeightedLayerPooling(nn.Module):
    """Weighted average of hidden states from ``layer_start`` onwards.

    Args:
        num_hidden_layers: Number of hidden layers of the backbone.
        layer_start: Index of the first stacked hidden state that is used.
        layer_weights: Optional per-layer weights. When omitted, a learnable
            parameter of ones with ``num_hidden_layers + 1 - layer_start``
            entries is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_used_layers = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(
                torch.tensor([1] * n_used_layers, dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, ft_all_layers):
        """Stack the hidden states and return their weighted average."""
        stacked = torch.stack(ft_all_layers)[self.layer_start:, :, :, :]
        # One weight per layer, broadcast over the remaining three axes.
        weights = self.layer_weights[:, None, None, None].expand(stacked.size())
        weighted_sum = (weights * stacked).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

class CustomModel(nn.Module):
    """Transformer backbone + attention pooling + linear regression head.

    The backbone is loaded from ``cfg.MODEL_PATH`` with all dropout
    probabilities set to zero and hidden states exposed. The pooled feature
    is layer-normalized and mapped to 6 outputs.

    Args:
        cfg: Config providing ``MODEL_PATH`` and ``gpu_optimize_config``
            (keys ``freezing`` and ``gradient_checkpoint``).
    """

    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        # Several key spellings are set because different architectures name
        # their dropout options differently.
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        # Dropout modules for multi-sample dropout; `forward` currently does
        # not use them (the corresponding lines are commented out).
        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        """Initialize a single Linear / Embedding / LayerNorm module."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pool its first output with the attention mask."""
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        # all_layer_embeddings = outputs[1]
        # feature = self.weighted_layer_pool(all_layer_embeddings)
        # feature = self.pool(feature, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels=None):
        """Predict the targets and optionally compute the loss.

        Args:
            inputs: Dict of backbone inputs; must contain ``attention_mask``.
            labels: Optional target tensor. Defaults to ``None`` so that
                inference callers do not have to pass it explicitly.

        Returns:
            ``(loss, output)`` with a mean-reduced SmoothL1 loss when
            ``labels`` is given, otherwise only ``output``.
        """
        # batch, hidden_size
        feature = self.feature(inputs)
        feature = self.ln(feature)
        # feature1 = self.drop1(feature)
        # feature2 = self.drop2(feature)
        # feature3 = self.drop3(feature)
        # feature4 = self.drop4(feature)
        # feature5 = self.drop5(feature)
        # feature = (feature1 + feature2 + feature3 + feature4 + feature5) / 5
        output = self.fc(feature)

        if labels is None:
            return output

        loss_fct = nn.SmoothL1Loss(reduction='mean')
        loss = loss_fct(output, labels)
        return loss, output

import math
from torch.autograd.function import InplaceFunction
from torch.nn import Parameter
import torch.nn.init as init
class Mixout(InplaceFunction):
    """Autograd function that mixes ``input`` with a ``target`` tensor.

    In training mode with ``0 < p < 1`` a Bernoulli(1 - p) mask is sampled;
    the result is ``((1 - mask) * target + mask * input - p * target) / (1 - p)``.
    With ``p == 0`` or outside training mode the input is returned unchanged.
    """
    @staticmethod
    def _make_noise(input):
        # Uninitialized tensor with the same type and shape as `input`.
        return input.new().resize_as_(input)
    @classmethod
    def forward(cls, ctx, input, target=None, p=0.0, training=False, inplace=False):
        # Raises ValueError for p outside [0, 1] or a target whose size differs from input.
        if p < 0 or p > 1:
            raise ValueError("A mix probability of mixout has to be between 0 and 1," " but got {}".format(p))
        if target is not None and input.size() != target.size():
            raise ValueError(
                "A target tensor size must match with a input tensor size {},"
                " but got {}".format(input.size(), target.size())
            )
        # Stored on ctx because backward() branches on them.
        ctx.p = p
        ctx.training = training
        # Identity when mixing is disabled.
        if ctx.p == 0 or not ctx.training:
            return input
        # Without a target, mix towards zeros.
        if target is None:
            target = cls._make_noise(input)
            target.fill_(0)
        target = target.to(input.device)
        if inplace:
            ctx.mark_dirty(input)
            output = input
        else:
            output = input.clone()
        ctx.noise = cls._make_noise(input)
        if len(ctx.noise.size()) == 1:
            ctx.noise.bernoulli_(1 - ctx.p)
        else:
            # Sample the mask for the first row only and repeat it along dim 0,
            # so every row shares the same mask.
            ctx.noise[0].bernoulli_(1 - ctx.p)
            ctx.noise = ctx.noise[0].repeat(input.size()[0], 1)
        # NOTE(review): the result of expand_as is discarded, so this line has no effect.
        ctx.noise.expand_as(input)
        if ctx.p == 1:
            output = target
        else:
            output = ((1 - ctx.noise) * target + ctx.noise * output - ctx.p * target) / (1 - ctx.p)
        return output
    @staticmethod
    def backward(ctx, grad_output):
        # Gradient flows only to `input`; the other four forward arguments get None.
        if ctx.p > 0 and ctx.training:
            return grad_output * ctx.noise, None, None, None, None
        else:
            return grad_output, None, None, None, None
def mixout(input, target=None, p=0.0, training=False, inplace=False):
    """Functional entry point: apply the ``Mixout`` autograd function.

    All arguments are forwarded positionally to ``Mixout.apply``.
    """
    return Mixout.apply(input, target, p, training, inplace)
class MixLinear(torch.nn.Module):
    """Linear layer whose weight goes through ``mixout`` on every forward pass.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        bias: Whether to learn an additive bias.
        target: Tensor the weight is mixed with (``None`` means zeros inside mixout).
        p: Mix probability handed to ``mixout``.
    """
    __constants__ = ["bias", "in_features", "out_features"]

    def __init__(self, in_features, out_features, bias=True, target=None, p=0.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if not bias:
            self.register_parameter("bias", None)
        else:
            self.bias = Parameter(torch.Tensor(out_features))
        self.reset_parameters()
        self.target = target
        self.p = p

    def reset_parameters(self):
        """Kaiming-uniform weight init and fan-in based uniform bias init."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is None:
            return
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        limit = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -limit, limit)

    def forward(self, input):
        mixed_weight = mixout(self.weight, self.target, self.p, self.training)
        return F.linear(input, mixed_weight, self.bias)

    def extra_repr(self):
        # "dropout" when mixing towards zeros, "mixout" when a target is set.
        mode = "mix" if self.target is not None else "drop"
        has_bias = self.bias is not None
        return (
            f"{mode}out={self.p}, in_features={self.in_features}, "
            f"out_features={self.out_features}, bias={has_bias}"
        )
def replace_mixout(model, p=0.2):
    """Swap every ``nn.Linear`` for a ``MixLinear`` and disable dropout.

    Each replacement layer copies the weights of the layer it replaces and
    uses those weights as its mixout target. Every ``nn.Dropout`` found has
    its probability set to 0.

    Args:
        model: Module that is modified in place.
        p: Mix probability given to every ``MixLinear`` (default 0.2, the
            previously hard-coded value).

    Returns:
        The same ``model`` instance.
    """
    # Snapshot the module tree first so children are not replaced while the
    # tree is still being traversed lazily.
    for sup_module in list(model.modules()):
        for name, module in list(sup_module.named_children()):
            if isinstance(module, nn.Dropout):
                module.p = 0.0
            if isinstance(module, nn.Linear):
                target_state_dict = module.state_dict()
                has_bias = module.bias is not None
                new_module = MixLinear(
                    module.in_features, module.out_features, has_bias, target_state_dict["weight"], p
                )
                new_module.load_state_dict(target_state_dict)
                setattr(sup_module, name, new_module)
    return model

In [279]:
def get_optimizer_grouped_parameters(cfg, model):
    '''Layerwise Learning Rate Decay

    Builds optimizer parameter groups:

    * one group with every parameter whose name does not contain ``"model"``
      (the head), using ``cfg.decoder_lr`` and no weight decay;
    * for the embeddings and each encoder layer of ``model.model``, visited
      from the last layer back to the embeddings, one group with weight decay
      and one without (biases and LayerNorm parameters). The learning rate
      starts at ``cfg.encoder_lr`` and is multiplied by ``cfg.lr_weight_decay``
      after every layer.

    Returns:
        List of parameter-group dicts accepted by torch optimizers.
    '''
    model_type = 'model'
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]

    def _is_no_decay(param_name):
        return any(nd in param_name for nd in no_decay)

    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if model_type not in n],
            'lr': cfg.decoder_lr,
            'weight_decay': 0.0,
        },
    ]

    backbone = getattr(model, model_type)
    layers = [backbone.embeddings] + list(backbone.encoder.layer)

    lr = cfg.encoder_lr
    # Walk from the top encoder layer down to the embeddings so the layers
    # closest to the output receive the largest learning rate.
    for layer in reversed(layers):
        named_params = list(layer.named_parameters())
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in named_params if not _is_no_decay(n)],
                "weight_decay": cfg.weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in named_params if _is_no_decay(n)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
        lr *= cfg.lr_weight_decay
    return optimizer_grouped_parameters


# initialize layer
def reinit_bert(model):
    """Re-initialize the last encoder layer of ``model.model``.

    Linear and Embedding weights are redrawn from a normal distribution with
    std ``model.config.initializer_range`` (biases and the padding row are
    zeroed); LayerNorm is reset to weight 1 / bias 0. Returns ``model``.
    """
    std = model.config.initializer_range
    for block in model.model.encoder.layer[-1:]:
        for submodule in block.modules():
            if isinstance(submodule, nn.Linear):
                submodule.weight.data.normal_(mean=0.0, std=std)
                if submodule.bias is not None:
                    submodule.bias.data.zero_()
                continue
            if isinstance(submodule, nn.Embedding):
                submodule.weight.data.normal_(mean=0.0, std=std)
                if submodule.padding_idx is not None:
                    submodule.weight.data[submodule.padding_idx].zero_()
                continue
            if isinstance(submodule, nn.LayerNorm):
                submodule.bias.data.zero_()
                submodule.weight.data.fill_(1.0)
    return model

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: Config providing ``scheduler`` (``'linear'`` or ``'cosine'``),
            ``num_warmup_steps_rate`` and, for cosine, ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: If ``cfg.scheduler`` is neither ``'linear'`` nor ``'cosine'``.
    """
    num_warmup_steps = int(num_train_steps * cfg.num_warmup_steps_rate)

    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    # Fail with a clear message instead of an UnboundLocalError on `scheduler`.
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'."
    )

In [280]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    """Fast Gradient Method adversarial perturbation of embedding weights.

    Usage per training step: run backward on the clean loss, call
    ``attack()``, run forward/backward on the perturbed model, then call
    ``restore()`` before the optimizer step.
    """

    def __init__(self, model):
        self.model = model
        # Maps parameter name -> copy of its data taken before the attack.
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='word_embeddings'):
        """Move matching parameters by ``epsilon`` along their normalized gradient.

        Args:
            epsilon: Size of the perturbation.
            emb_name: Substring identifying the parameters to perturb.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            # Parameters that did not take part in the backward pass have no
            # gradient; there is nothing to perturb (or back up) for them.
            if param.grad is None:
                continue
            self.backup[name] = param.data.clone()
            norm = torch.norm(param.grad)
            if norm != 0 and not torch.isnan(norm):
                r_at = epsilon * param.grad / norm
                param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        """Put back the values saved by ``attack`` and clear the backup."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.backup:
                param.data = self.backup[name]
        # Cleared only after ALL parameters were visited. Clearing inside the
        # loop would drop the backup of every parameter that is not first in
        # named_parameters().
        self.backup = {}

In [281]:
def valid_fn(cfg, valid_loader, model, valid_df, fold, epoch, step, best_val_preds, best_val_score):
    """Evaluate ``model`` on the validation loader and checkpoint on improvement.

    Args:
        cfg: Config (``device``, ``apex``, ``n_epochs``, ``EXP_MODEL``, ``target_list``).
        valid_loader: Loader yielding ``(inputs, labels)`` batches, unshuffled
            so predictions line up with ``valid_df``.
        model: Model returning ``(loss, output)``; it is left in eval mode.
        valid_df: Validation frame with the target columns.
        fold, epoch, step: Used for logging and the checkpoint file name.
        best_val_preds: Best predictions so far.
        best_val_score: Best (lowest) score so far.

    Returns:
        ``(best_val_preds, best_val_score, val_loss)``; the first two are
        updated when the current score is lower than ``best_val_score``.
    """
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        # with tqdm(valid_loader, total=len(valid_loader)) as pbar:
        # for (inputs, labels) in pbar:
        for (inputs, labels) in valid_loader:
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            # Honor cfg.apex exactly like the training loop does.
            with autocast(enabled=cfg.apex, dtype=torch.bfloat16):
                loss, output = model(inputs, labels)

            # Cast to float32 (bfloat16 has no NumPy equivalent); float16
            # would needlessly lose precision when autocast is disabled.
            output = output.detach().float().cpu().numpy()
            val_preds.append(output)
            # Weight the batch loss by batch size for a sample-level average.
            val_losses.append(loss.item() * len(labels))
            val_nums.append(len(labels))

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    LOGGER.info(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epochs}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if best_val_score > score:
        # print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(), 
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
    
    return best_val_preds, best_val_score, val_loss

def training(cfg, train):
    """Run cross-validated fine-tuning and return the out-of-fold score.

    For every fold in ``cfg.trn_fold`` a fresh ``CustomModel`` is trained with
    layer-wise learning rates, mixout, FGM adversarial training and periodic
    validation; the best validation predictions per fold are collected into
    an out-of-fold array and saved under ``cfg.EXP_PREDS``.

    Args:
        cfg: Config. Besides the static settings it must provide ``folds``
            (compared against the fold number to split ``train``) --
            NOTE(review): ``cfg.folds`` is not set in the visible part of the
            notebook; confirm the caller assigns it.
        train: Training DataFrame with ``text`` and the target columns.
            Index labels are used as row positions of the out-of-fold array --
            assumes a default RangeIndex, TODO confirm.

    Returns:
        MCRMSE of the out-of-fold predictions.
    """
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 6), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        valid_idx = list(valid_df.index)

        # Dataset setup
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        # os.path.join puts the file INSIDE the model directory; plain string
        # concatenation produced "<...>/modelconfig.pth" next to it.
        torch.save(model.config, os.path.join(cfg.EXP_MODEL, 'config.pth'))
        model = reinit_bert(model)
        model = replace_mixout(model)
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)


        # enable FGM
        fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
            train_losses = []
            train_nums = []
            model.train() 
            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex, dtype=torch.bfloat16):
                        loss, output = model(inputs, labels)

                    # get_last_lr() is the supported way to read the current
                    # learning rate outside of scheduler.step().
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_last_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                    # FGM attack: perturb the embeddings, accumulate the
                    # adversarial gradient, then restore the embeddings.
                    fgm.attack()
                    with autocast(enabled=cfg.apex, dtype=torch.bfloat16):
                        loss_adv, _ = model(inputs, labels)
                    # Scale like the clean loss so both contribute equally
                    # when gradients are accumulated.
                    if cfg.gradient_accumulation_steps > 1:
                        loss_adv = loss_adv / cfg.gradient_accumulation_steps
                    scaler.scale(loss_adv).backward()
                    fgm.restore()

                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        if cfg.clip_grad_norm is not None:
                            # Gradients must be unscaled before clipping,
                            # otherwise the threshold is compared against
                            # loss-scaled values.
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(
                                model.parameters(),
                                cfg.clip_grad_norm
                            )
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        best_val_preds, best_val_score, val_loss = valid_fn(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            epoch,
                            step,
                            best_val_preds,
                            best_val_score,
                        )
                        # valid_fn leaves the model in eval mode.
                        model.train()

                    if cfg.wandb:
                        wandb.log({f"[fold{fold}] train_loss": loss.item(),
                                f"[fold{fold}] lr": scheduler.get_last_lr()[0]})

            train_loss = sum(train_losses)/sum(train_nums)

            LOGGER.info(f'Fold{fold}, Epoch{epoch}/{cfg.n_epochs} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss = valid_fn(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

            if cfg.wandb:
                # Log the epoch average, not the loss of the last batch.
                wandb.log({f"[fold{fold}] epoch": epoch, 
                        f"[fold{fold}] avg_train_loss": train_loss, 
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        # Free GPU memory before the next fold builds a new model.
        del model, fgm
        gc.collect()
        torch.cuda.empty_cache()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    LOGGER.info(f'fold score: {fold_score}')
    LOGGER.info(f'CV: {round(score, 4)}')
    return score

In [175]:
class EmbDataSet(Dataset):
    """Dataset that tokenizes one text column of a DataFrame for embedding extraction.

    Each item is a dict with ``input_ids`` and ``attention_mask`` tensors
    (dtype ``torch.long``) produced by ``cfg.tokenizer``.
    """

    def __init__(self, cfg, df, col):
        self.cfg = cfg
        # NOTE: missing values are replaced with "no text" and written back
        # into the caller's DataFrame, not only into this dataset.
        df[col] = df[col].fillna("no text")
        self.text = df[col].to_numpy()

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        return self.prepare_input(self.cfg, self.text[index])

    @staticmethod
    def prepare_input(cfg, text):
        """Tokenize ``text`` to a fixed length of ``cfg.max_len`` and return long tensors."""
        encoded = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False,
        )
        # Only these two fields are handed on; anything else the tokenizer returns is dropped.
        return {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }

In [176]:
def _load_or_compute_vec(loader, cache_path):
    """Return the embedding matrix cached at ``cache_path``, computing and caching it if absent."""
    if os.path.isfile(cache_path):
        # NOTE: pickle can execute arbitrary code on load; only read caches
        # that this notebook wrote itself.
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    batch_vecs = []
    for batch in tqdm(loader):
        batch = collate(batch)
        batch = {k: v.to(cfg.device) for k, v in batch.items()}
        with torch.no_grad():
            output = model(**batch)
        # NOTE(review): plain mean over the token axis with no attention-mask
        # weighting, so any padding positions left after collate() are averaged in.
        batch_vecs.append(output.last_hidden_state.mean(1).cpu().numpy())
    vec = np.concatenate(batch_vecs)
    with open(cache_path, "wb") as f:
        pickle.dump(vec, f)
    return vec


def get_emb_vec(content_df, topic_df, col):
    """Embed column ``col`` of the content and topic frames with the global ``model``.

    Results are cached as ``content_{col}_vec.pkl`` / ``topic_{col}_vec.pkl``
    under ``cfg.OUTPUT_EXP``; an existing cache file is loaded instead of
    being recomputed.

    Args:
        content_df: DataFrame holding the content rows.
        topic_df: DataFrame holding the topic rows.
        col: name of the text column to embed (present in both frames).

    Returns:
        Tuple ``(content_vec, topic_vec)`` of numpy arrays, one row per input row.

    Note:
        Relies on the notebook globals ``cfg``, ``model`` and ``collate``.
        Building the datasets fills missing text in ``col`` in place on both
        frames, even when the cached vectors are used.
    """
    content_dataset = EmbDataSet(cfg, content_df, col)
    topics_dataset = EmbDataSet(cfg, topic_df, col)

    content_loader = DataLoader(
        dataset=content_dataset,
        batch_size=256,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    topics_loader = DataLoader(
        dataset=topics_dataset,
        batch_size=256,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    content_vec = _load_or_compute_vec(
        content_loader, os.path.join(cfg.OUTPUT_EXP, f"content_{col}_vec.pkl")
    )
    topic_vec = _load_or_compute_vec(
        topics_loader, os.path.join(cfg.OUTPUT_EXP, f"topic_{col}_vec.pkl")
    )

    return content_vec, topic_vec

In [226]:
def comp_fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    """Mean per-row F-beta between space-separated id strings.

    Args:
        y_true_ids: Series of ground-truth ids, one space-separated string per row.
        y_pred_ids: Series of predicted ids in the same row order.
        beta: weight of recall relative to precision (2 gives F2).
        eps: small constant keeping the denominator non-zero.

    Returns:
        Average F-beta over all rows; 0.0 when there are no rows.

    Ids are compared as sets, so repeated ids in a row count once. A row whose
    prediction (or truth) is empty or missing scores 0 instead of raising.
    """
    def _as_id_set(ids):
        # str.split() leaves missing entries as NaN/None rather than a list.
        return set(ids) if isinstance(ids, list) else set()

    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        true_set = _as_id_set(true)
        pred_set = _as_id_set(pred)
        n_hit = len(true_set & pred_set)
        precision = n_hit / len(pred_set) if pred_set else 0.0
        recall = n_hit / len(true_set) if true_set else 0.0
        f_beta = (1 + beta**2) * (precision * recall) / ((beta**2) * precision + recall + eps)
        score_list.append(f_beta)
    if not score_list:
        return 0.0
    return sum(score_list) / len(score_list)

def comp_recall_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    """Mean per-row recall between space-separated id strings.

    ``beta`` and ``eps`` are accepted for signature parity with
    ``comp_fbeta_score`` but are not used.
    """
    truths = y_true_ids.str.split().tolist()
    candidates = y_pred_ids.str.split().tolist()
    # Hits are counted as a set intersection; the denominator is the raw
    # number of ground-truth tokens in the row.
    recalls = [
        len(set(truth) & set(cand)) / len(truth)
        for truth, cand in zip(truths, candidates)
    ]
    return sum(recalls) / len(recalls)


In [227]:
import xgboost as xgb
import lightgbm as lgb

class XGBoost:
    """Minimal fit/predict wrapper around the native ``xgb.train`` API."""

    def __init__(self, model_params: dict, train_params: dict):
        # model_params go to xgb.train(params=...); train_params are
        # splatted into xgb.train as extra keyword arguments.
        self.model_params, self.train_params = model_params, train_params

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Train a booster, reporting metrics on the validation set under the name "valid"."""
        train_matrix = xgb.DMatrix(X_train, label=y_train)
        valid_matrix = xgb.DMatrix(X_valid, label=y_valid)
        watchlist = [(valid_matrix, "valid")]
        self.model = xgb.train(
            params=self.model_params,
            dtrain=train_matrix,
            evals=watchlist,
            **self.train_params,
        )

    def predict(self, features):
        """Return the booster's predictions for ``features``."""
        feature_matrix = xgb.DMatrix(features)
        return self.model.predict(feature_matrix)

    

class LightGBM:
    """Minimal fit/predict wrapper around the native ``lgb.train`` API."""

    def __init__(self, model_params: dict, train_params: dict):
        # model_params go to lgb.train(params=...); train_params are
        # splatted into lgb.train as extra keyword arguments.
        self.model_params, self.train_params = model_params, train_params

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Train with early stopping (100 rounds), logging every 1000 iterations."""
        train_set = lgb.Dataset(X_train, label=y_train)
        valid_set = lgb.Dataset(X_valid, label=y_valid)
        callbacks = [
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(1000),
        ]
        self.model = lgb.train(
            params=self.model_params,
            train_set=train_set,
            valid_sets=[train_set, valid_set],
            valid_names=['train', 'valid'],
            callbacks=callbacks,
            **self.train_params,
        )

    def predict(self, features):
        """Return the trained model's predictions for ``features``."""
        return self.model.predict(features)
    
def get_model(gbdt_model):
    """Build the GBDT wrapper selected by name.

    Args:
        gbdt_model: ``'LightGBM'`` or ``'XGBoost'``.

    Returns:
        A wrapper instance configured from ``Config.model_params`` and
        ``Config.train_params``.

    Raises:
        ValueError: if ``gbdt_model`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if gbdt_model == 'LightGBM':
        model = LightGBM(model_params=Config.model_params,
                         train_params=Config.train_params)
    elif gbdt_model == 'XGBoost':
        model = XGBoost(model_params=Config.model_params,
                        train_params=Config.train_params)
    else:
        raise ValueError(
            f"unknown gbdt_model {gbdt_model!r}; expected 'LightGBM' or 'XGBoost'"
        )
    return model


def save_model(filepath, model):
    """Pickle ``model`` to ``filepath``, overwriting any existing file."""
    with open(filepath, 'wb') as out_file:
        pickle.dump(model, out_file)

def load_model(filepath):
    """Unpickle and return the object stored at ``filepath``."""
    # NOTE: unpickling can execute arbitrary code; only load files written by save_model.
    with open(filepath, 'rb') as in_file:
        return pickle.load(in_file)

In [228]:
def get_whole_df():
    """Read the four competition CSVs from ``cfg.INPUT``.

    Returns:
        ``(content_df, topic_df, correlation_df, sub_df)`` read from
        content.csv, topics.csv, correlations.csv and sample_submission.csv.
    """
    file_names = (
        'content.csv',
        'topics.csv',
        'correlations.csv',
        'sample_submission.csv',
    )
    return tuple(pd.read_csv(os.path.join(cfg.INPUT, name)) for name in file_names)

def change_col_name(content_df, topic_df, correlation_df):
    """Rename the id columns so the three frames share join keys.

    ``id`` becomes ``content_id`` / ``topic_id`` and ``content_ids`` becomes
    ``content_id``. New frames are returned; the inputs are left unchanged.
    """
    return (
        content_df.rename(columns={"id": "content_id"}),
        topic_df.rename(columns={"id": "topic_id"}),
        correlation_df.rename(columns={"content_ids": "content_id"}),
    )

def get_cv_list(X, y=None, groups=None, n_splits=5, seed=42):
    """Materialise the shuffled StratifiedGroupKFold splits of ``X`` as a list."""
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    folds = splitter.split(X, y, groups)
    return list(folds)

def get_processed_df():
    """Load the competition frames and rename their id columns.

    Returns:
        ``(content_df, topic_df, correlation_df, sub_df)``; the first three
        have been passed through ``change_col_name``, ``sub_df`` is as read.
    """
    raw_content, raw_topic, raw_correlation, sub_df = get_whole_df()
    content_df, topic_df, correlation_df = change_col_name(
        raw_content, raw_topic, raw_correlation
    )
    return content_df, topic_df, correlation_df, sub_df

In [239]:
class Timer:
    """Context manager that times a ``with`` block and reports the elapsed seconds.

    Args:
        logger: object with an ``info`` method; when falsy the message is printed.
        format_str: format string receiving the duration in seconds.
        prefix: optional text placed before the formatted duration.
        suffix: optional text placed after the formatted duration.
        sep: separator between prefix/suffix and the duration.
        verbose: pass ``None`` to suppress the report; any other value reports.
    """

    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" ", verbose=0):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None
        self.verbose = verbose

    @property
    def duration(self):
        """Elapsed seconds of the last completed block; 0 while none has finished."""
        if self.start is None or self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        # Clear any end time left over from a previous use so `duration`
        # cannot go negative while the block is still running.
        self.end = None
        self.start = time.time()
        # Return the timer so `with Timer() as t:` binds it instead of None.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time.time()
        if self.verbose is None:
            return
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)


def reduce_mem_usage(df, verbose=True):
    """Reduce memory by casting numeric columns of a DataFrame to narrower dtypes.

    Integer columns are cast to the smallest of int8/int16/int32/int64 that
    holds their min and max (bounds inclusive). Float columns are cast to
    float32 when their values fit strictly inside the float32 range, otherwise
    to float64. All other columns are passed through unchanged.

    Args:
        df: input DataFrame (not modified).
        verbose: when True, print the resulting size and percentage saved.

    Returns:
        A new DataFrame with the same columns, in the same order.
    """
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    def _downcast(series):
        col_type = series.dtypes
        if col_type not in numerics:
            return series
        c_min = series.min()
        c_max = series.max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                info = np.iinfo(candidate)
                if info.min <= c_min and c_max <= info.max:
                    return series.astype(candidate)
            # min/max are NaN for an empty column, so nothing matched:
            # keep the column as it is rather than dropping it.
            return series
        # float16 is deliberately never chosen; the original code also mapped
        # float16-range values to float32.
        if c_min > float32_info.min and c_max < float32_info.max:
            return series.astype(np.float32)
        return series.astype(np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    dfs = [_downcast(df[col]) for col in df.columns]

    # pd.concat raises on an empty list, so a column-less frame is just copied.
    df_out = pd.concat(dfs, axis=1) if dfs else df.copy()
    if verbose:
        end_mem = df_out.memory_usage().sum() / 1024**2
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Mem. usage decreased to {end_mem:.2f}Mb:  {reduction:.1f}% reduction")
    return df_out
    

class AbstractBaseBlock:
    """Base class for feature blocks: ``fit`` for training data, ``transform`` for new data."""

    def fit(self, input_df: pd.DataFrame, y=None):
        """Default fit: there is nothing to learn, so simply transform the input."""
        transformed = self.transform(input_df)
        return transformed

    def transform(self, input_df: pd.DataFrame):
        """Produce this block's features; subclasses must override."""
        raise NotImplementedError()

class IdentityBlock(AbstractBaseBlock):
    """Features that are used as they are (plain column pass-through)."""

    def __init__(self, use_cols):
        self.use_cols = use_cols

    def transform(self, input_df):
        """Return a copy of the selected columns."""
        selected = input_df[self.use_cols]
        return selected.copy()

class LabelEncodingBlock(AbstractBaseBlock):
    """Label-encode the specified columns with an OrdinalEncoder."""

    def __init__(self, cols):
        self.cols = cols

    def fit(self, input_df, y=None):
        """Fit the encoder on ``input_df`` and return the encoded columns."""
        self.oe = OrdinalEncoder()
        self.oe.fit(input_df[self.cols])
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Encode ``self.cols`` with the fitted encoder.

        The result is built from the encoder's array output, so it carries a
        fresh default index rather than ``input_df``'s index.
        """
        selected = input_df[self.cols].copy()
        encoded = self.oe.transform(selected)
        return pd.DataFrame(encoded, columns=self.cols)

class SVDBlock(AbstractBaseBlock):
    """Compress a title and a description embedding matrix with TruncatedSVD.

    Args:
        cols: two feature-name prefixes, paired in order with
            ``title_vec`` and ``desc_vec``.
        dim: number of SVD components kept per matrix.
        title_vec: embedding matrix paired with ``cols[0]``.
        desc_vec: embedding matrix paired with ``cols[1]``.

    Only ``fit`` is implemented; ``transform`` is inherited from the base
    class and raises NotImplementedError.
    """

    def __init__(self, cols, dim, title_vec, desc_vec):
        self.cols = cols
        self.dim = dim
        # Not used by fit(), which trains one fresh TruncatedSVD per matrix;
        # kept so the attribute remains available to existing code.
        self.svd = TruncatedSVD(n_components=dim, random_state=Config.seed)
        self.title_vec = title_vec
        self.desc_vec = desc_vec

    def fit(self, input_df, y=None):
        """Fit one SVD per matrix and return the reduced features.

        ``input_df`` is not read: the features come from the matrices given
        to the constructor. The fitted SVDs are stored on ``self.svd_dict``
        and pickled to ``<cols joined by "_">_svd_dict.pkl`` under
        ``cfg.OUTPUT_EXP``.

        Returns:
            DataFrame with columns ``{name}_{i}`` for every name in ``cols``
            and every component index ``i``.
        """
        svd_dict = dict()
        frames = []
        for svd_name, vec in zip(
            self.cols,
            [self.title_vec, self.desc_vec]
        ):
            svd = TruncatedSVD(n_components=self.dim, random_state=Config.seed)
            svd_vec = svd.fit_transform(vec)
            # Keep the fitted SVD next to its transformed vectors.
            svd_dict[svd_name] = svd
            frames.append(
                pd.DataFrame(
                    svd_vec,
                    columns=[f"{svd_name}_{i_dim}" for i_dim in range(self.dim)],
                )
            )

        self.svd_dict = svd_dict
        svd_path = os.path.join(cfg.OUTPUT_EXP, "_".join(self.cols) + "_svd_dict.pkl")
        with open(svd_path, "wb") as f:
            pickle.dump(svd_dict, f)

        # Build the frame in one go instead of inserting column by column.
        return pd.concat(frames, axis=1) if frames else pd.DataFrame()



def run_blocks(input_df, blocks, y=None, test=False):
    """Create the features of every block and join them column-wise.

    Args:
        input_df: frame handed to each block.
        blocks: iterable of feature blocks exposing ``fit`` / ``transform``.
        y: optional target forwarded to ``block.fit``.
        test: when False call ``block.fit``; when True call ``block.transform``.

    Returns:
        DataFrame whose columns are each block's output columns suffixed with
        ``@<BlockClassName>``; an empty DataFrame when ``blocks`` is empty.

    Raises:
        AssertionError: if a block returns a different number of rows than
            ``input_df`` or the combined column names are not unique.
    """
    print(decorate("start run blocks...", "*"))

    block_frames = []
    with Timer(prefix="run test={}".format(test)):
        for block in blocks:
            with Timer(prefix=f"\t- {str(block)}"):
                if not test:
                    out_i = block.fit(input_df, y=y)
                else:
                    out_i = block.transform(input_df)

            assert len(input_df) == len(out_i), block
            out_i = reduce_mem_usage(out_i, verbose=False)
            name = block.__class__.__name__
            block_frames.append(out_i.add_suffix(f"@{name}"))

        # Concatenate once at the end rather than growing a frame (seeded with
        # an empty DataFrame) on every iteration.
        out_df = pd.concat(block_frames, axis=1) if block_frames else pd.DataFrame()
    assert len(out_df.columns) == len(set(out_df.columns)), "col name duplicates"
    return out_df

def get_cand_df(content_df, topic_df, correlation_df, content_title_vec, topics_title_vec,
                n_neighbors_c2t=60, n_neighbors_t2c=40):
    """
    Output content candidates for each topic using kNN, plus targets and CV splits.

    Candidates are the union of two nearest-neighbour searches over the title
    embeddings: for every content its nearest topics, and for every topic its
    nearest contents. Only topics present in ``correlation_df`` are kept.

    Args:
        content_df: content frame with a ``content_id`` column.
        topic_df: topic frame with ``topic_id`` and ``channel`` columns.
        correlation_df: ground truth with ``topic_id`` and space-separated ``content_id``.
        content_title_vec: embedding matrix, one row per row of ``content_df``.
        topics_title_vec: embedding matrix, one row per row of ``topic_df``.
        n_neighbors_c2t: number of topics retrieved per content.
        n_neighbors_t2c: number of contents retrieved per topic.

    Returns:
        ``(cand_df, target_df, cv_list)``: one (topic_id, content_id) candidate
        per row with a default index, the 0/1 target Series in the same row
        order, and the splits from ``get_cv_list`` grouped by topic channel.
    """
    # dict(Series) maps index label -> id, so this assumes both frames carry
    # a default 0..n-1 index matching the row order of the embedding matrices.
    id2content_dict = dict(content_df["content_id"])
    id2topics_dict = dict(topic_df["topic_id"])

    # Predict which topics each content matches.
    knn_model = NearestNeighbors(n_neighbors=n_neighbors_c2t)
    knn_model.fit(topics_title_vec)
    _, indices = knn_model.kneighbors(content_title_vec)
    knn_pred_c2t_dict = {k: [] for k in topic_df["topic_id"]}
    for content_pos, topic_positions in enumerate(indices):
        for topic_pos in topic_positions:
            knn_pred_c2t_dict[id2topics_dict[topic_pos]].append(id2content_dict[content_pos])

    # Predict which contents each topic matches.
    knn_model = NearestNeighbors(n_neighbors=n_neighbors_t2c)
    knn_model.fit(content_title_vec)
    _, indices = knn_model.kneighbors(topics_title_vec)
    knn_pred_t2c_dict = {k: [] for k in topic_df["topic_id"]}
    for topic_pos, content_positions in enumerate(indices):
        for content_pos in content_positions:
            knn_pred_t2c_dict[id2topics_dict[topic_pos]].append(id2content_dict[content_pos])

    # Union of both directions per topic, stored as a space-separated string.
    knn_pred_df = pd.DataFrame({
        "topic_id": list(knn_pred_c2t_dict.keys()),
        "content_id": [
            " ".join(np.unique(knn_pred_c2t_dict[k] + knn_pred_t2c_dict[k]))
            for k in knn_pred_c2t_dict
        ],
    })
    # Keep only the topics that appear in the training correlations.
    knn_pred_df = knn_pred_df[knn_pred_df["topic_id"].isin(correlation_df["topic_id"])]
    knn_pred_df = knn_pred_df.reset_index(drop=True)

    # Report candidate statistics. Truth and candidates are joined on topic_id
    # so recall does not depend on the two frames sharing the same row order.
    recall_df = pd.merge(
        correlation_df[["topic_id", "content_id"]],
        knn_pred_df,
        on="topic_id",
        how="inner",
        suffixes=("_true", "_pred"),
    )
    recall = comp_recall_score(recall_df["content_id_true"], recall_df["content_id_pred"])
    print(f"recall = {round(recall, 5)}")
    n_bin_data = knn_pred_df["content_id"].apply(lambda x: len(x.split())).sum()
    print(f"n_data = {n_bin_data}")

    # Turn the candidate strings into one row per (topic, content) pair.
    knn_pred_df["content_id"] = knn_pred_df["content_id"].apply(lambda x: x.split(" "))
    # explode() repeats the parent row's index label; reset it so cand_df and
    # target_df share the same default index.
    cand_df = knn_pred_df.explode("content_id").reset_index(drop=True)

    # Build the target: 1 when the pair is in the ground truth, else 0.
    correlation_df_ = correlation_df.copy()
    correlation_df_["content_id"] = correlation_df_["content_id"].apply(lambda x: x.split(" "))
    correlation_df_ = correlation_df_.explode("content_id")
    correlation_df_["target"] = 1
    target_df = pd.merge(cand_df, correlation_df_, on=["topic_id", "content_id"], how="left")
    target_df = target_df["target"].fillna(0).astype(int)

    # Group the folds by the topic's channel.
    topic_df_ = pd.merge(cand_df, topic_df[["topic_id", "channel"]], on="topic_id", how="left")

    cv_list = get_cv_list(X=cand_df, y=target_df, groups=topic_df_["channel"], n_splits=5, seed=cfg.seed)
    return cand_df, target_df, cv_list

def get_feature_df(cand_df, target_df, correlation_df, content_df, topic_df, cv_list, vecs):
    """Build the model feature matrix for every (topic, content) candidate.

    Content and topic features (label-encoded categoricals plus 32-dim SVD
    of the title/description embeddings) are computed per entity and then
    left-joined onto ``cand_df``. ``target_df``, ``correlation_df`` and
    ``cv_list`` are accepted but not used.

    Args:
        vecs: ``[content_title_vec, topic_title_vec, content_desc_vec, topic_desc_vec]``.

    Returns:
        DataFrame with one row per candidate and the id columns dropped.
    """
    content_title_vec, topic_title_vec, content_desc_vec, topic_desc_vec = vecs

    # content features
    content_blocks = [
        LabelEncodingBlock(cols=["kind", "language", "copyright_holder", "license"]),
        SVDBlock(
            cols=["content_title_vec", "content_desc_vec"],
            dim=32,
            title_vec=content_title_vec,
            desc_vec=content_desc_vec,
        ),
    ]
    # topic features
    topic_blocks = [
        LabelEncodingBlock(cols=["category", "language"]),
        SVDBlock(
            cols=["topic_title_vec", "topic_desc_vec"],
            dim=32,
            title_vec=topic_title_vec,
            desc_vec=topic_desc_vec,
        ),
    ]

    content_feat_df = run_blocks(content_df, blocks=content_blocks, test=False)
    topic_feat_df = run_blocks(topic_df, blocks=topic_blocks, test=False)

    # Attach each entity's id to its feature rows (aligned on the index).
    content_keyed = pd.concat([content_df[["content_id"]], content_feat_df], axis=1)
    topic_keyed = pd.concat([topic_df[["topic_id"]], topic_feat_df], axis=1)

    # Join content features, then topic features, onto the candidate pairs.
    merged = (
        cand_df
        .merge(content_keyed, on="content_id", how="left")
        .merge(topic_keyed, on="topic_id", how="left")
    )
    return merged.drop(columns=["topic_id", "content_id"])

In [215]:
# Load the competition frames; id columns are already renamed to
# content_id / topic_id by get_processed_df().
content_df, topic_df, correlation_df, sub_df = get_processed_df()

In [28]:
# Embed the title and description columns (loaded from the pickle cache when present).
content_title_vec, topics_title_vec = get_emb_vec(content_df, topic_df, 'title')
content_desc_vec, topics_desc_vec = get_emb_vec(content_df, topic_df, 'description')

# Order matters: get_feature_df unpacks this list as
# (content title, topic title, content description, topic description).
vecs = [
    content_title_vec,
    topics_title_vec,
    content_desc_vec,
    topics_desc_vec,
]

# Drop the embedding model and release cached GPU memory.
# NOTE: after this, re-running the cell raises NameError on `del model`
# (and get_emb_vec cannot recompute missing caches) until `model` is re-created.
del model
torch.cuda.empty_cache()
gc.collect()

  0%|          | 0/602 [00:00<?, ?it/s]

  0%|          | 0/301 [00:00<?, ?it/s]

174

In [240]:
# Generate kNN candidates from the title embeddings only, with their targets and CV splits.
cand_df, target_df, cv_list = get_cand_df(content_df, topic_df, correlation_df, content_title_vec, topics_title_vec)

recall = 0.47808
n_data = 8606702


In [241]:
# Build one feature row per candidate (topic, content) pair.
train_feat_df = get_feature_df(cand_df, target_df, correlation_df, content_df, topic_df, cv_list, vecs)

******************** start run blocks... ********************
	- <__main__.LabelEncodingBlock object at 0x7f09aebc8650> 0.114[s]
	- <__main__.SVDBlock object at 0x7f09aebc8610> 7.086[s]
run test=False 7.273[s]
******************** start run blocks... ********************
	- <__main__.LabelEncodingBlock object at 0x7f09aebc8850> 0.030[s]
	- <__main__.SVDBlock object at 0x7f09aebc8810> 3.776[s]
run test=False 3.857[s]


In [242]:
def metrics_dict(y_true, y_pred):
    """Score predictions and return ``{metric_name: value}``.

    Only the F2 score (``fbeta_score`` with ``beta=2``) is reported.
    """
    return {"f2_score": fbeta_score(y_true, y_pred, beta=2)}

In [243]:
def train_cv(train_feat_df, target_df, cv_list, metrics_dict, threshold=0.5):
    """Run cross-validation and collect out-of-fold predictions.

    For every fold a model is built with ``get_model(cfg.gbdt_model)``,
    trained, pickled to ``{cfg.gbdt_model}_fold_{i}.pkl`` under
    ``cfg.OUTPUT_EXP``, and scored on its validation rows.

    Args:
        train_feat_df: feature frame, one row per sample.
        target_df: target Series aligned positionally with ``train_feat_df``.
        cv_list: iterable of ``(train_idx, valid_idx)`` positional index pairs.
        metrics_dict: callable ``(y_true, y_pred) -> {name: score}``.
        threshold: predictions ``>= threshold`` become 1, the rest 0.

    Returns:
        ``(oof, models)``: the binarised out-of-fold predictions (rows that
        never fall in a validation fold stay 0) and the per-fold models.
    """
    oof = np.zeros(len(train_feat_df))
    models = []

    for i_fold, (train_idx, valid_idx) in enumerate(cv_list):
        print(decorate(f'FOLD{i_fold}', decoration='=='))
        filepath = os.path.join(cfg.OUTPUT_EXP, f"{cfg.gbdt_model}_fold_{i_fold}.pkl")

        X_train = train_feat_df.iloc[train_idx].to_numpy()
        X_valid = train_feat_df.iloc[valid_idx].to_numpy()
        y_train = target_df.iloc[train_idx].to_numpy()
        y_valid = target_df.iloc[valid_idx].to_numpy()

        model = get_model(cfg.gbdt_model)
        model.fit(X_train, y_train, X_valid, y_valid)
        save_model(filepath, model)

        # Score the reloaded artefact, i.e. the same file predict_cv reads later.
        model = load_model(filepath)
        models.append(model)
        preds = model.predict(X_valid)
        preds = np.where(preds >= threshold, 1, 0)
        metrics_dict_scored = metrics_dict(y_valid, preds)

        for key in metrics_dict_scored.keys():
            print(decorate(f'{key}: {np.round(metrics_dict_scored[key], 4)}', decoration='*'))
        oof[valid_idx] = preds

    print(decorate('OOF'))
    metrics_dict_scored = metrics_dict(target_df, oof)
    for key in metrics_dict_scored.keys():
        # This is the score over all folds, so it is labelled OOF rather
        # than with the index of the last fold.
        print(f"OOF {key}: {np.round(metrics_dict_scored[key], 5)}")
    return oof, models


def predict_cv(test_feat_df):
    """Inference: average the raw predictions of the saved fold models.

    Each fold's model is loaded from ``{cfg.gbdt_model}_fold_{i}.pkl`` under
    ``cfg.OUTPUT_EXP``.

    Returns:
        Element-wise mean of the per-fold predictions. The values are not
        thresholded; binarising is left to the caller.
    """
    # The visible Config defines `num_fold`; prefer `n_fold` when it exists
    # so configurations that do define it keep working.
    n_fold = getattr(Config, "n_fold", None)
    if n_fold is None:
        n_fold = Config.num_fold

    preds_fold = []
    for i_fold in range(n_fold):
        filepath = os.path.join(cfg.OUTPUT_EXP, f"{cfg.gbdt_model}_fold_{i_fold}.pkl")
        model = load_model(filepath)
        # The original assigned the thresholded array to a misspelt name
        # (`reds`), so raw predictions were always what got averaged; that
        # dead statement is removed and the behaviour kept.
        preds_fold.append(model.predict(test_feat_df))

    return np.mean(preds_fold, axis=0)

In [249]:
# Train one model per fold; `oof` holds the binarised out-of-fold predictions.
oof, models = train_cv(train_feat_df, target_df, cv_list, metrics_dict=metrics_dict)

[0]	valid-logloss:0.60248
[100]	valid-logloss:0.04927
[200]	valid-logloss:0.04680
[300]	valid-logloss:0.04540
[400]	valid-logloss:0.04476
[500]	valid-logloss:0.04385
[600]	valid-logloss:0.04319
[700]	valid-logloss:0.04263
[800]	valid-logloss:0.04233
[900]	valid-logloss:0.04203
[1000]	valid-logloss:0.04181
[1045]	valid-logloss:0.04177
******************** f2_score: 0.0743 ********************
[0]	valid-logloss:0.60491
[100]	valid-logloss:0.08538
[200]	valid-logloss:0.08250
[300]	valid-logloss:0.08068
[400]	valid-logloss:0.07949
[500]	valid-logloss:0.07847
[600]	valid-logloss:0.07765
[700]	valid-logloss:0.07680
[800]	valid-logloss:0.07615
[900]	valid-logloss:0.07581
[1000]	valid-logloss:0.07509
[1100]	valid-logloss:0.07458
[1200]	valid-logloss:0.07411
[1300]	valid-logloss:0.07370
[1400]	valid-logloss:0.07328
[1500]	valid-logloss:0.07303
[1600]	valid-logloss:0.07275
[1700]	valid-logloss:0.07242
[1800]	valid-logloss:0.07205
[1900]	valid-logloss:0.07178
[2000]	valid-logloss:0.07155
[2100]	v

In [260]:
# Keep the candidate pairs predicted positive (oof is a positional 0/1 mask over cand_df) ...
pred_df = cand_df[oof==1].reset_index(drop=True)
# ... and preview them as one space-separated content_id string per topic.
pd.DataFrame(pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)).reset_index()

Unnamed: 0,topic_id,content_id
0,t_0016d30772f3,c_061d9f90bb06 c_ea312de91d4f
1,t_0032c51bd677,c_1f52a74f3cec c_39cb5ce03ac3 c_55300bf0553e c...
2,t_007e3770673d,c_435b019a49c8 c_a2d7f3bf94ad c_e6e664a85465
3,t_007e54e21339,c_80fe928d3455 c_93d0c4c67998 c_a65f63db6629 c...
4,t_009e393dee75,c_81c5a6443260 c_a0c550b4a04e c_d19c941a3515
...,...,...
2835,t_ffa69ad4b0ac,c_de9e0ea9604e
2836,t_ffadf33f3fb7,c_58c65c5b01de c_af2ae65372b0 c_b461414a88ef c...
2837,t_ffc3082acbde,c_7608aaacbae7
2838,t_ffc4f1fe8460,c_3dba70c808d6 c_8b2e9850495f c_cadb7e1a7ae9


In [274]:
# calculate cv
pred_df = cand_df[oof==1]
pred_df = pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
pred_df = pd.merge(correlation_df[["topic_id"]], pred_df, on="topic_id", how="left")
pred_df = pred_df.fillna("nan")

cv_score = comp_fbeta_score(correlation_df["content_id"], pred_df["content_id"])
LOGGER.info(f"cv: {np.round(cv_score, 5)}")

cv: 0.01806
cv: 0.01806


In [115]:
# # =====================
# # Main
# # =====================

# # setup
# cfg = setup(Config)
# LOGGER = get_logger(Config.OUTPUT_EXP)

# LOGGER.info(f"tokenizers.__version__: {tokenizers.__version__}")
# LOGGER.info(f"transformers.__version__: {transformers.__version__}")

# # main
# train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
# test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
# sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

# train = processing_features(train)

# cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
# cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
# cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
# cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

# score = training(cfg, train)

# if cfg.upload_from_colab:
#     from kaggle.api.kaggle_api_extended import KaggleApi
#     dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)