In [1]:
! nvidia-smi

Sat Feb 25 03:22:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  Off |
|  0%   40C    P8    28W / 480W |      1MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import re
import gc
import pdb
import sys
import json
import math
import time
import wandb
import pickle
import shutil
import joblib
import random
import pathlib
import requests
import warnings
from glob import glob
from typing import List
from pathlib import Path
from tqdm.auto import tqdm
from pandarallel import pandarallel

import scipy
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error, f1_score, fbeta_score, recall_score, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb
import lightgbm as lgb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

from pytorch_metric_learning import losses

import tokenizers
import sentencepiece
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW
from transformers import logging
from transformers import DataCollatorWithPadding

import cupy as cp
from cuml import NearestNeighbors
from kaggle.api.kaggle_api_extended import KaggleApi

sys.path.append("/home/working/")
from kagglib.utils.utils import  Timer, reduce_mem_usage, get_logger, decorate, setup, dataset_create_new
from kagglib.utils.exp_manage import set_wandb
from kagglib.tabular.blocks import AbstractBaseBlock, IdentityBlock, LabelEncodingBlock, SVDBlock, run_blocks
from kagglib.tabular.model_selection import train_cv, predict_cv
from kagglib.nlp.preprocessing import resolve_encodings_and_normalize
from kagglib.nlp.model import (
    AttentionPooling,
    MeanPooling,
    WeightedLayerPooling,
    freeze,
    replace_mixout,
    reinit_bert,
)
from kagglib.nlp.activation import softmax, sigmoid
from kagglib.nlp.optimizer import (
    get_scheduler,
    get_optimizer_grouped_parameters,
)

%load_ext autoreload
%autoreload 2
%env TOKENIZERS_PARALLELISM=true

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 300)
pandarallel.initialize(progress_bar=True)
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')

logging.set_verbosity_error()
logging.set_verbosity_warning()

env: TOKENIZERS_PARALLELISM=true
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Setup & data load

In [3]:
def fbeta_wrapper(y_true, y_pred):
    beta = 2
    return fbeta_score(y_true, y_pred, beta)

class Config:
    AUTHOR = "shu421"

    EXP = "exp022"
    COMPETITION = "learning-equality-curriculum-recommendations"
    DATASET_PATH = []
    BASE_PATH = "/home/working/"
    api_path = "/root/.kaggle/kaggle.json"

    # Language Model Config
    MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"

    # train
    apex=True
    seed = 42
    num_fold = 5
    train_fold = [0]
    batch_size = 768
    n_epoch = 10
    max_len = 256
    num_classes = 1


    # optimizer
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    betas = (0.9, 0.999)
    lr_weight_decay = 0.95

    # scheduler
    scheduler = "cosine"
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 1000
    num_cycles = 0.5
    num_warmup_steps_rate = 0.1
    clip_grad_norm = 1000

    # gradient accumulation
    gradient_accumulation_steps = 1

    es_patience = 3


    # weight and bias
    wandb = True

    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }


    upload_from_colab = True

# setup
cfg = setup(Config)

In [4]:
class Metrics_Config(Config):
    AUTHOR = "shu421"

    MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"

    # train
    apex=True
    seed = 42
    num_fold = 5
    train_fold = [0]
    batch_size = 1024
    n_epoch = 30
    max_len = 128
    num_classes = 1


    # optimizer
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    betas = (0.9, 0.999)
    lr_weight_decay = 0.95

    # scheduler
    scheduler="cosine"
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 150
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000

    # gradient accumulation
    gradient_accumulation_steps = 1

    es_patience = 6

    # weight and bias
    wandb = False

    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

metric_cfg = setup(Metrics_Config)

In [5]:
# set log functions
LOGGER = get_logger(cfg.OUTPUT_EXP)
log_filepath = os.path.join(cfg.OUTPUT, f"{cfg.EXP}.log")
# if os.path.isfile(log_filepath):
#     with open(log_filepath, "w") as f:
#         pass
#     f.close()
if cfg.wandb:
    run = set_wandb(cfg, name=cfg.EXP, group=cfg.MODEL_PATH, config_path="/root/.kaggle/wandb.json")
if metric_cfg.wandb:
    run = set_wandb(cfg, name=cfg.EXP + "_metric", group=metric_cfg.MODEL_PATH, config_path="/root/.kaggle/wandb.json")

[34m[1mwandb[0m: Currently logged in as: [33mshu421[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Metrics

In [6]:
def comp_fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        precision = len(TP) / (len(pred))
        recall = len(TP) / len(true)
        f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        score_list.append(f2)
    score = sum(score_list) / len(score_list)
    return score

def comp_recall_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        recall = len(TP) / len(true)

        score_list.append(recall)
    score = sum(score_list) / len(score_list)
    return score

def calc_comp_score(train_df, oof, correlation_df, thr=0.1):
    """2値分類の予測からcvを計算する"""
    oof_preds = np.where(oof>=thr, 1, 0)
    pred_df = train_df[oof_preds==1]
    pred_df = pred_df.groupby("topic_id")["content_id"].apply(list).apply(" ".join)
    pred_df = pd.merge(correlation_df[["topic_id"]], pred_df, on="topic_id", how="left")
    pred_df = pred_df.fillna("")

    cv_score = comp_fbeta_score(correlation_df["content_id"], pred_df["content_id"])
    return cv_score


import heapq
from dataclasses import dataclass

import torch
from typing import Callable
import numpy as np

def cos_sim(a, b):
    # From https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py#L31
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))


# From: https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py#L204
def semantic_search(
    query_embeddings: torch.Tensor,
    corpus_embeddings: torch.Tensor,
    query_chunk_size: int = 100,
    corpus_chunk_size: int = 500000,
    top_k: int = 10,
    score_function: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = cos_sim,
):
    """
    This function performs a cosine similarity search between a list of query embeddings  and a list of corpus embeddings.
    It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.
    :param query_embeddings: A 2 dimensional tensor with the query embeddings.
    :param corpus_embeddings: A 2 dimensional tensor with the corpus embeddings.
    :param query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory.
    :param corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory.
    :param top_k: Retrieve top k matching entries.
    :param score_function: Function for computing scores. By default, cosine similarity.
    :return: Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
    """

    if isinstance(query_embeddings, (np.ndarray, np.generic)):
        query_embeddings = torch.from_numpy(query_embeddings)
    elif isinstance(query_embeddings, list):
        query_embeddings = torch.stack(query_embeddings)

    if len(query_embeddings.shape) == 1:
        query_embeddings = query_embeddings.unsqueeze(0)

    if isinstance(corpus_embeddings, (np.ndarray, np.generic)):
        corpus_embeddings = torch.from_numpy(corpus_embeddings)
    elif isinstance(corpus_embeddings, list):
        corpus_embeddings = torch.stack(corpus_embeddings)

    # Check that corpus and queries are on the same device
    if corpus_embeddings.device != query_embeddings.device:
        query_embeddings = query_embeddings.to(corpus_embeddings.device)

    queries_result_list = [[] for _ in range(len(query_embeddings))]

    for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
        # Iterate over chunks of the corpus
        for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
            # Compute cosine similarities
            cos_scores = score_function(
                query_embeddings[query_start_idx : query_start_idx + query_chunk_size],
                corpus_embeddings[
                    corpus_start_idx : corpus_start_idx + corpus_chunk_size
                ],
            )

            # Get top-k scores
            cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
                cos_scores,
                min(top_k, len(cos_scores[0])),
                dim=1,
                largest=True,
                sorted=False,
            )
            cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
            cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()

            for query_itr in range(len(cos_scores)):
                for sub_corpus_id, score in zip(
                    cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]
                ):
                    corpus_id = corpus_start_idx + sub_corpus_id
                    query_id = query_start_idx + query_itr
                    if len(queries_result_list[query_id]) < top_k:
                        heapq.heappush(
                            queries_result_list[query_id], (score, corpus_id)
                        )  # heaqp tracks the quantity of the first element in the tuple
                    else:
                        heapq.heappushpop(
                            queries_result_list[query_id], (score, corpus_id)
                        )

    # change the data format and sort
    for query_id in range(len(queries_result_list)):
        for doc_itr in range(len(queries_result_list[query_id])):
            score, corpus_id = queries_result_list[query_id][doc_itr]
            queries_result_list[query_id][doc_itr] = {
                "corpus_id": corpus_id,
                "score": score,
            }
        queries_result_list[query_id] = sorted(
            queries_result_list[query_id], key=lambda x: x["score"], reverse=True
        )

    return queries_result_list


def get_topk_preds(topic_embeddings, content_embeddings, df, k=100, return_idxs=True):
    """
    df has the same number of rows as topic_embeddings and content_embeddings.
    A bunch of the topic embeddings are duplicates, so we'll dedupe before finding 
    nearest neighbors.

    Returns tuple (prediction content ids, topic ids)

        prediction content ids is a list of lists. The outer list has the same number of elements as unique topic
        ids. The inner list contains content ids and the length is equal to k (num nearest neighbors).
    """

    content_ids = df.content_id.values

    # These idx values will be used to compare with the idx
    # values returned by `semantic_search` to calculate recall.
    df["idx"] = list(range(len(content_embeddings)))

    deduped = df[["idx", "topic_id"]].drop_duplicates("topic_id")
    topic_embeddings = topic_embeddings[deduped["idx"]]
    device = torch.device("cuda:0")

    # Compare each of the topic embeddings to each of the
    # content embeddings and return a ranking for each one.
    # Works much, much faster on GPU.
    search_results = semantic_search(
        torch.tensor(topic_embeddings, device=device),
        torch.tensor(content_embeddings, device=device),
        top_k=k,
    )

    # `search_results` is a list of lists. The inner list
    # has a `dict` at each element.
    # The dict has two keys: `corpus_id` and `score`.
    all_pred_c_ids = [[content_ids[x["corpus_id"]] for x in row] for row in search_results]

    return all_pred_c_ids, deduped["topic_id"].tolist()

def Precision_score(pred_content, gt_content):
    """
    Arguments can be int (idx) or string values.
    """
    def precision(pred, gt):
        tp = len(set(pred)&set(gt))
        fp = len(set(pred)-set(gt))
        return tp/(tp+fp)

    # Get a recall score for each row of the dataset
    return [
        precision(pred, gt)
        for pred, gt in zip(pred_content, gt_content)
    ]

def Recall_score(pred_content, gt_content):
    """
    Arguments can be int (idx) or string values.
    """
    def recall(pred, gt):
        tp = len(set(pred)&set(gt))
        return tp/len(set(gt))

    # Get a recall score for each row of the dataset
    return [
        recall(pred, gt)
        for pred, gt in zip(pred_content, gt_content)
    ]

def mean_f2_score(Precision_scores, Recall_scores):
    """
    Inputs should be outputs of the `Precision_score` and 
    `Recall_score` functions.
    """
    beta = 2

    def f2_score(precision, recall):
        return (1+beta**2)*(precision*recall)/(beta**2*precision+recall+1e-7)

    return round(np.mean([f2_score(p, r) for p, r in zip(Precision_scores, Recall_scores)]), 5)



def compute_metrics(topic_embeddings, content_embeddings, val_df, k=100):
    """
    After creating embeddings for all of the topic and content texts,
    perform a semantic search and measure the recall@100. The model
    has not seen any of these examples before, so it should be a
    good measure of how well the model can generalize.

    Since the dataset uses the exploded view of the correlations
    (one topic with 5 contents is 5 rows), I need to deduplicate
    the topic embeddings. Then I can use the `semantic_search`
    function taken from the sentence-transformers util to
    do a cosine similarity search of the topic embeddings with all
    content embeddings. This function conveniently returns the top
    `k` indexes, which makes it easy to compare with the true indexes.
    """

    if isinstance(k, int):
        k = [k]

    # eval_predictions is a tuple of (model_output, labels)
    # The model_output is whatever is returned by `compute_loss`

    pred_content_ids, topic_ids = get_topk_preds(topic_embeddings, content_embeddings, val_df, k=max(k))

    # Make sure true content ids are in same order as predictions
    grouped = val_df[["topic_id", "content_id"]].groupby("topic_id").agg(list)
    true_content_ids = grouped.loc[topic_ids].reset_index()["content_id"]

    metrics = {}
    for kk in k:
        top_preds = [row[:kk] for row in pred_content_ids]
        precisions = Precision_score(top_preds, true_content_ids)
        recalls = Recall_score(top_preds, true_content_ids)
        f2 = mean_f2_score(precisions, recalls)

        metrics[f"recall@{kk}"] = np.round(np.mean(recalls), 5)
        metrics[f"f2@{kk}"] = np.round(f2, 5)

    return metrics

def get_StratifiedGroupKFold_list(X, y=None, groups=None, n_splits=5, seed=42):
    """cv_listを取得"""
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_list = list(cv.split(X, y, groups))
    return cv_list

def get_StratifiedKFold_list(X, y=None, groups=None, n_splits=5, seed=42):
    """cv_listを取得"""
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_list = list(cv.split(X, y))
    return cv_list

def get_GroupKFold_list(X, y=None, groups=None, n_splits=5, seed=42):
    cv = GroupKFold(n_splits=n_splits)
    cv_list = list(cv.split(X, y, groups=groups))
    return cv_list


In [7]:
def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [8]:
def del_newline(text):
    """改行を削除"""
    text = text.replace("\n\n", " ").replace("\n", " ").replace("\r", " ").replace("\f", " ").replace("\t", " ").replace("\v", " ")
    return text

def del_url_str(text):
    """URLに含まれる必要のない文字列を削除"""
    text = text.replace("https", "").replace("http", "").replace("www", "")
    return text

def del_symbol(text):
    """記号を削除"""
    # text = re.sub('[\[\]\!\"\#\$\%\&\'\\\\\(\)\*\+\-\/\:\;\<\=\>\@\^\_\`\{\|\}\~\「\」\〔\〕\“\”\〈\〉\『\』\【\】\＆\＊\・\（\）\＄\＃\＠\。\、\！\｀\＋\￥\％]', " ", text)
    # 複数回繰り返す文字列を削除
    text = re.sub("\.{2,}", " ", text)
    text = re.sub("(\. ){2,}", " ", text)
    text = re.sub("(\ .){2,}", " ", text)
    text = re.sub("\-{2,}", " ", text)
    text = re.sub("\|{2,}", " ", text)
    text = re.sub("\!{2,}", " ", text)
    text = re.sub("\?{2,}", " ", text)
    return text

def lower_text(text):
    """小文字化"""
    return text.lower()

def normalize_unicode(text, form='NFKC'):
    """正規化"""
    import unicodedata
    normalized_text = unicodedata.normalize(form, text)
    return normalized_text

def clean_text(text):
    # text = lower_text(text)
    # text = del_newline(text)
    # text = del_url_str(text)
    # text = del_symbol(text)
    # text = normalize_unicode(text)
    return text

In [9]:
def get_whole_df():
    content_df = pd.read_csv(os.path.join(cfg.INPUT, 'content.csv'), usecols=["id", "title", "description", "kind", "text", "language"])
    topic_df = pd.read_csv(os.path.join(cfg.INPUT, 'topics.csv'))
    correlation_df = pd.read_csv(os.path.join(cfg.INPUT, 'correlations.csv'))
    sub_df = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))
    return content_df, topic_df, correlation_df, sub_df

def preprocess_df(content_df, topic_df, correlation_df):
    content_df = content_df.add_prefix("content_")
    topic_df = topic_df.add_prefix("topic_")
    correlation_df = correlation_df.rename(columns={"content_ids":"content_id"})
    content_df = content_df.fillna("")
    topic_df = topic_df.fillna("")
    # cleaning
    for col in ["topic_title", "topic_description"]:
        topic_df[col] = topic_df[col].apply(clean_text)
    for col in ["content_title", "content_description", "content_kind", "content_text"]:
        content_df[col] = content_df[col].apply(clean_text)
    return content_df, topic_df, correlation_df

def get_processed_df():
    content_df, topic_df, correlation_df, sub_df = get_whole_df()
    content_df, topic_df, correlation_df = preprocess_df(content_df, topic_df, correlation_df)
    return content_df, topic_df, correlation_df, sub_df

In [10]:
# get context
sys.path.append("../input/")
from helper_funcs import Topic, ContentItem


def get_topic_context(x):
    topic = Topic(content_df, topic_df, correlation_df, [x])
    topic_context = topic.get_breadcrumbs(separator=" ")
    return topic_context

# 1st Stage Functions

In [11]:
def create_topic_emb_feature_df(topic_df):
    output_df = topic_df[["topic_id", "topic_channel", "topic_category"]].copy()
    output_df["topic_text"] = topic_df["topic_title"] + " </s> " + topic_df["topic_description"] + " </s> " + topic_df["topic_context"]
    return output_df

def create_content_emb_feature_df(content_df):
    output_df = content_df[["content_id"]].copy()
    output_df["content_text"] = content_df["content_title"] + " </s> " + content_df["content_description"] + " </s> " + content_df["content_kind"] + " </s> " + content_df["content_text"]
    return output_df

In [12]:
class MetricDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.topic_text = df["topic_text"].to_numpy()
        self.content_text = df["content_text"].to_numpy()
        self.label = df["target"].to_numpy()

    def __len__(self):
        return len(self.label)

    def __getitem__(self, index):
        topic_text = self.prepare_input(self.cfg, self.topic_text[index])
        content_text = self.prepare_input(self.cfg, self.content_text[index])
        label = torch.tensor(self.label[index], dtype=torch.float)
        return topic_text, content_text, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

class MetricModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        cfg.hidden_size = self.config.hidden_size
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        # self.pool = AttentionPooling(self.config.hidden_size)
        self.pool = MeanPooling()
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        # self.fc = nn.Linear(self.config.hidden_size*2, cfg.num_classes)
        # self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_state = outputs[0]
        feature = self.pool(last_hidden_state, inputs['attention_mask'])
        feature = F.normalize(feature, dim=1)
        # all_layer_embeddings = outputs[1]
        # feature = self.weighted_layer_pool(all_layer_embeddings)
        # feature = self.pool(feature, inputs['attention_mask'])
        return feature
    
    def forward_classification(self, inputs):
        feature = self.forward_once(inputs)
        logits = self.fc(feature)
        return logits

    def forward_once(self, inputs):
        feature = self.feature(inputs)
        # feature = self.ln(feature)
        return feature

    def forward(self, inputs1, inputs2):
        # batch, hidden_size
        feature1 = self.forward_once(inputs1)
        feature2 = self.forward_once(inputs2)
        return feature1, feature2

In [13]:
class MultipleNegativesRankingLoss(torch.nn.Module):
    
    def __init__(self):
        super().__init__()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, embeddings_a, embeddings_b, labels=None):
        """
        Compute similarity between `a` and `b`.
        Labels have the index of the row number at each row. 
        This indicates that `a_i` and `b_j` have high similarity 
        when `i==j` and low similarity when `i!=j`.
        """

        similarity_scores = (
            cos_sim(embeddings_a, embeddings_b) * 20.0
        )  # Not too sure why to scale it by 20: https://github.com/UKPLab/sentence-transformers/blob/b86eec31cf0a102ad786ba1ff31bfeb4998d3ca5/sentence_transformers/losses/MultipleNegativesRankingLoss.py#L57

        labels = torch.tensor(
            range(len(similarity_scores)),
            dtype=torch.long,
            device=similarity_scores.device,
        )  # Example a[i] should match with b[i]

        return self.loss_function(similarity_scores, labels)


In [14]:
class MultipleNegativesSymmetricRankingLoss(nn.Module):
    def __init__(self, scale: float = 20):
        """
        :param model: SentenceTransformer model
        :param scale: Output of similarity function is multiplied by scale value
        :param similarity_fct: similarity function between sentence embeddings. By default, cos_sim. Can also be set to dot product (and then set scale to 1)
        """
        super(MultipleNegativesSymmetricRankingLoss, self).__init__()
        self.scale = scale
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()


    def forward(self, embeddings_a, embeddings_b, labels=None):
        # reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]

        # anchor = reps[0]
        # candidates = torch.cat(reps[1:])

        # scores = self.similarity_fct(anchor, candidates) * self.scale
        scores = (
            cos_sim(embeddings_a, embeddings_b) * 20.0
        )
        labels = torch.tensor(range(len(scores)), dtype=torch.long, device=scores.device)  # Example a[i] should match with b[i]

        # anchor_positive_scores = scores[:, 0:len(reps[1])]
        anchor_positive_scores = scores[:, 0:len(embeddings_b)]
        forward_loss = self.cross_entropy_loss(scores, labels)
        backward_loss = self.cross_entropy_loss(anchor_positive_scores.transpose(0, 1), labels)
        return (forward_loss + backward_loss) / 2

In [15]:
def get_cand_train_fn(_cfg, train_loader, valid_loader, train_df, valid_df, criterion, optimizer, scheduler, model, fold, epoch, best_val_score, es_count):
    LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
    train_losses = []
    train_nums = []
    model.train()
    scaler = GradScaler(enabled=_cfg.apex)
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for step, (topic_inputs, content_inputs, labels) in enumerate(pbar):
            topic_inputs = collate(topic_inputs)
            content_inputs = collate(content_inputs)
            for k, v in topic_inputs.items():
                topic_inputs[k] = v.to(_cfg.device)
            for k, v in content_inputs.items():
                content_inputs[k] = v.to(_cfg.device)
            labels = labels.to(_cfg.device)
            with autocast(enabled=_cfg.apex):
                topic_emb, content_emb = model(topic_inputs, content_inputs)
            loss = criterion(topic_emb, content_emb)

            pbar.set_postfix({
                'loss': loss.item(),
                'lr': scheduler.get_lr()[0]
            })
            train_losses.append(loss.item() * len(labels))
            train_nums.append(len(labels))

            if _cfg.gradient_accumulation_steps > 1:
                loss = loss / _cfg.gradient_accumulation_steps

            scaler.scale(loss).backward()

            if _cfg.clip_grad_norm is not None:
                # scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    _cfg.clip_grad_norm
                )

            if (step+1) % _cfg.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            if step % _cfg.eval_step == 0 and step != 0:
                best_val_score, val_score, es_count = get_cand_valid_fn(
                    _cfg,
                    valid_loader,
                    valid_df,
                    criterion,
                    model,
                    fold,
                    epoch,
                    step,
                    best_val_score,
                    es_count
                    )
                model.train()

            if _cfg.wandb:
                wandb.log({f"[fold{fold}] train_loss": loss.item(),
                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
    train_loss = sum(train_losses)/sum(train_nums)
    return train_loss, es_count



def get_cand_valid_fn(_cfg, valid_loader, valid_df, criterion, model, fold, epoch, step, best_val_score, es_count):
    val_losses = []
    topic_embs = []
    content_embs = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (topic_inputs, content_inputs, labels) in pbar:
                topic_inputs = collate(topic_inputs)
                content_inputs = collate(content_inputs)
                for k, v in topic_inputs.items():
                    topic_inputs[k] = v.to(_cfg.device)
                for k, v in content_inputs.items():
                    content_inputs[k] = v.to(_cfg.device)
                labels = labels.to(_cfg.device)
                with autocast(enabled=_cfg.apex):
                    topic_emb, content_emb = model(topic_inputs, content_inputs)
                loss = criterion(topic_emb, content_emb)
                val_losses.append(loss.item() * len(labels))
                topic_embs.append(topic_emb.detach().cpu().numpy())
                content_embs.append(content_emb.detach().cpu().numpy())
                val_nums.append(len(labels))

    val_loss = sum(val_losses) / sum(val_nums)
    topic_embs = np.concatenate(topic_embs)
    content_embs = np.concatenate(content_embs)
    metrics_dict = compute_metrics(topic_embs, content_embs, valid_df, k=[5, 10, 50])
    val_score = metrics_dict["recall@50"]
    LOGGER.info(f"Fold: {fold}, Epoch: {epoch}/{_cfg.n_epoch}, Step: {step} | val_loss: {np.round(val_loss, 5)}, \n metrics: {metrics_dict}")


    if val_score > best_val_score:
        best_val_score = val_score
        torch.save(
            model.state_dict(),
            os.path.join(_cfg.EXP_MODEL, f"cand_fold{fold}.pth")
        )
        pickle.dump(topic_emb, open(pathlib.Path(_cfg.OUTPUT_EXP) / f"topic_emb_fold{fold}.pkl", "wb"))
        pickle.dump(content_emb, open(pathlib.Path(_cfg.OUTPUT_EXP) / f"content_emb_fold{fold}.pkl", "wb"))
        es_count = 0
    elif es_count == _cfg.es_patience:
        es_count += 100
    else:
        es_count += 1
    
    LOGGER.info(f"Early stopping count: {es_count}")

    return best_val_score, val_score, es_count



def get_cand_train_loop(_cfg, train_data: pd.DataFrame, cv_list: List, correlation_df: pd.DataFrame):
    """_summary_

    Args:
        _cfg (_type_): _description_
        train_data (pd.DataFrame): textとtargetが格納されたデータフレーム
        cv_list (List): _description_
        correlation_df (pd.DataFrame): _description_

    Returns:
        _type_: _description_
    """
    fold_score = []

    train_data["fold"] = -1
    for fold in _cfg.train_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')

        train_idx, valid_idx = cv_list[fold]
        train_data.loc[valid_idx, "fold"] = fold
        train_data.loc[train_data["topic_category"]=="source", "fold"] = -1 # sourceは学習に使用
        train_df = train_data[train_data["fold"]!=fold].reset_index(drop=True)
        valid_df = train_data[train_data["fold"]==fold].reset_index(drop=True)

        # Datasetの設定
        train_dataset = MetricDataset(_cfg, train_df)
        valid_dataset = MetricDataset(_cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=_cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=_cfg.batch_size * 2,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = MetricModel(_cfg)
        model.config.save_pretrained(pathlib.Path(cfg.EXP_MODEL)/"1st_model_config")
        # model = reinit_bert(model)
        # model = replace_mixout(model)
        model = model.to(_cfg.device)

        criterion = MultipleNegativesSymmetricRankingLoss()

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(_cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=_cfg.encoder_lr, eps=_cfg.eps, betas=_cfg.betas, weight_decay=_cfg.weight_decay)

        num_train_steps = int(len(train_df) / _cfg.batch_size * _cfg.n_epoch)
        scheduler = get_scheduler(_cfg, optimizer, num_train_steps)


        # model-training
        best_val_score = -np.inf
        es_count = 0

        for epoch in range(_cfg.n_epoch):
            train_loss, es_count = get_cand_train_fn(
                _cfg,
                train_loader,
                valid_loader,
                train_df,
                valid_df,
                criterion,
                optimizer,
                scheduler,
                model,
                fold,
                epoch,
                best_val_score,
                es_count,
                )
            best_val_score, val_score, es_count = get_cand_valid_fn(
                _cfg,
                valid_loader,
                valid_df,
                criterion,
                model,
                fold,
                epoch,
                "fold",
                best_val_score,
                es_count,
                )

            if _cfg.wandb:
                wandb.log({
                    f"[fold{fold}] epoch": epoch,
                    f"[fold{fold}] avg_train_loss": train_loss,
                    f"[fold{fold}] avg_val_score": val_score,
                    # f"[fold{fold}] score": best_val_score
                    })
            
            if es_count >= 100:
                break

        # save embeddings
        # oof_embs[valid_idx] = val_embs
        # torch.save(oof_embs, os.path.join(_cfg.EXP_PREDS, f"oof_embs_fold{fold}.pt"))
        # save oof preds
        # oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        # np.save(os.path.join(_cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)

        # fold_score.append(best_val_score)
        del model
        gc.collect()
        torch.cuda.empty_cache()
        # break # 1fold

    # torch.save(oof_embs, os.path.join(_cfg.EXP_PREDS, f"oof_embs.pt"))
    # np.save(os.path.join(_cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    # score = calc_comp_score(train_data, oof_pred, correlation_df)
    # LOGGER.info(f'fold score: {fold_score}')
    # LOGGER.info(f'CV: {round(score, 4)}')
    # return score

# 2nd Stage Functions

In [16]:
# def create_emb_feature_df(input_df):
#     input_df["text"] = input_df["topic_title"] + " </s> " \
#         + input_df["content_title"] + " </s> " \
#         + input_df["topic_context"] + " </s> " \
#         + input_df["topic_description"] + " </s> " \
#         + input_df["content_description"] + " </s> " \
#         + input_df["content_kind"] + " </s> " \
#         + input_df["content_text"]
#     return input_df[["topic_id", "content_id", "topic_category", "text", "target"]]
def create_emb_feature_df(input_df):
    input_df["text"] = input_df["topic_title"] + " </s> " \
        + input_df["topic_description"] + " </s> " \
        + input_df["topic_context"] + " </s> " \
        + input_df["content_title"] + " </s> " \
        + input_df["content_description"] + " </s> " \
        + input_df["content_kind"] + " </s> " \
        + input_df["content_text"]
    return input_df[["topic_id", "content_id", "topic_category", "text", "target"]]

In [17]:
def get_cand_df(topic_df, content_df, topic_emb, content_emb, n_neighbors=10):
    """
    languageごとに各topicにn_neighbors個のcontentを割り当てる
    """

    topic_pred_df = []
    for _lang in tqdm(topic_df["topic_language"].unique()):
        # 対象のlanguageのindex
        content_lang_idx = content_df["content_language"] == _lang
        topic_lang_idx = topic_df["topic_language"] == _lang

        # 対象languageのdataframe
        content_lang_df = content_df[content_lang_idx].reset_index()
        topic_lang_df = topic_df[topic_lang_idx].reset_index()

        # 対象languageのembeddings
        content_lang_vec = content_emb[content_lang_idx]
        topic_lang_vec = topic_emb[topic_lang_idx]

        content_lang_vec_gpu = cp.array(content_lang_vec)
        topic_lang_vec_gpu = cp.array(topic_lang_vec)
        neighbors_model = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
        neighbors_model.fit(content_lang_vec_gpu)
        indices = neighbors_model.kneighbors(topic_lang_vec_gpu, return_distance=False)

        preds = []
        for _content_indices in range(len(indices)):
            idx_pred = indices[_content_indices]
            str_pred = " ".join([content_lang_df.loc[content_idx, "content_id"] for content_idx in idx_pred.get()])
            preds.append(str_pred)
        topic_lang_pred_df = topic_lang_df.copy()
        topic_lang_pred_df["content_id"] = preds
        topic_pred_df.append(topic_lang_pred_df)

        del content_lang_vec_gpu, topic_lang_vec_gpu, neighbors_model, indices, preds, topic_lang_pred_df
        gc.collect()
        torch.cuda.empty_cache()

    topic_pred_df = pd.concat(topic_pred_df).sort_values("index").drop(columns="index").reset_index(drop=True)

    return topic_pred_df[["topic_id", "content_id"]]

In [18]:
class EmbDataset(Dataset):
    def __init__(self, cfg, df, col):
        self.cfg = cfg
        self.text = df[col].to_numpy()

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        return text

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

In [19]:
def get_cand_inference_fn(_cfg, data_loader, model):
    embs = []
    model.eval()
    with torch.no_grad():
        with tqdm(data_loader, total=len(data_loader)) as pbar:
            for inputs in pbar:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(_cfg.device)
                with autocast(enabled=_cfg.apex):
                    emb = model.forward_once(inputs)
                emb = emb.detach().cpu().numpy()
                embs.append(emb)
    embs = np.concatenate(embs)
    return embs

def get_cand_fn(_cfg, topic_df, content_df):
    topic_dataset = EmbDataset(_cfg, topic_df, "topic_text")
    content_dataset = EmbDataset(_cfg, content_df, "content_text")
    topic_loader = DataLoader(
        dataset=topic_dataset,
        batch_size=_cfg.batch_size*2,
        shuffle=False,
        pin_memory=True,
        drop_last=False
    )
    content_loader = DataLoader(
        dataset=content_dataset,
        batch_size=_cfg.batch_size*2,
        shuffle=False,
        pin_memory=True,
        drop_last=False
    )

    model = MetricModel(_cfg)
    model.load_state_dict(torch.load(f"/home/working/output/exp019/model/cand_fold0.pth"))
    model = model.to(_cfg.device)
    topic_emb = get_cand_inference_fn(_cfg, topic_loader, model)
    content_emb = get_cand_inference_fn(_cfg, content_loader, model)


    del model
    gc.collect()
    torch.cuda.empty_cache()

    return topic_emb, content_emb

In [20]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

class BiEncoderDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df["text"].to_numpy()
        self.label = df["target"].to_numpy()

    def __len__(self):
        return len(self.label)

    def __getitem__(self, index):
        text = self.prepare_input(self.cfg, self.text[index])
        label = self.label[index].astype(np.float32)
        return text, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False
            )
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs


In [21]:
class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        # self.pool = AttentionPooling(self.config.hidden_size)
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        # self.fc = nn.Linear(self.config.hidden_size, cfg.num_classes)
        # self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        # batch, hidden_size
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output.squeeze()



In [22]:
def train_fn(cfg, train_loader, valid_loader, train_df, valid_df, criterion, optimizer, scheduler, model, fold, epoch, best_val_preds, best_val_score, es_count):
    LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
    train_losses = []
    train_nums = []
    model.train()
    scaler = GradScaler(enabled=cfg.apex)
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for step, (inputs, labels) in enumerate(pbar):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast(enabled=cfg.apex):
                output = model(inputs)
            loss = criterion(output, labels)

            pbar.set_postfix({
                'loss': loss.item(),
                'lr': scheduler.get_lr()[0]
            })
            train_losses.append(loss.item() * len(labels))
            train_nums.append(len(labels))

            if cfg.gradient_accumulation_steps > 1:
                loss = loss / cfg.gradient_accumulation_steps

            scaler.scale(loss).backward()

            if cfg.clip_grad_norm is not None:
                # scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    cfg.clip_grad_norm
                )

            if (step+1) % cfg.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            if step % cfg.eval_step == 0 and step != 0:
                best_val_preds, best_val_score, val_loss, es_count = valid_fn(
                    cfg,
                    valid_loader,
                    valid_df,
                    criterion,
                    model,
                    fold,
                    epoch,
                    step,
                    best_val_preds,
                    best_val_score,
                    es_count,
                )
                model.train()

            if cfg.wandb:
                wandb.log({f"[fold{fold}] train_loss": loss.item(),
                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
    train_loss = sum(train_losses)/sum(train_nums)
    return train_loss



def valid_fn(cfg, valid_loader, valid_df, criterion, model, fold, epoch, step, best_val_preds, best_val_score, es_count):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast(enabled=cfg.apex):
                    output = model(inputs)
                loss = criterion(output, labels)
                output = output.detach().cpu().numpy()
                output = sigmoid(output)
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    y_preds = np.where(val_preds>=0.05, 1, 0)
    score = fbeta_score(valid_df["target"], y_preds, beta=2)
    recall = recall_score(valid_df["target"], y_preds)

    LOGGER.info(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epoch}, Step: {step} | val_loss: {np.round(val_loss, 5)}, f2: {np.round(score, 5)}, recall: {np.round(recall, 5)}')

    if score > best_val_score:
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(),
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
        pickle.dump(best_val_preds, open(pathlib.Path(cfg.OUTPUT_EXP) / "best_val_preds.pkl", "wb"))
        es_count = 0
    elif es_count == cfg.es_patience:
        es_count += 100
    else:
        es_count += 1
    LOGGER.info(f"Early stopping count: {es_count}")

    return best_val_preds, best_val_score, val_loss, es_count


def train_loop(cfg, train_data: pd.DataFrame, correlation_df: pd.DataFrame):
    """_summary_

    Args:
        cfg (_type_): _description_
        train_data (pd.DataFrame): textとtargetが格納されたデータフレーム
        cv_list (List): _description_
        correlation_df (pd.DataFrame): _description_

    Returns:
        _type_: _description_
    """
    oof_pred = np.zeros((len(train_data)), dtype=np.float32)
    fold_score = []

    for fold in [0]:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')

        train_data.loc[train_data["topic_category"]=="source", "fold"] = -1 # sourceは学習に使用
        train_df = train_data[train_data["fold"]!=fold].reset_index(drop=True)
        valid_df = train_data[train_data["fold"]==fold]
        valid_idx = valid_df.index
        valid_df = valid_df.reset_index(drop=True)


        # Datasetの設定
        train_dataset = BiEncoderDataset(cfg, train_df)
        valid_dataset = BiEncoderDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        model.load_state_dict(torch.load(f"/home/working/output/exp019/model/cand_fold0.pth"))
        # for param in model.parameters():
        #     param.requires_grad = False  # 勾配を計算しない
        model.pool = MeanPooling()
        model.fc = nn.Linear(model.config.hidden_size, cfg.num_classes)
        model._init_weights(model.fc)
        
        model.config.save_pretrained(pathlib.Path(cfg.EXP_MODEL)/"2nd_model_config")
        # model = reinit_bert(model)
        # model = replace_mixout(model)
        model = model.to(cfg.device)
        
        # # クラス数の逆数で重み付け
        # n_0 = len(valid_df[valid_df["target"]==0])
        # n_1 = len(valid_df[valid_df["target"]==1])
        # pos_weight = torch.tensor([1 / n_1]).to(cfg.device)
        # criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        criterion = nn.BCEWithLogitsLoss()

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epoch)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)


        # model-training
        best_val_preds = None
        best_val_score = -np.inf
        es_count = 0

        for epoch in range(cfg.n_epoch):
            train_loss = train_fn(
                cfg,
                train_loader,
                valid_loader,
                train_df,
                valid_df,
                criterion,
                optimizer,
                scheduler,
                model,
                fold,
                epoch,
                best_val_preds,
                best_val_score,
                es_count,
                )

            LOGGER.info(f'Fold{fold}, Epoch{epoch}/{cfg.n_epoch} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss, es_count = valid_fn(
                cfg,
                valid_loader,
                valid_df,
                criterion,
                model,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
                es_count,
            )


            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch,
                        f"[fold{fold}] avg_train_loss": train_loss,
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

            if es_count >= 100:
                break

        oof_pred[valid_idx] = best_val_preds.squeeze().astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model
        gc.collect()
        torch.cuda.empty_cache()
        break

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    # score = calc_comp_score(train_data, oof_pred, correlation_df)
    # LOGGER.info(f'fold score: {fold_score}')
    # LOGGER.info(f'CV: {round(score, 4)}')
    # return score

# Setup & Preprocessing

In [23]:
content_df, topic_df, correlation_df, sub_df = get_processed_df()

In [24]:
# get context
is_overwrite = False
filepath = pathlib.Path(cfg.OUTPUT_EXP) / "topic_context.csv"
if filepath.exists() and not is_overwrite:
    topic_context = pd.read_csv(filepath)
else:
    topic_context = topic_df["topic_id"].parallel_apply(get_topic_context)
    topic_context.to_csv(filepath, index=False)
topic_df["topic_context"] = topic_context

In [25]:
# remove topic has no content
topic_df = topic_df[topic_df["topic_has_content"]].reset_index(drop=True)

topic_emb_feature_df = create_topic_emb_feature_df(topic_df)
content_emb_feature_df = create_content_emb_feature_df(content_df)

# correlation_dfを広げる
correlation_df_ = correlation_df.copy()
correlation_df_["content_id"] = correlation_df["content_id"].apply(lambda x: x.split(" "))
correlation_df_ = correlation_df_.explode("content_id")

# correlation_dfに含まれるtopicとcontentの組み合わせを学習データとする
cand_df = pd.merge(correlation_df_, topic_emb_feature_df, on="topic_id", how="left")
cand_df = pd.merge(cand_df, content_emb_feature_df, on="content_id", how="left")
cand_df["target"] = 1

cand_df

Unnamed: 0,topic_id,content_id,topic_channel,topic_category,topic_text,content_text,target
0,t_00004da3a1b2,c_1108dd0c7a5d,000cf7,source,Откриването на резисторите </s> Изследване на ...,Молив като резистор </s> Моливът причинява про...,1
1,t_00004da3a1b2,c_376c5a8eb028,000cf7,source,Откриването на резисторите </s> Изследване на ...,Да чуем променливото съпротивление </s> Тук че...,1
2,t_00004da3a1b2,c_5bc0e1e2cba0,000cf7,source,Откриването на резисторите </s> Изследване на ...,Променлив резистор (реостат) с графит от молив...,1
3,t_00004da3a1b2,c_76231f9d0b5e,000cf7,source,Откриването на резисторите </s> Изследване на ...,Последователно свързване на галваничен елемент...,1
4,t_00068291e9a4,c_639ea2ef9c95,8e286a,source,Entradas e saídas de uma função </s> Entenda u...,Dados e resultados de funções: gráficos </s> E...,1
...,...,...,...,...,...,...,...
279914,t_fff9e5407d13,c_d64037a72376,71fd51,supplemental,NA_U06 - El periódico </s> </s> PF (Español) ...,Introducción: El periódico </s> </s> html5 </s>,1
279915,t_fffbe1d5d43c,c_46f852a49c08,0c929f,source,Inscribed shapes problem solving </s> Use prop...,Proof: Right triangles inscribed in circles -d...,1
279916,t_fffbe1d5d43c,c_6659207b25d5,0c929f,source,Inscribed shapes problem solving </s> Use prop...,Area of inscribed equilateral triangle -dubbed...,1
279917,t_fffe14f1be1e,c_cece166bad6a,6e90a7,aligned,Lección 7 </s> </s> CREE Para el Estudiante I...,Juego con las palabras </s> </s> document </s>,1


# 1st Stage

In [26]:
metric_cfg.tokenizer = AutoTokenizer.from_pretrained(metric_cfg.MODEL_PATH)
metric_cfg.tokenizer.save_pretrained(pathlib.Path(metric_cfg.OUTPUT_EXP) / "1st_tokenizer")

cand_cv_list = get_GroupKFold_list(X=cand_df, y=cand_df["target"], groups=cand_df["topic_channel"], n_splits=metric_cfg.num_fold, seed=metric_cfg.seed)

# fold0
fold0_topic_id = cand_df.iloc[cand_cv_list[0][1]]["topic_id"].unique()

# get_cand_train_loop(metric_cfg, cand_df, cand_cv_list, correlation_df)

In [27]:
is_overwrite = False
filepath = pathlib.Path(cfg.OUTPUT_EXP) / "topic_emb.pkl"
if not filepath.exists() or is_overwrite:
    topic_emb, content_emb = get_cand_fn(metric_cfg, topic_emb_feature_df, content_emb_feature_df)
    pickle.dump(topic_emb, open(pathlib.Path(cfg.OUTPUT_EXP) / "topic_emb.pkl", "wb"))
    pickle.dump(content_emb, open(pathlib.Path(cfg.OUTPUT_EXP) / "content_emb.pkl", "wb"))
    # train_text_df = get_cand_df(topic_df, content_df, topic_emb, content_emb, n_neighbors=50)
    # train_text_df.to_csv(f"{cfg.OUTPUT_EXP}/train_text_df.csv", index=False)
else:
    topic_emb = pickle.load(open(pathlib.Path(cfg.OUTPUT_EXP) / "topic_emb.pkl", "rb"))
    content_emb = pickle.load(open(pathlib.Path(cfg.OUTPUT_EXP) / "content_emb.pkl", "rb"))
    # train_text_df = pd.read_csv(f"{cfg.OUTPUT_EXP}/train_text_df.csv")

train_text_df = get_cand_df(topic_df, content_df, topic_emb, content_emb, n_neighbors=50)
train_text_df.to_csv(f"{cfg.OUTPUT_EXP}/train_text_df.csv", index=False)

  0%|          | 0/27 [00:00<?, ?it/s]

In [28]:
_correlation_df = correlation_df[correlation_df["topic_id"].isin(fold0_topic_id)]
_train_text_df = train_text_df[train_text_df["topic_id"].isin(fold0_topic_id)]

# pred_df = pd.merge(_correlation_df[["topic_id"]], _train_text_df, on="topic_id", how="left")
# pred_df = pred_df.fillna("")

# cv_score = comp_recall_score(_correlation_df["content_id"], pred_df["content_id"])
cv_score = comp_recall_score(_correlation_df["content_id"], _train_text_df["content_id"])

# del pred_df
gc.collect()

print(np.round(cv_score, 5))

0.82504


In [29]:
train_text_df_6 = get_cand_df(topic_df, content_df, topic_emb, content_emb, n_neighbors=6)
_correlation_df = correlation_df[correlation_df["topic_id"].isin(fold0_topic_id)]
_train_text_df = train_text_df_6[train_text_df_6["topic_id"].isin(fold0_topic_id)]
pred_df_ = pd.merge(_correlation_df[["topic_id"]], _train_text_df, on="topic_id", how="left")
pred_df_ = pred_df_.fillna("")

cv_score = comp_recall_score(_correlation_df["content_id"], pred_df_["content_id"])
cv_score1 = comp_fbeta_score(_correlation_df["content_id"], pred_df_["content_id"])

# del pred_df_
gc.collect()

print(np.round(cv_score, 5))
print(np.round(cv_score1, 5))

  0%|          | 0/27 [00:00<?, ?it/s]

0.51976
0.42989


# 2nd Stage

In [30]:
train_text_df["content_id"] = train_text_df["content_id"].apply(lambda x: x.split(" "))
train_text_df = train_text_df.explode("content_id").reset_index(drop=True)

train_text_df = pd.merge(train_text_df, cand_df[["topic_id", "content_id"]], on=["topic_id", "content_id"], how="outer")

train_text_df = pd.merge(train_text_df, topic_df, on="topic_id", how="left")
train_text_df = pd.merge(train_text_df, content_df, on="content_id", how="left")

In [31]:
# create target
_correlation_df = correlation_df.copy()
_correlation_df["content_id"] = _correlation_df["content_id"].apply(lambda x: x.split(" "))
_correlation_df = _correlation_df.explode("content_id")
_correlation_df["target"] = 1
train_text_df = pd.merge(train_text_df, _correlation_df, on=["topic_id", "content_id"], how="left")
train_text_df = create_emb_feature_df(train_text_df)
train_text_df["target"] = train_text_df["target"].fillna(0) # correlation_dfにないペアは0とする

# create fold
train_text_df["fold"] = -1
train_text_df.loc[train_text_df["topic_id"].isin(fold0_topic_id), "fold"] = 0
train_text_df

Unnamed: 0,topic_id,content_id,topic_category,text,target,fold
0,t_00004da3a1b2,c_e1e8557d7c61,source,Откриването на резисторите </s> Изследване на ...,0.0,-1
1,t_00004da3a1b2,c_1108dd0c7a5d,source,Откриването на резисторите </s> Изследване на ...,1.0,-1
2,t_00004da3a1b2,c_c38db8abc0c5,source,Откриването на резисторите </s> Изследване на ...,0.0,-1
3,t_00004da3a1b2,c_9d6c0ff046ab,source,Откриването на резисторите </s> Изследване на ...,0.0,-1
4,t_00004da3a1b2,c_fe262c542966,source,Откриването на резисторите </s> Изследване на ...,0.0,-1
...,...,...,...,...,...,...
3149568,t_fff7f2dd208b,c_112de3281469,source,Fatoração de polinômios encontrando fatores co...,1.0,-1
3149569,t_fff7f2dd208b,c_15470abc39f4,source,Fatoração de polinômios encontrando fatores co...,1.0,-1
3149570,t_fff9e5407d13,c_026db653a269,supplemental,NA_U06 - El periódico </s> </s> PF (Español) ...,1.0,0
3149571,t_fff9e5407d13,c_4edb6a46f75e,supplemental,NA_U06 - El periódico </s> </s> PF (Español) ...,1.0,0


In [32]:
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(pathlib.Path(cfg.OUTPUT_EXP) / "2nd_tokenizer")

score = train_loop(cfg, train_text_df, correlation_df)



  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/10, Step: 1000 | val_loss: 0.30944, f2: 0.34188, recall: 0.95671
Early stopping count: 0


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/10, Step: 2000 | val_loss: 0.29948, f2: 0.36831, recall: 0.83319
Early stopping count: 0


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/10, Step: 3000 | val_loss: 0.29659, f2: 0.37049, recall: 0.83597
Early stopping count: 0
Fold0, Epoch0/10 | train_loss: 0.29594


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/10, Step: end | val_loss: 0.30007, f2: 0.36396, recall: 0.87412
Early stopping count: 0


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/10, Step: 1000 | val_loss: 0.2866, f2: 0.38289, recall: 0.79705
Early stopping count: 0


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/10, Step: 2000 | val_loss: 0.29816, f2: 0.39345, recall: 0.55994
Early stopping count: 0


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/10, Step: 3000 | val_loss: 0.2913, f2: 0.42009, recall: 0.60164
Early stopping count: 0
Fold0, Epoch1/10 | train_loss: 0.21747


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/10, Step: end | val_loss: 0.28813, f2: 0.42689, recall: 0.63754
Early stopping count: 0


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/10, Step: 1000 | val_loss: 0.29341, f2: 0.41627, recall: 0.64378
Early stopping count: 1


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/10, Step: 2000 | val_loss: 0.30447, f2: 0.4131, recall: 0.63063
Early stopping count: 2


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/10, Step: 3000 | val_loss: 0.29621, f2: 0.43012, recall: 0.60471
Early stopping count: 0
Fold0, Epoch2/10 | train_loss: 0.19232


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/10, Step: end | val_loss: 0.29808, f2: 0.42589, recall: 0.60529
Early stopping count: 1


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/10, Step: 1000 | val_loss: 0.2933, f2: 0.4325, recall: 0.61565
Early stopping count: 0


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/10, Step: 2000 | val_loss: 0.30616, f2: 0.43582, recall: 0.60015
Early stopping count: 0


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/10, Step: 3000 | val_loss: 0.31741, f2: 0.41433, recall: 0.53062
Early stopping count: 1
Fold0, Epoch3/10 | train_loss: 0.17445


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/10, Step: end | val_loss: 0.31317, f2: 0.42257, recall: 0.55533
Early stopping count: 2


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/10, Step: 1000 | val_loss: 0.309, f2: 0.42069, recall: 0.58696
Early stopping count: 3


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/10, Step: 2000 | val_loss: 0.31366, f2: 0.41558, recall: 0.55994
Early stopping count: 103


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/10, Step: 3000 | val_loss: 0.31635, f2: 0.43119, recall: 0.59718
Early stopping count: 0
Fold0, Epoch4/10 | train_loss: 0.15998


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/10, Step: end | val_loss: 0.31481, f2: 0.43597, recall: 0.59871
Early stopping count: 0


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 5/10, Step: 1000 | val_loss: 0.32851, f2: 0.4272, recall: 0.55759
Early stopping count: 1


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 5/10, Step: 2000 | val_loss: 0.32507, f2: 0.42104, recall: 0.58086
Early stopping count: 2


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 5/10, Step: 3000 | val_loss: 0.33874, f2: 0.42174, recall: 0.55322
Early stopping count: 3
Fold0, Epoch5/10 | train_loss: 0.14774


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 5/10, Step: end | val_loss: 0.33184, f2: 0.43442, recall: 0.58192
Early stopping count: 1


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 6/10, Step: 1000 | val_loss: 0.34544, f2: 0.42198, recall: 0.53431
Early stopping count: 2


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 6/10, Step: 2000 | val_loss: 0.34864, f2: 0.41529, recall: 0.52068
Early stopping count: 3


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 6/10, Step: 3000 | val_loss: 0.35439, f2: 0.41265, recall: 0.50921
Early stopping count: 103
Fold0, Epoch6/10 | train_loss: 0.13776


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 6/10, Step: end | val_loss: 0.34605, f2: 0.42067, recall: 0.54458
Early stopping count: 2


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 7/10, Step: 1000 | val_loss: 0.35387, f2: 0.42021, recall: 0.53748
Early stopping count: 3


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 7/10, Step: 2000 | val_loss: 0.36838, f2: 0.40562, recall: 0.48637
Early stopping count: 103


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 7/10, Step: 3000 | val_loss: 0.35817, f2: 0.41823, recall: 0.53278
Early stopping count: 104
Fold0, Epoch7/10 | train_loss: 0.13005


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 7/10, Step: end | val_loss: 0.36401, f2: 0.41286, recall: 0.51377
Early stopping count: 3


  0%|          | 0/3809 [00:00<?, ?it/s]

  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 8/10, Step: 1000 | val_loss: 0.36842, f2: 0.41242, recall: 0.52083
Early stopping count: 103


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 8/10, Step: 2000 | val_loss: 0.37277, f2: 0.40775, recall: 0.50048
Early stopping count: 104


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 8/10, Step: 3000 | val_loss: 0.37156, f2: 0.41162, recall: 0.50739
Early stopping count: 105
Fold0, Epoch8/10 | train_loss: 0.12483


  0%|          | 0/292 [00:00<?, ?it/s]

Fold: 0, Epoch: 8/10, Step: end | val_loss: 0.37235, f2: 0.40874, recall: 0.50859
Early stopping count: 103


In [101]:
valid_df = train_text_df[train_text_df["fold"]==0]
valid_df = valid_df[valid_df["topic_category"]!="source"]

best_val_preds = pickle.load(open(pathlib.Path(cfg.OUTPUT_EXP) / "best_val_preds.pkl", "rb"))

In [75]:
for i in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12]:
    y_preds = np.where(best_val_preds>=i, 1, 0)
    score = fbeta_score(valid_df["target"], y_preds, beta=2)
    recall = recall_score(valid_df["target"], y_preds)
    print(f"{i}:", np.round(score, 5), np.round(recall, 5))

0.01: 0.42845 0.82918
0.02: 0.46514 0.77247
0.03: 0.48732 0.73919
0.04: 0.49993 0.71284
0.05: 0.50859 0.69164
0.06: 0.5147 0.67395
0.07: 0.51857 0.65882
0.08: 0.52051 0.64418
0.09: 0.52135 0.63094
0.1: 0.52115 0.61861
0.11: 0.52161 0.60815
0.12: 0.52065 0.59763


In [76]:
# if cfg.upload_from_colab:
#     dataset_create_new(dataset_name=cfg.EXP, upload_dir=cfg.OUTPUT_EXP)

In [102]:
valid_df["pred_prob"] = best_val_preds
valid_df["pred"] = np.where(best_val_preds>=0.06, 1, 0)
valid_df = valid_df.sort_values("pred_prob", ascending=False)

In [137]:
pos = valid_df[valid_df["pred"]==1].groupby("topic_id")["content_id"].agg(list).reset_index()

# 予測contentがなければ、最も予測確率が高いcontentを割り当て
no_pos = valid_df.groupby("topic_id").head(1)
no_pos = no_pos[no_pos["pred"]==0].groupby("topic_id")["content_id"].agg(list).reset_index()

In [138]:
pred_df = pd.concat([pos, no_pos]).reset_index(drop=True)
pred_df["content_id"] = pred_df["content_id"].apply(lambda x: " ".join(x))
pred_df.columns = ["topic_id", "content_id"]
pred_df

Unnamed: 0,topic_id,content_id
0,t_00102869fbcb,c_b44c206ad756 c_bef2c010d6f9 c_6062d2e46506 c...
1,t_0012a45fa09c,c_dde078b8ea7a
2,t_0016d30772f3,c_ff54ff4784e2 c_061d9f90bb06 c_ea312de91d4f c...
3,t_001bd01717d7,c_16f1cff519b3 c_66c8c07d7f46 c_54a2f234b783 c...
4,t_001edc523bd1,c_dce3269cf0e6 c_68c33f74debf c_f3d956ce2a3b c...
...,...,...
7643,t_fdc6d9a37d4f,c_e796680f8368
7644,t_fe095aa127e0,c_9af2856b97ab
7645,t_fe1fd643b6cd,c_585c5d4803cb
7646,t_fec0a310a189,c_9acb28a94c26


In [126]:
target_df = correlation_df[correlation_df["topic_id"].isin(valid_df["topic_id"])]
target_df

Unnamed: 0,topic_id,content_id
8,t_00102869fbcb,c_005e793bd0c5 c_066737fa5146 c_08b2070f92e0 c...
10,t_0012a45fa09c,c_dde078b8ea7a
14,t_0016d30772f3,c_061d9f90bb06 c_242ddc729eec c_61b851222e17 c...
19,t_001bd01717d7,c_16f1cff519b3 c_289a31069ea7 c_6f0a3cf19895 c...
23,t_001edc523bd1,c_5298b652bdf8 c_68c33f74debf c_dce3269cf0e6 c...
...,...,...
61468,t_ffc52bedb319,c_0343a44b435b c_04160d71e42d c_177cd75e88c7 c...
61479,t_ffcf958baa88,c_01e0760aef78 c_4cd8d3554464 c_614c24b65833 c...
61498,t_ffe8df837f62,c_1251a6f1c5c1 c_4d8b2d04de68 c_72f1b23b472e c...
61501,t_ffeb3f57ecf3,c_64b4d0736580


In [140]:
comp_fbeta_score(target_df["content_id"], valid_pred_df["content_id"])

0.004674422870271427

In [139]:
comp_fbeta_score(target_df["content_id"], pred_df["content_id"])

0.004398162522812917

In [143]:
for i in [0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12]:
    valid_df["pred_prob"] = best_val_preds
    valid_df["pred"] = np.where(best_val_preds>i, 1, 0)
    valid_df = valid_df.sort_values("pred_prob", ascending=False)
    valid_df_ = valid_df[valid_df["pred"]==1]
    valid_pred_df = valid_df_.groupby("topic_id").agg(list).reset_index()
    valid_pred_df["content_id"] = valid_pred_df["content_id"].apply(lambda x: " ".join(x))
    print(comp_fbeta_score(target_df["content_id"], valid_pred_df["content_id"]))

0.23699841679304032
0.21679033014741544
0.1892149796675095
0.17483156934092592
0.16232434776417734
0.14847666119000952
0.1430110210715904
0.13446749024068697
0.12943977807615445
0.03181848039979413
0.008458565658880502
0.015799572599887752
0.026872859905546168


In [146]:
from sklearn.metrics import confusion_matrix
confusion_matrix(valid_df["target"], valid_df["pred"])

array([[302096,  52142],
       [ 27909,   4874]])