In [4]:
! nvidia-smi

Fri Dec 30 13:34:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
|  0%   37C    P8    23W / 480W |    658MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import os

class Config:
    AUTHOR = "shu421"

    EXP = "exp002"
    MODEL_PATH = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    DATASET_PATH = []

    COMPETITION = "learning-equality-curriculum-recommendations"

    BASE_PATH = '/home/working/'

    api_path = "/.kaggle/kaggle.json"

    apex=True
    seed = 42
    num_fold = 5
    trn_fold = [0, 1, 2, 3, 4,]
    batch_size = 8
    n_epochs = 5
    max_len = 1024
    target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    
    weight_decay = 0.01
    scheduler='cosine'
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    lr_weight_decay = 0.95
    
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 40
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1

    # weight and bias
    wandb = False
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

    upload_from_colab = True

In [6]:
import os
import re
import gc
import pdb
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')
# sns.set()
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error

# ! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F


from kaggle.api.kaggle_api_extended import KaggleApi

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
from transformers import logging
logging.set_verbosity_warning()
logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true


INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
env: TOKENIZERS_PARALLELISM=true


In [7]:
# ====================================================
# wandb
# ====================================================
if Config.wandb:
    
    import wandb
    import json

    try:
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb_config = json.load(open('/root/.kaggle/wandb.json', 'rb'))
        secret_value_0 = wandb_config['key']
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project=Config.COMPETITION, 
                     name=Config.EXP,
                     config=class2dict(Config),
                     group=Config.MODEL_PATH,
                     job_type="train",
                     anonymous=anony)

In [8]:
def setup(cfg):
    # cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']

    # set dirs
    # cfg.DRIVE = cfg.DRIVE_PATH
    # cfg.EXP = (cfg.NAME if cfg.NAME is not None 
    #     else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
    # )
    cfg.INPUT = os.path.join(cfg.BASE_PATH, 'input')
    cfg.OUTPUT = os.path.join(cfg.BASE_PATH, 'output')
    cfg.SUBMISSION = os.path.join(cfg.BASE_PATH, 'submission')
    cfg.DATASET = os.path.join(cfg.BASE_PATH, 'dataset')

    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # make dirs
    for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    
    # if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
    if len(os.listdir(cfg.INPUT))==0:
        # load dataset
        !kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
        filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
        !unzip -d $cfg.INPUT $filepath
        
    
    for path in cfg.DATASET_PATH:
        datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
        if not os.path.exists(datasetpath):
            os.makedirs(datasetpath, exist_ok=True)
            !kaggle datasets download $path -p $datasetpath
            filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
            !unzip -d $datasetpath $filepath
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [9]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_multilabelstratifiedkfold(train, target_col, n_splits, seed):
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

In [10]:
def mcrmse(cfg, preds, df):
    all_score = 0
    for i, column in enumerate(cfg.target_list):
        score = np.sqrt(mean_squared_error(preds[:, i], df[column]))
        all_score += score/len(cfg.target_list)
    return all_score

def get_logger(filename):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

In [11]:
# 文章のバグを治す
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [12]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['full_text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

# dataset
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.text[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [13]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )
        
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min = 1e-9)
        mean_embeddings = sum_embeddings/sum_mask
        return mean_embeddings
        

class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )

    def forward(self, ft_all_layers):
        all_layer_embedding = torch.stack(ft_all_layers)
        all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :]

        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()

        return weighted_average

class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        # self.weighted_layer_pool = WeightedLayerPooling(self.config.num_hidden_layers)
        # self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.drop4 = nn.Dropout(0.4)
        self.drop5 = nn.Dropout(0.5)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.model.encoder.layer[:4])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        # all_layer_embeddings = outputs[1]
        # feature = self.weighted_layer_pool(all_layer_embeddings)
        # feature = self.pool(feature, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels):
        # batch, hidden_size
        feature = self.feature(inputs)
        feature = self.ln(feature)
        # feature1 = self.drop1(feature)
        # feature2 = self.drop2(feature)
        # feature3 = self.drop3(feature)
        # feature4 = self.drop4(feature)
        # feature5 = self.drop5(feature)
        # feature = (feature1 + feature2 + feature3 + feature4 + feature5) / 5
        output = self.fc(feature)

        if labels is not None:
            loss_fct = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

import math
from torch.autograd.function import InplaceFunction
from torch.nn import Parameter
import torch.nn.init as init
class Mixout(InplaceFunction):
    @staticmethod
    def _make_noise(input):
        return input.new().resize_as_(input)
    @classmethod
    def forward(cls, ctx, input, target=None, p=0.0, training=False, inplace=False):
        if p < 0 or p > 1:
            raise ValueError("A mix probability of mixout has to be between 0 and 1," " but got {}".format(p))
        if target is not None and input.size() != target.size():
            raise ValueError(
                "A target tensor size must match with a input tensor size {},"
                " but got {}".format(input.size(), target.size())
            )
        ctx.p = p
        ctx.training = training
        if ctx.p == 0 or not ctx.training:
            return input
        if target is None:
            target = cls._make_noise(input)
            target.fill_(0)
        target = target.to(input.device)
        if inplace:
            ctx.mark_dirty(input)
            output = input
        else:
            output = input.clone()
        ctx.noise = cls._make_noise(input)
        if len(ctx.noise.size()) == 1:
            ctx.noise.bernoulli_(1 - ctx.p)
        else:
            ctx.noise[0].bernoulli_(1 - ctx.p)
            ctx.noise = ctx.noise[0].repeat(input.size()[0], 1)
        ctx.noise.expand_as(input)
        if ctx.p == 1:
            output = target
        else:
            output = ((1 - ctx.noise) * target + ctx.noise * output - ctx.p * target) / (1 - ctx.p)
        return output
    @staticmethod
    def backward(ctx, grad_output):
        if ctx.p > 0 and ctx.training:
            return grad_output * ctx.noise, None, None, None, None
        else:
            return grad_output, None, None, None, None
def mixout(input, target=None, p=0.0, training=False, inplace=False):
    return Mixout.apply(input, target, p, training, inplace)
class MixLinear(torch.nn.Module):
    __constants__ = ["bias", "in_features", "out_features"]
    def __init__(self, in_features, out_features, bias=True, target=None, p=0.0):
        super(MixLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter("bias", None)
        self.reset_parameters()
        self.target = target
        self.p = p
    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)
    def forward(self, input):
        return F.linear(input, mixout(self.weight, self.target, self.p, self.training), self.bias)
    def extra_repr(self):
        type = "drop" if self.target is None else "mix"
        return "{}={}, in_features={}, out_features={}, bias={}".format(
            type + "out", self.p, self.in_features, self.out_features, self.bias is not None
        )
def replace_mixout(model):
    for sup_module in model.modules():
        for name, module in sup_module.named_children():
            if isinstance(module, nn.Dropout):
                module.p = 0.0
            if isinstance(module, nn.Linear):
                target_state_dict = module.state_dict()
                bias = True if module.bias is not None else False
                new_module = MixLinear(
                    module.in_features, module.out_features, bias, target_state_dict["weight"], 0.2
                )
                new_module.load_state_dict(target_state_dict)
                setattr(sup_module, name, new_module)
    return model

In [14]:
def get_optimizer_grouped_parameters(cfg, model):
    '''Layerwise Learning Rate Decay'''
    model_type = 'model'
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if model_type not in n],
             'lr': cfg.decoder_lr, 'weight_decay': 0.0},
    ]
    num_layers = model.config.num_hidden_layers
    layers = [getattr(model, model_type).embeddings] + list(getattr(model, model_type).encoder.layer)
    layers.reverse()
    lr = cfg.encoder_lr
    for layer in layers:
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": cfg.weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
        
        lr *= cfg.lr_weight_decay
    return optimizer_grouped_parameters


# initialize layer
def reinit_bert(model):
    for layer in model.model.encoder.layer[-1:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    return model

def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

In [15]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [16]:
def valid_fn(cfg, valid_loader, model, valid_df, fold, epoch, step, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        # with tqdm(valid_loader, total=len(valid_loader)) as pbar:
        # for (inputs, labels) in pbar:
        for (inputs, labels) in valid_loader:
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast(dtype=torch.bfloat16):
                loss, output = model(inputs, labels)

            output = output.detach().cpu().to(torch.float16).numpy()
            val_preds.append(output)
            val_losses.append(loss.item() * len(labels))
            val_nums.append(len(labels))

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    LOGGER.info(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epochs}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if best_val_score > score:
        # print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(), 
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
    
    return best_val_preds, best_val_score, val_loss

def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 6), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        LOGGER.info(f'{"="*30} Fold{fold} {"="*30}')
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        model = reinit_bert(model)
        model = replace_mixout(model)
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)


        # enable FGM
        fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            LOGGER.info(f'{"="*20} epoch{epoch} {"="*20}')
            train_losses = []
            train_nums = []
            model.train() 
            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex, dtype=torch.bfloat16):
                        loss, output = model(inputs, labels)

                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                     # FGM attack
                    fgm.attack()
                    with autocast(enabled=cfg.apex, dtype=torch.bfloat16):
                        loss_adv, _ = model(inputs, labels)
                    scaler.scale(loss_adv).backward()
                    fgm.restore()
                    
                    if cfg.clip_grad_norm is not None:
                        # scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(),
                            cfg.clip_grad_norm
                        )
                        
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        best_val_preds, best_val_score, val_loss = valid_fn(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            epoch,
                            step,
                            best_val_preds,
                            best_val_score,
                        )
                        model.train()

                    if cfg.wandb:
                        wandb.log({f"[fold{fold}] train_loss": loss.item(),
                                f"[fold{fold}] lr": scheduler.get_lr()[0]})

            train_loss = sum(train_losses)/sum(train_nums)

            LOGGER.info(f'Fold{fold}, Epoch{epoch}/{cfg.n_epochs} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss = valid_fn(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch, 
                        f"[fold{fold}] avg_train_loss": loss.item(), 
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model, fgm
        gc.collect()
        torch.cuda.empty_cache()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    LOGGER.info(f'fold score: {fold_score}')
    LOGGER.info(f'CV: {round(score, 4)}')
    return score

In [17]:
# setup
cfg = setup(Config)
LOGGER = get_logger(Config.OUTPUT_EXP)

In [18]:
# main
content_df = pd.read_csv(os.path.join(cfg.INPUT, 'content.csv'))
correlations_df = pd.read_csv(os.path.join(cfg.INPUT, 'correlations.csv'))
topics_df = pd.read_csv(os.path.join(cfg.INPUT, 'topics.csv'))
sub_df = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

In [42]:
model = AutoModel.from_pretrained(cfg.MODEL_PATH)
model.eval()
model.to(cfg.device)

tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer = tokenizer

In [20]:
# vecs = []
# for _, row in tqdm(content_df.fillna("").iterrows(), total=len(content_df)):
#     texts = row['title']
#     if texts == "":
#         texts = "no title"
#     tok = tokenizer(texts)
#     for k, v in tok.items():
#         tok[k] = torch.tensor(v[:cfg.max_len]).to(cfg.device).unsqueeze(0)
#     with torch.no_grad():
#         output = model(**tok)
#     vec = output.last_hidden_state.squeeze(0).mean(0).cpu()

#     vecs.append(vec)

# vecs1 = torch.stack(vecs)

In [21]:
class EmbDataSet(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        df['title'] = df['title'].fillna("no title")
        self.title = df['title'].to_numpy()

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = self.prepare_input(self.cfg, self.title[index])
        return title

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs



In [95]:
content_dataset = EmbDataSet(cfg, content_df)
topics_dataset = EmbDataSet(cfg, topics_df)

content_loader = DataLoader(
    dataset=content_dataset, 
    batch_size=256,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
)

topics_loader = DataLoader(
    dataset=topics_dataset, 
    batch_size=256,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
)

content_vec = []
for step, contents in enumerate(tqdm(content_loader)):
    contents = collate(contents)
    for k, v in contents.items():
        contents[k] = v.to(cfg.device)
    with torch.no_grad():
        output = model(**contents)
    vec = output.last_hidden_state.mean(1).cpu().numpy()
    content_vec.append(vec)
content_vec = np.concatenate(content_vec)


topics_vec = []
for step, topics in enumerate(tqdm(topics_loader)):
    topics = collate(topics)
    for k, v in topics.items():
        topics[k] = v.to(cfg.device)
    with torch.no_grad():
        output = model(**topics)
    vec = output.last_hidden_state.mean(1).cpu().numpy()
    topics_vec.append(vec)
topics_vec = np.concatenate(topics_vec)

# del model
# torch.cuda.empty_cache()
# gc.collect();

  0%|          | 0/602 [00:00<?, ?it/s]

  0%|          | 0/301 [00:00<?, ?it/s]

In [43]:
vecs = []
for _, row in tqdm(topics_df.fillna('').iterrows(), total=len(topics_df)):
    texts = row['title']
    if texts == '':
        texts = "no title"
        
    tok = tokenizer(texts)
    for k, v in tok.items():
        tok[k] = torch.tensor(v[:cfg.max_len]).to(cfg.device).unsqueeze(0)
    with torch.no_grad():
        output = model(**tok)
    vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
    
    vecs.append(vec)
    
vecs2 = torch.stack(vecs)

  0%|          | 0/76972 [00:00<?, ?it/s]

In [100]:
id2content_dict = dict(content_df["id"])
id2topics_dict = dict(topics_df["id"])

In [101]:
from cuml import NearestNeighbors

In [104]:
# contentがどのtopicにマッチするかを予測
model = NearestNeighbors(n_neighbors=60)
model.fit(topics_vec)
distances, indices = model.kneighbors(content_vec)

knn_pred_c2t_dict = {k:[] for k in topics_df["id"]}

for idx, i in enumerate(indices):
    for j in i:
        knn_pred_c2t_dict[id2topics_dict[j]].append(id2content_dict[idx])


# topicがどのcontentにマッチするかを予測
model = NearestNeighbors(n_neighbors=40)
model.fit(content_vec)
distances, indices = model.kneighbors(topics_vec)

knn_pred_t2c_dict = {k:[] for k in topics_df["id"]}

for idx, i in enumerate(indices):
    for j in i:
        knn_pred_t2c_dict[id2topics_dict[idx]].append(id2content_dict[j])

In [105]:
knn_pred_dict = {k:np.unique(v_c2t+v_t2c) for k,v_c2t, v_t2c in zip(knn_pred_c2t_dict.keys(), knn_pred_c2t_dict.values(), knn_pred_t2c_dict.values())}
knn_pred_id = {k:[" ".join(v)] for k,v in knn_pred_dict.items()}
knn_pred_df = pd.DataFrame(knn_pred_id).T.reset_index()
knn_pred_df.columns = ["topic_id", "content_ids"]
knn_pred_df = knn_pred_df[knn_pred_df["topic_id"].isin(correlations_df["topic_id"].to_list())] # trainに入っているtopicを抽出
knn_pred_df = knn_pred_df.reset_index(drop=True)

In [106]:
def fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    # pdb.set_trace()
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        precision = len(TP) / (len(pred))
        recall = len(TP) / len(true)
        f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        score_list.append(f2)
    score = sum(score_list) / len(score_list)
    return score

def recall_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        TP = (set(true) & set(pred))
        # precision = len(TP) / (len(pred))
        recall = len(TP) / len(true)
        # f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        
        score_list.append(recall)
    score = sum(score_list) / len(score_list)
    return score

recall = recall_score(correlations_df['content_ids'], knn_pred_df['content_ids'])
print(f"recall = {round(recall, 5)}")
n_bin_data = knn_pred_df["content_ids"].apply(lambda x: len(x.split())).sum()
print(f"n_data = {n_bin_data}")
n_true = correlations_df["content_ids"].apply(lambda x: len(x.split())).sum() * recall
print(f"n_true = {int(n_true)}")

recall = 0.47808
n_data = 8606631
n_true = 133824


In [298]:
content_df

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


In [300]:
topics_df

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
...,...,...,...,...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
76969,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True


In [16]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)
LOGGER = get_logger(Config.OUTPUT_EXP)

LOGGER.info(f"tokenizers.__version__: {tokenizers.__version__}")
LOGGER.info(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

train = processing_features(train)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

score = training(cfg, train)

if cfg.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1


env: TOKENIZERS_PARALLELISM=true




  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 40 | val_loss: 1.68937, score: 2.28751
Fold: 0, Epoch: 0/5, Step: 80 | val_loss: 0.23675, score: 0.70608
Fold: 0, Epoch: 0/5, Step: 120 | val_loss: 0.14364, score: 0.53861
Fold: 0, Epoch: 0/5, Step: 160 | val_loss: 0.14033, score: 0.5285
Fold: 0, Epoch: 0/5, Step: 200 | val_loss: 0.1283, score: 0.50636
Fold: 0, Epoch: 0/5, Step: 240 | val_loss: 0.11897, score: 0.48797
Fold: 0, Epoch: 0/5, Step: 280 | val_loss: 0.10919, score: 0.46783
Fold: 0, Epoch: 0/5, Step: 320 | val_loss: 0.14399, score: 0.52949
Fold: 0, Epoch: 0/5, Step: 360 | val_loss: 0.12367, score: 0.49766
Fold: 0, Epoch: 0/5, Step: 400 | val_loss: 0.11565, score: 0.48172
Fold0, Epoch0/5 | train_loss: 0.38747
Fold: 0, Epoch: 0/5, Step: end | val_loss: 0.10683, score: 0.46239


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 40 | val_loss: 0.11697, score: 0.48414
Fold: 0, Epoch: 1/5, Step: 80 | val_loss: 0.11148, score: 0.47267
Fold: 0, Epoch: 1/5, Step: 120 | val_loss: 0.14372, score: 0.53458
Fold: 0, Epoch: 1/5, Step: 160 | val_loss: 0.10587, score: 0.46092
Fold: 0, Epoch: 1/5, Step: 200 | val_loss: 0.11316, score: 0.4762
Fold: 0, Epoch: 1/5, Step: 240 | val_loss: 0.12201, score: 0.49411
Fold: 0, Epoch: 1/5, Step: 280 | val_loss: 0.11264, score: 0.47439
Fold: 0, Epoch: 1/5, Step: 320 | val_loss: 0.11989, score: 0.48968
Fold: 0, Epoch: 1/5, Step: 360 | val_loss: 0.11134, score: 0.47067
Fold: 0, Epoch: 1/5, Step: 400 | val_loss: 0.1052, score: 0.45897
Fold0, Epoch1/5 | train_loss: 0.11464
Fold: 0, Epoch: 1/5, Step: end | val_loss: 0.10518, score: 0.4591


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 40 | val_loss: 0.11399, score: 0.47868
Fold: 0, Epoch: 2/5, Step: 80 | val_loss: 0.10824, score: 0.46587
Fold: 0, Epoch: 2/5, Step: 120 | val_loss: 0.10192, score: 0.45199
Fold: 0, Epoch: 2/5, Step: 160 | val_loss: 0.10115, score: 0.4503
Fold: 0, Epoch: 2/5, Step: 200 | val_loss: 0.10948, score: 0.46855
Fold: 0, Epoch: 2/5, Step: 240 | val_loss: 0.10289, score: 0.4542
Fold: 0, Epoch: 2/5, Step: 280 | val_loss: 0.10809, score: 0.46559
Fold: 0, Epoch: 2/5, Step: 320 | val_loss: 0.10793, score: 0.46421
Fold: 0, Epoch: 2/5, Step: 360 | val_loss: 0.10063, score: 0.44901
Fold: 0, Epoch: 2/5, Step: 400 | val_loss: 0.10322, score: 0.45471
Fold0, Epoch2/5 | train_loss: 0.10371
Fold: 0, Epoch: 2/5, Step: end | val_loss: 0.10422, score: 0.45708


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 40 | val_loss: 0.10954, score: 0.46909
Fold: 0, Epoch: 3/5, Step: 80 | val_loss: 0.10426, score: 0.45717
Fold: 0, Epoch: 3/5, Step: 120 | val_loss: 0.10019, score: 0.44815
Fold: 0, Epoch: 3/5, Step: 160 | val_loss: 0.10207, score: 0.45252
Fold: 0, Epoch: 3/5, Step: 200 | val_loss: 0.10237, score: 0.4527
Fold: 0, Epoch: 3/5, Step: 240 | val_loss: 0.10218, score: 0.45264
Fold: 0, Epoch: 3/5, Step: 280 | val_loss: 0.10004, score: 0.4476
Fold: 0, Epoch: 3/5, Step: 320 | val_loss: 0.10236, score: 0.45241
Fold: 0, Epoch: 3/5, Step: 360 | val_loss: 0.10046, score: 0.44841
Fold: 0, Epoch: 3/5, Step: 400 | val_loss: 0.10085, score: 0.44961
Fold0, Epoch3/5 | train_loss: 0.09547
Fold: 0, Epoch: 3/5, Step: end | val_loss: 0.09942, score: 0.44624


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 40 | val_loss: 0.10041, score: 0.44853
Fold: 0, Epoch: 4/5, Step: 80 | val_loss: 0.09947, score: 0.44631
Fold: 0, Epoch: 4/5, Step: 120 | val_loss: 0.09865, score: 0.44458
Fold: 0, Epoch: 4/5, Step: 160 | val_loss: 0.10141, score: 0.4508
Fold: 0, Epoch: 4/5, Step: 200 | val_loss: 0.0991, score: 0.44551
Fold: 0, Epoch: 4/5, Step: 240 | val_loss: 0.09878, score: 0.44473
Fold: 0, Epoch: 4/5, Step: 280 | val_loss: 0.099, score: 0.44523
Fold: 0, Epoch: 4/5, Step: 320 | val_loss: 0.0993, score: 0.44594
Fold: 0, Epoch: 4/5, Step: 360 | val_loss: 0.09911, score: 0.44551
Fold: 0, Epoch: 4/5, Step: 400 | val_loss: 0.09918, score: 0.44568
Fold0, Epoch4/5 | train_loss: 0.08987
Fold: 0, Epoch: 4/5, Step: end | val_loss: 0.09917, score: 0.44565


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: 40 | val_loss: 1.94304, score: 2.53146
Fold: 1, Epoch: 0/5, Step: 80 | val_loss: 0.29822, score: 0.79623
Fold: 1, Epoch: 0/5, Step: 120 | val_loss: 0.16042, score: 0.5708
Fold: 1, Epoch: 0/5, Step: 160 | val_loss: 0.12701, score: 0.50523
Fold: 1, Epoch: 0/5, Step: 200 | val_loss: 0.12339, score: 0.49838
Fold: 1, Epoch: 0/5, Step: 240 | val_loss: 0.12107, score: 0.49208
Fold: 1, Epoch: 0/5, Step: 280 | val_loss: 0.14202, score: 0.5326
Fold: 1, Epoch: 0/5, Step: 320 | val_loss: 0.11296, score: 0.47603
Fold: 1, Epoch: 0/5, Step: 360 | val_loss: 0.11481, score: 0.47999
Fold: 1, Epoch: 0/5, Step: 400 | val_loss: 0.11322, score: 0.47694
Fold1, Epoch0/5 | train_loss: 0.43001
Fold: 1, Epoch: 0/5, Step: end | val_loss: 0.12072, score: 0.49295


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: 40 | val_loss: 0.11436, score: 0.47923
Fold: 1, Epoch: 1/5, Step: 80 | val_loss: 0.12235, score: 0.49446
Fold: 1, Epoch: 1/5, Step: 120 | val_loss: 0.11081, score: 0.47211
Fold: 1, Epoch: 1/5, Step: 160 | val_loss: 0.11769, score: 0.48674
Fold: 1, Epoch: 1/5, Step: 200 | val_loss: 0.11989, score: 0.49059
Fold: 1, Epoch: 1/5, Step: 240 | val_loss: 0.11711, score: 0.48388
Fold: 1, Epoch: 1/5, Step: 280 | val_loss: 0.11185, score: 0.47409
Fold: 1, Epoch: 1/5, Step: 320 | val_loss: 0.11313, score: 0.476
Fold: 1, Epoch: 1/5, Step: 360 | val_loss: 0.11911, score: 0.48862
Fold: 1, Epoch: 1/5, Step: 400 | val_loss: 0.11823, score: 0.48678
Fold1, Epoch1/5 | train_loss: 0.111
Fold: 1, Epoch: 1/5, Step: end | val_loss: 0.11357, score: 0.47741


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: 40 | val_loss: 0.11347, score: 0.47749
Fold: 1, Epoch: 2/5, Step: 80 | val_loss: 0.1098, score: 0.46972
Fold: 1, Epoch: 2/5, Step: 120 | val_loss: 0.14176, score: 0.5346
Fold: 1, Epoch: 2/5, Step: 160 | val_loss: 0.11417, score: 0.47832
Fold: 1, Epoch: 2/5, Step: 200 | val_loss: 0.10599, score: 0.46153
Fold: 1, Epoch: 2/5, Step: 240 | val_loss: 0.11977, score: 0.49131
Fold: 1, Epoch: 2/5, Step: 280 | val_loss: 0.11006, score: 0.47007
Fold: 1, Epoch: 2/5, Step: 320 | val_loss: 0.11335, score: 0.47733
Fold: 1, Epoch: 2/5, Step: 360 | val_loss: 0.10368, score: 0.45631
Fold: 1, Epoch: 2/5, Step: 400 | val_loss: 0.10968, score: 0.46952
Fold1, Epoch2/5 | train_loss: 0.10406
Fold: 1, Epoch: 2/5, Step: end | val_loss: 0.10473, score: 0.45842


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: 40 | val_loss: 0.11661, score: 0.48439
Fold: 1, Epoch: 3/5, Step: 80 | val_loss: 0.11284, score: 0.47612
Fold: 1, Epoch: 3/5, Step: 120 | val_loss: 0.10629, score: 0.46199
Fold: 1, Epoch: 3/5, Step: 160 | val_loss: 0.10528, score: 0.45956
Fold: 1, Epoch: 3/5, Step: 200 | val_loss: 0.10471, score: 0.45847
Fold: 1, Epoch: 3/5, Step: 240 | val_loss: 0.10462, score: 0.45822
Fold: 1, Epoch: 3/5, Step: 280 | val_loss: 0.1035, score: 0.4558
Fold: 1, Epoch: 3/5, Step: 320 | val_loss: 0.10454, score: 0.45805
Fold: 1, Epoch: 3/5, Step: 360 | val_loss: 0.105, score: 0.45919
Fold: 1, Epoch: 3/5, Step: 400 | val_loss: 0.10415, score: 0.4574
Fold1, Epoch3/5 | train_loss: 0.09478
Fold: 1, Epoch: 3/5, Step: end | val_loss: 0.10449, score: 0.45814


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: 40 | val_loss: 0.10391, score: 0.45676
Fold: 1, Epoch: 4/5, Step: 80 | val_loss: 0.10442, score: 0.45781
wandb: Network error (ConnectionError), entering retry loop.
Fold: 1, Epoch: 4/5, Step: 120 | val_loss: 0.1038, score: 0.45655
Fold: 1, Epoch: 4/5, Step: 160 | val_loss: 0.1048, score: 0.45887
Fold: 1, Epoch: 4/5, Step: 200 | val_loss: 0.104, score: 0.45702
Fold: 1, Epoch: 4/5, Step: 240 | val_loss: 0.10397, score: 0.45696
Fold: 1, Epoch: 4/5, Step: 280 | val_loss: 0.10407, score: 0.45724
Fold: 1, Epoch: 4/5, Step: 320 | val_loss: 0.10391, score: 0.45683
Fold: 1, Epoch: 4/5, Step: 360 | val_loss: 0.10387, score: 0.45674
Fold: 1, Epoch: 4/5, Step: 400 | val_loss: 0.10379, score: 0.45654
Fold1, Epoch4/5 | train_loss: 0.08973
Fold: 1, Epoch: 4/5, Step: end | val_loss: 0.10374, score: 0.45643


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: 40 | val_loss: 1.40984, score: 2.01739
Fold: 2, Epoch: 0/5, Step: 80 | val_loss: 0.21718, score: 0.6708
Fold: 2, Epoch: 0/5, Step: 120 | val_loss: 0.15265, score: 0.55787
Fold: 2, Epoch: 0/5, Step: 160 | val_loss: 0.13468, score: 0.52102
Fold: 2, Epoch: 0/5, Step: 200 | val_loss: 0.11493, score: 0.48134
Fold: 2, Epoch: 0/5, Step: 240 | val_loss: 0.13771, score: 0.52592
Fold: 2, Epoch: 0/5, Step: 280 | val_loss: 0.11585, score: 0.48292
Fold: 2, Epoch: 0/5, Step: 320 | val_loss: 0.11216, score: 0.47513
Fold: 2, Epoch: 0/5, Step: 360 | val_loss: 0.11688, score: 0.48454
Fold: 2, Epoch: 0/5, Step: 400 | val_loss: 0.14281, score: 0.53299
Fold2, Epoch0/5 | train_loss: 0.34099
Fold: 2, Epoch: 0/5, Step: end | val_loss: 0.12048, score: 0.49188


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: 40 | val_loss: 0.11069, score: 0.47132
Fold: 2, Epoch: 1/5, Step: 80 | val_loss: 0.10912, score: 0.46881
Fold: 2, Epoch: 1/5, Step: 120 | val_loss: 0.11256, score: 0.47566
Fold: 2, Epoch: 1/5, Step: 160 | val_loss: 0.1263, score: 0.50351
Fold: 2, Epoch: 1/5, Step: 200 | val_loss: 0.11863, score: 0.48758
Fold: 2, Epoch: 1/5, Step: 240 | val_loss: 0.11741, score: 0.48581
Fold: 2, Epoch: 1/5, Step: 280 | val_loss: 0.11267, score: 0.47592
Fold: 2, Epoch: 1/5, Step: 320 | val_loss: 0.10506, score: 0.45938
Fold: 2, Epoch: 1/5, Step: 360 | val_loss: 0.10929, score: 0.46821
Fold: 2, Epoch: 1/5, Step: 400 | val_loss: 0.10908, score: 0.46809
Fold2, Epoch1/5 | train_loss: 0.11367
Fold: 2, Epoch: 1/5, Step: end | val_loss: 0.11483, score: 0.48


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: 40 | val_loss: 0.10426, score: 0.45762
Fold: 2, Epoch: 2/5, Step: 80 | val_loss: 0.11401, score: 0.47849
Fold: 2, Epoch: 2/5, Step: 120 | val_loss: 0.10363, score: 0.45608
Fold: 2, Epoch: 2/5, Step: 160 | val_loss: 0.11741, score: 0.48548
Fold: 2, Epoch: 2/5, Step: 200 | val_loss: 0.12608, score: 0.50293
Fold: 2, Epoch: 2/5, Step: 240 | val_loss: 0.12916, score: 0.50822
Fold: 2, Epoch: 2/5, Step: 280 | val_loss: 0.10412, score: 0.45715
Fold: 2, Epoch: 2/5, Step: 320 | val_loss: 0.11047, score: 0.47101
Fold: 2, Epoch: 2/5, Step: 360 | val_loss: 0.10721, score: 0.46371
Fold: 2, Epoch: 2/5, Step: 400 | val_loss: 0.11765, score: 0.48595
Fold2, Epoch2/5 | train_loss: 0.10271
Fold: 2, Epoch: 2/5, Step: end | val_loss: 0.10886, score: 0.468


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: 40 | val_loss: 0.10733, score: 0.46458
Fold: 2, Epoch: 3/5, Step: 80 | val_loss: 0.10348, score: 0.45604
Fold: 2, Epoch: 3/5, Step: 120 | val_loss: 0.10824, score: 0.46611
Fold: 2, Epoch: 3/5, Step: 160 | val_loss: 0.10275, score: 0.45428
Fold: 2, Epoch: 3/5, Step: 200 | val_loss: 0.1031, score: 0.45482
Fold: 2, Epoch: 3/5, Step: 240 | val_loss: 0.10244, score: 0.45348
Fold: 2, Epoch: 3/5, Step: 280 | val_loss: 0.1089, score: 0.4675
Fold: 2, Epoch: 3/5, Step: 320 | val_loss: 0.10229, score: 0.45308
Fold: 2, Epoch: 3/5, Step: 360 | val_loss: 0.10542, score: 0.45981
Fold: 2, Epoch: 3/5, Step: 400 | val_loss: 0.10391, score: 0.45677
Fold2, Epoch3/5 | train_loss: 0.0941
Fold: 2, Epoch: 3/5, Step: end | val_loss: 0.10438, score: 0.45749


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: 40 | val_loss: 0.1023, score: 0.45324
Fold: 2, Epoch: 4/5, Step: 80 | val_loss: 0.10263, score: 0.45397
Fold: 2, Epoch: 4/5, Step: 120 | val_loss: 0.1022, score: 0.45299
Fold: 2, Epoch: 4/5, Step: 160 | val_loss: 0.10335, score: 0.45533
Fold: 2, Epoch: 4/5, Step: 200 | val_loss: 0.10277, score: 0.45408
Fold: 2, Epoch: 4/5, Step: 240 | val_loss: 0.10343, score: 0.45556
Fold: 2, Epoch: 4/5, Step: 280 | val_loss: 0.10323, score: 0.45509
Fold: 2, Epoch: 4/5, Step: 320 | val_loss: 0.10287, score: 0.45433
Fold: 2, Epoch: 4/5, Step: 360 | val_loss: 0.1025, score: 0.45349
Fold: 2, Epoch: 4/5, Step: 400 | val_loss: 0.10243, score: 0.45334
Fold2, Epoch4/5 | train_loss: 0.08939
Fold: 2, Epoch: 4/5, Step: end | val_loss: 0.10243, score: 0.45334


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: 40 | val_loss: 1.83664, score: 2.42428
Fold: 3, Epoch: 0/5, Step: 80 | val_loss: 0.25528, score: 0.73303
Fold: 3, Epoch: 0/5, Step: 120 | val_loss: 0.15586, score: 0.56219
Fold: 3, Epoch: 0/5, Step: 160 | val_loss: 0.11996, score: 0.49085
Fold: 3, Epoch: 0/5, Step: 200 | val_loss: 0.11018, score: 0.47018
Fold: 3, Epoch: 0/5, Step: 240 | val_loss: 0.12297, score: 0.49408
Fold: 3, Epoch: 0/5, Step: 280 | val_loss: 0.11767, score: 0.48661
Fold: 3, Epoch: 0/5, Step: 320 | val_loss: 0.1151, score: 0.48032
Fold: 3, Epoch: 0/5, Step: 360 | val_loss: 0.10655, score: 0.46156
Fold: 3, Epoch: 0/5, Step: 400 | val_loss: 0.11092, score: 0.46983
Fold3, Epoch0/5 | train_loss: 0.40702
Fold: 3, Epoch: 0/5, Step: end | val_loss: 0.10655, score: 0.46136


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: 40 | val_loss: 0.11949, score: 0.48894
Fold: 3, Epoch: 1/5, Step: 80 | val_loss: 0.12706, score: 0.50276
Fold: 3, Epoch: 1/5, Step: 120 | val_loss: 0.10404, score: 0.45693
Fold: 3, Epoch: 1/5, Step: 160 | val_loss: 0.10652, score: 0.46257
Fold: 3, Epoch: 1/5, Step: 200 | val_loss: 0.10364, score: 0.45532
Fold: 3, Epoch: 1/5, Step: 240 | val_loss: 0.11093, score: 0.4709
Fold: 3, Epoch: 1/5, Step: 280 | val_loss: 0.10433, score: 0.45729
Fold: 3, Epoch: 1/5, Step: 320 | val_loss: 0.10789, score: 0.46453
Fold: 3, Epoch: 1/5, Step: 360 | val_loss: 0.11942, score: 0.48695
Fold: 3, Epoch: 1/5, Step: 400 | val_loss: 0.10602, score: 0.46102
Fold3, Epoch1/5 | train_loss: 0.10994
Fold: 3, Epoch: 1/5, Step: end | val_loss: 0.1052, score: 0.45891


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: 40 | val_loss: 0.10203, score: 0.45159
Fold: 3, Epoch: 2/5, Step: 80 | val_loss: 0.11041, score: 0.47013
Fold: 3, Epoch: 2/5, Step: 120 | val_loss: 0.11712, score: 0.48265
Fold: 3, Epoch: 2/5, Step: 160 | val_loss: 0.10107, score: 0.44993
Fold: 3, Epoch: 2/5, Step: 200 | val_loss: 0.10209, score: 0.45208
Fold: 3, Epoch: 2/5, Step: 240 | val_loss: 0.10326, score: 0.4545
Fold: 3, Epoch: 2/5, Step: 280 | val_loss: 0.10442, score: 0.45747
Fold: 3, Epoch: 2/5, Step: 320 | val_loss: 0.13681, score: 0.52459
Fold: 3, Epoch: 2/5, Step: 360 | val_loss: 0.10764, score: 0.4648
Fold: 3, Epoch: 2/5, Step: 400 | val_loss: 0.11229, score: 0.47341
Fold3, Epoch2/5 | train_loss: 0.10281
Fold: 3, Epoch: 2/5, Step: end | val_loss: 0.102, score: 0.45207


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: 40 | val_loss: 0.10137, score: 0.4506
Fold: 3, Epoch: 3/5, Step: 80 | val_loss: 0.10454, score: 0.45774
Fold: 3, Epoch: 3/5, Step: 120 | val_loss: 0.10141, score: 0.45067
Fold: 3, Epoch: 3/5, Step: 160 | val_loss: 0.10126, score: 0.45001
Fold: 3, Epoch: 3/5, Step: 200 | val_loss: 0.09845, score: 0.44408
Fold: 3, Epoch: 3/5, Step: 240 | val_loss: 0.10085, score: 0.44933
Fold: 3, Epoch: 3/5, Step: 280 | val_loss: 0.10353, score: 0.45567
Fold: 3, Epoch: 3/5, Step: 320 | val_loss: 0.09993, score: 0.44744
Fold: 3, Epoch: 3/5, Step: 360 | val_loss: 0.09949, score: 0.44664
Fold: 3, Epoch: 3/5, Step: 400 | val_loss: 0.09954, score: 0.44658
Fold3, Epoch3/5 | train_loss: 0.09492
Fold: 3, Epoch: 3/5, Step: end | val_loss: 0.09969, score: 0.4466


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: 40 | val_loss: 0.09936, score: 0.44617
Fold: 3, Epoch: 4/5, Step: 80 | val_loss: 0.09965, score: 0.44691
Fold: 3, Epoch: 4/5, Step: 120 | val_loss: 0.09863, score: 0.44424
Fold: 3, Epoch: 4/5, Step: 160 | val_loss: 0.09825, score: 0.44355
Fold: 3, Epoch: 4/5, Step: 200 | val_loss: 0.09909, score: 0.44529
Fold: 3, Epoch: 4/5, Step: 240 | val_loss: 0.09815, score: 0.44342
Fold: 3, Epoch: 4/5, Step: 280 | val_loss: 0.09835, score: 0.4437
Fold: 3, Epoch: 4/5, Step: 320 | val_loss: 0.09835, score: 0.44366
Fold: 3, Epoch: 4/5, Step: 360 | val_loss: 0.09826, score: 0.44347
Fold: 3, Epoch: 4/5, Step: 400 | val_loss: 0.09824, score: 0.44343
Fold3, Epoch4/5 | train_loss: 0.08934
Fold: 3, Epoch: 4/5, Step: end | val_loss: 0.09824, score: 0.44343


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: 40 | val_loss: 1.84024, score: 2.44121
Fold: 4, Epoch: 0/5, Step: 80 | val_loss: 0.29609, score: 0.80007
Fold: 4, Epoch: 0/5, Step: 120 | val_loss: 0.17377, score: 0.59511
Fold: 4, Epoch: 0/5, Step: 160 | val_loss: 0.13124, score: 0.51548
Fold: 4, Epoch: 0/5, Step: 200 | val_loss: 0.12435, score: 0.50131
Fold: 4, Epoch: 0/5, Step: 240 | val_loss: 0.12405, score: 0.50121
Fold: 4, Epoch: 0/5, Step: 280 | val_loss: 0.11858, score: 0.48852
Fold: 4, Epoch: 0/5, Step: 320 | val_loss: 0.12588, score: 0.50425
Fold: 4, Epoch: 0/5, Step: 360 | val_loss: 0.13739, score: 0.52727
Fold: 4, Epoch: 0/5, Step: 400 | val_loss: 0.12364, score: 0.49919
Fold4, Epoch0/5 | train_loss: 0.42785
Fold: 4, Epoch: 0/5, Step: end | val_loss: 0.12717, score: 0.50581


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: 40 | val_loss: 0.11139, score: 0.47354
Fold: 4, Epoch: 1/5, Step: 80 | val_loss: 0.12874, score: 0.50575
Fold: 4, Epoch: 1/5, Step: 120 | val_loss: 0.11951, score: 0.49222
Fold: 4, Epoch: 1/5, Step: 160 | val_loss: 0.11738, score: 0.4862
Fold: 4, Epoch: 1/5, Step: 200 | val_loss: 0.11787, score: 0.48736
Fold: 4, Epoch: 1/5, Step: 240 | val_loss: 0.12277, score: 0.49681
Fold: 4, Epoch: 1/5, Step: 280 | val_loss: 0.13128, score: 0.51315
Fold: 4, Epoch: 1/5, Step: 320 | val_loss: 0.10804, score: 0.46632
Fold: 4, Epoch: 1/5, Step: 360 | val_loss: 0.12516, score: 0.50036
Fold: 4, Epoch: 1/5, Step: 400 | val_loss: 0.12825, score: 0.50956
Fold4, Epoch1/5 | train_loss: 0.11098
Fold: 4, Epoch: 1/5, Step: end | val_loss: 0.10919, score: 0.4695


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: 40 | val_loss: 0.12346, score: 0.49945
Fold: 4, Epoch: 2/5, Step: 80 | val_loss: 0.11296, score: 0.47739
Fold: 4, Epoch: 2/5, Step: 120 | val_loss: 0.11007, score: 0.47102
Fold: 4, Epoch: 2/5, Step: 160 | val_loss: 0.10714, score: 0.46453
Fold: 4, Epoch: 2/5, Step: 200 | val_loss: 0.11435, score: 0.48
Fold: 4, Epoch: 2/5, Step: 240 | val_loss: 0.11136, score: 0.47352
Fold: 4, Epoch: 2/5, Step: 280 | val_loss: 0.10838, score: 0.46677
Fold: 4, Epoch: 2/5, Step: 320 | val_loss: 0.11238, score: 0.47627
Fold: 4, Epoch: 2/5, Step: 360 | val_loss: 0.1044, score: 0.4583
Fold: 4, Epoch: 2/5, Step: 400 | val_loss: 0.10555, score: 0.4607
Fold4, Epoch2/5 | train_loss: 0.10135
Fold: 4, Epoch: 2/5, Step: end | val_loss: 0.10559, score: 0.4613


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: 40 | val_loss: 0.11302, score: 0.47688
Fold: 4, Epoch: 3/5, Step: 80 | val_loss: 0.10395, score: 0.45716
Fold: 4, Epoch: 3/5, Step: 120 | val_loss: 0.11174, score: 0.47472
Fold: 4, Epoch: 3/5, Step: 160 | val_loss: 0.10888, score: 0.46816
Fold: 4, Epoch: 3/5, Step: 200 | val_loss: 0.10845, score: 0.46702
Fold: 4, Epoch: 3/5, Step: 240 | val_loss: 0.10737, score: 0.46496
Fold: 4, Epoch: 3/5, Step: 280 | val_loss: 0.10274, score: 0.45466
Fold: 4, Epoch: 3/5, Step: 320 | val_loss: 0.10334, score: 0.45611
Fold: 4, Epoch: 3/5, Step: 360 | val_loss: 0.10364, score: 0.45658
Fold: 4, Epoch: 3/5, Step: 400 | val_loss: 0.10428, score: 0.45804
Fold4, Epoch3/5 | train_loss: 0.09517
Fold: 4, Epoch: 3/5, Step: end | val_loss: 0.10412, score: 0.45754


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: 40 | val_loss: 0.10849, score: 0.46779
Fold: 4, Epoch: 4/5, Step: 80 | val_loss: 0.10307, score: 0.45554
Fold: 4, Epoch: 4/5, Step: 120 | val_loss: 0.10328, score: 0.45598
Fold: 4, Epoch: 4/5, Step: 160 | val_loss: 0.10542, score: 0.46076
Fold: 4, Epoch: 4/5, Step: 200 | val_loss: 0.10318, score: 0.45569
Fold: 4, Epoch: 4/5, Step: 240 | val_loss: 0.10309, score: 0.45551
Fold: 4, Epoch: 4/5, Step: 280 | val_loss: 0.10316, score: 0.45569
Fold: 4, Epoch: 4/5, Step: 320 | val_loss: 0.10363, score: 0.45679
Fold: 4, Epoch: 4/5, Step: 360 | val_loss: 0.10303, score: 0.45538
Fold: 4, Epoch: 4/5, Step: 400 | val_loss: 0.10298, score: 0.45524
Fold4, Epoch4/5 | train_loss: 0.09013
Fold: 4, Epoch: 4/5, Step: end | val_loss: 0.10294, score: 0.45517


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 0/5, Step: 40 | val_loss: 1.81597, score: 2.41364
Fold: 5, Epoch: 0/5, Step: 80 | val_loss: 0.24957, score: 0.71965
Fold: 5, Epoch: 0/5, Step: 120 | val_loss: 0.16304, score: 0.57496
Fold: 5, Epoch: 0/5, Step: 160 | val_loss: 0.11783, score: 0.48677
Fold: 5, Epoch: 0/5, Step: 200 | val_loss: 0.12283, score: 0.49629
Fold: 5, Epoch: 0/5, Step: 240 | val_loss: 0.11663, score: 0.4832
Fold: 5, Epoch: 0/5, Step: 280 | val_loss: 0.11327, score: 0.47714
Fold: 5, Epoch: 0/5, Step: 320 | val_loss: 0.12637, score: 0.49883
Fold: 5, Epoch: 0/5, Step: 360 | val_loss: 0.11137, score: 0.47229
Fold: 5, Epoch: 0/5, Step: 400 | val_loss: 0.12382, score: 0.49642
Fold5, Epoch0/5 | train_loss: 0.42069
Fold: 5, Epoch: 0/5, Step: end | val_loss: 0.10769, score: 0.46369


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 1/5, Step: 40 | val_loss: 0.10927, score: 0.46789
Fold: 5, Epoch: 1/5, Step: 80 | val_loss: 0.11424, score: 0.47765
Fold: 5, Epoch: 1/5, Step: 120 | val_loss: 0.10283, score: 0.45411
Fold: 5, Epoch: 1/5, Step: 160 | val_loss: 0.10404, score: 0.45596
Fold: 5, Epoch: 1/5, Step: 200 | val_loss: 0.11078, score: 0.47181
Fold: 5, Epoch: 1/5, Step: 240 | val_loss: 0.10318, score: 0.45447
Fold: 5, Epoch: 1/5, Step: 280 | val_loss: 0.10146, score: 0.45042
Fold: 5, Epoch: 1/5, Step: 320 | val_loss: 0.10447, score: 0.45745
Fold: 5, Epoch: 1/5, Step: 360 | val_loss: 0.11033, score: 0.46944
Fold: 5, Epoch: 1/5, Step: 400 | val_loss: 0.0996, score: 0.44634
Fold5, Epoch1/5 | train_loss: 0.11122
Fold: 5, Epoch: 1/5, Step: end | val_loss: 0.09898, score: 0.4453


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 2/5, Step: 40 | val_loss: 0.1061, score: 0.45905
Fold: 5, Epoch: 2/5, Step: 80 | val_loss: 0.09812, score: 0.443
Fold: 5, Epoch: 2/5, Step: 120 | val_loss: 0.10146, score: 0.45123
Fold: 5, Epoch: 2/5, Step: 160 | val_loss: 0.10264, score: 0.4527
Fold: 5, Epoch: 2/5, Step: 200 | val_loss: 0.10151, score: 0.45094
Fold: 5, Epoch: 2/5, Step: 240 | val_loss: 0.10132, score: 0.44958
Fold: 5, Epoch: 2/5, Step: 280 | val_loss: 0.10275, score: 0.45326
Fold: 5, Epoch: 2/5, Step: 320 | val_loss: 0.11356, score: 0.47657
Fold: 5, Epoch: 2/5, Step: 360 | val_loss: 0.10261, score: 0.45236
Fold: 5, Epoch: 2/5, Step: 400 | val_loss: 0.10523, score: 0.45839
Fold5, Epoch2/5 | train_loss: 0.10435
Fold: 5, Epoch: 2/5, Step: end | val_loss: 0.10214, score: 0.45161


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 3/5, Step: 40 | val_loss: 0.09613, score: 0.43886
Fold: 5, Epoch: 3/5, Step: 80 | val_loss: 0.10046, score: 0.44812
Fold: 5, Epoch: 3/5, Step: 120 | val_loss: 0.09514, score: 0.43613
Fold: 5, Epoch: 3/5, Step: 160 | val_loss: 0.0954, score: 0.43676
Fold: 5, Epoch: 3/5, Step: 200 | val_loss: 0.09472, score: 0.43538
Fold: 5, Epoch: 3/5, Step: 240 | val_loss: 0.10764, score: 0.46432
Fold: 5, Epoch: 3/5, Step: 280 | val_loss: 0.09748, score: 0.44165
Fold: 5, Epoch: 3/5, Step: 320 | val_loss: 0.09637, score: 0.43913
Fold: 5, Epoch: 3/5, Step: 360 | val_loss: 0.10141, score: 0.45055
Fold: 5, Epoch: 3/5, Step: 400 | val_loss: 0.09867, score: 0.44456
Fold5, Epoch3/5 | train_loss: 0.09536
Fold: 5, Epoch: 3/5, Step: end | val_loss: 0.09743, score: 0.44163


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 4/5, Step: 40 | val_loss: 0.09422, score: 0.4341
Fold: 5, Epoch: 4/5, Step: 80 | val_loss: 0.09619, score: 0.43868
Fold: 5, Epoch: 4/5, Step: 120 | val_loss: 0.09517, score: 0.43651
Fold: 5, Epoch: 4/5, Step: 160 | val_loss: 0.09486, score: 0.43559
Fold: 5, Epoch: 4/5, Step: 200 | val_loss: 0.09417, score: 0.43401
Fold: 5, Epoch: 4/5, Step: 240 | val_loss: 0.09412, score: 0.43387
Fold: 5, Epoch: 4/5, Step: 280 | val_loss: 0.09442, score: 0.43457
Fold: 5, Epoch: 4/5, Step: 320 | val_loss: 0.09463, score: 0.43509
Fold: 5, Epoch: 4/5, Step: 360 | val_loss: 0.0949, score: 0.43573
Fold: 5, Epoch: 4/5, Step: 400 | val_loss: 0.09478, score: 0.43545
Fold5, Epoch4/5 | train_loss: 0.09003
Fold: 5, Epoch: 4/5, Step: end | val_loss: 0.09478, score: 0.43545


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 0/5, Step: 40 | val_loss: 1.82497, score: 2.42011
Fold: 6, Epoch: 0/5, Step: 80 | val_loss: 0.22062, score: 0.67958
Fold: 6, Epoch: 0/5, Step: 120 | val_loss: 0.14714, score: 0.54626
Fold: 6, Epoch: 0/5, Step: 160 | val_loss: 0.13308, score: 0.51823
Fold: 6, Epoch: 0/5, Step: 200 | val_loss: 0.12193, score: 0.49556
Fold: 6, Epoch: 0/5, Step: 240 | val_loss: 0.12394, score: 0.49881
Fold: 6, Epoch: 0/5, Step: 280 | val_loss: 0.11561, score: 0.48228
Fold: 6, Epoch: 0/5, Step: 320 | val_loss: 0.12002, score: 0.4908
Fold: 6, Epoch: 0/5, Step: 360 | val_loss: 0.11311, score: 0.47684
Fold: 6, Epoch: 0/5, Step: 400 | val_loss: 0.12763, score: 0.50653
Fold6, Epoch0/5 | train_loss: 0.41218
Fold: 6, Epoch: 0/5, Step: end | val_loss: 0.11176, score: 0.47381


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 1/5, Step: 40 | val_loss: 0.12974, score: 0.51037
Fold: 6, Epoch: 1/5, Step: 80 | val_loss: 0.11684, score: 0.48414
Fold: 6, Epoch: 1/5, Step: 120 | val_loss: 0.1135, score: 0.47772
Fold: 6, Epoch: 1/5, Step: 160 | val_loss: 0.12306, score: 0.49508
Fold: 6, Epoch: 1/5, Step: 200 | val_loss: 0.11289, score: 0.47623
Fold: 6, Epoch: 1/5, Step: 240 | val_loss: 0.11418, score: 0.4784
Fold: 6, Epoch: 1/5, Step: 280 | val_loss: 0.14292, score: 0.53696
Fold: 6, Epoch: 1/5, Step: 320 | val_loss: 0.11112, score: 0.47227
Fold: 6, Epoch: 1/5, Step: 360 | val_loss: 0.11415, score: 0.47843
Fold: 6, Epoch: 1/5, Step: 400 | val_loss: 0.11, score: 0.47044
Fold6, Epoch1/5 | train_loss: 0.11161
Fold: 6, Epoch: 1/5, Step: end | val_loss: 0.11246, score: 0.4757


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 2/5, Step: 40 | val_loss: 0.10727, score: 0.46416
Fold: 6, Epoch: 2/5, Step: 80 | val_loss: 0.11476, score: 0.48033
Fold: 6, Epoch: 2/5, Step: 120 | val_loss: 0.1173, score: 0.48503
Fold: 6, Epoch: 2/5, Step: 160 | val_loss: 0.11196, score: 0.47443
Fold: 6, Epoch: 2/5, Step: 200 | val_loss: 0.11139, score: 0.47319
Fold: 6, Epoch: 2/5, Step: 240 | val_loss: 0.11693, score: 0.48467
Fold: 6, Epoch: 2/5, Step: 280 | val_loss: 0.10854, score: 0.4669
Fold: 6, Epoch: 2/5, Step: 320 | val_loss: 0.10863, score: 0.467
Fold: 6, Epoch: 2/5, Step: 360 | val_loss: 0.11032, score: 0.47089
Fold: 6, Epoch: 2/5, Step: 400 | val_loss: 0.11134, score: 0.47261
Fold6, Epoch2/5 | train_loss: 0.10296
Fold: 6, Epoch: 2/5, Step: end | val_loss: 0.11787, score: 0.4868


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 3/5, Step: 40 | val_loss: 0.11204, score: 0.47424
Fold: 6, Epoch: 3/5, Step: 80 | val_loss: 0.11028, score: 0.47067
Fold: 6, Epoch: 3/5, Step: 120 | val_loss: 0.11203, score: 0.47438
Fold: 6, Epoch: 3/5, Step: 160 | val_loss: 0.11145, score: 0.47289
Fold: 6, Epoch: 3/5, Step: 200 | val_loss: 0.11828, score: 0.48812
Fold: 6, Epoch: 3/5, Step: 240 | val_loss: 0.10708, score: 0.46369
Fold: 6, Epoch: 3/5, Step: 280 | val_loss: 0.10775, score: 0.46517
Fold: 6, Epoch: 3/5, Step: 320 | val_loss: 0.1104, score: 0.471
Fold: 6, Epoch: 3/5, Step: 360 | val_loss: 0.10756, score: 0.46491
Fold: 6, Epoch: 3/5, Step: 400 | val_loss: 0.10904, score: 0.46818
Fold6, Epoch3/5 | train_loss: 0.09428
Fold: 6, Epoch: 3/5, Step: end | val_loss: 0.10805, score: 0.46576


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 4/5, Step: 40 | val_loss: 0.1084, score: 0.46664
Fold: 6, Epoch: 4/5, Step: 80 | val_loss: 0.1081, score: 0.466
Fold: 6, Epoch: 4/5, Step: 120 | val_loss: 0.1085, score: 0.46697
Fold: 6, Epoch: 4/5, Step: 160 | val_loss: 0.10753, score: 0.46469
Fold: 6, Epoch: 4/5, Step: 200 | val_loss: 0.10724, score: 0.46415
Fold: 6, Epoch: 4/5, Step: 240 | val_loss: 0.1071, score: 0.46374
Fold: 6, Epoch: 4/5, Step: 280 | val_loss: 0.10718, score: 0.46394
Fold: 6, Epoch: 4/5, Step: 320 | val_loss: 0.10702, score: 0.46362
Fold: 6, Epoch: 4/5, Step: 360 | val_loss: 0.10704, score: 0.46367
Fold: 6, Epoch: 4/5, Step: 400 | val_loss: 0.10705, score: 0.46368
Fold6, Epoch4/5 | train_loss: 0.08904
Fold: 6, Epoch: 4/5, Step: end | val_loss: 0.10703, score: 0.46364


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 0/5, Step: 40 | val_loss: 1.83071, score: 2.41988
Fold: 7, Epoch: 0/5, Step: 80 | val_loss: 0.28277, score: 0.766
Fold: 7, Epoch: 0/5, Step: 120 | val_loss: 0.15521, score: 0.56086
Fold: 7, Epoch: 0/5, Step: 160 | val_loss: 0.13262, score: 0.51667
Fold: 7, Epoch: 0/5, Step: 200 | val_loss: 0.11837, score: 0.48727
Fold: 7, Epoch: 0/5, Step: 240 | val_loss: 0.11329, score: 0.47756
Fold: 7, Epoch: 0/5, Step: 280 | val_loss: 0.10969, score: 0.46914
Fold: 7, Epoch: 0/5, Step: 320 | val_loss: 0.11224, score: 0.47493
Fold: 7, Epoch: 0/5, Step: 360 | val_loss: 0.10736, score: 0.46314
Fold: 7, Epoch: 0/5, Step: 400 | val_loss: 0.099, score: 0.44538
Fold7, Epoch0/5 | train_loss: 0.43053
Fold: 7, Epoch: 0/5, Step: end | val_loss: 0.10266, score: 0.45416


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 1/5, Step: 40 | val_loss: 0.11284, score: 0.47514
Fold: 7, Epoch: 1/5, Step: 80 | val_loss: 0.09887, score: 0.44503
Fold: 7, Epoch: 1/5, Step: 120 | val_loss: 0.11497, score: 0.48024
Fold: 7, Epoch: 1/5, Step: 160 | val_loss: 0.10039, score: 0.44815
Fold: 7, Epoch: 1/5, Step: 200 | val_loss: 0.11703, score: 0.48432
Fold: 7, Epoch: 1/5, Step: 240 | val_loss: 0.11287, score: 0.47484
Fold: 7, Epoch: 1/5, Step: 280 | val_loss: 0.10248, score: 0.45313
Fold: 7, Epoch: 1/5, Step: 320 | val_loss: 0.09899, score: 0.4455
Fold: 7, Epoch: 1/5, Step: 360 | val_loss: 0.1071, score: 0.46294
Fold: 7, Epoch: 1/5, Step: 400 | val_loss: 0.10586, score: 0.45949
Fold7, Epoch1/5 | train_loss: 0.11121
Fold: 7, Epoch: 1/5, Step: end | val_loss: 0.10562, score: 0.4596


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 2/5, Step: 40 | val_loss: 0.10039, score: 0.44849
Fold: 7, Epoch: 2/5, Step: 80 | val_loss: 0.11925, score: 0.48792
Fold: 7, Epoch: 2/5, Step: 120 | val_loss: 0.10223, score: 0.45181
Fold: 7, Epoch: 2/5, Step: 160 | val_loss: 0.09689, score: 0.44083
Fold: 7, Epoch: 2/5, Step: 200 | val_loss: 0.10288, score: 0.45378
Fold: 7, Epoch: 2/5, Step: 240 | val_loss: 0.09931, score: 0.44616
Fold: 7, Epoch: 2/5, Step: 280 | val_loss: 0.10311, score: 0.45437
Fold: 7, Epoch: 2/5, Step: 320 | val_loss: 0.09948, score: 0.44646
Fold: 7, Epoch: 2/5, Step: 360 | val_loss: 0.11482, score: 0.47899
Fold: 7, Epoch: 2/5, Step: 400 | val_loss: 0.10368, score: 0.45521
Fold7, Epoch2/5 | train_loss: 0.1026
Fold: 7, Epoch: 2/5, Step: end | val_loss: 0.0957, score: 0.43775


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 3/5, Step: 40 | val_loss: 0.10173, score: 0.4518
Fold: 7, Epoch: 3/5, Step: 80 | val_loss: 0.09706, score: 0.44085
Fold: 7, Epoch: 3/5, Step: 120 | val_loss: 0.10782, score: 0.4656
Fold: 7, Epoch: 3/5, Step: 160 | val_loss: 0.097, score: 0.44055
Fold: 7, Epoch: 3/5, Step: 200 | val_loss: 0.09652, score: 0.43947
Fold: 7, Epoch: 3/5, Step: 240 | val_loss: 0.09978, score: 0.44718
Fold: 7, Epoch: 3/5, Step: 280 | val_loss: 0.09625, score: 0.43913
Fold: 7, Epoch: 3/5, Step: 320 | val_loss: 0.09487, score: 0.43565
Fold: 7, Epoch: 3/5, Step: 360 | val_loss: 0.09572, score: 0.43782
Fold: 7, Epoch: 3/5, Step: 400 | val_loss: 0.09589, score: 0.43811
Fold7, Epoch3/5 | train_loss: 0.09464
Fold: 7, Epoch: 3/5, Step: end | val_loss: 0.10166, score: 0.45176


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 4/5, Step: 40 | val_loss: 0.09428, score: 0.43448
Fold: 7, Epoch: 4/5, Step: 80 | val_loss: 0.09627, score: 0.43925
Fold: 7, Epoch: 4/5, Step: 120 | val_loss: 0.09463, score: 0.43534
Fold: 7, Epoch: 4/5, Step: 160 | val_loss: 0.09655, score: 0.4398
Fold: 7, Epoch: 4/5, Step: 200 | val_loss: 0.09763, score: 0.44241
Fold: 7, Epoch: 4/5, Step: 240 | val_loss: 0.0947, score: 0.43549
Fold: 7, Epoch: 4/5, Step: 280 | val_loss: 0.09489, score: 0.43588
Fold: 7, Epoch: 4/5, Step: 320 | val_loss: 0.09488, score: 0.43591
Fold: 7, Epoch: 4/5, Step: 360 | val_loss: 0.0947, score: 0.43549
Fold: 7, Epoch: 4/5, Step: 400 | val_loss: 0.09464, score: 0.43537
Fold7, Epoch4/5 | train_loss: 0.08937
Fold: 7, Epoch: 4/5, Step: end | val_loss: 0.09463, score: 0.43533


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 0/5, Step: 40 | val_loss: 1.87154, score: 2.47367
Fold: 8, Epoch: 0/5, Step: 80 | val_loss: 0.27809, score: 0.77925
Fold: 8, Epoch: 0/5, Step: 120 | val_loss: 0.16119, score: 0.57663
Fold: 8, Epoch: 0/5, Step: 160 | val_loss: 0.13301, score: 0.518
Fold: 8, Epoch: 0/5, Step: 200 | val_loss: 0.12075, score: 0.49355
Fold: 8, Epoch: 0/5, Step: 240 | val_loss: 0.11468, score: 0.48011
Fold: 8, Epoch: 0/5, Step: 280 | val_loss: 0.11844, score: 0.48758
Fold: 8, Epoch: 0/5, Step: 320 | val_loss: 0.14952, score: 0.546
Fold: 8, Epoch: 0/5, Step: 360 | val_loss: 0.11187, score: 0.47371
Fold: 8, Epoch: 0/5, Step: 400 | val_loss: 0.14286, score: 0.53159
Fold8, Epoch0/5 | train_loss: 0.43217
Fold: 8, Epoch: 0/5, Step: end | val_loss: 0.10821, score: 0.46578


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 1/5, Step: 40 | val_loss: 0.10535, score: 0.45979
Fold: 8, Epoch: 1/5, Step: 80 | val_loss: 0.12065, score: 0.4909
Fold: 8, Epoch: 1/5, Step: 120 | val_loss: 0.10886, score: 0.46756
Fold: 8, Epoch: 1/5, Step: 160 | val_loss: 0.10547, score: 0.46009
Fold: 8, Epoch: 1/5, Step: 200 | val_loss: 0.11239, score: 0.47512
Fold: 8, Epoch: 1/5, Step: 240 | val_loss: 0.11056, score: 0.4716
Fold: 8, Epoch: 1/5, Step: 280 | val_loss: 0.10606, score: 0.4604
Fold: 8, Epoch: 1/5, Step: 320 | val_loss: 0.10916, score: 0.46753
Fold: 8, Epoch: 1/5, Step: 360 | val_loss: 0.10143, score: 0.45059
Fold: 8, Epoch: 1/5, Step: 400 | val_loss: 0.10782, score: 0.46451
Fold8, Epoch1/5 | train_loss: 0.11099
Fold: 8, Epoch: 1/5, Step: end | val_loss: 0.10518, score: 0.45886


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 2/5, Step: 40 | val_loss: 0.10407, score: 0.45686
Fold: 8, Epoch: 2/5, Step: 80 | val_loss: 0.10116, score: 0.45018
Fold: 8, Epoch: 2/5, Step: 120 | val_loss: 0.1026, score: 0.4535
Fold: 8, Epoch: 2/5, Step: 160 | val_loss: 0.10384, score: 0.45589
Fold: 8, Epoch: 2/5, Step: 200 | val_loss: 0.10177, score: 0.45156
Fold: 8, Epoch: 2/5, Step: 240 | val_loss: 0.10633, score: 0.46183
Fold: 8, Epoch: 2/5, Step: 280 | val_loss: 0.10692, score: 0.46294
Fold: 8, Epoch: 2/5, Step: 320 | val_loss: 0.10604, score: 0.46117
Fold: 8, Epoch: 2/5, Step: 360 | val_loss: 0.10529, score: 0.45959
Fold: 8, Epoch: 2/5, Step: 400 | val_loss: 0.10156, score: 0.45104
Fold8, Epoch2/5 | train_loss: 0.1021
Fold: 8, Epoch: 2/5, Step: end | val_loss: 0.10153, score: 0.45053


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 3/5, Step: 40 | val_loss: 0.1002, score: 0.44799
Fold: 8, Epoch: 3/5, Step: 80 | val_loss: 0.1, score: 0.44757
Fold: 8, Epoch: 3/5, Step: 120 | val_loss: 0.10548, score: 0.45954
Fold: 8, Epoch: 3/5, Step: 160 | val_loss: 0.09801, score: 0.44314
Fold: 8, Epoch: 3/5, Step: 200 | val_loss: 0.09882, score: 0.44496
Fold: 8, Epoch: 3/5, Step: 240 | val_loss: 0.10178, score: 0.45173
Fold: 8, Epoch: 3/5, Step: 280 | val_loss: 0.09916, score: 0.44561
Fold: 8, Epoch: 3/5, Step: 320 | val_loss: 0.10156, score: 0.45142
Fold: 8, Epoch: 3/5, Step: 360 | val_loss: 0.10059, score: 0.44888
Fold: 8, Epoch: 3/5, Step: 400 | val_loss: 0.10024, score: 0.44801
Fold8, Epoch3/5 | train_loss: 0.09415
Fold: 8, Epoch: 3/5, Step: end | val_loss: 0.09978, score: 0.44714


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 4/5, Step: 40 | val_loss: 0.09877, score: 0.4448
Fold: 8, Epoch: 4/5, Step: 80 | val_loss: 0.09807, score: 0.44316
Fold: 8, Epoch: 4/5, Step: 120 | val_loss: 0.09962, score: 0.4466
Fold: 8, Epoch: 4/5, Step: 160 | val_loss: 0.09869, score: 0.44455
Fold: 8, Epoch: 4/5, Step: 200 | val_loss: 0.09831, score: 0.4437
Fold: 8, Epoch: 4/5, Step: 240 | val_loss: 0.09883, score: 0.44487
Fold: 8, Epoch: 4/5, Step: 280 | val_loss: 0.0985, score: 0.44419
Fold: 8, Epoch: 4/5, Step: 320 | val_loss: 0.0984, score: 0.44395
Fold: 8, Epoch: 4/5, Step: 360 | val_loss: 0.09859, score: 0.44438
Fold: 8, Epoch: 4/5, Step: 400 | val_loss: 0.09849, score: 0.44413
Fold8, Epoch4/5 | train_loss: 0.08904
Fold: 8, Epoch: 4/5, Step: end | val_loss: 0.09847, score: 0.44409


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 0/5, Step: 40 | val_loss: 1.63614, score: 2.23924
Fold: 9, Epoch: 0/5, Step: 80 | val_loss: 0.2621, score: 0.74173
Fold: 9, Epoch: 0/5, Step: 120 | val_loss: 0.17994, score: 0.60943
Fold: 9, Epoch: 0/5, Step: 160 | val_loss: 0.13547, score: 0.52367
Fold: 9, Epoch: 0/5, Step: 200 | val_loss: 0.1286, score: 0.50866
Fold: 9, Epoch: 0/5, Step: 240 | val_loss: 0.12931, score: 0.51005
Fold: 9, Epoch: 0/5, Step: 280 | val_loss: 0.12627, score: 0.50458
Fold: 9, Epoch: 0/5, Step: 320 | val_loss: 0.13738, score: 0.52635
Fold: 9, Epoch: 0/5, Step: 360 | val_loss: 0.11966, score: 0.49061
Fold: 9, Epoch: 0/5, Step: 400 | val_loss: 0.11045, score: 0.47053
Fold9, Epoch0/5 | train_loss: 0.38887
Fold: 9, Epoch: 0/5, Step: end | val_loss: 0.13455, score: 0.51735


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 1/5, Step: 40 | val_loss: 0.11376, score: 0.47749
Fold: 9, Epoch: 1/5, Step: 80 | val_loss: 0.11229, score: 0.474
Fold: 9, Epoch: 1/5, Step: 120 | val_loss: 0.1292, score: 0.50542
Fold: 9, Epoch: 1/5, Step: 160 | val_loss: 0.11539, score: 0.48125
Fold: 9, Epoch: 1/5, Step: 200 | val_loss: 0.13094, score: 0.50876
Fold: 9, Epoch: 1/5, Step: 240 | val_loss: 0.10917, score: 0.46718
Fold: 9, Epoch: 1/5, Step: 280 | val_loss: 0.11805, score: 0.4855
Fold: 9, Epoch: 1/5, Step: 320 | val_loss: 0.10743, score: 0.46354
Fold: 9, Epoch: 1/5, Step: 360 | val_loss: 0.12209, score: 0.49599
Fold: 9, Epoch: 1/5, Step: 400 | val_loss: 0.11415, score: 0.4784
Fold9, Epoch1/5 | train_loss: 0.10987
Fold: 9, Epoch: 1/5, Step: end | val_loss: 0.12016, score: 0.49201


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 2/5, Step: 40 | val_loss: 0.11136, score: 0.47151
Fold: 9, Epoch: 2/5, Step: 80 | val_loss: 0.12699, score: 0.50334
Fold: 9, Epoch: 2/5, Step: 120 | val_loss: 0.11008, score: 0.4687
Fold: 9, Epoch: 2/5, Step: 160 | val_loss: 0.10913, score: 0.4678
Fold: 9, Epoch: 2/5, Step: 200 | val_loss: 0.15245, score: 0.5531
Fold: 9, Epoch: 2/5, Step: 240 | val_loss: 0.10754, score: 0.4636
Fold: 9, Epoch: 2/5, Step: 280 | val_loss: 0.10849, score: 0.46615
Fold: 9, Epoch: 2/5, Step: 320 | val_loss: 0.10759, score: 0.46362
Fold: 9, Epoch: 2/5, Step: 360 | val_loss: 0.11718, score: 0.48425
Fold: 9, Epoch: 2/5, Step: 400 | val_loss: 0.10793, score: 0.46407
Fold9, Epoch2/5 | train_loss: 0.10162
Fold: 9, Epoch: 2/5, Step: end | val_loss: 0.10932, score: 0.46712


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 3/5, Step: 40 | val_loss: 0.10742, score: 0.46309
Fold: 9, Epoch: 3/5, Step: 80 | val_loss: 0.1072, score: 0.46298
Fold: 9, Epoch: 3/5, Step: 120 | val_loss: 0.10585, score: 0.45994
Fold: 9, Epoch: 3/5, Step: 160 | val_loss: 0.10563, score: 0.45957
Fold: 9, Epoch: 3/5, Step: 200 | val_loss: 0.10897, score: 0.46653
Fold: 9, Epoch: 3/5, Step: 240 | val_loss: 0.10603, score: 0.46017
Fold: 9, Epoch: 3/5, Step: 280 | val_loss: 0.10377, score: 0.45533
Fold: 9, Epoch: 3/5, Step: 320 | val_loss: 0.10534, score: 0.45898
Fold: 9, Epoch: 3/5, Step: 360 | val_loss: 0.10465, score: 0.45724
Fold: 9, Epoch: 3/5, Step: 400 | val_loss: 0.10467, score: 0.45757
Fold9, Epoch3/5 | train_loss: 0.0935
Fold: 9, Epoch: 3/5, Step: end | val_loss: 0.10433, score: 0.45654


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 4/5, Step: 40 | val_loss: 0.10418, score: 0.45617
Fold: 9, Epoch: 4/5, Step: 80 | val_loss: 0.10524, score: 0.45854
Fold: 9, Epoch: 4/5, Step: 120 | val_loss: 0.10414, score: 0.45626
Fold: 9, Epoch: 4/5, Step: 160 | val_loss: 0.10432, score: 0.45651
Fold: 9, Epoch: 4/5, Step: 200 | val_loss: 0.10392, score: 0.45566
Fold: 9, Epoch: 4/5, Step: 240 | val_loss: 0.10344, score: 0.4546
Fold: 9, Epoch: 4/5, Step: 280 | val_loss: 0.10341, score: 0.45453
Fold: 9, Epoch: 4/5, Step: 320 | val_loss: 0.10362, score: 0.45496
Fold: 9, Epoch: 4/5, Step: 360 | val_loss: 0.10345, score: 0.45459
Fold: 9, Epoch: 4/5, Step: 400 | val_loss: 0.1034, score: 0.45449
Fold9, Epoch4/5 | train_loss: 0.08825
Fold: 9, Epoch: 4/5, Step: end | val_loss: 0.10338, score: 0.45444
fold score: [0.4445799905123669, 0.4558004080321796, 0.45298656692709866, 0.44341678267524187, 0.4546562491612033, 0.4338745606923188, 0.46362314958143613, 0.43448281107693754, 0.44314068714959387, 0.45443651806387486]
CV: 0.4483

Starting upload for file tokenizer.tar


100%|██████████| 11.4M/11.4M [00:04<00:00, 2.80MB/s]


Upload successful: tokenizer.tar (11MB)
Starting upload for file preds.tar


100%|██████████| 180k/180k [00:02<00:00, 67.9kB/s] 


Upload successful: preds.tar (180KB)
Starting upload for file model.tar


100%|██████████| 16.2G/16.2G [41:40<00:00, 6.96MB/s]   


Upload successful: model.tar (16GB)
Starting upload for file fig.tar


ReadTimeoutError: HTTPSConnectionPool(host='www.kaggle.com', port=443): Read timed out. (read timeout=None)