In [1]:
! nvidia-smi

Sun Nov 20 12:10:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  Off |
|  0%   33C    P8    29W / 480W |   1030MiB / 24564MiB |     19%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os

class Config:
    AUTHOR = "shu421"

    EXP = "exp159"
    NAME = "feedback3-Exp159-deberta-v3-base"
    # MODEL_PATH = "microsoft/deberta-v3-base"
    MODEL_PATH = "funnel-transformer/medium"
    DATASET_PATH = []

    COMPETITION = "feedback-prize-english-language-learning"
    BASE_PATH = '/root/feedback3/'

    api_path = "/root/.kaggle/kaggle.json"

    apex=True
    seed = 42
    num_fold = 10
    trn_fold = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    batch_size = 8
    n_epochs = 5
    max_len = 1024
    target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    
    weight_decay = 0.01
    scheduler='cosine'
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 80
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

    upload_from_colab = True

In [5]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error

# ! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# ! pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

from kaggle.api.kaggle_api_extended import KaggleApi



In [6]:
def setup(cfg):
    # cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']

    # set dirs
    # cfg.DRIVE = cfg.DRIVE_PATH
    # cfg.EXP = (cfg.NAME if cfg.NAME is not None 
    #     else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
    # )
    cfg.INPUT = os.path.join(cfg.BASE_PATH, 'input')
    cfg.OUTPUT = os.path.join(cfg.BASE_PATH, 'output')
    cfg.SUBMISSION = os.path.join(cfg.BASE_PATH, 'submission')
    cfg.DATASET = os.path.join(cfg.BASE_PATH, 'dataset')

    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # make dirs
    for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    
    if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
        # load dataset
        ! pip install --upgrade --force-reinstall --no-deps kaggle
        ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
        filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
        ! unzip -d $cfg.INPUT $filepath
        
    
    for path in cfg.DATASET_PATH:
        datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
        if not os.path.exists(datasetpath):
            os.makedirs(datasetpath, exist_ok=True)
            ! kaggle datasets download $path -p $datasetpath
            filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
            ! unzip -d $datasetpath $filepath
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [7]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_multilabelstratifiedkfold(train, target_col, n_splits, seed):
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

In [8]:
def mcrmse(cfg, preds, df):
    all_score = 0
    for i, column in enumerate(cfg.target_list):
        score = np.sqrt(mean_squared_error(preds[:, i], df[column]))
        all_score += score/len(cfg.target_list)
    return all_score

In [9]:
# 文章のバグを治す
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [10]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['full_text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

# dataset
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.text[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [11]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )
        
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.transformer.encoder.layer[:8])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels):
        # batch, hidden_size
        feature = self.feature(inputs)
        output = self.fc(self.ln(feature))
        if labels is not None:
            loss_fct = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

In [12]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

# initialize layer
def reinit_bert(model):
    for layer in model.model.encoder.layer[-1:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    return model

def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

In [13]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [14]:
def valid_fn(cfg, valid_loader, model, valid_df, fold, epoch, step, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        # with tqdm(valid_loader, total=len(valid_loader)) as pbar:
        # for (inputs, labels) in pbar:
        for (inputs, labels) in valid_loader:
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast():
                loss, output = model(inputs, labels)
            
            output = output.detach().cpu().numpy()
            val_preds.append(output)
            val_losses.append(loss.item() * len(labels))
            val_nums.append(len(labels))
                # pbar.set_postfix({
                #     'val_loss': loss.item()
                # })

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    # val_log = {
    #     'val_loss': val_loss,
    #     'mcrmse': score
    # }
    # display(val_log)
    print(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epochs}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if best_val_score > score:
        # print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(), 
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
    
    return best_val_preds, best_val_score

def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 6), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        print('='*30, f'Fold{fold}', '='*30)
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        model = reinit_bert(model)
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)

        # enable FGM
        fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            print('='*20, f'epoch{epoch}', '='*20)
            train_losses = []
            train_nums = []
            model.train() 
            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex):
                        loss, output = model(inputs, labels)

                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                     # FGM attack
                    fgm.attack()
                    with autocast(enabled=cfg.apex):
                        loss_adv, _ = model(inputs, labels)
                    scaler.scale(loss_adv).backward()
                    fgm.restore()
                    
                    if cfg.clip_grad_norm is not None:
                        # scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(),
                            cfg.clip_grad_norm
                        )
                        
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        best_val_preds, best_val_score = valid_fn(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            epoch,
                            step,
                            best_val_preds,
                            best_val_score,
                        )
                        model.train()

            train_loss = sum(train_losses)/sum(train_nums)
            # train_log = {
            #     'train_loss':train_loss
            # }
            # display(train_log)

            # evaluating(epoch)
            print(f'Fold{fold}, Epoch{epoch}/{cfg.n_epochs} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score = valid_fn(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model; gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    print('fold score：', fold_score)
    print('CV:', round(score, 4))
    return score

In [15]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
from transformers import logging
logging.set_verbosity_warning()
logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true


print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

train = processing_features(train)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

score = training(cfg, train)

if cfg.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

env: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 80 | val_loss: 0.17514, score: 0.59782
Fold: 0, Epoch: 0/5, Step: 160 | val_loss: 0.12283, score: 0.49668
Fold: 0, Epoch: 0/5, Step: 240 | val_loss: 0.13218, score: 0.5143
Fold: 0, Epoch: 0/5, Step: 320 | val_loss: 0.11004, score: 0.46888
Fold: 0, Epoch: 0/5, Step: 400 | val_loss: 0.10303, score: 0.45411
Fold0, Epoch0/5 | train_loss: 0.34459
Fold: 0, Epoch: 0/5, Step: end | val_loss: 0.14226, score: 0.53352


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 80 | val_loss: 0.10102, score: 0.44996
Fold: 0, Epoch: 1/5, Step: 160 | val_loss: 0.10036, score: 0.44836
Fold: 0, Epoch: 1/5, Step: 240 | val_loss: 0.10037, score: 0.44818
Fold: 0, Epoch: 1/5, Step: 320 | val_loss: 0.0997, score: 0.4468
Fold: 0, Epoch: 1/5, Step: 400 | val_loss: 0.09978, score: 0.44694
Fold0, Epoch1/5 | train_loss: 0.10237
Fold: 0, Epoch: 1/5, Step: end | val_loss: 0.09979, score: 0.44695


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 80 | val_loss: 0.11359, score: 0.47741
Fold: 0, Epoch: 2/5, Step: 160 | val_loss: 0.10075, score: 0.44902
Fold: 0, Epoch: 2/5, Step: 240 | val_loss: 0.11472, score: 0.47991
Fold: 0, Epoch: 2/5, Step: 320 | val_loss: 0.10063, score: 0.44898
Fold: 0, Epoch: 2/5, Step: 400 | val_loss: 0.10316, score: 0.45454
Fold0, Epoch2/5 | train_loss: 0.09571
Fold: 0, Epoch: 2/5, Step: end | val_loss: 0.10342, score: 0.45504


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 80 | val_loss: 0.10651, score: 0.4621
Fold: 0, Epoch: 3/5, Step: 160 | val_loss: 0.10306, score: 0.45422
Fold: 0, Epoch: 3/5, Step: 240 | val_loss: 0.10038, score: 0.44817
Fold: 0, Epoch: 3/5, Step: 320 | val_loss: 0.10478, score: 0.45818
Fold: 0, Epoch: 3/5, Step: 400 | val_loss: 0.10215, score: 0.45238
Fold0, Epoch3/5 | train_loss: 0.09063
Fold: 0, Epoch: 3/5, Step: end | val_loss: 0.10052, score: 0.44879


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 80 | val_loss: 0.10029, score: 0.44823
Fold: 0, Epoch: 4/5, Step: 160 | val_loss: 0.10065, score: 0.44898
Fold: 0, Epoch: 4/5, Step: 240 | val_loss: 0.10006, score: 0.44759
Fold: 0, Epoch: 4/5, Step: 320 | val_loss: 0.10019, score: 0.44788
Fold: 0, Epoch: 4/5, Step: 400 | val_loss: 0.10011, score: 0.44771
Fold0, Epoch4/5 | train_loss: 0.08595
Fold: 0, Epoch: 4/5, Step: end | val_loss: 0.10011, score: 0.4477


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: 80 | val_loss: 0.18445, score: 0.61974
Fold: 1, Epoch: 0/5, Step: 160 | val_loss: 0.12481, score: 0.50091
Fold: 1, Epoch: 0/5, Step: 240 | val_loss: 0.21844, score: 0.65999
Fold: 1, Epoch: 0/5, Step: 320 | val_loss: 0.11304, score: 0.47626
Fold: 1, Epoch: 0/5, Step: 400 | val_loss: 0.12349, score: 0.49772
Fold1, Epoch0/5 | train_loss: 0.38617
Fold: 1, Epoch: 0/5, Step: end | val_loss: 0.10757, score: 0.46433


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: 80 | val_loss: 0.11355, score: 0.47707
Fold: 1, Epoch: 1/5, Step: 160 | val_loss: 0.10897, score: 0.46758
Fold: 1, Epoch: 1/5, Step: 240 | val_loss: 0.10824, score: 0.46557
Fold: 1, Epoch: 1/5, Step: 320 | val_loss: 0.10715, score: 0.46355
Fold: 1, Epoch: 1/5, Step: 400 | val_loss: 0.10509, score: 0.45897
Fold1, Epoch1/5 | train_loss: 0.10311
Fold: 1, Epoch: 1/5, Step: end | val_loss: 0.10953, score: 0.46879


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: 80 | val_loss: 0.10634, score: 0.46151
Fold: 1, Epoch: 2/5, Step: 160 | val_loss: 0.1054, score: 0.45935
Fold: 1, Epoch: 2/5, Step: 240 | val_loss: 0.10847, score: 0.46662
Fold: 1, Epoch: 2/5, Step: 320 | val_loss: 0.10734, score: 0.46416
Fold: 1, Epoch: 2/5, Step: 400 | val_loss: 0.10553, score: 0.46016
Fold1, Epoch2/5 | train_loss: 0.09636
Fold: 1, Epoch: 2/5, Step: end | val_loss: 0.10516, score: 0.45937


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: 80 | val_loss: 0.10668, score: 0.46281
Fold: 1, Epoch: 3/5, Step: 160 | val_loss: 0.10583, score: 0.46052
Fold: 1, Epoch: 3/5, Step: 240 | val_loss: 0.10315, score: 0.45487
Fold: 1, Epoch: 3/5, Step: 320 | val_loss: 0.10264, score: 0.45367
Fold: 1, Epoch: 3/5, Step: 400 | val_loss: 0.1033, score: 0.45517
Fold1, Epoch3/5 | train_loss: 0.09039
Fold: 1, Epoch: 3/5, Step: end | val_loss: 0.10325, score: 0.45495


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: 80 | val_loss: 0.10285, score: 0.45406
Fold: 1, Epoch: 4/5, Step: 160 | val_loss: 0.10287, score: 0.45419
Fold: 1, Epoch: 4/5, Step: 240 | val_loss: 0.10239, score: 0.45311
Fold: 1, Epoch: 4/5, Step: 320 | val_loss: 0.10227, score: 0.45285
Fold: 1, Epoch: 4/5, Step: 400 | val_loss: 0.10231, score: 0.45295
Fold1, Epoch4/5 | train_loss: 0.08663
Fold: 1, Epoch: 4/5, Step: end | val_loss: 0.10233, score: 0.45299


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: 80 | val_loss: 0.17609, score: 0.60143
Fold: 2, Epoch: 0/5, Step: 160 | val_loss: 0.15176, score: 0.55334
Fold: 2, Epoch: 0/5, Step: 240 | val_loss: 0.11285, score: 0.47643
Fold: 2, Epoch: 0/5, Step: 320 | val_loss: 0.1173, score: 0.48602
Fold: 2, Epoch: 0/5, Step: 400 | val_loss: 0.11237, score: 0.47533
Fold2, Epoch0/5 | train_loss: 0.36991
Fold: 2, Epoch: 0/5, Step: end | val_loss: 0.17744, score: 0.59414


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: 80 | val_loss: 0.10778, score: 0.46546
Fold: 2, Epoch: 1/5, Step: 160 | val_loss: 0.10929, score: 0.46853
Fold: 2, Epoch: 1/5, Step: 240 | val_loss: 0.10731, score: 0.46438
Fold: 2, Epoch: 1/5, Step: 320 | val_loss: 0.1085, score: 0.46714
Fold: 2, Epoch: 1/5, Step: 400 | val_loss: 0.11168, score: 0.47317
Fold2, Epoch1/5 | train_loss: 0.10688
Fold: 2, Epoch: 1/5, Step: end | val_loss: 0.10905, score: 0.4681


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: 80 | val_loss: 0.10693, score: 0.46376
Fold: 2, Epoch: 2/5, Step: 160 | val_loss: 0.1049, score: 0.45917
Fold: 2, Epoch: 2/5, Step: 240 | val_loss: 0.10534, score: 0.46021
Fold: 2, Epoch: 2/5, Step: 320 | val_loss: 0.10405, score: 0.45731
Fold: 2, Epoch: 2/5, Step: 400 | val_loss: 0.10529, score: 0.4601
Fold2, Epoch2/5 | train_loss: 0.09364
Fold: 2, Epoch: 2/5, Step: end | val_loss: 0.10325, score: 0.45556


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: 80 | val_loss: 0.10462, score: 0.45835
Fold: 2, Epoch: 3/5, Step: 160 | val_loss: 0.10462, score: 0.45841
Fold: 2, Epoch: 3/5, Step: 240 | val_loss: 0.10427, score: 0.45753
Fold: 2, Epoch: 3/5, Step: 320 | val_loss: 0.10361, score: 0.4562
Fold: 2, Epoch: 3/5, Step: 400 | val_loss: 0.10452, score: 0.4584
Fold2, Epoch3/5 | train_loss: 0.0879
Fold: 2, Epoch: 3/5, Step: end | val_loss: 0.1062, score: 0.46183


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: 80 | val_loss: 0.10314, score: 0.45522
Fold: 2, Epoch: 4/5, Step: 160 | val_loss: 0.10457, score: 0.45819
Fold: 2, Epoch: 4/5, Step: 240 | val_loss: 0.10316, score: 0.45521
Fold: 2, Epoch: 4/5, Step: 320 | val_loss: 0.10391, score: 0.45683
Fold: 2, Epoch: 4/5, Step: 400 | val_loss: 0.10398, score: 0.45696
Fold2, Epoch4/5 | train_loss: 0.08477
Fold: 2, Epoch: 4/5, Step: end | val_loss: 0.10396, score: 0.45692


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: 80 | val_loss: 0.18571, score: 0.61569
Fold: 3, Epoch: 0/5, Step: 160 | val_loss: 0.12908, score: 0.50786
Fold: 3, Epoch: 0/5, Step: 240 | val_loss: 0.13081, score: 0.5109
Fold: 3, Epoch: 0/5, Step: 320 | val_loss: 0.11223, score: 0.4743
Fold: 3, Epoch: 0/5, Step: 400 | val_loss: 0.11069, score: 0.47111
Fold3, Epoch0/5 | train_loss: 0.38896
Fold: 3, Epoch: 0/5, Step: end | val_loss: 0.11171, score: 0.47356


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: 80 | val_loss: 0.10606, score: 0.46099
Fold: 3, Epoch: 1/5, Step: 160 | val_loss: 0.10974, score: 0.46902
Fold: 3, Epoch: 1/5, Step: 240 | val_loss: 0.11022, score: 0.46948
Fold: 3, Epoch: 1/5, Step: 320 | val_loss: 0.10405, score: 0.45627
Fold: 3, Epoch: 1/5, Step: 400 | val_loss: 0.10513, score: 0.45838
Fold3, Epoch1/5 | train_loss: 0.10421
Fold: 3, Epoch: 1/5, Step: end | val_loss: 0.1107, score: 0.47022


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: 80 | val_loss: 0.10184, score: 0.45147
Fold: 3, Epoch: 2/5, Step: 160 | val_loss: 0.10299, score: 0.45419
Fold: 3, Epoch: 2/5, Step: 240 | val_loss: 0.10067, score: 0.44864
Fold: 3, Epoch: 2/5, Step: 320 | val_loss: 0.11544, score: 0.47998
Fold: 3, Epoch: 2/5, Step: 400 | val_loss: 0.10154, score: 0.4505
Fold3, Epoch2/5 | train_loss: 0.09468
Fold: 3, Epoch: 2/5, Step: end | val_loss: 0.10244, score: 0.45289


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: 80 | val_loss: 0.09836, score: 0.44361
Fold: 3, Epoch: 3/5, Step: 160 | val_loss: 0.09954, score: 0.4462
Fold: 3, Epoch: 3/5, Step: 240 | val_loss: 0.09848, score: 0.44384
Fold: 3, Epoch: 3/5, Step: 320 | val_loss: 0.10047, score: 0.44841
Fold: 3, Epoch: 3/5, Step: 400 | val_loss: 0.09908, score: 0.44521
Fold3, Epoch3/5 | train_loss: 0.08816
Fold: 3, Epoch: 3/5, Step: end | val_loss: 0.10348, score: 0.45517


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: 80 | val_loss: 0.09945, score: 0.44605
Fold: 3, Epoch: 4/5, Step: 160 | val_loss: 0.09887, score: 0.44469
Fold: 3, Epoch: 4/5, Step: 240 | val_loss: 0.09819, score: 0.44318
Fold: 3, Epoch: 4/5, Step: 320 | val_loss: 0.0981, score: 0.44297
Fold: 3, Epoch: 4/5, Step: 400 | val_loss: 0.09819, score: 0.44315
Fold3, Epoch4/5 | train_loss: 0.08416
Fold: 3, Epoch: 4/5, Step: end | val_loss: 0.0982, score: 0.44318


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: 80 | val_loss: 0.19452, score: 0.63179
Fold: 4, Epoch: 0/5, Step: 160 | val_loss: 0.1255, score: 0.50385
Fold: 4, Epoch: 0/5, Step: 240 | val_loss: 0.12011, score: 0.49238
Fold: 4, Epoch: 0/5, Step: 320 | val_loss: 0.11897, score: 0.49047
Fold: 4, Epoch: 0/5, Step: 400 | val_loss: 0.12009, score: 0.49152
Fold4, Epoch0/5 | train_loss: 0.40276
Fold: 4, Epoch: 0/5, Step: end | val_loss: 0.12488, score: 0.50096


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: 80 | val_loss: 0.11824, score: 0.48908
Fold: 4, Epoch: 1/5, Step: 160 | val_loss: 0.12255, score: 0.49803
Fold: 4, Epoch: 1/5, Step: 240 | val_loss: 0.11047, score: 0.47174
Fold: 4, Epoch: 1/5, Step: 320 | val_loss: 0.10786, score: 0.46574
Fold: 4, Epoch: 1/5, Step: 400 | val_loss: 0.12828, score: 0.50859
Fold4, Epoch1/5 | train_loss: 0.10408
Fold: 4, Epoch: 1/5, Step: end | val_loss: 0.10968, score: 0.47024


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: 80 | val_loss: 0.10557, score: 0.46091
Fold: 4, Epoch: 2/5, Step: 160 | val_loss: 0.10484, score: 0.45926
Fold: 4, Epoch: 2/5, Step: 240 | val_loss: 0.10764, score: 0.46544
Fold: 4, Epoch: 2/5, Step: 320 | val_loss: 0.1053, score: 0.4604
Fold: 4, Epoch: 2/5, Step: 400 | val_loss: 0.10426, score: 0.45799
Fold4, Epoch2/5 | train_loss: 0.09371
Fold: 4, Epoch: 2/5, Step: end | val_loss: 0.10541, score: 0.46051


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: 80 | val_loss: 0.10309, score: 0.4551
Fold: 4, Epoch: 3/5, Step: 160 | val_loss: 0.10549, score: 0.4606
Fold: 4, Epoch: 3/5, Step: 240 | val_loss: 0.10411, score: 0.45765
Fold: 4, Epoch: 3/5, Step: 320 | val_loss: 0.10306, score: 0.45527
Fold: 4, Epoch: 3/5, Step: 400 | val_loss: 0.1039, score: 0.45719
Fold4, Epoch3/5 | train_loss: 0.0887
Fold: 4, Epoch: 3/5, Step: end | val_loss: 0.10325, score: 0.45582


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: 80 | val_loss: 0.10379, score: 0.45718
Fold: 4, Epoch: 4/5, Step: 160 | val_loss: 0.10241, score: 0.45385
Fold: 4, Epoch: 4/5, Step: 240 | val_loss: 0.10257, score: 0.45425
Fold: 4, Epoch: 4/5, Step: 320 | val_loss: 0.10249, score: 0.45407
Fold: 4, Epoch: 4/5, Step: 400 | val_loss: 0.1024, score: 0.45386
Fold4, Epoch4/5 | train_loss: 0.08511
Fold: 4, Epoch: 4/5, Step: end | val_loss: 0.10239, score: 0.45385


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 0/5, Step: 80 | val_loss: 0.21629, score: 0.66711
Fold: 5, Epoch: 0/5, Step: 160 | val_loss: 0.1207, score: 0.49277
Fold: 5, Epoch: 0/5, Step: 240 | val_loss: 0.11614, score: 0.48343
Fold: 5, Epoch: 0/5, Step: 320 | val_loss: 0.11534, score: 0.47973
Fold: 5, Epoch: 0/5, Step: 400 | val_loss: 0.10422, score: 0.45666
Fold5, Epoch0/5 | train_loss: 0.4184
Fold: 5, Epoch: 0/5, Step: end | val_loss: 0.11974, score: 0.48896


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 1/5, Step: 80 | val_loss: 0.1165, score: 0.48379
Fold: 5, Epoch: 1/5, Step: 160 | val_loss: 0.09966, score: 0.44677
Fold: 5, Epoch: 1/5, Step: 240 | val_loss: 0.09917, score: 0.44565
Fold: 5, Epoch: 1/5, Step: 320 | val_loss: 0.09867, score: 0.44417
Fold: 5, Epoch: 1/5, Step: 400 | val_loss: 0.09951, score: 0.44638
Fold5, Epoch1/5 | train_loss: 0.10316
Fold: 5, Epoch: 1/5, Step: end | val_loss: 0.09717, score: 0.44102


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 2/5, Step: 80 | val_loss: 0.09707, score: 0.44045
Fold: 5, Epoch: 2/5, Step: 160 | val_loss: 0.09664, score: 0.43954
Fold: 5, Epoch: 2/5, Step: 240 | val_loss: 0.10019, score: 0.44788
Fold: 5, Epoch: 2/5, Step: 320 | val_loss: 0.10425, score: 0.45713
Fold: 5, Epoch: 2/5, Step: 400 | val_loss: 0.09876, score: 0.44498
Fold5, Epoch2/5 | train_loss: 0.09698
Fold: 5, Epoch: 2/5, Step: end | val_loss: 0.10053, score: 0.44876


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 3/5, Step: 80 | val_loss: 0.09767, score: 0.4422
Fold: 5, Epoch: 3/5, Step: 160 | val_loss: 0.09752, score: 0.44186
Fold: 5, Epoch: 3/5, Step: 240 | val_loss: 0.09673, score: 0.43996
Fold: 5, Epoch: 3/5, Step: 320 | val_loss: 0.09621, score: 0.43871
Fold: 5, Epoch: 3/5, Step: 400 | val_loss: 0.0962, score: 0.43879
Fold5, Epoch3/5 | train_loss: 0.09186
Fold: 5, Epoch: 3/5, Step: end | val_loss: 0.09876, score: 0.4447


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 4/5, Step: 80 | val_loss: 0.09571, score: 0.43752
Fold: 5, Epoch: 4/5, Step: 160 | val_loss: 0.0958, score: 0.43781
Fold: 5, Epoch: 4/5, Step: 240 | val_loss: 0.0955, score: 0.43718
Fold: 5, Epoch: 4/5, Step: 320 | val_loss: 0.09671, score: 0.44006
Fold: 5, Epoch: 4/5, Step: 400 | val_loss: 0.09581, score: 0.43791
Fold5, Epoch4/5 | train_loss: 0.08817
Fold: 5, Epoch: 4/5, Step: end | val_loss: 0.09581, score: 0.43792


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 0/5, Step: 80 | val_loss: 0.2216, score: 0.67349
Fold: 6, Epoch: 0/5, Step: 160 | val_loss: 0.15014, score: 0.55212
Fold: 6, Epoch: 0/5, Step: 240 | val_loss: 0.12369, score: 0.49893
Fold: 6, Epoch: 0/5, Step: 320 | val_loss: 0.12147, score: 0.49414
Fold: 6, Epoch: 0/5, Step: 400 | val_loss: 0.12752, score: 0.50714
Fold6, Epoch0/5 | train_loss: 0.42578
Fold: 6, Epoch: 0/5, Step: end | val_loss: 0.12595, score: 0.50178


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 1/5, Step: 80 | val_loss: 0.11927, score: 0.49016
Fold: 6, Epoch: 1/5, Step: 160 | val_loss: 0.12012, score: 0.49142
Fold: 6, Epoch: 1/5, Step: 240 | val_loss: 0.111, score: 0.47245
Fold: 6, Epoch: 1/5, Step: 320 | val_loss: 0.11131, score: 0.47281
Fold: 6, Epoch: 1/5, Step: 400 | val_loss: 0.1139, score: 0.47811
Fold6, Epoch1/5 | train_loss: 0.10083
Fold: 6, Epoch: 1/5, Step: end | val_loss: 0.11371, score: 0.47808


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 2/5, Step: 80 | val_loss: 0.1081, score: 0.46597
Fold: 6, Epoch: 2/5, Step: 160 | val_loss: 0.11974, score: 0.48941
Fold: 6, Epoch: 2/5, Step: 240 | val_loss: 0.11297, score: 0.47644
Fold: 6, Epoch: 2/5, Step: 320 | val_loss: 0.10965, score: 0.46946
Fold: 6, Epoch: 2/5, Step: 400 | val_loss: 0.10996, score: 0.47014
Fold6, Epoch2/5 | train_loss: 0.0957
Fold: 6, Epoch: 2/5, Step: end | val_loss: 0.10905, score: 0.46824


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 3/5, Step: 80 | val_loss: 0.11035, score: 0.47061
Fold: 6, Epoch: 3/5, Step: 160 | val_loss: 0.10779, score: 0.46548
Fold: 6, Epoch: 3/5, Step: 240 | val_loss: 0.10896, score: 0.46786
Fold: 6, Epoch: 3/5, Step: 320 | val_loss: 0.107, score: 0.46349
Fold: 6, Epoch: 3/5, Step: 400 | val_loss: 0.10696, score: 0.46357
Fold6, Epoch3/5 | train_loss: 0.09026
Fold: 6, Epoch: 3/5, Step: end | val_loss: 0.10693, score: 0.46355


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 4/5, Step: 80 | val_loss: 0.10689, score: 0.46343
Fold: 6, Epoch: 4/5, Step: 160 | val_loss: 0.10652, score: 0.46269
Fold: 6, Epoch: 4/5, Step: 240 | val_loss: 0.10647, score: 0.46255
Fold: 6, Epoch: 4/5, Step: 320 | val_loss: 0.10659, score: 0.46279
Fold: 6, Epoch: 4/5, Step: 400 | val_loss: 0.10662, score: 0.46285
Fold6, Epoch4/5 | train_loss: 0.08611
Fold: 6, Epoch: 4/5, Step: end | val_loss: 0.10663, score: 0.46288


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 0/5, Step: 80 | val_loss: 0.25376, score: 0.72718
Fold: 7, Epoch: 0/5, Step: 160 | val_loss: 0.12366, score: 0.49961
Fold: 7, Epoch: 0/5, Step: 240 | val_loss: 0.10754, score: 0.46472
Fold: 7, Epoch: 0/5, Step: 320 | val_loss: 0.11051, score: 0.47139
Fold: 7, Epoch: 0/5, Step: 400 | val_loss: 0.11437, score: 0.47804
Fold7, Epoch0/5 | train_loss: 0.40705
Fold: 7, Epoch: 0/5, Step: end | val_loss: 0.10553, score: 0.45987


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 1/5, Step: 80 | val_loss: 0.10068, score: 0.44924
Fold: 7, Epoch: 1/5, Step: 160 | val_loss: 0.0997, score: 0.44728
Fold: 7, Epoch: 1/5, Step: 240 | val_loss: 0.09992, score: 0.44748
Fold: 7, Epoch: 1/5, Step: 320 | val_loss: 0.09729, score: 0.44136
Fold: 7, Epoch: 1/5, Step: 400 | val_loss: 0.09939, score: 0.44617
Fold7, Epoch1/5 | train_loss: 0.10738
Fold: 7, Epoch: 1/5, Step: end | val_loss: 0.10045, score: 0.44849


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 2/5, Step: 80 | val_loss: 0.09941, score: 0.44661
Fold: 7, Epoch: 2/5, Step: 160 | val_loss: 0.10379, score: 0.45511
Fold: 7, Epoch: 2/5, Step: 240 | val_loss: 0.10762, score: 0.46318
Fold: 7, Epoch: 2/5, Step: 320 | val_loss: 0.09702, score: 0.44079
Fold: 7, Epoch: 2/5, Step: 400 | val_loss: 0.09913, score: 0.44602
Fold7, Epoch2/5 | train_loss: 0.09693
Fold: 7, Epoch: 2/5, Step: end | val_loss: 0.09893, score: 0.4455


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 3/5, Step: 80 | val_loss: 0.10432, score: 0.45788
Fold: 7, Epoch: 3/5, Step: 160 | val_loss: 0.09876, score: 0.4447
Fold: 7, Epoch: 3/5, Step: 240 | val_loss: 0.10013, score: 0.448
Fold: 7, Epoch: 3/5, Step: 320 | val_loss: 0.09719, score: 0.44124
Fold: 7, Epoch: 3/5, Step: 400 | val_loss: 0.09588, score: 0.43833
Fold7, Epoch3/5 | train_loss: 0.09054
Fold: 7, Epoch: 3/5, Step: end | val_loss: 0.09663, score: 0.44008


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 4/5, Step: 80 | val_loss: 0.09661, score: 0.44007
Fold: 7, Epoch: 4/5, Step: 160 | val_loss: 0.09616, score: 0.43901
Fold: 7, Epoch: 4/5, Step: 240 | val_loss: 0.09561, score: 0.43766
Fold: 7, Epoch: 4/5, Step: 320 | val_loss: 0.09638, score: 0.43948
Fold: 7, Epoch: 4/5, Step: 400 | val_loss: 0.0962, score: 0.43904
Fold7, Epoch4/5 | train_loss: 0.08385
Fold: 7, Epoch: 4/5, Step: end | val_loss: 0.09618, score: 0.43899


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 0/5, Step: 80 | val_loss: 0.19821, score: 0.63977
Fold: 8, Epoch: 0/5, Step: 160 | val_loss: 0.15963, score: 0.56685
Fold: 8, Epoch: 0/5, Step: 240 | val_loss: 0.13056, score: 0.51339
Fold: 8, Epoch: 0/5, Step: 320 | val_loss: 0.11333, score: 0.47671
Fold: 8, Epoch: 0/5, Step: 400 | val_loss: 0.11731, score: 0.48506
Fold8, Epoch0/5 | train_loss: 0.40333
Fold: 8, Epoch: 0/5, Step: end | val_loss: 0.12238, score: 0.49481


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 1/5, Step: 80 | val_loss: 0.10718, score: 0.46342
Fold: 8, Epoch: 1/5, Step: 160 | val_loss: 0.10411, score: 0.45681
Fold: 8, Epoch: 1/5, Step: 240 | val_loss: 0.10374, score: 0.4559
Fold: 8, Epoch: 1/5, Step: 320 | val_loss: 0.1035, score: 0.45525
Fold: 8, Epoch: 1/5, Step: 400 | val_loss: 0.10395, score: 0.45626
Fold8, Epoch1/5 | train_loss: 0.10206
Fold: 8, Epoch: 1/5, Step: end | val_loss: 0.10571, score: 0.46031


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 2/5, Step: 80 | val_loss: 0.10035, score: 0.44818
Fold: 8, Epoch: 2/5, Step: 160 | val_loss: 0.1037, score: 0.45567
Fold: 8, Epoch: 2/5, Step: 240 | val_loss: 0.10117, score: 0.44988
Fold: 8, Epoch: 2/5, Step: 320 | val_loss: 0.10465, score: 0.4576
Fold: 8, Epoch: 2/5, Step: 400 | val_loss: 0.12198, score: 0.49447
Fold8, Epoch2/5 | train_loss: 0.09666
Fold: 8, Epoch: 2/5, Step: end | val_loss: 0.09977, score: 0.44676


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 3/5, Step: 80 | val_loss: 0.1011, score: 0.44983
Fold: 8, Epoch: 3/5, Step: 160 | val_loss: 0.10822, score: 0.46575
Fold: 8, Epoch: 3/5, Step: 240 | val_loss: 0.10011, score: 0.44756
Fold: 8, Epoch: 3/5, Step: 320 | val_loss: 0.09977, score: 0.44684
Fold: 8, Epoch: 3/5, Step: 400 | val_loss: 0.1, score: 0.44746
Fold8, Epoch3/5 | train_loss: 0.09144
Fold: 8, Epoch: 3/5, Step: end | val_loss: 0.10075, score: 0.44899


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 4/5, Step: 80 | val_loss: 0.09949, score: 0.44618
Fold: 8, Epoch: 4/5, Step: 160 | val_loss: 0.09913, score: 0.4454
Fold: 8, Epoch: 4/5, Step: 240 | val_loss: 0.09912, score: 0.44531
Fold: 8, Epoch: 4/5, Step: 320 | val_loss: 0.09896, score: 0.44496
Fold: 8, Epoch: 4/5, Step: 400 | val_loss: 0.0991, score: 0.44528
Fold8, Epoch4/5 | train_loss: 0.0879
Fold: 8, Epoch: 4/5, Step: end | val_loss: 0.09912, score: 0.44531


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 0/5, Step: 80 | val_loss: 0.26213, score: 0.73724
Fold: 9, Epoch: 0/5, Step: 160 | val_loss: 0.14535, score: 0.54339
Fold: 9, Epoch: 0/5, Step: 240 | val_loss: 0.12948, score: 0.51073
Fold: 9, Epoch: 0/5, Step: 320 | val_loss: 0.1182, score: 0.48689
Fold: 9, Epoch: 0/5, Step: 400 | val_loss: 0.11799, score: 0.48645
Fold9, Epoch0/5 | train_loss: 0.43238
Fold: 9, Epoch: 0/5, Step: end | val_loss: 0.11226, score: 0.4743


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 1/5, Step: 80 | val_loss: 0.1091, score: 0.46709
Fold: 9, Epoch: 1/5, Step: 160 | val_loss: 0.10929, score: 0.4671
Fold: 9, Epoch: 1/5, Step: 240 | val_loss: 0.10873, score: 0.46622
Fold: 9, Epoch: 1/5, Step: 320 | val_loss: 0.11319, score: 0.47567
Fold: 9, Epoch: 1/5, Step: 400 | val_loss: 0.14441, score: 0.54022
Fold9, Epoch1/5 | train_loss: 0.10087
Fold: 9, Epoch: 1/5, Step: end | val_loss: 0.10898, score: 0.46711


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 2/5, Step: 80 | val_loss: 0.10855, score: 0.46564
Fold: 9, Epoch: 2/5, Step: 160 | val_loss: 0.10663, score: 0.4615
Fold: 9, Epoch: 2/5, Step: 240 | val_loss: 0.10772, score: 0.46452
Fold: 9, Epoch: 2/5, Step: 320 | val_loss: 0.10815, score: 0.46451
Fold: 9, Epoch: 2/5, Step: 400 | val_loss: 0.10726, score: 0.46317
Fold9, Epoch2/5 | train_loss: 0.09594
Fold: 9, Epoch: 2/5, Step: end | val_loss: 0.10892, score: 0.46674


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 3/5, Step: 80 | val_loss: 0.10528, score: 0.45854
Fold: 9, Epoch: 3/5, Step: 160 | val_loss: 0.10674, score: 0.46158
Fold: 9, Epoch: 3/5, Step: 240 | val_loss: 0.10488, score: 0.45761
Fold: 9, Epoch: 3/5, Step: 320 | val_loss: 0.1052, score: 0.45823
Fold: 9, Epoch: 3/5, Step: 400 | val_loss: 0.10495, score: 0.45782
Fold9, Epoch3/5 | train_loss: 0.09042
Fold: 9, Epoch: 3/5, Step: end | val_loss: 0.10494, score: 0.45774


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 4/5, Step: 80 | val_loss: 0.10476, score: 0.45739
Fold: 9, Epoch: 4/5, Step: 160 | val_loss: 0.10417, score: 0.45601
Fold: 9, Epoch: 4/5, Step: 240 | val_loss: 0.1045, score: 0.45686
Fold: 9, Epoch: 4/5, Step: 320 | val_loss: 0.10417, score: 0.45606
Fold: 9, Epoch: 4/5, Step: 400 | val_loss: 0.10416, score: 0.45603
Fold9, Epoch4/5 | train_loss: 0.08691
Fold: 9, Epoch: 4/5, Step: end | val_loss: 0.10416, score: 0.45604
fold score： [0.4468001680544985, 0.45285275712439355, 0.45520555669261176, 0.44296621119440704, 0.45384907299571203, 0.4371808683039457, 0.46254519846381287, 0.43766342078696063, 0.4449580609233609, 0.45601487242001904]
CV: 0.4492
Starting upload for file tokenizer.tar


100%|██████████| 10.6M/10.6M [00:03<00:00, 3.01MB/s]


Upload successful: tokenizer.tar (11MB)
Starting upload for file preds.tar


100%|██████████| 180k/180k [00:02<00:00, 70.0kB/s]


Upload successful: preds.tar (180KB)
Starting upload for file model.tar


100%|██████████| 6.87G/6.87G [08:27<00:00, 14.6MB/s]  


Upload successful: model.tar (7GB)
Starting upload for file fig.tar


ReadTimeoutError: HTTPSConnectionPool(host='www.kaggle.com', port=443): Read timed out. (read timeout=None)