In [1]:
! nvidia-smi

Mon Nov 21 15:17:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  Off |
| 31%   48C    P5    60W / 480W |   2395MiB / 24564MiB |     20%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os

class Config:
    AUTHOR = "shu421"

    EXP = "exp162"
    MODEL_PATH = "microsoft/deberta-v3-base"
    DATASET_PATH = []

    COMPETITION = "feedback-prize-english-language-learning"
    BASE_PATH = '/root/feedback3/'

    api_path = "/root/.kaggle/kaggle.json"

    apex=True
    seed = 42
    num_fold = 10
    trn_fold = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    batch_size = 8
    n_epochs = 5
    max_len = 1024
    target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    
    weight_decay = 0.01
    scheduler='cosine'
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 80
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1

    # stochastic weight averaging
    swa = True
    swa_start = 3
    swa_learning_rate = 1e-4
    anneal_epochs = 3 
    anneal_strategy='cos'

    # weight and bias
    wandb = True
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

    upload_from_colab = True

In [3]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error

# ! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# ! pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

from torch.optim.swa_utils import (
    AveragedModel, update_bn, SWALR
)

from kaggle.api.kaggle_api_extended import KaggleApi

In [4]:
# ====================================================
# wandb
# ====================================================
if Config.wandb:
    
    import wandb
    import json

    try:
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb_config = json.load(open('/root/.kaggle/wandb.json', 'rb'))
        secret_value_0 = wandb_config['key']
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=Config.MODEL_PATH,
                     config=class2dict(Config),
                     group=Config.MODEL_PATH,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: Currently logged in as: [33mshu421[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
def setup(cfg):
    # cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']

    # set dirs
    # cfg.DRIVE = cfg.DRIVE_PATH
    # cfg.EXP = (cfg.NAME if cfg.NAME is not None 
    #     else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
    # )
    cfg.INPUT = os.path.join(cfg.BASE_PATH, 'input')
    cfg.OUTPUT = os.path.join(cfg.BASE_PATH, 'output')
    cfg.SUBMISSION = os.path.join(cfg.BASE_PATH, 'submission')
    cfg.DATASET = os.path.join(cfg.BASE_PATH, 'dataset')

    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # make dirs
    for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    
    if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
        # load dataset
        ! pip install --upgrade --force-reinstall --no-deps kaggle
        ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
        filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
        ! unzip -d $cfg.INPUT $filepath
        
    
    for path in cfg.DATASET_PATH:
        datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
        if not os.path.exists(datasetpath):
            os.makedirs(datasetpath, exist_ok=True)
            ! kaggle datasets download $path -p $datasetpath
            filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
            ! unzip -d $datasetpath $filepath
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [6]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_multilabelstratifiedkfold(train, target_col, n_splits, seed):
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

In [7]:
def mcrmse(cfg, preds, df):
    all_score = 0
    for i, column in enumerate(cfg.target_list):
        score = np.sqrt(mean_squared_error(preds[:, i], df[column]))
        all_score += score/len(cfg.target_list)
    return all_score

In [8]:
# 文章のバグを治す
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [9]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['full_text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

# dataset
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.text[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [10]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )
        
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.transformer.encoder.layer[:8])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels):
        # batch, hidden_size
        feature = self.feature(inputs)
        output = self.fc(self.ln(feature))
        if labels is not None:
            loss_fct = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

In [11]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

# initialize layer
def reinit_bert(model):
    for layer in model.model.encoder.layer[-1:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    return model

def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

In [12]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [13]:
def valid_fn(cfg, valid_loader, model, valid_df, fold, epoch, step, best_val_preds, best_val_score, swa_model):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    swa_model.eval()
    with torch.no_grad():
        # with tqdm(valid_loader, total=len(valid_loader)) as pbar:
        # for (inputs, labels) in pbar:
        for (inputs, labels) in valid_loader:
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast():
                if epoch >= cfg.swa_start:
                    loss, output = swa_model(inputs, labels)
                else:
                    loss, output = model(inputs, labels)
            
            output = output.detach().cpu().numpy()
            val_preds.append(output)
            val_losses.append(loss.item() * len(labels))
            val_nums.append(len(labels))
                # pbar.set_postfix({
                #     'val_loss': loss.item()
                # })

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    # val_log = {
    #     'val_loss': val_loss,
    #     'mcrmse': score
    # }
    # display(val_log)
    print(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epochs}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if best_val_score > score:
        # print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        if epoch >= cfg.swa_start:
            torch.save(
                swa_model.state_dict(), 
                os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
            )   
        else:
            torch.save(
                model.state_dict(), 
                os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
            )
    
    return best_val_preds, best_val_score, val_loss

def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 6), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        print('='*30, f'Fold{fold}', '='*30)
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        model = reinit_bert(model)
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)

        # stochastic weight averaging
        swa_model = AveragedModel(model)
        swa_scheduler = SWALR(
            optimizer, swa_lr=cfg.swa_learning_rate, 
            anneal_epochs=cfg.anneal_epochs, 
            anneal_strategy=cfg.anneal_strategy
        )

        # enable FGM
        fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            print('='*20, f'epoch{epoch}', '='*20)
            train_losses = []
            train_nums = []
            model.train() 
            swa_model.train()
            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex):
                        loss, output = model(inputs, labels)

                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                     # FGM attack
                    fgm.attack()
                    with autocast(enabled=cfg.apex):
                        loss_adv, _ = model(inputs, labels)
                    scaler.scale(loss_adv).backward()
                    fgm.restore()
                    
                    if cfg.clip_grad_norm is not None:
                        # scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(),
                            cfg.clip_grad_norm
                        )
                        
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()

                        if epoch >= cfg.swa_start:
                            swa_model.update_parameters(model)
                            swa_scheduler.step()
                        else:
                            scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        best_val_preds, best_val_score, val_loss = valid_fn(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            epoch,
                            step,
                            best_val_preds,
                            best_val_score,
                            swa_model
                        )
                        model.train()
                        swa_model.train()

                    if cfg.wandb:
                        if epoch >= cfg.swa_start:
                            wandb.log({f"[fold{fold}] train_loss": loss.item(),
                                    f"[fold{fold}] lr": swa_scheduler.get_last_lr()[0]})
                        else:
                            wandb.log({f"[fold{fold}] train_loss": loss.item(),
                                    f"[fold{fold}] lr": scheduler.get_lr()[0]})

            train_loss = sum(train_losses)/sum(train_nums)
            # train_log = {
            #     'train_loss':train_loss
            # }
            # display(train_log)

            if cfg.swa:
                update_bn(train_loader, swa_model, device=torch.device('cuda'))
            print(f'Fold{fold}, Epoch{epoch}/{cfg.n_epochs} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss = valid_fn(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
                swa_model,
            )

            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch, 
                        f"[fold{fold}] avg_train_loss": loss.item(), 
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        
        torch.cuda.empty_cache()
        del model, swa_model
        gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    print('fold score：', fold_score)
    print('CV:', round(score, 4))
    return score

In [14]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
from transformers import logging
logging.set_verbosity_warning()
logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true


print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

train = processing_features(train)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

score = training(cfg, train)

if cfg.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

env: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 80 | val_loss: 0.17513, score: 0.59782
Fold: 0, Epoch: 0/5, Step: 160 | val_loss: 0.12283, score: 0.49666
Fold: 0, Epoch: 0/5, Step: 240 | val_loss: 0.13221, score: 0.51435
Fold: 0, Epoch: 0/5, Step: 320 | val_loss: 0.11004, score: 0.46886
Fold: 0, Epoch: 0/5, Step: 400 | val_loss: 0.10303, score: 0.45411
Fold0, Epoch0/5 | train_loss: 0.34459
Fold: 0, Epoch: 0/5, Step: end | val_loss: 0.14228, score: 0.53356


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 80 | val_loss: 0.10102, score: 0.44996
Fold: 0, Epoch: 1/5, Step: 160 | val_loss: 0.10036, score: 0.44836
Fold: 0, Epoch: 1/5, Step: 240 | val_loss: 0.10038, score: 0.44819
Fold: 0, Epoch: 1/5, Step: 320 | val_loss: 0.0997, score: 0.44681
Fold: 0, Epoch: 1/5, Step: 400 | val_loss: 0.09978, score: 0.44695
Fold0, Epoch1/5 | train_loss: 0.10237
Fold: 0, Epoch: 1/5, Step: end | val_loss: 0.09979, score: 0.44694


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 80 | val_loss: 0.11359, score: 0.4774
Fold: 0, Epoch: 2/5, Step: 160 | val_loss: 0.10076, score: 0.44902
Fold: 0, Epoch: 2/5, Step: 240 | val_loss: 0.11471, score: 0.4799
Fold: 0, Epoch: 2/5, Step: 320 | val_loss: 0.10062, score: 0.44895
Fold: 0, Epoch: 2/5, Step: 400 | val_loss: 0.10315, score: 0.45453
Fold0, Epoch2/5 | train_loss: 0.09571
Fold: 0, Epoch: 2/5, Step: end | val_loss: 0.10343, score: 0.45505


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 80 | val_loss: 0.10177, score: 0.45134
Fold: 0, Epoch: 3/5, Step: 160 | val_loss: 0.10025, score: 0.44798
Fold: 0, Epoch: 3/5, Step: 240 | val_loss: 0.09907, score: 0.44527
Fold: 0, Epoch: 3/5, Step: 320 | val_loss: 0.09914, score: 0.44548
Fold: 0, Epoch: 3/5, Step: 400 | val_loss: 0.0992, score: 0.44567
Fold0, Epoch3/5 | train_loss: 0.11715
Fold: 0, Epoch: 3/5, Step: end | val_loss: 0.09918, score: 0.44563


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 80 | val_loss: 0.09895, score: 0.44513
Fold: 0, Epoch: 4/5, Step: 160 | val_loss: 0.09916, score: 0.44562
Fold: 0, Epoch: 4/5, Step: 240 | val_loss: 0.0992, score: 0.44571
Fold: 0, Epoch: 4/5, Step: 320 | val_loss: 0.09928, score: 0.44589
Fold: 0, Epoch: 4/5, Step: 400 | val_loss: 0.09932, score: 0.446
Fold0, Epoch4/5 | train_loss: 0.10491
Fold: 0, Epoch: 4/5, Step: end | val_loss: 0.09943, score: 0.44624


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: 80 | val_loss: 0.18443, score: 0.61972
Fold: 1, Epoch: 0/5, Step: 160 | val_loss: 0.1248, score: 0.50088
Fold: 1, Epoch: 0/5, Step: 240 | val_loss: 0.21845, score: 0.65999
Fold: 1, Epoch: 0/5, Step: 320 | val_loss: 0.11301, score: 0.4762
Fold: 1, Epoch: 0/5, Step: 400 | val_loss: 0.12348, score: 0.49771
Fold1, Epoch0/5 | train_loss: 0.38616
Fold: 1, Epoch: 0/5, Step: end | val_loss: 0.10758, score: 0.46437


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: 80 | val_loss: 0.11354, score: 0.47707
Fold: 1, Epoch: 1/5, Step: 160 | val_loss: 0.10897, score: 0.46758
Fold: 1, Epoch: 1/5, Step: 240 | val_loss: 0.10824, score: 0.46556
Fold: 1, Epoch: 1/5, Step: 320 | val_loss: 0.10715, score: 0.46357
Fold: 1, Epoch: 1/5, Step: 400 | val_loss: 0.10509, score: 0.45897
Fold1, Epoch1/5 | train_loss: 0.10311
Fold: 1, Epoch: 1/5, Step: end | val_loss: 0.10954, score: 0.46881


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: 80 | val_loss: 0.10634, score: 0.46152
Fold: 1, Epoch: 2/5, Step: 160 | val_loss: 0.1054, score: 0.45935
Fold: 1, Epoch: 2/5, Step: 240 | val_loss: 0.10846, score: 0.46661
Fold: 1, Epoch: 2/5, Step: 320 | val_loss: 0.10735, score: 0.46418
Fold: 1, Epoch: 2/5, Step: 400 | val_loss: 0.10552, score: 0.46015
Fold1, Epoch2/5 | train_loss: 0.09636
Fold: 1, Epoch: 2/5, Step: end | val_loss: 0.10516, score: 0.45935


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: 80 | val_loss: 0.10621, score: 0.46141
Fold: 1, Epoch: 3/5, Step: 160 | val_loss: 0.10612, score: 0.46126
Fold: 1, Epoch: 3/5, Step: 240 | val_loss: 0.10523, score: 0.45926
Fold: 1, Epoch: 3/5, Step: 320 | val_loss: 0.10442, score: 0.45742
Fold: 1, Epoch: 3/5, Step: 400 | val_loss: 0.10483, score: 0.45828
Fold1, Epoch3/5 | train_loss: 0.12591
Fold: 1, Epoch: 3/5, Step: end | val_loss: 0.10478, score: 0.45814


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: 80 | val_loss: 0.1044, score: 0.45726
Fold: 1, Epoch: 4/5, Step: 160 | val_loss: 0.1041, score: 0.45664
Fold: 1, Epoch: 4/5, Step: 240 | val_loss: 0.10403, score: 0.45652
Fold: 1, Epoch: 4/5, Step: 320 | val_loss: 0.10377, score: 0.45596
Fold: 1, Epoch: 4/5, Step: 400 | val_loss: 0.10356, score: 0.45551
Fold1, Epoch4/5 | train_loss: 0.10133
Fold: 1, Epoch: 4/5, Step: end | val_loss: 0.10345, score: 0.45525


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: 80 | val_loss: 0.1761, score: 0.60144
Fold: 2, Epoch: 0/5, Step: 160 | val_loss: 0.15174, score: 0.5533
Fold: 2, Epoch: 0/5, Step: 240 | val_loss: 0.11286, score: 0.47646
Fold: 2, Epoch: 0/5, Step: 320 | val_loss: 0.1173, score: 0.48602
Fold: 2, Epoch: 0/5, Step: 400 | val_loss: 0.11237, score: 0.47533
Fold2, Epoch0/5 | train_loss: 0.36991
Fold: 2, Epoch: 0/5, Step: end | val_loss: 0.17743, score: 0.59414


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: 80 | val_loss: 0.10777, score: 0.46546
Fold: 2, Epoch: 1/5, Step: 160 | val_loss: 0.10929, score: 0.46854
Fold: 2, Epoch: 1/5, Step: 240 | val_loss: 0.10731, score: 0.46439
Fold: 2, Epoch: 1/5, Step: 320 | val_loss: 0.10851, score: 0.46716
Fold: 2, Epoch: 1/5, Step: 400 | val_loss: 0.11168, score: 0.47317
Fold2, Epoch1/5 | train_loss: 0.10687
Fold: 2, Epoch: 1/5, Step: end | val_loss: 0.10905, score: 0.4681


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: 80 | val_loss: 0.10693, score: 0.46376
Fold: 2, Epoch: 2/5, Step: 160 | val_loss: 0.1049, score: 0.45917
Fold: 2, Epoch: 2/5, Step: 240 | val_loss: 0.10534, score: 0.46021
Fold: 2, Epoch: 2/5, Step: 320 | val_loss: 0.10405, score: 0.45731
Fold: 2, Epoch: 2/5, Step: 400 | val_loss: 0.1053, score: 0.46012
Fold2, Epoch2/5 | train_loss: 0.09365
Fold: 2, Epoch: 2/5, Step: end | val_loss: 0.10325, score: 0.45556


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: 80 | val_loss: 0.1049, score: 0.459
Fold: 2, Epoch: 3/5, Step: 160 | val_loss: 0.10434, score: 0.45784
Fold: 2, Epoch: 3/5, Step: 240 | val_loss: 0.10385, score: 0.45671
Fold: 2, Epoch: 3/5, Step: 320 | val_loss: 0.10337, score: 0.45566
Fold: 2, Epoch: 3/5, Step: 400 | val_loss: 0.10339, score: 0.45579
Fold2, Epoch3/5 | train_loss: 0.11664
Fold: 2, Epoch: 3/5, Step: end | val_loss: 0.10343, score: 0.45593


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: 80 | val_loss: 0.10366, score: 0.45651
Fold: 2, Epoch: 4/5, Step: 160 | val_loss: 0.10368, score: 0.45652
Fold: 2, Epoch: 4/5, Step: 240 | val_loss: 0.10368, score: 0.45653
Fold: 2, Epoch: 4/5, Step: 320 | val_loss: 0.10372, score: 0.4566
Fold: 2, Epoch: 4/5, Step: 400 | val_loss: 0.10371, score: 0.45659
Fold2, Epoch4/5 | train_loss: 0.09734
Fold: 2, Epoch: 4/5, Step: end | val_loss: 0.10379, score: 0.45678


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: 80 | val_loss: 0.18569, score: 0.61568
Fold: 3, Epoch: 0/5, Step: 160 | val_loss: 0.1291, score: 0.50789
Fold: 3, Epoch: 0/5, Step: 240 | val_loss: 0.13083, score: 0.51094
Fold: 3, Epoch: 0/5, Step: 320 | val_loss: 0.11224, score: 0.4743
Fold: 3, Epoch: 0/5, Step: 400 | val_loss: 0.1107, score: 0.47113
Fold3, Epoch0/5 | train_loss: 0.38896
Fold: 3, Epoch: 0/5, Step: end | val_loss: 0.11171, score: 0.47355


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: 80 | val_loss: 0.10606, score: 0.461
Fold: 3, Epoch: 1/5, Step: 160 | val_loss: 0.10973, score: 0.469
Fold: 3, Epoch: 1/5, Step: 240 | val_loss: 0.11024, score: 0.46951
Fold: 3, Epoch: 1/5, Step: 320 | val_loss: 0.10405, score: 0.45628
Fold: 3, Epoch: 1/5, Step: 400 | val_loss: 0.10514, score: 0.45838
Fold3, Epoch1/5 | train_loss: 0.10421
Fold: 3, Epoch: 1/5, Step: end | val_loss: 0.11071, score: 0.47022


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: 80 | val_loss: 0.10183, score: 0.45146
Fold: 3, Epoch: 2/5, Step: 160 | val_loss: 0.10299, score: 0.45419
Fold: 3, Epoch: 2/5, Step: 240 | val_loss: 0.10067, score: 0.44865
Fold: 3, Epoch: 2/5, Step: 320 | val_loss: 0.11543, score: 0.47995
Fold: 3, Epoch: 2/5, Step: 400 | val_loss: 0.10154, score: 0.45052
Fold3, Epoch2/5 | train_loss: 0.09468
Fold: 3, Epoch: 2/5, Step: end | val_loss: 0.10245, score: 0.45291


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: 80 | val_loss: 0.10121, score: 0.45012
Fold: 3, Epoch: 3/5, Step: 160 | val_loss: 0.09984, score: 0.44686
Fold: 3, Epoch: 3/5, Step: 240 | val_loss: 0.09993, score: 0.44699
Fold: 3, Epoch: 3/5, Step: 320 | val_loss: 0.09977, score: 0.44665
Fold: 3, Epoch: 3/5, Step: 400 | val_loss: 0.09988, score: 0.44689
Fold3, Epoch3/5 | train_loss: 0.1188
Fold: 3, Epoch: 3/5, Step: end | val_loss: 0.09976, score: 0.44662


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: 80 | val_loss: 0.09941, score: 0.44585
Fold: 3, Epoch: 4/5, Step: 160 | val_loss: 0.09933, score: 0.44567
Fold: 3, Epoch: 4/5, Step: 240 | val_loss: 0.09935, score: 0.44572
Fold: 3, Epoch: 4/5, Step: 320 | val_loss: 0.09931, score: 0.44566
Fold: 3, Epoch: 4/5, Step: 400 | val_loss: 0.09909, score: 0.44516
Fold3, Epoch4/5 | train_loss: 0.10126
Fold: 3, Epoch: 4/5, Step: end | val_loss: 0.09897, score: 0.4449


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: 80 | val_loss: 0.19453, score: 0.6318
Fold: 4, Epoch: 0/5, Step: 160 | val_loss: 0.12549, score: 0.50383
Fold: 4, Epoch: 0/5, Step: 240 | val_loss: 0.12011, score: 0.49237
Fold: 4, Epoch: 0/5, Step: 320 | val_loss: 0.11898, score: 0.4905
Fold: 4, Epoch: 0/5, Step: 400 | val_loss: 0.12012, score: 0.49158
Fold4, Epoch0/5 | train_loss: 0.40276
Fold: 4, Epoch: 0/5, Step: end | val_loss: 0.12487, score: 0.50093


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: 80 | val_loss: 0.11822, score: 0.48905
Fold: 4, Epoch: 1/5, Step: 160 | val_loss: 0.12256, score: 0.49804
Fold: 4, Epoch: 1/5, Step: 240 | val_loss: 0.11046, score: 0.47172
Fold: 4, Epoch: 1/5, Step: 320 | val_loss: 0.10785, score: 0.46571
Fold: 4, Epoch: 1/5, Step: 400 | val_loss: 0.12829, score: 0.5086
Fold4, Epoch1/5 | train_loss: 0.10408
Fold: 4, Epoch: 1/5, Step: end | val_loss: 0.10968, score: 0.47023


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: 80 | val_loss: 0.10557, score: 0.46091
Fold: 4, Epoch: 2/5, Step: 160 | val_loss: 0.10485, score: 0.45928
Fold: 4, Epoch: 2/5, Step: 240 | val_loss: 0.10765, score: 0.46547
Fold: 4, Epoch: 2/5, Step: 320 | val_loss: 0.1053, score: 0.4604
Fold: 4, Epoch: 2/5, Step: 400 | val_loss: 0.10426, score: 0.45798
Fold4, Epoch2/5 | train_loss: 0.09371
Fold: 4, Epoch: 2/5, Step: end | val_loss: 0.1054, score: 0.46049


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: 80 | val_loss: 0.10627, score: 0.46252
Fold: 4, Epoch: 3/5, Step: 160 | val_loss: 0.10632, score: 0.4625
Fold: 4, Epoch: 3/5, Step: 240 | val_loss: 0.10597, score: 0.46168
Fold: 4, Epoch: 3/5, Step: 320 | val_loss: 0.1063, score: 0.46238
Fold: 4, Epoch: 3/5, Step: 400 | val_loss: 0.10629, score: 0.46243
Fold4, Epoch3/5 | train_loss: 0.12434
Fold: 4, Epoch: 3/5, Step: end | val_loss: 0.10622, score: 0.4623


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: 80 | val_loss: 0.10593, score: 0.46167
Fold: 4, Epoch: 4/5, Step: 160 | val_loss: 0.10556, score: 0.46085
Fold: 4, Epoch: 4/5, Step: 240 | val_loss: 0.10513, score: 0.45989
Fold: 4, Epoch: 4/5, Step: 320 | val_loss: 0.10494, score: 0.45951
Fold: 4, Epoch: 4/5, Step: 400 | val_loss: 0.10484, score: 0.4593
Fold4, Epoch4/5 | train_loss: 0.0931
Fold: 4, Epoch: 4/5, Step: end | val_loss: 0.10478, score: 0.45916


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 0/5, Step: 80 | val_loss: 0.21628, score: 0.6671
Fold: 5, Epoch: 0/5, Step: 160 | val_loss: 0.12071, score: 0.49279
Fold: 5, Epoch: 0/5, Step: 240 | val_loss: 0.11613, score: 0.48341
Fold: 5, Epoch: 0/5, Step: 320 | val_loss: 0.11533, score: 0.47971
Fold: 5, Epoch: 0/5, Step: 400 | val_loss: 0.10422, score: 0.45666
Fold5, Epoch0/5 | train_loss: 0.4184
Fold: 5, Epoch: 0/5, Step: end | val_loss: 0.1197, score: 0.48888


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 1/5, Step: 80 | val_loss: 0.11651, score: 0.4838
Fold: 5, Epoch: 1/5, Step: 160 | val_loss: 0.09966, score: 0.44677
Fold: 5, Epoch: 1/5, Step: 240 | val_loss: 0.09918, score: 0.44567
Fold: 5, Epoch: 1/5, Step: 320 | val_loss: 0.09867, score: 0.44418
Fold: 5, Epoch: 1/5, Step: 400 | val_loss: 0.09951, score: 0.44639
Fold5, Epoch1/5 | train_loss: 0.10316
Fold: 5, Epoch: 1/5, Step: end | val_loss: 0.09716, score: 0.441


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 2/5, Step: 80 | val_loss: 0.09707, score: 0.44044
Fold: 5, Epoch: 2/5, Step: 160 | val_loss: 0.09664, score: 0.43954
Fold: 5, Epoch: 2/5, Step: 240 | val_loss: 0.1002, score: 0.44791
Fold: 5, Epoch: 2/5, Step: 320 | val_loss: 0.10425, score: 0.45713
Fold: 5, Epoch: 2/5, Step: 400 | val_loss: 0.09875, score: 0.44496
Fold5, Epoch2/5 | train_loss: 0.09698
Fold: 5, Epoch: 2/5, Step: end | val_loss: 0.10053, score: 0.44877


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 3/5, Step: 80 | val_loss: 0.09791, score: 0.44234
Fold: 5, Epoch: 3/5, Step: 160 | val_loss: 0.09728, score: 0.441
Fold: 5, Epoch: 3/5, Step: 240 | val_loss: 0.0975, score: 0.44157
Fold: 5, Epoch: 3/5, Step: 320 | val_loss: 0.09733, score: 0.44119
Fold: 5, Epoch: 3/5, Step: 400 | val_loss: 0.09727, score: 0.44109
Fold5, Epoch3/5 | train_loss: 0.11897
Fold: 5, Epoch: 3/5, Step: end | val_loss: 0.09762, score: 0.44193


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 4/5, Step: 80 | val_loss: 0.09728, score: 0.44117
Fold: 5, Epoch: 4/5, Step: 160 | val_loss: 0.09709, score: 0.44076
Fold: 5, Epoch: 4/5, Step: 240 | val_loss: 0.09696, score: 0.44052
Fold: 5, Epoch: 4/5, Step: 320 | val_loss: 0.09706, score: 0.44075
Fold: 5, Epoch: 4/5, Step: 400 | val_loss: 0.09683, score: 0.44024
Fold5, Epoch4/5 | train_loss: 0.10157
Fold: 5, Epoch: 4/5, Step: end | val_loss: 0.09691, score: 0.44043


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 0/5, Step: 80 | val_loss: 0.22158, score: 0.67346
Fold: 6, Epoch: 0/5, Step: 160 | val_loss: 0.15015, score: 0.55214
Fold: 6, Epoch: 0/5, Step: 240 | val_loss: 0.12369, score: 0.49894
Fold: 6, Epoch: 0/5, Step: 320 | val_loss: 0.12147, score: 0.49415
Fold: 6, Epoch: 0/5, Step: 400 | val_loss: 0.12755, score: 0.5072
Fold6, Epoch0/5 | train_loss: 0.42578
Fold: 6, Epoch: 0/5, Step: end | val_loss: 0.12596, score: 0.50179


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 1/5, Step: 80 | val_loss: 0.11927, score: 0.49016
Fold: 6, Epoch: 1/5, Step: 160 | val_loss: 0.12011, score: 0.49142
Fold: 6, Epoch: 1/5, Step: 240 | val_loss: 0.111, score: 0.47246
Fold: 6, Epoch: 1/5, Step: 320 | val_loss: 0.1113, score: 0.47279
Fold: 6, Epoch: 1/5, Step: 400 | val_loss: 0.11389, score: 0.47809
Fold6, Epoch1/5 | train_loss: 0.10084
Fold: 6, Epoch: 1/5, Step: end | val_loss: 0.1137, score: 0.47805


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 2/5, Step: 80 | val_loss: 0.1081, score: 0.46597
Fold: 6, Epoch: 2/5, Step: 160 | val_loss: 0.11973, score: 0.48939
Fold: 6, Epoch: 2/5, Step: 240 | val_loss: 0.11297, score: 0.47642
Fold: 6, Epoch: 2/5, Step: 320 | val_loss: 0.10964, score: 0.46944
Fold: 6, Epoch: 2/5, Step: 400 | val_loss: 0.10995, score: 0.47012
Fold6, Epoch2/5 | train_loss: 0.0957
Fold: 6, Epoch: 2/5, Step: end | val_loss: 0.10906, score: 0.46826


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 3/5, Step: 80 | val_loss: 0.1079, score: 0.46556
Fold: 6, Epoch: 3/5, Step: 160 | val_loss: 0.10785, score: 0.46541
Fold: 6, Epoch: 3/5, Step: 240 | val_loss: 0.10863, score: 0.46714
Fold: 6, Epoch: 3/5, Step: 320 | val_loss: 0.1083, score: 0.46635
Fold: 6, Epoch: 3/5, Step: 400 | val_loss: 0.10731, score: 0.46422
Fold6, Epoch3/5 | train_loss: 0.12592
Fold: 6, Epoch: 3/5, Step: end | val_loss: 0.10712, score: 0.46383


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 4/5, Step: 80 | val_loss: 0.10711, score: 0.46387
Fold: 6, Epoch: 4/5, Step: 160 | val_loss: 0.10707, score: 0.46381
Fold: 6, Epoch: 4/5, Step: 240 | val_loss: 0.10688, score: 0.46342
Fold: 6, Epoch: 4/5, Step: 320 | val_loss: 0.10697, score: 0.46365
Fold: 6, Epoch: 4/5, Step: 400 | val_loss: 0.10716, score: 0.4641
Fold6, Epoch4/5 | train_loss: 0.09444
Fold: 6, Epoch: 4/5, Step: end | val_loss: 0.10716, score: 0.46411


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 0/5, Step: 80 | val_loss: 0.25376, score: 0.72718
Fold: 7, Epoch: 0/5, Step: 160 | val_loss: 0.12365, score: 0.49959
Fold: 7, Epoch: 0/5, Step: 240 | val_loss: 0.10754, score: 0.46472
Fold: 7, Epoch: 0/5, Step: 320 | val_loss: 0.11053, score: 0.47143
Fold: 7, Epoch: 0/5, Step: 400 | val_loss: 0.11437, score: 0.47803
Fold7, Epoch0/5 | train_loss: 0.40705
Fold: 7, Epoch: 0/5, Step: end | val_loss: 0.10553, score: 0.45989


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 1/5, Step: 80 | val_loss: 0.10071, score: 0.44929
Fold: 7, Epoch: 1/5, Step: 160 | val_loss: 0.0997, score: 0.44728
Fold: 7, Epoch: 1/5, Step: 240 | val_loss: 0.09993, score: 0.4475
Fold: 7, Epoch: 1/5, Step: 320 | val_loss: 0.0973, score: 0.44137
Fold: 7, Epoch: 1/5, Step: 400 | val_loss: 0.09939, score: 0.44618
Fold7, Epoch1/5 | train_loss: 0.10738
Fold: 7, Epoch: 1/5, Step: end | val_loss: 0.10046, score: 0.44852


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 2/5, Step: 80 | val_loss: 0.09941, score: 0.44661
Fold: 7, Epoch: 2/5, Step: 160 | val_loss: 0.10379, score: 0.4551
Fold: 7, Epoch: 2/5, Step: 240 | val_loss: 0.10762, score: 0.46319
Fold: 7, Epoch: 2/5, Step: 320 | val_loss: 0.09702, score: 0.4408
Fold: 7, Epoch: 2/5, Step: 400 | val_loss: 0.09913, score: 0.446
Fold7, Epoch2/5 | train_loss: 0.09693
Fold: 7, Epoch: 2/5, Step: end | val_loss: 0.09893, score: 0.4455


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 3/5, Step: 80 | val_loss: 0.09972, score: 0.44703
Fold: 7, Epoch: 3/5, Step: 160 | val_loss: 0.09799, score: 0.44303
Fold: 7, Epoch: 3/5, Step: 240 | val_loss: 0.09781, score: 0.44267
Fold: 7, Epoch: 3/5, Step: 320 | val_loss: 0.09823, score: 0.44363
Fold: 7, Epoch: 3/5, Step: 400 | val_loss: 0.09857, score: 0.44447
Fold7, Epoch3/5 | train_loss: 0.12218
Fold: 7, Epoch: 3/5, Step: end | val_loss: 0.09855, score: 0.44442


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 4/5, Step: 80 | val_loss: 0.09849, score: 0.44434
Fold: 7, Epoch: 4/5, Step: 160 | val_loss: 0.09818, score: 0.44365
Fold: 7, Epoch: 4/5, Step: 240 | val_loss: 0.09776, score: 0.4427
Fold: 7, Epoch: 4/5, Step: 320 | val_loss: 0.09775, score: 0.44267
Fold: 7, Epoch: 4/5, Step: 400 | val_loss: 0.09786, score: 0.44292
Fold7, Epoch4/5 | train_loss: 0.10213
Fold: 7, Epoch: 4/5, Step: end | val_loss: 0.09778, score: 0.44272


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 0/5, Step: 80 | val_loss: 0.19821, score: 0.63976
Fold: 8, Epoch: 0/5, Step: 160 | val_loss: 0.1595, score: 0.56661
Fold: 8, Epoch: 0/5, Step: 240 | val_loss: 0.13061, score: 0.51348
Fold: 8, Epoch: 0/5, Step: 320 | val_loss: 0.11333, score: 0.47671
Fold: 8, Epoch: 0/5, Step: 400 | val_loss: 0.11734, score: 0.48513
Fold8, Epoch0/5 | train_loss: 0.40334
Fold: 8, Epoch: 0/5, Step: end | val_loss: 0.12242, score: 0.49487


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 1/5, Step: 80 | val_loss: 0.10717, score: 0.4634
Fold: 8, Epoch: 1/5, Step: 160 | val_loss: 0.10412, score: 0.45683
Fold: 8, Epoch: 1/5, Step: 240 | val_loss: 0.10375, score: 0.45592
Fold: 8, Epoch: 1/5, Step: 320 | val_loss: 0.10349, score: 0.45522
Fold: 8, Epoch: 1/5, Step: 400 | val_loss: 0.10395, score: 0.45627
Fold8, Epoch1/5 | train_loss: 0.10206
Fold: 8, Epoch: 1/5, Step: end | val_loss: 0.10571, score: 0.46031


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 2/5, Step: 80 | val_loss: 0.10035, score: 0.44817
Fold: 8, Epoch: 2/5, Step: 160 | val_loss: 0.1037, score: 0.45565
Fold: 8, Epoch: 2/5, Step: 240 | val_loss: 0.10118, score: 0.44991
Fold: 8, Epoch: 2/5, Step: 320 | val_loss: 0.10466, score: 0.45761
Fold: 8, Epoch: 2/5, Step: 400 | val_loss: 0.12198, score: 0.49449
Fold8, Epoch2/5 | train_loss: 0.09666
Fold: 8, Epoch: 2/5, Step: end | val_loss: 0.09978, score: 0.44678


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 3/5, Step: 80 | val_loss: 0.10258, score: 0.45338
Fold: 8, Epoch: 3/5, Step: 160 | val_loss: 0.10355, score: 0.4555
Fold: 8, Epoch: 3/5, Step: 240 | val_loss: 0.10351, score: 0.45532
Fold: 8, Epoch: 3/5, Step: 320 | val_loss: 0.10261, score: 0.45329
Fold: 8, Epoch: 3/5, Step: 400 | val_loss: 0.10261, score: 0.45329
Fold8, Epoch3/5 | train_loss: 0.12004
Fold: 8, Epoch: 3/5, Step: end | val_loss: 0.10289, score: 0.45391


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 4/5, Step: 80 | val_loss: 0.10297, score: 0.4541
Fold: 8, Epoch: 4/5, Step: 160 | val_loss: 0.10286, score: 0.45387
Fold: 8, Epoch: 4/5, Step: 240 | val_loss: 0.1029, score: 0.45395
Fold: 8, Epoch: 4/5, Step: 320 | val_loss: 0.1027, score: 0.45347
Fold: 8, Epoch: 4/5, Step: 400 | val_loss: 0.1025, score: 0.453
Fold8, Epoch4/5 | train_loss: 0.10109
Fold: 8, Epoch: 4/5, Step: end | val_loss: 0.10236, score: 0.45269


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 0/5, Step: 80 | val_loss: 0.26213, score: 0.73723
Fold: 9, Epoch: 0/5, Step: 160 | val_loss: 0.14533, score: 0.54334
Fold: 9, Epoch: 0/5, Step: 240 | val_loss: 0.12947, score: 0.5107
Fold: 9, Epoch: 0/5, Step: 320 | val_loss: 0.11822, score: 0.48693
Fold: 9, Epoch: 0/5, Step: 400 | val_loss: 0.11798, score: 0.48643
Fold9, Epoch0/5 | train_loss: 0.43238
Fold: 9, Epoch: 0/5, Step: end | val_loss: 0.11226, score: 0.4743


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 1/5, Step: 80 | val_loss: 0.10911, score: 0.46711
Fold: 9, Epoch: 1/5, Step: 160 | val_loss: 0.1093, score: 0.46711
Fold: 9, Epoch: 1/5, Step: 240 | val_loss: 0.10872, score: 0.4662
Fold: 9, Epoch: 1/5, Step: 320 | val_loss: 0.11318, score: 0.47566
Fold: 9, Epoch: 1/5, Step: 400 | val_loss: 0.14439, score: 0.54018
Fold9, Epoch1/5 | train_loss: 0.10087
Fold: 9, Epoch: 1/5, Step: end | val_loss: 0.10898, score: 0.4671


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 2/5, Step: 80 | val_loss: 0.10855, score: 0.46564
Fold: 9, Epoch: 2/5, Step: 160 | val_loss: 0.10662, score: 0.46148
Fold: 9, Epoch: 2/5, Step: 240 | val_loss: 0.10772, score: 0.46451
Fold: 9, Epoch: 2/5, Step: 320 | val_loss: 0.10814, score: 0.4645
Fold: 9, Epoch: 2/5, Step: 400 | val_loss: 0.10726, score: 0.46317
Fold9, Epoch2/5 | train_loss: 0.09594
Fold: 9, Epoch: 2/5, Step: end | val_loss: 0.10893, score: 0.46678


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 3/5, Step: 80 | val_loss: 0.10713, score: 0.46273
Fold: 9, Epoch: 3/5, Step: 160 | val_loss: 0.10795, score: 0.46484
Fold: 9, Epoch: 3/5, Step: 240 | val_loss: 0.10799, score: 0.46496
Fold: 9, Epoch: 3/5, Step: 320 | val_loss: 0.10771, score: 0.46432
Fold: 9, Epoch: 3/5, Step: 400 | val_loss: 0.10793, score: 0.46472
Fold9, Epoch3/5 | train_loss: 0.12781
Fold: 9, Epoch: 3/5, Step: end | val_loss: 0.10757, score: 0.4639


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 4/5, Step: 80 | val_loss: 0.10732, score: 0.46331
Fold: 9, Epoch: 4/5, Step: 160 | val_loss: 0.10698, score: 0.46255
Fold: 9, Epoch: 4/5, Step: 240 | val_loss: 0.10688, score: 0.46234
Fold: 9, Epoch: 4/5, Step: 320 | val_loss: 0.10671, score: 0.46192
Fold: 9, Epoch: 4/5, Step: 400 | val_loss: 0.1065, score: 0.46145
Fold9, Epoch4/5 | train_loss: 0.09496
Fold: 9, Epoch: 4/5, Step: end | val_loss: 0.10641, score: 0.46124
fold score： [0.44512661586439156, 0.4552513050167125, 0.4555576907555463, 0.44489782920262055, 0.4579835722407695, 0.4395395599051892, 0.46342276725324655, 0.4408003016861983, 0.44677723659754337, 0.46124222675796933]
CV: 0.4513
Starting upload for file tokenizer.tar


100%|██████████| 10.6M/10.6M [00:04<00:00, 2.54MB/s]


Upload successful: tokenizer.tar (11MB)
Starting upload for file preds.tar


100%|██████████| 180k/180k [00:02<00:00, 65.5kB/s]


Upload successful: preds.tar (180KB)
Starting upload for file model.tar


100%|██████████| 6.87G/6.87G [07:59<00:00, 15.4MB/s]  


Upload successful: model.tar (7GB)
Starting upload for file fig.tar


ReadTimeoutError: HTTPSConnectionPool(host='www.kaggle.com', port=443): Read timed out. (read timeout=None)