In [1]:
! nvidia-smi

Tue Nov 22 15:48:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  Off |
|  0%   40C    P3    56W / 480W |   2724MiB / 24564MiB |     64%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os

class Config:
    AUTHOR = "shu421"

    EXP = "exp163"
    MODEL_PATH = "microsoft/deberta-v3-large"
    DATASET_PATH = []

    COMPETITION = "feedback-prize-english-language-learning"
    BASE_PATH = '/root/feedback3/'

    api_path = "/root/.kaggle/kaggle.json"

    apex=True
    seed = 42
    num_fold = 10
    trn_fold = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    batch_size = 8
    n_epochs = 5
    max_len = 1024
    target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    
    weight_decay = 0.01
    scheduler='cosine'
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 80
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1

    # start epoch for AWP
    start_epoch = 2

    # AWP
    awp_lr = 1e-5
    awp_eps = 1e-2


    # weight and bias
    wandb = True
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

    upload_from_colab = True

In [3]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error

! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# ! pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F


from kaggle.api.kaggle_api_extended import KaggleApi

[0m

In [4]:
# ====================================================
# wandb
# ====================================================
if Config.wandb:
    
    import wandb
    import json

    try:
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb_config = json.load(open('/root/.kaggle/wandb.json', 'rb'))
        secret_value_0 = wandb_config['key']
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project=Config.COMPETITION, 
                     name=Config.EXP,
                     config=class2dict(Config),
                     group=Config.MODEL_PATH,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: Currently logged in as: [33mshu421[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
def setup(cfg):
    # cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']

    # set dirs
    # cfg.DRIVE = cfg.DRIVE_PATH
    # cfg.EXP = (cfg.NAME if cfg.NAME is not None 
    #     else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
    # )
    cfg.INPUT = os.path.join(cfg.BASE_PATH, 'input')
    cfg.OUTPUT = os.path.join(cfg.BASE_PATH, 'output')
    cfg.SUBMISSION = os.path.join(cfg.BASE_PATH, 'submission')
    cfg.DATASET = os.path.join(cfg.BASE_PATH, 'dataset')

    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # make dirs
    for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    
    if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
        # load dataset
        ! pip install --upgrade --force-reinstall --no-deps kaggle
        ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
        filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
        ! unzip -d $cfg.INPUT $filepath
        
    
    for path in cfg.DATASET_PATH:
        datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
        if not os.path.exists(datasetpath):
            os.makedirs(datasetpath, exist_ok=True)
            ! kaggle datasets download $path -p $datasetpath
            filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
            ! unzip -d $datasetpath $filepath
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [6]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_multilabelstratifiedkfold(train, target_col, n_splits, seed):
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

In [7]:
def mcrmse(cfg, preds, df):
    all_score = 0
    for i, column in enumerate(cfg.target_list):
        score = np.sqrt(mean_squared_error(preds[:, i], df[column]))
        all_score += score/len(cfg.target_list)
    return all_score

In [8]:
# 文章のバグを治す
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [9]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['full_text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

# dataset
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.text[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [10]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )
        
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.transformer.encoder.layer[:8])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels):
        # batch, hidden_size
        feature = self.feature(inputs)
        output = self.fc(self.ln(feature))
        if labels is not None:
            loss_fct = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

In [11]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

# initialize layer
def reinit_bert(model):
    for layer in model.model.encoder.layer[-1:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    return model

def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

In [12]:
from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
from torch.nn.modules.loss import _Loss

class AWP:
    def __init__(
        self,
        model: Module,
        optimizer: Optimizer,
        adv_param: str="weight",
        adv_lr: float=1.0,
        adv_eps: float=0.01
    ) -> None:
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}
        self.backup_eps = {}

    def attack_backward(self, inputs: dict, labels: dict) -> Tensor:
        with autocast():
            self._save()
            self._attack_step() # モデルを近傍の悪い方へ改変
            adv_loss, _ = self.model(inputs, labels)
            self.optimizer.zero_grad()
        return adv_loss

    def _attack_step(self) -> None:
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # 直前に損失関数に通してパラメータの勾配を取得できるようにしておく必要あり
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(
                            param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self) -> None:
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self) -> None:
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [13]:
def valid_fn(cfg, valid_loader, model, valid_df, fold, epoch, step, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        # with tqdm(valid_loader, total=len(valid_loader)) as pbar:
        # for (inputs, labels) in pbar:
        for (inputs, labels) in valid_loader:
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast():
                loss, output = model(inputs, labels)
            
            output = output.detach().cpu().numpy()
            val_preds.append(output)
            val_losses.append(loss.item() * len(labels))
            val_nums.append(len(labels))
                # pbar.set_postfix({
                #     'val_loss': loss.item()
                # })

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    # val_log = {
    #     'val_loss': val_loss,
    #     'mcrmse': score
    # }
    # display(val_log)
    print(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epochs}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if best_val_score > score:
        # print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(), 
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
    
    return best_val_preds, best_val_score, val_loss

def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 6), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        print('='*30, f'Fold{fold}', '='*30)
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        model = reinit_bert(model)
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)

        # model-training
        best_val_preds = None
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            print('='*20, f'epoch{epoch}', '='*20)
            train_losses = []
            train_nums = []
            model.train() 

            # enable AWP
            awp = AWP(
                model, 
                optimizer,
                adv_lr=cfg.awp_lr, 
                adv_eps=cfg.awp_eps
            )

            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex):
                        loss, output = model(inputs, labels)

                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                    if epoch >= cfg.start_epoch:
                        loss = awp.attack_backward(inputs, labels)
                        scaler.scale(loss).backward()
                        awp._restore()
                    
                    if cfg.clip_grad_norm is not None:
                        # scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(),
                            cfg.clip_grad_norm
                        )
                        
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        best_val_preds, best_val_score, val_loss = valid_fn(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            epoch,
                            step,
                            best_val_preds,
                            best_val_score,
                        )
                        model.train()

                    if cfg.wandb:
                        wandb.log({f"[fold{fold}] train_loss": loss.item(),
                                f"[fold{fold}] lr": scheduler.get_lr()[0]})

            train_loss = sum(train_losses)/sum(train_nums)
            # train_log = {
            #     'train_loss':train_loss
            # }
            # display(train_log)

            print(f'Fold{fold}, Epoch{epoch}/{cfg.n_epochs} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score, val_loss = valid_fn(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

            if cfg.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch, 
                        f"[fold{fold}] avg_train_loss": loss.item(), 
                        f"[fold{fold}] avg_val_loss": val_loss,
                        f"[fold{fold}] score": best_val_score})

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        
        torch.cuda.empty_cache()
        del model, awp
        gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    print('fold score：', fold_score)
    print('CV:', round(score, 4))
    return score

In [14]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
from transformers import logging
logging.set_verbosity_warning()
logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true


print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

train = processing_features(train)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

score = training(cfg, train)

if cfg.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

env: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 80 | val_loss: 0.22764, score: 0.6866
Fold: 0, Epoch: 0/5, Step: 160 | val_loss: 0.12534, score: 0.5019
Fold: 0, Epoch: 0/5, Step: 240 | val_loss: 0.14269, score: 0.53504
Fold: 0, Epoch: 0/5, Step: 320 | val_loss: 0.12154, score: 0.49418
Fold: 0, Epoch: 0/5, Step: 400 | val_loss: 0.11144, score: 0.47332
Fold0, Epoch0/5 | train_loss: 0.33228
Fold: 0, Epoch: 0/5, Step: end | val_loss: 0.16781, score: 0.57764


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 80 | val_loss: 0.11944, score: 0.48951
Fold: 0, Epoch: 1/5, Step: 160 | val_loss: 0.10096, score: 0.44946
Fold: 0, Epoch: 1/5, Step: 240 | val_loss: 0.10789, score: 0.46504
Fold: 0, Epoch: 1/5, Step: 320 | val_loss: 0.11566, score: 0.48078
Fold: 0, Epoch: 1/5, Step: 400 | val_loss: 0.10588, score: 0.46008
Fold0, Epoch1/5 | train_loss: 0.1036
Fold: 0, Epoch: 1/5, Step: end | val_loss: 0.11059, score: 0.47083


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 80 | val_loss: 0.10673, score: 0.46278
Fold: 0, Epoch: 2/5, Step: 160 | val_loss: 0.10699, score: 0.46303
Fold: 0, Epoch: 2/5, Step: 240 | val_loss: 0.10227, score: 0.45292
Fold: 0, Epoch: 2/5, Step: 320 | val_loss: 0.10163, score: 0.45093
Fold: 0, Epoch: 2/5, Step: 400 | val_loss: 0.0999, score: 0.44718
Fold0, Epoch2/5 | train_loss: 0.09354
Fold: 0, Epoch: 2/5, Step: end | val_loss: 0.10082, score: 0.44921


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 80 | val_loss: 0.10327, score: 0.45438
Fold: 0, Epoch: 3/5, Step: 160 | val_loss: 0.10239, score: 0.45308
Fold: 0, Epoch: 3/5, Step: 240 | val_loss: 0.1025, score: 0.45301
Fold: 0, Epoch: 3/5, Step: 320 | val_loss: 0.09953, score: 0.44638
Fold: 0, Epoch: 3/5, Step: 400 | val_loss: 0.09879, score: 0.44473
Fold0, Epoch3/5 | train_loss: 0.08085
Fold: 0, Epoch: 3/5, Step: end | val_loss: 0.09998, score: 0.44744


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 80 | val_loss: 0.10138, score: 0.45051
Fold: 0, Epoch: 4/5, Step: 160 | val_loss: 0.10117, score: 0.45013
Fold: 0, Epoch: 4/5, Step: 240 | val_loss: 0.10051, score: 0.44858
Fold: 0, Epoch: 4/5, Step: 320 | val_loss: 0.10032, score: 0.44823
Fold: 0, Epoch: 4/5, Step: 400 | val_loss: 0.10012, score: 0.44777
Fold0, Epoch4/5 | train_loss: 0.07167
Fold: 0, Epoch: 4/5, Step: end | val_loss: 0.10013, score: 0.44778


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: 80 | val_loss: 0.20264, score: 0.6479
Fold: 1, Epoch: 0/5, Step: 160 | val_loss: 0.12169, score: 0.49494
Fold: 1, Epoch: 0/5, Step: 240 | val_loss: 0.13158, score: 0.51571
Fold: 1, Epoch: 0/5, Step: 320 | val_loss: 0.13145, score: 0.51391
Fold: 1, Epoch: 0/5, Step: 400 | val_loss: 0.12307, score: 0.4978
Fold1, Epoch0/5 | train_loss: 0.37248
Fold: 1, Epoch: 0/5, Step: end | val_loss: 0.12933, score: 0.50597


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: 80 | val_loss: 0.11037, score: 0.47097
Fold: 1, Epoch: 1/5, Step: 160 | val_loss: 0.11401, score: 0.47961
Fold: 1, Epoch: 1/5, Step: 240 | val_loss: 0.12734, score: 0.5062
Fold: 1, Epoch: 1/5, Step: 320 | val_loss: 0.11113, score: 0.47212
Fold: 1, Epoch: 1/5, Step: 400 | val_loss: 0.10724, score: 0.4642
Fold1, Epoch1/5 | train_loss: 0.10723
Fold: 1, Epoch: 1/5, Step: end | val_loss: 0.12422, score: 0.50049


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: 80 | val_loss: 0.11424, score: 0.47848
Fold: 1, Epoch: 2/5, Step: 160 | val_loss: 0.10507, score: 0.45928
Fold: 1, Epoch: 2/5, Step: 240 | val_loss: 0.11279, score: 0.47522
Fold: 1, Epoch: 2/5, Step: 320 | val_loss: 0.10674, score: 0.46326
Fold: 1, Epoch: 2/5, Step: 400 | val_loss: 0.10731, score: 0.46442
Fold1, Epoch2/5 | train_loss: 0.09463
Fold: 1, Epoch: 2/5, Step: end | val_loss: 0.10439, score: 0.45777


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: 80 | val_loss: 0.10764, score: 0.46506
Fold: 1, Epoch: 3/5, Step: 160 | val_loss: 0.10771, score: 0.46528
Fold: 1, Epoch: 3/5, Step: 240 | val_loss: 0.10951, score: 0.46924
Fold: 1, Epoch: 3/5, Step: 320 | val_loss: 0.10672, score: 0.46276
Fold: 1, Epoch: 3/5, Step: 400 | val_loss: 0.10776, score: 0.46513
Fold1, Epoch3/5 | train_loss: 0.08316
Fold: 1, Epoch: 3/5, Step: end | val_loss: 0.10657, score: 0.46261


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: 80 | val_loss: 0.10709, score: 0.46409
Fold: 1, Epoch: 4/5, Step: 160 | val_loss: 0.10547, score: 0.46032
Fold: 1, Epoch: 4/5, Step: 240 | val_loss: 0.10573, score: 0.46094
Fold: 1, Epoch: 4/5, Step: 320 | val_loss: 0.1053, score: 0.45994
Fold: 1, Epoch: 4/5, Step: 400 | val_loss: 0.10557, score: 0.46061
Fold1, Epoch4/5 | train_loss: 0.07401
Fold: 1, Epoch: 4/5, Step: end | val_loss: 0.10555, score: 0.46055


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: 80 | val_loss: 0.17221, score: 0.59178
Fold: 2, Epoch: 0/5, Step: 160 | val_loss: 0.15476, score: 0.55536
Fold: 2, Epoch: 0/5, Step: 240 | val_loss: 0.12265, score: 0.4971
Fold: 2, Epoch: 0/5, Step: 320 | val_loss: 0.17343, score: 0.59032
Fold: 2, Epoch: 0/5, Step: 400 | val_loss: 0.12369, score: 0.49874
Fold2, Epoch0/5 | train_loss: 0.37224
Fold: 2, Epoch: 0/5, Step: end | val_loss: 0.12624, score: 0.5028


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: 80 | val_loss: 0.10766, score: 0.46568
Fold: 2, Epoch: 1/5, Step: 160 | val_loss: 0.1099, score: 0.46963
Fold: 2, Epoch: 1/5, Step: 240 | val_loss: 0.1184, score: 0.48719
Fold: 2, Epoch: 1/5, Step: 320 | val_loss: 0.11092, score: 0.47245
Fold: 2, Epoch: 1/5, Step: 400 | val_loss: 0.1079, score: 0.46503
Fold2, Epoch1/5 | train_loss: 0.1053
Fold: 2, Epoch: 1/5, Step: end | val_loss: 0.11511, score: 0.48009


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: 80 | val_loss: 0.11056, score: 0.47067
Fold: 2, Epoch: 2/5, Step: 160 | val_loss: 0.10588, score: 0.46111
Fold: 2, Epoch: 2/5, Step: 240 | val_loss: 0.10383, score: 0.45654
Fold: 2, Epoch: 2/5, Step: 320 | val_loss: 0.10427, score: 0.45734
Fold: 2, Epoch: 2/5, Step: 400 | val_loss: 0.10837, score: 0.46597
Fold2, Epoch2/5 | train_loss: 0.08952
Fold: 2, Epoch: 2/5, Step: end | val_loss: 0.1105, score: 0.47056


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: 80 | val_loss: 0.10573, score: 0.46041
Fold: 2, Epoch: 3/5, Step: 160 | val_loss: 0.10438, score: 0.45766
Fold: 2, Epoch: 3/5, Step: 240 | val_loss: 0.1054, score: 0.45986
Fold: 2, Epoch: 3/5, Step: 320 | val_loss: 0.10919, score: 0.46799
Fold: 2, Epoch: 3/5, Step: 400 | val_loss: 0.10574, score: 0.4605
Fold2, Epoch3/5 | train_loss: 0.08217
Fold: 2, Epoch: 3/5, Step: end | val_loss: 0.10547, score: 0.45982


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: 80 | val_loss: 0.10408, score: 0.4569
Fold: 2, Epoch: 4/5, Step: 160 | val_loss: 0.10402, score: 0.45676
Fold: 2, Epoch: 4/5, Step: 240 | val_loss: 0.10489, score: 0.45868
Fold: 2, Epoch: 4/5, Step: 320 | val_loss: 0.10397, score: 0.45671
Fold: 2, Epoch: 4/5, Step: 400 | val_loss: 0.10431, score: 0.45741
Fold2, Epoch4/5 | train_loss: 0.07658
Fold: 2, Epoch: 4/5, Step: end | val_loss: 0.10432, score: 0.45744


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: 80 | val_loss: 0.15454, score: 0.56035
Fold: 3, Epoch: 0/5, Step: 160 | val_loss: 0.12227, score: 0.49566
Fold: 3, Epoch: 0/5, Step: 240 | val_loss: 0.16181, score: 0.56749
Fold: 3, Epoch: 0/5, Step: 320 | val_loss: 0.15369, score: 0.55297
Fold: 3, Epoch: 0/5, Step: 400 | val_loss: 0.11524, score: 0.47905
Fold3, Epoch0/5 | train_loss: 0.35801
Fold: 3, Epoch: 0/5, Step: end | val_loss: 0.12656, score: 0.49956


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: 80 | val_loss: 0.10649, score: 0.46108
Fold: 3, Epoch: 1/5, Step: 160 | val_loss: 0.11264, score: 0.4741
Fold: 3, Epoch: 1/5, Step: 240 | val_loss: 0.11685, score: 0.48288
Fold: 3, Epoch: 1/5, Step: 320 | val_loss: 0.10392, score: 0.45591
Fold: 3, Epoch: 1/5, Step: 400 | val_loss: 0.10915, score: 0.46717
Fold3, Epoch1/5 | train_loss: 0.11231
Fold: 3, Epoch: 1/5, Step: end | val_loss: 0.1008, score: 0.44918


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: 80 | val_loss: 0.10088, score: 0.44935
Fold: 3, Epoch: 2/5, Step: 160 | val_loss: 0.10668, score: 0.46215
Fold: 3, Epoch: 2/5, Step: 240 | val_loss: 0.1125, score: 0.47428
Fold: 3, Epoch: 2/5, Step: 320 | val_loss: 0.10605, score: 0.45937
Fold: 3, Epoch: 2/5, Step: 400 | val_loss: 0.1148, score: 0.47801
Fold3, Epoch2/5 | train_loss: 0.09545
Fold: 3, Epoch: 2/5, Step: end | val_loss: 0.0973, score: 0.44137


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: 80 | val_loss: 0.09844, score: 0.444
Fold: 3, Epoch: 3/5, Step: 160 | val_loss: 0.10196, score: 0.45226
Fold: 3, Epoch: 3/5, Step: 240 | val_loss: 0.10188, score: 0.45199
Fold: 3, Epoch: 3/5, Step: 320 | val_loss: 0.09986, score: 0.44706
Fold: 3, Epoch: 3/5, Step: 400 | val_loss: 0.1022, score: 0.45231
Fold3, Epoch3/5 | train_loss: 0.0771
Fold: 3, Epoch: 3/5, Step: end | val_loss: 0.09811, score: 0.44334


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: 80 | val_loss: 0.09835, score: 0.44379
Fold: 3, Epoch: 4/5, Step: 160 | val_loss: 0.09889, score: 0.44521
Fold: 3, Epoch: 4/5, Step: 240 | val_loss: 0.09856, score: 0.44431
Fold: 3, Epoch: 4/5, Step: 320 | val_loss: 0.09852, score: 0.44421
Fold: 3, Epoch: 4/5, Step: 400 | val_loss: 0.09846, score: 0.4441
Fold3, Epoch4/5 | train_loss: 0.06909
Fold: 3, Epoch: 4/5, Step: end | val_loss: 0.09847, score: 0.44412


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: 80 | val_loss: 0.18222, score: 0.61332
Fold: 4, Epoch: 0/5, Step: 160 | val_loss: 0.12932, score: 0.51089
Fold: 4, Epoch: 0/5, Step: 240 | val_loss: 0.1233, score: 0.49922
Fold: 4, Epoch: 0/5, Step: 320 | val_loss: 0.13359, score: 0.51885
Fold: 4, Epoch: 0/5, Step: 400 | val_loss: 0.11652, score: 0.48444
Fold4, Epoch0/5 | train_loss: 0.36682
Fold: 4, Epoch: 0/5, Step: end | val_loss: 0.12506, score: 0.50138


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: 80 | val_loss: 0.11526, score: 0.48143
Fold: 4, Epoch: 1/5, Step: 160 | val_loss: 0.10816, score: 0.46638
Fold: 4, Epoch: 1/5, Step: 240 | val_loss: 0.10878, score: 0.46784
Fold: 4, Epoch: 1/5, Step: 320 | val_loss: 0.10739, score: 0.46424
Fold: 4, Epoch: 1/5, Step: 400 | val_loss: 0.1183, score: 0.4878
Fold4, Epoch1/5 | train_loss: 0.10469
Fold: 4, Epoch: 1/5, Step: end | val_loss: 0.10699, score: 0.4642


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: 80 | val_loss: 0.10774, score: 0.46595
Fold: 4, Epoch: 2/5, Step: 160 | val_loss: 0.11117, score: 0.47333
Fold: 4, Epoch: 2/5, Step: 240 | val_loss: 0.1082, score: 0.46631
Fold: 4, Epoch: 2/5, Step: 320 | val_loss: 0.10507, score: 0.45953
Fold: 4, Epoch: 2/5, Step: 400 | val_loss: 0.10788, score: 0.46576
Fold4, Epoch2/5 | train_loss: 0.09272
Fold: 4, Epoch: 2/5, Step: end | val_loss: 0.11683, score: 0.48517


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: 80 | val_loss: 0.10911, score: 0.4679
Fold: 4, Epoch: 3/5, Step: 160 | val_loss: 0.10709, score: 0.46383
Fold: 4, Epoch: 3/5, Step: 240 | val_loss: 0.10813, score: 0.46632
Fold: 4, Epoch: 3/5, Step: 320 | val_loss: 0.10381, score: 0.45679
Fold: 4, Epoch: 3/5, Step: 400 | val_loss: 0.10516, score: 0.4599
Fold4, Epoch3/5 | train_loss: 0.08228
Fold: 4, Epoch: 3/5, Step: end | val_loss: 0.10573, score: 0.46121


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: 80 | val_loss: 0.10395, score: 0.45715
Fold: 4, Epoch: 4/5, Step: 160 | val_loss: 0.104, score: 0.45737
Fold: 4, Epoch: 4/5, Step: 240 | val_loss: 0.10353, score: 0.45622
Fold: 4, Epoch: 4/5, Step: 320 | val_loss: 0.10355, score: 0.45631
Fold: 4, Epoch: 4/5, Step: 400 | val_loss: 0.10343, score: 0.45602
Fold4, Epoch4/5 | train_loss: 0.07256
Fold: 4, Epoch: 4/5, Step: end | val_loss: 0.10344, score: 0.45603


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 0/5, Step: 80 | val_loss: 0.17845, score: 0.60429
Fold: 5, Epoch: 0/5, Step: 160 | val_loss: 0.13298, score: 0.51489
Fold: 5, Epoch: 0/5, Step: 240 | val_loss: 0.12104, score: 0.49211
Fold: 5, Epoch: 0/5, Step: 320 | val_loss: 0.12351, score: 0.49211
Fold: 5, Epoch: 0/5, Step: 400 | val_loss: 0.13857, score: 0.52405
Fold5, Epoch0/5 | train_loss: 0.3665
Fold: 5, Epoch: 0/5, Step: end | val_loss: 0.11351, score: 0.47759


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 1/5, Step: 80 | val_loss: 0.12284, score: 0.49545
Fold: 5, Epoch: 1/5, Step: 160 | val_loss: 0.11067, score: 0.47036
Fold: 5, Epoch: 1/5, Step: 240 | val_loss: 0.11667, score: 0.48362
Fold: 5, Epoch: 1/5, Step: 320 | val_loss: 0.10373, score: 0.45533
Fold: 5, Epoch: 1/5, Step: 400 | val_loss: 0.09972, score: 0.44677
Fold5, Epoch1/5 | train_loss: 0.11276
Fold: 5, Epoch: 1/5, Step: end | val_loss: 0.12868, score: 0.50765


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 2/5, Step: 80 | val_loss: 0.1147, score: 0.47873
Fold: 5, Epoch: 2/5, Step: 160 | val_loss: 0.10622, score: 0.46171
Fold: 5, Epoch: 2/5, Step: 240 | val_loss: 0.10012, score: 0.44724
Fold: 5, Epoch: 2/5, Step: 320 | val_loss: 0.10462, score: 0.45694
Fold: 5, Epoch: 2/5, Step: 400 | val_loss: 0.10557, score: 0.46012
Fold5, Epoch2/5 | train_loss: 0.0959
Fold: 5, Epoch: 2/5, Step: end | val_loss: 0.09711, score: 0.44094


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 3/5, Step: 80 | val_loss: 0.10025, score: 0.44813
Fold: 5, Epoch: 3/5, Step: 160 | val_loss: 0.09688, score: 0.44033
Fold: 5, Epoch: 3/5, Step: 240 | val_loss: 0.10373, score: 0.45647
Fold: 5, Epoch: 3/5, Step: 320 | val_loss: 0.09606, score: 0.43854
Fold: 5, Epoch: 3/5, Step: 400 | val_loss: 0.09712, score: 0.44083
Fold5, Epoch3/5 | train_loss: 0.07819
Fold: 5, Epoch: 3/5, Step: end | val_loss: 0.09602, score: 0.43842


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 4/5, Step: 80 | val_loss: 0.09723, score: 0.44127
Fold: 5, Epoch: 4/5, Step: 160 | val_loss: 0.09832, score: 0.44394
Fold: 5, Epoch: 4/5, Step: 240 | val_loss: 0.09612, score: 0.43866
Fold: 5, Epoch: 4/5, Step: 320 | val_loss: 0.09699, score: 0.4407
Fold: 5, Epoch: 4/5, Step: 400 | val_loss: 0.0966, score: 0.43982
Fold5, Epoch4/5 | train_loss: 0.07109
Fold: 5, Epoch: 4/5, Step: end | val_loss: 0.09663, score: 0.43989


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 0/5, Step: 80 | val_loss: 0.18307, score: 0.61085
Fold: 6, Epoch: 0/5, Step: 160 | val_loss: 0.11689, score: 0.48497
Fold: 6, Epoch: 0/5, Step: 240 | val_loss: 0.1456, score: 0.54189
Fold: 6, Epoch: 0/5, Step: 320 | val_loss: 0.1139, score: 0.47859
Fold: 6, Epoch: 0/5, Step: 400 | val_loss: 0.12445, score: 0.50004
Fold6, Epoch0/5 | train_loss: 0.36578
Fold: 6, Epoch: 0/5, Step: end | val_loss: 0.11421, score: 0.47925


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 1/5, Step: 80 | val_loss: 0.11128, score: 0.47309
Fold: 6, Epoch: 1/5, Step: 160 | val_loss: 0.11183, score: 0.4739
Fold: 6, Epoch: 1/5, Step: 240 | val_loss: 0.13185, score: 0.51491
Fold: 6, Epoch: 1/5, Step: 320 | val_loss: 0.13686, score: 0.52319
Fold: 6, Epoch: 1/5, Step: 400 | val_loss: 0.11889, score: 0.48826
Fold6, Epoch1/5 | train_loss: 0.10321
Fold: 6, Epoch: 1/5, Step: end | val_loss: 0.11417, score: 0.47923


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 2/5, Step: 80 | val_loss: 0.11244, score: 0.47532
Fold: 6, Epoch: 2/5, Step: 160 | val_loss: 0.1117, score: 0.47396
Fold: 6, Epoch: 2/5, Step: 240 | val_loss: 0.11446, score: 0.47957
Fold: 6, Epoch: 2/5, Step: 320 | val_loss: 0.11318, score: 0.47641
Fold: 6, Epoch: 2/5, Step: 400 | val_loss: 0.11018, score: 0.47036
Fold6, Epoch2/5 | train_loss: 0.09463
Fold: 6, Epoch: 2/5, Step: end | val_loss: 0.10932, score: 0.46851


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 3/5, Step: 80 | val_loss: 0.10841, score: 0.46674
Fold: 6, Epoch: 3/5, Step: 160 | val_loss: 0.11197, score: 0.47462
Fold: 6, Epoch: 3/5, Step: 240 | val_loss: 0.11012, score: 0.47065
Fold: 6, Epoch: 3/5, Step: 320 | val_loss: 0.10921, score: 0.46878
Fold: 6, Epoch: 3/5, Step: 400 | val_loss: 0.10871, score: 0.46748
Fold6, Epoch3/5 | train_loss: 0.07874
Fold: 6, Epoch: 3/5, Step: end | val_loss: 0.11006, score: 0.47057


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 4/5, Step: 80 | val_loss: 0.10914, score: 0.46863
Fold: 6, Epoch: 4/5, Step: 160 | val_loss: 0.11027, score: 0.47088
Fold: 6, Epoch: 4/5, Step: 240 | val_loss: 0.10872, score: 0.46758
Fold: 6, Epoch: 4/5, Step: 320 | val_loss: 0.10891, score: 0.46806
Fold: 6, Epoch: 4/5, Step: 400 | val_loss: 0.1089, score: 0.46802
Fold6, Epoch4/5 | train_loss: 0.07321
Fold: 6, Epoch: 4/5, Step: end | val_loss: 0.10891, score: 0.46804


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 0/5, Step: 80 | val_loss: 0.17002, score: 0.58737
Fold: 7, Epoch: 0/5, Step: 160 | val_loss: 0.12472, score: 0.50003
Fold: 7, Epoch: 0/5, Step: 240 | val_loss: 0.11165, score: 0.47343
Fold: 7, Epoch: 0/5, Step: 320 | val_loss: 0.1129, score: 0.4722
Fold: 7, Epoch: 0/5, Step: 400 | val_loss: 0.11425, score: 0.47791
Fold7, Epoch0/5 | train_loss: 0.37722
Fold: 7, Epoch: 0/5, Step: end | val_loss: 0.10621, score: 0.46161


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 1/5, Step: 80 | val_loss: 0.10338, score: 0.45512
Fold: 7, Epoch: 1/5, Step: 160 | val_loss: 0.10389, score: 0.45606
Fold: 7, Epoch: 1/5, Step: 240 | val_loss: 0.10588, score: 0.45976
Fold: 7, Epoch: 1/5, Step: 320 | val_loss: 0.096, score: 0.43837
Fold: 7, Epoch: 1/5, Step: 400 | val_loss: 0.09819, score: 0.44301
Fold7, Epoch1/5 | train_loss: 0.10454
Fold: 7, Epoch: 1/5, Step: end | val_loss: 0.10132, score: 0.4513


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 2/5, Step: 80 | val_loss: 0.09566, score: 0.43796
Fold: 7, Epoch: 2/5, Step: 160 | val_loss: 0.10489, score: 0.45903
Fold: 7, Epoch: 2/5, Step: 240 | val_loss: 0.09885, score: 0.44499
Fold: 7, Epoch: 2/5, Step: 320 | val_loss: 0.09754, score: 0.44219
Fold: 7, Epoch: 2/5, Step: 400 | val_loss: 0.09798, score: 0.44372
Fold7, Epoch2/5 | train_loss: 0.09432
Fold: 7, Epoch: 2/5, Step: end | val_loss: 0.09555, score: 0.43747


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 3/5, Step: 80 | val_loss: 0.09425, score: 0.4345
Fold: 7, Epoch: 3/5, Step: 160 | val_loss: 0.09437, score: 0.43448
Fold: 7, Epoch: 3/5, Step: 240 | val_loss: 0.09621, score: 0.4389
Fold: 7, Epoch: 3/5, Step: 320 | val_loss: 0.09415, score: 0.43413
Fold: 7, Epoch: 3/5, Step: 400 | val_loss: 0.09358, score: 0.43289
Fold7, Epoch3/5 | train_loss: 0.08279
Fold: 7, Epoch: 3/5, Step: end | val_loss: 0.09313, score: 0.43189


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 4/5, Step: 80 | val_loss: 0.09533, score: 0.43729
Fold: 7, Epoch: 4/5, Step: 160 | val_loss: 0.09358, score: 0.43308
Fold: 7, Epoch: 4/5, Step: 240 | val_loss: 0.09311, score: 0.43189
Fold: 7, Epoch: 4/5, Step: 320 | val_loss: 0.09342, score: 0.43267
Fold: 7, Epoch: 4/5, Step: 400 | val_loss: 0.09345, score: 0.43273
Fold7, Epoch4/5 | train_loss: 0.07533
Fold: 7, Epoch: 4/5, Step: end | val_loss: 0.09343, score: 0.4327


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 0/5, Step: 80 | val_loss: 0.17435, score: 0.5971
Fold: 8, Epoch: 0/5, Step: 160 | val_loss: 0.11682, score: 0.48487
Fold: 8, Epoch: 0/5, Step: 240 | val_loss: 0.11023, score: 0.47029
Fold: 8, Epoch: 0/5, Step: 320 | val_loss: 0.11747, score: 0.48595
Fold: 8, Epoch: 0/5, Step: 400 | val_loss: 0.11529, score: 0.47975
Fold8, Epoch0/5 | train_loss: 0.3236
Fold: 8, Epoch: 0/5, Step: end | val_loss: 0.12858, score: 0.50685


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 1/5, Step: 80 | val_loss: 0.10868, score: 0.46594
Fold: 8, Epoch: 1/5, Step: 160 | val_loss: 0.10289, score: 0.45406
Fold: 8, Epoch: 1/5, Step: 240 | val_loss: 0.1162, score: 0.48187
Fold: 8, Epoch: 1/5, Step: 320 | val_loss: 0.10811, score: 0.46517
Fold: 8, Epoch: 1/5, Step: 400 | val_loss: 0.11041, score: 0.47053
Fold8, Epoch1/5 | train_loss: 0.10453
Fold: 8, Epoch: 1/5, Step: end | val_loss: 0.10372, score: 0.45539


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 2/5, Step: 80 | val_loss: 0.10111, score: 0.44977
Fold: 8, Epoch: 2/5, Step: 160 | val_loss: 0.1057, score: 0.46022
Fold: 8, Epoch: 2/5, Step: 240 | val_loss: 0.10705, score: 0.46298
Fold: 8, Epoch: 2/5, Step: 320 | val_loss: 0.09984, score: 0.44736
Fold: 8, Epoch: 2/5, Step: 400 | val_loss: 0.11625, score: 0.48312
Fold8, Epoch2/5 | train_loss: 0.09496
Fold: 8, Epoch: 2/5, Step: end | val_loss: 0.1012, score: 0.45017


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 3/5, Step: 80 | val_loss: 0.10213, score: 0.45231
Fold: 8, Epoch: 3/5, Step: 160 | val_loss: 0.10085, score: 0.44947
Fold: 8, Epoch: 3/5, Step: 240 | val_loss: 0.10207, score: 0.45204
Fold: 8, Epoch: 3/5, Step: 320 | val_loss: 0.09864, score: 0.44424
Fold: 8, Epoch: 3/5, Step: 400 | val_loss: 0.0995, score: 0.44654
Fold8, Epoch3/5 | train_loss: 0.08304
Fold: 8, Epoch: 3/5, Step: end | val_loss: 0.10329, score: 0.45509


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 4/5, Step: 80 | val_loss: 0.09802, score: 0.44284
Fold: 8, Epoch: 4/5, Step: 160 | val_loss: 0.0979, score: 0.44256
Fold: 8, Epoch: 4/5, Step: 240 | val_loss: 0.09794, score: 0.44262
Fold: 8, Epoch: 4/5, Step: 320 | val_loss: 0.09857, score: 0.44411
Fold: 8, Epoch: 4/5, Step: 400 | val_loss: 0.09812, score: 0.44306
Fold8, Epoch4/5 | train_loss: 0.0747
Fold: 8, Epoch: 4/5, Step: end | val_loss: 0.09812, score: 0.44308


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 0/5, Step: 80 | val_loss: 0.16151, score: 0.57372
Fold: 9, Epoch: 0/5, Step: 160 | val_loss: 0.12516, score: 0.5021
Fold: 9, Epoch: 0/5, Step: 240 | val_loss: 0.115, score: 0.48107
Fold: 9, Epoch: 0/5, Step: 320 | val_loss: 0.12268, score: 0.49567
Fold: 9, Epoch: 0/5, Step: 400 | val_loss: 0.1118, score: 0.47294
Fold9, Epoch0/5 | train_loss: 0.37006
Fold: 9, Epoch: 0/5, Step: end | val_loss: 0.11669, score: 0.48348


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 1/5, Step: 80 | val_loss: 0.11464, score: 0.47928
Fold: 9, Epoch: 1/5, Step: 160 | val_loss: 0.11044, score: 0.47015
Fold: 9, Epoch: 1/5, Step: 240 | val_loss: 0.1119, score: 0.47223
Fold: 9, Epoch: 1/5, Step: 320 | val_loss: 0.11774, score: 0.48466
Fold: 9, Epoch: 1/5, Step: 400 | val_loss: 0.11312, score: 0.47594
Fold9, Epoch1/5 | train_loss: 0.10438
Fold: 9, Epoch: 1/5, Step: end | val_loss: 0.10893, score: 0.46623


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 2/5, Step: 80 | val_loss: 0.11319, score: 0.475
Fold: 9, Epoch: 2/5, Step: 160 | val_loss: 0.10602, score: 0.46031
Fold: 9, Epoch: 2/5, Step: 240 | val_loss: 0.10665, score: 0.46196
Fold: 9, Epoch: 2/5, Step: 320 | val_loss: 0.1165, score: 0.48294
Fold: 9, Epoch: 2/5, Step: 400 | val_loss: 0.10676, score: 0.4621
Fold9, Epoch2/5 | train_loss: 0.09484
Fold: 9, Epoch: 2/5, Step: end | val_loss: 0.11277, score: 0.47464


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 3/5, Step: 80 | val_loss: 0.11059, score: 0.47056
Fold: 9, Epoch: 3/5, Step: 160 | val_loss: 0.10703, score: 0.46209
Fold: 9, Epoch: 3/5, Step: 240 | val_loss: 0.10613, score: 0.46079
Fold: 9, Epoch: 3/5, Step: 320 | val_loss: 0.10611, score: 0.46077
Fold: 9, Epoch: 3/5, Step: 400 | val_loss: 0.10602, score: 0.46035
Fold9, Epoch3/5 | train_loss: 0.08242
Fold: 9, Epoch: 3/5, Step: end | val_loss: 0.106, score: 0.46049


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 4/5, Step: 80 | val_loss: 0.10543, score: 0.45911
Fold: 9, Epoch: 4/5, Step: 160 | val_loss: 0.10672, score: 0.46184
Fold: 9, Epoch: 4/5, Step: 240 | val_loss: 0.10492, score: 0.45799
Fold: 9, Epoch: 4/5, Step: 320 | val_loss: 0.10461, score: 0.4573
Fold: 9, Epoch: 4/5, Step: 400 | val_loss: 0.10459, score: 0.45724
Fold9, Epoch4/5 | train_loss: 0.07357
Fold: 9, Epoch: 4/5, Step: end | val_loss: 0.10459, score: 0.45725
fold score： [0.4447257040634322, 0.4577677237281166, 0.4565376531477393, 0.44136759413379656, 0.45602312806741957, 0.4384206619844927, 0.4667427754558301, 0.4318870110472975, 0.4425581029318823, 0.45724036317789374]
CV: 0.4496
Starting upload for file tokenizer.tar


100%|██████████| 10.6M/10.6M [00:04<00:00, 2.49MB/s]


Upload successful: tokenizer.tar (11MB)
Starting upload for file preds.tar


100%|██████████| 180k/180k [00:03<00:00, 58.6kB/s]


Upload successful: preds.tar (180KB)
Starting upload for file model.tar


  6%|▌         | 1.00G/16.2G [01:05<16:28, 16.5MB/s]  


Upload unsuccessful: model.tar


In [15]:
if cfg.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

Starting upload for file tokenizer.tar


100%|██████████| 10.6M/10.6M [00:03<00:00, 2.88MB/s]


Upload successful: tokenizer.tar (11MB)
Starting upload for file preds.tar


100%|██████████| 180k/180k [00:02<00:00, 82.9kB/s]


Upload successful: preds.tar (180KB)
Starting upload for file model.tar


100%|██████████| 16.2G/16.2G [21:05<00:00, 13.8MB/s]  


Upload successful: model.tar (16GB)
Starting upload for file fig.tar


ReadTimeoutError: HTTPSConnectionPool(host='www.kaggle.com', port=443): Read timed out. (read timeout=None)