In [1]:
! nvidia-smi

Mon Nov 21 06:14:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
| 45%   71C    P2   354W / 480W |   8450MiB / 24564MiB |     96%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os

class Config:
    AUTHOR = "shu421"

    EXP = "exp161"
    MODEL_PATH = "roberta-base"
    DATASET_PATH = []

    COMPETITION = "feedback-prize-english-language-learning"
    BASE_PATH = '/root/feedback3/'

    api_path = "/root/.kaggle/kaggle.json"

    apex=True
    seed = 42
    num_fold = 10
    trn_fold = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    batch_size = 8
    n_epochs = 5
    max_len = 512
    target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    
    weight_decay = 0.01
    scheduler='cosine'
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 80
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": False
    }

    upload_from_colab = True

In [3]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error

! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# ! pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

from kaggle.api.kaggle_api_extended import KaggleApi

[0m

In [4]:
def setup(cfg):
    # cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']

    # set dirs
    cfg.INPUT = os.path.join(cfg.BASE_PATH, 'input')
    cfg.OUTPUT = os.path.join(cfg.BASE_PATH, 'output')
    cfg.SUBMISSION = os.path.join(cfg.BASE_PATH, 'submission')
    cfg.DATASET = os.path.join(cfg.BASE_PATH, 'dataset')

    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # make dirs
    for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    
    if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
        # load dataset
        ! pip install --upgrade --force-reinstall --no-deps kaggle
        ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
        filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
        ! unzip -d $cfg.INPUT $filepath
        
    
    for path in cfg.DATASET_PATH:
        datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
        if not os.path.exists(datasetpath):
            os.makedirs(datasetpath, exist_ok=True)
            ! kaggle datasets download $path -p $datasetpath
            filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
            ! unzip -d $datasetpath $filepath
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [5]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_multilabelstratifiedkfold(train, target_col, n_splits, seed):
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

In [6]:
def mcrmse(cfg, preds, df):
    all_score = 0
    for i, column in enumerate(cfg.target_list):
        score = np.sqrt(mean_squared_error(preds[:, i], df[column]))
        all_score += score/len(cfg.target_list)
    return all_score

In [7]:
# 文章のバグを治す
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [8]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['full_text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

# dataset
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.text[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [9]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )
        
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.model = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.transformer.encoder.layer[:8])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels):
        # batch, hidden_size
        feature = self.feature(inputs)
        output = self.fc(self.ln(feature))
        if labels is not None:
            loss_fct = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

In [10]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

# initialize layer
def reinit_bert(model):
    for layer in model.model.encoder.layer[-1:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    return model

def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

In [11]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [12]:
def valid_fn(cfg, valid_loader, model, valid_df, fold, epoch, step, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        # with tqdm(valid_loader, total=len(valid_loader)) as pbar:
        # for (inputs, labels) in pbar:
        for (inputs, labels) in valid_loader:
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            with autocast():
                loss, output = model(inputs, labels)
            
            output = output.detach().cpu().numpy()
            val_preds.append(output)
            val_losses.append(loss.item() * len(labels))
            val_nums.append(len(labels))
                # pbar.set_postfix({
                #     'val_loss': loss.item()
                # })

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    # val_log = {
    #     'val_loss': val_loss,
    #     'mcrmse': score
    # }
    # display(val_log)
    print(f'Fold: {fold}, Epoch: {epoch}/{cfg.n_epochs}, Step: {step} | val_loss: {np.round(val_loss, 5)}, score: {np.round(score, 5)}')

    if best_val_score > score:
        # print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(), 
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
    
    return best_val_preds, best_val_score

def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 6), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        print('='*30, f'Fold{fold}', '='*30)
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        # model = reinit_bert(model)
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)

        # enable FGM
        fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            print('='*20, f'epoch{epoch}', '='*20)
            train_losses = []
            train_nums = []
            model.train() 
            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex):
                        loss, output = model(inputs, labels)

                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                     # FGM attack
                    fgm.attack()
                    with autocast(enabled=cfg.apex):
                        loss_adv, _ = model(inputs, labels)
                    scaler.scale(loss_adv).backward()
                    fgm.restore()
                    
                    if cfg.clip_grad_norm is not None:
                        # scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(),
                            cfg.clip_grad_norm
                        )
                        
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        best_val_preds, best_val_score = valid_fn(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            epoch,
                            step,
                            best_val_preds,
                            best_val_score,
                        )
                        model.train()

            train_loss = sum(train_losses)/sum(train_nums)
            # train_log = {
            #     'train_loss':train_loss
            # }
            # display(train_log)

            # evaluating(epoch)
            print(f'Fold{fold}, Epoch{epoch}/{cfg.n_epochs} | train_loss: {np.round(train_loss, 5)}')
            best_val_preds, best_val_score = valid_fn(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                epoch,
                'end',
                best_val_preds,
                best_val_score,
            )

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model; gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    print('fold score：', fold_score)
    print('CV:', round(score, 4))
    return score

In [13]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
from transformers import logging
logging.set_verbosity_warning()
logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true


print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

train = processing_features(train)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

score = training(cfg, train)

if cfg.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

env: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 0/5, Step: 80 | val_loss: 0.22455, score: 0.68369
Fold: 0, Epoch: 0/5, Step: 160 | val_loss: 0.12314, score: 0.49754
Fold: 0, Epoch: 0/5, Step: 240 | val_loss: 0.11845, score: 0.48683
Fold: 0, Epoch: 0/5, Step: 320 | val_loss: 0.11029, score: 0.47021
Fold: 0, Epoch: 0/5, Step: 400 | val_loss: 0.11004, score: 0.46973
Fold0, Epoch0/5 | train_loss: 0.43658
Fold: 0, Epoch: 0/5, Step: end | val_loss: 0.15093, score: 0.54688


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 1/5, Step: 80 | val_loss: 0.10592, score: 0.46059
Fold: 0, Epoch: 1/5, Step: 160 | val_loss: 0.10481, score: 0.45832
Fold: 0, Epoch: 1/5, Step: 240 | val_loss: 0.10466, score: 0.45782
Fold: 0, Epoch: 1/5, Step: 320 | val_loss: 0.11094, score: 0.47187
Fold: 0, Epoch: 1/5, Step: 400 | val_loss: 0.10599, score: 0.46093
Fold0, Epoch1/5 | train_loss: 0.10455
Fold: 0, Epoch: 1/5, Step: end | val_loss: 0.10525, score: 0.45882


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 2/5, Step: 80 | val_loss: 0.10912, score: 0.4677
Fold: 0, Epoch: 2/5, Step: 160 | val_loss: 0.10641, score: 0.4613
Fold: 0, Epoch: 2/5, Step: 240 | val_loss: 0.10614, score: 0.4609
Fold: 0, Epoch: 2/5, Step: 320 | val_loss: 0.10715, score: 0.46344
Fold: 0, Epoch: 2/5, Step: 400 | val_loss: 0.10708, score: 0.46346
Fold0, Epoch2/5 | train_loss: 0.09324
Fold: 0, Epoch: 2/5, Step: end | val_loss: 0.10405, score: 0.45648


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 3/5, Step: 80 | val_loss: 0.10427, score: 0.4571
Fold: 0, Epoch: 3/5, Step: 160 | val_loss: 0.10504, score: 0.45895
Fold: 0, Epoch: 3/5, Step: 240 | val_loss: 0.1066, score: 0.46215
Fold: 0, Epoch: 3/5, Step: 320 | val_loss: 0.10507, score: 0.45877
Fold: 0, Epoch: 3/5, Step: 400 | val_loss: 0.10478, score: 0.45817
Fold0, Epoch3/5 | train_loss: 0.08521
Fold: 0, Epoch: 3/5, Step: end | val_loss: 0.10546, score: 0.45959


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 0, Epoch: 4/5, Step: 80 | val_loss: 0.10598, score: 0.46077
Fold: 0, Epoch: 4/5, Step: 160 | val_loss: 0.1055, score: 0.45965
Fold: 0, Epoch: 4/5, Step: 240 | val_loss: 0.10607, score: 0.46096
Fold: 0, Epoch: 4/5, Step: 320 | val_loss: 0.10594, score: 0.46068
Fold: 0, Epoch: 4/5, Step: 400 | val_loss: 0.10554, score: 0.45981
Fold0, Epoch4/5 | train_loss: 0.07976
Fold: 0, Epoch: 4/5, Step: end | val_loss: 0.10553, score: 0.45979


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 0/5, Step: 80 | val_loss: 0.2083, score: 0.66158
Fold: 1, Epoch: 0/5, Step: 160 | val_loss: 0.13147, score: 0.51616
Fold: 1, Epoch: 0/5, Step: 240 | val_loss: 0.13484, score: 0.52195
Fold: 1, Epoch: 0/5, Step: 320 | val_loss: 0.13619, score: 0.52256
Fold: 1, Epoch: 0/5, Step: 400 | val_loss: 0.12058, score: 0.49345
Fold1, Epoch0/5 | train_loss: 0.4218
Fold: 1, Epoch: 0/5, Step: end | val_loss: 0.12353, score: 0.49928


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 1/5, Step: 80 | val_loss: 0.11076, score: 0.47199
Fold: 1, Epoch: 1/5, Step: 160 | val_loss: 0.11021, score: 0.47102
Fold: 1, Epoch: 1/5, Step: 240 | val_loss: 0.10998, score: 0.47045
Fold: 1, Epoch: 1/5, Step: 320 | val_loss: 0.10738, score: 0.46508
Fold: 1, Epoch: 1/5, Step: 400 | val_loss: 0.10957, score: 0.47019
Fold1, Epoch1/5 | train_loss: 0.10123
Fold: 1, Epoch: 1/5, Step: end | val_loss: 0.11728, score: 0.48508


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 2/5, Step: 80 | val_loss: 0.10825, score: 0.46706
Fold: 1, Epoch: 2/5, Step: 160 | val_loss: 0.10766, score: 0.46571
Fold: 1, Epoch: 2/5, Step: 240 | val_loss: 0.10879, score: 0.46854
Fold: 1, Epoch: 2/5, Step: 320 | val_loss: 0.1071, score: 0.46461
Fold: 1, Epoch: 2/5, Step: 400 | val_loss: 0.1082, score: 0.46706
Fold1, Epoch2/5 | train_loss: 0.09256
Fold: 1, Epoch: 2/5, Step: end | val_loss: 0.11018, score: 0.47142


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 3/5, Step: 80 | val_loss: 0.10716, score: 0.4645
Fold: 1, Epoch: 3/5, Step: 160 | val_loss: 0.10732, score: 0.46485
Fold: 1, Epoch: 3/5, Step: 240 | val_loss: 0.10792, score: 0.46615
Fold: 1, Epoch: 3/5, Step: 320 | val_loss: 0.10815, score: 0.46706
Fold: 1, Epoch: 3/5, Step: 400 | val_loss: 0.10658, score: 0.4634
Fold1, Epoch3/5 | train_loss: 0.08787
Fold: 1, Epoch: 3/5, Step: end | val_loss: 0.10672, score: 0.46363


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 1, Epoch: 4/5, Step: 80 | val_loss: 0.10678, score: 0.46385
Fold: 1, Epoch: 4/5, Step: 160 | val_loss: 0.10718, score: 0.46471
Fold: 1, Epoch: 4/5, Step: 240 | val_loss: 0.10658, score: 0.46328
Fold: 1, Epoch: 4/5, Step: 320 | val_loss: 0.10664, score: 0.46344
Fold: 1, Epoch: 4/5, Step: 400 | val_loss: 0.10662, score: 0.46339
Fold1, Epoch4/5 | train_loss: 0.08346
Fold: 1, Epoch: 4/5, Step: end | val_loss: 0.10662, score: 0.46339


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 0/5, Step: 80 | val_loss: 0.31067, score: 0.81451
Fold: 2, Epoch: 0/5, Step: 160 | val_loss: 0.16412, score: 0.57698
Fold: 2, Epoch: 0/5, Step: 240 | val_loss: 0.1167, score: 0.48459
Fold: 2, Epoch: 0/5, Step: 320 | val_loss: 0.147, score: 0.54441
Fold: 2, Epoch: 0/5, Step: 400 | val_loss: 0.11591, score: 0.48412
Fold2, Epoch0/5 | train_loss: 0.55046
Fold: 2, Epoch: 0/5, Step: end | val_loss: 0.13165, score: 0.51411


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 1/5, Step: 80 | val_loss: 0.11874, score: 0.48911
Fold: 2, Epoch: 1/5, Step: 160 | val_loss: 0.1105, score: 0.47215
Fold: 2, Epoch: 1/5, Step: 240 | val_loss: 0.12476, score: 0.4997
Fold: 2, Epoch: 1/5, Step: 320 | val_loss: 0.10856, score: 0.46747
Fold: 2, Epoch: 1/5, Step: 400 | val_loss: 0.10792, score: 0.4662
Fold2, Epoch1/5 | train_loss: 0.1176
Fold: 2, Epoch: 1/5, Step: end | val_loss: 0.11603, score: 0.48287


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 2/5, Step: 80 | val_loss: 0.1082, score: 0.4663
Fold: 2, Epoch: 2/5, Step: 160 | val_loss: 0.10782, score: 0.46556
Fold: 2, Epoch: 2/5, Step: 240 | val_loss: 0.10553, score: 0.46067
Fold: 2, Epoch: 2/5, Step: 320 | val_loss: 0.10475, score: 0.45897
Fold: 2, Epoch: 2/5, Step: 400 | val_loss: 0.10738, score: 0.46463
Fold2, Epoch2/5 | train_loss: 0.09051
Fold: 2, Epoch: 2/5, Step: end | val_loss: 0.10483, score: 0.45906


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 3/5, Step: 80 | val_loss: 0.10488, score: 0.45931
Fold: 2, Epoch: 3/5, Step: 160 | val_loss: 0.10522, score: 0.46005
Fold: 2, Epoch: 3/5, Step: 240 | val_loss: 0.10488, score: 0.45919
Fold: 2, Epoch: 3/5, Step: 320 | val_loss: 0.10534, score: 0.46038
Fold: 2, Epoch: 3/5, Step: 400 | val_loss: 0.10523, score: 0.46012
Fold2, Epoch3/5 | train_loss: 0.08159
Fold: 2, Epoch: 3/5, Step: end | val_loss: 0.10567, score: 0.46087


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 2, Epoch: 4/5, Step: 80 | val_loss: 0.105, score: 0.45943
Fold: 2, Epoch: 4/5, Step: 160 | val_loss: 0.10524, score: 0.45997
Fold: 2, Epoch: 4/5, Step: 240 | val_loss: 0.10492, score: 0.45929
Fold: 2, Epoch: 4/5, Step: 320 | val_loss: 0.10493, score: 0.4593
Fold: 2, Epoch: 4/5, Step: 400 | val_loss: 0.10499, score: 0.45943
Fold2, Epoch4/5 | train_loss: 0.0783
Fold: 2, Epoch: 4/5, Step: end | val_loss: 0.10498, score: 0.45941


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 0/5, Step: 80 | val_loss: 0.23532, score: 0.70164
Fold: 3, Epoch: 0/5, Step: 160 | val_loss: 0.1297, score: 0.51014
Fold: 3, Epoch: 0/5, Step: 240 | val_loss: 0.12227, score: 0.49496
Fold: 3, Epoch: 0/5, Step: 320 | val_loss: 0.17315, score: 0.58817
Fold: 3, Epoch: 0/5, Step: 400 | val_loss: 0.11847, score: 0.48737
Fold3, Epoch0/5 | train_loss: 0.45499
Fold: 3, Epoch: 0/5, Step: end | val_loss: 0.11044, score: 0.47029


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 1/5, Step: 80 | val_loss: 0.1086, score: 0.46602
Fold: 3, Epoch: 1/5, Step: 160 | val_loss: 0.10719, score: 0.46308
Fold: 3, Epoch: 1/5, Step: 240 | val_loss: 0.10572, score: 0.45985
Fold: 3, Epoch: 1/5, Step: 320 | val_loss: 0.11312, score: 0.4755
Fold: 3, Epoch: 1/5, Step: 400 | val_loss: 0.10411, score: 0.45624
Fold3, Epoch1/5 | train_loss: 0.1033
Fold: 3, Epoch: 1/5, Step: end | val_loss: 0.12673, score: 0.50496


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 2/5, Step: 80 | val_loss: 0.1048, score: 0.45807
Fold: 3, Epoch: 2/5, Step: 160 | val_loss: 0.11765, score: 0.48529
Fold: 3, Epoch: 2/5, Step: 240 | val_loss: 0.10525, score: 0.4589
Fold: 3, Epoch: 2/5, Step: 320 | val_loss: 0.10392, score: 0.45595
Fold: 3, Epoch: 2/5, Step: 400 | val_loss: 0.10371, score: 0.45568
Fold3, Epoch2/5 | train_loss: 0.09336
Fold: 3, Epoch: 2/5, Step: end | val_loss: 0.10599, score: 0.46041


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 3/5, Step: 80 | val_loss: 0.10152, score: 0.45048
Fold: 3, Epoch: 3/5, Step: 160 | val_loss: 0.10216, score: 0.45212
Fold: 3, Epoch: 3/5, Step: 240 | val_loss: 0.10199, score: 0.45176
Fold: 3, Epoch: 3/5, Step: 320 | val_loss: 0.10251, score: 0.45301
Fold: 3, Epoch: 3/5, Step: 400 | val_loss: 0.10191, score: 0.45163
Fold3, Epoch3/5 | train_loss: 0.0866
Fold: 3, Epoch: 3/5, Step: end | val_loss: 0.10165, score: 0.4512


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 3, Epoch: 4/5, Step: 80 | val_loss: 0.10124, score: 0.45001
Fold: 3, Epoch: 4/5, Step: 160 | val_loss: 0.10129, score: 0.45022
Fold: 3, Epoch: 4/5, Step: 240 | val_loss: 0.10106, score: 0.44971
Fold: 3, Epoch: 4/5, Step: 320 | val_loss: 0.10106, score: 0.4497
Fold: 3, Epoch: 4/5, Step: 400 | val_loss: 0.1012, score: 0.45
Fold3, Epoch4/5 | train_loss: 0.0816
Fold: 3, Epoch: 4/5, Step: end | val_loss: 0.1012, score: 0.45001


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 0/5, Step: 80 | val_loss: 0.36694, score: 0.89769
Fold: 4, Epoch: 0/5, Step: 160 | val_loss: 0.15915, score: 0.5703
Fold: 4, Epoch: 0/5, Step: 240 | val_loss: 0.13684, score: 0.52555
Fold: 4, Epoch: 0/5, Step: 320 | val_loss: 0.12017, score: 0.49198
Fold: 4, Epoch: 0/5, Step: 400 | val_loss: 0.14854, score: 0.54918
Fold4, Epoch0/5 | train_loss: 0.49157
Fold: 4, Epoch: 0/5, Step: end | val_loss: 0.13125, score: 0.51364


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 1/5, Step: 80 | val_loss: 0.11241, score: 0.47581
Fold: 4, Epoch: 1/5, Step: 160 | val_loss: 0.11595, score: 0.48303
Fold: 4, Epoch: 1/5, Step: 240 | val_loss: 0.10988, score: 0.46989
Fold: 4, Epoch: 1/5, Step: 320 | val_loss: 0.1088, score: 0.46812
Fold: 4, Epoch: 1/5, Step: 400 | val_loss: 0.1079, score: 0.46562
Fold4, Epoch1/5 | train_loss: 0.1043
Fold: 4, Epoch: 1/5, Step: end | val_loss: 0.11384, score: 0.47864


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 2/5, Step: 80 | val_loss: 0.12932, score: 0.51088
Fold: 4, Epoch: 2/5, Step: 160 | val_loss: 0.1119, score: 0.47409
Fold: 4, Epoch: 2/5, Step: 240 | val_loss: 0.11131, score: 0.47251
Fold: 4, Epoch: 2/5, Step: 320 | val_loss: 0.11889, score: 0.48946
Fold: 4, Epoch: 2/5, Step: 400 | val_loss: 0.10821, score: 0.46649
Fold4, Epoch2/5 | train_loss: 0.09991
Fold: 4, Epoch: 2/5, Step: end | val_loss: 0.1079, score: 0.46529


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 3/5, Step: 80 | val_loss: 0.11167, score: 0.4739
Fold: 4, Epoch: 3/5, Step: 160 | val_loss: 0.10688, score: 0.46339
Fold: 4, Epoch: 3/5, Step: 240 | val_loss: 0.1209, score: 0.49359
Fold: 4, Epoch: 3/5, Step: 320 | val_loss: 0.10467, score: 0.45845
Fold: 4, Epoch: 3/5, Step: 400 | val_loss: 0.10756, score: 0.46499
Fold4, Epoch3/5 | train_loss: 0.08528
Fold: 4, Epoch: 3/5, Step: end | val_loss: 0.10454, score: 0.45825


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 4, Epoch: 4/5, Step: 80 | val_loss: 0.10371, score: 0.45625
Fold: 4, Epoch: 4/5, Step: 160 | val_loss: 0.104, score: 0.45701
Fold: 4, Epoch: 4/5, Step: 240 | val_loss: 0.10401, score: 0.45708
Fold: 4, Epoch: 4/5, Step: 320 | val_loss: 0.1031, score: 0.45494
Fold: 4, Epoch: 4/5, Step: 400 | val_loss: 0.10294, score: 0.45456
Fold4, Epoch4/5 | train_loss: 0.07645
Fold: 4, Epoch: 4/5, Step: end | val_loss: 0.10294, score: 0.45457


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 0/5, Step: 80 | val_loss: 0.26361, score: 0.7474
Fold: 5, Epoch: 0/5, Step: 160 | val_loss: 0.14195, score: 0.53453
Fold: 5, Epoch: 0/5, Step: 240 | val_loss: 0.13549, score: 0.51973
Fold: 5, Epoch: 0/5, Step: 320 | val_loss: 0.14832, score: 0.54585
Fold: 5, Epoch: 0/5, Step: 400 | val_loss: 0.12641, score: 0.50141
Fold5, Epoch0/5 | train_loss: 0.48825
Fold: 5, Epoch: 0/5, Step: end | val_loss: 0.14197, score: 0.53362


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 1/5, Step: 80 | val_loss: 0.1123, score: 0.47294
Fold: 5, Epoch: 1/5, Step: 160 | val_loss: 0.11869, score: 0.4891
Fold: 5, Epoch: 1/5, Step: 240 | val_loss: 0.1325, score: 0.51504
Fold: 5, Epoch: 1/5, Step: 320 | val_loss: 0.12008, score: 0.49027
Fold: 5, Epoch: 1/5, Step: 400 | val_loss: 0.10746, score: 0.46402
Fold5, Epoch1/5 | train_loss: 0.11311
Fold: 5, Epoch: 1/5, Step: end | val_loss: 0.10821, score: 0.46518


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 2/5, Step: 80 | val_loss: 0.10379, score: 0.45608
Fold: 5, Epoch: 2/5, Step: 160 | val_loss: 0.1033, score: 0.45512
Fold: 5, Epoch: 2/5, Step: 240 | val_loss: 0.1013, score: 0.45042
Fold: 5, Epoch: 2/5, Step: 320 | val_loss: 0.10174, score: 0.45144
Fold: 5, Epoch: 2/5, Step: 400 | val_loss: 0.10413, score: 0.45702
Fold5, Epoch2/5 | train_loss: 0.08803
Fold: 5, Epoch: 2/5, Step: end | val_loss: 0.10213, score: 0.45245


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 3/5, Step: 80 | val_loss: 0.10116, score: 0.4501
Fold: 5, Epoch: 3/5, Step: 160 | val_loss: 0.10151, score: 0.45095
Fold: 5, Epoch: 3/5, Step: 240 | val_loss: 0.10178, score: 0.45163
Fold: 5, Epoch: 3/5, Step: 320 | val_loss: 0.10194, score: 0.45202
Fold: 5, Epoch: 3/5, Step: 400 | val_loss: 0.10432, score: 0.45758
Fold5, Epoch3/5 | train_loss: 0.07713
Fold: 5, Epoch: 3/5, Step: end | val_loss: 0.10152, score: 0.451


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 5, Epoch: 4/5, Step: 80 | val_loss: 0.10247, score: 0.45329
Fold: 5, Epoch: 4/5, Step: 160 | val_loss: 0.1023, score: 0.45286
Fold: 5, Epoch: 4/5, Step: 240 | val_loss: 0.10174, score: 0.45152
Fold: 5, Epoch: 4/5, Step: 320 | val_loss: 0.10146, score: 0.45085
Fold: 5, Epoch: 4/5, Step: 400 | val_loss: 0.10141, score: 0.45072
Fold5, Epoch4/5 | train_loss: 0.07419
Fold: 5, Epoch: 4/5, Step: end | val_loss: 0.10141, score: 0.45073


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 0/5, Step: 80 | val_loss: 0.21172, score: 0.66475
Fold: 6, Epoch: 0/5, Step: 160 | val_loss: 0.14695, score: 0.54556
Fold: 6, Epoch: 0/5, Step: 240 | val_loss: 0.15031, score: 0.55108
Fold: 6, Epoch: 0/5, Step: 320 | val_loss: 0.14071, score: 0.53066
Fold: 6, Epoch: 0/5, Step: 400 | val_loss: 0.13314, score: 0.51628
Fold6, Epoch0/5 | train_loss: 0.48923
Fold: 6, Epoch: 0/5, Step: end | val_loss: 0.13954, score: 0.5309


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 1/5, Step: 80 | val_loss: 0.11316, score: 0.47691
Fold: 6, Epoch: 1/5, Step: 160 | val_loss: 0.11436, score: 0.47937
Fold: 6, Epoch: 1/5, Step: 240 | val_loss: 0.11169, score: 0.47379
Fold: 6, Epoch: 1/5, Step: 320 | val_loss: 0.11528, score: 0.48144
Fold: 6, Epoch: 1/5, Step: 400 | val_loss: 0.11469, score: 0.48017
Fold6, Epoch1/5 | train_loss: 0.10094
Fold: 6, Epoch: 1/5, Step: end | val_loss: 0.11191, score: 0.47435


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 2/5, Step: 80 | val_loss: 0.11021, score: 0.47066
Fold: 6, Epoch: 2/5, Step: 160 | val_loss: 0.1092, score: 0.46829
Fold: 6, Epoch: 2/5, Step: 240 | val_loss: 0.1161, score: 0.48361
Fold: 6, Epoch: 2/5, Step: 320 | val_loss: 0.11372, score: 0.47844
Fold: 6, Epoch: 2/5, Step: 400 | val_loss: 0.1163, score: 0.48391
Fold6, Epoch2/5 | train_loss: 0.09201
Fold: 6, Epoch: 2/5, Step: end | val_loss: 0.11103, score: 0.47256


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 3/5, Step: 80 | val_loss: 0.11094, score: 0.47239
Fold: 6, Epoch: 3/5, Step: 160 | val_loss: 0.10806, score: 0.46588
Fold: 6, Epoch: 3/5, Step: 240 | val_loss: 0.10783, score: 0.46535
Fold: 6, Epoch: 3/5, Step: 320 | val_loss: 0.10874, score: 0.46738
Fold: 6, Epoch: 3/5, Step: 400 | val_loss: 0.10808, score: 0.46604
Fold6, Epoch3/5 | train_loss: 0.08361
Fold: 6, Epoch: 3/5, Step: end | val_loss: 0.10911, score: 0.46817


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 6, Epoch: 4/5, Step: 80 | val_loss: 0.10816, score: 0.46618
Fold: 6, Epoch: 4/5, Step: 160 | val_loss: 0.10874, score: 0.46746
Fold: 6, Epoch: 4/5, Step: 240 | val_loss: 0.10889, score: 0.46778
Fold: 6, Epoch: 4/5, Step: 320 | val_loss: 0.10917, score: 0.46842
Fold: 6, Epoch: 4/5, Step: 400 | val_loss: 0.10894, score: 0.46789
Fold6, Epoch4/5 | train_loss: 0.0783
Fold: 6, Epoch: 4/5, Step: end | val_loss: 0.10894, score: 0.46789


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 0/5, Step: 80 | val_loss: 0.23192, score: 0.69175
Fold: 7, Epoch: 0/5, Step: 160 | val_loss: 0.12152, score: 0.49366
Fold: 7, Epoch: 0/5, Step: 240 | val_loss: 0.12238, score: 0.49625
Fold: 7, Epoch: 0/5, Step: 320 | val_loss: 0.12638, score: 0.50527
Fold: 7, Epoch: 0/5, Step: 400 | val_loss: 0.11289, score: 0.47505
Fold7, Epoch0/5 | train_loss: 0.44996
Fold: 7, Epoch: 0/5, Step: end | val_loss: 0.11434, score: 0.47906


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 1/5, Step: 80 | val_loss: 0.10284, score: 0.45425
Fold: 7, Epoch: 1/5, Step: 160 | val_loss: 0.12097, score: 0.4939
Fold: 7, Epoch: 1/5, Step: 240 | val_loss: 0.10838, score: 0.46637
Fold: 7, Epoch: 1/5, Step: 320 | val_loss: 0.14207, score: 0.53408
Fold: 7, Epoch: 1/5, Step: 400 | val_loss: 0.11654, score: 0.48163
Fold7, Epoch1/5 | train_loss: 0.1173
Fold: 7, Epoch: 1/5, Step: end | val_loss: 0.10435, score: 0.4572


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 2/5, Step: 80 | val_loss: 0.09957, score: 0.44659
Fold: 7, Epoch: 2/5, Step: 160 | val_loss: 0.10116, score: 0.45011
Fold: 7, Epoch: 2/5, Step: 240 | val_loss: 0.1025, score: 0.45307
Fold: 7, Epoch: 2/5, Step: 320 | val_loss: 0.1006, score: 0.4491
Fold: 7, Epoch: 2/5, Step: 400 | val_loss: 0.09796, score: 0.44316
Fold7, Epoch2/5 | train_loss: 0.09377
Fold: 7, Epoch: 2/5, Step: end | val_loss: 0.09906, score: 0.44535


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 3/5, Step: 80 | val_loss: 0.09773, score: 0.4426
Fold: 7, Epoch: 3/5, Step: 160 | val_loss: 0.10315, score: 0.45498
Fold: 7, Epoch: 3/5, Step: 240 | val_loss: 0.09859, score: 0.44454
Fold: 7, Epoch: 3/5, Step: 320 | val_loss: 0.09643, score: 0.43946
Fold: 7, Epoch: 3/5, Step: 400 | val_loss: 0.102, score: 0.45232
Fold7, Epoch3/5 | train_loss: 0.08565
Fold: 7, Epoch: 3/5, Step: end | val_loss: 0.09684, score: 0.44058


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 7, Epoch: 4/5, Step: 80 | val_loss: 0.09704, score: 0.44099
Fold: 7, Epoch: 4/5, Step: 160 | val_loss: 0.09609, score: 0.43876
Fold: 7, Epoch: 4/5, Step: 240 | val_loss: 0.09792, score: 0.44298
Fold: 7, Epoch: 4/5, Step: 320 | val_loss: 0.09687, score: 0.44057
Fold: 7, Epoch: 4/5, Step: 400 | val_loss: 0.09699, score: 0.44088
Fold7, Epoch4/5 | train_loss: 0.08071
Fold: 7, Epoch: 4/5, Step: end | val_loss: 0.09698, score: 0.44086


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 0/5, Step: 80 | val_loss: 0.2238, score: 0.68725
Fold: 8, Epoch: 0/5, Step: 160 | val_loss: 0.15148, score: 0.55272
Fold: 8, Epoch: 0/5, Step: 240 | val_loss: 0.13117, score: 0.51478
Fold: 8, Epoch: 0/5, Step: 320 | val_loss: 0.13081, score: 0.5126
Fold: 8, Epoch: 0/5, Step: 400 | val_loss: 0.1235, score: 0.49787
Fold8, Epoch0/5 | train_loss: 0.44757
Fold: 8, Epoch: 0/5, Step: end | val_loss: 0.1164, score: 0.48236


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 1/5, Step: 80 | val_loss: 0.10814, score: 0.46541
Fold: 8, Epoch: 1/5, Step: 160 | val_loss: 0.10517, score: 0.45899
Fold: 8, Epoch: 1/5, Step: 240 | val_loss: 0.10394, score: 0.45616
Fold: 8, Epoch: 1/5, Step: 320 | val_loss: 0.10236, score: 0.45288
Fold: 8, Epoch: 1/5, Step: 400 | val_loss: 0.10215, score: 0.45232
Fold8, Epoch1/5 | train_loss: 0.10447
Fold: 8, Epoch: 1/5, Step: end | val_loss: 0.10356, score: 0.45549


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 2/5, Step: 80 | val_loss: 0.10553, score: 0.46023
Fold: 8, Epoch: 2/5, Step: 160 | val_loss: 0.1089, score: 0.46762
Fold: 8, Epoch: 2/5, Step: 240 | val_loss: 0.1069, score: 0.46277
Fold: 8, Epoch: 2/5, Step: 320 | val_loss: 0.12279, score: 0.49623
Fold: 8, Epoch: 2/5, Step: 400 | val_loss: 0.10712, score: 0.46351
Fold8, Epoch2/5 | train_loss: 0.09893
Fold: 8, Epoch: 2/5, Step: end | val_loss: 0.10494, score: 0.4586


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 3/5, Step: 80 | val_loss: 0.10225, score: 0.45261
Fold: 8, Epoch: 3/5, Step: 160 | val_loss: 0.10189, score: 0.45183
Fold: 8, Epoch: 3/5, Step: 240 | val_loss: 0.10183, score: 0.45169
Fold: 8, Epoch: 3/5, Step: 320 | val_loss: 0.10288, score: 0.45422
Fold: 8, Epoch: 3/5, Step: 400 | val_loss: 0.10152, score: 0.451
Fold8, Epoch3/5 | train_loss: 0.08188
Fold: 8, Epoch: 3/5, Step: end | val_loss: 0.10274, score: 0.4537


  0%|          | 0/440 [00:00<?, ?it/s]

Fold: 8, Epoch: 4/5, Step: 80 | val_loss: 0.10189, score: 0.45182
Fold: 8, Epoch: 4/5, Step: 160 | val_loss: 0.10154, score: 0.45109
Fold: 8, Epoch: 4/5, Step: 240 | val_loss: 0.10143, score: 0.45083
Fold: 8, Epoch: 4/5, Step: 320 | val_loss: 0.10144, score: 0.45084
Fold: 8, Epoch: 4/5, Step: 400 | val_loss: 0.10141, score: 0.45076
Fold8, Epoch4/5 | train_loss: 0.0774
Fold: 8, Epoch: 4/5, Step: end | val_loss: 0.10142, score: 0.45078


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 0/5, Step: 80 | val_loss: 0.21594, score: 0.67151
Fold: 9, Epoch: 0/5, Step: 160 | val_loss: 0.13485, score: 0.52338
Fold: 9, Epoch: 0/5, Step: 240 | val_loss: 0.16041, score: 0.57361
Fold: 9, Epoch: 0/5, Step: 320 | val_loss: 0.15483, score: 0.55718
Fold: 9, Epoch: 0/5, Step: 400 | val_loss: 0.11252, score: 0.47495
Fold9, Epoch0/5 | train_loss: 0.39487
Fold: 9, Epoch: 0/5, Step: end | val_loss: 0.12281, score: 0.49619


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 1/5, Step: 80 | val_loss: 0.11516, score: 0.48075
Fold: 9, Epoch: 1/5, Step: 160 | val_loss: 0.11224, score: 0.47457
Fold: 9, Epoch: 1/5, Step: 240 | val_loss: 0.11189, score: 0.4736
Fold: 9, Epoch: 1/5, Step: 320 | val_loss: 0.11437, score: 0.47869
Fold: 9, Epoch: 1/5, Step: 400 | val_loss: 0.12184, score: 0.49373
Fold9, Epoch1/5 | train_loss: 0.10369
Fold: 9, Epoch: 1/5, Step: end | val_loss: 0.10926, score: 0.46769


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 2/5, Step: 80 | val_loss: 0.11184, score: 0.47338
Fold: 9, Epoch: 2/5, Step: 160 | val_loss: 0.11171, score: 0.47325
Fold: 9, Epoch: 2/5, Step: 240 | val_loss: 0.10993, score: 0.46906
Fold: 9, Epoch: 2/5, Step: 320 | val_loss: 0.11657, score: 0.48325
Fold: 9, Epoch: 2/5, Step: 400 | val_loss: 0.10953, score: 0.46799
Fold9, Epoch2/5 | train_loss: 0.09761
Fold: 9, Epoch: 2/5, Step: end | val_loss: 0.11675, score: 0.48337


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 3/5, Step: 80 | val_loss: 0.11038, score: 0.47012
Fold: 9, Epoch: 3/5, Step: 160 | val_loss: 0.10863, score: 0.46618
Fold: 9, Epoch: 3/5, Step: 240 | val_loss: 0.10944, score: 0.46795
Fold: 9, Epoch: 3/5, Step: 320 | val_loss: 0.10862, score: 0.46622
Fold: 9, Epoch: 3/5, Step: 400 | val_loss: 0.10862, score: 0.46614
Fold9, Epoch3/5 | train_loss: 0.08386
Fold: 9, Epoch: 3/5, Step: end | val_loss: 0.10858, score: 0.46604


  0%|          | 0/439 [00:00<?, ?it/s]

Fold: 9, Epoch: 4/5, Step: 80 | val_loss: 0.10827, score: 0.46539
Fold: 9, Epoch: 4/5, Step: 160 | val_loss: 0.10835, score: 0.46551
Fold: 9, Epoch: 4/5, Step: 240 | val_loss: 0.1085, score: 0.46592
Fold: 9, Epoch: 4/5, Step: 320 | val_loss: 0.10823, score: 0.46529
Fold: 9, Epoch: 4/5, Step: 400 | val_loss: 0.10829, score: 0.46539
Fold9, Epoch4/5 | train_loss: 0.07846
Fold: 9, Epoch: 4/5, Step: end | val_loss: 0.1083, score: 0.46541
fold score： [0.4564779799401758, 0.4632846054388818, 0.4589690508374812, 0.44970343141052593, 0.45456356696118083, 0.4501031649293044, 0.46535091844905985, 0.4387566668438836, 0.4507587608739125, 0.4652944788866483]
CV: 0.4556
Starting upload for file tokenizer.tar


100%|██████████| 3.22M/3.22M [00:02<00:00, 1.19MB/s]


Upload successful: tokenizer.tar (3MB)
Starting upload for file preds.tar


100%|██████████| 180k/180k [00:02<00:00, 64.6kB/s]


Upload successful: preds.tar (180KB)
Starting upload for file model.tar


100%|██████████| 4.67G/4.67G [05:19<00:00, 15.7MB/s] 


Upload successful: model.tar (5GB)
Starting upload for file fig.tar


ReadTimeoutError: HTTPSConnectionPool(host='www.kaggle.com', port=443): Read timed out. (read timeout=None)

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.metrics import mean_squared_error

# def mcrmse(targets, predictions):
#     colwise_mse = np.mean(np.square(targets - predictions), axis=0)
#     return np.mean(np.sqrt(colwise_mse), axis=0)

# train = pd.read_csv('../input/train.csv')
# folds = pd.read_csv('../output/exp160/preds/folds.csv')
# target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
# folds[target_list] = 0

# preds = []
# for i in range(10):
#     pred_ = np.load(f'../output/exp160/preds/oof_pred_fold{i}.npy')
#     preds.append(pred_)
#     folds.loc[folds['0']==i, target_list] = pred_
#     score = mcrmse(folds[folds['0']==i][target_list], pd.DataFrame(pred_))
#     print(f'fold{i}: {score}')

# mcrmse(train[target_list], folds[target_list])

fold0: 0.6321429482080485
fold1: 0.7513558480491872
fold2: 0.6952740962262757
fold3: 0.7008789512655644
fold4: 0.688097824772301
fold5: 0.6452762622556365
fold6: 0.6562408702077936
fold7: 0.7249738351260905
fold8: 0.7719788497548336
fold9: 0.765888346223662


0.45903549569745755