In [1]:
NOTEBOOK_NAME = "ex1-lecr-trn-fold"

In [2]:
import os
# Every artifact of this notebook (model checkpoint, score pickle, dataset
# metadata) is written below this directory, which is named after the notebook.
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}"
# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
!nvidia-smi

Tue Feb 14 23:10:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   39C    P8    20W / 300W |      0MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import pickle
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
%env TOKENIZERS_PARALLELISM=true
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [32]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static configuration namespace shared by every cell of the notebook.

    Attributes are read directly from the class (it is never instantiated).
    A ``tokenizer`` attribute is attached later by the tokenizer cell.
    """
    # ---- run switches ----
    debug = False                   # when True the driver cell subsamples `train`
    upload_data = True              # gates the Kaggle dataset upload at the end
    seed = 42                       # used by seed_everything and the CV splitter

    # ---- logging / data loading ----
    print_freq = 500                # train_fn / valid_fn log every `print_freq` steps
    num_workers = 4                 # DataLoader worker processes
    batch_size = 32                 # used for both the train and validation loaders

    # ---- model ----
    model = "xlm-roberta-base"      # checkpoint name passed to AutoConfig / AutoModel / AutoTokenizer
    gradient_checkpointing = False  # custom_model enables checkpointing when True
    max_len = 512                   # overwritten by get_max_length() before training

    # ---- optimisation ----
    epochs = 5
    encoder_lr = 1e-5               # learning rate of the transformer backbone
    decoder_lr = 1e-4               # learning rate of the parameters outside the backbone
    weight_decay = 0.01
    eps = 1e-6                      # AdamW epsilon
    betas = (0.9, 0.999)            # AdamW betas
    max_grad_norm = 0.012           # clip_grad_norm_ threshold in train_fn
    warmup_ratio = 0.1              # fraction of total steps used for warmup
    num_cycles = 0.5                # passed to get_cosine_schedule_with_warmup

    # ---- cross validation ----
    n_folds = 5

    # ---- paths ----
    data_url = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations"  # topics / content / correlations CSVs
    finetune_url = "/notebooks/kaggle_lecr/data/ex1-finetuning-mpnet-mnrloss-ep10-fold"    # not referenced by the cells visible here
    uns_url = "/notebooks/kaggle_lecr/output/ex1-uns-top-n-10-para-mpnet-fold"             # train.csv is read from here

In [6]:
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed every random number generator used by the notebook.

    Args:
        cfg: configuration object exposing an integer ``seed`` attribute.

    Side effects:
        Seeds ``random``, NumPy and PyTorch (CPU and all CUDA devices), sets
        ``PYTHONHASHSEED`` in the environment of this process, and switches
        cuDNN to deterministic mode with auto-tuning disabled.
    """
    seed = cfg.seed
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicitly cover every visible GPU; this call is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN auto-tuning may pick different kernels from run to run, which
    # defeats the purpose of seeding, so it is turned off as well.
    torch.backends.cudnn.benchmark = False

In [7]:
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the candidate-pair training table and the competition tables.

    Args:
        cfg: configuration object exposing ``uns_url`` (directory holding
            ``train.csv``) and ``data_url`` (directory holding ``topics.csv``,
            ``content.csv`` and ``correlations.csv``).

    Returns:
        Tuple ``(train, topics, content, correlations)`` of DataFrames. Missing
        values in ``train['topics_titles']`` and ``train['content_titles']``
        are replaced by the placeholder "Title does not exist".
    """
    train = pd.read_csv(f"{cfg.uns_url}/train.csv")
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form is deprecated and does not
    # modify `train` when pandas copy-on-write is active.
    for title_col in ('topics_titles', 'content_titles'):
        train[title_col] = train[title_col].fillna("Title does not exist")

    topics = pd.read_csv(cfg.data_url + "/" + "topics.csv")
    content = pd.read_csv(cfg.data_url + "/" + "content.csv")
    correlations = pd.read_csv(cfg.data_url + "/" + "correlations.csv")

    print(' ')
    print('-' * 50)
    print(f"train.shape: {train.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return train, topics, content, correlations

In [8]:
def preprocess(train: pd.DataFrame):
    """Build the model input column ``text`` from the two title columns.

    The topic title and the content title are concatenated with the literal
    string '[SEP]' between them. The frame is modified in place and the same
    object is returned.
    """
    # NOTE(review): '[SEP]' is inserted as plain text; whether the tokenizer
    # maps it to its separator token depends on the tokenizer - confirm.
    topic_with_separator = train['topics_titles'] + '[SEP]'
    train['text'] = topic_with_separator + train['content_titles']
    return train

In [9]:
def merge_cat_info(train: pd.DataFrame, topics: pd.DataFrame, content:pd.DataFrame):
    """Prefix the title columns of ``train`` with categorical tag strings.

    The topic's ``level`` and ``category`` and the content's ``kind`` are
    looked up by id and rendered as tags such as ``[LEVEL0]``,
    ``[CATEGORY_SOURCE]`` and ``[KIND_VIDEO]``. Level and category tags are
    prepended to ``topics_titles``; the kind tag is prepended to
    ``content_titles``.

    Args:
        train: frame with ``topics_ids``, ``content_ids``, ``topics_titles``
            and ``content_titles`` columns. It is modified in place.
        topics: frame with ``id``, ``level`` and ``category`` columns.
        content: frame with ``id`` and ``kind`` columns.

    Returns:
        Tuple ``(train, level_tag_list, category_list, kind_list)`` where the
        three lists hold the sorted distinct tag strings that were generated.

    Raises:
        ValueError: if the lookups change the number of rows (duplicate ids in
            ``topics`` or ``content``), because the tags could then no longer
            be matched to the rows of ``train``.
    """
    merge_train = pd.merge(train, topics[["id", "level", "category"]], left_on="topics_ids", right_on="id", how="left")
    merge_train = merge_train.drop("id", axis=1)
    merge_train = pd.merge(merge_train, content[["id", "kind"]], left_on="content_ids", right_on="id", how="left")
    merge_train = merge_train.drop("id", axis=1)

    if len(merge_train) != len(train):
        raise ValueError(
            f"merge changed the row count from {len(train)} to {len(merge_train)}; "
            "ids in topics/content must be unique"
        )

    merge_train["level_tag"] = merge_train["level"].apply(lambda x: f"[LEVEL{x}]")
    merge_train["category_tag"] = merge_train["category"].apply(lambda x: f"[CATEGORY_{x.upper()}]")
    merge_train["kind_tag"] = merge_train["kind"].apply(lambda x: f"[KIND_{x.upper()}]")

    level_tag_list = sorted(merge_train["level_tag"].unique())
    category_list = sorted(merge_train["category_tag"].unique())
    kind_list = sorted(merge_train["kind_tag"].unique())

    # pd.merge returns a fresh 0..n-1 index while `train` may carry any index
    # (for example after filtering). Assign by position, not by index label,
    # so rows cannot become NaN or misaligned.
    tagged_topics = merge_train['level_tag'] + merge_train["category_tag"] + merge_train['topics_titles']
    tagged_content = merge_train['kind_tag'] + merge_train['content_titles']
    train['topics_titles'] = tagged_topics.to_numpy()
    train['content_titles'] = tagged_content.to_numpy()

    return train, level_tag_list, category_list, kind_list

In [10]:
# =========================================================================================
# CV split
# =========================================================================================
def cv_split(train, cfg):
    """Assign a cross-validation fold id to every row of ``train``.

    Rows are split with ``StratifiedGroupKFold``: stratified on ``target`` and
    grouped by ``topics_ids`` so that all rows of one topic land in the same
    fold.

    Args:
        train: frame with ``target`` and ``topics_ids`` columns. A new integer
            column ``fold`` is written in place.
        cfg: configuration object exposing ``n_folds`` and ``seed``.

    Returns:
        The same ``train`` frame, now carrying the ``fold`` column.
    """
    kfold = StratifiedGroupKFold(n_splits = cfg.n_folds, shuffle = True, random_state = cfg.seed)
    # The splitter yields *positional* indices, so collect the fold ids in a
    # positional array instead of writing through the label-based `.loc`.
    fold_ids = np.full(len(train), -1, dtype = int)
    for num, (_, val_index) in enumerate(kfold.split(train, train['target'], train['topics_ids'])):
        fold_ids[val_index] = num
    train['fold'] = fold_ids
    return train

In [11]:
# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two collections of id strings.

    Args:
        y_true: iterable (e.g. a pandas Series) of whitespace-separated
            ground-truth ids, one string per row.
        y_pred: iterable of whitespace-separated predicted ids, aligned
            row by row with ``y_true``.

    Returns:
        The mean of the row-wise F2 scores rounded to 4 decimals. Rows whose
        ground truth and prediction are both empty score 0.0. An empty input
        returns 0.0.
    """
    def _as_id_set(value):
        # Non-string entries (e.g. NaN produced by a left merge) count as
        # "no ids" instead of crashing on `.split()`.
        return set(value.split()) if isinstance(value, str) else set()

    true_sets = [_as_id_set(x) for x in y_true]
    pred_sets = [_as_id_set(x) for x in y_pred]
    tp = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)], dtype = float)
    fp = np.array([len(p - t) for t, p in zip(true_sets, pred_sets)], dtype = float)
    fn = np.array([len(t - p) for t, p in zip(true_sets, pred_sets)], dtype = float)
    if tp.size == 0:
        return 0.0
    # F2 = tp / (tp + 0.2 * fp + 0.8 * fn); guard the 0/0 case so one empty
    # row cannot turn the whole mean into NaN.
    denominator = tp + 0.2 * fp + 0.8 * fn
    f2 = np.divide(tp, denominator, out = np.zeros_like(denominator), where = denominator > 0)
    return round(f2.mean(), 4)

In [12]:
# =========================================================================================
# Get max length
# =========================================================================================
def get_max_length(train, cfg):
    """Set ``cfg.max_len`` from the longest tokenized entry of ``train['text']``.

    Each text (missing values are treated as empty strings) is tokenized with
    ``cfg.tokenizer`` without special tokens. ``cfg.max_len`` becomes the
    largest token count plus 5, and the value is printed. Nothing is returned.
    """
    texts = train['text'].fillna("").values
    token_counts = [
        len(cfg.tokenizer(text, add_special_tokens = False)['input_ids'])
        for text in tqdm(texts, total = len(train))
    ]
    # Original author's note for the +5 headroom: cls + sep + level + category + kind
    cfg.max_len = max(token_counts) + 5
    print(f"max_len: {cfg.max_len}")

In [13]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        text: the string to encode.
        cfg: configuration object exposing ``tokenizer`` and ``max_len``.

    Returns:
        The mapping returned by the tokenizer, with every value converted to a
        ``torch.long`` tensor. Sequences are truncated and padded to
        ``cfg.max_len``.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is
        # the supported spelling of the same behaviour.
        padding = 'max_length',
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

# =========================================================================================
# Custom dataset
# =========================================================================================
class custom_dataset(Dataset):
    """Dataset yielding ``(tokenized inputs, float label)`` pairs.

    Texts come from ``df['text']`` and labels from ``df['target']``. Each text
    is tokenized when the item is requested, via ``prepare_input``.
    """
    def __init__(self, df, cfg):
        # Keep plain arrays so items are fetched by position.
        self.texts = df['text'].values
        self.labels = df['target'].values
        self.cfg = cfg
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, item):
        encoded = prepare_input(self.texts[item], self.cfg)
        target = torch.tensor(self.labels[item], dtype = torch.float)
        return encoded, target
    
# =========================================================================================
# Collate function for training
# =========================================================================================
def collate(inputs):
    """Trim a padded batch to the length of its longest real sequence.

    ``inputs`` maps names to 2-D tensors and must contain ``attention_mask``.
    Every tensor is cut along its second dimension to the largest row sum of
    the attention mask. The mapping is updated in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :longest]
    return inputs

# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average hidden states over the positions enabled by the attention mask."""
    def __init__(self):
        super().__init__()
    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask to the shape of the hidden states.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # The clamp avoids a division by zero for rows whose mask is all zeros.
        token_counts = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_counts
    
# =========================================================================================
# Model
# =========================================================================================
class custom_model(nn.Module):
    """Transformer backbone + masked mean pooling + one-logit linear head."""
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        # Switch off dropout under every attribute name the config may use.
        for dropout_attr in ('hidden_dropout',
                             'hidden_dropout_prob',
                             'attention_dropout',
                             'attention_probs_dropout_prob'):
            setattr(self.config, dropout_attr, 0.0)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        # When AutoTokenizer.from_pretrained is given additional_special_tokens,
        # the added tokens apparently have to be registered as new vocabulary,
        # so the embedding matrix is resized to the tokenizer length.
        # https://cocoinit23.com/pytorch-runtimeerror-cuda-error-device-side-assert-triggered/
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def feature(self, inputs):
        """Return the mean-pooled last hidden state for a batch of inputs."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])
    def forward(self, inputs):
        """Return the raw (pre-sigmoid) output of the linear head."""
        return self.fc(self.feature(inputs))
    
# =========================================================================================
# Helper functions
# =========================================================================================
class AverageMeter(object):
    """Keeps the most recent value and a running weighted average."""
    def __init__(self):
        self.reset()
    def reset(self):
        """Zero the last value, the weighted sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0
    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    ``since`` is a ``time.time()`` timestamp and ``percent`` is the fraction
    of the work already done. The total duration is extrapolated as
    elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# =========================================================================================
# Train function loop
# =========================================================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Run one training epoch with mixed precision and gradient clipping.

    Args:
        train_loader: iterable of ``(inputs, target)`` batches, where
            ``inputs`` is a mapping of tensors containing ``attention_mask``.
        model: module called as ``model(inputs)``.
        criterion: loss called as ``criterion(predictions.view(-1), target)``.
        optimizer: optimizer stepped once per batch through the grad scaler.
        epoch: zero-based epoch index, used only for logging.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.
        cfg: configuration exposing ``max_grad_norm`` and ``print_freq``.

    Returns:
        The batch-size-weighted average training loss of the epoch.
    """
    model.train()
    # NOTE(review): a fresh GradScaler is created on every call, so its loss
    # scale restarts at the default each epoch - confirm this is intended.
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    losses = AverageMeter()
    start = time.time()
    num_steps = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.cuda.amp.autocast(enabled = True):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Gradients must be unscaled before clipping so the threshold applies
        # to their true magnitude.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        if step % cfg.print_freq == 0 or step == (num_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1,
                          step,
                          num_steps,
                          remain = timeSince(start, float(step + 1) / num_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported accessor;
                          # get_lr() is only meant to be called inside step().
                          lr = scheduler.get_last_lr()[0]))
    return losses.avg

# =========================================================================================
# Valid function loop
# =========================================================================================
def valid_fn(valid_loader, model, criterion, device, cfg):
    """Evaluate the model on a validation loader without tracking gradients.

    Args:
        valid_loader: iterable of ``(inputs, target)`` batches, where
            ``inputs`` is a mapping of tensors containing ``attention_mask``.
        model: module called as ``model(inputs)``.
        criterion: loss called as ``criterion(predictions.view(-1), target)``.
        device: device the batch tensors are moved to.
        cfg: configuration exposing ``print_freq``.

    Returns:
        Tuple ``(average_loss, predictions)`` where ``predictions`` is a 1-D
        NumPy array with the sigmoid of the model output for every sample, in
        loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    num_steps = len(valid_loader)
    for step, (inputs, target) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        # Forward pass and loss both run under no_grad, so no autograd state
        # is created during evaluation.
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().view(-1).to('cpu').numpy())
        if step % cfg.print_freq == 0 or step == (num_steps - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step,
                          num_steps,
                          loss = losses,
                          remain = timeSince(start, float(step + 1) / num_steps)))
    predictions = np.concatenate(preds, axis = 0)
    return losses.avg, predictions

# =========================================================================================
# Get best threshold
# =========================================================================================
def get_best_threshold(x_val, val_predictions, correlations, thresholds = None):
    """Search the probability threshold that maximises the validation F2 score.

    For every candidate threshold, the (topic, content) pairs whose predicted
    probability exceeds it are grouped into one space-separated prediction
    string per topic. Topics without any surviving pair get an empty
    prediction. The result is scored against ``correlations`` with
    ``f2_score``.

    Args:
        x_val: validation frame with ``topics_ids`` and ``content_ids``
            columns, row-aligned with ``val_predictions``. It is not modified.
        val_predictions: 1-D array-like of predicted probabilities.
        correlations: frame with ``topic_id`` and ``content_ids`` columns
            holding the ground truth.
        thresholds: optional iterable of thresholds to try. Defaults to
            ``np.arange(0.001, 0.1, 0.001)``.

    Returns:
        Tuple ``(best_score, best_threshold)``. If no threshold scores above
        0, the first threshold tried is returned so callers always receive a
        number (``None`` only when ``thresholds`` is empty).
    """
    if thresholds is None:
        thresholds = np.arange(0.001, 0.1, 0.001)
    probabilities = np.asarray(val_predictions)
    # Work on the two needed columns only, so the caller's frame (usually a
    # slice of the training frame) is never written to.
    pairs = x_val[['topics_ids', 'content_ids']]
    all_topics = pd.Series(pairs['topics_ids'].unique())
    best_score = 0
    best_threshold = None
    for thres in thresholds:
        kept = pairs[probabilities > thres]
        predicted = kept.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        predicted['content_ids'] = predicted['content_ids'].apply(lambda x: ' '.join(x))
        predicted.columns = ['topic_id', 'predictions']
        # Topics that lost every candidate still have to be scored.
        unpredicted = all_topics[~all_topics.isin(predicted['topic_id'])]
        unpredicted = pd.DataFrame({'topic_id': unpredicted.values, 'predictions': ""})
        scored = pd.concat([predicted, unpredicted], axis = 0, ignore_index = True)
        scored = scored.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(scored['content_ids'], scored['predictions'])
        if best_threshold is None:
            # Fallback so a degenerate run still reports a usable threshold.
            best_threshold = thres
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold
    
# =========================================================================================
# Train & Evaluate
# =========================================================================================
def train_and_evaluate_one_fold(train, correlations, fold, cfg):
    """Train on all folds except ``fold`` and validate on ``fold``.

    After every epoch the validation predictions are scored with the best
    F2 threshold. The checkpoint with the highest score is saved to
    ``OUTPUT_DIR`` together with its predictions, and the final score and
    threshold are pickled to ``OUTPUT_DIR/score.pkl``.

    Args:
        train: frame with ``text``, ``target``, ``fold``, ``topics_ids`` and
            ``content_ids`` columns.
        correlations: ground-truth frame passed to ``get_best_threshold``.
        fold: id of the fold used for validation.
        cfg: configuration object (see ``CFG``); ``cfg.tokenizer`` must be set.

    Returns:
        None. Results are written to disk and printed.
    """
    print(' ')
    print(f"========== fold: {fold} training ==========")
    # Split train & validation
    x_train = train[train['fold'] != fold]
    x_val = train[train['fold'] == fold]
    train_dataset = custom_dataset(x_train, cfg)
    valid_dataset = custom_dataset(x_val, cfg)
    train_loader = DataLoader(
        train_dataset,
        batch_size = cfg.batch_size,
        shuffle = True,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size = cfg.batch_size,
        shuffle = False,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)
    model.to(device)
    # Optimizer
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
        """Build three parameter groups: decayed backbone weights, non-decayed
        backbone biases / LayerNorm parameters, and everything outside the
        backbone (trained with ``decoder_lr``)."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters
    optimizer_parameters = get_optimizer_params(
        model,
        encoder_lr = cfg.encoder_lr,
        decoder_lr = cfg.decoder_lr,
        weight_decay = cfg.weight_decay
    )
    optimizer = AdamW(
        optimizer_parameters,
        lr = cfg.encoder_lr,
        eps = cfg.eps,
        betas = cfg.betas
    )
    # The scheduler is stepped once per batch in train_fn, and the loader
    # drops the last partial batch, so count the steps from the loader itself.
    num_train_steps = len(train_loader) * cfg.epochs
    # The warmup length must be a whole number of steps.
    num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
    # Scheduler
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps,
        num_cycles = cfg.num_cycles
        )
    # Training & Validation loop
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")
    best_score = 0
    val_predictions = None
    for epoch in range(cfg.epochs):
        start_time = time.time()
        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)
        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)
        # Compute f2_score
        score, threshold = get_best_threshold(x_val, predictions, correlations)
        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        # The threshold can be None when no candidate scored above 0.
        threshold_text = f'{threshold:.5f}' if threshold is not None else 'None'
        print(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')
        # Always keep the first epoch so a checkpoint and predictions exist
        # even if the score never rises above 0.
        if val_predictions is None or score > best_score:
            best_score = score
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save(
                {'model': model.state_dict(), 'predictions': predictions},
                f"{OUTPUT_DIR}/{cfg.model.replace('/', '-')}_fold{fold}_{cfg.seed}.pth"
                )
            val_predictions = predictions
    torch.cuda.empty_cache()
    gc.collect()
    if val_predictions is None:
        # Only reachable when cfg.epochs is 0: there is nothing to score.
        print('No epoch was run; skipping threshold search.')
        return
    # Get best threshold
    best_score, best_threshold = get_best_threshold(x_val, val_predictions, correlations)
    # Save Score, Threshold
    # NOTE(review): the file name does not include the fold, so running several
    # folds overwrites this file - confirm that is acceptable.
    score = {"best_score": best_score, "best_threshold": best_threshold}
    with open(f"{OUTPUT_DIR}/score.pkl", "wb") as f:
        pickle.dump(score, f)
    print(f'Our CV score is {best_score} using a threshold of {best_threshold}')

In [14]:
# Seed everything
seed_everything(CFG)

In [15]:
# Read data
train, topics, content, correlations = read_data(CFG)

 
--------------------------------------------------
train.shape: (615170, 5)
correlations.shape: (61517, 2)


In [16]:
if CFG.debug:
    # Debug mode: keep only the first 1000 rows and train for a single epoch.
    # `.copy()` detaches the subset so later in-place column writes do not
    # operate on a view of the full frame.
    train = train[:1000].copy()
    # The training loop reads `CFG.epochs`; the previous `CFG.epoch`
    # assignment created an unused attribute and left the epoch count as is.
    CFG.epochs = 1

In [17]:
train, level_tag_list, category_list, kind_list = merge_cat_info(train, topics, content)

In [18]:
level_tag_list

['[LEVEL0]', '[LEVEL1]', '[LEVEL2]', '[LEVEL3]']

In [19]:
category_list

['[CATEGORY_ALIGNED]', '[CATEGORY_SOURCE]', '[CATEGORY_SUPPLEMENTAL]']

In [20]:
kind_list

['[KIND_AUDIO]',
 '[KIND_DOCUMENT]',
 '[KIND_EXERCISE]',
 '[KIND_HTML5]',
 '[KIND_VIDEO]']

In [21]:
train = preprocess(train)

In [22]:
train.head()

Unnamed: 0,topics_ids,content_ids,topics_titles,content_titles,target,text
0,t_30dd476279c8,c_a7926808742b,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_HTML5]Medicine,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
1,t_30dd476279c8,c_7f13d437e8a9,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_EXERCISE]Medicine Practice,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
2,t_30dd476279c8,c_adeeb1783027,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_VIDEO]Developing new drugs,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
3,t_30dd476279c8,c_2576fbd2b75d,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_VIDEO]Antibiotics,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
4,t_30dd476279c8,c_9a839ba9e755,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_VIDEO]Discovery and Development of Drugs,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...


In [23]:
train["text"][0]

'[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][KIND_HTML5]Medicine'

In [24]:
# Load the tokenizer for the configured checkpoint and register every
# level / category / kind tag produced by merge_cat_info as an additional
# special token. custom_model later resizes its embedding matrix to
# len(CFG.tokenizer) to account for the added tokens.
CFG.tokenizer = AutoTokenizer.from_pretrained(
    CFG.model,
    additional_special_tokens = level_tag_list + category_list + kind_list
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
CFG.tokenizer.additional_special_tokens

['[LEVEL0]',
 '[LEVEL1]',
 '[LEVEL2]',
 '[LEVEL3]',
 '[CATEGORY_ALIGNED]',
 '[CATEGORY_SOURCE]',
 '[CATEGORY_SUPPLEMENTAL]',
 '[KIND_AUDIO]',
 '[KIND_DOCUMENT]',
 '[KIND_EXERCISE]',
 '[KIND_HTML5]',
 '[KIND_VIDEO]']

In [26]:
# CV split
train = cv_split(train, CFG)

In [27]:
train[(train["fold"] == 0)]

Unnamed: 0,topics_ids,content_ids,topics_titles,content_titles,target,text,fold
90,t_0dbba21c4202,c_25e7a2c080bd,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation,[KIND_HTML5]A magic wand?,0,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation...,0
91,t_0dbba21c4202,c_b0dfb7e8097b,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation,[KIND_HTML5]Breathing water,1,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation...,0
92,t_0dbba21c4202,c_1f5384a92dcc,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation,[KIND_HTML5]Colder than ice,0,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation...,0
93,t_0dbba21c4202,c_8e699ad0b921,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation,[KIND_HTML5]Feel your voice,1,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation...,0
94,t_0dbba21c4202,c_cc540d762221,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation,[KIND_HTML5]Anaglyphs – depth in colors,1,[LEVEL1][CATEGORY_SOURCE]Biology > Sciensation...,0
...,...,...,...,...,...,...,...
995,t_a93e389f0f1a,c_3283ed173b80,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,[KIND_VIDEO]अंकों की पहचान,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,0
996,t_a93e389f0f1a,c_97c413dd5265,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,[KIND_VIDEO]सीधे और प्रतिलोम विचरण से परिचय,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,0
997,t_a93e389f0f1a,c_82be8a30fed9,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,[KIND_VIDEO]रिश्ते परिवार में और गणित में 2,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,0
998,t_a93e389f0f1a,c_dc4c95229204,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,[KIND_EXERCISE]तीन अंकों वाले संख्याओं की तुलन...,1,[LEVEL0][CATEGORY_SUPPLEMENTAL]Rajasthan State...,0


In [28]:
# Get max length
get_max_length(train, CFG)

  0%|          | 0/1000 [00:00<?, ?it/s]

max_len: 51


In [29]:
# Train and evaluate one fold
train_and_evaluate_one_fold(train, correlations, 0, CFG)

 


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/25] Elapsed 0m 1s (remain 0m 30s) Loss: 0.7023(0.7023) Grad: inf  LR: 0.00000080  
Epoch: [1][24/25] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7301(0.6676) Grad: 3.2459  LR: 0.00000970  
EVAL: [0/7] Elapsed 0m 0s (remain 0m 1s) Loss: 0.6455(0.6455) 
EVAL: [6/7] Elapsed 0m 0s (remain 0m 0s) Loss: 0.5433(0.6529) 
Epoch 1 - avg_train_loss: 0.6676  avg_val_loss: 0.6529  time: 5s
Epoch 1 - Score: 0.3923 - Threshold: 0.00100
Epoch 1 - Save Best Score: 0.3923 Model
Epoch: [2][0/25] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6793(0.6793) Grad: 0.9370  LR: 0.00000965  
Epoch: [2][24/25] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6368(0.6153) Grad: 2.7129  LR: 0.00000750  
EVAL: [0/7] Elapsed 0m 0s (remain 0m 1s) Loss: 0.5008(0.5008) 
EVAL: [6/7] Elapsed 0m 0s (remain 0m 0s) Loss: 0.4359(0.5946) 
Epoch 2 - avg_train_loss: 0.6153  avg_val_loss: 0.5946  time: 4s
Epoch 2 - Score: 0.3923 - Threshold: 0.00100
Epoch: [3][0/25] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6922(0.6922) Grad: inf  LR: 0.00000738  
E

# Upload

In [30]:
import os
import shutil
import subprocess
import sys

# Install the Kaggle client into the interpreter that runs this kernel and
# fail loudly if the installation does not succeed (os.system silently
# ignored non-zero exit codes).
subprocess.run([sys.executable, "-m", "pip", "install", "kaggle"], check=True)

# Place the API token where the Kaggle client looks for it and restrict its
# permissions to the current user.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_token = os.path.join(kaggle_dir, "kaggle.json")
shutil.copy("/notebooks/kaggle_lecr/kaggle.json", kaggle_token)
os.chmod(kaggle_token, 0o600)





0

In [33]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str, owner: str = "sinchir0"):
    """Create a new Kaggle dataset from the files in ``upload_dir``.

    Writes a ``dataset-metadata.json`` file into ``upload_dir`` and then asks
    the Kaggle API to create the dataset from that folder.

    Args:
        dataset_name: Used both as the last component of the dataset id
            (``<owner>/<dataset_name>``) and as the dataset title. Must not
            contain an underscore.
        upload_dir: Directory to upload. It must already exist, because the
            metadata file is written into it before the upload starts.
        owner: Account name used as the first component of the dataset id.
            Defaults to the account that was previously hard-coded.

    Raises:
        ValueError: If ``dataset_name`` contains ``_``.
    """
    if "_" in dataset_name:
        # Message is Japanese for: "underscores are not allowed in the dataset name".
        raise ValueError("datasetの名称に_の使用は禁止です")

    dataset_metadata = {
        "id": f"{owner}/{dataset_name}",
        "licenses": [{"name": "CC0-1.0"}],
        "title": dataset_name,
    }
    metadata_path = os.path.join(upload_dir, "dataset-metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(dataset_metadata, f, indent=4)

    api = KaggleApi()
    api.authenticate()
    # NOTE(review): dir_mode presumably controls how sub-directories of
    # upload_dir are packaged — confirm against the Kaggle API docs.
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

# Publish OUTPUT_DIR as a new Kaggle dataset named after this notebook,
# but only when uploading is switched on in the config.
if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(upload_dir=OUTPUT_DIR, dataset_name=NOTEBOOK_NAME)

Create Dataset name:lecr-trn-use-parent-title, output_dir:/notebooks/kaggle_lecr/output/lecr-trn-use-parent-title
Starting upload for file xlm-roberta-base_fold0_42.pth


100%|██████████| 1.04G/1.04G [00:21<00:00, 51.1MB/s]


Upload successful: xlm-roberta-base_fold0_42.pth (1GB)
Starting upload for file score.pkl


100%|██████████| 169/169 [00:00<00:00, 306B/s]


Upload successful: score.pkl (169B)


Tue Feb 14 23:14:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   41C    P8    19W / 300W |   5723MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
+-----------------------------------------------------------------------------+
env: TOKENIZERS_PARALLELISM=true
 
--------------------------------------------------
train.shape: (615170, 5)
correlations.shape: (61517, 2)
max_len: 233
 
========== fold: 0 training ==========
Epoch: [1][0/15379] Elapsed 0m 1s (remain 350m 59s) Loss: 0.7973(0.7973) Grad: inf  LR: 0.00000000  
Epoch: [1][500/15379] Elapsed 0m 47s (remain 23m 25s) Loss: 0.6401(0.7658) Grad: 7.3303  LR: 0.00000065  
Epoch: [1][1000/15379] Elapsed 1m 33s (remain 22m 21s) Loss: 0.4891(0.6640) Grad: 3.2213  LR: 0.00000130  
Epoch: [1][1500/15379] Elapsed 2m 19s (remain 21m 32s) Loss: 0.4931(0.6162) Grad: 15.5229  LR: 0.00000195  
Epoch: [1][2000/15379] Elapsed 3m 5s (remain 20m 42s) Loss: 0.5041(0.5882) Grad: 8.1591  LR: 0.00000260  
Epoch: [1][2500/15379] Elapsed 3m 51s (remain 19m 54s) Loss: 0.5658(0.5696) Grad: 15.5275  LR: 0.00000325  
Epoch: [1][3000/15379] Elapsed 4m 37s (remain 19m 6s) Loss: 0.4298(0.5565) Grad: 6.5059  LR: 0.00000390  
Epoch: [1][3500/15379] Elapsed 5m 24s (remain 18m 19s) Loss: 0.4479(0.5458) Grad: 9.1639  LR: 0.00000455  
Epoch: [1][4000/15379] Elapsed 6m 10s (remain 17m 33s) Loss: 0.5086(0.5366) Grad: 6.9882  LR: 0.00000520  
Epoch: [1][4500/15379] Elapsed 6m 56s (remain 16m 46s) Loss: 0.4072(0.5287) Grad: 6.0169  LR: 0.00000585  
Epoch: [1][5000/15379] Elapsed 7m 42s (remain 16m 0s) Loss: 0.4271(0.5213) Grad: 5.4028  LR: 0.00000650  
Epoch: [1][5500/15379] Elapsed 8m 28s (remain 15m 13s) Loss: 0.7632(0.5151) Grad: 20.4132  LR: 0.00000715  
Epoch: [1][6000/15379] Elapsed 9m 15s (remain 14m 27s) Loss: 0.3101(0.5092) Grad: 7.8532  LR: 0.00000780  
Epoch: [1][6500/15379] Elapsed 10m 1s (remain 13m 41s) Loss: 0.3852(0.5046) Grad: 6.2766  LR: 0.00000845  
Epoch: [1][7000/15379] Elapsed 10m 47s (remain 12m 55s) Loss: 0.5162(0.5003) Grad: 12.7162  LR: 0.00000910  
Epoch: [1][7500/15379] Elapsed 11m 33s (remain 12m 8s) Loss: 0.3515(0.4965) Grad: 8.0966  LR: 0.00000975  
Epoch: [1][8000/15379] Elapsed 12m 19s (remain 11m 22s) Loss: 0.4342(0.4924) Grad: 6.3200  LR: 0.00001000  
Epoch: [1][8500/15379] Elapsed 13m 6s (remain 10m 36s) Loss: 0.3587(0.4891) Grad: 8.7809  LR: 0.00001000  
Epoch: [1][9000/15379] Elapsed 13m 52s (remain 9m 49s) Loss: 0.4360(0.4855) Grad: 7.8663  LR: 0.00000999  
Epoch: [1][9500/15379] Elapsed 14m 38s (remain 9m 3s) Loss: 0.4719(0.4822) Grad: 6.0157  LR: 0.00000998  
Epoch: [1][10000/15379] Elapsed 15m 24s (remain 8m 17s) Loss: 0.4225(0.4786) Grad: 7.3835  LR: 0.00000997  
Epoch: [1][10500/15379] Elapsed 16m 10s (remain 7m 31s) Loss: 0.4801(0.4757) Grad: 13.2828  LR: 0.00000996  
Epoch: [1][11000/15379] Elapsed 16m 57s (remain 6m 44s) Loss: 0.3396(0.4728) Grad: 17.8331  LR: 0.00000994  
Epoch: [1][11500/15379] Elapsed 17m 44s (remain 5m 58s) Loss: 0.4772(0.4700) Grad: 9.1142  LR: 0.00000993  
Epoch: [1][12000/15379] Elapsed 18m 30s (remain 5m 12s) Loss: 0.4171(0.4676) Grad: 6.8146  LR: 0.00000990  
Epoch: [1][12500/15379] Elapsed 19m 17s (remain 4m 26s) Loss: 0.5263(0.4654) Grad: 7.3134  LR: 0.00000988  
Epoch: [1][13000/15379] Elapsed 20m 4s (remain 3m 40s) Loss: 0.5730(0.4632) Grad: 32.1362  LR: 0.00000986  
Epoch: [1][13500/15379] Elapsed 20m 50s (remain 2m 53s) Loss: 0.3539(0.4605) Grad: 4.6183  LR: 0.00000983  
Epoch: [1][14000/15379] Elapsed 21m 37s (remain 2m 7s) Loss: 0.3898(0.4580) Grad: 6.5580  LR: 0.00000980  
Epoch: [1][14500/15379] Elapsed 22m 23s (remain 1m 21s) Loss: 0.5130(0.4560) Grad: 13.5218  LR: 0.00000976  
Epoch: [1][15000/15379] Elapsed 23m 10s (remain 0m 35s) Loss: 0.4598(0.4541) Grad: 12.7892  LR: 0.00000973  
Epoch: [1][15378/15379] Elapsed 23m 45s (remain 0m 0s) Loss: 0.1743(0.4526) Grad: 3.9559  LR: 0.00000970  
EVAL: [0/3845] Elapsed 0m 0s (remain 13m 23s) Loss: 0.7677(0.7677) 
EVAL: [500/3845] Elapsed 0m 11s (remain 1m 15s) Loss: 0.7311(0.3689) 
EVAL: [1000/3845] Elapsed 0m 24s (remain 1m 8s) Loss: 0.3327(0.3738) 
EVAL: [1500/3845] Elapsed 0m 37s (remain 0m 59s) Loss: 0.3402(0.3801) 
EVAL: [2000/3845] Elapsed 0m 52s (remain 0m 48s) Loss: 0.2454(0.3938) 
EVAL: [2500/3845] Elapsed 1m 6s (remain 0m 35s) Loss: 0.6875(0.3921) 
EVAL: [3000/3845] Elapsed 1m 21s (remain 0m 23s) Loss: 0.4916(0.3946) 
EVAL: [3500/3845] Elapsed 1m 38s (remain 0m 9s) Loss: 0.2538(0.3989) 
EVAL: [3844/3845] Elapsed 1m 51s (remain 0m 0s) Loss: 0.2353(0.4011) 
Epoch 1 - avg_train_loss: 0.4526  avg_val_loss: 0.4011  time: 1592s
Epoch 1 - Score: 0.5280 - Threshold: 0.07900
Epoch 1 - Save Best Score: 0.5280 Model
Epoch: [2][0/15379] Elapsed 0m 0s (remain 74m 59s) Loss: 0.3883(0.3883) Grad: inf  LR: 0.00000970  
Epoch: [2][500/15379] Elapsed 0m 46s (remain 23m 8s) Loss: 0.2431(0.3778) Grad: 5.2811  LR: 0.00000966  
Epoch: [2][1000/15379] Elapsed 1m 33s (remain 22m 21s) Loss: 0.4213(0.3778) Grad: 8.6874  LR: 0.00000962  
Epoch: [2][1500/15379] Elapsed 2m 20s (remain 21m 36s) Loss: 0.4571(0.3771) Grad: 8.1673  LR: 0.00000957  
Epoch: [2][2000/15379] Elapsed 3m 6s (remain 20m 49s) Loss: 0.4725(0.3757) Grad: 12.3657  LR: 0.00000952  
Epoch: [2][2500/15379] Elapsed 3m 53s (remain 20m 3s) Loss: 0.4153(0.3763) Grad: 9.2726  LR: 0.00000947  
Epoch: [2][3000/15379] Elapsed 4m 40s (remain 19m 15s) Loss: 0.4653(0.3754) Grad: 8.4268  LR: 0.00000942  
Epoch: [2][3500/15379] Elapsed 5m 26s (remain 18m 27s) Loss: 0.4325(0.3749) Grad: 9.1535  LR: 0.00000937  
Epoch: [2][4000/15379] Elapsed 6m 12s (remain 17m 40s) Loss: 0.3773(0.3746) Grad: 9.7756  LR: 0.00000931  
Epoch: [2][4500/15379] Elapsed 6m 59s (remain 16m 53s) Loss: 0.4314(0.3727) Grad: 10.2218  LR: 0.00000925  
Epoch: [2][5000/15379] Elapsed 7m 46s (remain 16m 7s) Loss: 0.4490(0.3717) Grad: 8.1123  LR: 0.00000919  
Epoch: [2][5500/15379] Elapsed 8m 32s (remain 15m 20s) Loss: 0.4805(0.3707) Grad: 12.4396  LR: 0.00000913  
Epoch: [2][6000/15379] Elapsed 9m 18s (remain 14m 33s) Loss: 0.2973(0.3699) Grad: 9.5631  LR: 0.00000907  
Epoch: [2][6500/15379] Elapsed 10m 5s (remain 13m 47s) Loss: 0.3037(0.3683) Grad: 8.7280  LR: 0.00000900  
Epoch: [2][7000/15379] Elapsed 10m 52s (remain 13m 0s) Loss: 0.4989(0.3671) Grad: 14.8343  LR: 0.00000893  
Epoch: [2][7500/15379] Elapsed 11m 39s (remain 12m 14s) Loss: 0.3169(0.3665) Grad: 6.4836  LR: 0.00000886  
Epoch: [2][8000/15379] Elapsed 12m 25s (remain 11m 27s) Loss: 0.2582(0.3663) Grad: 6.2613  LR: 0.00000878  
Epoch: [2][8500/15379] Elapsed 13m 12s (remain 10m 41s) Loss: 0.1969(0.3653) Grad: 5.5879  LR: 0.00000871  
Epoch: [2][9000/15379] Elapsed 13m 59s (remain 9m 54s) Loss: 0.3216(0.3644) Grad: 8.0453  LR: 0.00000863  
Epoch: [2][9500/15379] Elapsed 14m 46s (remain 9m 8s) Loss: 0.5602(0.3636) Grad: 17.0584  LR: 0.00000855  
Epoch: [2][10000/15379] Elapsed 15m 32s (remain 8m 21s) Loss: 0.4291(0.3627) Grad: 20.4848  LR: 0.00000847  
Epoch: [2][10500/15379] Elapsed 16m 19s (remain 7m 35s) Loss: 0.2265(0.3620) Grad: 7.2497  LR: 0.00000839  
Epoch: [2][11000/15379] Elapsed 17m 6s (remain 6m 48s) Loss: 0.4511(0.3609) Grad: 13.2522  LR: 0.00000831  
Epoch: [2][11500/15379] Elapsed 17m 53s (remain 6m 1s) Loss: 0.5667(0.3602) Grad: 20.2632  LR: 0.00000822  
Epoch: [2][12000/15379] Elapsed 18m 39s (remain 5m 15s) Loss: 0.4591(0.3593) Grad: 16.9464  LR: 0.00000813  
Epoch: [2][12500/15379] Elapsed 19m 26s (remain 4m 28s) Loss: 0.3018(0.3588) Grad: 7.9136  LR: 0.00000804  
Epoch: [2][13000/15379] Elapsed 20m 13s (remain 3m 41s) Loss: 0.3060(0.3576) Grad: 6.8796  LR: 0.00000795  
Epoch: [2][13500/15379] Elapsed 20m 59s (remain 2m 55s) Loss: 0.2621(0.3568) Grad: 9.0297  LR: 0.00000786  
Epoch: [2][14000/15379] Elapsed 21m 46s (remain 2m 8s) Loss: 0.2074(0.3561) Grad: 4.7241  LR: 0.00000777  
Epoch: [2][14500/15379] Elapsed 22m 32s (remain 1m 21s) Loss: 0.4077(0.3552) Grad: 9.5673  LR: 0.00000767  
Epoch: [2][15000/15379] Elapsed 23m 19s (remain 0m 35s) Loss: 0.3339(0.3544) Grad: 7.6350  LR: 0.00000757  
Epoch: [2][15378/15379] Elapsed 23m 55s (remain 0m 0s) Loss: 0.3183(0.3538) Grad: 10.0207  LR: 0.00000750  
EVAL: [0/3845] Elapsed 0m 0s (remain 15m 24s) Loss: 0.8208(0.8208) 
EVAL: [500/3845] Elapsed 0m 11s (remain 1m 15s) Loss: 0.7691(0.3535) 
EVAL: [1000/3845] Elapsed 0m 24s (remain 1m 8s) Loss: 0.2859(0.3541) 
EVAL: [1500/3845] Elapsed 0m 37s (remain 0m 59s) Loss: 0.2695(0.3569) 
EVAL: [2000/3845] Elapsed 0m 52s (remain 0m 48s) Loss: 0.2121(0.3667) 
EVAL: [2500/3845] Elapsed 1m 7s (remain 0m 36s) Loss: 0.5614(0.3638) 
EVAL: [3000/3845] Elapsed 1m 22s (remain 0m 23s) Loss: 0.3746(0.3644) 
EVAL: [3500/3845] Elapsed 1m 38s (remain 0m 9s) Loss: 0.2882(0.3681) 
EVAL: [3844/3845] Elapsed 1m 52s (remain 0m 0s) Loss: 0.2323(0.3709) 
Epoch 2 - avg_train_loss: 0.3538  avg_val_loss: 0.3709  time: 1606s
Epoch 2 - Score: 0.5450 - Threshold: 0.07600
Epoch 2 - Save Best Score: 0.5450 Model
Epoch: [3][0/15379] Elapsed 0m 0s (remain 175m 55s) Loss: 0.3874(0.3874) Grad: 8.9810  LR: 0.00000750  
Epoch: [3][500/15379] Elapsed 0m 47s (remain 23m 27s) Loss: 0.4630(0.2861) Grad: 18.7709  LR: 0.00000740  
Epoch: [3][1000/15379] Elapsed 1m 33s (remain 22m 28s) Loss: 0.2371(0.2855) Grad: 17.8308  LR: 0.00000730  
Epoch: [3][1500/15379] Elapsed 2m 20s (remain 21m 40s) Loss: 0.2229(0.2854) Grad: 10.7840  LR: 0.00000720  
Epoch: [3][2000/15379] Elapsed 3m 7s (remain 20m 53s) Loss: 0.2741(0.2863) Grad: 11.4963  LR: 0.00000710  
Epoch: [3][2500/15379] Elapsed 3m 54s (remain 20m 5s) Loss: 0.2218(0.2870) Grad: 10.2117  LR: 0.00000699  
Epoch: [3][3000/15379] Elapsed 4m 40s (remain 19m 17s) Loss: 0.3339(0.2862) Grad: 17.7824  LR: 0.00000689  
Epoch: [3][3500/15379] Elapsed 5m 27s (remain 18m 31s) Loss: 0.2264(0.2863) Grad: 11.9146  LR: 0.00000678  
Epoch: [3][4000/15379] Elapsed 6m 14s (remain 17m 44s) Loss: 0.4126(0.2859) Grad: 17.5984  LR: 0.00000668  
Epoch: [3][4500/15379] Elapsed 7m 1s (remain 16m 57s) Loss: 0.1259(0.2853) Grad: 9.3458  LR: 0.00000657  
Epoch: [3][5000/15379] Elapsed 7m 47s (remain 16m 10s) Loss: 0.2672(0.2847) Grad: 13.5175  LR: 0.00000646  
Epoch: [3][5500/15379] Elapsed 8m 34s (remain 15m 23s) Loss: 0.2999(0.2840) Grad: 13.6995  LR: 0.00000635  
Epoch: [3][6000/15379] Elapsed 9m 21s (remain 14m 36s) Loss: 0.1562(0.2838) Grad: 12.3989  LR: 0.00000624  
Epoch: [3][6500/15379] Elapsed 10m 7s (remain 13m 49s) Loss: 0.3641(0.2841) Grad: 15.6465  LR: 0.00000613  
Epoch: [3][7000/15379] Elapsed 10m 54s (remain 13m 2s) Loss: 0.2064(0.2842) Grad: 8.9800  LR: 0.00000602  
Epoch: [3][7500/15379] Elapsed 11m 40s (remain 12m 16s) Loss: 0.2848(0.2837) Grad: 12.6508  LR: 0.00000591  
Epoch: [3][8000/15379] Elapsed 12m 27s (remain 11m 29s) Loss: 0.3065(0.2833) Grad: 71.3249  LR: 0.00000580  
Epoch: [3][8500/15379] Elapsed 13m 14s (remain 10m 42s) Loss: 0.3539(0.2828) Grad: 17.0055  LR: 0.00000569  
Epoch: [3][9000/15379] Elapsed 14m 0s (remain 9m 55s) Loss: 0.4312(0.2826) Grad: 18.1832  LR: 0.00000557  
Epoch: [3][9500/15379] Elapsed 14m 47s (remain 9m 9s) Loss: 0.4132(0.2821) Grad: 16.1362  LR: 0.00000546  
Epoch: [3][10000/15379] Elapsed 15m 34s (remain 8m 22s) Loss: 0.3565(0.2816) Grad: 16.9530  LR: 0.00000535  
Epoch: [3][10500/15379] Elapsed 16m 21s (remain 7m 35s) Loss: 0.1258(0.2814) Grad: 14.5216  LR: 0.00000523  
Epoch: [3][11000/15379] Elapsed 17m 7s (remain 6m 48s) Loss: 0.1492(0.2812) Grad: 10.0984  LR: 0.00000512  
Epoch: [3][11500/15379] Elapsed 17m 54s (remain 6m 2s) Loss: 0.4051(0.2808) Grad: 22.7831  LR: 0.00000501  
Epoch: [3][12000/15379] Elapsed 18m 40s (remain 5m 15s) Loss: 0.1637(0.2809) Grad: 8.0875  LR: 0.00000489  
Epoch: [3][12500/15379] Elapsed 19m 27s (remain 4m 28s) Loss: 0.2714(0.2804) Grad: 12.9152  LR: 0.00000478  
Epoch: [3][13000/15379] Elapsed 20m 13s (remain 3m 42s) Loss: 0.2350(0.2796) Grad: 16.8559  LR: 0.00000467  
Epoch: [3][13500/15379] Elapsed 21m 0s (remain 2m 55s) Loss: 0.2574(0.2792) Grad: 15.8314  LR: 0.00000455  
Epoch: [3][14000/15379] Elapsed 21m 46s (remain 2m 8s) Loss: 0.2934(0.2787) Grad: 14.6650  LR: 0.00000444  
Epoch: [3][14500/15379] Elapsed 22m 33s (remain 1m 21s) Loss: 0.2817(0.2783) Grad: 37.3589  LR: 0.00000433  
Epoch: [3][15000/15379] Elapsed 23m 20s (remain 0m 35s) Loss: 0.3584(0.2780) Grad: 22.9930  LR: 0.00000422  
Epoch: [3][15378/15379] Elapsed 23m 55s (remain 0m 0s) Loss: 0.1719(0.2777) Grad: 10.3982  LR: 0.00000413  
EVAL: [0/3845] Elapsed 0m 0s (remain 13m 33s) Loss: 0.8954(0.8954) 
EVAL: [500/3845] Elapsed 0m 11s (remain 1m 15s) Loss: 0.6765(0.3712) 
EVAL: [1000/3845] Elapsed 0m 24s (remain 1m 8s) Loss: 0.2739(0.3695) 
EVAL: [1500/3845] Elapsed 0m 37s (remain 0m 59s) Loss: 0.2669(0.3722) 
EVAL: [2000/3845] Elapsed 0m 52s (remain 0m 48s) Loss: 0.2210(0.3799) 
EVAL: [2500/3845] Elapsed 1m 6s (remain 0m 35s) Loss: 0.4293(0.3745) 
EVAL: [3000/3845] Elapsed 1m 22s (remain 0m 23s) Loss: 0.4046(0.3729) 
EVAL: [3500/3845] Elapsed 1m 38s (remain 0m 9s) Loss: 0.2460(0.3763) 
EVAL: [3844/3845] Elapsed 1m 52s (remain 0m 0s) Loss: 0.2500(0.3794) 
Epoch 3 - avg_train_loss: 0.2777  avg_val_loss: 0.3794  time: 1601s
Epoch 3 - Score: 0.5537 - Threshold: 0.03400
Epoch 3 - Save Best Score: 0.5537 Model
Epoch: [4][0/15379] Elapsed 0m 0s (remain 77m 58s) Loss: 0.2589(0.2589) Grad: inf  LR: 0.00000413  
Epoch: [4][500/15379] Elapsed 0m 46s (remain 23m 12s) Loss: 0.1471(0.2066) Grad: 15.1539  LR: 0.00000402  
Epoch: [4][1000/15379] Elapsed 1m 33s (remain 22m 26s) Loss: 0.2852(0.2089) Grad: 15.1467  LR: 0.00000391  
Epoch: [4][1500/15379] Elapsed 2m 20s (remain 21m 37s) Loss: 0.2051(0.2079) Grad: 31.1701  LR: 0.00000380  
Epoch: [4][2000/15379] Elapsed 3m 7s (remain 20m 51s) Loss: 0.1297(0.2076) Grad: 17.1928  LR: 0.00000369  
Epoch: [4][2500/15379] Elapsed 3m 53s (remain 20m 3s) Loss: 0.0458(0.2074) Grad: 7.1598  LR: 0.00000358  
Epoch: [4][3000/15379] Elapsed 4m 40s (remain 19m 17s) Loss: 0.1653(0.2070) Grad: 16.8516  LR: 0.00000347  
Epoch: [4][3500/15379] Elapsed 5m 27s (remain 18m 31s) Loss: 0.2482(0.2067) Grad: 21.1887  LR: 0.00000336  
Epoch: [4][4000/15379] Elapsed 6m 14s (remain 17m 44s) Loss: 0.3843(0.2066) Grad: 23.1459  LR: 0.00000326  
Epoch: [4][4500/15379] Elapsed 7m 1s (remain 16m 58s) Loss: 0.2633(0.2054) Grad: 38.2187  LR: 0.00000315  
Epoch: [4][5000/15379] Elapsed 7m 48s (remain 16m 11s) Loss: 0.1514(0.2049) Grad: 23.4755  LR: 0.00000305  
Epoch: [4][5500/15379] Elapsed 8m 34s (remain 15m 24s) Loss: 0.2326(0.2046) Grad: 25.9484  LR: 0.00000294  
Epoch: [4][6000/15379] Elapsed 9m 21s (remain 14m 37s) Loss: 0.1747(0.2050) Grad: 15.1539  LR: 0.00000284  
Epoch: [4][6500/15379] Elapsed 10m 8s (remain 13m 50s) Loss: 0.0838(0.2052) Grad: 8.3395  LR: 0.00000274  
Epoch: [4][7000/15379] Elapsed 10m 55s (remain 13m 4s) Loss: 0.2782(0.2053) Grad: 27.0671  LR: 0.00000264  
Epoch: [4][7500/15379] Elapsed 11m 41s (remain 12m 17s) Loss: 0.3138(0.2051) Grad: 34.9858  LR: 0.00000254  
Epoch: [4][8000/15379] Elapsed 12m 28s (remain 11m 30s) Loss: 0.4934(0.2048) Grad: 41.4582  LR: 0.00000244  
Epoch: [4][8500/15379] Elapsed 13m 15s (remain 10m 43s) Loss: 0.2752(0.2048) Grad: 15.8718  LR: 0.00000234  
Epoch: [4][9000/15379] Elapsed 14m 1s (remain 9m 56s) Loss: 0.0282(0.2045) Grad: 4.3033  LR: 0.00000225  
Epoch: [4][9500/15379] Elapsed 14m 48s (remain 9m 9s) Loss: 0.0959(0.2042) Grad: 19.2758  LR: 0.00000215  
Epoch: [4][10000/15379] Elapsed 15m 35s (remain 8m 23s) Loss: 0.1346(0.2040) Grad: 19.0329  LR: 0.00000206  
Epoch: [4][10500/15379] Elapsed 16m 22s (remain 7m 36s) Loss: 0.1746(0.2039) Grad: 13.0410  LR: 0.00000197  
Epoch: [4][11000/15379] Elapsed 17m 9s (remain 6m 49s) Loss: 0.1176(0.2036) Grad: 14.4489  LR: 0.00000188  
Epoch: [4][11500/15379] Elapsed 17m 55s (remain 6m 2s) Loss: 0.0999(0.2037) Grad: 16.3581  LR: 0.00000179  
Epoch: [4][12000/15379] Elapsed 18m 42s (remain 5m 15s) Loss: 0.2641(0.2034) Grad: 23.0338  LR: 0.00000171  
Epoch: [4][12500/15379] Elapsed 19m 29s (remain 4m 29s) Loss: 0.0851(0.2034) Grad: 17.8624  LR: 0.00000162  
Epoch: [4][13000/15379] Elapsed 20m 15s (remain 3m 42s) Loss: 0.0619(0.2033) Grad: 17.6937  LR: 0.00000154  
Epoch: [4][13500/15379] Elapsed 21m 1s (remain 2m 55s) Loss: 0.1166(0.2030) Grad: 13.2694  LR: 0.00000146  
Epoch: [4][14000/15379] Elapsed 21m 48s (remain 2m 8s) Loss: 0.1543(0.2030) Grad: 28.7602  LR: 0.00000138  
Epoch: [4][14500/15379] Elapsed 22m 35s (remain 1m 22s) Loss: 0.2220(0.2031) Grad: 24.5910  LR: 0.00000130  
Epoch: [4][15000/15379] Elapsed 23m 21s (remain 0m 35s) Loss: 0.1517(0.2027) Grad: 22.1565  LR: 0.00000123  
Epoch: [4][15378/15379] Elapsed 23m 57s (remain 0m 0s) Loss: 0.0912(0.2027) Grad: 18.5728  LR: 0.00000117  
EVAL: [0/3845] Elapsed 0m 0s (remain 13m 53s) Loss: 0.9226(0.9226) 
EVAL: [500/3845] Elapsed 0m 11s (remain 1m 15s) Loss: 0.7594(0.4212) 
EVAL: [1000/3845] Elapsed 0m 24s (remain 1m 8s) Loss: 0.2681(0.4153) 
EVAL: [1500/3845] Elapsed 0m 37s (remain 0m 59s) Loss: 0.2485(0.4166) 
EVAL: [2000/3845] Elapsed 0m 52s (remain 0m 48s) Loss: 0.2163(0.4229) 
EVAL: [2500/3845] Elapsed 1m 7s (remain 0m 36s) Loss: 0.4399(0.4151) 
EVAL: [3000/3845] Elapsed 1m 22s (remain 0m 23s) Loss: 0.4103(0.4102) 
EVAL: [3500/3845] Elapsed 1m 38s (remain 0m 9s) Loss: 0.2894(0.4120) 
EVAL: [3844/3845] Elapsed 1m 52s (remain 0m 0s) Loss: 0.2747(0.4146) 
Epoch 4 - avg_train_loss: 0.2027  avg_val_loss: 0.4146  time: 1601s
Epoch 4 - Score: 0.5551 - Threshold: 0.02200
Epoch 4 - Save Best Score: 0.5551 Model
Epoch: [5][0/15379] Elapsed 0m 0s (remain 78m 50s) Loss: 0.2108(0.2108) Grad: inf  LR: 0.00000117  
Epoch: [5][500/15379] Elapsed 0m 46s (remain 23m 10s) Loss: 0.2002(0.1502) Grad: 26.4544  LR: 0.00000110  
Epoch: [5][1000/15379] Elapsed 1m 33s (remain 22m 21s) Loss: 0.2929(0.1525) Grad: 24.1264  LR: 0.00000103  
Epoch: [5][1500/15379] Elapsed 2m 19s (remain 21m 34s) Loss: 0.1042(0.1525) Grad: 21.4917  LR: 0.00000096  
Epoch: [5][2000/15379] Elapsed 3m 6s (remain 20m 47s) Loss: 0.1221(0.1519) Grad: 16.7280  LR: 0.00000089  
Epoch: [5][2500/15379] Elapsed 3m 53s (remain 20m 1s) Loss: 0.3239(0.1517) Grad: 42.5904  LR: 0.00000083  
Epoch: [5][3000/15379] Elapsed 4m 40s (remain 19m 15s) Loss: 0.1165(0.1512) Grad: 25.4244  LR: 0.00000077  
Epoch: [5][3500/15379] Elapsed 5m 26s (remain 18m 28s) Loss: 0.0569(0.1503) Grad: 38.1647  LR: 0.00000071  
Epoch: [5][4000/15379] Elapsed 6m 13s (remain 17m 42s) Loss: 0.2952(0.1494) Grad: 35.0370  LR: 0.00000065  
Epoch: [5][4500/15379] Elapsed 7m 0s (remain 16m 56s) Loss: 0.0610(0.1494) Grad: 16.7459  LR: 0.00000060  
Epoch: [5][5000/15379] Elapsed 7m 47s (remain 16m 10s) Loss: 0.1661(0.1490) Grad: 21.5310  LR: 0.00000054  
Epoch: [5][5500/15379] Elapsed 8m 34s (remain 15m 23s) Loss: 0.1599(0.1488) Grad: 17.9489  LR: 0.00000049  
Epoch: [5][6000/15379] Elapsed 9m 21s (remain 14m 36s) Loss: 0.1967(0.1492) Grad: 55.7475  LR: 0.00000045  
Epoch: [5][6500/15379] Elapsed 10m 8s (remain 13m 50s) Loss: 0.1942(0.1492) Grad: 15.6444  LR: 0.00000040  
Epoch: [5][7000/15379] Elapsed 10m 54s (remain 13m 3s) Loss: 0.2771(0.1493) Grad: 22.3882  LR: 0.00000036  
Epoch: [5][7500/15379] Elapsed 11m 41s (remain 12m 16s) Loss: 0.0491(0.1491) Grad: 10.3095  LR: 0.00000032  
Epoch: [5][8000/15379] Elapsed 12m 28s (remain 11m 30s) Loss: 0.0355(0.1493) Grad: 6.4640  LR: 0.00000028  
Epoch: [5][8500/15379] Elapsed 13m 15s (remain 10m 43s) Loss: 0.2344(0.1495) Grad: 20.4128  LR: 0.00000024  
Epoch: [5][9000/15379] Elapsed 14m 2s (remain 9m 56s) Loss: 0.1409(0.1497) Grad: 22.0691  LR: 0.00000021  
Epoch: [5][9500/15379] Elapsed 14m 48s (remain 9m 9s) Loss: 0.0565(0.1496) Grad: 14.3924  LR: 0.00000018  
Epoch: [5][10000/15379] Elapsed 15m 35s (remain 8m 23s) Loss: 0.0975(0.1495) Grad: 10.3407  LR: 0.00000015  
Epoch: [5][10500/15379] Elapsed 16m 22s (remain 7m 36s) Loss: 0.0754(0.1494) Grad: 16.1291  LR: 0.00000012  
Epoch: [5][11000/15379] Elapsed 17m 8s (remain 6m 49s) Loss: 0.2850(0.1490) Grad: 33.3775  LR: 0.00000010  
Epoch: [5][11500/15379] Elapsed 17m 55s (remain 6m 2s) Loss: 0.3100(0.1491) Grad: 43.6708  LR: 0.00000008  
Epoch: [5][12000/15379] Elapsed 18m 41s (remain 5m 15s) Loss: 0.0597(0.1490) Grad: 9.8335  LR: 0.00000006  
Epoch: [5][12500/15379] Elapsed 19m 28s (remain 4m 29s) Loss: 0.1224(0.1488) Grad: 23.9168  LR: 0.00000004  
Epoch: [5][13000/15379] Elapsed 20m 15s (remain 3m 42s) Loss: 0.1171(0.1491) Grad: 23.9006  LR: 0.00000003  
Epoch: [5][13500/15379] Elapsed 21m 1s (remain 2m 55s) Loss: 0.1973(0.1489) Grad: 33.7229  LR: 0.00000002  
Epoch: [5][14000/15379] Elapsed 21m 48s (remain 2m 8s) Loss: 0.1673(0.1490) Grad: 17.6193  LR: 0.00000001  
Epoch: [5][14500/15379] Elapsed 22m 35s (remain 1m 22s) Loss: 0.0895(0.1488) Grad: 26.0954  LR: 0.00000000  
Epoch: [5][15000/15379] Elapsed 23m 21s (remain 0m 35s) Loss: 0.3163(0.1485) Grad: 27.9466  LR: 0.00000000  
Epoch: [5][15378/15379] Elapsed 23m 57s (remain 0m 0s) Loss: 0.1339(0.1484) Grad: 28.8154  LR: 0.00000000  
EVAL: [0/3845] Elapsed 0m 0s (remain 13m 4s) Loss: 1.0793(1.0793) 
EVAL: [500/3845] Elapsed 0m 11s (remain 1m 15s) Loss: 1.0445(0.4817) 
EVAL: [1000/3845] Elapsed 0m 24s (remain 1m 8s) Loss: 0.3109(0.4720) 
EVAL: [1500/3845] Elapsed 0m 37s (remain 0m 59s) Loss: 0.2988(0.4736) 
EVAL: [2000/3845] Elapsed 0m 52s (remain 0m 48s) Loss: 0.2573(0.4812) 
EVAL: [2500/3845] Elapsed 1m 6s (remain 0m 35s) Loss: 0.4580(0.4712) 
EVAL: [3000/3845] Elapsed 1m 22s (remain 0m 23s) Loss: 0.3279(0.4654) 
EVAL: [3500/3845] Elapsed 1m 38s (remain 0m 9s) Loss: 0.3285(0.4668) 
EVAL: [3844/3845] Elapsed 1m 51s (remain 0m 0s) Loss: 0.2933(0.4695) 
Epoch 5 - avg_train_loss: 0.1484  avg_val_loss: 0.4695  time: 1601s
Epoch 5 - Score: 0.5533 - Threshold: 0.01200
Our CV score is 0.5551 using a threshold of 0.022000000000000002
Requirement already satisfied: kaggle in /usr/local/lib/python3.9/dist-packages (1.5.12)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.28.1)
Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)
Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.9/dist-packages (from kaggle) (1.26.10)
Requirement already satisfied: python-slugify in /usr/lib/python3/dist-packages (from kaggle) (4.0.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from kaggle) (4.64.0)
Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (2.1.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests->kaggle) (2.8)
Create Dataset name:lecr-trn-use-parent-title, output_dir:/notebooks/kaggle_lecr/output/lecr-trn-use-parent-title
Starting upload for file xlm-roberta-base_fold0_42.pth
Upload successful: xlm-roberta-base_fold0_42.pth (1GB)
Starting upload for file score.pkl
Upload successful: score.pkl (169B)
