In [None]:
!nvidia-smi

In [None]:
import numpy as np
import pandas as pd

import os
import math
import sys
sys.path.append("../input/torch-components-library/torch-components-main")
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from torch_components import Configuration, Timer, Averager
from torch_components.callbacks import EarlyStopping, ModelCheckpoint
from torch_components.utils import seed_everything, get_lr, get_optimizer, get_scheduler, get_batch
from torch_components.import_utils import wandb_run_exists
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Optimizer
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import GradScaler, autocast
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_constant_schedule_with_warmup
)
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import AutoModel
from transformers import PreTrainedModel

from tqdm import tqdm
from datetime import timedelta
import scipy
from torch.optim import lr_scheduler
import warnings

def fxn():
    """Emit one ``DeprecationWarning`` carrying the message "deprecated"."""
    warnings.warn(message="deprecated", category=DeprecationWarning)


# Call the helper once with every warning filtered out. The filter change is
# scoped to this ``with`` block and is undone when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    fxn()
import gc
gc.enable()

In [None]:
# Set TOKENIZERS_PARALLELISM to "false" in this process' environment.
# NOTE(review): presumably this keeps the tokenizers library from running its
# own parallelism next to DataLoader worker processes - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Module-level device handle; it is hard-wired to CUDA with no CPU fallback.
device = torch.device("cuda")
# Prefix interpolated in front of some relative paths built in later cells
# (an empty string leaves those paths unchanged).
current_dir = ''

In [None]:
# Central experiment configuration. Later cells read these values through
# ``conf_dict[...]`` lookups. Commented-out entries are alternative values for
# the key that follows or precedes them.
conf_dict = {
    # Number of cross-validation folds (used for fold creation and the fold loop).
    "split_num" : 4,
    # List of seeds; the training cell iterates over every entry.
    "seed" : [2021],
#     "seed" : [0],
    "batch_size" : 64,
    "input_path"  : '../input/us-patent-phrase-to-phrase-matching/',
    "model_path" : '../input/uspppm-debertv3large-5folds-v2/',
    # Base learning rate read by get_optimizer_params.
    "learning_rate" : 2e-5,
#     "learning_rate" : 1e-4,
    # Multiplier applied to "learning_rate" for parameters outside the encoder.
    "regressor_lr" : 1.0,
    "weight_decay" : 0.01,
    "max_length" : 256,
    "epochs" : 6,
#     "epochs" : 1,
    # Written into the transformer config as hidden_dropout_prob /
    # attention_probs_dropout_prob by the training cell.
    "hidden_dropout" : 0,
    "attention_dropout" : 0.1,
    # Dropout probabilities of the two dropout layers in the regression head.
    "linear_dropout1" : 0.1,
    "linear_dropout2" : 0.1,
    "warmup_ratio" : 0.06,
    "use_llrd" : True,
    # Per-layer learning-rate decay factor applied in get_optimizer_params.
    "llrd_rate" : 0.95,
    # NOTE(review): this nested dict carries its own lr / weight_decay values in
    # addition to the top-level "learning_rate" / "weight_decay" entries -
    # confirm which ones the optimizer construction actually uses.
    "optimizer":dict(name="AdamW", 
                                      parameters=dict(lr=2e-5, weight_decay=0.0)),
                       
    "scheduler":dict(name="get_cosine_with_hard_restarts_schedule_with_warmup", 
                                      parameters=dict(num_cycles=2, last_epoch=-1)),
#     "llrd_rate" : 0.8,
    "freeze_embed" : False,
    "use_mixout" : True,
    "mixout_prob" : 0.3,
    "use_prior_wd" : False,
    "gradient_norm":1.0,
    "gradient_scaling":True,
    # Selects which checkpoint/config path USPPM_Model and the training cell load.
    "use_USPPM_pretrained" : True,
    # When true, a ReLU is applied between the two linear layers of the head.
    "use_relu" : False,
    "val_interval" : 20,
    "no_interval_epoch" : 0,
    "reinit_layers" : 0,
#     "reinit_layers" : 4,
    # 0: sort by "target" and assign folds round-robin; 1: stratified K-fold.
    "split_type" : 0,
    "output_directory": './',
    "num_workers":4,
    "pin_memory" : True,
    "delta" : 1e-4,
    "scheduling_after":"step",
    "gradient_accumulation_steps":1,
    "validation_steps":500, 
    "amp":True, 
    "debug":True,
    "verbose":250,
    "device":'cuda',
    "cv_monitor_value":"pearson",
    "save_model":True,
     
#     "model_name" : "roberta-base",
#     "model_name" : 'roberta-large',
#     "model_name" : 'microsoft/deberta-base',
#     "model_name" : 'microsoft/deberta-large',
#     "model_name" : "google/electra-large-generator",
    "model_name" : "../input/roberta-base",
#     "model_name" : 'studio-ousia/luke-base',
#     "model_name" : 'studio-ousia/luke-large',
#     "model_name" : 'nghuyong/ernie-2.0-en',
    
    # Sizes of the regression head: LayerNorm / first Linear input width and
    # first Linear output width.
    # NOTE(review): "input_linear_num1" presumably has to equal the encoder's
    # hidden size - confirm when switching "model_name".
    "input_linear_num1" : 768,
    "input_linear_num2" : 384,
#     "input_linear_num1" : 1024,
#     "input_linear_num2" : 512,
}

In [None]:
# Plain-text results log for this run. Opening it with mode 'w' starts the file
# from scratch, so a log left over from a previous run is overwritten.
RESULT_TEXT_PATH = f"{current_dir}result_text.txt"

# One "key:value" line per configuration entry, in dictionary order.
conf_text = "".join(f"{name}:{setting}\n" for name, setting in conf_dict.items())

with open(RESULT_TEXT_PATH, mode='w') as f:
    # Configuration dump followed by one empty separator line.
    f.writelines([conf_text, "\n"])

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator and the generators of *all* CUDA devices), and puts
    cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
    # cannot change hashing in the running kernel; it is only inherited by
    # processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU (manual_seed only seeds the
    # current one) and is a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# Build ``train_df``: load the training CSV, attach the CPC title of each
# context code, create the model input text and assign a ``fold`` id per row.
# ``conf_dict["split_type"]`` selects the fold-assignment strategy.
if conf_dict["split_type"] == 0:
    # Strategy 0: sort by "target" and deal rows out round-robin to the folds.
    train_df = pd.read_csv(f"{conf_dict['input_path']}train.csv")
    train_df['fold'] = -1
    titles = pd.read_csv('../input/cpc-codes/titles.csv')
    train_df = train_df.merge(titles, left_on='context', right_on='code')
    # NOTE(review): '[SEP]' is inserted as literal text - confirm it is the
    # separator token of the tokenizer configured in "model_name".
    train_df['input'] = train_df['title']+'[SEP]'+train_df['anchor']
    train_df = train_df.sort_values("target").reset_index(drop=True)
    # Use the configured fold count so the fold ids always match the
    # ``range(conf_dict["split_num"])`` loop of the training cell.
    train_df["fold"] = train_df.index % conf_dict["split_num"]

if conf_dict["split_type"] == 1:
    # Strategy 1: stratified K-fold over a binned version of "target".
    from sklearn import model_selection

    def create_folds(data, num_splits):
        """Shuffle ``data`` and assign stratified fold ids.

        Args:
            data: DataFrame with a "target" column that ``pd.cut`` can bin.
            num_splits: Number of folds to create.

        Returns:
            A shuffled copy of ``data`` (seed 42, fresh index) with an integer
            "fold" column in ``range(num_splits)``.
        """
        data["fold"] = -1
        data = data.sample(frac=1, random_state=42).reset_index(drop=True)
        # Sturges' rule for the number of stratification bins.
        num_bins = int(np.floor(1 + np.log2(len(data))))
        # NOTE(review): the bins are computed from "target", while the next
        # cell bins the "score" column - confirm "target" is numeric here.
        data.loc[:, "bins"] = pd.cut(
            data["target"], bins=num_bins, labels=False
        )
        kf = model_selection.StratifiedKFold(n_splits=num_splits)
        for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
            data.loc[v_, 'fold'] = f
        data = data.drop("bins", axis=1)
        return data

    train_df = pd.read_csv(f"{conf_dict['input_path']}train.csv")
    titles = pd.read_csv('../input/cpc-codes/titles.csv')
    train_df = train_df.merge(titles, left_on='context', right_on='code')
    train_df['input'] = train_df['title']+'[SEP]'+train_df['anchor']
    train_df = create_folds(train_df, num_splits=conf_dict["split_num"])
    print(train_df.fold.value_counts())

In [None]:
# Helper columns derived from existing ones:
#   score_bin   - index (0-4) of the equal-width bin that "score" falls into
#   text_length - number of whitespace-separated tokens in "input"
train_df["score_bin"] = pd.cut(x=train_df["score"], bins=5, labels=False)
train_df["text_length"] = train_df["input"].map(lambda text: len(text.split()))

In [None]:
# Load the tokenizer of the configured backbone and store a copy next to the
# run outputs. Both locations come from ``conf_dict`` so they stay in sync with
# the training cell, which also reads "model_name" and "output_directory".
tokenizer = AutoTokenizer.from_pretrained(conf_dict["model_name"])
tokenizer_path = os.path.join(conf_dict["output_directory"], "tokenizer/")
tokenizer_files = tokenizer.save_pretrained(tokenizer_path)

In [None]:
class Dataset:
    """Map-style dataset that tokenizes lower-cased text pairs on access.

    Each item is the tokenizer output for ``(texts[i], pair_texts[i])``; when
    ``targets`` is given, the item is the tuple ``(tokenized, targets[i])``.
    No padding is applied here. ``max_length`` is stored on the instance but
    is not forwarded to the tokenizer call.

    Note: this class reuses the name ``Dataset`` that the import cell also
    binds from ``torch.utils.data``; it does not inherit from that class.
    """

    def __init__(self, texts, pair_texts, tokenizer, targets=None, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts
        self.pair_texts = pair_texts
        self.targets = targets

    def __len__(self):
        return len(self.texts)

    def _encode(self, first, second):
        """Tokenize one text pair with the fixed tokenizer options."""
        return self.tokenizer(
            text=first,
            text_pair=second,
            add_special_tokens=True,
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_offsets_mapping=False,
        )

    def __getitem__(self, index):
        encoded = self._encode(
            self.texts[index].lower(),
            self.pair_texts[index].lower(),
        )

        # Unlabelled data: hand back the tokenizer output on its own.
        if self.targets is None:
            return encoded

        return encoded, self.targets[index]

In [None]:
class USPPM_Model(PreTrainedModel): 
    """Transformer encoder with a small regression head.

    The encoder output is mean-pooled over the sequence (padding excluded via
    the attention mask), layer-normalised and passed through
    ``Dropout -> Linear -> [ReLU] -> Dropout -> Linear`` to produce one scalar
    per input sequence. Head sizes, dropout rates and the optional ReLU are
    read from the module-level ``conf_dict``.

    Args:
        conf: Transformer configuration passed to ``PreTrainedModel`` and to
            ``AutoModel.from_pretrained``.
    """
    def __init__(self, conf):
        super(USPPM_Model, self).__init__(conf) 

        # NOTE(review): the "use_USPPM_pretrained" branch loads the weight file
        # under ../input/roberta-base directly - confirm that this is the
        # intended checkpoint and that the installed transformers version
        # still accepts a path to a .bin file.
        if conf_dict["use_USPPM_pretrained"]:
            self.bert = AutoModel.from_pretrained(f'{current_dir}../input/roberta-base/pytorch_model.bin', config=conf)
        else:
            self.bert = AutoModel.from_pretrained(conf_dict["model_name"], config=conf)
        
        self.drop_out1 = nn.Dropout(conf_dict["linear_dropout1"])
        self.drop_out2 = nn.Dropout(conf_dict["linear_dropout2"])

        self.layer_norm1 = nn.LayerNorm(conf_dict["input_linear_num1"])
        self.l1 = nn.Linear(conf_dict["input_linear_num1"], conf_dict["input_linear_num2"])
        self.l2 = nn.Linear(conf_dict["input_linear_num2"], 1)

        # Initialise the freshly created head layers explicitly.
        self._init_weights(self.layer_norm1)
        self._init_weights(self.l1)
        self._init_weights(self.l2)
 
    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and embedding weights are drawn from N(0, initializer_range);
        biases and the padding embedding row are zeroed; LayerNorm is reset to
        weight 1 / bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def forward(self, ids, mask):
        """Predict one score per sequence.

        Args:
            ids: Token-id tensor passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder; positions with value 0
                are also excluded from the mean pooling.

        Returns:
            Tensor of predictions with the trailing size-1 dimension removed.
        """
        out = self.bert(
            input_ids=ids,
            attention_mask=mask
        )

        hidden = out['last_hidden_state']

        # Mask-aware mean pooling: padded positions must not contribute,
        # otherwise the pooled vector of a sample changes with the amount of
        # padding its batch happens to need.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * weights).sum(dim=1)
        token_count = weights.sum(dim=1).clamp(min=1.0)  # guard against empty masks
        out = token_sum / token_count

        out = self.layer_norm1(out)
        out = self.drop_out1(out)
        out = self.l1(out)
        if conf_dict["use_relu"]:
            out = F.relu(out)
        out = self.drop_out2(out)
        out = self.l2(out)
        
        preds = out.squeeze(-1)

        return preds

In [None]:
def get_optimizer_params(model):
    """Build optimizer parameter groups with layer-wise learning-rate decay.

    Group layout of the returned list:

    * first group: every parameter whose name does not contain ``"bert"``
      (the regression head), trained at
      ``learning_rate * regressor_lr``;
    * then, walking from the last encoder layer down to the embeddings, two
      groups per layer - parameters that receive weight decay and parameters
      that do not (names containing ``"bias"`` or ``"LayerNorm.weight"``).
      The learning rate is multiplied by ``llrd_rate`` before each layer, so
      the top encoder layer already trains at ``learning_rate * llrd_rate``.

    Backbone parameters that live outside ``embeddings`` and
    ``encoder.layer`` are not placed in any group.

    Args:
        model: Module exposing ``model.bert.embeddings`` and
            ``model.bert.encoder.layer``.

    Returns:
        List of parameter-group dicts suitable for a torch optimizer.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    model_word = "bert"
    backbone = getattr(model, model_word)

    optimizer_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if model_word not in n],
            'lr': conf_dict["learning_rate"] * conf_dict["regressor_lr"],
        }
    ]

    # Top encoder layer first, embeddings last.
    layers = [backbone.embeddings] + list(backbone.encoder.layer)
    layers.reverse()

    learning_rate = conf_dict["learning_rate"]
    for layer in layers:
        learning_rate *= conf_dict["llrd_rate"]
        optimizer_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                # Must be spelled "weight_decay": that is the group key torch
                # optimizers read, any other key is silently ignored.
                "weight_decay": conf_dict["weight_decay"],
                "lr": learning_rate,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": learning_rate,
            },
        ]
    return optimizer_parameters

In [None]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction

class Mixout(InplaceFunction):
    """Custom autograd function that stochastically mixes ``input`` with ``target``.

    In training mode with ``0 < p < 1`` the forward pass samples a Bernoulli
    mask with keep-probability ``1 - p`` and returns
    ``((1 - mask) * target + mask * input - p * target) / (1 - p)``.
    With ``p == 0`` or outside training mode ``input`` is returned unchanged;
    with ``p == 1`` the result is ``target``. When ``target`` is ``None`` a
    zero tensor shaped like ``input`` is used.
    """
    @staticmethod
    def _make_noise(input):
        # New tensor with the type and size of ``input``; its contents are
        # uninitialised until the caller fills them.
        return input.new().resize_as_(input)

    @classmethod
    def forward(cls, ctx, input, target=None, p=0.0, training=False, inplace=False):
        # Validate arguments before anything is stored on the context.
        if p < 0 or p > 1:
            raise ValueError("A mix probability of mixout has to be between 0 and 1," " but got {}".format(p))
        if target is not None and input.size() != target.size():
            raise ValueError(
                "A target tensor size must match with a input tensor size {},"
                " but got {}".format(input.size(), target.size())
            )
        # ``backward`` reads both values to decide whether a mask was applied.
        ctx.p = p
        ctx.training = training

        # Identity when mixing is disabled or the module is in eval mode.
        if ctx.p == 0 or not ctx.training:
            return input

        # Without an explicit target, mix towards zeros.
        if target is None:
            target = cls._make_noise(input)
            target.fill_(0)
        target = target.to(input.device)

        if inplace:
            ctx.mark_dirty(input)
            output = input
        else:
            output = input.clone()

        # Sample the keep-mask. For tensors with more than one dimension only
        # the first slice along dim 0 is sampled and then repeated
        # ``input.size()[0]`` times, so all slices along dim 0 share one mask.
        ctx.noise = cls._make_noise(input)
        if len(ctx.noise.size()) == 1:
            ctx.noise.bernoulli_(1 - ctx.p)
        else:
            ctx.noise[0].bernoulli_(1 - ctx.p)
            ctx.noise = ctx.noise[0].repeat(input.size()[0], 1)
        # The result of this call is not assigned, so it does not change ctx.noise.
        ctx.noise.expand_as(input)

        if ctx.p == 1:
            output = target
        else:
            output = ((1 - ctx.noise) * target + ctx.noise * output - ctx.p * target) / (1 - ctx.p)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # One return value per forward argument (input, target, p, training,
        # inplace); only ``input`` receives a gradient.
        if ctx.p > 0 and ctx.training:
            # Gradient flows only through the positions kept by the mask.
            return grad_output * ctx.noise, None, None, None, None
        else:
            return grad_output, None, None, None, None


def mixout(input, target=None, p=0.0, training=False, inplace=False):
    """Functional entry point: apply the ``Mixout`` autograd function.

    All arguments are forwarded positionally, in this order, to
    ``Mixout.apply``.
    """
    return Mixout.apply(input, target, p, training, inplace)


class MixLinear(torch.nn.Module):
    """Linear layer whose weight matrix is passed through ``mixout`` on every call.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        bias: When false, the layer has no additive bias.
        target: Tensor the weight is mixed towards (``None`` = no target given).
        p: Mix probability handed to ``mixout``.
    """
    __constants__ = ["bias", "in_features", "out_features"]

    def __init__(self, in_features, out_features, bias=True, target=None, p=0.0):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.target, self.p = target, p

        self.weight = Parameter(torch.empty(out_features, in_features))
        if not bias:
            self.register_parameter("bias", None)
        else:
            self.bias = Parameter(torch.empty(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the weight (Kaiming uniform) and, if present, the bias."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is None:
            return
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        limit = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -limit, limit)

    def forward(self, input):
        mixed_weight = mixout(self.weight, self.target, self.p, self.training)
        return F.linear(input, mixed_weight, self.bias)

    def extra_repr(self):
        # Reported as "dropout" when no target is set, "mixout" otherwise.
        prefix = "mix" if self.target is not None else "drop"
        has_bias = self.bias is not None
        return (
            f"{prefix}out={self.p}, in_features={self.in_features}, "
            f"out_features={self.out_features}, bias={has_bias}"
        )

In [None]:
class PriorWD(Optimizer):
    """Optimizer wrapper that decays weights towards their initial values.

    On construction the wrapper remembers each group's ``weight_decay``, sets
    it to 0 on the wrapped optimizer and snapshots every parameter. When
    ``use_prior_wd`` is true, ``step`` first applies
    ``p -= lr * weight_decay * (p - p_initial)`` and then runs the wrapped
    optimizer. With ``exclude_last_group`` the last parameter group is
    decayed towards zero instead (``p -= lr * weight_decay * p``).

    Note: the wrapped optimizer's own weight decay is zeroed even when
    ``use_prior_wd`` is false, so in that case no weight decay is applied.

    Args:
        optim: The optimizer to wrap; its ``param_groups`` are shared.
        use_prior_wd: Enable the decay-towards-initial-values step.
        exclude_last_group: Decay the last group towards zero rather than
            towards its initial values.
    """
    def __init__(self, optim, use_prior_wd=False, exclude_last_group=True):
        super(PriorWD, self).__init__(optim.param_groups, optim.defaults)
        # Share the group dicts with the wrapped optimizer so that changes
        # (e.g. by a scheduler) are visible to both.
        self.param_groups = optim.param_groups
        self.optim = optim
        self.use_prior_wd = use_prior_wd
        self.exclude_last_group = exclude_last_group

        # Take over weight decay from the wrapped optimizer.
        self.weight_decay_by_group = []
        for group in self.param_groups:
            self.weight_decay_by_group.append(group["weight_decay"])
            group["weight_decay"] = 0

        # Snapshot of every parameter at wrap time, keyed by object identity.
        self.prior_params = {}
        for group in self.param_groups:
            for p in group["params"]:
                self.prior_params[id(p)] = p.detach().clone()

    def step(self, closure=None):
        """Apply the prior weight decay (if enabled), then the wrapped step.

        Returns:
            Whatever the wrapped optimizer's ``step`` returns.
        """
        if self.use_prior_wd:
            last_index = len(self.param_groups) - 1
            for i, group in enumerate(self.param_groups):
                scale = -group["lr"] * self.weight_decay_by_group[i]
                for p in group["params"]:
                    if self.exclude_last_group and i == last_index:
                        # Plain decay towards zero for the excluded group.
                        p.data.add_(p.data, alpha=scale)
                    else:
                        # Decay towards the value captured at wrap time.
                        p.data.add_(p.data - self.prior_params[id(p)], alpha=scale)
        loss = self.optim.step(closure)

        return loss

    def compute_distance_to_prior(self, param):
        """Return the L2 distance between ``param`` and its wrap-time snapshot."""
        assert id(param) in self.prior_params, "parameter not in PriorWD optimizer"
        return (param.data - self.prior_params[id(param)]).pow(2).sum().sqrt()

In [None]:
def valid_model(model):
    """Evaluate ``model`` on the module-level ``val_loader``.

    Relies on the globals ``val_loader``, ``tokenizer``, ``loss_fn``,
    ``device`` and ``conf_dict``. Every batch is expected to unpack into
    ``(excerpts, targets, standard_error)``; the raw texts are tokenized here
    with padding to ``conf_dict["max_length"]``.

    NOTE(review): this batch layout differs from the
    ``(input_ids, attention_mask, targets)`` tuples produced by ``Collator`` -
    confirm this helper is still in use.

    Returns:
        Tuple ``(mean loss per batch, list of targets, list of predictions)``.
    """
    model.eval()

    running_loss = 0
    preds_val, targets_val = [], []

    with torch.no_grad():
        for excerpts, targets, standard_error in val_loader:
            encoded = tokenizer(list(excerpts), truncation=True, padding="max_length", return_tensors='pt', max_length=conf_dict["max_length"])
            input_ids = encoded['input_ids'].to(device, dtype=torch.long)
            attention_mask = encoded['attention_mask'].to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            preds = model(input_ids, attention_mask)
            running_loss = running_loss + loss_fn(preds, targets).item()

            # Collect plain Python lists so nothing keeps GPU memory alive.
            preds_val += preds.cpu().detach().numpy().tolist()
            targets_val += targets.cpu().detach().numpy().tolist()

        loss_val = running_loss / len(val_loader)

    return loss_val, targets_val, preds_val

In [None]:
def make_directory(directory, overwriting=False):
    """Ensure that ``directory`` exists.

    Args:
        directory: Path of the directory to create; missing parent
            directories are created as well.
        overwriting: When true and the directory already exists, it is
            deleted together with its contents and recreated empty. When
            false, an existing directory is left untouched.
    """
    # Local import keeps this helper self-contained.
    import shutil

    if os.path.exists(directory):
        if not overwriting:
            return
        shutil.rmtree(directory)

    os.makedirs(directory)


In [None]:
class DynamicPadding:
    """Callable that pads a list of tokenized samples via ``tokenizer.pad``.

    The ``max_length`` handed to ``tokenizer.pad`` is the longest
    ``input_ids`` in the batch, capped at the configured ``max_length`` when
    one is set. The remaining options are forwarded unchanged.
    """

    def __init__(self, tokenizer, max_length=None, padding=True, pad_to_multiple_of=None, return_tensors="pt"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_tensors = return_tensors
    
    def __call__(self, tokenized):
        longest = max(len(sample["input_ids"]) for sample in tokenized)
        if self.max_length is not None:
            longest = min(longest, self.max_length)

        return self.tokenizer.pad(
            encoded_inputs=tokenized,
            max_length=longest,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
class Collator:
    """Batch collate function for ``(tokenized, target)`` samples.

    Keyword arguments are forwarded to ``DynamicPadding``. Calling the
    collator pads the tokenized samples together and returns the tuple
    ``(input_ids, attention_mask, targets)`` as tensors.
    """
    def __init__(self, **kwargs):
        self.dynamic_padding = DynamicPadding(**kwargs)
    
    def __call__(self, batch):
        all_tokenized, all_targets = [], []
        for tokenized, target in batch:
            all_tokenized.append(tokenized)
            all_targets.append(target)
        
        tokenized = self.dynamic_padding(all_tokenized)
        
        # as_tensor passes existing tensors through unchanged (no copy and no
        # "copy construct" warning) and still converts plain lists.
        input_ids = torch.as_tensor(tokenized.input_ids)
        attention_mask = torch.as_tensor(tokenized.attention_mask)
        all_targets = torch.tensor(all_targets)
        
        return input_ids, attention_mask, all_targets

In [None]:
def training_loop(train_loader, 
                  model,
                  optimizer,
                  scheduler=None,
                  scheduling_after="step",
                  epochs=1,
                  validation_loader=None, 
                  gradient_accumulation_steps=1, 
                  gradient_scaling=False,
                  gradient_norm=1,
                  validation_steps=100, 
                  amp=True,
                  recalculate_metrics_at_end=True, 
                  return_validation_outputs=True,
                  debug=True, 
                  verbose=1, 
                  device="cpu", 
                  finish_wandb_run=True, 
                  time_format="{hours}:{minutes}:{seconds}"):
    """Train ``model`` for ``epochs`` epochs with periodic validation.

    Every batch goes through ``training_step``; an optimizer update happens
    whenever the global step counter is a multiple of
    ``gradient_accumulation_steps``. If ``validation_loader`` is given, the
    model is validated every ``validation_steps`` global steps and handed to
    ``model_checkpointing``; the validation result is remembered as "best"
    whenever a checkpoint was saved.

    Args:
        train_loader: Iterable with ``len()`` yielding training batches.
        model: Model to train; moved to ``device``.
        optimizer: Optimizer updating ``model``.
        scheduler: Optional learning-rate scheduler.
        scheduling_after: ``"step"`` to step the scheduler after every batch,
            ``"epoch"`` to step it after every epoch.
        epochs: Number of passes over ``train_loader``.
        validation_loader: Optional loader used for validation.
        gradient_accumulation_steps: Batches accumulated per optimizer update.
        gradient_scaling: Create a ``GradScaler`` for mixed precision.
        gradient_norm: Max gradient norm for clipping (0 disables clipping).
        validation_steps: Validate every this many global steps.
        amp: Enable autocast in training and validation.
        recalculate_metrics_at_end: Forwarded to ``validation_loop``.
        return_validation_outputs: Include the best validation outputs in the
            return value.
        debug: Print the run settings before training.
        verbose: Print progress every ``verbose`` steps.
        device: Device used for model and batches.
        finish_wandb_run: Call ``wandb.finish()`` at the end of training.
        time_format: Format string used for timers.

    Returns:
        ``((train_loss, train_metrics), (best_val_loss, best_val_metrics,
        best_val_outputs))``; the last element is omitted when
        ``return_validation_outputs`` is false. The "best" values stay ``None``
        if no checkpoint was ever saved.
    """
    scaler = GradScaler() if gradient_scaling else None
    
    # NOTE(review): ``wandb`` is used whenever ``wandb_run_exists()`` is true,
    # but no ``import wandb`` is visible in this part of the notebook -
    # confirm it is imported before relying on W&B logging.
    if wandb_run_exists():
        wandb.define_metric("train/loss vs epoch", step_metric="epoch")
    
    if debug:
        print(f"Auto Mixed Precision: {amp}")
        print(f"Gradient norm: {gradient_norm}")
        print(f"Gradient scaling: {gradient_scaling}")
        print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
        print(f"Validation steps: {validation_steps}")
        print(f"Device: {device}")
        print()
        
    
    if wandb_run_exists():
        print(f"Weights & Biases Run: {wandb.run.get_url()}", end="\n"*2)
        
    
    # Global step counter across epochs (1-based).
    passed_steps = 1
    train_loss, train_metrics = Averager(), Averager()
    best_validation_loss, best_validation_metrics, best_validation_outputs = None, None, None
    
    model.to(device)
    model.zero_grad()
    total_time = timedelta(seconds=0)
    for epoch in range(1, epochs+1):
        print(f"\nEpoch {epoch}/{epochs}", end="\n"*2)

        timer = Timer(time_format)
        epoch_train_loss, epoch_train_metrics = Averager(), Averager()
        steps = len(train_loader)
        for step, batch in enumerate(train_loader, 1):
            # Weight the running averages by the number of samples. ``batch``
            # itself is the (input_ids, attention_mask, targets) tuple, so
            # ``len(batch)`` would always be 3.
            batch_size = len(get_targets(batch))
            batch_loss, batch_metrics = training_step(batch=batch, 
                                                      model=model, 
                                                      gradient_accumulation_steps=gradient_accumulation_steps, 
                                                      amp=amp, 
                                                      scaler=scaler, 
                                                      device=device)
            
            train_loss.update(batch_loss, n=batch_size)
            train_metrics.update(batch_metrics, n=batch_size)
            epoch_train_loss.update(batch_loss, n=batch_size)
            epoch_train_metrics.update(batch_metrics, n=batch_size)
            
            if (passed_steps % gradient_accumulation_steps) == 0:
                optimization_step(model=model, optimizer=optimizer, gradient_norm=gradient_norm, scaler=scaler)
                

            # Read the learning rate before the scheduler advances it.
            lr = get_lr(optimizer, only_last=True)
            if scheduling_after == "step":
                scheduling_step(scheduler)
            
            logs = {"train/loss": train_loss.average, 
                    "train/loss vs batch": batch_loss, "lr": lr}
            
            for metric in batch_metrics:
                metric = metric.strip().lower()
                logs.update({f"train/{metric}": train_metrics.average[metric], 
                             f"train/{metric} vs batch": batch_metrics[metric]})
                
            if wandb_run_exists():
                wandb.log(logs, step=passed_steps) 
            
            if step % verbose == 0 or step == steps:
                _, remain = timer(step/steps)
                print(f"{step}/{steps} - "
                      f"remain: {remain} - "
                      f"loss: {epoch_train_loss.average:.4}"
                      f"{format_metrics(epoch_train_metrics.average)}")
            
            if validation_loader is not None:
                if (passed_steps % validation_steps) == 0:
                    print()
                    validation_loss, validation_metrics, validation_outputs = validation_loop(loader=validation_loader, 
                                                                                              model=model, 
                                                                                              amp=amp, 
                                                                                              return_outputs=True, 
                                                                                              verbose=verbose, 
                                                                                              recalculate_metrics_at_end=recalculate_metrics_at_end, 
                                                                                              device=device)
                    
                    
                    logs = {"validation/loss": validation_loss, 
                            "train/loss vs validation steps": train_loss.average}
    
                    for metric, value in validation_metrics.items():
                        metric = metric.strip().lower()
                        logs.update({f"validation/{metric}": value, 
                                     f"train/{metric} vs validation steps": train_metrics.average[metric]})
                    
                    if wandb_run_exists():
                        wandb.log(logs, step=passed_steps)
                    
                    is_checkpoint_saved = model_checkpointing(loss=validation_loss, 
                                                              metrics=validation_metrics,
                                                              model=model, 
                                                              optimizer=optimizer, 
                                                              scheduler=scheduler, 
                                                              step=passed_steps)
                    
                    if is_checkpoint_saved:
                        best_validation_loss = validation_loss
                        best_validation_metrics = validation_metrics
                        best_validation_outputs = validation_outputs
                    
                    # NOTE(review): this steps the scheduler once more after
                    # every validation, for every scheduler type, in addition
                    # to the per-step/per-epoch call - confirm this is wanted
                    # for schedulers other than ReduceLROnPlateau.
                    scheduling_step(scheduler, loss=validation_loss)
                    
                    print()
            
            passed_steps += 1
        
        if scheduling_after == "epoch":
            scheduling_step(scheduler)
        
        logs = {"train/loss vs epoch": epoch_train_loss.average, 
                "epoch": epoch}
        
        
        for metric, value in train_metrics.average.items():
            metric = metric.strip().lower()
            logs.update({f"train/{metric} vs epoch": value})
            
            if wandb_run_exists():
                wandb.define_metric(f"train/{metric} vs epoch", step_metric="epoch")
            
        epoch_elapsed_seconds = timer.elapsed_time.total_seconds()
        total_time += timedelta(seconds=epoch_elapsed_seconds)
        
        if wandb_run_exists():
            wandb.log(logs, step=passed_steps)

    
    print(f"\nResults", end="\n"*2)
    
    print(f"Training loss: {train_loss.average}{format_metrics(train_metrics.average)}")
    # ``best_validation_metrics`` is still None when no checkpoint was saved;
    # fall back to an empty mapping so the summary can always be printed.
    print(f"Validation loss: {best_validation_loss}{format_metrics(best_validation_metrics or {})}")
    print(f"Total time: {Timer.format_time(total_time, time_format=time_format)}")
    
    if wandb_run_exists() and finish_wandb_run:
        wandb.finish()
    
    
    if return_validation_outputs:
        return (train_loss.average, train_metrics.average), (best_validation_loss, best_validation_metrics, best_validation_outputs)
    
    return (train_loss.average, train_metrics.average), (best_validation_loss, best_validation_metrics)
    

    
def validation_loop(loader, 
                    model, 
                    amp=False, 
                    return_outputs=True, 
                    recalculate_metrics_at_end=True, 
                    verbose=1, 
                    device="cpu", 
                    time_format="{hours}:{minutes}:{seconds}"):
    """Run ``model`` over ``loader`` in eval mode and collect loss and metrics.

    Args:
        loader: Iterable with ``len()`` yielding validation batches.
        model: Model to evaluate; switched to eval mode.
        amp: Enable autocast during the forward passes.
        return_outputs: Also return the collected model outputs.
        recalculate_metrics_at_end: After the last batch, recompute the
            metrics once over all collected outputs/targets instead of
            reporting the running average of per-batch metrics.
        verbose: Print progress every ``verbose`` steps.
        device: Device the batches are moved to.
        time_format: Format string used for the timer.

    Returns:
        ``(loss, metrics, outputs)`` or, without ``return_outputs``,
        ``(loss, metrics)``.
    """
    model.eval()
    loss, metrics = Averager(), Averager()
    timer = Timer(time_format)
    outputs, targets = [], []
    num_steps = len(loader)
    for step, batch in enumerate(loader, 1):
        with torch.no_grad():
            with autocast(enabled=amp):
                batch_loss, batch_outputs = calculate_loss(batch=batch, model=model, return_outputs=True, device=device)
                batch_targets = get_targets(batch)

                # Weight the running averages by the number of samples;
                # ``len(batch)`` is only the length of the batch tuple and is
                # kept as the fallback for dict-style targets.
                batch_size = len(batch) if isinstance(batch_targets, dict) else len(batch_targets)
                loss.update(batch_loss.item(), n=batch_size)
                
                batch_metrics = calculate_metrics(outputs=batch_outputs, targets=batch_targets, device=device)
                metrics.update(batch_metrics, n=batch_size)
                
                if isinstance(batch_targets, dict):
                    targets.append(batch_targets)
                else:
                    targets.extend(batch_targets.to("cpu").tolist())
                
                outputs.extend(batch_outputs.to("cpu").tolist())
                
                if step % verbose == 0 or step == num_steps:
                    _, remain = timer(step/num_steps)
                    
                    # Last step: replace the per-batch running metrics with
                    # metrics computed over the full set of predictions.
                    if step == num_steps and recalculate_metrics_at_end:
                        outputs = torch.tensor(outputs)
                        targets = torch.tensor(targets)
                        
                        metrics = Averager(calculate_metrics(outputs=outputs, targets=targets))
                    
                    print(f"[Validation] "
                          f"{step}/{num_steps} - "
                          f"remain: {remain} - "
                          f"loss: {loss.average:.4}"
                          f"{format_metrics(metrics.average)}")
                    
    # In the recalculation path the outputs were already turned into a tensor.
    if not recalculate_metrics_at_end: 
        outputs = torch.tensor(outputs)
        
    return (loss.average, metrics.average, outputs) if return_outputs else (loss.average, metrics.average)


def format_metrics(metrics, sep=" - ", add_sep_to_start=True):
    """Render a metrics mapping as ``"name: value"`` pairs joined by ``sep``.

    Metric names are stripped and lower-cased; values are formatted with four
    significant digits.

    Args:
        metrics: Mapping of metric name to numeric value. ``None`` and empty
            mappings produce an empty string.
        sep: Separator placed between the pairs.
        add_sep_to_start: Also put ``sep`` in front of the first pair.

    Returns:
        The formatted string, or ``""`` when there is nothing to format.
    """
    if not metrics:
        return ""

    string = sep.join(f"{k.strip().lower()}: {v:.4}" for k, v in metrics.items())
    return sep + string if add_sep_to_start else string

    
def training_step(batch, model, amp=False, gradient_accumulation_steps=1, scaler=None, device="cpu"):
    """Run forward and backward pass for one batch (no optimizer update).

    Args:
        batch: Batch passed on to ``calculate_loss`` / ``get_targets``.
        model: Model to train; switched to train mode.
        amp: Enable autocast for the forward pass.
        gradient_accumulation_steps: The loss is divided by this value when it
            is greater than 1.
        scaler: Optional ``GradScaler``; when given, the scaled loss is
            back-propagated.
        device: Device the batch is moved to.

    Returns:
        ``(loss, metrics)`` where ``loss`` is the detached (and, with
        accumulation, already divided) loss tensor.
    """
    model.train()

    # Only the forward pass and loss computation run under autocast.
    with autocast(enabled=amp):
        loss, outputs = calculate_loss(batch=batch, model=model, return_outputs=True, device=device)
        targets = get_targets(batch)
        metrics = calculate_metrics(outputs=outputs, targets=targets, device=device)
        
        if gradient_accumulation_steps > 1:
            loss /= gradient_accumulation_steps

    # Backward is issued outside the autocast context, as recommended for
    # mixed-precision training; the backward ops reuse the forward dtypes.
    if scaler is not None:
        scaler.scale(loss).backward()
    else:
        loss.backward()
        
    return loss.detach(), metrics


def optimization_step(model, optimizer, gradient_norm=0, scaler=None):
    """Clip gradients (optional), update the parameters and reset gradients.

    Args:
        model: Model whose gradients are clipped and zeroed.
        optimizer: Optimizer performing the update.
        gradient_norm: Max gradient norm; values <= 0 disable clipping.
        scaler: Optional ``GradScaler``. When given, gradients are unscaled
            before clipping and the update goes through the scaler.
    """
    has_scaler = scaler is not None

    if gradient_norm > 0:
        # Clipping has to see the true gradient magnitudes.
        if has_scaler:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_norm)

    if not has_scaler:
        optimizer.step()
    else:
        scaler.step(optimizer)
        scaler.update()

    model.zero_grad()
        

def scheduling_step(scheduler=None, loss=None):
    """Advance ``scheduler`` by one step; do nothing when it is ``None``.

    ``ReduceLROnPlateau`` schedulers are stepped with ``loss``; every other
    scheduler is stepped without arguments.
    """
    if scheduler is None:
        return

    needs_loss = isinstance(scheduler, lr_scheduler.ReduceLROnPlateau)
    if needs_loss:
        scheduler.step(loss)
        return

    scheduler.step()
def calculate_loss(batch, model, return_outputs=True, device="cpu"):
    """Compute the mean-squared-error loss of ``model`` on one batch.

    Args:
        batch: Tuple ``(input_ids, attention_mask, targets)`` of tensors.
        model: Callable taking ``(input_ids, attention_mask)``.
        return_outputs: Also return the model outputs.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss, outputs)`` or just ``loss`` when ``return_outputs`` is false.
    """
    input_ids, attention_mask, targets = batch
    
    input_ids = input_ids.to(device).long()
    attention_mask = attention_mask.to(device).long()
    targets = targets.to(device).float()
    
    outputs = model(input_ids, attention_mask)
    # Drop a trailing size-1 dimension only when the outputs carry one more
    # dimension than the targets. An unconditional squeeze would collapse the
    # outputs of a single-sample batch into a 0-d tensor.
    if outputs.dim() > targets.dim():
        outputs = outputs.squeeze(dim=-1)
    loss = F.mse_loss(outputs, targets, reduction="mean")
        
    return (loss, outputs) if return_outputs else loss


def calculate_metrics(targets, outputs, device="cpu"):
    """Compute the Pearson correlation between predictions and targets.

    Args:
        targets: Tensor of ground-truth values (any shape; flattened).
        outputs: Tensor of predictions (any shape; flattened).
        device: Unused; kept for interface compatibility.

    Returns:
        ``{"pearson": r}``. ``r`` is NaN when fewer than two values are
        available, because the correlation is undefined in that case.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded.
    from scipy import stats

    outputs = outputs.detach().reshape(-1).to("cpu").float().numpy()
    targets = targets.detach().reshape(-1).to("cpu").float().numpy()

    if outputs.size < 2:
        return dict(pearson=float("nan"))
    
    return dict(pearson=stats.pearsonr(outputs, targets)[0])


def get_targets(batch):
    """Return the last element of ``batch`` (the targets by convention here)."""
    *_leading, last = batch
    return last


def model_checkpointing(loss, metrics, model, optimizer=None, scheduler=None, step=None):
    """Forward the current training state to the global ``model_checkpoint``.

    Args:
        loss: passed to the checkpoint callback as the monitored ``value``.
        metrics: accepted but not used by this function.
        model: model handed to the callback.
        optimizer: optional optimizer handed to the callback.
        scheduler: optional scheduler handed to the callback.
        step: optional step counter handed to the callback.

    Returns:
        Whatever ``model_checkpoint(...)`` returns (presumably a flag telling
        whether a checkpoint was written — confirm against ModelCheckpoint).
    """
    # `model_checkpoint` is a notebook-level global, not a parameter.
    checkpoint_kwargs = dict(
        value=loss,
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        step=step,
    )
    return model_checkpoint(**checkpoint_kwargs)


In [None]:
# Cross-validation driver: for every seed and every fold it builds the
# tokenizer, config, data loaders, model, optimizer and scheduler, runs
# `training_loop`, optionally saves the weights, and accumulates the
# out-of-fold predictions (`oof_data_frame`) and per-fold scores (`cv_scores`).
cv_scores = []
oof_data_frame = pd.DataFrame()
with open(RESULT_TEXT_PATH, mode='a') as f:

    for seed in conf_dict["seed"]:
        print('seed:',seed)

        for fold in range(conf_dict["split_num"]):
            print('fold:',fold)

            # Set the model-init seed: fixed to 0 here, independent of `seed`.
            set_seed(0)
            fold_directory = os.path.join(conf_dict['output_directory'], f"seed_{seed}_fold_{fold}")
            make_directory(fold_directory)
            model_path = os.path.join(fold_directory, "model.pth")
            model_config_path = os.path.join(fold_directory, "model_config.json")
            checkpoints_directory = os.path.join(fold_directory, "checkpoints/")

            print(conf_dict["model_name"])
            tokenizer = AutoTokenizer.from_pretrained(conf_dict["model_name"])

            if conf_dict["use_USPPM_pretrained"]:
                model_config = AutoConfig.from_pretrained('../input/roberta-base/config.json')
            else:
                model_config = AutoConfig.from_pretrained(conf_dict["model_name"])

            # Override the dropout rates with the values from the notebook config.
            model_config.update({'hidden_dropout_prob':conf_dict["hidden_dropout"]})
            model_config.update({'attention_probs_dropout_prob':conf_dict["attention_dropout"]})
            # Optional toggle left disabled: model_config.update({'output_hidden_states': True})

            # Rows whose `fold` equals the current fold are held out for validation.
            split_validation_df = train_df[train_df.fold==fold].reset_index(drop=True)
            split_train_df = train_df[train_df.fold!=fold].reset_index(drop=True)
            collator = Collator(tokenizer=tokenizer, max_length=conf_dict['max_length'])

            train_dataset = Dataset(
                texts=split_train_df["input"].values,
                pair_texts=split_train_df["target"].values,
                targets=split_train_df["score"].values,
                max_length=conf_dict['max_length'],
                tokenizer=tokenizer,
            )
            train_loader = DataLoader(
                dataset=train_dataset,
                batch_size=conf_dict['batch_size'],
                num_workers=conf_dict['num_workers'],
                pin_memory=conf_dict['pin_memory'],
                collate_fn=collator,
                shuffle=True,
                drop_last=False,
            )
            print(f"Train samples: {len(train_dataset)}")

            validation_dataset = Dataset(
                texts=split_validation_df["input"].values,
                pair_texts=split_validation_df["target"].values,
                targets=split_validation_df["score"].values,
                max_length=conf_dict['max_length'],
                tokenizer=tokenizer,
            )
            # The validation loader must NOT shuffle: the predictions returned
            # by `training_loop` are written positionally into this fold's rows
            # below, so a shuffled order would pair them with the wrong rows.
            validation_loader = DataLoader(
                dataset=validation_dataset,
                batch_size=conf_dict['batch_size']*2,
                num_workers=conf_dict['num_workers'],
                pin_memory=conf_dict['pin_memory'],
                collate_fn=collator,
                shuffle=False,
                drop_last=False,
            )
            print(f"Validation samples: {len(validation_dataset)}")

            model = USPPM_Model(model_config)

            if conf_dict["use_mixout"]:
                # Swap every nn.Linear child for a MixLinear initialised from the
                # same weights, and switch every nn.Dropout child off (p = 0).
                for sup_module in model.modules():
                    for name, module in sup_module.named_children():
                        if isinstance(module, nn.Dropout):
                            module.p = 0.0
                        if isinstance(module, nn.Linear):
                            target_state_dict = module.state_dict()
                            bias = module.bias is not None
                            new_module = MixLinear(
                                module.in_features,
                                module.out_features,
                                bias,
                                target_state_dict["weight"],
                                conf_dict["mixout_prob"],
                            )
                            new_module.load_state_dict(target_state_dict)
                            setattr(sup_module, name, new_module)

            # Write the model config next to the fold outputs once.
            if not os.path.exists(model_config_path):
                model.config.to_json_file(model_config_path)

            model_parameters = model.parameters()

            # One scheduler step per training batch is assumed here.
            # NOTE(review): gradient accumulation is not factored in - confirm.
            training_steps = len(train_loader) * conf_dict['epochs']

            if conf_dict["freeze_embed"]:
                model.bert.embeddings.requires_grad_(False)

            if conf_dict["use_llrd"]:
                optimizer_grouped_parameters = get_optimizer_params(model)
                optimizer = AdamW(optimizer_grouped_parameters, lr=conf_dict["learning_rate"], weight_decay=conf_dict["weight_decay"])
            else:
                optimizer = AdamW(model.parameters(), lr=conf_dict["learning_rate"], weight_decay=conf_dict["weight_decay"])

            if conf_dict["use_prior_wd"]:
                print('PRIOR_WD')
                optimizer = PriorWD(optimizer, use_prior_wd=True)

            if conf_dict["reinit_layers"] > 0:
                # Re-draw the weights of the last N encoder layers in place.
                print(f'Reinitializing Last {conf_dict["reinit_layers"]} Layers ...')
                encoder_temp = getattr(model, "bert")
                for layer in encoder_temp.encoder.layer[-conf_dict["reinit_layers"]:]:
                    for module in layer.modules():
                        if isinstance(module, nn.Linear):
                            module.weight.data.normal_(mean=0.0, std=model_config.initializer_range)
                            if module.bias is not None:
                                module.bias.data.zero_()
                        elif isinstance(module, nn.Embedding):
                            module.weight.data.normal_(mean=0.0, std=model_config.initializer_range)
                            if module.padding_idx is not None:
                                module.weight.data[module.padding_idx].zero_()
                        elif isinstance(module, nn.LayerNorm):
                            module.bias.data.zero_()
                            module.weight.data.fill_(1.0)

            if "scheduler" in conf_dict:
                # Warm-up spans 60% of all steps; the product is a float.
                conf_dict['scheduler']['parameters']['num_training_steps'] = training_steps
                conf_dict['scheduler']['parameters']['num_warmup_steps'] = training_steps * 0.6
                scheduler = get_scheduler(**conf_dict['scheduler'], optimizer=optimizer, from_transformers=True)
            else:
                scheduler = None

            # `model_checkpoint` is read as a global by `model_checkpointing`.
            model_checkpoint = ModelCheckpoint(
                mode="min",
                delta=conf_dict['delta'],
                directory=checkpoints_directory,
                overwriting=True,
                filename_format="checkpoint.pth",
                num_candidates=1,
            )
            model.to(device)
            best_rms = 999
            best_preds = None

            # Set the train-order seed: this one follows the outer `seed` loop.
            set_seed(seed)
            (train_loss, train_metrics), (validation_loss, validation_metrics, validation_outputs) = training_loop(
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
                scheduling_after=conf_dict['scheduling_after'],
                train_loader=train_loader,
                validation_loader=validation_loader,
                epochs=conf_dict['epochs'],
                gradient_accumulation_steps=conf_dict['gradient_accumulation_steps'],
                gradient_scaling=conf_dict['gradient_scaling'],
                gradient_norm=conf_dict['gradient_norm'],
                validation_steps=conf_dict['validation_steps'],
                amp=conf_dict['amp'],
                debug=conf_dict['debug'],
                verbose=conf_dict['verbose'],
                device=conf_dict['device'],
                recalculate_metrics_at_end=True,
                return_validation_outputs=True,
                finish_wandb_run=True,
            )

            if conf_dict['save_model']:
                model_state = model.state_dict()
                torch.save(model_state, model_path)
                print(f"Model's path: {model_path}")

            # Copy the slice so the new column is added to an independent frame
            # (avoids pandas' chained-assignment warning on a filtered view).
            # Assumes `validation_outputs` follows validation-loader order, which
            # is the fold's row order now that the loader is unshuffled - TODO confirm.
            validation_fold = train_df[train_df["fold"].isin([fold])].copy()
            validation_fold["predictions"] = validation_outputs.to("cpu").numpy()
            oof_data_frame = pd.concat([oof_data_frame, validation_fold])

            # The fold score is the validation loss or one of the named metrics.
            cv_monitor_value = validation_loss if conf_dict['cv_monitor_value'] == "loss" else validation_metrics[conf_dict['cv_monitor_value']]
            cv_scores.append(cv_monitor_value)

            # Drop the per-fold objects before the next fold is built.
            del model, optimizer, validation_outputs
            torch.cuda.empty_cache()
            gc.collect()

            print(end="\n"*6)
        
    
            
    