In [None]:
# PyTorch/XLA build to install; the `#@param` annotation on the next line lists the
# selectable values ("1.5", "20200516", "nightly").
VERSION = "20200516"  #@param ["1.5" , "20200516", "nightly"]
# Download the PyTorch/XLA environment setup script and run it for the selected VERSION.
# NOTE(review): the script is fetched from the `master` branch and executed as-is, so its
# contents are not pinned — consider pinning to a commit.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

In [None]:
import math,os,cv2,random
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler

#transformers
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

# exclude warnings
import warnings
warnings.simplefilter('ignore')

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.utils as xu

# Sets the XLA_USE_BF16 environment variable for this process.
# Presumably this makes torch_xla use bfloat16 for float tensors on the TPU — see the
# torch_xla environment-variable docs to confirm for the installed version.
# NOTE(review): this runs after `import torch_xla`; confirm the variable is read lazily.
os.environ['XLA_USE_BF16']="1"

In [None]:
def get_optimizer_params_large(model):
    """Build layer-wise optimizer parameter groups for a 24-layer RoBERTa backbone.

    The encoder layers (``layer.0.`` .. ``layer.23.``) are split into six blocks of
    four consecutive layers. Each block gets its own learning rate and its own weight
    decay. Parameters whose name matches ``no_decay`` go into parallel groups with
    zero weight decay. Everything outside ``model.roberta`` is trained with a fixed
    learning rate of 1e-3.

    The groups use the ``weight_decay`` key, which is the key AdamW reads from a
    parameter group. The previous ``weight_decay_rate`` key was stored but never
    applied, so all configured decay values were silently ignored.

    Args:
        model: object exposing ``named_parameters()`` and a ``roberta`` attribute that
            also exposes ``named_parameters()``.

    Returns:
        list[dict]: 15 parameter groups in this fixed order:
            [0]     decayed backbone params outside every encoder layer
            [1-6]   decayed params of layer blocks 1..6
            [7]     non-decayed backbone params outside every encoder layer
                    (no ``lr`` key, so the optimizer's default lr applies)
            [8-13]  non-decayed params of layer blocks 1..6
            [14]    all params whose name does not contain ``"roberta"``
    """
    learning_rate = 1e-7
    # Hugging Face modules name LayerNorm parameters `LayerNorm.weight` / `LayerNorm.bias`;
    # `gamma` / `beta` are kept for checkpoints that still use the legacy names.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']

    # Six blocks of four consecutive encoder layers. The trailing dot keeps
    # 'layer.1.' from matching 'layer.10.' etc.
    layer_groups = [[f'layer.{idx}.' for idx in range(first, first + 4)]
                    for first in range(0, 24, 4)]
    group_all = [tag for tags in layer_groups for tag in tags]

    # Per-block hyper-parameters, bottom block first.
    group_lrs = [learning_rate/6.0, learning_rate/5.0, learning_rate/4.0,
                 learning_rate/3.0, learning_rate/2.0, learning_rate]
    group_decays = [cfg.attention_wdecay*10.0, cfg.attention_wdecay*5.0, cfg.attention_wdecay*2.5,
                    cfg.attention_wdecay, cfg.attention_wdecay/2.5, cfg.attention_wdecay/5.0]

    backbone = list(model.roberta.named_parameters())

    def select(tags, decayed):
        """Return backbone params for one group.

        ``tags=None`` selects params outside every encoder layer; otherwise params
        whose name contains any of ``tags``. ``decayed`` chooses between params that
        are eligible for weight decay (True) and those exempt from it (False).
        """
        picked = []
        for name, param in backbone:
            exempt = any(nd in name for nd in no_decay)
            if exempt == decayed:
                continue
            if tags is None:
                in_scope = not any(tag in name for tag in group_all)
            else:
                in_scope = any(tag in name for tag in tags)
            if in_scope:
                picked.append(param)
        return picked

    optimizer_parameters = [
        {'params': select(None, decayed=True), 'weight_decay': 0.05, 'lr': learning_rate/10.0},
    ]
    for tags, decay, lr in zip(layer_groups, group_decays, group_lrs):
        optimizer_parameters.append(
            {'params': select(tags, decayed=True), 'weight_decay': decay, 'lr': lr})

    optimizer_parameters.append({'params': select(None, decayed=False), 'weight_decay': 0.0})
    for tags, lr in zip(layer_groups, group_lrs):
        optimizer_parameters.append(
            {'params': select(tags, decayed=False), 'weight_decay': 0.0, 'lr': lr})

    # Task head. `momentum` is kept for compatibility with the previous output;
    # NOTE(review): AdamW is configured through `betas`, so this key looks unused.
    optimizer_parameters.append(
        {'params': [p for n, p in model.named_parameters() if "roberta" not in n],
         'lr': 1e-3, "momentum": 0.99})
    return optimizer_parameters

def get_optimizer_params_base(model):
    """Build layer-wise optimizer parameter groups for a (up to) 12-layer RoBERTa backbone.

    The encoder layers (``layer.0.`` .. ``layer.11.``) are split into three blocks of
    four consecutive layers with learning rates ``lr/2``, ``lr`` and ``lr*2`` (base
    ``lr`` = 1e-4). Parameters whose name matches ``no_decay`` go into parallel
    groups with zero weight decay. Everything outside ``model.roberta`` is trained
    with a fixed learning rate of 1e-3.

    The groups use the ``weight_decay`` key, which is the key AdamW reads from a
    parameter group. The previous ``weight_decay_rate`` key was stored but never
    applied, so all configured decay values were silently ignored.

    Args:
        model: object exposing ``named_parameters()`` and a ``roberta`` attribute that
            also exposes ``named_parameters()``.

    Returns:
        list[dict]: 9 parameter groups in this fixed order:
            [0]    decayed backbone params outside every encoder layer
            [1-3]  decayed params of layer blocks 1..3
            [4]    non-decayed backbone params outside every encoder layer
                   (no ``lr`` key, so the optimizer's default lr applies)
            [5-7]  non-decayed params of layer blocks 1..3
            [8]    all params whose name does not contain ``"roberta"``
        A block may be empty when the backbone has fewer layers than 12.
    """
    learning_rate = 1e-4
    # Hugging Face modules name LayerNorm parameters `LayerNorm.weight` / `LayerNorm.bias`;
    # `gamma` / `beta` are kept for checkpoints that still use the legacy names.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']

    # Three blocks of four consecutive encoder layers. The trailing dot keeps
    # 'layer.1.' from matching 'layer.10.' etc.
    layer_groups = [[f'layer.{idx}.' for idx in range(first, first + 4)]
                    for first in range(0, 12, 4)]
    group_all = [tag for tags in layer_groups for tag in tags]
    group_lrs = [learning_rate/2.0, learning_rate, learning_rate*2.0]

    backbone = list(model.roberta.named_parameters())

    def select(tags, decayed):
        """Return backbone params for one group.

        ``tags=None`` selects params outside every encoder layer; otherwise params
        whose name contains any of ``tags``. ``decayed`` chooses between params that
        are eligible for weight decay (True) and those exempt from it (False).
        """
        picked = []
        for name, param in backbone:
            exempt = any(nd in name for nd in no_decay)
            if exempt == decayed:
                continue
            if tags is None:
                in_scope = not any(tag in name for tag in group_all)
            else:
                in_scope = any(tag in name for tag in tags)
            if in_scope:
                picked.append(param)
        return picked

    optimizer_parameters = [
        {'params': select(None, decayed=True), 'weight_decay': 0.01, 'lr': learning_rate/2.0},
    ]
    for tags, lr in zip(layer_groups, group_lrs):
        optimizer_parameters.append(
            {'params': select(tags, decayed=True), 'weight_decay': cfg.attention_wdecay, 'lr': lr})

    optimizer_parameters.append({'params': select(None, decayed=False), 'weight_decay': 0.0})
    for tags, lr in zip(layer_groups, group_lrs):
        optimizer_parameters.append(
            {'params': select(tags, decayed=False), 'weight_decay': 0.0, 'lr': lr})

    # Task head. `momentum` is kept for compatibility with the previous output;
    # NOTE(review): AdamW is configured through `betas`, so this key looks unused.
    optimizer_parameters.append(
        {'params': [p for n, p in model.named_parameters() if "roberta" not in n],
         'lr': 1e-3, "momentum": 0.99})
    return optimizer_parameters

In [None]:
# Load the training table and keep only the columns used below: the identifier,
# the original excerpt, the regression target with its standard error, and the
# per-language text columns that `traindataset` samples from.
trainpath = "../input/nlpdatasets/train.csv"
df = pd.read_csv(trainpath).loc[:, [
    'id', 'excerpt', 'target', 'standard_error',
    'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar',
]]
df.head()

In [None]:
# Keep only rows whose standard_error exceeds 0.4. `train_fn` uses this column as
# the scale of a Normal distribution, so it has to stay strictly positive.
df = df.loc[df["standard_error"] > 0.4]

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Seed for the fold assignment only. This module-level RANDOM_STATE is a different
# variable from `cfg.RANDOM_STATE`, which `run()` uses to seed training.
#RANDOM_STATE = 42
RANDOM_STATE = 88

# 8 shuffled folds — the same count as the 8 `run(i)` jobs launched in the last cell.
kfold = KFold(n_splits=8, random_state=RANDOM_STATE, shuffle=True)
splits= kfold.split(df)
# Per-fold index arrays, looked up by fold number in `run()` and applied with `df.iloc`.
train_indexs = []
test_indexs = []
for i,(train_index, test_index) in enumerate(splits):
    # Show the train/validation sizes of each fold.
    print(train_index.shape,test_index.shape)
    train_indexs.append(train_index)
    test_indexs.append(test_index)

In [None]:
class traindataset(Dataset):
    """Tokenized text dataset with a regression target and its standard error.

    In training mode each item draws its text from a randomly chosen column of
    ``TEXT_COLUMNS`` (the ``excerpt`` column or one of the other language columns).
    In test mode the ``excerpt`` column is always used and no random number is drawn.

    Args:
        df: DataFrame providing the ``TEXT_COLUMNS`` columns plus ``target`` and
            ``standard_error``; rows are addressed positionally.
        max_len: length every sequence is padded/truncated to.
        test: if True, always tokenize ``excerpt``.
    """

    # Columns a training item may take its text from.
    TEXT_COLUMNS = ('excerpt', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar')

    def __init__(self, df, max_len, test=False):
        self.df = df
        self.max_len = max_len
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.transformer_model)
        self.test = test

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(features, target, standard_error)`` for row ``index``.

        ``features`` is a dict with 1-D ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` tensors of length ``max_len``; ``target`` and
        ``standard_error`` are 0-d tensors.
        """
        row = self.df.iloc[index]
        if self.test:
            # Evaluation always uses the excerpt column and must not consume the
            # global NumPy RNG, otherwise validation shifts the training-time draws.
            text = row.excerpt
        else:
            lang = np.random.choice(self.TEXT_COLUMNS)
            text = row[lang]
            if not isinstance(text, str):
                # An empty CSV cell is read as NaN; fall back to the excerpt
                # instead of handing a float to the tokenizer.
                text = row.excerpt

        encoded = self.tokenizer(text, padding='max_length', truncation=True,
                                 max_length=self.max_len, return_tensors="pt",
                                 return_token_type_ids=True)
        # The tokenizer already returns tensors with a leading batch axis of size 1;
        # index it away rather than re-wrapping in torch.tensor(), which only adds a
        # copy and a copy-construct warning.
        features = {key: encoded[key][0]
                    for key in ("input_ids", "attention_mask", "token_type_ids")}
        return features, torch.tensor(row.target), torch.tensor(row.standard_error)

In [None]:
def train_fn(loader,model,criterion,optimizer,device,scheduler,epoch):
    """Train ``model`` for one pass over ``loader`` and print the training RMSE.

    The model emits two logits per sample: column 0 is the predicted mean and
    column 1 parameterizes the predicted standard deviation. The loss is
    ``criterion(mean, target) + cfg.alpha * KL(N(mean, std) || N(target, standard_error))``.

    Column 1 is passed through softplus before it is used as the scale of the
    predicted Normal. A raw logit can be zero or negative, which makes the KL term
    NaN (or makes ``Normal`` reject the scale), and one NaN loss poisons every weight.
    NOTE(review): any code that reads ``logits[:, 1]`` as a standard deviation must
    apply the same softplus.

    Args:
        loader: iterable with ``len()`` yielding ``(inputs_dict, target, standard_error)``.
        model: module called as ``model(**inputs_dict)``; its result must expose ``.logits``.
        criterion: loss applied to ``(mean, target)``.
        optimizer: optimizer stepped through ``xm.optimizer_step``.
        device: device the batch tensors are moved to.
        scheduler: LR scheduler stepped after every batch with a fractional epoch,
            or ``None`` to train without one.
        epoch: zero-based index of the current epoch.

    Returns:
        None. Progress and the epoch RMSE are printed.
    """
    model.train()

    allpreds = []
    alltargets = []
    n_batches = len(loader)

    for b_idx, (data, target, standard_error) in enumerate(loader):
        data = {key: value.to(device) for key, value in data.items()}

        optimizer.zero_grad()
        target = target.to(device).float()
        standard_error = standard_error.to(device).float()
        output = model(**data)
        mean = output.logits[:, 0]
        # Strictly positive scale for the predicted distribution (see docstring).
        std = F.softplus(output.logits[:, 1]) + 1e-6
        mseloss = criterion(mean, target)
        p = torch.distributions.Normal(mean, std)
        q = torch.distributions.Normal(target, standard_error)
        klloss = torch.distributions.kl_divergence(p, q).mean()
        loss = mseloss + cfg.alpha*klloss
        loss.backward()
        xm.optimizer_step(optimizer, barrier=True)

        # reshape(-1) keeps a batch of one as a 1-element vector; squeeze(-1) would
        # turn it into a 0-d array that np.concatenate refuses.
        allpreds.append(mean.detach().reshape(-1).cpu().numpy())
        alltargets.append(target.detach().reshape(-1).cpu().numpy())
        if b_idx%100==0:
            # Read the LR from the optimizer so logging also works without a scheduler.
            print(b_idx, n_batches, loss.item(), optimizer.param_groups[0]['lr'])
        if scheduler is not None:
            # Fractional epoch: the schedule advances smoothly within an epoch.
            scheduler.step(epoch+b_idx/n_batches)

    if not allpreds:
        # Empty loader: nothing was trained, so there is no score to report.
        return

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    try:
        train_rme_loss = np.sqrt(mean_squared_error(alltargets,allpreds))
        print(f"rmse_score:{train_rme_loss}")
    except Exception as err:
        # Best effort: a failed metric (e.g. non-finite predictions) must not abort
        # training, but report why and show the last batch for diagnosis.
        print(f"rmse_score unavailable: {err!r}")
        print(mean,std)
def val_fn(loader,model,criterion,optimizer,device,scheduler):
    """Evaluate ``model`` on ``loader`` and return the RMSE of the predicted mean.

    Only logit column 0 (the predicted mean) is scored; the second logit and the
    per-sample standard error yielded by the loader are ignored.

    Args:
        loader: iterable with ``len()`` yielding ``(inputs_dict, target, standard_error)``.
        model: module called as ``model(**inputs_dict)``; its result must expose ``.logits``.
        criterion: unused; kept so the signature mirrors ``train_fn``.
        optimizer: unused; kept so the signature mirrors ``train_fn``.
        device: device the input tensors are moved to.
        scheduler: unused; kept so the signature mirrors ``train_fn``.

    Returns:
        The root-mean-squared error over all samples in ``loader``.
    """
    model.eval()

    allpreds = []
    alltargets = []
    n_batches = len(loader)

    with torch.no_grad():
        for b_idx, (data, target, _standard_error) in enumerate(loader):
            data = {key: value.to(device) for key, value in data.items()}

            output = model(**data)
            mean = output.logits[:, 0]

            # reshape(-1) keeps a batch of one as a 1-element vector; squeeze(-1)
            # would turn it into a 0-d array that np.concatenate refuses.
            allpreds.append(mean.detach().reshape(-1).cpu().numpy())
            alltargets.append(target.detach().reshape(-1).cpu().numpy())
            if b_idx%40==0:
                print(b_idx, n_batches)

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    rme_loss = np.sqrt(mean_squared_error(alltargets,allpreds))
    print(f"valrmse_score:{rme_loss}")
    return rme_loss

In [None]:
def run(fold):
    """Train and validate one cross-validation fold on its own XLA device.

    Seeds the RNGs with ``cfg.RANDOM_STATE - fold``, builds the fold's train and
    validation loaders from the global ``df`` / ``train_indexs`` / ``test_indexs``,
    fine-tunes a two-output sequence-classification model for ``cfg.EPOCHS`` epochs
    and validates after every epoch. A checkpoint is written whenever the validation
    RMSE is below 0.53.

    Args:
        fold: zero-based fold number; XLA device ordinal ``fold + 1`` is used.
    """
    set_seed(cfg.RANDOM_STATE-fold)

    # Positional row selection for this fold, re-indexed from zero.
    df_train = df.iloc[train_indexs[fold], :].reset_index(drop=True)
    df_test = df.iloc[test_indexs[fold], :].reset_index(drop=True)

    device = xm.xla_device(fold + 1)
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        cfg.transformer_model, num_labels=2
    ).to(device)

    if cfg.init_headparam:
        # Re-initialize both linear layers of the classification head:
        # Xavier-normal weights, zero biases.
        for head_layer in (model.classifier.dense, model.classifier.out_proj):
            nn.init.xavier_normal_(head_layer.weight)
            nn.init.constant_(head_layer.bias, 0)

    # Training data is shuffled; validation data keeps its order and is built in test mode.
    train_ds = traindataset(df=df_train, max_len=cfg.max_len)
    train_loader = torch.utils.data.DataLoader(
        train_ds,
        batch_size=cfg.TRAIN_BATCH_SIZE,
        num_workers=0,
        pin_memory=False,
        shuffle=True,
    )
    test_ds = traindataset(df=df_test, max_len=cfg.max_len, test=True)
    val_loader = torch.utils.data.DataLoader(
        test_ds,
        batch_size=cfg.TRAIN_BATCH_SIZE,
        num_workers=0,
        pin_memory=False,
        shuffle=False,
    )

    optimizer = AdamW(get_optimizer_params_base(model), lr=cfg.LR, betas=(0.9, 0.999))

    def linear_decay(epoch):
        # LR multiplier: 1.0 at epoch 0, falling linearly to 0.01 at cfg.EPOCHS.
        return (1.0*(cfg.EPOCHS-epoch) + 0.01*(epoch))/cfg.EPOCHS

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=linear_decay)
    criterion = nn.MSELoss()

    # Baseline validation score before any training step.
    rme = val_fn(val_loader, model, criterion, optimizer, device, scheduler)
    for epoch in range(cfg.EPOCHS):
        train_fn(train_loader, model, criterion, optimizer, device, scheduler, epoch)
        rme = val_fn(val_loader, model, criterion, optimizer, device, scheduler)
        # Checkpoint only when the score is below the threshold; the file name carries
        # the fold and the score.
        if rme < 0.53:
            xm.save(model.state_dict(), cfg.wpath + f"bertmodel_fold{fold}_{rme:.3g}.pt")
            print(f"save epoch_{epoch}_{rme:.3g}")

In [None]:
class cfg:
    """Notebook-wide settings, read as class attributes (``cfg.NAME``)."""

    # --- seeding ---
    # Not referenced by the code visible in this notebook.
    SEED = 2020
    # Base training seed; run() seeds each fold with RANDOM_STATE - fold.
    RANDOM_STATE = 1048

    # --- model ---
    # Checkpoint name passed to the tokenizer and model `from_pretrained` calls.
    transformer_model = "distilroberta-base"
    # When True, run() re-initializes the classifier head weights.
    init_headparam = True
    # Length every tokenized sequence is padded/truncated to.
    max_len = 256

    # --- optimisation ---
    EPOCHS = 10
    LR = 1e-4
    TRAIN_BATCH_SIZE = 16
    # Not referenced by the DataLoaders visible in this notebook (they pass num_workers=0).
    NUM_WORKERS = 2
    # Weight-decay value read by the get_optimizer_params_* builders.
    attention_wdecay = 0.001
    # Weight of the KL term added to the criterion loss in train_fn.
    alpha = 0.03

    # --- output ---
    # Directory prefix for saved checkpoints.
    wpath = "/kaggle/working/"

In [None]:
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats the deterministic setting above — it has to be off.
    torch.backends.cudnn.benchmark = False  # type: ignore

In [None]:
from joblib import Parallel, delayed

# Train all 8 folds concurrently, one thread per fold; `run(fold)` selects its own
# XLA device from the fold number. To try another base seed, assign
# `cfg.RANDOM_STATE` before launching.
Parallel(n_jobs=8, backend="threading")(
    delayed(run)(fold) for fold in range(8)
)