In [None]:
# Environment setup: download the PyTorch/XLA env-setup script from the
# pytorch/xla repository and run it with the version string chosen below.
# The trailing "#@param" annotation lists the selectable version values.
VERSION = "20200516"  #@param ["1.5" , "20200516", "nightly"]
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

In [None]:
import math,os,cv2,random
from tqdm import tqdm
import pandas as pd
import numpy as np

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler

#transformers
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
import gc
gc.enable()

# exclude warnings
import warnings
warnings.simplefilter('ignore')

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.utils as xu

# Set the XLA_USE_BF16 environment variable to "1" for this process.
# NOTE(review): presumably this tells torch_xla to run float tensors as
# bfloat16 on the TPU -- confirm against the torch_xla documentation.
os.environ['XLA_USE_BF16']="1"

In [None]:
def get_optimizer_params_base(model):
    """Build optimizer parameter groups with layer-wise learning rate and weight decay.

    The backbone (``model.roberta``) is split into the non-encoder-layer
    parameters (everything whose name contains none of ``layer.0.`` ..
    ``layer.11.``) plus six groups of two encoder layers each. Deeper groups
    get a larger learning rate and a smaller weight decay. Every backbone
    group exists twice: once for parameters that are decayed and once for
    parameters that are not (biases and LayerNorm parameters). All parameters
    outside the backbone form one final group.

    Args:
        model: module exposing ``named_parameters()`` and a ``roberta``
            sub-module that also exposes ``named_parameters()``.

    Returns:
        A list of 15 dicts in this order: 7 decayed backbone groups
        (non-layer parameters first, then layer pairs 0-1 .. 10-11), the 7
        matching non-decayed groups, and the head group.

    Reads ``cfg.LR`` and ``cfg.attention_wdecay``.
    """
    base_lr = cfg.LR

    # Name fragments of parameters that must not be weight-decayed.
    # 'gamma'/'beta' are kept from the original list; 'LayerNorm.weight' is
    # added because that is how Hugging Face models name the LayerNorm scale,
    # which the original fragments never matched.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']

    layer_groups = [
        ['layer.0.', 'layer.1.'],
        ['layer.2.', 'layer.3.'],
        ['layer.4.', 'layer.5.'],
        ['layer.6.', 'layer.7.'],
        ['layer.8.', 'layer.9.'],
        ['layer.10.', 'layer.11.'],
    ]
    all_layers = [tag for group in layer_groups for tag in group]
    # Per layer group: the learning rate is base_lr / divisor and the weight
    # decay is cfg.attention_wdecay * multiplier.
    lr_divisors = [4.5, 4.0, 3.5, 3.0, 2.5, 2.0]
    decay_multipliers = [5.0, 4.5, 4.0, 3.5, 3.0, 2.5]

    def _matches(name, fragments):
        """Return True when any fragment occurs in the parameter name."""
        return any(fragment in name for fragment in fragments)

    backbone = list(model.roberta.named_parameters())

    # The key must be 'weight_decay': optimizers such as AdamW read that key
    # from each group and silently ignore the former 'weight_decay_rate'.
    decay_groups = [
        {'params': [p for n, p in backbone
                    if not _matches(n, no_decay) and not _matches(n, all_layers)],
         'weight_decay': 0.01, 'lr': base_lr / 5.0},
    ]
    # NOTE(review): this group has no 'lr', so it takes the optimizer's default
    # learning rate instead of base_lr / 5.0 like its decayed counterpart.
    # Kept exactly as in the original -- confirm this is intended.
    no_decay_groups = [
        {'params': [p for n, p in backbone
                    if _matches(n, no_decay) and not _matches(n, all_layers)],
         'weight_decay': 0.0},
    ]

    for tags, divisor, multiplier in zip(layer_groups, lr_divisors, decay_multipliers):
        decay_groups.append(
            {'params': [p for n, p in backbone
                        if not _matches(n, no_decay) and _matches(n, tags)],
             'weight_decay': cfg.attention_wdecay * multiplier, 'lr': base_lr / divisor})
        no_decay_groups.append(
            {'params': [p for n, p in backbone
                        if _matches(n, no_decay) and _matches(n, tags)],
             'weight_decay': 0.0, 'lr': base_lr / divisor})

    # Everything outside the backbone (pooling head, regression layer).
    # 'momentum' is only meaningful for momentum-based optimizers; Adam-style
    # optimizers carry it in the group without using it.
    head_group = {'params': [p for n, p in model.named_parameters() if "roberta" not in n],
                  'lr': 1e-4, "momentum": 0.99}

    return decay_groups + no_decay_groups + [head_group]

In [None]:
# Load the training table from the "step1-exclude-anomaly" input directory.
# Later cells read its 'excerpt' and 'target' columns. The bare `df` on the
# last line renders the frame as the cell output.
df = pd.read_csv("../input/step1-exclude-anomaly/train.csv")
df

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

# Number of bins used for stratification: floor(1 + log2(number of rows)).
num_bins = int(np.floor(1 + np.log2(len(df))))
# Cut the continuous target into num_bins intervals; labels=False stores the
# integer bin index. This adds a 'bins' column to df in place.
df.loc[:,'bins'] = pd.cut(df['target'],bins=num_bins,labels=False)
target = df['target'].to_numpy()
# Bin indices as a NumPy array; make_foldindex stratifies its splits on these.
bins = df.bins.to_numpy()

def make_foldindex(RANDOM_STATE):
    """Create the train/test index arrays of a shuffled 5-fold stratified split.

    The split is computed over the module-level ``df.excerpt`` and stratified
    on the module-level ``bins`` array. The shapes of each fold's index arrays
    are printed as they are produced.

    Args:
        RANDOM_STATE: seed passed to ``StratifiedKFold`` as ``random_state``.

    Returns:
        ``(train_indexs, test_indexs)``: two lists holding, per fold, the
        training and the test index arrays.
    """
    splitter = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
    train_indexs, test_indexs = [], []
    for train_part, test_part in splitter.split(df.excerpt, bins):
        print(train_part.shape, test_part.shape)
        train_indexs.append(train_part)
        test_indexs.append(test_part)
    return train_indexs, test_indexs

In [None]:
class traindataset(Dataset):
    """Dataset yielding tokenized excerpts together with their target value.

    Args:
        df: DataFrame with an ``excerpt`` (text) and a ``target`` column.
        max_len: every excerpt is padded/truncated to this many tokens.
        test: stored on the instance as ``self.test``; it does not change what
            ``__getitem__`` returns.

    The tokenizer named by ``cfg.transformer_model`` is loaded in ``__init__``.
    """

    def __init__(self, df, max_len, test=False):
        self.df = df
        self.max_len = max_len
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.transformer_model)
        self.test = test

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(features, target)`` for the row at position ``index``.

        ``features`` is a dict with the 1-D tensors ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` (each of length ``max_len``);
        ``target`` is a scalar tensor built from the row's ``target`` value.
        """
        row = self.df.iloc[index]
        encoded = self.tokenizer(row.excerpt, padding='max_length', truncation=True,
                                 max_length=self.max_len, return_tensors="pt",
                                 return_token_type_ids=True)
        # The tokenizer returns tensors of shape (1, max_len); take row 0.
        # clone().detach() is the recommended way to copy an existing tensor:
        # it yields the same independent copy torch.tensor(tensor) would, but
        # without the copy-construct UserWarning.
        features = {
            key: encoded[key][0].clone().detach()
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
        return features, torch.tensor(row.target)

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling over the sequence dimension.

    Each position is scored by a two-layer network (``W`` -> tanh -> ``V``);
    the scores are soft-maxed over dim 1 and used to take a weighted sum of
    the input features along that dimension.

    Args:
        in_features: size of the last dimension of the input features.
        hidden_dim: width of the hidden scoring layer.
        num_targets: accepted for interface compatibility; not used.

    Attributes:
        out_features: size of the vectors returned by ``forward``. The pooled
            vector is a weighted sum of the inputs, so this is ``in_features``.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # forward() sums weighted *input* features, so the output width is
        # in_features (this attribute previously reported hidden_dim).
        self.out_features = in_features
        self._init_params()

    def _init_params(self):
        """Xavier-normal weights and zero biases for both scoring layers."""
        nn.init.xavier_normal_(self.W.weight)
        nn.init.constant_(self.W.bias, 0)
        nn.init.xavier_normal_(self.V.weight)
        nn.init.constant_(self.V.bias, 0)

    def forward(self, features):
        """Pool ``features`` over dim 1 and return the weighted sum.

        For an input of shape (batch, seq, in_features) the result has shape
        (batch, in_features).
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)
        # Normalise the scores across dim 1 so the weights sum to 1.
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector
        

In [None]:
class Model(nn.Module):
    """Pretrained transformer + attention pooling + multi-sample-dropout regressor.

    Args:
        leng: feature width passed to the attention head and used as the input
            size of the final linear layer.

    The backbone named by ``cfg.transformer_model`` is loaded in ``__init__``.
    """

    def __init__(self,leng):
        super().__init__()
        self.roberta = transformers.AutoModel.from_pretrained(cfg.transformer_model)
        self.head = AttentionHead(leng,1024,1)
        self.fc = nn.Linear(leng, 1)
        self.dropout = nn.Dropout(p=0.1)
        # Five independent dropout modules; forward() averages over them.
        self.dropouts = nn.ModuleList([nn.Dropout(p=0.1) for _ in range(5)])
        self._init_params()

    def _init_params(self):
        """Xavier-normal weight and zero bias for the regression layer."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)

    def forward(self, **x):
        """Return the regression output averaged over all dropout samples.

        The keyword arguments are forwarded unchanged to the backbone, whose
        first output is pooled by the attention head.
        """
        hidden = self.roberta(**x)[0]
        pooled = self.head(hidden)
        # Same linear layer applied to each dropout sample, summed in order.
        total = sum(self.fc(drop(pooled)) for drop in self.dropouts)
        return total / len(self.dropouts)

class LitModel(nn.Module):
    """Transformer backbone whose last hidden layer is attention-pooled and regressed.

    The backbone named by ``cfg.transformer_model`` is loaded with hidden
    states exposed, hidden dropout set to 0.0 and ``layer_norm_eps`` set to
    1e-7. The attention and regression layers are sized for 768-wide hidden
    states.
    """

    def __init__(self):
        super().__init__()

        backbone_config = AutoConfig.from_pretrained(cfg.transformer_model)
        overrides = {"output_hidden_states":True,
                     "hidden_dropout_prob": 0.0,
                     "layer_norm_eps": 1e-7}
        backbone_config.update(overrides)

        self.roberta = AutoModel.from_pretrained(cfg.transformer_model, config=backbone_config)

        # Scores every position, then soft-maxes the scores across dim 1.
        scorer = [nn.Linear(768, 512), nn.Tanh(), nn.Linear(512, 1), nn.Softmax(dim=1)]
        self.attention = nn.Sequential(*scorer)

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self,**x):
        """Return the regression output for a batch.

        The keyword arguments are forwarded unchanged to the backbone. Its
        last entry of ``hidden_states`` is reduced to one context vector per
        sample (weighted sum over dim 1, weights from ``self.attention``) and
        passed to the regressor.
        """
        # hidden_states holds one tensor per layer; the final entry is the
        # output of the last layer.
        final_layer = self.roberta(**x).hidden_states[-1]

        # One weight per position (trailing dim of size 1), broadcast across
        # the feature dimension in the product below.
        token_weights = self.attention(final_layer)

        # Weighted sum over the sequence dimension -> one vector per sample.
        pooled = (token_weights * final_layer).sum(dim=1)

        return self.regressor(pooled)

In [None]:
def train_fn(train_loader,test_loader,model,criterion,optimizer,device,scheduler,epoch,fold):
    """Train ``model`` for one epoch with periodic in-epoch validation.

    Args:
        train_loader: iterable of ``(features_dict, target)`` batches; must
            support ``len()``.
        test_loader: validation batches, handed to ``val_fn``.
        model: called as ``model(**features_dict)``.
        criterion: loss whose square root is back-propagated (RMSE when this
            is an MSE loss).
        optimizer: stepped through ``xm.optimizer_step``.
        device: device the batches are moved to.
        scheduler: learning-rate scheduler, or ``None``.
            ``ReduceLROnPlateau`` is stepped with the validation RMSE after
            each validation; any other scheduler is stepped after every batch
            with the fractional epoch ``epoch + (b_idx + 1) / len(train_loader)``.
        epoch: zero-based epoch index.
        fold: forwarded to ``val_fn`` (used there for the checkpoint name).

    Validation starts once ``epoch + b_idx / len(train_loader)`` exceeds 1.1
    and runs every ``interval`` batches. ``interval`` starts at 10, is halved
    (minimum 1) when the validation RMSE is below 0.49 and doubled when it is
    above 0.70. The training RMSE over the epoch is printed at the end.
    """
    model.train()

    # ReduceLROnPlateau.step() expects a metric; the other schedulers used
    # here expect an epoch number, so the two must be driven differently.
    plateau_scheduler = isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
    num_batches = len(train_loader)

    allpreds = []
    alltargets = []
    interval = 10
    for b_idx, (data, target) in enumerate(train_loader):
        data = {key: value.to(device) for key, value in data.items()}

        optimizer.zero_grad()
        target = target.to(device).float()
        output = model(**data).squeeze(-1)
        loss = torch.sqrt(criterion(output, target))
        loss.backward()
        xm.optimizer_step(optimizer, barrier=True)

        # reshape(-1) keeps every chunk 1-D, so a final batch holding a single
        # sample cannot turn into a 0-d array that np.concatenate rejects.
        allpreds.append(output.detach().cpu().numpy().reshape(-1))
        alltargets.append(target.detach().cpu().numpy().reshape(-1))

        if b_idx % interval == 0 and epoch + b_idx / num_batches > 1.1:
            rme = val_fn(test_loader,model,criterion,optimizer,device,scheduler,fold)
            # val_fn switches the model to eval mode; switch back so dropout
            # stays active for the rest of the epoch.
            model.train()
            if plateau_scheduler:
                scheduler.step(rme)
            if rme < 0.49:
                interval = max(interval // 2, 1)
            elif rme > 0.70:
                interval = interval * 2

        if scheduler is not None and not plateau_scheduler:
            # Epoch-based schedulers get the fractional epoch just completed.
            scheduler.step(epoch + (b_idx + 1) / num_batches)

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    train_rme_loss = np.sqrt(mean_squared_error(alltargets,allpreds))
    print(f"rmse_score:{train_rme_loss}")
        
def val_fn(loader,model,criterion,optimizer,device,scheduler,fold):
    """Evaluate ``model`` on ``loader`` and return the RMSE.

    Args:
        loader: iterable of ``(features_dict, target)`` batches.
        model: called as ``model(**features_dict)``; it is switched to eval
            mode and left in eval mode when this function returns.
        criterion, optimizer, scheduler: accepted for signature symmetry with
            ``train_fn``; not used.
        device: device the feature tensors are moved to.
        fold: fold identifier embedded in the checkpoint file name.

    Returns:
        The root-mean-squared error over all samples in ``loader``.

    Side effects:
        Prints the score when it is below 0.52. When it is below 0.475 the
        model's ``state_dict`` is saved with ``xm.save`` under ``cfg.wpath``.
    """
    model.eval()

    allpreds = []
    alltargets = []

    with torch.no_grad():
        for data, target in loader:
            data = {key: value.to(device) for key, value in data.items()}
            output = model(**data).squeeze(-1)
            # reshape(-1) keeps every chunk 1-D; squeezing the target of a
            # one-sample batch would give a 0-d array that np.concatenate
            # refuses to join.
            allpreds.append(output.detach().cpu().numpy().reshape(-1))
            alltargets.append(target.detach().cpu().numpy().reshape(-1))

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    rme = np.sqrt(mean_squared_error(alltargets,allpreds))
    if rme < 0.52:
        print(f"validation rmse_score:{rme:.5g}")
    if rme < 0.475:
        xm.save(model.state_dict(),cfg.wpath + f"bestmodel_{rme:.3g}_fold{fold}_mlen{cfg.max_len}.pt")
        print(f"save epoch_{rme:.3g}")
    return rme

In [None]:
def run(fold):
    """Train and validate one model for the job index ``fold``.

    Steps: seed the RNGs, select the data split ``fold % 5`` from the
    module-level ``train_indexs`` / ``test_indexs``, build the model on an XLA
    device, create the loaders, optimizer and LR schedule, run one validation
    pass and then ``cfg.EPOCHS`` training epochs.

    Args:
        fold: job index. The split is chosen with ``fold % 5``, so indices of
            5 and above reuse an earlier split with a different seed.
    """
    # Seed differs per job: cfg.RANDOM_STATE minus the job index.
    set_seed(cfg.RANDOM_STATE-fold)
    train_index, test_index = train_indexs[fold%5],test_indexs[fold%5]
    df_train, df_test = df.iloc[train_index,:],df.iloc[test_index,:]
    # Reset to a 0..n-1 index after the positional selection.
    df_train, df_test = df_train.reset_index(drop=True), df_test.reset_index(drop=True)
    # Request the XLA device with ordinal fold + 1.
    device = xm.xla_device(fold + 1)
    print(f"---start_fold{fold%5}---")
    model = Model(leng=768).to(device)
    #model = LitModel().to(device)
    #model = transformers.AutoModelForSequenceClassification.from_pretrained(cfg.transformer_model,num_labels=1).to(device)
    #model.load_state_dict(torch.load('/content/drive/MyDrive/kaggle/furugori/CommonLit Readability Prize/inference_weight/bertmodel_9.pt'))
    train_ds = traindataset(df=df_train, max_len=cfg.max_len)
    train_loader = torch.utils.data.DataLoader(train_ds, 
                                               batch_size=cfg.TRAIN_BATCH_SIZE, 
                                               num_workers=0, 
                                               pin_memory=False, 
                                               shuffle=True)
    test_ds = traindataset(df=df_test, max_len=cfg.max_len, test=True)
    # The validation loader keeps row order (shuffle=False).
    test_loader = torch.utils.data.DataLoader(test_ds, 
                                               batch_size=cfg.TRAIN_BATCH_SIZE, 
                                               num_workers=0, 
                                               pin_memory=False, 
                                               shuffle=False)
    optimizer_parameters = get_optimizer_params_base(model)
    optimizer = AdamW(optimizer_parameters,lr= cfg.LR,betas=(0.9, 0.999)) 
    # LR multiplier: 1.0 at epoch 0, falling linearly to 0.05 at epoch == cfg.EPOCHS.
    def func(epoch):
      return (1.0*(cfg.EPOCHS-epoch) + 0.05*(epoch))/cfg.EPOCHS #*(2.0 + np.sin(2*np.pi*epoch-int(epoch)))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = func)
    # The bare string below is a disabled ReduceLROnPlateau configuration; as a
    # string expression it has no effect at run time.
    """
    scheduler = ReduceLROnPlateau(optimizer, 
                                 mode='min', 
                                 factor=0.3, 
                                 patience=1, 
                                 threshold=0.0001,
                                 verbose=False, 
                                 min_lr=1e-6,
                                 eps=1e-08)
    """
    criterion = nn.MSELoss()
    # Validation pass before any training.
    # NOTE(review): this call passes `fold` while train_fn below receives
    # `fold%5`, so checkpoint names written from the two paths differ for
    # fold >= 5 -- confirm which is intended.
    val_fn(test_loader,model,criterion,optimizer,device,scheduler,fold)
    for epoch in range(cfg.EPOCHS):
        train_fn(train_loader,test_loader,model,criterion,optimizer,device,scheduler,epoch,fold%5)

In [None]:
class cfg:
    """Notebook-wide configuration, read as class attributes (never instantiated)."""
    # NOTE(review): NUM_WORKERS and SEED are not referenced in the visible
    # cells (the DataLoaders in run() pass num_workers=0) -- confirm whether
    # they are still needed.
    NUM_WORKERS = 2
    TRAIN_BATCH_SIZE = 8
    # Directory prefix under which val_fn saves checkpoints.
    wpath = "/kaggle/working/"
    SEED = 2020
    # Important learning parameters (fine-tuning targets).

    # Five epochs are configured, but the best models finished training near epoch 3.
    EPOCHS = 5
    LR = 1e-4
    # Token length every excerpt is padded/truncated to.
    max_len = 248

    # Weight-decay parameter (fine-tuning target); base value that
    # get_optimizer_params_base scales per layer group.
    attention_wdecay = 0.01   
    transformer_model = "roberta-base"
    # Base seed; the driver cell overwrites it with a random value before training.
    RANDOM_STATE = 1048

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator used in the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and configures cuDNN for reproducible runs.

    Args:
        seed: the seed value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark mode lets cuDNN auto-tune and pick algorithms per run, which
    # works against the deterministic setting above, so it must be off.
    torch.backends.cudnn.benchmark = False  # type: ignore


In [None]:
from joblib import Parallel, delayed
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Driver: a single round (range(1)) of training with a freshly drawn seed.
for seed in range(1):
  # The loop variable is replaced immediately by a random integer in [0, 2500].
  seed = random.randint(0, 2500)
  print(seed)
  # run() reads cfg.RANDOM_STATE to derive its per-job seed.
  cfg.RANDOM_STATE = seed
  train_indexs,test_indexs = make_foldindex(seed)
  # Execute run(0) .. run(7) on a pool of 8 threads.
  Parallel(n_jobs=8, backend="threading")(delayed(run)(i) for i in range(8))