In [None]:
import math,os,cv2,random
from tqdm import tqdm
import pandas as pd
import numpy as np

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler

#transformers
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

# exclude warnings
import warnings
warnings.simplefilter('ignore')

import sys
sys.path.append("../input/cosine-annealing-warm-with-warmup-for-pytorch/")
from cosine_annearing_with_warmup import *

# Parameter tuning targets

# RobertaForSequenceClassification

# roberta-large
## (roberta)

### (embeddings)

RobertaEmbeddings(  
  (word_embeddings): Embedding(50265, 1024, padding_idx=1)  
  (position_embeddings): Embedding(514, 1024, padding_idx=1)  
  (token_type_embeddings): Embedding(1, 1024)  
  (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)  
  (dropout): Dropout(p=0.1, inplace=False)  
)  

### (encoder)  

#### layer[0..23] (24 identical Transformer encoder layers)  

RobertaLayer(  
  (attention): RobertaAttention(  
    (self): RobertaSelfAttention(  
      (query): Linear(in_features=1024, out_features=1024, bias=True)  
      (key): Linear(in_features=1024, out_features=1024, bias=True)  
      (value): Linear(in_features=1024, out_features=1024, bias=True)  
      (dropout): Dropout(p=0.1, inplace=False)  
    )  
    (output): RobertaSelfOutput(  
      (dense): Linear(in_features=1024, out_features=1024, bias=True)  
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)  
      (dropout): Dropout(p=0.1, inplace=False)  
    )  
  )  
  (intermediate): RobertaIntermediate(  
    (dense): Linear(in_features=1024, out_features=4096, bias=True)  
  )  
  (output): RobertaOutput(  
    (dense): Linear(in_features=4096, out_features=1024, bias=True)  
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)  
    (dropout): Dropout(p=0.1, inplace=False)  
  )  
)  


## (classifier)  
RobertaClassificationHead(  
  (dense): Linear(in_features=1024, out_features=1024, bias=True)  
  (dropout): Dropout(p=0.1, inplace=False)  
  (out_proj): Linear(in_features=1024, out_features=1, bias=True)  
)  

In [None]:
def get_optimizer_params_large(model, learning_rate=1e-5, attention_wdecay=None):
    """Build optimizer parameter groups with layer-wise LR / weight decay (24 encoder layers).

    The encoder layers are split into six blocks of four layers
    (``layer.0.``-``layer.3.``, ..., ``layer.20.``-``layer.23.``).  Lower blocks
    get a smaller learning rate and a larger weight decay than upper blocks.
    Parameters whose name contains one of the ``no_decay`` markers are placed
    in separate groups with ``weight_decay`` 0.

    Args:
        model: object exposing ``model.roberta.named_parameters()`` (the
            backbone) and ``model.named_parameters()`` (used to collect every
            parameter whose name does not contain ``"roberta"``).
        learning_rate: LR of the top encoder block; other blocks are derived
            from it.  Defaults to the previously hard-coded ``1e-5``.
        attention_wdecay: base weight decay for the encoder blocks.  When
            ``None`` (default) ``cfg.attention_wdecay`` is read at call time.

    Returns:
        A list of 15 parameter-group dicts, in this order: non-layer decay
        group, six decay groups (block 1..6), non-layer no-decay group, six
        no-decay groups (block 1..6), and the non-backbone ("head") group.
    """
    if attention_wdecay is None:
        # Looked up lazily so the config cell may be defined after this one.
        attention_wdecay = cfg.attention_wdecay

    # Name fragments of parameters that must not be decayed.  Hugging Face
    # models name layer-norm parameters ``LayerNorm.weight``/``LayerNorm.bias``;
    # 'gamma'/'beta' are kept for checkpoints that use the legacy names.
    no_decay = ('bias', 'gamma', 'beta', 'LayerNorm.weight')

    # One entry per block of four encoder layers: (name markers, lr, weight decay).
    layer_blocks = [
        ([f'layer.{i}.' for i in range(0, 4)], learning_rate / 6.0, attention_wdecay * 10.0),
        ([f'layer.{i}.' for i in range(4, 8)], learning_rate / 5.0, attention_wdecay * 5.0),
        ([f'layer.{i}.' for i in range(8, 12)], learning_rate / 4.0, attention_wdecay * 2.5),
        ([f'layer.{i}.' for i in range(12, 16)], learning_rate / 3.0, attention_wdecay),
        ([f'layer.{i}.' for i in range(16, 20)], learning_rate / 2.0, attention_wdecay / 2.5),
        ([f'layer.{i}.' for i in range(20, 24)], learning_rate, attention_wdecay / 5.0),
    ]

    def _block_of(name):
        """Return the index of the layer block ``name`` belongs to, or None."""
        for idx, (markers, _lr, _wd) in enumerate(layer_blocks):
            if any(marker in name for marker in markers):
                return idx
        return None

    # Single pass over the backbone: every parameter lands in exactly one bucket.
    decay_other, exempt_other = [], []
    decay_blocks = [[] for _ in layer_blocks]
    exempt_blocks = [[] for _ in layer_blocks]
    for name, param in model.roberta.named_parameters():
        exempt = any(marker in name for marker in no_decay)
        idx = _block_of(name)
        if idx is None:
            (exempt_other if exempt else decay_other).append(param)
        else:
            (exempt_blocks if exempt else decay_blocks)[idx].append(param)

    # NOTE: the per-group option read by AdamW-style optimizers is
    # ``weight_decay``.  The former ``weight_decay_rate`` key is not an option
    # they consume, so the configured decay never took effect.
    optimizer_parameters = [
        # Backbone parameters outside the encoder layers (e.g. embeddings).
        {'params': decay_other, 'weight_decay': 0.05, 'lr': learning_rate / 10.0},
    ]
    optimizer_parameters += [
        {'params': params, 'weight_decay': block_wd, 'lr': block_lr}
        for params, (_markers, block_lr, block_wd) in zip(decay_blocks, layer_blocks)
    ]
    # No explicit 'lr' here: this group uses the optimizer's default LR.
    optimizer_parameters.append({'params': exempt_other, 'weight_decay': 0.0})
    optimizer_parameters += [
        {'params': params, 'weight_decay': 0.0, 'lr': block_lr}
        for params, (_markers, block_lr, _wd) in zip(exempt_blocks, layer_blocks)
    ]
    # Everything outside the backbone.  "momentum" is not an AdamW option; it is
    # kept only so optimizers that do define it receive the same value as before.
    optimizer_parameters.append(
        {'params': [p for n, p in model.named_parameters() if "roberta" not in n],
         'lr': 1e-3, "momentum": 0.99}
    )
    return optimizer_parameters

def get_optimizer_params_base(model, learning_rate=5e-6, attention_wdecay=None):
    """Build optimizer parameter groups with layer-wise LR for encoder layers 0-11.

    Encoder layers ``layer.0.``-``layer.11.`` are split into three blocks of
    four layers whose learning rates are ``learning_rate / 2``,
    ``learning_rate`` and ``learning_rate * 2``; all three share the same
    weight decay.  Every other backbone parameter (embeddings and, for deeper
    models, any layer with index >= 12) falls into the "non-layer" groups.
    Parameters whose name contains one of the ``no_decay`` markers are placed
    in separate groups with ``weight_decay`` 0.

    Args:
        model: object exposing ``model.roberta.named_parameters()`` (the
            backbone) and ``model.named_parameters()`` (used to collect every
            parameter whose name does not contain ``"roberta"``).
        learning_rate: LR of the middle encoder block.  Defaults to the
            previously hard-coded ``5e-6``.
        attention_wdecay: weight decay for the encoder blocks.  When ``None``
            (default) ``cfg.attention_wdecay`` is read at call time.

    Returns:
        A list of 9 parameter-group dicts, in this order: non-layer decay
        group, three decay groups (block 1..3), non-layer no-decay group,
        three no-decay groups (block 1..3), and the non-backbone ("head") group.
    """
    if attention_wdecay is None:
        # Looked up lazily so the config cell may be defined after this one.
        attention_wdecay = cfg.attention_wdecay

    # Name fragments of parameters that must not be decayed.  Hugging Face
    # models name layer-norm parameters ``LayerNorm.weight``/``LayerNorm.bias``;
    # 'gamma'/'beta' are kept for checkpoints that use the legacy names.
    no_decay = ('bias', 'gamma', 'beta', 'LayerNorm.weight')

    # One entry per block of four encoder layers: (name markers, lr).
    layer_blocks = [
        ([f'layer.{i}.' for i in range(0, 4)], learning_rate / 2.0),
        ([f'layer.{i}.' for i in range(4, 8)], learning_rate),
        ([f'layer.{i}.' for i in range(8, 12)], learning_rate * 2.0),
    ]

    def _block_of(name):
        """Return the index of the layer block ``name`` belongs to, or None."""
        for idx, (markers, _lr) in enumerate(layer_blocks):
            if any(marker in name for marker in markers):
                return idx
        return None

    # Single pass over the backbone: every parameter lands in exactly one bucket.
    decay_other, exempt_other = [], []
    decay_blocks = [[] for _ in layer_blocks]
    exempt_blocks = [[] for _ in layer_blocks]
    for name, param in model.roberta.named_parameters():
        exempt = any(marker in name for marker in no_decay)
        idx = _block_of(name)
        if idx is None:
            (exempt_other if exempt else decay_other).append(param)
        else:
            (exempt_blocks if exempt else decay_blocks)[idx].append(param)

    # NOTE: the per-group option read by AdamW-style optimizers is
    # ``weight_decay``.  The former ``weight_decay_rate`` key is not an option
    # they consume, so the configured decay never took effect.
    optimizer_parameters = [
        # Backbone parameters outside encoder layers 0-11.
        {'params': decay_other, 'weight_decay': 0.01, 'lr': learning_rate / 4.0},
    ]
    optimizer_parameters += [
        {'params': params, 'weight_decay': attention_wdecay, 'lr': block_lr}
        for params, (_markers, block_lr) in zip(decay_blocks, layer_blocks)
    ]
    # No explicit 'lr' here: this group uses the optimizer's default LR.
    optimizer_parameters.append({'params': exempt_other, 'weight_decay': 0.0})
    optimizer_parameters += [
        {'params': params, 'weight_decay': 0.0, 'lr': block_lr}
        for params, (_markers, block_lr) in zip(exempt_blocks, layer_blocks)
    ]
    # Everything outside the backbone.  "momentum" is not an AdamW option; it is
    # kept only so optimizers that do define it receive the same value as before.
    optimizer_parameters.append(
        {'params': [p for n, p in model.named_parameters() if "roberta" not in n],
         'lr': 1e-3, "momentum": 0.99}
    )
    return optimizer_parameters

# Read the CSV file

In [None]:
# An earlier path, "../input/commonlitreadabilityprize/train.csv", used to be
# assigned here first and was immediately overridden by the path below.
trainpath = "../input/nlpdatasets/train.csv"

# Keep only the id, the text columns, the regression target and its standard error.
df = pd.read_csv(trainpath)[
    ['id', 'excerpt', 'target', 'standard_error',
     'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar']
]
df.head()

# Make cross-validation folds
Cross-validation score for each fold:  
fold 0: 0.49  
fold 1: 0.52  
fold 2: 0.50  
fold 3: 0.51  
fold 4: 0.50  
LB score: 0.494

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

RANDOM_STATE = 42
# Index of the cross-validation fold trained by this notebook run.
fold = 0

kfold = KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
# Not used below; kept available for stratified experiments.
skfold = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)

# Materialise the splits so the requested fold can be indexed directly.  The
# previous loop silently fell through to the last fold when `fold` was out of range.
splits = list(kfold.split(df))
if not 0 <= fold < len(splits):
    raise ValueError(f"fold must be in [0, {len(splits) - 1}], got {fold}")

train_index, test_index = splits[fold]
print(train_index.shape, test_index.shape)
df_train = df.iloc[train_index, :].reset_index(drop=True)
df_test = df.iloc[test_index, :].reset_index(drop=True)

# define the device

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# define the dataset

In [None]:
class traindataset(Dataset):
    """Tokenised text/target dataset with random text-column sampling for training.

    Args:
        df: DataFrame with an ``excerpt`` column, a ``target`` column and the
            additional text columns listed in ``TEXT_COLUMNS``.
        max_len: length every tokenised sequence is padded / truncated to.
        test: when True, always use the ``excerpt`` column (no sampling).
    """

    # Text columns sampled from during training.  The two-letter names look like
    # language codes of translated excerpts -- presumably; confirm against the data.
    TEXT_COLUMNS = ['excerpt', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar']

    def __init__(self, df, max_len, test=False):
        self.df = df
        self.max_len = max_len
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.transformer_model)
        self.test = test

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(features, target)`` for row ``index``.

        ``features`` is a dict with 1-D tensors ``input_ids``,
        ``attention_mask`` and ``token_type_ids``; ``target`` is a scalar
        tensor built from the row's ``target`` value.
        """
        row = self.df.iloc[index]
        if self.test:
            # Evaluation always scores the original excerpt; no random draw is
            # made, so the RNG state is left untouched.
            text = row.excerpt
        else:
            # Python's `random` is used instead of NumPy's global RNG: PyTorch
            # seeds `random` separately in every DataLoader worker, whereas the
            # NumPy state can be duplicated across workers on older versions.
            text = row[random.choice(self.TEXT_COLUMNS)]

        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
            return_token_type_ids=True,
        )
        # The tokenizer returns tensors with a leading batch axis of 1; index it
        # away directly instead of re-wrapping the tensors with torch.tensor().
        features = {
            key: encoded[key][0]
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
        return features, torch.tensor(row.target)

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse axis 1 of the input into one weighted sum.

    A score is computed for every position along axis 1 with
    ``V(tanh(W(features)))``, the scores are soft-maxed over that axis and the
    input is summed with those weights.  ``num_targets`` is accepted but unused.

    NOTE(review): no attention mask is applied, so padded positions also
    receive weight -- confirm this is intended.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        # Bookkeeping attributes describing the layer sizes.
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim

        # Scoring network; W is created before V, as in the original layout.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over axis 1.

        Assumes ``features`` is (batch, seq_len, in_features) -- TODO confirm.
        """
        scores = self.V(torch.tanh(self.W(features)))
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)

In [None]:
class Model(nn.Module):
    """RoBERTa backbone followed by attention pooling and a linear regressor.

    Only the ``roberta`` sub-module of the loaded sequence-classification model
    is used in ``forward``; its built-in ``classifier`` head is bypassed in
    favour of ``AttentionHead`` + ``fc``.
    """

    def __init__(self):
        super(Model, self).__init__()
        self.bert = transformers.AutoModelForSequenceClassification.from_pretrained(cfg.transformer_model,num_labels=1).to(device)
        # Read the width from the backbone config instead of hard-coding 1024,
        # so checkpoints with a different hidden size work as well.
        hidden_size = self.bert.config.hidden_size
        self.head = AttentionHead(hidden_size, hidden_size, 1)
        self.fc = nn.Linear(hidden_size, 1)
        self._init_params()

    def _init_params(self):
        """Xavier-initialise the final linear layer and zero its bias."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)

    def forward(self, **x):
        """Map tokenizer outputs (passed as keyword arguments) to one value per sample."""
        # Element 0 of the backbone output is pooled by the attention head.
        hidden_states = self.bert.roberta(**x)[0]
        pooled = self.head(hidden_states)
        return self.fc(pooled)

# Define the training and validation functions

In [None]:
def train_fn(loader,model,criterion,optimizer,device,scheduler,epoch):
    """Run one training epoch and report the RMSE over all seen samples.

    Args:
        loader: iterable of ``(data, target)`` batches supporting ``len()``;
            ``data`` is a dict of tensors passed to the model as keyword arguments.
        model: module called as ``model(**data)``.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch, or ``None``.
        epoch: current epoch index (unused; kept for interface compatibility).

    Returns:
        The epoch RMSE between predictions and targets (it is also printed).
    """
    model.train()

    allpreds = []
    alltargets = []

    for b_idx, (data, target) in enumerate(loader):
        data = {key: value.to(device) for key, value in data.items()}
        target = target.to(device).float()

        optimizer.zero_grad()
        output = model(**data).squeeze(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # reshape(-1) keeps every batch 1-D.  The former squeeze(-1) on the
        # target turned a batch of size 1 into a 0-d array, which
        # np.concatenate rejects.
        allpreds.append(output.detach().cpu().numpy().reshape(-1))
        alltargets.append(target.detach().cpu().numpy().reshape(-1))

        if b_idx % 50 == 0:
            # Without a scheduler, report the LR straight from the optimizer
            # instead of failing on `None.get_lr()`.
            if scheduler is not None:
                current_lr = scheduler.get_lr()[0]
            else:
                current_lr = optimizer.param_groups[0]['lr']
            print(b_idx, len(loader), loss.item(), current_lr)
        if scheduler is not None:
            scheduler.step()

    allpreds = np.concatenate(allpreds).astype(np.float64)
    alltargets = np.concatenate(alltargets).astype(np.float64)
    # Plain NumPy RMSE: same value as sqrt(mean_squared_error(...)) for 1-D inputs.
    train_rme_loss = np.sqrt(np.mean((alltargets - allpreds) ** 2))
    print(f"rmse_score:{train_rme_loss}")
    return train_rme_loss
        
def val_fn(loader,model,criterion,optimizer,device,scheduler):
    """Evaluate ``model`` on ``loader`` without gradients and return the RMSE.

    Args:
        loader: iterable of ``(data, target)`` batches supporting ``len()``;
            ``data`` is a dict of tensors passed to the model as keyword arguments.
        model: module called as ``model(**data)``.
        criterion, optimizer, scheduler: unused; kept so the signature mirrors
            ``train_fn``.
        device: device the input tensors are moved to.

    Returns:
        RMSE between all predictions and targets (it is also printed).
    """
    model.eval()

    allpreds = []
    alltargets = []

    with torch.no_grad():
        for b_idx, (data, target) in enumerate(loader):
            data = {key: value.to(device) for key, value in data.items()}
            output = model(**data).squeeze(-1)

            # reshape(-1) keeps every batch 1-D.  The former squeeze(-1) on the
            # target turned a batch of size 1 into a 0-d array, which
            # np.concatenate rejects.
            allpreds.append(output.detach().cpu().numpy().reshape(-1))
            alltargets.append(target.detach().cpu().numpy().reshape(-1))
            if b_idx % 20 == 0:
                print(b_idx, len(loader))

    allpreds = np.concatenate(allpreds).astype(np.float64)
    alltargets = np.concatenate(alltargets).astype(np.float64)
    # Plain NumPy RMSE: same value as sqrt(mean_squared_error(...)) for 1-D inputs.
    train_rme_loss = np.sqrt(np.mean((alltargets - allpreds) ** 2))
    print(f"rmse_score:{train_rme_loss}")
    return train_rme_loss

In [None]:
def run():
    """Train ``Model`` on ``df_train`` and validate on ``df_test`` for ``cfg.EPOCHS`` epochs.

    Reads the module-level ``cfg``, ``device``, ``df_train`` and ``df_test``.
    Validation runs once before training and after every epoch; the model
    weights are saved under ``cfg.wpath`` whenever the validation RMSE is
    below 0.52.
    """
    model = Model().to(device)
    if cfg.init_headparam:
        # The stock classification head lives on the wrapped HF model
        # (`model.bert`); `Model` itself has no `classifier` attribute.
        classifier = model.bert.classifier
        nn.init.xavier_normal_(classifier.dense.weight)
        nn.init.constant_(classifier.dense.bias, 0)
        nn.init.xavier_normal_(classifier.out_proj.weight)
        nn.init.constant_(classifier.out_proj.bias, 0)

    train_ds = traindataset(df=df_train, max_len=cfg.max_len)
    train_loader = torch.utils.data.DataLoader(train_ds,
                                               batch_size=cfg.TRAIN_BATCH_SIZE,
                                               num_workers=cfg.NUM_WORKERS,
                                               pin_memory=True,
                                               shuffle=True)
    test_ds = traindataset(df=df_test, max_len=cfg.max_len, test=True)
    val_loader = torch.utils.data.DataLoader(test_ds,
                                             batch_size=cfg.TRAIN_BATCH_SIZE,
                                             num_workers=cfg.NUM_WORKERS,
                                             pin_memory=True,
                                             shuffle=False)

    # NOTE(review): the 12-layer grouping is used even though
    # cfg.transformer_model may name a deeper model; layers outside 0-11 then
    # share the "non-layer" group.  Confirm whether the *_large variant is intended.
    optimizer_parameters = get_optimizer_params_base(model.bert)
    # The groups above only cover `model.bert`.  The attention head and the
    # final linear layer are the layers that actually produce the prediction,
    # so they must be handed to the optimizer too; previously they were never
    # updated and stayed at their initial values.
    optimizer_parameters.append(
        {'params': list(model.head.parameters()) + list(model.fc.parameters()),
         'lr': 1e-3}
    )
    optimizer = AdamW(optimizer_parameters, lr=cfg.LR, betas=(0.9, 0.999))

    # Only needed by the alternative warmup schedulers shown below.
    train_steps = int(len(train_loader) * cfg.EPOCHS)
    num_steps = int(train_steps * cfg.warmuprate)
    #scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)
    #scheduler = get_cosine_schedule_with_warmup(optimizer, num_steps, train_steps)
    # NOTE(review): this scheduler is given a single max_lr/min_lr pair; it
    # presumably overrides the per-group learning rates set above -- verify
    # against its implementation.
    scheduler = CosineAnnealingWarmupRestarts(optimizer,
                                              first_cycle_steps=len(train_loader)//2,
                                              cycle_mult=1.0,
                                              max_lr=1e-4,
                                              min_lr=1e-6,
                                              warmup_steps=len(train_loader)//4,
                                              gamma=0.85)
    criterion = nn.MSELoss()

    # Baseline score of the untrained head before any optimisation step.
    rme = val_fn(val_loader, model, criterion, optimizer, device, scheduler)
    for epoch in range(cfg.EPOCHS):
        train_fn(train_loader, model, criterion, optimizer, device, scheduler, epoch)
        rme = val_fn(val_loader, model, criterion, optimizer, device, scheduler)
        # Keep only checkpoints that beat the fixed RMSE threshold.
        if rme < 0.52:
            torch.save(model.state_dict(), cfg.wpath + f"bertmodel_{epoch}.pt")
            print(f"save epoch_{epoch}")

In [None]:
"important parameter is LR,max_len,weight_decay,warmuprate."
"init_headparam is always True."
"my bestfit model is (LR,max_len,weight_decay,warmuprate)=(2e-5,256,0.01,0.1) in fold 0"
"my bestfit model is (LR,max_len,weight_decay,warmuprate)=(2e-5,256,0.01,0.5) in fold 1"

class cfg:
    """Static configuration values shared by the cells of this notebook."""

    # --- backbone and reproducibility -------------------------------------
    transformer_model = "roberta-large"
    RANDOM_STATE = 15
    SEED = 2020

    # --- data loading and output ------------------------------------------
    NUM_WORKERS = 2
    TRAIN_BATCH_SIZE = 8
    wpath = "/kaggle/working/"
    # Token length every excerpt is padded / truncated to.
    max_len = 256

    # --- optimisation (main fine-tuning targets) --------------------------
    # Upper bound on epochs; per the original note, the best model finished
    # training near epoch 3.
    EPOCHS = 15
    LR = 2e-5
    warmuprate = 0.5
    # Base weight decay handed to the layer-wise optimizer groups.
    attention_wdecay = 0.001
    #nodecay_layer = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    # When True, the weights of the classifier layers are re-initialised.
    init_headparam = False

In [None]:
def set_seed(seed: int = 42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN into deterministic mode.

    Args:
        seed: value used for every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Also covers every additional GPU when more than one is visible.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-tune and pick kernels per run, which works
    # against the reproducibility requested on the line above.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Seed every RNG with the value configured in cfg before any training code runs.
set_seed(cfg.RANDOM_STATE)

In [None]:
# Launch training and validation with the configuration defined in cfg.
run()

In [None]:
#transformers.AutoModelForSequenceClassification.from_pretrained(cfg.transformer_model,num_labels=1)

In [None]:
#model.roberta.encoder.layer[22]