In [None]:
!pip install torchsummary
!pip install transformers

In [None]:
import math,os,cv2,random
from tqdm import tqdm
import pandas as pd
import numpy as np

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler

#transformers
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

# exclude warnings
import warnings
warnings.simplefilter('ignore')

# Read the training CSV file

In [None]:
# Location of the competition's training table.
trainpath = "../input/commonlitreadabilityprize/train.csv"
# Only these four columns are kept; every other column in the file is dropped.
KEEP_COLUMNS = ['id', 'excerpt', 'target', 'standard_error']
df = pd.read_csv(trainpath).loc[:, KEEP_COLUMNS]
# Last expression of the cell: shows the first rows as a sanity check.
df.head()

# Make cross-validation folds
Cross-validation score for each fold:  
fold 0: 0.49  
fold 1: 0.52  
fold 2: 0.50  
fold 3: 0.51  
fold 4: 0.50  
LB score: 0.494

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

RANDOM_STATE = 42
N_SPLITS = 5
# Zero-based index of the fold that is held out for validation.
fold = 2

kfold = KFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)
# Not used by the split below; kept so the name stays defined for other cells.
skfold = StratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# Fail loudly on a bad fold index instead of silently training on some other fold.
if not 0 <= fold < N_SPLITS:
    raise ValueError(f"fold must be between 0 and {N_SPLITS - 1}, got {fold}")

# With a fixed random_state the shuffled split is deterministic, so the wanted
# fold can be picked directly rather than building DataFrames for every
# earlier fold and throwing them away.
splits = list(kfold.split(df))
train_index, test_index = splits[fold]
print(train_index.shape,test_index.shape)

# Reset the index so positional and label-based access agree downstream.
df_train = df.iloc[train_index,:].reset_index(drop=True)
df_test = df.iloc[test_index,:].reset_index(drop=True)

# define the device and fix the seed

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and switches
    cuDNN to deterministic behaviour.

    Args:
        seed: Value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Auto-tuning (benchmark=True) may choose different kernels from run to
    # run, which defeats the deterministic flag above, so it is disabled.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Seed all RNGs with the same value that drives the K-fold shuffle.
set_seed(RANDOM_STATE)

# define the dataset

In [None]:
class traindataset(Dataset):
    """Dataset that pairs each tokenized excerpt with its regression target.

    Every item is a ``(features, target)`` tuple: ``features`` is a dict with
    ``"input_ids"`` and ``"attention_mask"`` tensors for one excerpt, and
    ``target`` is a float32 scalar tensor.
    """

    def __init__(self, df, max_len):
        """Store the frame and load the tokenizer.

        Args:
            df: DataFrame providing an ``excerpt`` and a ``target`` column.
            max_len: Length the tokenizer pads or truncates each excerpt to.
        """
        self.df = df
        self.max_len = max_len
        # The checkpoint name is read from the notebook-level ``cfg`` class.
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.transformer_model)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Tokenize the excerpt at position ``index`` and return it with its target."""
        row = self.df.iloc[index]
        encoded = self.tokenizer(row.excerpt, padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")

        # With return_tensors="pt" the tokenizer output already consists of
        # tensors carrying a leading axis of size one; indexing removes that
        # axis. Re-wrapping them in torch.tensor() would only add a copy and a
        # copy-construct warning.
        features = {
            "input_ids": encoded["input_ids"][0],
            "attention_mask": encoded["attention_mask"][0],
        }
        # Pin the dtype so the target is float32 regardless of how pandas
        # stores the value in the row.
        return features, torch.tensor(row.target, dtype=torch.float32)

# define train function and validation function

In [None]:
def train_fn(loader,model,criterion,optimizer,device,scheduler,epoch):
    """Run one training pass over ``loader`` and report the RMSE of that pass.

    Args:
        loader: Iterable of ``(features, target)`` batches, where ``features``
            is a dict of tensors forwarded to the model as keyword arguments.
        model: Module whose forward output exposes a ``.logits`` tensor.
        criterion: Loss invoked as ``criterion(prediction, target)``.
        optimizer: Optimizer; stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, or ``None``
            to train with the optimizer's own learning rate.
        epoch: Index of the current epoch (not used inside the function).

    Returns:
        The root-mean-squared error, as a float, between all predictions made
        during the pass and their targets.
    """
    model.train()

    allpreds = []
    alltargets = []

    for b_idx, (data, target) in enumerate(loader):
        data = {key: value.to(device) for key, value in data.items()}
        target = target.to(device).float()

        optimizer.zero_grad()
        output = model(**data).logits.squeeze(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # reshape(-1) keeps every chunk one-dimensional, so a final batch
        # holding a single sample can still be concatenated below.
        allpreds.append(output.detach().cpu().numpy().reshape(-1))
        alltargets.append(target.detach().cpu().numpy().reshape(-1))

        if b_idx % 50 == 0:
            # Without a scheduler the rate is read straight from the optimizer.
            if scheduler is not None:
                current_lr = scheduler.get_last_lr()[0]
            else:
                current_lr = optimizer.param_groups[0]["lr"]
            print(b_idx, len(loader), loss.item(), current_lr)
        if scheduler is not None:
            scheduler.step()

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    train_rmse = float(np.sqrt(mean_squared_error(alltargets, allpreds)))
    print(f"rmse_score:{train_rmse}")
    return train_rmse
        
def val_fn(loader,model,criterion,optimizer,device,scheduler):
    """Evaluate ``model`` on ``loader`` without gradients and report the RMSE.

    Args:
        loader: Iterable of ``(features, target)`` batches, where ``features``
            is a dict of tensors forwarded to the model as keyword arguments.
        model: Module whose forward output exposes a ``.logits`` tensor.
        criterion: Not used; accepted so the signature mirrors ``train_fn``.
        optimizer: Not used; accepted so the signature mirrors ``train_fn``.
        device: Device the batch tensors are moved to.
        scheduler: Not used; accepted so the signature mirrors ``train_fn``.

    Returns:
        The root-mean-squared error, as a float, between the model's
        predictions and the targets over the whole loader.
    """
    model.eval()

    allpreds = []
    alltargets = []

    with torch.no_grad():
        for b_idx, (data, target) in enumerate(loader):
            data = {key: value.to(device) for key, value in data.items()}
            output = model(**data).logits.squeeze(-1)

            # reshape(-1) keeps every chunk one-dimensional, so a final batch
            # holding a single sample can still be concatenated below.
            allpreds.append(output.detach().cpu().numpy().reshape(-1))
            alltargets.append(target.detach().cpu().numpy().reshape(-1))
            if b_idx % 20 == 0:
                print(b_idx, len(loader))

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    val_rmse = float(np.sqrt(mean_squared_error(alltargets, allpreds)))
    print(f"rmse_score:{val_rmse}")
    return val_rmse

In [None]:
def run():
    """Fine-tune the configured transformer on the selected fold.

    Builds the model, data loaders, optimizer and warm-up scheduler from the
    notebook-level ``cfg``, ``device``, ``df_train`` and ``df_test``. It
    evaluates once before training as a baseline, then trains for
    ``cfg.EPOCHS`` epochs, evaluating and saving a checkpoint after each one.
    """
    model = transformers.AutoModelForSequenceClassification.from_pretrained(cfg.transformer_model,num_labels=1).to(device)
    if cfg.init_headparam:
        # Re-initialise the regression head. This expects the head to expose
        # ``dense`` and ``out_proj`` layers.
        nn.init.xavier_normal_(model.classifier.dense.weight)
        nn.init.constant_(model.classifier.dense.bias,0)
        nn.init.xavier_normal_(model.classifier.out_proj.weight)
        nn.init.constant_(model.classifier.out_proj.bias,0)

    train_ds = traindataset(df=df_train, max_len=cfg.max_len)
    train_loader = torch.utils.data.DataLoader(train_ds,
                                               batch_size=cfg.TRAIN_BATCH_SIZE,
                                               num_workers=cfg.NUM_WORKERS,
                                               pin_memory=True,
                                               shuffle=True)
    test_ds = traindataset(df=df_test, max_len=cfg.max_len)
    val_loader = torch.utils.data.DataLoader(test_ds,
                                             batch_size=cfg.TRAIN_BATCH_SIZE,
                                             num_workers=cfg.NUM_WORKERS,
                                             pin_memory=True,
                                             shuffle=False)

    # Parameters whose name contains any entry of cfg.nodecay_layer are
    # trained without weight decay; everything else uses cfg.weight_decay.
    param_optimizer = list(model.named_parameters())
    no_decay = cfg.nodecay_layer
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': cfg.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0},]
    optimizer = AdamW(optimizer_parameters,lr= cfg.LR,betas=(0.9, 0.999))

    # The scheduler is stepped once per batch, so its horizon is counted in batches.
    train_steps = int(len(train_loader)*cfg.EPOCHS)
    warmup_steps = int(train_steps*cfg.warmuprate)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, train_steps)
    criterion = nn.MSELoss()

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(cfg.wpath, exist_ok=True)

    # Baseline score of the model before any fine-tuning.
    val_fn(val_loader,model,criterion,optimizer,device,scheduler)
    for epoch in range(cfg.EPOCHS):
        train_fn(train_loader,model,criterion,optimizer,device,scheduler,epoch)

        val_fn(val_loader,model,criterion,optimizer,device,scheduler)
        # os.path.join inserts the separator that plain string concatenation
        # would omit when cfg.wpath has no trailing slash.
        checkpoint_path = os.path.join(cfg.wpath, f"bertmodel_{epoch}.pt")
        torch.save(model.state_dict(), checkpoint_path)
        print(f"save epoch_{epoch}")

In [None]:
"important parameter is LR,max_len,weight_decay,warmuprate."
"init_headparam is always True."
"my bestfit model is (LR,max_len,weight_decay,warmuprate)=(2e-5,256,0.01,0.1) in fold 0"
"my bestfit model is (LR,max_len,weight_decay,warmuprate)=(2e-5,256,0.01,0.5) in fold 1"
class cfg:
    """Static settings read by the dataset class and by ``run()``."""

    # ---- model -------------------------------------------------------------
    # Hugging Face checkpoint used for both the tokenizer and the model.
    transformer_model = "distilroberta-base"
    # When True, run() re-initialises the weights of the classifier head.
    init_headparam = True
    # Length each excerpt is padded/truncated to by the tokenizer.
    max_len = 256

    # ---- optimisation (the main fine-tuning targets) -------------------------
    LR = 2e-5
    # Five epochs are scheduled, although the best model was reached after
    # roughly three.
    EPOCHS = 5
    # Fraction of all training steps spent on learning-rate warm-up.
    warmuprate = 0.3
    weight_decay = 0.01
    # Parameters whose name contains one of these substrings get no weight decay.
    nodecay_layer = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    # ---- runtime -------------------------------------------------------------
    TRAIN_BATCH_SIZE = 8
    NUM_WORKERS = 2
    # Not referenced by the visible cells; seeding there uses RANDOM_STATE.
    SEED = 2020
    # Path prefix run() uses when saving checkpoints.
    wpath = "/kaggle/working"

In [None]:
# Entry point: start the fine-tuning loop defined in run() with the settings from cfg.
run()