# BERT Simple Baseline 

This code is almost the same as [this notebook](https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room). <br>

I haven't tried bert-large-uncased yet, so I'll try it next.

----------------------------------------------------

So far I have tried bert-base-uncased, bert-large-uncased, and roberta-base. <br>
Roberta-base performed well. (CV~0.506) <br> 
There should still be room to achieve a better score.

In [None]:
# Library 
# utils 
import os,gc,pickle,random
from tqdm import tqdm 
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
import warnings 
warnings.simplefilter("ignore",FutureWarning)

# imgs 
import cv2 
import albumentations as A

# torch 
import torch 
import torch.optim as optim 
from torch.utils.data import Dataset,DataLoader 
import torch.nn as nn 
import torch.nn.functional as F 

# other  
import transformers 
from transformers import get_linear_schedule_with_warmup


# Config
# Execution environment: "kaggle" or "colab". Both environments currently
# share the same input/output directories, so one branch covers them.
WHERE = "kaggle"
if WHERE in ("kaggle", "colab"):
    INPUT_DIR, OUTPUT_DIR = "../input/", "./"

# Debug switch, notebook tag and version number read elsewhere in this file.
DEBUG = False
NB = "public1"
VERSION = 3

class CFG:
    """Experiment settings, read as plain class attributes (never instantiated here)."""

    # --- runtime / logging ---
    seed = 42
    print_freq = 50       # print the running loss every `print_freq` steps
    num_workers = 4       # DataLoader worker processes
    use_amp = False       # enable mixed-precision training

    # --- training schedule ---
    folds = [0]           # fold ids to train
    n_epoch = 10
    batch_size = 8
    learning_rate = 2e-5

    # --- optimizer / scheduler ---
    optimizer = "adamw"
    weight_decay = 1e-1
    betas = (0.9, 0.999)
    scheduler = "linear_warmup"

    # --- transformer backbone ---
    # Other supported names: "bert-large-uncased", "bert-base-uncased", "roberta-base"
    model_name = "roberta-large"
    max_sentence = 256    # token length each excerpt is padded / truncated to

# In debug mode, train for a single epoch only.
if DEBUG:
    CFG.n_epoch = 1


# Utils 
def random_seed(SEED):
    """Seed every random number generator used in this file.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        SEED (int): seed value applied to every generator.
    """
    random.seed(SEED)
    # Only affects hash randomization of interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can select different kernels from run to run,
    # so it has to be switched off as well for reproducible results.
    torch.backends.cudnn.benchmark = False

class AverageMeter(object):
    """Keep the most recent value and the weighted running mean of a series.

    Attributes:
        val: value passed to the latest ``update`` call.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

class Loader():
    """Thin helper for pickling objects to disk and reading them back."""

    def __init__(self):
        pass

    def load(self, path):
        """Read a pickled object from ``path``.

        Args:
            path (str) : file from which the data should be loaded
        Returns:
            obj : the unpickled object
        """
        # NOTE: unpickling can execute arbitrary code; only load files
        # that were produced by a trusted source.
        with open(path, "rb") as f:
            return pickle.load(f)

    def dump(self, obj, path):
        """Pickle ``obj`` into ``path``.

        Args:
            obj (object) : object which should be dumped
            path (str) : file to which the object should be dumped
        """
        # The context manager guarantees the file is flushed and closed.
        with open(path, "wb") as f:
            pickle.dump(obj, f)

class History():
    """Collects per-epoch training statistics in a DataFrame."""

    def __init__(self, metric=None, others=None):
        """
        Default:
            columns has default value below
            - epoch
            - train_loss
            - valid_loss
            - train_{metric}
            - valid_{metric}

        Args:
            metric (str) : metric name, used to build the train_/valid_ column names
            others (list) : names of other columns which should be logged
        """
        columns = ["epoch", "train_loss", "valid_loss"]
        if metric is not None:
            columns.extend([f"train_{metric}", f"valid_{metric}"])
        if others is not None:
            columns.extend(others)

        self.df = pd.DataFrame(columns=columns)
        self.epoch = 1

    def log(self, dict):
        """Print the values of one epoch and append them as a new row.

        Args:
            dict (dict) : numeric values keyed by column name, e.g.
                "train_loss","valid_loss","train_{metric}","valid_{metric}".
                The caller's dictionary is left unmodified.
        """
        info = " ".join(f"{k}: {v:.4f}" for k, v in dict.items())
        print(f"\n[EPOCH{self.epoch}] {info}\n")

        # Build the row from a copy so the caller's dictionary is not mutated.
        row = pd.DataFrame([{**dict, "epoch": self.epoch}])
        if self.df.empty:
            # First row: keep the declared column order, then any extra keys.
            declared = list(self.df.columns)
            extras = [c for c in row.columns if c not in declared]
            self.df = row.reindex(columns=declared + extras)
        else:
            # DataFrame.append was removed in pandas 2.0; concat replaces it.
            self.df = pd.concat([self.df, row], ignore_index=True)
        self.epoch += 1

    def dump(self, path="./training_history.csv"):
        """Write the collected history to ``path`` as CSV, without the index."""
        self.df.to_csv(path, index=False)

def metric(pred, target):
    """Return the root mean squared error between ``pred`` and ``target``."""
    residual = pred - target
    mean_squared_error = np.mean(residual ** 2)
    return mean_squared_error ** 0.5

# Dataset 
class COMMONLITDatasetBert(Dataset):
    """Dataset that tokenizes the ``excerpt`` column for a BERT/RoBERTa model.

    Args:
        df: DataFrame with an ``excerpt`` column and, unless ``mode`` is
            "test", a ``target`` column.
        model_name (str): one of the keys of ``_TOKENIZERS``.
        mode (str): "test" yields no target; any other value does.
        max_sentence (int): token length every excerpt is padded/truncated to.

    Raises:
        ValueError: if ``model_name`` is not supported.
    """

    # model name -> (tokenizer class name in `transformers`, local pretrained directory)
    _TOKENIZERS = {
        "bert-base-uncased": ("BertTokenizer", "../input/bert-base-uncased"),
        "bert-large-uncased": ("BertTokenizer", "../input/huggingface-bert/bert-large-uncased"),
        "roberta-base": ("RobertaTokenizer", "../input/huggingface-roberta-variants/roberta-base/roberta-base"),
        "roberta-large": ("RobertaTokenizer", "../input/huggingface-roberta-variants/roberta-large/roberta-large"),
    }

    def __init__(self, df, model_name, mode, max_sentence=315):
        super(COMMONLITDatasetBert, self).__init__()
        self.texts = df["excerpt"].values
        self.model_name = model_name
        if model_name not in self._TOKENIZERS:
            raise ValueError("This name bert is not here.")
        tokenizer_cls, pretrained_dir = self._TOKENIZERS[model_name]
        self.tokenizer = getattr(transformers, tokenizer_cls).from_pretrained(pretrained_dir)

        self.mode = mode
        self.max_sentence = max_sentence

        if mode != "test":
            self.target = df["target"].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(ids, mask, token_type)`` plus ``target`` unless in test mode."""
        sentence = self.texts[idx]
        tokenized = self.tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=self.max_sentence,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding="max_length",
            return_attention_mask=True,
            truncation=True,
        )

        ids = torch.tensor(tokenized["input_ids"], dtype=torch.long)
        mask = torch.tensor(tokenized["attention_mask"], dtype=torch.long)
        if "roberta" not in self.model_name:
            token_type = torch.tensor(tokenized["token_type_ids"], dtype=torch.long)
        else:
            # token_type_ids are not read for RoBERTa here; 0 is a placeholder
            # that keeps the returned tuple the same length for every model.
            token_type = 0

        if self.mode == "test":
            return ids, mask, token_type

        # train and valid
        target = torch.tensor(self.target[idx], dtype=torch.float)
        return ids, mask, token_type, target

# Model 
class COMMONLITModelBert(nn.Module):
    """Pretrained BERT/RoBERTa sequence-classification model with one output label.

    Args:
        model_name (str): one of the keys of ``_MODELS``.

    Raises:
        ValueError: if ``model_name`` is not supported.
    """

    # model name -> (model class name in `transformers`, local pretrained directory)
    _MODELS = {
        "bert-base-uncased": ("BertForSequenceClassification", "../input/bert-base-uncased"),
        "bert-large-uncased": ("BertForSequenceClassification", "../input/huggingface-bert/bert-large-uncased"),
        "roberta-base": ("RobertaForSequenceClassification", "../input/huggingface-roberta-variants/roberta-base/roberta-base"),
        "roberta-large": ("RobertaForSequenceClassification", "../input/huggingface-roberta-variants/roberta-large/roberta-large"),
    }

    def __init__(self, model_name):
        super(COMMONLITModelBert, self).__init__()
        self.model_name = model_name
        if model_name not in self._MODELS:
            raise ValueError("This name bert is not here.")
        model_cls, pretrained_dir = self._MODELS[model_name]
        self.model = getattr(transformers, model_cls).from_pretrained(pretrained_dir, num_labels=1)

    def forward(self, ids, mask):
        """Return the logits of the wrapped model.

        Args:
            ids: token id tensor (presumably (batch, seq_len) -- confirm against the dataset).
            mask: attention mask with the same shape as ``ids``.
        """
        # Call the module itself rather than `.forward` so that any
        # registered hooks run; keywords make the argument mapping explicit.
        output = self.model(input_ids=ids, attention_mask=mask)
        return output["logits"]
        

# Train,Valid 
def train_fn(train_loader,model,optimizer,criterion,device,epoch,scheduler):
    """Train ``model`` for one epoch.

    Args:
        train_loader: iterable yielding ``(ids, mask, token_type, target)`` batches.
        model: module called as ``model(ids, mask)``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(prediction, target)``.
        device: device the batch tensors are moved to.
        epoch (int): zero-based epoch index, used only for logging.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        tuple: (average loss, concatenated predictions, concatenated targets);
        the last two are 1-D numpy arrays.
    """
    model.train()
    preds = []
    targets = []
    losses = AverageMeter()
    # NOTE: a new GradScaler is created on every call, so its scale state
    # does not carry over from one epoch to the next.
    scaler = torch.cuda.amp.GradScaler() if CFG.use_amp else None
    train_len = len(train_loader)

    # token_type is part of every batch but is not consumed by the model.
    for step, (ids, mask, _token_type, target) in enumerate(train_loader, start=1):
        batch_size = ids.shape[0]
        ids = ids.to(device, non_blocking=True)
        mask = mask.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        optimizer.zero_grad()
        if CFG.use_amp:
            with torch.cuda.amp.autocast():
                y_pred = model(ids, mask).squeeze(-1)
                loss = criterion(y_pred, target)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            y_pred = model(ids, mask).squeeze(-1)
            loss = criterion(y_pred, target)
            loss.backward()
            optimizer.step()
        scheduler.step()

        losses.update(loss.item(), batch_size)
        # reshape(-1) keeps every batch 1-D. squeeze(-1) would turn a batch
        # holding a single sample into a 0-d array, which np.concatenate rejects.
        preds.append(y_pred.detach().reshape(-1).to("cpu").numpy())
        targets.append(target.detach().reshape(-1).to("cpu").numpy())

        if step % CFG.print_freq == 0 or step == train_len:
            print(f"Epoch: [{epoch + 1}][{step}/{train_len}] "
                  f"Loss: {losses.val:.4f}({losses.avg:.4f}) ")

    predictions = np.concatenate(preds)
    targets = np.concatenate(targets)
    return losses.avg, predictions, targets

def valid_fn(valid_loader,model,criterion,device,epoch):
    """Evaluate ``model`` on the validation data without updating weights.

    Args:
        valid_loader: iterable yielding ``(ids, mask, token_type, target)`` batches.
        model: module called as ``model(ids, mask)``.
        criterion: loss called as ``criterion(prediction, target)``.
        device: device the batch tensors are moved to.
        epoch (int): zero-based epoch index, used only for logging.

    Returns:
        tuple: (average loss, concatenated predictions, concatenated targets);
        the last two are 1-D numpy arrays.
    """
    model.eval()
    preds = []
    targets = []
    losses = AverageMeter()
    valid_len = len(valid_loader)

    # token_type is part of every batch but is not consumed by the model.
    for step, (ids, mask, _token_type, target) in enumerate(valid_loader, start=1):
        batch_size = ids.shape[0]
        ids = ids.to(device, non_blocking=True)
        mask = mask.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        with torch.no_grad():
            y_pred = model(ids, mask).squeeze(-1)
            loss = criterion(y_pred, target)

        losses.update(loss.item(), batch_size)
        # The last validation batch may contain a single sample. squeeze(-1)
        # would collapse it to a 0-d array and break np.concatenate below,
        # so flatten with reshape(-1) instead.
        preds.append(y_pred.detach().reshape(-1).to("cpu").numpy())
        targets.append(target.detach().reshape(-1).to("cpu").numpy())

        if step % CFG.print_freq == 0 or step == valid_len:
            print(f"Valid : [{epoch + 1}][{step}/{valid_len}] "
                  f"Loss: {losses.val:.4f}({losses.avg:.4f}) ")

    predictions = np.concatenate(preds)
    targets = np.concatenate(targets)
    return losses.avg, predictions, targets

# Training 
def train_loop(df,fold):
    """Train and validate one fold, saving the best checkpoint and predictions.

    Args:
        df: DataFrame with ``excerpt``, ``target`` and ``fold`` columns.
        fold: rows whose ``fold`` equals this value form the validation set;
            all other rows are used for training.

    Returns:
        History: per-epoch losses and RMSE (also written to ``{model_path}.csv``).

    Raises:
        ValueError: if ``CFG.optimizer`` or ``CFG.scheduler`` is not supported.
    """
    print("-"*20,f"Fold {fold}","-"*20)
    random_seed(CFG.seed)
    loader = Loader()

    train = df[df["fold"] != fold].reset_index(drop=True)
    valid = df[df["fold"] == fold].reset_index(drop=True)
    if DEBUG:
        # Tiny subsets: one training batch and one validation batch.
        train = train.sample(n=CFG.batch_size).reset_index(drop=True)
        valid = valid.sample(n=CFG.batch_size*2).reset_index(drop=True)

    train_dset = COMMONLITDatasetBert(train,CFG.model_name,"train",max_sentence=CFG.max_sentence)
    valid_dset = COMMONLITDatasetBert(valid,CFG.model_name,"valid",max_sentence=CFG.max_sentence)

    train_loader = DataLoader(train_dset,batch_size=CFG.batch_size,shuffle=True,num_workers=CFG.num_workers,pin_memory=True,drop_last=True)
    valid_loader = DataLoader(valid_dset,batch_size=CFG.batch_size*2,shuffle=False,num_workers=CFG.num_workers,pin_memory=True,drop_last=False)

    device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
    model = COMMONLITModelBert(model_name=CFG.model_name)
    model.to(device)

    if CFG.optimizer == "adamw":
        # The config asks for AdamW (decoupled weight decay). Plain Adam would
        # apply weight_decay as an L2 term on the gradients and ignore CFG.betas.
        optimizer = optim.AdamW(
            model.parameters(),
            lr=CFG.learning_rate,
            betas=CFG.betas,
            weight_decay=CFG.weight_decay,
        )
    else:
        raise ValueError(f"Unsupported optimizer: {CFG.optimizer}")

    if CFG.scheduler == "linear_warmup":
        # The scheduler is stepped once per batch, and drop_last=True discards
        # the incomplete batch, so count the steps from the loader itself.
        train_steps = len(train_loader) * CFG.n_epoch
        num_warmup_steps = int(train_steps * 0.1)  # warm up over the first 10% of steps
        scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps,train_steps)
    else:
        raise ValueError(f"Unsupported scheduler: {CFG.scheduler}")

    criterion = nn.MSELoss()
    n_epochs = CFG.n_epoch

    best_loss = 1e20
    history = History(metric="rmse")
    model_path = f"{OUTPUT_DIR}{CFG.model_name}_nb{NB}ver{VERSION}fold{fold}"

    for epoch in range(n_epochs):
        tr_loss,tr_pred,tr_target = train_fn(train_loader,model,optimizer,criterion,device,epoch,scheduler)
        val_loss,val_pred,val_target = valid_fn(valid_loader,model,criterion,device,epoch)

        tr_rmse = metric(tr_pred,tr_target)
        val_rmse = metric(val_pred,val_target)

        history.log({
            "train_loss":tr_loss,
            "valid_loss":val_loss,
            "train_rmse":tr_rmse,
            "valid_rmse":val_rmse
        })

        # Checkpoint whenever the validation loss improves. Every improving
        # epoch writes its own .pth file; the prediction pickles are overwritten.
        if best_loss > val_loss:
            print(f"score update! {best_loss} -> {val_loss}\n")
            best_loss = val_loss
            torch.save(model.state_dict(),f"{model_path}epoch{epoch}.pth")
            loader.dump(tr_pred,f"{model_path}fold{fold}_tr.pkl")
            loader.dump(val_pred,f"{model_path}fold{fold}_val.pkl")

    gc.collect()

    history.dump(f"{model_path}.csv")
    return history

def main():
    """Train every fold listed in ``CFG.folds`` and plot its loss curves."""
    folds_df = pd.read_csv(f"{INPUT_DIR}commonlitutils/train_folds.csv")

    for fold in CFG.folds:
        fold_history = train_loop(folds_df, fold=fold)
        curves = fold_history.df

        # One figure per fold with the train and validation loss per epoch.
        _, axis = plt.subplots(1, 1, figsize=(10, 7))
        for column, label in (("train_loss", "train loss"), ("valid_loss", "valid loss")):
            axis.plot(curves[column], label=label)
        axis.set_title(f"FOLD {fold}")
        plt.legend()
        plt.show()

In [None]:
# Run training for every configured fold and show the loss curves.
main()