# BERT Simple Baseline 

This is inference code. [Training code is here.](https://www.kaggle.com/hedwig100/bert-simple-baseline?scriptVersionId=64726110) 

The model is a 5-fold roberta-base ensemble. Leaderboard (LB) score = 0.504. 
You can train the models in [this notebook](https://www.kaggle.com/hedwig100/bert-simple-baseline?scriptVersionId=64726110) (the same link as above). 

In [None]:
# Library 
# utils 
import os,gc,pickle,random
from tqdm import tqdm 
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
import warnings 
warnings.simplefilter("ignore",FutureWarning)

# imgs 
import cv2 
import albumentations as A

# torch 
import torch 
import torch.optim as optim 
from torch.utils.data import Dataset,DataLoader 
import torch.nn as nn 
import torch.nn.functional as F 

# other  
import transformers 
from transformers import get_linear_schedule_with_warmup


# Config 
# Directory constants. NOTE(review): neither is referenced in the visible code,
# which hard-codes its "../input/..." paths instead — confirm before relying on them.
INPUT_DIR = "../input/"
OUTPUT_DIR = "./"

# Debug switch; not read anywhere in the visible code.
DEBUG = False 

class CFG:
    """Inference settings; read by test_pred."""
    # utils
    # Forwarded to the DataLoader built in test_pred.
    num_workers = 4
    batch_size = 32

    # bert param
    # Selects the tokenizer / model class in COMMONLITDatasetBert and COMMONLITModelBert.
    model_name = "roberta-base"
    # Passed to the tokenizer as max_length (excerpts are padded/truncated to this many tokens).
    max_sentence = 315
    # Checkpoints loaded one after another in test_pred; their predictions are averaged.
    # NOTE(review): presumably one checkpoint per cross-validation fold — confirm.
    model_path = [
        "../input/commonlitmodels/roberta-base_nb3ver3epoch2.pth",
        "../input/commonlitmodels/roberta-base_nb3ver3epoch6.pth",
        "../input/commonlitmodels/roberta-base_nb3ver3epoch3.pth",
        "../input/commonlitmodels/roberta-base_nb3ver8epoch4.pth",
        "../input/commonlitmodels/roberta-base_nb3ver9epoch9.pth",
    ]

# Utils 
def random_seed(SEED):
    """Seed every random number generator used by this script.

    Exports the seed as ``PYTHONHASHSEED``, seeds Python's ``random``,
    NumPy and PyTorch (CPU and CUDA generators), and switches cuDNN to
    deterministic mode.

    Args:
        SEED (int) : seed value applied to all generators
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_with in seeders:
        seed_with(SEED)

    torch.backends.cudnn.deterministic = True

class AverageMeter(object):
    """Keeps the most recent value plus a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

class Loader():
    """Pickle-based helper for saving and restoring Python objects.

    NOTE: pickle can execute arbitrary code while loading; only call
    ``load`` on files that come from a trusted source.
    """
    def __init__(self):
        pass 
    
    def load(self,path):
        """
        Args:
            path (str) : file from which the data should be loaded
        Returns:
            data : the unpickled object
        """
        # The context manager guarantees the handle is closed, even on error.
        with open(path,"rb") as f:
            return pickle.load(f)
    
    def dump(self,obj,path):
        """
        Args:
            obj (object) : object which should be dumped
            path (str) : file to which the object should be dumped 
        """
        # The original wrote `f.close` without parentheses, so the file was
        # never explicitly closed (and possibly not flushed).
        with open(path,"wb") as f:
            pickle.dump(obj,f)

class History():
    """Accumulates per-epoch training statistics in a DataFrame (``self.df``)."""
    def __init__(self,metric=None,others=None):
        """
        Defaults:
            columns has default values
            - epoch 
            - train_loss
            - valid_loss
            - train_{metric}
            - valid_{metric}
            
        Args:
            metric (str) : metric name, used to build the train_/valid_ column names 
            others (list) : names of other parameters which are logged 
        """

        columns = ["epoch","train_loss","valid_loss"]
        if metric is not None:
            columns.extend([f"train_{metric}",f"valid_{metric}"]) 
        if others is not None:
            columns.extend(others)
        
        self.df = pd.DataFrame(columns=columns)
        self.epoch = 1
    
    def log(self,dict):
        """
        Append one row for the current epoch and advance the epoch counter.

        Args:   
            dict (dict) : dict which should have these keys : "train_loss","valid_loss","train_{metric}","valid_{metric}" 
        """
        # Build the row on a copy so the caller's dict is left untouched.
        row = {**dict,"epoch":self.epoch}
        new_row = pd.DataFrame([row])

        # The original called self.df.append(...) and discarded the result, so
        # nothing was ever stored (and DataFrame.append no longer exists in
        # pandas 2.x). Store the new frame explicitly instead.
        if self.df.empty:
            # First row: keep the declared column order, add any extra keys at
            # the end, and avoid concatenating with an empty frame.
            extra = [c for c in new_row.columns if c not in self.df.columns]
            self.df = new_row.reindex(columns=list(self.df.columns) + extra)
        else:
            self.df = pd.concat([self.df,new_row],ignore_index=True)
        self.epoch += 1
    
    def dump(self,path="./training_history.csv"):
        """Write the logged history to ``path`` as CSV (without the index)."""
        self.df.to_csv(path,index=False)

def metric(pred,target):
    """Return the root-mean-squared error between ``pred`` and ``target``."""
    residual = pred - target
    mean_squared_error = np.mean(residual ** 2)
    return mean_squared_error ** 0.5


# Dataset 
class COMMONLITDatasetBert(Dataset):
    """Dataset that tokenizes the ``excerpt`` column of a DataFrame on access.

    Each item is ``(ids, mask, token_type)`` in "test" mode and
    ``(ids, mask, token_type, target)`` otherwise.
    """
    def __init__(self,df,model_name,mode,max_sentence=315):
        """
        Args:
            df (pd.DataFrame) : needs an "excerpt" column, plus "target" unless mode is "test"
            model_name (str) : "bert-base-uncased", "bert-large-uncased" or "roberta-base"
            mode (str) : "test" returns no target; any other value returns one
            max_sentence (int) : token length every excerpt is padded / truncated to
        Raises:
            ValueError : if model_name is not one of the supported names
        """
        super().__init__()
        self.texts = df["excerpt"].values 
        self.model_name = model_name
        # Tokenizers are loaded from local dataset directories.
        if model_name == "bert-base-uncased":
            self.tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-uncased")
        elif model_name == "bert-large-uncased":
            self.tokenizer = transformers.BertTokenizer.from_pretrained("../input/huggingface-bert/bert-large-uncased")
        elif model_name == "roberta-base":
            self.tokenizer = transformers.RobertaTokenizer.from_pretrained("../input/huggingface-roberta-variants/roberta-base/roberta-base")
        else:
            raise ValueError("This name bert is not here.")
    
        self.mode = mode
        self.max_sentence = max_sentence
        
        if mode != "test":
            self.target = df["target"].values 
    
    def __len__(self):
        return len(self.texts) 
    
    def __getitem__(self,idx):
        sentence = self.texts[idx]
        # padding="max_length" replaces the deprecated pad_to_max_length=True;
        # with max_length set, both pad every sample to exactly max_length.
        tokenized = self.tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=self.max_sentence,
            padding="max_length",
            return_attention_mask=True,
            truncation=True
        )

        ids = torch.tensor(tokenized["input_ids"],dtype=torch.long) 
        mask = torch.tensor(tokenized["attention_mask"],dtype=torch.long)
        if "roberta" not in self.model_name:
            token_type = torch.tensor(tokenized["token_type_ids"],dtype=torch.long)
        else:
            # token_type_ids are not read for roberta here; 0 is a placeholder
            # that keeps the tuple shape uniform across model types.
            token_type = 0  

        if self.mode == "test":
            return (
                ids,
                mask,
                token_type
            )
        
        else: # train and valid
            target = torch.tensor(self.target[idx],dtype=torch.float) 
            return (
                ids,
                mask,
                token_type,
                target
            )

# Model 
class COMMONLITModelBert(nn.Module):
    """Sequence-classification transformer with a single output (num_labels=1)."""
    def __init__(self,model_name):
        """
        Args:
            model_name (str) : "bert-base-uncased", "bert-large-uncased" or "roberta-base"
        Raises:
            ValueError : if model_name is not one of the supported names
        """
        super().__init__()
        self.model_name = model_name
        # Weights are loaded from local dataset directories.
        if model_name == "bert-base-uncased":
            self.model = transformers.BertForSequenceClassification.from_pretrained("../input/bert-base-uncased",num_labels=1)
        elif model_name == "bert-large-uncased":
            self.model = transformers.BertForSequenceClassification.from_pretrained("../input/huggingface-bert/bert-large-uncased",num_labels=1)
        elif model_name == "roberta-base":
            self.model = transformers.RobertaForSequenceClassification.from_pretrained("../input/huggingface-roberta-variants/roberta-base/roberta-base",num_labels=1)
        else:
            raise ValueError("This name bert is not here.")
    
    def forward(self,ids,mask):
        """
        Args:
            ids : token ids fed to the wrapped model as input_ids
            mask : attention mask fed to the wrapped model as attention_mask
        Returns:
            the "logits" entry of the wrapped model's output
        """
        # Call the module itself rather than .forward() so registered hooks
        # run, and pass keywords so the mask cannot bind to the wrong argument.
        output = self.model(input_ids=ids,attention_mask=mask)
        return output["logits"]
      

# Test
def test_fn(test_loader,model,device):
    model.eval() 
    preds = [] 

    for ids,mask,token_type in test_loader:
        ids = ids.to(device,non_blocking=True)
        mask = mask.to(device,non_blocking=True) 

        with torch.no_grad():
            y_pred = model(ids,mask).squeeze(-1)
        
        preds.append(y_pred.detach().to("cpu").numpy())
    
    predictions = np.concatenate(preds)
    return predictions


# Inference (predictions averaged over the saved checkpoints)
def test_pred(test,CFG):
    """Predict targets for ``test`` by averaging over all checkpoints in ``CFG.model_path``.

    Args:
        test (pd.DataFrame) : test data, handed to COMMONLITDatasetBert in "test" mode
        CFG : config object providing model_name, max_sentence, batch_size,
              num_workers and model_path
    Returns:
        np.ndarray : mean prediction per test row across the checkpoints
    Raises:
        ValueError : if CFG.model_path is empty
    """
    # Without this guard the final division is by zero and yields NaN predictions.
    if not CFG.model_path:
        raise ValueError("CFG.model_path must contain at least one checkpoint path.")

    test_dset = COMMONLITDatasetBert(test,CFG.model_name,max_sentence=CFG.max_sentence,mode="test")
    test_loader = DataLoader(test_dset,batch_size=CFG.batch_size,shuffle=False,num_workers=CFG.num_workers,pin_memory=True)

    model = COMMONLITModelBert(model_name=CFG.model_name)
    device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
    model.to(device) 

    preds = np.zeros(len(test)) 
    for path in CFG.model_path:
        print(f"start model {path}")
        # map_location=device covers both the GPU and the CPU-only case, so no
        # device-specific branching is needed when loading the checkpoint.
        model.load_state_dict(torch.load(path,map_location=device))
        preds += test_fn(test_loader,model,device) 
    
    preds /= len(CFG.model_path) 
    return preds 

def main():
    """Read the test data, predict with the checkpoint ensemble, write submission.csv."""
    test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

    # Fill the "target" column with the averaged predictions and save.
    submission = submission.assign(target=test_pred(test_df,CFG))
    submission.to_csv("submission.csv",index=False)
    print("Done !")

In [None]:
# Run the inference pipeline; main() writes submission.csv to the working directory.
main()