# BERT baseline [training + validation]

This kernel trains `BertForSequenceClassification` from Hugging Face's `transformers` library.

I'd appreciate it if you enjoy this kernel and upvote it.
If it helps you, I will post an inference kernel.

In [None]:
import os
import random
import torch
import numpy as np
import pandas as pd
    
def seed_everything(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Value used to seed all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run, which can
    # select different kernels between runs and break reproducibility.
    torch.backends.cudnn.benchmark = False

In [None]:
import torch
import transformers

from typing import Tuple, Dict, Union

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every row must provide the text in an ``excerpt`` column. If the row also
    has a ``target`` column, its value is returned alongside the features as
    a float tensor.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: 'transformers.PreTrainedTokenizer',
        max_sequence_length: int
    ):
        """Store the data frame and the tokenization settings.

        Args:
            df: Frame with an ``excerpt`` column and, optionally, ``target``.
            tokenizer: Callable tokenizer returning a mapping that contains
                ``input_ids`` and ``attention_mask`` (and possibly
                ``token_type_ids``).
            max_sequence_length: Length every sequence is truncated or
                padded to.
        """
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length

    def __len__(self) -> int:
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, i: int) -> Union[Tuple, Dict]:
        """Tokenize row ``i``.

        Returns:
            A dict of ``torch.long`` tensors when the row has no ``target``;
            otherwise a ``(features, target)`` tuple where ``target`` is a
            ``torch.float`` scalar tensor.
        """
        # Fetch the row once instead of doing a positional lookup per column.
        row = self.df.iloc[i]
        target = row.get('target', None)

        encoded = self.tokenizer(
            row['excerpt'],
            add_special_tokens=True,
            max_length=self.max_sequence_length,
            truncation=True,
            padding='max_length'
        )
        # Not every tokenizer emits token_type_ids, so only convert the
        # fields that are actually present instead of raising KeyError.
        features = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
            if key in encoded
        }
        if target is None:
            return features

        return features, torch.tensor(target, dtype=torch.float)

In [None]:
import sklearn.metrics
from typing import Dict

def validate(
    model: transformers.PreTrainedModel,
    tokenizer: transformers.PreTrainedTokenizer,
    valid_df: pd.DataFrame,
    max_sequence_length: int,
    batch_size: int
) -> Dict:
    """Evaluate ``model`` on ``valid_df`` and report loss and RMSE.

    The model is moved to CUDA when available (CPU otherwise) and left in
    evaluation mode when the function returns.

    Args:
        model: Model that, called with ``labels`` and ``return_dict=False``,
            returns a tuple starting with ``(loss, logits)``.
        tokenizer: Tokenizer handed to ``Dataset``.
        valid_df: Frame with ``excerpt`` and ``target`` columns.
        max_sequence_length: Sequence length used for truncation/padding.
        batch_size: Number of rows per evaluation batch.

    Returns:
        A dict with ``mean_loss`` (mean of the per-batch losses) and ``rmse``
        (root mean squared error between targets and logits).

    Raises:
        ValueError: If ``valid_df`` has no rows.
    """
    if len(valid_df) == 0:
        raise ValueError('valid_df must contain at least one row')

    dataset = Dataset(
        valid_df, tokenizer,
        max_sequence_length=max_sequence_length
    )
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Switching to eval mode once is enough; nothing in the loop changes it.
    model.eval()

    feature_keys = {'input_ids', 'attention_mask', 'token_type_ids'}
    losses = []
    logits = []
    labels = []
    with torch.no_grad():
        for _inputs, _labels in loader:
            _inputs = {
                k: v.to(device)
                for k, v in _inputs.items() if k in feature_keys
            }
            _loss, _logits, *_ = model(
                **_inputs,
                labels=_labels.to(device),
                return_dict=False
            )
            losses.append(_loss.item())
            logits.append(_logits.detach().cpu().numpy())
            labels.append(_labels.reshape((-1, 1)).numpy())

    # Take the square root explicitly: the ``squared=False`` flag of
    # mean_squared_error is not available in recent scikit-learn releases.
    mse = sklearn.metrics.mean_squared_error(
        np.vstack(labels), np.vstack(logits)
    )
    rmse = float(np.sqrt(mse))

    return dict(
        mean_loss=(sum(losses) / len(losses)),
        rmse=rmse
    )

In [None]:
import sklearn.model_selection

def make_folds(
    num_folds: int,
    train_csv_path: str = '../input/commonlitreadabilityprize/train.csv'
) -> pd.DataFrame:
    """Load the training CSV and assign each row to a validation fold.

    Rows are split with an unshuffled ``KFold``, so each fold is a contiguous
    slice of the file in its original order.

    Args:
        num_folds: Number of folds to create.
        train_csv_path: Location of the training CSV. Defaults to the path
            this notebook was written against.

    Returns:
        The loaded frame with an extra integer ``fold`` column holding the
        index (``0 .. num_folds - 1``) of the fold in which the row is used
        for validation.
    """
    df = pd.read_csv(train_csv_path)

    # Positional array: KFold yields positional indices, so this stays
    # correct regardless of the frame's index labels.
    fold_ids = np.full(len(df), -1, dtype=np.int64)
    kf = sklearn.model_selection.KFold(num_folds)
    for fold, (_, valid_index) in enumerate(kf.split(df)):
        fold_ids[valid_index] = fold

    df['fold'] = fold_ids
    return df

In [None]:
# Seed all random number generators with the default seed (42).
seed_everything()

In [None]:
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AdamW
)

# Hyperparameters, named once so training and validation cannot drift apart.
NUM_FOLDS = 5
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 12
NUM_EPOCHS = 5
LEARNING_RATE = 5e-5

df = make_folds(NUM_FOLDS)

# Hold out the last fold for validation and train on all the others.
is_train_row = df['fold'].isin(range(NUM_FOLDS - 1))
train_df = df[is_train_row]
valid_df = df[~is_train_row]

model_name_or_path = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

dataset = Dataset(
    train_df, tokenizer,
    max_sequence_length=MAX_SEQUENCE_LENGTH
)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)
# A single output label turns the classification head into a regressor.
model = BertForSequenceClassification.from_pretrained(
    model_name_or_path,
    num_labels=1
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# NOTE(review): AdamW imported from transformers is deprecated upstream;
# consider torch.optim.AdamW (its default weight_decay differs) - confirm.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

feature_keys = {'input_ids', 'attention_mask', 'token_type_ids'}

for epoch in range(1, NUM_EPOCHS + 1):
    # validate() leaves the model in eval mode, so re-enable training mode
    # once at the start of every epoch.
    model.train()
    losses = []
    for inputs, labels in loader:
        inputs = {
            k: v.to(device)
            for k, v in inputs.items() if k in feature_keys
        }
        optimizer.zero_grad()

        loss, *_ = model(
            **inputs,
            labels=labels.to(device),
            return_dict=False
        )
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

    # Drop references to the last batch's tensors before validation runs.
    del inputs, loss
    print('training loss@%d=%f' % (epoch, sum(losses) / len(losses)))
    result = validate(
        model, tokenizer, valid_df,
        max_sequence_length=MAX_SEQUENCE_LENGTH,
        batch_size=BATCH_SIZE
    )
    print('validation loss@%d=%f, rmse=%f' % (epoch, result['mean_loss'], result['rmse']))

In [None]:
from pathlib import Path

# Directory that receives the fine-tuned model and its tokenizer.
output_dir = Path('output/')
# exist_ok makes re-running this cell safe and avoids the check-then-create
# race of a separate existence test; parents covers nested output paths.
output_dir.mkdir(parents=True, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print('saved to model and tokenizer to %s' % str(output_dir))