# Pytorch BERT + Stratified K Fold [train]
## Introduction

This notebook is the BERT baseline (using HuggingFace) with stratified K fold.  
I stratify the dataset on the target distribution.

Inference part is here:  
https://www.kaggle.com/atsushiiwasaki/commonlit-bert-stratified-k-fold-baseline-infer

You can choose one of the following BERT variants:
* BERT
* DISTILBERT
* ROBERTA

The hyper-parameters, optimizer, scheduler, and their settings have not been tuned.  
Please try some experiments of your own. Thanks.

## Contents
1. Libraries
1. Configuration
1. Data (Dataset, DataLoader)
1. Criterion
1. Model
1. Optimizer
1. Training/Inference
1. Run
1. Calculate CV Score

## Update
* v9  : fully connected layer -> kaiming_normal
* v10 : back to Distilbert (LB: 0.497, CV: 0.533)
* v11 : change random seed
* v12 : change random seed
* v14 : change max_len in tokenizer 210 -> 250, change random seed 777 -> 28
* v15 : add CV score calculation section

# Libraries

In [None]:
import os
import random
import gc
from pprint import pprint
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style='darkgrid')

from sklearn.model_selection import StratifiedKFold

%matplotlib inline

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig

# Configuration

In [None]:
DEBUG = False

TRAIN = '../input/commonlitreadabilityprize/train.csv'
TEST = '../input/commonlitreadabilityprize/test.csv'

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode so
    that repeated runs with the same seed are reproducible.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels + no auto-tuner: benchmark mode may pick different
    # algorithms from run to run, which breaks reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

# Global seed; also used as `random_state` of the stratified fold split.
SEED = 28
seed_everything(SEED)

# Global configuration dictionary, filled in section by section below.
cfg ={}

In [None]:
# Pretrained checkpoint names of the supported architectures.
# BERT
BERT = 'bert-base-uncased'

# Distilbert
DISTILBERT = 'distilbert-base-uncased'

# Roberta
ROBERTA = 'roberta-base'



# Start from an empty configuration; later cells add their own sections.
cfg ={}

# Architecture used for both the tokenizer and the model.
ARCH_PATH = DISTILBERT

# Number of cross-validation folds.
cfg['train'] = {'n_folds': 5}

# Data

## Stratify on target distribution

In [None]:
def get_bin_stratified(df, n_bins=10, n_splits=5):
    """Add stratified fold assignments to ``df`` in place.

    The continuous ``target`` column is cut into ``n_bins`` equal-width bins
    (stored in a new ``bin`` column) and the rows are split into ``n_splits``
    folds stratified on that bin. The fold number of every row is stored in a
    new int8 ``fold`` column.

    Args:
        df: DataFrame with at least the columns ``id`` and ``target``.
        n_bins: Number of equal-width target bins used for stratification.
        n_splits: Number of folds.

    Returns:
        None. ``df`` is modified in place.
    """
    df['bin'] = pd.cut(df.target, n_bins, labels=list(range(n_bins)))

    # `split` yields positional indices, so collect the assignment in a plain
    # array instead of writing through label-based `df.loc`; this stays
    # correct even when `df` does not carry a default RangeIndex.
    folds = np.full(len(df), -1, dtype='int8')

    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    for fold, (_, idx_val) in enumerate(skf.split(df.id, y=df.bin)):
        folds[idx_val] = fold

    df['fold'] = folds

In [None]:
# Load the training data and attach the `bin` / `fold` columns in place.
df = pd.read_csv(TRAIN)
get_bin_stratified(df)

# Overlay one target histogram per fold to check the stratification visually.
plt.figure(figsize=(12, 6))
for fold in range(cfg['train']['n_folds']):
    sns.histplot(data=df.loc[df.fold==fold], x='target', bins=10, hue='fold', label=f'fold{fold}')
    
plt.title('Target Distribution for Each Fold')
plt.legend()
plt.show()

## Tokenizer

In [None]:
# Tokenizer settings: checkpoint name and the fixed sequence length that every
# excerpt is padded / truncated to.
cfg['tokenizer'] ={'name': ARCH_PATH, 
                   'max_length': 250}

tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

In [None]:
# Debug only: tokenize one excerpt and dump every field the tokenizer returns.
if DEBUG:
    # SEED is reused here merely as a row label to pick a sample excerpt.
    text = df.loc[SEED, 'excerpt']
    print('Text Length ', len(text.split(' ')))
    print()
    
    text_tokenized = tokenizer.encode_plus(
                        text,
                        add_special_tokens=True,
                        padding='max_length',
                        max_length=cfg['tokenizer']['max_length'], 
                        truncation=True
                        )
    
    for key, value in text_tokenized.items():
        print(key, type(value))
        print(value)
        print()

## Dataset

In [None]:
class CommonLitDataset(Dataset):
    """Map-style dataset that tokenizes excerpts on the fly.

    Args:
        df: DataFrame with the columns ``excerpt`` (text) and ``target``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a HuggingFace
            tokenizer).
        max_len: Length every excerpt is padded / truncated to.

    Each item is a pair ``(inputs, targets)``:
        * ``inputs``: dict with ``ids`` and ``mask`` (LongTensor) and
          ``token_type_ids`` (the tokenizer's segment ids when it produces
          them, otherwise the scalar placeholder ``1.``).
        * ``targets``: dict with ``target``, a float32 tensor of shape (1,).
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Positional access: consistent with __len__ and independent of the
        # frame's index labels; raises IndexError past the end, which is what
        # plain iteration over the dataset relies on to stop.
        text = self.df['excerpt'].iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True
        )

        # Decide from the tokenizer output itself instead of a global config
        # entry; tokenizers without segment ids get a scalar placeholder so
        # that batches still collate.
        if 'token_type_ids' in inputs:
            token_type_ids = inputs['token_type_ids']
        else:
            token_type_ids = 1.

        target = [float(self.df['target'].iloc[index])]

        return {
            'ids': torch.LongTensor(inputs['input_ids']),
            'mask': torch.LongTensor(inputs['attention_mask']),
            'token_type_ids': torch.tensor(token_type_ids)
        }, {
            'target': torch.tensor(target, dtype=torch.float32)
        }

In [None]:
# Debug only: build the dataset on the full frame and print the first sample.
if DEBUG:
    ds = CommonLitDataset(df=df, 
                          tokenizer=tokenizer, 
                          max_len=cfg['tokenizer']['max_length'])
    assert len(df) == len(ds)
    
    # Take the first (inputs, targets) pair.
    ds = iter(ds)
    inputs, targets = next(ds)
    
    for k, v in inputs.items():
        print(k, v.dtype)
        print(v)
        print()
        
    for k, v in targets.items():
        print(k, v.dtype)
        print(v)
        print()

## Dataloader

In [None]:
# Keyword arguments for the training DataLoader: shuffled, smaller batches on
# CPU, one worker per CPU reported by os.cpu_count().
cfg['dl_train'] = {
    'batch_size': 8 if device.type=='cpu' else 32, 
    'shuffle': True, 
    'num_workers': os.cpu_count(), 
    'pin_memory': True
}

# Keyword arguments for the validation DataLoader: fixed order, larger
# batches on GPU.
cfg['dl_val'] = {
    'batch_size': 8 if device.type=='cpu' else 64, 
    'shuffle': False, 
    'num_workers': os.cpu_count(), 
    'pin_memory': True
}

In [None]:
# Debug only: draw one training batch and print the shape of its token ids.
if DEBUG:
    ds = CommonLitDataset(df=df, 
                          tokenizer=tokenizer, 
                          max_len=cfg['tokenizer']['max_length'])
    
    dl = DataLoader(ds, **cfg['dl_train'])
    
    for data in dl:
        # data[0] is the inputs dict, data[1] the targets dict.
        print(data[0]['ids'].detach().cpu().size())
        break

# Model

In [None]:
# The model uses the same pretrained checkpoint as the tokenizer.
cfg['model'] = {'name': ARCH_PATH}

In [None]:
class CommonLitBERT(nn.Module):
    """Pretrained transformer encoder with a single-output regression head.

    For the BERT and RoBERTa checkpoints the encoder's own pooled output is
    used. For every other checkpoint (DistilBERT in particular) the hidden
    state of the first token is passed through ``dense`` + ``tanh``. The
    result goes through dropout and the final linear layer ``fc``.

    Args:
        name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        dropout: When truthy (default) dropout with p=0.2 is applied before
            the final layer; when falsy dropout is disabled.
    """

    def __init__(self, name, dropout=True):
        super().__init__()
        self.bert = AutoModel.from_pretrained(name)
        self.name = name

        # Read the encoder width from its config instead of hard-coding 768
        # or probing a specific layer, so other model sizes work as well.
        self.in_features = self.bert.config.hidden_size

        # Attribute names and creation order are kept so that existing
        # checkpoints still load and weight initialisation is unchanged.
        self.fc = nn.Linear(self.in_features, 1)
        self.dense = nn.Linear(self.in_features, self.in_features)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(p=0.2 if dropout else 0.0)

        torch.nn.init.kaiming_normal_(self.dense.weight)
        torch.nn.init.kaiming_normal_(self.fc.weight)

    def forward(self, ids, mask, token_type_ids):
        """Return one regression output per sample.

        Args:
            ids: Token ids.
            mask: Attention mask.
            token_type_ids: Segment ids; only forwarded to the encoder for
                the BERT checkpoint and ignored otherwise.
        """
        if self.name == BERT:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  token_type_ids=token_type_ids,
                                  return_dict=False)
        elif self.name == ROBERTA:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  return_dict=False)
        else:
            # DistilBERT and any other checkpoint: pool the first token here.
            # This also covers names outside the three constants, which
            # previously left `output` undefined.
            last_hidden_state = self.bert(ids,
                                          attention_mask=mask,
                                          return_dict=False)[0]
            first_token_tensor = last_hidden_state[:, 0]
            output = self.activation(self.dense(first_token_tensor))

        output = self.dropout(output)
        output = self.fc(output)
        return output

In [None]:
# Debug only: run one batch from the debug DataLoader `dl` through a freshly
# created model and print the raw outputs.
if DEBUG:
    model = CommonLitBERT(name=cfg['model']['name'])
    data = next(iter(dl))
    inputs = data[0]
    outputs = model(**inputs)
    print(outputs)

# Criterion

In [None]:
def CommonLitMetric(y_pred, y_gt):
    """Return the root-mean-squared error between two equally shaped tensors.

    Args:
        y_pred: Predicted values.
        y_gt: Ground-truth values; must have exactly the same size as
            ``y_pred``.

    Returns:
        Scalar tensor holding sqrt(mean((y_pred - y_gt) ** 2)).
    """
    assert y_pred.size() == y_gt.size()

    mean_squared_error_value = nn.functional.mse_loss(y_pred, y_gt)
    return torch.sqrt(mean_squared_error_value)

In [None]:
# Debug only: check autograd's gradient of the RMSE against the closed form.
if DEBUG:
    def RMSE_grad(y_pred, y_gt):
        """Closed-form gradient of the RMSE with respect to y_pred."""
        # y_pred differential
        delta = y_pred - y_gt
        N = len(delta)

        # d(RMSE)/d(y_pred) = (delta / N) / RMSE
        dL = delta / N
        dy = torch.sqrt((delta**2).sum() / N)

        return dL/dy
    
    y_pred = torch.tensor([[6], [5]], dtype=torch.float32, requires_grad=True)
    y_gt = torch.tensor([[2], [4]], dtype=torch.float32, requires_grad=True)
    
    metric = CommonLitMetric(y_pred, y_gt)
    metric.backward()
    
    # Compare the autograd result with the closed form element by element.
    for i in range(len(y_pred)):
        assert y_pred.grad[i] == RMSE_grad(y_pred, y_gt).data[i], f'{i}th element is not consistent.'

# Optimizer

In [None]:
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

from transformers import get_cosine_schedule_with_warmup
from transformers import AdamW

# Keyword arguments for AdamW.
cfg['optim'] = {'lr': 3e-5}
# Keyword arguments for get_cosine_schedule_with_warmup. The training loop in
# main() steps the scheduler once per epoch, so these counts are in epochs.
cfg['scheduler'] = {'num_warmup_steps': 2, 
                    'num_training_steps': 5, 
#                     'num_cycles': 1,
                   }

In [None]:
# Debug only: plot the learning rate produced by the scheduler over 50 steps.
if DEBUG:
    model = CommonLitBERT(name=cfg['model']['name'])
    optim = AdamW(model.parameters(), **cfg['optim'])
    scheduler = get_cosine_schedule_with_warmup(optim, **cfg['scheduler'])
    
    lrs = []

    for epoch in range(50):
        # Record the learning rate before advancing the schedule.
        lr = optim.param_groups[0]['lr']
        lrs.append(lr)

        optim.step()
        scheduler.step()
        
    plt.plot(lrs, marker='o')
    plt.xlabel('Steps')
    plt.ylabel('Learning Rate')
    plt.title('LR Scheduler Plot')
    plt.show()

# Training / Inference

In [None]:
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast

In [None]:
# Replaces the earlier cfg['train'] entry: number of folds plus the maximum
# number of epochs per fold (early stopping may end a fold sooner).
cfg['train'] ={
    'n_folds': 5,
    'n_epochs': 100
}

In [None]:
class StoreLoss:
    """Collect per-epoch loss statistics of one fold and plot them.

    Args:
        fold: Fold number, used in the plot titles.
    """

    def __init__(self, fold):
        self.fold = fold

        # One entry per recorded epoch.
        self.loss_train_mean, self.loss_train_std = [], []
        self.loss_val_mean, self.loss_val_std = [], []

    def get_loss(self, loss_train, loss_val):
        """Record one epoch; each argument is indexable as (mean, std)."""
        histories = (self.loss_train_mean, self.loss_train_std,
                     self.loss_val_mean, self.loss_val_std)
        sources = ((loss_train, 0), (loss_train, 1),
                   (loss_val, 0), (loss_val, 1))

        for history, (stats, position) in zip(histories, sources):
            history.append(stats[position])

    def plot_loss(self):
        """Draw train/val curves of the mean (left) and the std (right)."""
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))

        panels = (
            ('mean', self.loss_train_mean, self.loss_val_mean),
            ('std', self.loss_train_std, self.loss_val_std),
        )

        for axis, (title, train_curve, val_curve) in zip(axes, panels):
            axis.plot(train_curve, marker='o', label='train')
            axis.plot(val_curve, marker='x', label='val')
            axis.set_xlabel('Epoch')
            axis.set_ylabel(f'RMSE ({title})')
            axis.set_title(f'RMSE({title}) vs Epoch at fold {self.fold}')
            axis.legend()

        fig.show()

In [None]:
# Debug only: feed random (mean, std) pairs into StoreLoss and plot them.
if DEBUG:
    store = StoreLoss(fold=0)
    
    for epoch in range(10):
        loss_train = np.random.rand(2)
        loss_val = np.random.rand(2)
        
        store.get_loss(loss_train, loss_val)
    
    store.plot_loss()

In [None]:
def train_fn(model, dl, criterion, optim, scheduler):
    """Train the model for one pass over ``dl`` using mixed precision.

    Args:
        model: Network to train; moved to the global ``device``.
        dl: DataLoader yielding ``(inputs_dict, targets_dict)`` batches.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        optim: Optimizer updating the model parameters.
        scheduler: Not used inside this function; the caller steps it.

    Returns:
        Tuple ``(mean, std)`` of the per-batch losses.
    """
    amp_scaler = GradScaler()

    batch_losses = []
    running_total = 0

    model.train()
    model.to(device)

    bar = tqdm(dl, desc='train')

    for step, batch in enumerate(bar):
        optim.zero_grad()

        features, labels = batch[0], batch[1]
        inputs = {name: tensor.to(device) for name, tensor in features.items()}
        targets = labels['target'].to(device)

        with autocast():
            loss = criterion(model(**inputs), targets)

        # Scale the loss before back-propagation (gradient scaling for AMP).
        amp_scaler.scale(loss).backward()

        batch_rmse = loss.item()
        batch_losses.append(batch_rmse)
        running_total += batch_rmse

        bar.set_postfix({'RMSE(batch)': batch_rmse,
                         'RMSE(ave)': running_total / (step + 1),
                         'lr': optim.param_groups[0]['lr']})

        amp_scaler.step(optim)
        amp_scaler.update()

    return np.mean(batch_losses), np.std(batch_losses)

def val_fn(model, dl):
    """Evaluate the model on ``dl`` and report the RMSE.

    The overall RMSE is rebuilt from the per-batch RMSE values, weighting
    each batch by the number of targets it actually contains, so a smaller
    last batch is not over-weighted.

    Args:
        model: Network to evaluate; moved to the global ``device``.
        dl: DataLoader yielding ``(inputs_dict, targets_dict)`` batches.

    Returns:
        Tuple ``(overall_rmse, std_of_batch_rmse)``.
    """
    loss_val = []
    loss_total = 0

    squared_error_sum = 0.0
    n_targets = 0

    model.eval()
    model.to(device)

    progress_bar = tqdm(dl, desc='val')

    with torch.no_grad():
        for i, data in enumerate(progress_bar):
            inputs = {key: value.to(device) for key, value in data[0].items()}
            targets = data[1]['target'].to(device)

            with autocast():
                outputs = model(**inputs)
                loss = CommonLitMetric(outputs, targets)

            batch_rmse = loss.item()
            batch_count = targets.numel()

            loss_val.append(batch_rmse)
            loss_total += batch_rmse

            # rmse**2 * count == sum of squared errors of this batch.
            squared_error_sum += batch_rmse ** 2 * batch_count
            n_targets += batch_count

            progress_bar.set_postfix({'RMSE(batch)': batch_rmse, 'RMSE(ave)': loss_total / (i+1)})

    # max(..., 1) keeps an empty loader from dividing by zero.
    rmse_overall = np.sqrt(squared_error_sum / max(n_targets, 1))
    print('RMSE for validation set overall: ', rmse_overall)

    return rmse_overall, np.std(loss_val)

In [None]:
def run_one_epoch(model, train_dl, val_dl, criterion, optim, scheduler):
    """Run one training pass followed by one validation pass.

    Args:
        model: Network to train and evaluate.
        train_dl: DataLoader for the training pass.
        val_dl: DataLoader for the validation pass.
        criterion: Loss callable handed to ``train_fn``.
        optim: Optimizer handed to ``train_fn``.
        scheduler: Scheduler handed to ``train_fn``.

    Returns:
        Tuple ``(loss_train, loss_val)`` as returned by ``train_fn`` and
        ``val_fn`` respectively.
    """
    loss_train = train_fn(model=model,
                          dl=train_dl,
                          criterion=criterion,
                          optim=optim,
                          scheduler=scheduler)
    loss_val = val_fn(model=model, dl=val_dl)

    return loss_train, loss_val

In [None]:
def get_dls_for_n_fold(df, fold, tokenizer):
    """Build the training and validation DataLoaders for one fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set, all
    other rows the training set. Both subsets get a fresh 0..n-1 index.

    Args:
        df: DataFrame with a ``fold`` column plus the dataset's columns.
        fold: Fold number held out for validation.
        tokenizer: Tokenizer handed to ``CommonLitDataset``.

    Returns:
        Tuple ``(train_dl, val_dl)``.
    """
    held_out = df.fold == fold
    max_len = cfg['tokenizer']['max_length']

    subsets = {
        'train': df.loc[~held_out].reset_index(drop=True),
        'val': df.loc[held_out].reset_index(drop=True),
    }
    datasets = {
        split: CommonLitDataset(frame, tokenizer=tokenizer, max_len=max_len)
        for split, frame in subsets.items()
    }

    train_dl = DataLoader(datasets['train'], **cfg['dl_train'])
    val_dl = DataLoader(datasets['val'], **cfg['dl_val'])

    return train_dl, val_dl

In [None]:
# https://qiita.com/ku_a_i/items/ba33c9ce3449da23b503

class EarlyStopping:
    """Track the best loss, checkpoint on improvement and signal when to stop.

    Args:
        patience: ``stop`` is set once the number of consecutive calls
            without improvement exceeds this value.
        seq: Accepted but not used.
    """

    def __init__(self, patience=2, seq=False):
        self.best_score = None
        self.counter = 0
        self.stop = False
        self.patience = patience

    def __call__(self, loss, model, optim, cfg, path):
        """Update the state with a new loss; save a checkpoint if it is the best."""
        is_first_call = self.best_score is None

        if is_first_call or loss < self.best_score:
            if not is_first_call:
                print(f'Loss decreased {self.best_score} -> {loss}.')
                self.counter = 0
            self.best_score = loss
            self.save_checkpoint(model, optim, cfg, path)
            return

        # No improvement this time.
        self.counter += 1
        if self.counter > self.patience:
            self.stop = True

    def save_checkpoint(self, model, optim, cfg, path):
        """Write the model weights and the configuration to ``path``."""
        checkpoint = {'model': model.state_dict(), 'cfg': cfg}
        torch.save(checkpoint, path)

# Run

In [None]:
# Show the complete configuration that the training run below will use.
pprint(cfg)

In [None]:
def main():
    """Train one model per fold with early stopping.

    Reads the globals ``SEED``, ``TRAIN`` and ``cfg``. For every fold it
    builds the loaders, model, optimizer and scheduler, runs up to
    cfg['train']['n_epochs'] epochs, lets EarlyStopping save the best
    checkpoint to ``CommonLitBERT_fold{fold}.tar`` and plots the loss curves.
    """
    seed_everything(SEED)
        
    df = pd.read_csv(TRAIN)
    get_bin_stratified(df, n_splits=cfg['train']['n_folds'])

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])
    
    for fold in range(cfg['train']['n_folds']):
        print('Fold:', fold)
        # Fresh loss history and early-stopping state for every fold.
        store = StoreLoss(fold=fold)
        es = EarlyStopping()

        train_dl, val_dl = get_dls_for_n_fold(df, fold, tokenizer)

        model = CommonLitBERT(name=cfg['model']['name'])
        criterion = CommonLitMetric
        optim = AdamW(model.parameters(), **cfg['optim'])
        scheduler = get_cosine_schedule_with_warmup(optim, **cfg['scheduler'])
        # If the schedule starts at a learning rate of exactly 0, advance it
        # once -- presumably so the first epoch does not train with lr 0.
        if optim.param_groups[0]['lr']==0:
            optim.step()
            scheduler.step()

        inputs = {'model': model,
                  'train_dl': train_dl,
                  'val_dl': val_dl,
                  'criterion': criterion,
                  'optim': optim,
                  'scheduler': scheduler}

        for epoch in range(cfg['train']['n_epochs']):
            loss_train, loss_val = run_one_epoch(**inputs)
            
            store.get_loss(loss_train, loss_val)
            
            # loss_val[0] is the overall validation RMSE; a checkpoint is
            # written whenever it improves.
            es(loss_val[0], model, optim, cfg, path=f'CommonLitBERT_fold{fold}.tar')
            if es.stop:
                print('Early Stop !')
                print()
                break

            # The learning-rate schedule advances once per epoch.
            scheduler.step()
            
        store.plot_loss()
        
        # Drop the references before the next fold creates a new model.
        del model, optim
        gc.collect()

In [None]:
%%time
main()

# Calculate CV Score

In [None]:
# Prefix of the per-fold checkpoint files ("<MODEL_NAME>_fold<k>.tar").
MODEL_NAME = 'CommonLitBERT'

def val_fn_cv(model, dl):
    """Predict every sample of ``dl`` and return the predictions.

    Args:
        model: Network to evaluate; moved to the global ``device``.
        dl: DataLoader yielding ``(inputs_dict, targets_dict)`` batches; only
            the inputs are used.

    Returns:
        float32 NumPy array with the model outputs of all batches
        concatenated along the first axis, in loader order.
    """
    preds = []

    model.eval()
    model.to(device)

    progress_bar = tqdm(dl, desc='cv')

    with torch.no_grad():
        for data in progress_bar:
            inputs = {key: value.to(device) for key, value in data[0].items()}

            with autocast():
                outputs = model(**inputs)

            # Autocast may yield half precision; store float32 predictions.
            preds.append(outputs.detach().float().cpu().numpy())

    if not preds:
        # np.concatenate raises on an empty list.
        return np.empty((0, 1), dtype=np.float32)

    return np.concatenate(preds)

def main_cv():
    """Compute out-of-fold predictions with the saved per-fold checkpoints.

    Rebuilds the same fold split as training (same seed), loads
    ``<MODEL_NAME>_fold<k>.tar`` for every fold and predicts that fold's
    validation rows.

    Returns:
        The training DataFrame with the extra columns ``bin``, ``fold`` and
        ``oof`` (out-of-fold prediction of every row).
    """
    seed_everything(SEED)

    df = pd.read_csv(TRAIN)
    get_bin_stratified(df, n_splits=cfg['train']['n_folds'])
    df['oof'] = np.nan

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

    for fold in range(cfg['train']['n_folds']):
        # Only the validation loader is needed here.
        _, val_dl = get_dls_for_n_fold(df, fold, tokenizer)

        model = CommonLitBERT(name=cfg['model']['name'])
        PATH = MODEL_NAME + f'_fold{fold}.tar'
        # NOTE: torch.load unpickles the file; only load checkpoints written
        # by this notebook's own training run.
        saved_contents = torch.load(PATH, map_location=device)

        model.load_state_dict(saved_contents['model'])
        if fold==0:
            cfg_for_train = saved_contents['cfg']
            print('Configuration for training:')
            print()
            pprint(cfg_for_train)
            print()

        print('Fold:', fold)

        preds = val_fn_cv(model=model, dl=val_dl)
        # Flatten the (n, 1) model output to one value per row before writing
        # it into the single 'oof' column; the validation loader is not
        # shuffled, so the order matches the selected rows.
        df.loc[df.fold==fold, 'oof'] = np.asarray(preds, dtype=np.float64).reshape(-1)

        # Release this fold's model before the next one is created.
        del model, saved_contents
        gc.collect()

    return df

In [None]:
from sklearn.metrics import mean_squared_error

# Collect the out-of-fold predictions and keep them for later analysis.
df = main_cv()
df.to_csv('oof_df.csv', index=False)

# CV score = RMSE between the targets and the out-of-fold predictions.
mse = mean_squared_error(df['target'], df['oof'])
rmse = np.sqrt(mse)
print('CV score: ', rmse)

# Inference Part is here !
https://www.kaggle.com/atsushiiwasaki/commonlit-bert-stratified-k-fold-baseline-infer