# Pytorch BERT + Stratified K Fold [inference]

## Introduction

This is the inference part of the BERT baseline.  
Training part is here:  
https://www.kaggle.com/atsushiiwasaki/commonlit-bert-stratified-k-fold-baseline-train

In this notebook, the CV score is calculated from out-of-fold (OOF) predictions made with the weights saved by the training notebook.

Thanks.

## Contents
1. Libraries
1. Configuration
1. Data
1. Model
1. CV/Inference

# Libraries

In [None]:
import os
import random
import gc
from pprint import pprint
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style='darkgrid')

from sklearn.model_selection import StratifiedKFold

%matplotlib inline

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig

# Configuration

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and sets the cuDNN flags to
    ``benchmark=True`` / ``deterministic=False``.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # NOTE(review): benchmark on / deterministic off favours speed over
    # bit-exact GPU reproducibility -- confirm this is intended.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = True
    cudnn.deterministic = False

In [None]:
# Set to True to run the sanity-check cells guarded by `if DEBUG:`.
DEBUG = False

# When True, the out-of-fold CV score is computed before test inference.
CV = True

# Competition CSV files, given as relative paths into the input directory.
TRAIN = '../input/commonlitreadabilityprize/train.csv'
TEST = '../input/commonlitreadabilityprize/test.csv'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

# Global seed; it is also used as the StratifiedKFold random_state.
SEED = 28
seed_everything(SEED)

In [None]:
### Model Architecture ###
# Each constant is a local directory holding a pretrained checkpoint.
# The same constants are compared against `name` inside the model class
# to choose the pooling strategy.
# BERT
BERT = '../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'

# Distilbert
DISTILBERT = '../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased'

# Roberta
ROBERTA = '../input/huggingface-roberta-variants/roberta-base/roberta-base'


### Trained Weights ###
# Directory with the per-fold checkpoints, read as
# `<TRAINED>/<MODEL_NAME>_fold<k>.tar`.
TRAINED = '../input/commonlit-bert-stratified-k-fold-baseline-train'
MODEL_NAME = 'CommonLitBERT'



# Global configuration dictionary, filled in section by section.
cfg ={}

# Architecture used for both the tokenizer and the model in this run.
ARCH_PATH = ROBERTA
cfg['train'] = {'n_folds': 5}

# Data

## Tokenizer

In [None]:
# Tokenizer settings: the tokenizer is loaded from the same path as the
# model architecture; `max_length` is the pad/truncate length handed to
# `encode_plus`.
cfg['tokenizer'] ={'name': ARCH_PATH, 
                   'max_length': 210}

tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

In [None]:
if DEBUG:
    # Tokenize one training excerpt and dump every field the tokenizer returns.
    df = pd.read_csv(TRAIN)
    text = df.loc[SEED, 'excerpt']  # SEED is reused here as a row label
    print('Text Length ', len(text.split(' ')))
    print()

    text_tokenized = tokenizer.encode_plus(
        text,
        truncation=True,
        max_length=cfg['tokenizer']['max_length'],
        padding='max_length',
        add_special_tokens=True,
    )

    for field, contents in text_tokenized.items():
        # One print per field: "<name> <type>", the value, then a blank line.
        print(f'{field} {type(contents)}\n{contents}\n')

## Dataset

In [None]:
class CommonLitDataset(Dataset):
    """Map-style dataset that tokenizes excerpts on the fly.

    Each item is an ``(inputs, targets)`` pair of dicts:

    * ``inputs['ids']`` / ``inputs['mask']``: ``LongTensor`` built from the
      tokenizer's ``input_ids`` and ``attention_mask``.
    * ``inputs['token_type_ids']``: the tokenizer's segment ids when it
      returns them, otherwise a scalar float placeholder so that every
      sample exposes the same keys.
    * ``targets['target']``: float32 tensor of shape ``(1,)``.

    Args:
        df: DataFrame with ``excerpt`` and ``target`` columns.
        tokenizer: Object exposing a Hugging Face style ``encode_plus``.
        max_len: ``max_length`` passed to the tokenizer for padding and
            truncation.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Positional access: label-based ``.loc`` only matches the integer
        # position when the frame carries a fresh RangeIndex.
        text = self.df['excerpt'].iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # Decide from the tokenizer output rather than from module-level
        # configuration, so the dataset stays consistent with whatever
        # tokenizer it was given.
        token_type_ids = inputs.get('token_type_ids')
        if token_type_ids is None:
            token_type_ids = 1.

        target = float(self.df['target'].iloc[index])

        return {
            'ids': torch.LongTensor(ids),
            'mask': torch.LongTensor(mask),
            'token_type_ids': torch.tensor(token_type_ids),
        }, {
            'target': torch.tensor([target], dtype=torch.float32),
        }

In [None]:
if DEBUG:
    # Build the dataset on the full training frame and inspect one sample.
    ds = CommonLitDataset(
        df=df,
        tokenizer=tokenizer,
        max_len=cfg['tokenizer']['max_length'],
    )
    assert len(df) == len(ds)

    ds = iter(ds)
    inputs, targets = next(ds)

    # Dump the input tensors first, then the target tensors.
    for group in (inputs, targets):
        for k, v in group.items():
            print(k, v.dtype)
            print(v)
            print()

## Dataloader

In [None]:
# DataLoader keyword arguments for the training and validation loaders.
# The two configurations are identical except that only training shuffles.
cfg.update({
    split: {
        'batch_size': 8 if device.type == 'cpu' else 16,
        'shuffle': shuffle,
        'num_workers': os.cpu_count(),
        'pin_memory': True,
    }
    for split, shuffle in (('dl_train', True), ('dl_val', False))
})

In [None]:
if DEBUG:
    # Wrap the dataset in a training DataLoader and show one batch's id shape.
    ds = CommonLitDataset(
        df=df,
        tokenizer=tokenizer,
        max_len=cfg['tokenizer']['max_length'],
    )
    dl = DataLoader(ds, **cfg['dl_train'])

    for data in dl:
        batch_ids = data[0]['ids']
        print(batch_ids.detach().cpu().shape)
        break

# Model

In [None]:
# The model is built from the same architecture path as the tokenizer.
cfg['model'] = {'name': ARCH_PATH}

In [None]:
class CommonLitBERT(nn.Module):
    """Pretrained transformer encoder followed by a one-output linear head.

    The sentence representation depends on ``name``:

    * ``BERT`` / ``ROBERTA``: the second element returned by the encoder
      (the pooled output in the Hugging Face implementations).
    * anything else (including ``DISTILBERT``): the first token of the last
      hidden state, passed through ``dense`` + ``tanh``.

    The representation then goes through LayerNorm, dropout and ``fc``.

    Args:
        name: Path or identifier given to ``AutoModel.from_pretrained``; it
            is compared with the module-level ``BERT``, ``DISTILBERT`` and
            ``ROBERTA`` constants.
        dropout: When True (default) dropout with ``p=0.2`` is applied
            before the head; when False dropout is disabled.
    """

    def __init__(self, name, dropout=True):
        super().__init__()
        self.bert = AutoModel.from_pretrained(name)
        self.name = name

        if name == BERT:
            self.in_features = self.bert.pooler.dense.out_features
        elif name == DISTILBERT:
            self.in_features = self.bert.transformer.layer[5].output_layer_norm.normalized_shape[0]
        elif name == ROBERTA:
            self.in_features = self.bert.pooler.dense.out_features
        else:
            # Read the width from the loaded config when it is exposed and
            # keep the previous default of 768 otherwise.
            self.in_features = getattr(self.bert.config, 'hidden_size', 768)

        self.fc = nn.Linear(self.in_features, 1)
        self.dense = nn.Linear(self.in_features, self.in_features)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(p=0.2 if dropout else 0.0)
        self.layer_norm = nn.LayerNorm(self.in_features)

    def forward(self, ids, mask, token_type_ids):
        """Return the model output for a batch.

        Args:
            ids: Token id tensor passed to the encoder.
            mask: Attention mask tensor.
            token_type_ids: Segment ids; only forwarded to the encoder when
                ``name`` is ``BERT``.

        Returns:
            Output of the final ``nn.Linear(in_features, 1)`` layer.
        """
        if self.name == BERT:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  token_type_ids=token_type_ids,
                                  return_dict=False)
        elif self.name == ROBERTA:
            # token_type_ids is deliberately not forwarded on this path.
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  return_dict=False)
        else:
            # DistilBERT path, also used for any other encoder so that
            # ``output`` is always defined: pool the first token by hand.
            encoder_outputs = self.bert(ids,
                                        attention_mask=mask,
                                        return_dict=False)
            first_token_tensor = encoder_outputs[0][:, 0]
            output = self.activation(self.dense(first_token_tensor))

        output = self.layer_norm(output)
        output = self.dropout(output)
        output = self.fc(output)
        return output

In [None]:
if DEBUG:
    # Smoke test: build the model and run one forward pass on a batch from
    # the DataLoader `dl` created in the DEBUG dataloader cell.
    model = CommonLitBERT(name=cfg['model']['name'])
    data = next(iter(dl))
    inputs = data[0]  # first element of the batch is the dict of inputs
    outputs = model(**inputs)
    print(outputs)

    # Drop the throwaway model and run the garbage collector.
    del model
    gc.collect()

# CV / Inference

In [None]:
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast

In [None]:
# Replaces the earlier cfg['train'] entry. 'n_folds' drives the fold loops
# in main_cv / main_infer.
# NOTE(review): 'n_epochs' is not read by any inference code visible in
# this notebook -- presumably carried over from the training notebook.
cfg['train'] ={
    'n_folds': 5,
    'n_epochs': 100
}

In [None]:
def get_bin_stratified(df, n_bins=20, n_splits=5):
    """Add stratified fold assignments to ``df`` in place.

    The continuous ``target`` column is cut into ``n_bins`` equal-width
    bins, and those bins are used as the stratification label for a
    shuffled ``StratifiedKFold`` seeded with the module-level ``SEED``.

    Args:
        df: DataFrame with ``id`` and ``target`` columns. Gains a ``bin``
            column (bin label per row) and an int8 ``fold`` column (the
            fold in which the row is held out).
        n_bins: Number of bins used to discretize ``target``.
        n_splits: Number of folds.

    Returns:
        None; ``df`` is modified in place.
    """
    df['bin'] = pd.cut(df.target, n_bins, labels=list(range(n_bins)))

    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    # ``split`` yields positional indices, so collect the assignments in a
    # plain array instead of writing through label-based ``.loc``; this
    # stays correct even when ``df`` does not have a 0..n-1 index.
    folds = np.full(len(df), -1, dtype='int8')
    for fold, (_, idx_val) in enumerate(skf.split(df.id, y=df.bin)):
        folds[idx_val] = fold

    df['fold'] = folds

In [None]:
def val_fn_cv(model, dl):
    """Run ``model`` over ``dl`` in eval mode and collect its outputs.

    Args:
        model: Module called as ``model(**inputs)``; it is moved to the
            module-level ``device``.
        dl: Iterable of batches whose first element is a dict of input
            tensors.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)

    preds = []
    with torch.no_grad():
        for batch in tqdm(dl, desc='cv'):
            # Only the inputs are needed for prediction; the targets in
            # ``batch[1]`` are left on the host.
            inputs = {key: value.to(device) for key, value in batch[0].items()}

            with autocast():
                outputs = model(**inputs)

            preds.append(outputs.detach().cpu().numpy())

    return np.concatenate(preds)

In [None]:
def get_dls_for_n_fold(df, fold, tokenizer):
    """Build the training and validation DataLoaders for one fold.

    Rows whose ``fold`` column equals ``fold`` form the validation split;
    all other rows form the training split. Both splits get a fresh
    0..n-1 index before being wrapped in a ``CommonLitDataset``.

    Args:
        df: DataFrame with a ``fold`` column.
        fold: Fold number to hold out.
        tokenizer: Tokenizer handed to both datasets.

    Returns:
        Tuple ``(train_dl, val_dl)`` configured from ``cfg['dl_train']``
        and ``cfg['dl_val']``.
    """
    held_out = df.fold == fold
    max_len = cfg['tokenizer']['max_length']

    def build_dataset(row_mask):
        subset = df.loc[row_mask].reset_index(drop=True)
        return CommonLitDataset(subset, tokenizer=tokenizer, max_len=max_len)

    train_dl = DataLoader(build_dataset(~held_out), **cfg['dl_train'])
    val_dl = DataLoader(build_dataset(held_out), **cfg['dl_val'])

    return train_dl, val_dl

In [None]:
# Show the configuration that the CV and inference runs will use.
pprint(cfg)

In [None]:
def main_cv():
    """Compute out-of-fold predictions on the training set.

    For each fold the matching checkpoint is loaded from ``TRAINED`` and
    run on that fold's validation rows; the predictions are written to the
    ``oof`` column.

    Returns:
        The training DataFrame with ``bin``, ``fold`` and ``oof`` columns
        added.
    """
    seed_everything(SEED)

    df = pd.read_csv(TRAIN)
    get_bin_stratified(df, n_splits=cfg['train']['n_folds'])
    df['oof'] = np.nan

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

    for fold in range(cfg['train']['n_folds']):
        # Only the validation loader is used for scoring.
        _train_dl, val_dl = get_dls_for_n_fold(df, fold, tokenizer)

        model = CommonLitBERT(name=cfg['model']['name'])
        checkpoint_path = os.path.join(TRAINED, MODEL_NAME + f'_fold{fold}.tar')
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        saved_contents = torch.load(checkpoint_path, map_location=device)

        model.load_state_dict(saved_contents['model'])
        if fold == 0:
            # The checkpoint also stores the training configuration.
            cfg_for_train = saved_contents['cfg']
            print('Configuration for training:')
            print()
            pprint(cfg_for_train)
            print()

        print('Fold:', fold)

        preds = val_fn_cv(model=model, dl=val_dl)

        # The validation loader does not shuffle, so predictions come back
        # in the same order as the masked rows. Flatten to 1-D so the
        # assignment does not depend on how pandas treats an (n, 1) array.
        df.loc[df.fold == fold, 'oof'] = np.asarray(preds).reshape(-1)

    return df

In [None]:
def main_infer():
    """Predict the test set by averaging the per-fold models.

    Returns:
        The test DataFrame with a ``target`` column holding the mean of
        the predictions of all ``cfg['train']['n_folds']`` checkpoints.
    """
    seed_everything(SEED)

    df = pd.read_csv(TEST)
    # The dataset reads a 'target' column from every row, so give the test
    # frame a dummy one.
    df['target'] = 0.

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

    # The test data is the same for every fold, so build the dataset and
    # loader once instead of once per fold.
    test_ds = CommonLitDataset(
        df,
        tokenizer=tokenizer,
        max_len=cfg['tokenizer']['max_length'],
    )
    test_dl = DataLoader(test_ds, **cfg['dl_val'])

    n_folds = cfg['train']['n_folds']
    pred_sum = np.zeros(len(df), dtype=np.float64)

    for fold in range(n_folds):
        print('Fold:', fold)

        model = CommonLitBERT(name=cfg['model']['name'])
        checkpoint_path = os.path.join(TRAINED, MODEL_NAME + f'_fold{fold}.tar')
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        state_dict = torch.load(checkpoint_path, map_location=device)['model']
        model.load_state_dict(state_dict)

        preds = val_fn_cv(model=model, dl=test_dl)
        # Flatten explicitly; the previous np.concatenate(preds) only worked
        # because it iterated the rows of an (N, 1) array.
        pred_sum += np.asarray(preds, dtype=np.float64).reshape(-1)

    df['target'] = pred_sum / n_folds
    return df

In [None]:
%%time
if CV:
    # Imported here because it is only needed when the CV score is computed.
    from sklearn.metrics import mean_squared_error

    # Compute the out-of-fold predictions and save them with the folds.
    df = main_cv()
    df.to_csv('oof_df.csv', index=False)

    # The reported CV score is the RMSE between targets and OOF predictions.
    mse = mean_squared_error(df['target'], df['oof'])
    rmse = np.sqrt(mse)
    print('CV score: ', rmse)

In [None]:
# Predict the test set and write only the `id` and `target` columns to
# the submission file.
df = main_infer()
df = df[['id', 'target']]
df.to_csv('submission.csv', index=False)

In [None]:
# Quick check of the submission frame's columns, dtypes and non-null counts.
df.info()

# Training Part is Here !
https://www.kaggle.com/atsushiiwasaki/commonlit-bert-stratified-k-fold-baseline-train