# Libraries

In [None]:
import os
import random
import gc
from pprint import pprint
from tqdm import tqdm
import more_itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style='darkgrid')

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Sampler
from torch.utils.data import RandomSampler

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import DataCollatorWithPadding

%matplotlib inline

# Configuration

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and leaves cuDNN with ``benchmark=True``
    and ``deterministic=False``.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The generators are independent of each other, so the seeding order is irrelevant.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

In [None]:
DEBUG = False

# Input files.
# TRAIN points at the out-of-fold export, not the raw competition train.csv:
# get_stratified_from_oof() selects an `oof` column from the training frame.
# (An earlier assignment of the raw train.csv path to TRAIN was overwritten on
# the very next line and therefore never used; it has been removed.)
TRAIN = '../input/clrp-robertabase-from-colab/oof_df_for_train.csv'
TEST = '../input/commonlitreadabilityprize/test.csv'

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

SEED = 567
seed_everything(SEED)

In [None]:
# --- Backbone checkpoints (directories under ../input) ---
BERT = '../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'
DISTILBERT = '../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased'
ROBERTA = '../input/huggingface-roberta-variants/roberta-base/roberta-base'
ROBERTA_LARGE = '../input/huggingface-roberta-variants/roberta-large/roberta-large'

# Backbone used by the tokenizer and model configuration below.
ARCH_PATH = ROBERTA

# --- Fine-tuned fold weights: <TRAINED>/<MODEL_NAME>_fold<k>.tar ---
TRAINED = '../input/clrp-robertabase-from-colab'
MODEL_NAME = 'CLRPModelColab'

# --- Run switches ---
CV = True     # run the out-of-fold evaluation cell
POST = True   # fit / apply the linear post-processing step

test_df = pd.read_csv(TEST)

# Central configuration dictionary; later cells add further sections to it.
cfg = dict(
    train=dict(n_folds=5),
    TTA=dict(use=False, n_tta=4),
)

# Data

## Tokenizer

In [None]:
# Tokenizer settings: `name` is the checkpoint loaded by get_tokenizer();
# `params` are the keyword arguments CLRPDataset passes to `encode_plus`.
cfg['tokenizer'] = dict(
    name=ARCH_PATH,
    params=dict(
        add_special_tokens=True,
        padding='longest',
        max_length=258,
        truncation=True,
        return_special_tokens_mask=True,
    ),
)

def get_tokenizer():
    """Load the tokenizer for the checkpoint named in ``cfg['tokenizer']['name']``."""
    checkpoint = cfg['tokenizer']['name']
    return AutoTokenizer.from_pretrained(checkpoint)

In [None]:
if DEBUG:
    # Tokenize a single training excerpt and print every field of the encoding.
    df = pd.read_csv(TRAIN)
    text = df.loc[SEED, 'excerpt']  # SEED doubles as an arbitrary row label
    print('Text Length ', len(text.split(' ')))
    print()

    # Build the tokenizer locally: the notebook-level `tokenizer` is only
    # created in a later cell, so using it here fails on Restart & Run All.
    debug_tokenizer = get_tokenizer()
    text_tokenized = debug_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        padding='max_length',
        # `max_length` is stored under the 'params' sub-dict of the tokenizer config.
        max_length=cfg['tokenizer']['params']['max_length'],
        truncation=True
    )

    for key, value in text_tokenized.items():
        print(key, type(value))
        print(value)
        print()

## Dataset

In [None]:
def clean_text(text):
    """Return ``text`` with every newline character removed (nothing is inserted in its place)."""
    return ''.join(text.split('\n'))

class CLRPDataset(Dataset):
    """Map-style dataset yielding one tokenized excerpt per row of ``df``.

    ``df`` must provide the columns ``excerpt``, ``target`` and
    ``standard_error``. Each item is the tokenizer's ``encode_plus`` output
    (called with ``cfg['tokenizer']['params']``) extended with the keys
    ``'target'`` and ``'se'``.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        # Keep the needed columns as plain Python lists for index-based access.
        self.texts, self.targets, self.se = (
            df[column].tolist() for column in ('excerpt', 'target', 'standard_error')
        )
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of excerpts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the cleaned excerpt at ``index`` and attach its target and standard error."""
        cleaned = clean_text(self.texts[index])
        encoding = self.tokenizer.encode_plus(cleaned, **cfg['tokenizer']['params'])
        encoding['target'] = self.targets[index]
        encoding['se'] = self.se[index]
        return encoding

## Dataloader

In [None]:
# Collator configuration per phase.
# - train: masked-language-model collator with 10% masking.
# - val:   plain padding; when TTA is enabled, the MLM collator with 3% masking
#          is used instead.
cfg['collator'] = {
    'train': {
        'name': 'MLM',
        'params': {'mlm_probability': 0.1},
    },
    'val': {
        'name': 'MLM' if cfg['TTA']['use'] else 'padding',
        # Only the MLM collator takes parameters. The former expression built
        # {None: 0.03} when TTA was off; a None key cannot be **-unpacked into
        # keyword arguments, so an empty dict is used for the padding case.
        'params': {'mlm_probability': 0.03} if cfg['TTA']['use'] else {},
    },
}

def get_collator(tokenizer, phase='train'):
    """Build the batch collator configured under ``cfg['collator'][phase]``.

    ``'MLM'`` yields a ``DataCollatorForLanguageModeling`` created with the
    phase's ``params``; ``'padding'`` yields a ``DataCollatorWithPadding``.
    Any other name yields ``None``.
    """
    phase_cfg = cfg['collator'][phase]
    kind = phase_cfg['name']

    if kind == 'padding':
        return DataCollatorWithPadding(tokenizer=tokenizer)
    if kind == 'MLM':
        return DataCollatorForLanguageModeling(tokenizer=tokenizer, **phase_cfg['params'])
    return None

In [None]:
class SmartBatchingSampler(Sampler):
    """Sampler that puts samples of similar length into the same batch.

    Indices are sorted by ``len(sample)`` and cut into consecutive chunks of
    ``batch_size``. On every iteration the chunks are shuffled among each
    other, except the final (possibly shorter) chunk, which stays last; the
    indices are then emitted chunk by chunk.

    Args:
        data_source: iterable of sized items; only their lengths are used.
        batch_size: number of indices per chunk. ``None`` puts everything
            into a single chunk.
    """

    def __init__(self, data_source, batch_size):
        try:
            super(SmartBatchingSampler, self).__init__(data_source)
        except TypeError:
            # Fallback for torch versions whose Sampler.__init__ takes no
            # `data_source` argument (it is deprecated in torch 2.x).
            super(SmartBatchingSampler, self).__init__()

        sample_lengths = [len(seq) for seq in data_source]
        argsort_inds = np.argsort(sample_lengths)
        if batch_size is None:
            batch_size = max(len(argsort_inds), 1)
        self.batches = [
            list(argsort_inds[start:start + batch_size])
            for start in range(0, len(argsort_inds), batch_size)
        ]
        # Defined up front so len(sampler) and `backsort_inds` also work before
        # the first iteration (they used to raise AttributeError).
        self._inds = self._flatten_batches()
        self._backsort_inds = None

    def _flatten_batches(self):
        """Return the indices of all batches as one flat list, in batch order."""
        return [index for batch in self.batches for index in batch]

    def __iter__(self):
        if self.batches:
            # Keep the last batch in place and shuffle the others in place.
            last_batch = self.batches.pop(-1)
            np.random.shuffle(self.batches)
            self.batches.append(last_batch)
        self._inds = self._flatten_batches()
        # The emission order just changed, so a cached inverse permutation is stale.
        self._backsort_inds = None
        yield from self._inds

    def __len__(self):
        """Total number of indices emitted per iteration."""
        return len(self._inds)

    @property
    def backsort_inds(self):
        """Positions that restore the original sample order for the most recent emission order."""
        if self._backsort_inds is None:
            self._backsort_inds = np.argsort(self._inds)
        return self._backsort_inds
    
def get_SmartBatchingSampler(df, batch_size):
    """Create a ``SmartBatchingSampler`` over the excerpts of ``df``.

    Each excerpt is split on single spaces, so the sampler's notion of length
    is the number of space-separated pieces rather than tokenizer tokens.
    """
    pieces_per_excerpt = df['excerpt'].map(lambda excerpt: excerpt.split(' '))
    return SmartBatchingSampler(data_source=pieces_per_excerpt, batch_size=batch_size)

def get_RandomSampler(df):
    """Create a ``RandomSampler`` with one index per excerpt in ``df``."""
    pieces_per_excerpt = df['excerpt'].map(lambda excerpt: excerpt.split(' '))
    return RandomSampler(data_source=pieces_per_excerpt)

In [None]:
# Sampler used for the training DataLoader; see get_sampler() for the names it recognises.
cfg['sampler'] = dict(name='SmartBatchSampler', params=dict())

def get_sampler(df=None, batch_size=None):
    """Return the sampler selected by ``cfg['sampler']['name']``.

    Only ``'SmartBatchSampler'`` is recognised; any other name yields ``None``.
    """
    if cfg['sampler']['name'] != 'SmartBatchSampler':
        return None
    return get_SmartBatchingSampler(df, batch_size)

In [None]:
tokenizer = get_tokenizer()

# DataLoader keyword arguments per phase. Both phases share the same settings
# and differ only in the collator built for them. `shuffle` is off for both;
# get_dls_for_n_fold() decides how training batches are ordered.
cfg['dl'] = {
    phase: {
        'batch_size': 8 if device.type == 'cpu' else 16,
        'shuffle': False,
        'collate_fn': get_collator(tokenizer, phase=phase),
        'num_workers': os.cpu_count(),
        'pin_memory': True,
    }
    for phase in ('train', 'val')
}

# Model

In [None]:
# Keyword arguments for CLRPModel(**cfg['model']).
cfg['model'] = dict(
    name=ARCH_PATH,
    path='',
    p=0.2,
    with_se=False,
    attention=True,
)

In [None]:
class Regressor(nn.Module):
    """Regression head: LayerNorm -> Dropout -> Linear.

    Args:
        in_features: size of the last dimension of the input.
        p: dropout probability.
        with_se: when true the linear layer has 2 outputs, otherwise 1.
    """

    def __init__(self, in_features=768, p=0.2, with_se=False):
        super().__init__()
        self.in_features, self.p, self.with_se = in_features, p, with_se

        n_outputs = 2 if with_se else 1
        # Attribute names are part of the state_dict keys; keep them as they are.
        self.LayerNorm = nn.LayerNorm(in_features)
        self.dropout = nn.Dropout(p=p)
        self.fc = nn.Linear(in_features, n_outputs)

    def forward(self, x):
        """Apply normalisation, dropout and the linear projection to ``x``."""
        return self.fc(self.dropout(self.LayerNorm(x)))

class AttentionRegressor(nn.Module):
    """Regression head that attention-pools along dimension 1 before a linear layer.

    A small network (Linear -> Tanh -> Linear -> Softmax over dim 1) scores
    every position; the input is summed along dim 1 with those scores as
    weights and the result is projected to 1 output (2 when ``with_se``).

    Args:
        in_features: size of the last dimension of the input.
        hidden_state: width of the scoring network's hidden layer.
        with_se: when true the final linear layer has 2 outputs, otherwise 1.
    """

    def __init__(self, in_features=768, hidden_state=512, with_se=False):
        super().__init__()
        self.in_features, self.hidden_state, self.with_se = in_features, hidden_state, with_se

        scoring_layers = [
            nn.Linear(in_features, hidden_state),
            nn.Tanh(),
            nn.Linear(hidden_state, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*scoring_layers)
        self.fc = nn.Linear(in_features, 2 if with_se else 1)

        # Re-initialise the weight matrices only; biases keep their default init.
        for layer in (self.fc, self.attention[0], self.attention[2]):
            torch.nn.init.kaiming_normal_(layer.weight)

    def forward(self, x):
        """Return the projection of the attention-weighted sum of ``x`` over dim 1."""
        weights = self.attention(x)
        pooled = (weights * x).sum(dim=1)
        return self.fc(pooled)

class CLRPModel(nn.Module):
    """Transformer backbone (loaded with ``AutoModel``) topped with a regression head.

    The head is an ``AttentionRegressor`` fed with ``last_hidden_state`` when
    ``attention`` is true, otherwise a ``Regressor`` fed with the pooled
    ``output``. ``forward`` returns ``(last_hidden_state, output)``.
    """
    
    def __init__(self, name, p=0.2, with_se=False, path=None, attention=False):
        """Build the backbone and the regression head.

        Args:
            name: checkpoint passed to ``AutoConfig`` / ``AutoModel``; it is
                also compared with the module-level BERT, DISTILBERT, ROBERTA
                and ROBERTA_LARGE constants to select architecture branches.
            p: dropout probability of the plain ``Regressor`` head.
            with_se: forwarded to the head (2 outputs instead of 1).
            path: optional checkpoint file; when truthy,
                ``_load_pretrained_weights`` is called.
            attention: use the ``AttentionRegressor`` head.
        """
        super(CLRPModel, self).__init__()
        
        self.name = name
        self.path = path
        self.p = p
        self.with_se = with_se
        self.attention = attention
        
        config = AutoConfig.from_pretrained(name)  # This setting is from https://www.kaggle.com/andretugan/lightweight-roberta-solution-in-pytorch
        config.update({'output_hidden_states': False,
                       'max_position_embeddings': 514,
                       'hidden_dropout_prob': 0.0,
                       'attention_probs_dropout_prob': 0.1,  # this one looks questionable too
                       'layer_norm_eps': 1e-7})
        
        # NOTE(review): `config` is not passed to from_pretrained() below, so the
        # overrides above do not appear to reach the backbone -- confirm whether
        # that is intended before "fixing" it (it would change model outputs).
        self.bert = AutoModel.from_pretrained(name)
        if path: self._load_pretrained_weights()
        
        # Width of the features handed to the regression head.
        if name in [BERT, ROBERTA, ROBERTA_LARGE]:
            self.in_features = self.bert.pooler.dense.out_features
        elif name == DISTILBERT:
            self.in_features = self.bert.transformer.layer[5].output_layer_norm.normalized_shape[0]
            # Extra dense + tanh that forward() applies to the first token for this backbone.
            self.dense = nn.Linear(self.in_features, self.in_features)
            self.activation = nn.Tanh()
            torch.nn.init.kaiming_normal_(self.dense.weight)
        else:
            # Fallback width for any other checkpoint name.
            self.in_features = 768
        
        if self.attention:
            # hidden_state=514 is hard-coded here (the class default is 512).
            self.regressor = AttentionRegressor(in_features=self.in_features,
                                                hidden_state=514,
                                                with_se=self.with_se)
        else:
            self.regressor = Regressor(in_features=self.in_features, 
                                       p=self.p, 
                                       with_se=self.with_se)
        
        
    def _load_pretrained_weights(self):
        """Replace the backbone's embeddings and encoder with those of an MLM checkpoint.

        Loads ``AutoModelForMaskedLM`` for ``self.name``, restores
        ``checkpoint['model']`` from the file at ``self.path`` into it, then
        copies its ``roberta.embeddings`` and ``roberta.encoder`` modules.
        """
        model_pretrained = AutoModelForMaskedLM.from_pretrained(self.name)
        checkpoint = torch.load(self.path, map_location=device)
        model_pretrained.load_state_dict(checkpoint['model'])
        
        # NOTE(review): the `.roberta` attribute is accessed unconditionally --
        # presumably this only works for RoBERTa checkpoints; confirm.
        self.bert.embeddings = model_pretrained.roberta.embeddings
        self.bert.encoder = model_pretrained.roberta.encoder
        
        del model_pretrained
        gc.collect()
        
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Run the backbone and the regression head.

        Returns:
            ``(last_hidden_state, output)`` where ``output`` is the head's result.

        NOTE(review): there is no branch for a ``self.name`` other than the four
        known constants; in that case ``last_hidden_state`` / ``output`` are
        never assigned and the head call below raises ``UnboundLocalError``.
        """
        if self.name == BERT:
            last_hidden_state, output = self.bert(input_ids=input_ids,
                                                  attention_mask=attention_mask,
                                                  token_type_ids=token_type_ids,
                                                  return_dict=False)
        elif self.name == DISTILBERT:
            # Here `last_hidden_state` holds the backbone's whole return value
            # (indexed with [0] below), not an unpacked tensor as in the other branches.
            last_hidden_state = self.bert(input_ids=input_ids, 
                                           attention_mask=attention_mask, 
                                           return_dict=False)
            first_token_tensor = last_hidden_state[0][:, 0]
            output = self.dense(first_token_tensor)
            output = self.activation(output)
            
        elif self.name in [ROBERTA, ROBERTA_LARGE]:
            last_hidden_state, output = self.bert(input_ids=input_ids,
                                                  attention_mask=attention_mask,
#                                                   token_type_ids=token_type_ids,
                                                  return_dict=False)
        
        # The attention head consumes the hidden states, the plain head the pooled output.
        if self.attention:
            output = self.regressor(last_hidden_state)
        else:
            output = self.regressor(output)
        
        return last_hidden_state, output

    
def get_model(pretrained=True, fold=None):
    """Instantiate ``CLRPModel`` from ``cfg['model']``.

    Side effect: ``cfg['model']['path']`` is overwritten, either with the
    fold's ``CLRPModelMLM_fold{fold}.tar`` checkpoint (``pretrained=True``)
    or with ``None``.

    NOTE(review): ``PRETRAINED_PATH`` is not assigned anywhere in the visible
    part of this notebook; confirm it exists before using ``pretrained=True``.
    """
    checkpoint_path = (
        os.path.join(PRETRAINED_PATH, f'CLRPModelMLM_fold{fold}.tar')
        if pretrained
        else None
    )
    cfg['model']['path'] = checkpoint_path
    return CLRPModel(**cfg['model'])

# CV / Inference

In [None]:
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast

In [None]:
# Replaces the {'n_folds': 5} entry set in the configuration cell.
cfg['train'] = dict(n_folds=5, n_epochs=100)

In [None]:
def get_stratified_from_oof(df, n_splits=5):
    """Assign CV folds stratified by the absolute out-of-fold error.

    The absolute difference between ``oof`` and ``target`` is cut into
    ``floor(1 + log2(n_rows))`` equal-width bins (Sturges' formula) and the
    bin label is used as the stratification target of a shuffled
    ``StratifiedKFold`` seeded with ``SEED``.

    Args:
        df: frame with the columns ``id``, ``url_legal``, ``license``,
            ``excerpt``, ``target``, ``standard_error`` and ``oof``.
        n_splits: number of folds.

    Returns:
        A new frame (the input is not modified) restricted to the columns
        above plus ``abs_error``, ``bin`` and ``fold`` (int8).
    """
    cols_to_use = ['id', 'url_legal', 'license', 'excerpt', 'target', 'standard_error', 'oof']
    # Explicit copy so the columns added below are written to an independent frame.
    df = df.loc[:, cols_to_use].copy()
    df['abs_error'] = (df['oof'] - df['target']).abs()

    n_bins = int(np.floor(1 + np.log2(len(df))))
    df['bin'] = pd.cut(df['abs_error'], n_bins, labels=False)

    # `n_splits` used to be ignored in favour of a hard-coded 5.
    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    # split() yields positional indices, so collect the folds in a positional
    # array instead of writing through label-based `.loc` (which is only
    # correct for a default RangeIndex). Every row lands in exactly one
    # validation fold, so the array is fully populated.
    fold_ids = np.empty(len(df), dtype='int8')
    for fold, (_, idx_val) in enumerate(skf.split(df['id'], y=df['bin'])):
        fold_ids[idx_val] = fold
    df['fold'] = fold_ids

    return df

In [None]:
def pooled_last_hidden_state(last_hidden_states, pool='max'):
    """Pool a tensor over axis 1 and return the result as a NumPy array.

    Args:
        last_hidden_states: tensor with at least two dimensions; it is
            detached and moved to the CPU before pooling.
        pool: ``'max'`` or ``'mean'``.

    Returns:
        The pooled array (axis 1 removed).

    Raises:
        ValueError: for an unknown ``pool`` (this used to return ``None``
            silently, which only failed later during concatenation).
    """
    hidden = last_hidden_states.detach().cpu().numpy()
    if pool == 'max':
        return hidden.max(axis=1)
    if pool == 'mean':
        return hidden.mean(axis=1)
    raise ValueError(f"Unknown pool '{pool}'; expected 'max' or 'mean'.")

    
def val_fn_cv(model, dl):
    """Run ``model`` in eval mode over ``dl`` and collect its outputs.

    The model is moved to ``device``; the forward pass runs without gradients
    and inside ``autocast()``.

    Args:
        model: module whose forward returns ``(last_hidden_states, outputs)``.
        dl: iterable of batches accepted by ``get_inputs``.

    Returns:
        ``(lhs, preds)``: the max-pooled last hidden states and the model
        outputs, each concatenated over all batches.
    """
    # No GradScaler here: nothing is back-propagated during evaluation, and the
    # instance that used to be created was never used.
    preds = []
    lhs = []  # pooled last hidden states, one array per batch

    model.eval()
    model.to(device)

    with torch.no_grad():
        for batch in tqdm(dl, desc='cv'):
            inputs = get_inputs(batch)

            with autocast():
                last_hidden_states, outputs = model(**inputs)

            preds.append(outputs.detach().cpu().numpy())
            lhs.append(pooled_last_hidden_state(last_hidden_states, pool='max'))

    preds = np.concatenate(preds)
    lhs = np.concatenate(lhs)

    return lhs, preds


def main_cv():
    """Recompute out-of-fold predictions with the saved fold checkpoints.

    Reads TRAIN, drops rows whose ``standard_error`` is 0, assigns folds with
    ``get_stratified_from_oof`` and, for every fold, loads
    ``<TRAINED>/<MODEL_NAME>_fold<k>.tar`` and predicts that fold's rows.

    Returns:
        The frame with the recomputed ``oof`` column and one ``lhs_<i>``
        column per pooled hidden-state feature.
    """
    seed_everything(SEED)
    
    lhs_list = []  # NOTE(review): never used below
    
    df = pd.read_csv(TRAIN)
    df = df.loc[df.standard_error!=0].reset_index(drop=True)
    # The existing `oof` column drives the stratification, then is cleared so it
    # can be refilled with the predictions computed below.
    df = get_stratified_from_oof(df)
    df['oof'] = np.nan

    tokenizer = get_tokenizer()
    
    for fold in range(cfg['train']['n_folds']):
        # Only val_dl is consumed in this function; train_dl is built but unused.
        train_dl, val_dl = get_dls_for_n_fold(df, fold, tokenizer)
        
        model = get_model(pretrained=False)

        PATH = os.path.join(TRAINED, MODEL_NAME + f'_fold{fold}.tar')
        saved_contents = torch.load(PATH, map_location=device)
        model.load_state_dict(saved_contents['model'])
        
        # Print the training-time configuration stored in the checkpoint, once.
        if fold==0:
            cfg_for_train = saved_contents['cfg']
            print('Configuration for training:')
            print()
            pprint(cfg_for_train)
            print()
        
        print('Fold:', fold)
        
        inputs = {'model': model,
                  'dl': val_dl}
        
        lhs, preds = val_fn_cv(**inputs)
        # Written back by boolean mask: this relies on val_dl yielding the
        # fold's rows in frame order (cfg['dl']['val'] has shuffle=False).
        df.loc[df.fold==fold, 'oof'] = preds
        if fold==0:
            # Create the hidden-state feature columns once their count is known.
            lhs_cols = [f'lhs_{i}' for i in range(lhs.shape[1])]
            df[lhs_cols] = np.nan
        df.loc[df.fold==fold, lhs_cols] = lhs
        
        
        if cfg['TTA']['use']:
            # Extra passes: n_tta - 1 more predictions are summed onto the first
            # one and the total is divided by n_tta.
            for n_tta in range(1, cfg['TTA']['n_tta']):
                lhs, preds = val_fn_cv(**inputs)
                # NOTE(review): `preds` is already one concatenated array here;
                # np.concatenate over it joins its rows -- presumably used to
                # flatten an (n, 1) array to (n,). Confirm.
                df.loc[df.fold==fold, 'oof'] += np.concatenate(preds)
                
            df.loc[df.fold==fold, 'oof'] /= cfg['TTA']['n_tta']


    return df

In [None]:
def get_dls_for_n_fold(df, fold, tokenizer):
    """Build the training and validation DataLoaders for one fold.

    Rows with ``df.fold != fold`` form the training set, rows with
    ``df.fold == fold`` the validation set (both re-indexed from 0).

    The training loader uses the sampler from ``get_sampler`` when
    ``cfg['sampler']['name']`` is set, otherwise ``shuffle=True``.

    Args:
        df: frame with a ``fold`` column plus the columns ``CLRPDataset`` needs.
        fold: fold number held out for validation.
        tokenizer: tokenizer handed to both datasets.

    Returns:
        ``(train_dl, val_dl)``.
    """
    train_df = df.loc[df.fold != fold].reset_index(drop=True)
    val_df = df.loc[df.fold == fold].reset_index(drop=True)

    train_ds = CLRPDataset(train_df, tokenizer=tokenizer)
    val_ds = CLRPDataset(val_df, tokenizer=tokenizer)

    # Work on a copy: the shared cfg['dl']['train'] used to be modified in
    # place, so a 'sampler' (or shuffle=True) from one call leaked into all
    # later ones -- e.g. a stale sampler combined with shuffle=True makes
    # DataLoader raise.
    train_kwargs = dict(cfg['dl']['train'])
    if cfg['sampler']['name'] is not None:
        train_kwargs['sampler'] = get_sampler(df=train_df,
                                              batch_size=train_kwargs['batch_size'])
    else:
        train_kwargs['shuffle'] = True

    train_dl = DataLoader(train_ds, **train_kwargs)
    val_dl = DataLoader(val_ds, **cfg['dl']['val'])

    return train_dl, val_dl


def get_inputs(batch):
    """Select the model-input entries of a collated batch and move them to ``device``.

    Only ``input_ids``, ``attention_mask`` and ``token_type_ids`` are kept;
    every other key of ``batch`` is dropped.
    """
    model_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    inputs = {}
    for key, value in batch.items():
        if key in model_keys:
            inputs[key] = value.to(device)
    return inputs


def get_targets(batch):
    """Extract the regression targets of a collated batch as column tensors on ``device``.

    Returns the ``target`` column alone, or ``target`` and ``se`` concatenated
    along dim 1 when ``cfg['loss']['name']`` is ``'KLdiv'``.

    NOTE(review): no ``cfg['loss']`` entry is set in the visible part of this
    notebook; confirm it exists before calling this function.
    """
    target_keys = ('target', 'se')
    targets = {}
    for key, value in batch.items():
        if key in target_keys:
            targets[key] = value.view(-1, 1).to(device)

    if cfg['loss']['name'] == 'KLdiv':
        return torch.cat((targets['target'], targets['se']), dim=1)
    return targets['target']

In [None]:
# Show the configuration assembled so far for a quick visual check.
pprint(cfg)

In [None]:
def main_infer():
    """Predict the test set with every fold checkpoint and average the results.

    Returns:
        The test frame with ``target`` holding the mean prediction over
        ``cfg['train']['n_folds']`` models (plus a placeholder
        ``standard_error`` column).
    """
    seed_everything(SEED)

    df = pd.read_csv(TEST)
    # CLRPDataset reads these columns. 'target' starts at 0 and doubles as the
    # accumulator for the per-fold predictions below.
    df['target'] = 0.
    df['standard_error'] = 0.

    tokenizer = get_tokenizer()

    # The dataset and loader do not depend on the fold, so build them once
    # instead of on every iteration. The dataset copies the placeholder
    # 'target' values at construction time; get_inputs() does not forward
    # them to the model, so they do not influence the predictions.
    test_ds = CLRPDataset(df, tokenizer=tokenizer)
    test_dl = DataLoader(test_ds, **cfg['dl']['val'])

    n_folds = cfg['train']['n_folds']
    for fold in range(n_folds):
        print('Fold:', fold)

        model = CLRPModel(**cfg['model'])
        PATH = os.path.join(TRAINED, MODEL_NAME + f'_fold{fold}.tar')
        state_dict = torch.load(PATH, map_location=device)['model']
        model.load_state_dict(state_dict)

        _, preds = val_fn_cv(model=model, dl=test_dl)
        # np.concatenate over the already-concatenated array joins its rows,
        # which turns an (n, 1) prediction array into shape (n,).
        df['target'] = df['target'] + np.concatenate(preds)

    df['target'] = df['target'] / n_folds
    return df

In [None]:
%%time

if CV:
    from sklearn.metrics import mean_squared_error

    # Out-of-fold predictions for the training rows, saved for later inspection.
    df = main_cv()
    df.to_csv('oof_df.csv', index=False)

    # Root-mean-squared error of the OOF predictions against the targets.
    mse = mean_squared_error(y_true=df['target'], y_pred=df['oof'])
    rmse = np.sqrt(mse)
    print('CV score: ', rmse)

# Postprocessing

In [None]:
if POST:
    from sklearn import linear_model
    from sklearn.metrics import mean_squared_error

    def RMSE(y_pred, y_gt):
        """Root-mean-squared error between two equally long sequences."""
        return np.sqrt(mean_squared_error(y_pred, y_gt))

    # Linear recalibration: regress the target on the OOF prediction.
    # `df` is the frame produced by the CV cell above.
    lm = linear_model.LinearRegression()
    lm.fit(df[['oof']], df['target'].values)

    df['oof_post'] = lm.predict(df[['oof']])

    score_oof_post = RMSE(df['oof_post'], df['target'])
    print('RMSE (oof post): ', score_oof_post)

In [None]:
# Fold-averaged test predictions.
df = main_infer()

if POST:
    # Apply the linear recalibration fitted in the post-processing cell (`lm`
    # only exists if that cell ran, which in turn needs the CV cell's `df`).
    # Pass a bare array: `lm` was fitted on a frame whose column is named
    # 'oof', and recent scikit-learn versions refuse to predict on a frame
    # whose column name differs ('target').
    df['target'] = lm.predict(df[['target']].to_numpy())

df = df[['id', 'target']]
df.to_csv('submission.csv', index=False)

In [None]:
# Summary (columns, dtypes, non-null counts) of the submission frame as a final check.
df.info()