## Import Package

In [None]:
import pandas as pd
import tqdm

## BERT
import transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

import torch
from torch.utils.data import DataLoader, Dataset, random_split

from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled training excerpts and the unlabelled competition test set.
train_csv, test_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    )
)

## Module

In [None]:
def pad_to_len(seqs, to_len, padding=0):
    """Return copies of the lists in ``seqs`` forced to exactly ``to_len`` items.

    Sequences longer than ``to_len`` are cut at ``to_len``; shorter ones are
    extended on the right with ``padding``. The inputs are not modified.
    """
    def fit(seq):
        # Number of filler items needed; never negative for over-long input.
        missing = max(0, to_len - len(seq))
        return seq[:to_len] + [padding] * missing

    return [fit(seq) for seq in seqs]

In [None]:
class dataset(Dataset):
    """Thin ``Dataset`` wrapper around an index-addressable sample collection."""

    def __init__(self, data):
        # Anything supporting len() and data[index] works (e.g. a dict keyed 0..n-1).
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def collate_fn(self, samples, target=True):
        """Merge a list of samples into one batch of tensors.

        Each sample must provide ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` as lists; every field is right-padded with 0 to the
        longest sequence in the batch. When ``target`` is true, each sample's
        ``target`` value is collected under the ``label`` key.
        """
        batch = {}

        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            sequences = [sample[field] for sample in samples]
            longest = max(map(len, sequences))
            batch[field] = torch.tensor(pad_to_len(sequences, longest, 0))

        if target:
            labels = [sample['target'] for sample in samples]
            batch['label'] = torch.tensor(labels)

        return batch

In [None]:
class BERT(torch.nn.Module):
    """Pretrained BERT encoder with a two-layer head producing one value per input."""

    def __init__(self):
        super().__init__()
        hidden_size = 768
        self.model = BertModel.from_pretrained("../input/huggingface-bert/bert-base-uncased")
        # Head: Linear -> Tanh -> Dropout -> Linear(…, 1).
        self.extractor = torch.nn.Linear(hidden_size, hidden_size)
        self.classifer = torch.nn.Linear(hidden_size, 1)
        self.dropout = torch.nn.Dropout(0.1)
        self.tanh = torch.nn.Tanh()

    def forward(self, ids, mask):
        """Return the head's output for the hidden state at sequence position 0.

        ``ids`` and ``mask`` are passed positionally to the encoder; the result
        keeps a trailing dimension of size 1.
        """
        encoded = self.model(ids, mask)
        # Position 0 of the last layer (the original code calls this "CLS").
        first_token = encoded.last_hidden_state[:, 0, :]
        features = self.dropout(self.tanh(self.extractor(first_token)))
        return self.classifer(features)

In [None]:
class Estimator():
    """Training / evaluation / inference driver for a single-output regression model.

    The wrapped model is called as ``model(input_ids, attention_mask)`` and is
    expected to return one value per sample. Batches are dicts with
    ``input_ids`` and ``attention_mask`` tensors and, when targets are needed,
    a ``label`` tensor.
    """

    def __init__(self, hyperparameters, device, model, optim = 'AdamW'):
        """Build the optimizer and move the model to ``device``.

        Args:
            hyperparameters: dict read for ``learning_rate``, ``weight_decay``
                and (in ``fit``) ``epoch``.
            device: torch device used for the model and every batch.
            model: the ``torch.nn.Module`` to train.
            optim: ``'Adam'`` or ``'AdamW'``.

        Raises:
            ValueError: if ``optim`` is not one of the supported names.
        """
        self.params = hyperparameters
        self.model  = model
        self.device = device

        # Parameters whose name contains one of these fragments get no weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        decayed, undecayed = [], []
        for name, param in self.model.named_parameters():
            if any(nd in name for nd in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)
        optimizer_grouped_parameters = [
            {'params': decayed, 'weight_decay': self.params['weight_decay']},
            {'params': undecayed, 'weight_decay': 0.0},
        ]

        optimizers = {'Adam': torch.optim.Adam, 'AdamW': torch.optim.AdamW}
        if optim not in optimizers:
            # Fail here rather than later with a missing ``self.optimizer``.
            raise ValueError(
                f"Unsupported optimizer {optim!r}; expected one of {sorted(optimizers)}"
            )
        self.optimizer = optimizers[optim](params=optimizer_grouped_parameters,
                                           lr=self.params['learning_rate'])
        self.model.to(self.device)

    def load_weight(self, weight_path):
        """Load a state dict saved by ``fit`` into the model."""
        # map_location lets a checkpoint written on GPU be restored on CPU (and vice versa).
        state_dict = torch.load(weight_path, map_location=self.device)
        self.model.load_state_dict(state_dict)

    def _forward_batch(self, batch, criterion=None):
        """Run the model on one batch; return ``(predictions, loss)``.

        Predictions are flattened to shape ``(batch,)``. ``loss`` is ``None``
        when no ``criterion`` is given, in which case ``batch['label']`` is
        not read.
        """
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)

        # reshape(-1) instead of squeeze(): a batch of one must stay 1-D so it
        # matches the target shape and ``tolist()`` still yields a list.
        output = self.model(input_ids, attention_mask).reshape(-1)

        if criterion is None:
            return output, None
        target = batch['label'].to(self.device)
        return output, criterion(output, target)

    def fit(self, data, save_name):
        """Train for ``params['epoch']`` epochs and checkpoint the best epoch.

        Args:
            data: dict with DataLoaders under ``'train'`` and ``'test'``
                (the latter is used for per-epoch evaluation).
            save_name: checkpoint path without the ``.pth`` suffix; it is
                overwritten whenever the evaluation loss improves.
        """
        ## loss function
        criterion = torch.nn.MSELoss()
        criterion.to(self.device)

        ## Meta
        len_of_train = len(data['train'].dataset)
        len_of_test  = len(data['test'].dataset)
        # Start from +inf so the first epoch always writes a checkpoint.
        best = float('inf')

        # One scheduler step per batch: count batches (including a final
        # partial one) so the schedule ends exactly with training.
        train_steps = len(data['train']) * self.params['epoch']
        num_steps   = int(train_steps*0.1)  # warm up over the first 10% of steps

        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_steps, train_steps)

        for epoch in range(self.params['epoch']):
            total_loss = 0
            total_val_loss = 0

            self.model.train()
            for batch in tqdm.tqdm(data['train']):
                ## FORWARD
                _, loss = self._forward_batch(batch, criterion)
                # Weight the batch mean by batch size to accumulate a sum of squared errors.
                total_loss += loss.item()*len(batch['input_ids'])

                ## OPTIMIZE
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                scheduler.step() # Update learning rate schedule

            self.model.eval()
            with torch.no_grad():
                for batch in data['test']:
                    _, loss = self._forward_batch(batch, criterion)
                    total_val_loss += loss.item()*len(batch['input_ids'])

            print(f'Epoch: {epoch}, Train: {(total_loss/len_of_train)**(1/2)}, Test: {(total_val_loss/len_of_test)**(1/2)}')

            if total_val_loss < best:
                best = total_val_loss
                torch.save(self.model.state_dict(), f"{save_name}.pth")

    def inference(self, data, t = True):
        """Evaluate or predict over a DataLoader.

        Args:
            data: DataLoader to iterate.
            t: when true, batches must contain ``label`` and the RMSE over
                ``data.dataset`` is returned; when false, the list of
                predictions (in loader order) is returned.
        """
        total_loss = 0
        outputs = []

        ## loss function (only needed when targets are available)
        criterion = None
        if t:
            criterion = torch.nn.MSELoss()
            criterion.to(self.device)

        self.model.eval()
        with torch.no_grad():
            for batch in tqdm.tqdm(data):
                output, loss = self._forward_batch(batch, criterion)

                if t:
                    total_loss += loss.item()*len(batch['input_ids'])

                outputs += output.tolist()

        if t:
            return (total_loss/len(data.dataset))**(1/2)
        return outputs

## Preprocess

### 1. Split Dataset

In [None]:
# Hold out 10% of the labelled rows as a local test set.
X_train, X_test = train_test_split(train_csv, test_size=0.1, random_state=42)

# Take 1/9 of the remaining rows for validation, i.e. roughly as many rows
# as the held-out test set.
X_train, X_valid = train_test_split(X_train, test_size=1/9, random_state=42)

# Renumber each split from 0: the dicts later built from these frames are
# keyed by row label and looked up by position.
X_train, X_valid, X_test = (
    frame.reset_index(drop=True) for frame in (X_train, X_valid, X_test)
)

datasets = {"train": X_train, "valid": X_valid, "test": X_test, "submit": test_csv}

In [None]:
# Local copy of the pretrained checkpoint; the tokenizer lower-cases its input.
pretrained_bert = "../input/huggingface-bert/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(
    pretrained_bert,
    do_lower_case=True,
)

In [None]:
%%time
for key in datasets.keys():
    datasets[key]['input_ids'] = datasets[key]['excerpt'].apply(lambda x: tokenizer(x))
    datasets[key]['token_type_ids'] = datasets[key]['input_ids'].apply(lambda x: x['token_type_ids'])
    datasets[key]['attention_mask'] = datasets[key]['input_ids'].apply(lambda x: x['attention_mask'])
    datasets[key]['input_ids']      = datasets[key]['input_ids'].apply(lambda x: x['input_ids'])

In [None]:
# Transposing before to_dict() yields {row_label: {column: value}}, i.e. one
# dict per sample keyed by its row label.
train_dataset, valid_dataset, test_dataset, submit_dataset = (
    dataset(frame.transpose().to_dict())
    for frame in (X_train, X_valid, X_test, test_csv)
)

In [None]:
# Mini-batch size used by every DataLoader below and passed to the Estimator
# hyperparameters.
BATCH_SIZE = 16

In [None]:
# Settings shared by all four loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4, pin_memory=True)

# Only the training loader shuffles; the others keep row order so that
# predictions line up with their source frames.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
    **loader_kwargs,
)

valid_loader = DataLoader(
    valid_dataset,
    shuffle=False,
    collate_fn=valid_dataset.collate_fn,
    **loader_kwargs,
)

test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    collate_fn=test_dataset.collate_fn,
    **loader_kwargs,
)

# The submission data has no 'target' column, so labels are skipped when collating.
submit_loader = DataLoader(
    submit_dataset,
    shuffle=False,
    collate_fn=lambda samples: submit_dataset.collate_fn(samples, target=False),
    **loader_kwargs,
)

In [None]:
# Sanity check: number of samples in each split (train, valid, test, submit).
print(*(len(split) for split in (train_dataset, valid_dataset, test_dataset, submit_dataset)))

## Model

## Train

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise run on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

In [None]:
# Fine-tuning settings consumed by Estimator (optimizer setup and training loop).
hyperparameters = {
    'learning_rate': 2e-5,
    'epoch': 5,
    'batch_size': BATCH_SIZE,
    'weight_decay':1e-2,
}

estimator = Estimator(hyperparameters=hyperparameters, device=device, model=BERT())

In [None]:
# fit() evaluates on the loader passed as 'test' after every epoch; the
# validation split fills that role here. The best epoch is saved to best_model.pth.
fit_loaders = {'train': train_loader, 'test': valid_loader}
estimator.fit(fit_loaders, save_name='best_model')

In [None]:
# Restore the checkpoint written by fit(), then report RMSE on the held-out
# test split (inference() returns the RMSE when called with its default t=True).
estimator.load_weight(weight_path='best_model.pth')
estimator.inference(data=test_loader)

In [None]:
# With t=False, inference() returns raw predictions; valid_loader does not
# shuffle, so they align with X_valid's rows.
valid_predictions = estimator.inference(valid_loader, t=False)
X_valid['predict'] = valid_predictions

In [None]:
# Eyeball five random validation rows: text, true target and model prediction.
inspect_columns = ['excerpt', 'target', 'predict']
X_valid[inspect_columns].sample(5)

## Submission

In [None]:
# Fill the sample submission's target column with predictions for the
# competition test set (submit_loader keeps the rows in file order).
submit = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
submit_predictions = estimator.inference(submit_loader, t=False)
submit['target'] = submit_predictions

In [None]:
# Display the filled-in submission frame for a visual check.
submit

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submit.to_csv('submission.csv', index=False)