**【Updated Points】**

・I attached a custom regression head (`RobertaCLRPClassificationHead`) on top of `RobertaModel`.

・I fed the head the concatenated CLS-token hidden states of [the last 4 encoder layers][1].

・I set layer-wise learning rates: the head gets the largest rate, and each fine-tuned encoder layer below it gets a smaller one.

---

**My previous note**

[CLRP: RoBERTa simple finetune baseline-1][2]

---

**Acknowledgements**: Thanks to these great earlier notebooks.

1. [Pytorch BERT beginner's room][3]

2. [CLRP: Pytorch Roberta Finetune][4]



[1]: https://www.kaggle.com/c/google-quest-challenge/discussion/123770

[2]: https://www.kaggle.com/masatomurakawamm/clrp-roberta-simple-finetune-baseline-1

[3]: https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room

[4]: https://www.kaggle.com/maunish/clrp-pytorch-roberta-finetune

In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import os
import random

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import transformers

from tqdm import tqdm

import warnings
warnings.simplefilter('ignore')

In [None]:
# Settings
# Hyper-parameters shared by the cells below.
config = dict(
    train_batch_size=16,   # batch size of the training DataLoader
    valid_batch_size=32,   # batch size of the validation / test DataLoaders
    max_len=314,           # max_length handed to the tokenizer
    nfolds=4,              # number of StratifiedKFold splits
    seed=42,               # seed for the RNGs and the fold shuffle
    epochs=10,             # training epochs per fold
)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'{device} is used')

def seed_everything(seed=42):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and PyTorch
            (CPU and all CUDA devices). It is also exported as
            ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # NOTE(review): changing PYTHONHASHSEED at runtime presumably only affects
    # interpreters started afterwards, not the current process; confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds the current and all other CUDA devices; safe to call without a GPU.
    torch.cuda.manual_seed_all(seed)
    # The attribute must be spelled `deterministic`; a misspelled name would only
    # create an unused attribute and leave cuDNN non-deterministic.
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm per input shape, and
    # the choice may vary between runs, so it is disabled for reproducibility.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs with the configured seed before any data splitting or model creation below.
seed_everything(seed=config['seed'])

def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

## Datasets and DataLoaders

In [None]:
# Load the data
# Read the three competition CSVs in order: train, test, sample submission.
train_data, test_data, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

In [None]:
# k-fold
# The target is continuous, so it is bucketed into equal-width bins first and
# the folds are stratified on the bin label. Bin count: floor(1 + log2(n_rows)).
num_bins = int(np.floor(np.log2(len(train_data)) + 1))
train_data.loc[:, 'bins'] = pd.cut(
    train_data['target'],
    bins=num_bins,
    labels=False,
)

# Every row starts unassigned (-1) and receives the index of the fold in which
# it serves as validation data.
train_data['fold'] = -1
kfold = StratifiedKFold(
    n_splits=config['nfolds'],
    shuffle=True,
    random_state=config['seed'],
)
splits = kfold.split(X=train_data, y=train_data['bins'])
for k, (train_idx, valid_idx) in enumerate(splits):
    train_data.loc[valid_idx, 'fold'] = k


In [None]:
# Dataset, DataLoader
class CLRPDataset(Dataset):
    """Training/validation dataset yielding tokenized excerpts and their targets.

    Args:
        df: DataFrame with an ``excerpt`` (text) and a ``target`` column.
        tokenizer: Object exposing ``encode_plus``; it is stored on the instance
            and used for every item.
        max_len: Length every sequence is padded/truncated to. When ``None``
            the notebook-wide ``config['max_len']`` is used.
    """

    def __init__(self, df, tokenizer, max_len=None):
        self.excerpt = df['excerpt'].to_numpy()
        self.target = df['target'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = config['max_len'] if max_len is None else max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        """Return ``{'ids', 'mask', 'targets'}`` tensors for row ``idx``."""
        # Newlines are flattened to spaces before tokenization.
        sentence = self.excerpt[idx].replace('\n', ' ')
        # Use the tokenizer given to this dataset rather than whatever global
        # happens to be named `tokenizer` when the item is fetched.
        encoded = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_attention_mask=True,
        )
        ids = torch.tensor(encoded['input_ids'])
        mask = torch.tensor(encoded['attention_mask'])
        targets = torch.tensor(self.target[idx], dtype=torch.float)
        return {'ids': ids, 'mask': mask, 'targets': targets}
        

In [None]:
model_path = '../input/roberta-base'

tokenizer = transformers.RobertaTokenizer.from_pretrained(model_path)

# Fold 0 is held out for validation; all remaining folds form the training split.
p_fold = 0
p_train = train_data[train_data['fold'] != p_fold].reset_index(drop=True)
p_valid = train_data[train_data['fold'] == p_fold].reset_index(drop=True)

train_dataset = CLRPDataset(df=p_train, tokenizer=tokenizer)
valid_dataset = CLRPDataset(df=p_valid, tokenizer=tokenizer)

# Only the training loader shuffles; validation keeps the row order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=config['train_batch_size'],
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=config['valid_batch_size'],
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

## Model

In [None]:
class RobertaCLRPClassificationHead(nn.Module):
    """Two-layer head mapping four concatenated feature vectors to one scalar.

    Args:
        in_features: Size of ONE input vector; the head expects four of them
            concatenated, i.e. an input of width ``in_features * 4``.
        hidden_dim: Width of the intermediate dense layer.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()

        self.dense = nn.Linear(in_features * 4, hidden_dim)
        self.dropout = nn.Dropout(0.1, inplace=False)
        self.out_proj = nn.Linear(hidden_dim, 1)

        for layer in (self.dense, self.out_proj):
            nn.init.normal_(layer.weight, std=0.02)
            # `nn.init.normal_(bias, 0)` would set mean=0 but keep the default
            # std=1.0, i.e. large random biases. Start biases at exactly zero.
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Apply dense -> dropout -> projection; returns a ``(..., 1)`` tensor."""
        x = self.dense(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class CommonLitModel(nn.Module):
    """RoBERTa encoder followed by ``RobertaCLRPClassificationHead``.

    The head receives the first-token vectors of the last four entries of the
    encoder's ``hidden_states``, concatenated along the feature axis.

    Args:
        model_path: Path or identifier passed to
            ``transformers.RobertaModel.from_pretrained``.
    """

    def __init__(self, model_path):
        super().__init__()
        self.roberta = transformers.RobertaModel.from_pretrained(model_path)
        # Read the width from the checkpoint instead of hard-coding 768, so other
        # encoder sizes work too (768 for roberta-base, per the original code).
        hidden_size = self.roberta.config.hidden_size
        self.classifier = RobertaCLRPClassificationHead(hidden_size, hidden_size)

    def forward(self, ids, mask):
        """Return one prediction per sequence, shape ``(batch, 1)``.

        Args:
            ids: Token-id tensor fed to the encoder as ``input_ids``.
            mask: Attention-mask tensor fed to the encoder as ``attention_mask``.
        """
        outputs = self.roberta(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=True,
        )
        hidden_states = outputs['hidden_states']

        # First-token vector of the last, 2nd-last, 3rd-last and 4th-last layers,
        # in that order.
        cls_vectors = [hidden_states[-k][:, 0, :] for k in range(1, 5)]
        roberta_output = torch.cat(cls_vectors, dim=1)

        return self.classifier(roberta_output)


# Build the model from the checkpoint at `model_path` and move its parameters to `device`.
model = CommonLitModel(model_path)
model.to(device)


In [None]:
# freezing parameters
# Freeze everything first, then re-enable gradients only for the head and the
# encoder layers 8-11 (the ones that get an optimizer parameter group).

for param in model.parameters():
    param.requires_grad = False

# The attribute must be spelled `requires_grad`; a misspelled name only creates
# an unused attribute and leaves the head frozen, so it would never be trained.
for param in model.classifier.parameters():
    param.requires_grad = True

for layer_idx in (11, 10, 9, 8):
    for param in model.roberta.encoder.layer[layer_idx].parameters():
        param.requires_grad = True


In [None]:
# optimizer, scheduler and criterion
# NOTE(review): `transformers.AdamW` is deprecated in later transformers
# releases in favour of `torch.optim.AdamW`; confirm against the installed version.
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Layer-wise learning rates: the head gets the largest rate, and each encoder
# layer further from the output gets a smaller one.
optimizer = AdamW([
    {'params': model.classifier.parameters(), 'lr': 2e-5},
    {'params': model.roberta.encoder.layer[11].parameters(), 'lr': 1e-5},
    {'params': model.roberta.encoder.layer[10].parameters(), 'lr': 4e-6},
    {'params': model.roberta.encoder.layer[9].parameters(), 'lr': 2e-6},
    {'params': model.roberta.encoder.layer[8].parameters(), 'lr': 1e-6},
], betas=(0.9, 0.98), weight_decay=1e-2)

epochs = config['epochs']

# The scheduler is stepped once per batch, so the total must count batches.
# `len(train_dataloader)` includes the final partial batch, whereas
# `len(p_train) / batch_size` undercounts and ends the schedule early.
train_steps = len(train_dataloader) * epochs
# Warm-up covers the first half of training; decay covers the second half.
num_steps = int(train_steps * 0.5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

# Gradient scaler used by the mixed-precision training step.
scaler = torch.cuda.amp.GradScaler()

criterion = nn.MSELoss()


## Training

In [None]:
# `training` below reads the module-level `scaler`; create the AMP gradient scaler
# and read the epoch count here so both names exist when this cell is run.
scaler = torch.cuda.amp.GradScaler()
epochs = config['epochs']

def training(train_dataloader, model, optimizer, scheduler=None):
    """Run one training epoch with mixed precision.

    Relies on the module-level ``device``, ``criterion``, ``scaler`` and
    ``rmse_score``.

    Args:
        train_dataloader: Iterable of batches, each a dict with ``'ids'``,
            ``'mask'`` and ``'targets'`` tensors.
        model: Module called as ``model(ids, mask)``.
        optimizer: Optimizer stepped once per batch through the grad scaler.
        scheduler: Optional LR scheduler, stepped once per batch.

    Returns:
        ``(mean_loss, train_score)``: the mean of the per-batch losses and the
        RMSE over all predictions of the epoch.
    """
    model.train()

    all_preds = []
    all_targets = []
    losses = []

    for batch in train_dataloader:
        optimizer.zero_grad()

        ids = batch['ids'].to(device, non_blocking=True)
        mask = batch['mask'].to(device, non_blocking=True)
        target = batch['targets'].to(device, non_blocking=True)

        # Only the forward pass and the loss run under autocast.
        with torch.cuda.amp.autocast():
            output = model(ids, mask).squeeze(-1)
            loss = criterion(output, target)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
        # Cast to float32 so reduced-precision autocast outputs do not degrade
        # the reported RMSE.
        all_preds.append(output.detach().float().cpu().numpy())
        # reshape(-1) keeps a 1-D array even for a batch of one sample;
        # squeeze(-1) would give a 0-d array that np.concatenate rejects.
        all_targets.append(target.detach().reshape(-1).cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    losses = np.mean(losses)
    train_score = rmse_score(all_targets, all_preds)

    return losses, train_score

In [None]:
def validating(valid_dataloader, model):
    """Evaluate ``model`` on a validation loader without gradient tracking.

    Relies on the module-level ``device``, ``criterion`` and ``rmse_score``.

    Args:
        valid_dataloader: Iterable of batches, each a dict with ``'ids'``,
            ``'mask'`` and ``'targets'`` tensors.
        model: Module called as ``model(ids, mask)``.

    Returns:
        ``(all_preds, all_targets, mean_loss, valid_score)``: the concatenated
        predictions and targets as NumPy arrays, the mean of the per-batch
        losses, and the RMSE over the whole loader.
    """
    model.eval()

    all_preds = []
    all_targets = []
    losses = []

    with torch.no_grad():
        for batch in valid_dataloader:
            ids = batch['ids'].to(device, non_blocking=True)
            mask = batch['mask'].to(device, non_blocking=True)
            target = batch['targets'].to(device, non_blocking=True)

            output = model(ids, mask).squeeze(-1)
            loss = criterion(output, target)

            losses.append(loss.item())
            all_preds.append(output.cpu().numpy())
            # reshape(-1) keeps a 1-D array even for a batch of one sample;
            # squeeze(-1) would give a 0-d array that np.concatenate rejects.
            all_targets.append(target.reshape(-1).cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)

    losses = np.mean(losses)
    valid_score = rmse_score(all_targets, all_preds)

    return all_preds, all_targets, losses, valid_score


In [None]:
# training
# Per-epoch history for fold 0, plus the best (lowest) validation RMSE seen so far.
train_losses, valid_losses = [], []
best_score = None

train_scores, valid_scores = [], []

for epoch in tqdm(range(epochs)):
    print(f"---------------{epoch}start-------------\n")

    train_loss, train_score = training(train_dataloader, model, optimizer, scheduler)
    train_losses.append(train_loss)
    train_scores.append(train_score)
    print(f'train_score is {train_score}\n')

    preds, targets, valid_loss, valid_score = validating(valid_dataloader, model)
    valid_losses.append(valid_loss)
    valid_scores.append(valid_score)
    print(f'valid_score is {valid_score}\n')

    # Checkpoint on the first epoch and whenever the validation RMSE improves.
    is_first_epoch = best_score is None
    if is_first_epoch or best_score > valid_score:
        best_score = valid_score
        torch.save(model.state_dict(), 'model0.pth')
        print('Save the first model' if is_first_epoch else 'found better point')
        

In [None]:
# visualization of results
# Top panel: predictions vs. targets from the most recent validation pass.
# Bottom panel: per-epoch training and validation loss.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(6, 10))
plt.subplots_adjust(wspace=1.0)

ax1.scatter(targets, preds)
ax1.plot([-4, 2], [-4, 2])  # y = x reference line
ax1.set(xlabel='targets', ylabel='preds')

x = np.arange(epochs)
ax2.plot(x, train_losses, label='train_losses')
ax2.plot(x, valid_losses, label='valid_losses')
ax2.set(xlabel='epochs', ylabel='losses')
ax2.legend()

In [None]:
# remaining k-fold
# Repeat the fold-0 procedure for every other fold with a freshly loaded model,
# saving the best checkpoint of each fold as `model{p_fold}.pth`.
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

best_scores = []
best_scores.append(best_score)

for p_fold in range(1, config['nfolds']):
    # initializing the data
    p_train = train_data.query(f'fold != {p_fold}').reset_index(drop=True)
    p_valid = train_data.query(f'fold == {p_fold}').reset_index(drop=True)

    train_dataset = CLRPDataset(p_train, tokenizer)
    valid_dataset = CLRPDataset(p_valid, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=config['train_batch_size'],
                                  shuffle=True, num_workers=4, pin_memory=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=config['valid_batch_size'],
                                  shuffle=False, num_workers=4, pin_memory=True)

    model = CommonLitModel(model_path)
    model.to(device)

    # Freeze everything, then unfreeze the head and encoder layers 8-11.
    for param in model.parameters():
        param.requires_grad = False
    # The attribute must be spelled `requires_grad`; a misspelled name leaves
    # the head frozen, so it would never be trained.
    for param in model.classifier.parameters():
        param.requires_grad = True
    for layer_idx in (11, 10, 9, 8):
        for param in model.roberta.encoder.layer[layer_idx].parameters():
            param.requires_grad = True

    # optimizer, scheduler and criterion
    optimizer = AdamW([
                {'params': model.classifier.parameters(), 'lr': 2e-5},
                {'params': model.roberta.encoder.layer[11].parameters(), 'lr': 1e-5},
                {'params': model.roberta.encoder.layer[10].parameters(), 'lr': 4e-6},
                {'params': model.roberta.encoder.layer[9].parameters(), 'lr': 2e-6},
                {'params': model.roberta.encoder.layer[8].parameters(), 'lr': 1e-6},
                ], betas=(0.9, 0.98), weight_decay=1e-2)

    # Read the epoch count from the shared config rather than hard-coding it.
    epochs = config['epochs']
    # The scheduler is stepped once per batch; `len(train_dataloader)` also
    # counts the final partial batch, so the schedule spans the whole training.
    train_steps = len(train_dataloader) * epochs
    num_steps = int(train_steps * 0.5)  # warm-up over the first half
    scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

    # `training` reads the module-level scaler/criterion, so rebind them per fold.
    scaler = torch.cuda.amp.GradScaler()
    criterion = nn.MSELoss()

    # training
    train_losses = []
    valid_losses = []
    best_score = None

    train_scores = []
    valid_scores = []

    for epoch in tqdm(range(epochs)):
        print("---------------" + str(epoch) + "start-------------\n")

        train_loss, train_score = training(train_dataloader, model, optimizer, scheduler)
        train_losses.append(train_loss)
        train_scores.append(train_score)
        print(f'train_score is {train_score}\n')

        preds, targets, valid_loss, valid_score = validating(valid_dataloader, model)
        valid_losses.append(valid_loss)
        valid_scores.append(valid_score)
        print(f'valid_score is {valid_score}\n')

        # Checkpoint on the first epoch and whenever the validation RMSE improves.
        if best_score is None:
            best_score = valid_score
            torch.save(model.state_dict(), f'model{p_fold}.pth')
            print('Save the first model')
        elif best_score > valid_score:
            best_score = valid_score
            torch.save(model.state_dict(), f'model{p_fold}.pth')
            print('found better point')

    best_scores.append(best_score)


## Inference

In [None]:
# Tokenizer and input frames for the inference section.
model_path = '../input/roberta-base'
tokenizer = transformers.RobertaTokenizer.from_pretrained(model_path)

test_data, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

In [None]:
# Dataset and DataLoader for inference
class CLRPInferenceDataset(Dataset):
    """Inference dataset yielding tokenized excerpts (no targets).

    Args:
        df: DataFrame with an ``excerpt`` (text) column.
        tokenizer: Object exposing ``encode_plus``; it is stored on the instance
            and used for every item.
        max_len: Length every sequence is padded/truncated to. When ``None``
            the notebook-wide ``config['max_len']`` is used.
    """

    def __init__(self, df, tokenizer, max_len=None):
        self.excerpt = df['excerpt'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = config['max_len'] if max_len is None else max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        """Return ``{'ids', 'mask'}`` tensors for row ``idx``."""
        # Newlines are flattened to spaces before tokenization.
        sentence = self.excerpt[idx].replace('\n', ' ')
        # Use the tokenizer given to this dataset rather than whatever global
        # happens to be named `tokenizer` when the item is fetched.
        encoded = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_attention_mask=True,
        )
        ids = torch.tensor(encoded['input_ids'])
        mask = torch.tensor(encoded['attention_mask'])
        return {'ids': ids, 'mask': mask}

# The test loader keeps the row order (no shuffling).
test_dataset = CLRPInferenceDataset(df=test_data, tokenizer=tokenizer)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=config['valid_batch_size'],
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)


In [None]:
model = CommonLitModel(model_path)

finetune_result_path = './'
# List the directory once so names and paths are guaranteed to line up, sort
# for a stable column order, and match the `.pth` suffix rather than any
# filename that merely contains ".pth".
model_names = sorted(s for s in os.listdir(finetune_result_path) if s.endswith('.pth'))
pthes = [os.path.join(finetune_result_path, s) for s in model_names]

def clrp_inference(test_dataloader, model, model_names, pthes):
    """Predict the test set once per saved checkpoint.

    Relies on the module-level ``device``.

    Args:
        test_dataloader: Iterable of batches, each a dict with ``'ids'`` and
            ``'mask'`` tensors.
        model: Module whose weights are overwritten with each checkpoint in turn.
        model_names: Labels for the checkpoints, parallel to ``pthes``.
        pthes: Paths of the state-dict files to load.

    Returns:
        ``(all_preds, all_models)``: a list with one 1-D NumPy prediction array
        per checkpoint and the list of matching labels.
    """
    all_preds = []
    all_models = []
    for model_name, state in zip(model_names, pthes):
        # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(state, map_location=device))
        model.to(device)
        model.eval()

        preds = []

        with torch.no_grad():
            for batch in test_dataloader:
                ids = batch['ids'].to(device)
                mask = batch['mask'].to(device)

                output = model(ids, mask)
                output = output.squeeze(-1)

                preds.append(output.cpu().numpy())

        all_preds.append(np.concatenate(preds))
        all_models.append(model_name)

    print('\npredicted!')
    return all_preds, all_models

# Run every discovered checkpoint over the test loader; one prediction array per checkpoint.
all_preds, all_models = clrp_inference(test_dataloader, model, model_names, pthes)

In [None]:
# One column per checkpoint (labelled with its name), one row per test excerpt.
preds_df = pd.DataFrame(all_preds, index=all_models).T

preds_df


In [None]:
# Ensemble: average the per-checkpoint predictions row-wise and store the
# result as the `target` column of the submission frame.
fin_preds = preds_df.mean(axis=1)
sample['target'] = fin_preds
sample


In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv('submission.csv', index=False)
