In [None]:
# 参考： https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain

import sys
import os
import time
import random
import re
from math import sqrt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
from transformers import (AutoModel, AutoTokenizer, 
     get_linear_schedule_with_warmup, get_constant_schedule_with_warmup)

def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Exports ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy and PyTorch
    (CPU generator plus the current and all CUDA devices) and switches cuDNN
    to deterministic kernels.

    Args:
        seed: Integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

class CFG:
    """Hyper-parameters, paths and the compute device shared by all cells."""

    # --- pretrained checkpoint and output location ---
    # Other checkpoints left by the author as alternatives:
    #   'roberta-base'
    #   'bert-base-cased'
    #   '../input/huggingface-bert-variants/bert-large-cased/bert-large-cased'
    #   '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
    #   '../input/clrp-pytorch-roberta-pretrain/clrp_roberta_base'
    #   '../input/roberta-transformers-pytorch/roberta-large'
    model_name_or_path = '../input/roberta-transformers-pytorch/roberta-base'
    model_output_dir = './'

    # --- tokenization and batching ---
    max_seq_length = 512
    seq_length = 100  # NOTE(review): not referenced by any visible cell
    batch_size = 8

    # --- optimization ---
    learning_rate = 2.0e-5  # alternative value noted by the author: 0.7e-5
    weight_decay = 1e-1
    use_lr_scheduler = True

    # --- validation during an epoch ---
    mid_eval = True
    mid_eval_step_num = 50

    # --- reproducibility and hardware ---
    random_seed = 2021
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
# Instantiate the configuration and seed all RNGs before any data or model work.
cfg = CFG()
seed_everything(cfg.random_seed)

# When True, the data-loading cell keeps only the first 800 shuffled rows.
QUICK_CHECK = False

# Wall-clock start; read by the training loop's time budget and the final timing print.
global_start_t = time.time()
print('ok')

In [None]:
# Load the competition data; training only needs the excerpt text and its target score.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
train_data = train_data[['excerpt', 'target']]
print('000, train_data.shape: ', train_data.shape, 'test_data.shape: ', test_data.shape)

# Shuffle with an explicit seed so the split is identical every time this cell
# runs. Without `random_state` the shuffle draws from the global NumPy RNG, so
# re-running the cell would move validation rows into the training set.
train_data = train_data.sample(frac=1.0, random_state=cfg.random_seed)
if QUICK_CHECK:
    train_data = train_data[:800]

print('111, train_data.shape: ', train_data.shape)

# Hold out the last 20% of the shuffled rows for validation.
TRAIN_RATIO = 0.8
train_num = int(TRAIN_RATIO * len(train_data))
train_data, valid_data = train_data[:train_num], train_data[train_num:]
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)

print('222, train_data.shape: ', train_data.shape, 'valid_data.shape: ', valid_data.shape)

print(f'train_data.shape: {train_data.shape}, test_data.shape: {test_data.shape}')

In [None]:
# Baseline: RMSE of a constant predictor (median, then mean, of the training targets),
# computed once through scikit-learn and once by hand as a cross-check.
# The root is taken explicitly because the `squared=False` keyword of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse1 = sqrt(mean_squared_error(train_data['target'], [train_data['target'].median()]*len(train_data)))
rmse2 = sqrt(mean_squared_error(train_data['target'], [train_data['target'].mean()]*len(train_data)))
RMSE1 = sqrt((np.square(np.subtract(train_data['target'], train_data['target'].median()))).mean())
RMSE2 = sqrt((np.square(np.subtract(train_data['target'].mean(), train_data['target']))).mean())

rmse1, rmse2, RMSE1, RMSE2

In [None]:
class CLRP_Dataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        data: Table with an ``excerpt`` and a ``target`` column, indexed
            positionally through ``.iloc``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., max_length=..., truncation=...)``
            that returns a mapping with at least ``input_ids`` and
            ``attention_mask``.
        max_seq_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_seq_length):
        self.df = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return the encoded excerpt at ``index`` together with its target.

        The result always holds ``input_ids``, ``attention_mask``, ``length``
        (sum of the attention mask) and ``target``; ``token_type_ids`` is
        included only when the tokenizer produced it.
        """
        excerpt = self.df.iloc[index]['excerpt']
        target = self.df.iloc[index]['target']
        encoded = self.tokenizer(
            excerpt,
            padding="max_length",
            max_length=self.max_seq_length,
            truncation=True,
        )
        mask = encoded['attention_mask']

        item = {"input_ids": encoded['input_ids']}
        if 'token_type_ids' in encoded:
            item["token_type_ids"] = encoded['token_type_ids']
        item["attention_mask"] = mask
        item["length"] = sum(mask)
        item["target"] = target
        return item
    
def collate_fn(batch):
    """Merge dataset items into batch tensors, trimmed to the longest sample.

    Every sequence field is cut to the largest ``length`` found in the batch
    before being stacked into a tensor.

    Args:
        batch: List of dicts as returned by ``CLRP_Dataset.__getitem__``.

    Returns:
        Dict with ``all_input_ids``, ``all_attention_mask``, ``all_targets``
        (float32) and, when the items carry them, ``all_token_type_ids``.
    """
    longest = max([sample['length'] for sample in batch])

    def _stack(field):
        # Trim each padded sequence to the batch-wide maximum, then stack.
        return torch.tensor([sample[field][:longest] for sample in batch])

    collated = {
        "all_input_ids": _stack('input_ids'),
        "all_attention_mask": _stack('attention_mask'),
    }
    if 'token_type_ids' in batch[0]:
        collated["all_token_type_ids"] = _stack('token_type_ids')
    collated["all_targets"] = torch.tensor([sample["target"] for sample in batch]).float()
    return collated
    
# Load the tokenizer that matches the configured checkpoint and expose it on the config.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name_or_path)
cfg.tokenizer = tokenizer

# Training split: shuffled batches of cfg.batch_size.
ds_train = CLRP_Dataset(data=train_data, tokenizer=tokenizer, max_seq_length=cfg.max_seq_length)
dl_train = DataLoader(
    ds_train,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=0,
)
print('len of ds_train: ', len(ds_train), 'len of dl_train: ', len(dl_train))

# Validation split: fixed order, twice the training batch size.
ds_valid = CLRP_Dataset(data=valid_data, tokenizer=tokenizer, max_seq_length=cfg.max_seq_length)
dl_valid = DataLoader(
    ds_valid,
    batch_size=2*cfg.batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=0,
)
print('len of ds_valid: ', len(ds_valid), 'len of dl_valid: ', len(dl_valid))

print('ok')

In [None]:
class CLRP_model_V1(nn.Module):
    """Regressor using mean+max pooling over token states and a two-layer head.

    Args:
        pretrained_model_path: Name or path passed to ``AutoModel.from_pretrained``.
        embedding_dim: Width of the hidden layer of the regression head.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        # Take the encoder width from its config rather than guessing it from
        # the substring 'large' in the path, which fails for other model sizes
        # and for directory names that merely contain 'large'.
        hidden_size = self.bert.config.hidden_size
        self.drop_out = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size*2, embedding_dim)
        self.activation1 = nn.ReLU()
        self.fc2 = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per input sequence as a tensor of shape ``(batch,)``.

        When ``attention_mask`` is given, padded positions are excluded from
        both the mean and the max pooling, so the result does not depend on
        how much padding the batch happens to contain.
        """
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)
        last_hidden_state = self.drop_out(output['last_hidden_state'])

        if attention_mask is None:
            # No mask supplied: every position counts as a real token.
            seq_avg = torch.mean(last_hidden_state, dim=1)
            seq_max = torch.max(last_hidden_state, dim=1)[0]
        else:
            keep = attention_mask.unsqueeze(-1).to(torch.bool)
            keep_f = keep.to(last_hidden_state.dtype)
            # clamp avoids a division by zero for a fully masked row.
            token_count = keep_f.sum(dim=1).clamp(min=1)
            seq_avg = (last_hidden_state * keep_f).sum(dim=1) / token_count
            # Padded positions get the smallest finite value so they never win the max.
            lowest = torch.finfo(last_hidden_state.dtype).min
            seq_max = last_hidden_state.masked_fill(~keep, lowest).max(dim=1)[0]

        concat_out = torch.cat((seq_avg, seq_max), dim=1)
        preds = self.fc2(self.activation1(self.fc1(concat_out)))
        # Critical: drop only the trailing feature dimension so the shape is
        # (batch,) and lines up with the targets in the loss. A second squeeze
        # would also remove the batch dimension when the batch holds one sample.
        return preds.squeeze(-1)
    
class CLRP_model_V2(nn.Module):
    """Regressor that maps the encoder's pooled output to a score with one linear layer.

    Args:
        pretrained_model_path: Name or path passed to ``AutoModel.from_pretrained``.
        embedding_dim: Unused by this variant; kept so both model variants can
            be constructed with the same arguments.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        self.drop_out = nn.Dropout(0.1)
        # Take the encoder width from its config rather than guessing it from
        # the substring 'large' in the path.
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per input sequence as a tensor of shape ``(batch,)``."""
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)

        pooled_output = self.drop_out(output['pooler_output'])
        preds = self.fc1(pooled_output)
        # Critical: drop only the trailing feature dimension so the shape is
        # (batch,) and lines up with the targets in the loss. A second squeeze
        # would also remove the batch dimension when the batch holds one sample.
        return preds.squeeze(-1)
    
# Pick the architecture variant, build it from the configured checkpoint and
# place it on the configured device (Module.to returns the module itself).
CLRP_model = CLRP_model_V2
model = CLRP_model(cfg.model_name_or_path).to(cfg.device)

print('model is ', model)

# Total number of parameters, and the subset that receives gradients.
model_param_num = sum(weight.numel() for weight in model.parameters())
model_trainable_param_num = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ', 
      model_trainable_param_num)

print('ok')

In [None]:
# Display the first five rows of the validation split.
valid_data.head(5)

In [None]:
def evaluate(eval_iter, model, criterion, train_part=False):
    """Compute the RMSE of ``model`` over every batch of ``eval_iter``.

    The model is put in eval mode for the duration of the call and then
    returned to whatever mode it was in before. Without this, a mid-epoch
    evaluation would leave dropout disabled for the rest of the epoch.

    Args:
        eval_iter: Iterable of batches shaped like the output of ``collate_fn``
            (``all_input_ids``, ``all_attention_mask``, ``all_targets`` and
            optionally ``all_token_type_ids``).
        model: Module called with ``input_ids``/``attention_mask``
            (and ``token_type_ids`` when present in the batch).
        criterion: Unused; kept so existing call sites keep working.
        train_part: Unused; kept so existing call sites keep working.

    Returns:
        Root mean squared error between all targets and all predictions.
    """
    was_training = model.training
    model.eval()

    predictions_lst, targets_lst = [], []
    try:
        with torch.no_grad():
            for batch in eval_iter:
                batch = {key: value.to(cfg.device) for key, value in batch.items()}
                model_inputs = {'input_ids': batch['all_input_ids'],
                                'attention_mask': batch['all_attention_mask']}
                if 'all_token_type_ids' in batch:
                    model_inputs['token_type_ids'] = batch['all_token_type_ids']
                predictions = model(**model_inputs)

                predictions_lst.extend(predictions.cpu().numpy().ravel())
                targets_lst.extend(batch['all_targets'].cpu().numpy().ravel())
    finally:
        # Put the model back in the caller's mode (train or eval).
        model.train(was_training)

    assert len(targets_lst)==len(predictions_lst), 'length should be equal'

    # Take the root explicitly: the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    RMSE_val = np.sqrt(mean_squared_error(targets_lst, predictions_lst))
    return RMSE_val

def train(train_iter, test_iter, model, optimizer, criterion, lr_scheduler=None):
    """Run one training epoch and return the RMSE over the batches seen in it.

    Reads and updates the module-level ``global_step_num``,
    ``global_best_valid_loss`` and ``valid_loss_history``. When
    ``cfg.mid_eval`` is set, the model is evaluated on ``test_iter`` every
    ``cfg.mid_eval_step_num`` optimizer steps and saved to ``best_model.pth``
    whenever the validation RMSE improves.

    Args:
        train_iter: Iterable of training batches shaped like ``collate_fn`` output.
        test_iter: Iterable of validation batches used for mid-epoch evaluation.
        model: Module being optimized.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, targets)``.
        lr_scheduler: Optional scheduler stepped once per batch.

    Returns:
        RMSE between the targets and the predictions made during this epoch
        (collected while the weights were still being updated).
    """
    global cfg, global_step_num, global_best_valid_loss, valid_loss_history
    model.train()

    predictions_lst, targets_lst = [], []
    for batch in train_iter:
        global_step_num += 1
        batch = {key: value.to(cfg.device) for key, value in batch.items()}
        model_inputs = {'input_ids': batch['all_input_ids'],
                        'attention_mask': batch['all_attention_mask']}
        if 'all_token_type_ids' in batch:
            model_inputs['token_type_ids'] = batch['all_token_type_ids']
        predictions = model(**model_inputs)

        loss = criterion(predictions, batch['all_targets'])
        model.zero_grad()
        loss.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()

        predictions_lst.extend(predictions.detach().cpu().numpy().ravel())
        targets_lst.extend(batch['all_targets'].cpu().numpy().ravel())

        if cfg.mid_eval and global_step_num%cfg.mid_eval_step_num==0:
            valid_loss = evaluate(test_iter, model, criterion)
            # Evaluation switches the model to eval mode; re-enable train mode
            # so the remaining batches of the epoch still use dropout.
            model.train()
            valid_loss_history.append(valid_loss)
            print(f'mid eval, global_step_num: {global_step_num}, valid_loss: {valid_loss:.7f}')
            if valid_loss < global_best_valid_loss:
                global_best_valid_loss = valid_loss
                print(f'get new valid_loss: {valid_loss: .7f}, saving model now!')
                torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, "best_model.pth"))

    assert len(targets_lst)==len(predictions_lst), 'length should be equal'
    # Take the root explicitly: the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    RMSE_val = np.sqrt(mean_squared_error(targets_lst, predictions_lst))

    return RMSE_val

print('ok')

In [None]:
# Number of epochs, and the global optimizer-step counter that train() increments.
EPOCH_NUM = 3
global_step_num = 0
#optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=1e-4)
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
lr_scheduler = None
if cfg.use_lr_scheduler:
#     lr_scheduler = get_constant_schedule_with_warmup(optimizer, 100)
    # Linear schedule; warmup spans the first 10% of all planned optimizer steps.
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, 
                       num_warmup_steps = int(0.1*EPOCH_NUM*len(dl_train)),
                       num_training_steps = EPOCH_NUM*len(dl_train))
# train_loss_history gets one entry per epoch; valid_loss_history also receives
# the mid-epoch evaluations appended inside train().
train_loss_history = []
valid_loss_history = []
# Best-so-far trackers start at a large sentinel so the first result always improves on them.
global_best_train_loss = 1e9
global_best_valid_loss = 1e9
criterion = nn.MSELoss().to(cfg.device)

for epoch_n in range(EPOCH_NUM):
    print('epoch_n: ', epoch_n)
    # Stop starting new epochs once 7.5 hours have elapsed since the notebook began.
    if time.time() - global_start_t > 60*60*7.5:
        break
    train_loss = train(dl_train, dl_valid, model, optimizer, criterion, lr_scheduler=lr_scheduler)
    valid_loss = evaluate(dl_valid, model, criterion)
    
    train_loss_history.append(train_loss)
    valid_loss_history.append(valid_loss)
    
    if train_loss < global_best_train_loss:
        global_best_train_loss = train_loss
    # End-of-epoch checkpoint: overwrites the same file used by the mid-epoch saves in train().
    if valid_loss < global_best_valid_loss:
        global_best_valid_loss = valid_loss
        print(f'after epoch {epoch_n}, global_step_num: {global_step_num} get new best_valid_loss: {valid_loss:.5f}, save the model now!')
        torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_model.pth'))
        
print(f'ok, global_best_train_loss: {global_best_train_loss:.7f} global_best_valid_loss: {global_best_valid_loss:.7f}')

In [None]:
# epoch_n:  0
# mid eval, global_step_num: 50, valid_loss: 0.9781250
# get new valid_loss:  0.9781250, saving model now!
# mid eval, global_step_num: 100, valid_loss: 0.6129924
# get new valid_loss:  0.6129924, saving model now!
# mid eval, global_step_num: 150, valid_loss: 0.6577457
# mid eval, global_step_num: 200, valid_loss: 0.5382791
# get new valid_loss:  0.5382791, saving model now!
# mid eval, global_step_num: 250, valid_loss: 0.5990030
# after epoch 0, global_step_num: 284 get new best_valid_loss: 0.50697, save the model now!
# epoch_n:  1
# mid eval, global_step_num: 300, valid_loss: 0.5512493
# mid eval, global_step_num: 350, valid_loss: 0.4743678
# get new valid_loss:  0.4743678, saving model now!
# mid eval, global_step_num: 400, valid_loss: 0.5294796
# mid eval, global_step_num: 450, valid_loss: 0.4888377
# mid eval, global_step_num: 500, valid_loss: 0.5500329
# mid eval, global_step_num: 550, valid_loss: 0.4636581
# get new valid_loss:  0.4636581, saving model now!
# after epoch 1, global_step_num: 568 get new best_valid_loss: 0.46200, save the model now!
# epoch_n:  2
# mid eval, global_step_num: 600, valid_loss: 0.5433109
# mid eval, global_step_num: 650, valid_loss: 0.4648798
# mid eval, global_step_num: 700, valid_loss: 0.4613321
# get new valid_loss:  0.4613321, saving model now!
# mid eval, global_step_num: 750, valid_loss: 0.4566262
# get new valid_loss:  0.4566262, saving model now!
# mid eval, global_step_num: 800, valid_loss: 0.4564575
# get new valid_loss:  0.4564575, saving model now!
# mid eval, global_step_num: 850, valid_loss: 0.4538009
# get new valid_loss:  0.4538009, saving model now!
# after epoch 2, global_step_num: 852 get new best_valid_loss: 0.45379, save the model now!
# ok, global_best_train_loss: 0.3171585 global_best_valid_loss: 0.4537924

In [None]:
# lr=1.5e-5  weight_decay=1e-4
# epoch_n:  0
# mid eval, global_step_num: 50, valid_loss: 0.9391119
# get new valid_loss:  0.9391119, saving model now!
# mid eval, global_step_num: 100, valid_loss: 0.7008228
# get new valid_loss:  0.7008228, saving model now!
# mid eval, global_step_num: 150, valid_loss: 0.5801484
# get new valid_loss:  0.5801484, saving model now!
# mid eval, global_step_num: 200, valid_loss: 0.5481895
# get new valid_loss:  0.5481895, saving model now!
# mid eval, global_step_num: 250, valid_loss: 0.5780302
# after epoch 0, global_step_num: 284 get new best_valid_loss: 0.52035, save the model now!
# epoch_n:  1
# mid eval, global_step_num: 300, valid_loss: 0.4985234
# get new valid_loss:  0.4985234, saving model now!
# mid eval, global_step_num: 350, valid_loss: 0.4815534
# get new valid_loss:  0.4815534, saving model now!
# mid eval, global_step_num: 400, valid_loss: 0.5474174
# mid eval, global_step_num: 450, valid_loss: 0.4902881
# mid eval, global_step_num: 500, valid_loss: 0.5117764
# mid eval, global_step_num: 550, valid_loss: 0.4805826
# get new valid_loss:  0.4805826, saving model now!
# after epoch 1, global_step_num: 568 get new best_valid_loss: 0.47472, save the model now!
# epoch_n:  2
# mid eval, global_step_num: 600, valid_loss: 0.4689728
# get new valid_loss:  0.4689728, saving model now!
# mid eval, global_step_num: 650, valid_loss: 0.4680705
# get new valid_loss:  0.4680705, saving model now!
# mid eval, global_step_num: 700, valid_loss: 0.4656835
# get new valid_loss:  0.4656835, saving model now!
# mid eval, global_step_num: 750, valid_loss: 0.4662796
# mid eval, global_step_num: 800, valid_loss: 0.4656795
# get new valid_loss:  0.4656795, saving model now!
# mid eval, global_step_num: 850, valid_loss: 0.4609354
# get new valid_loss:  0.4609354, saving model now!
# after epoch 2, global_step_num: 852 get new best_valid_loss: 0.46093, save the model now!
# ok, global_best_train_loss: 0.3325094 global_best_valid_loss: 0.4609301


# lr=2.0e-5  weight_decay=1e-1
# epoch_n:  0
# mid eval, global_step_num: 50, valid_loss: 0.9731567
# get new valid_loss:  0.9731567, saving model now!
# mid eval, global_step_num: 100, valid_loss: 0.8206061
# get new valid_loss:  0.8206061, saving model now!
# mid eval, global_step_num: 150, valid_loss: 0.6018713
# get new valid_loss:  0.6018713, saving model now!
# mid eval, global_step_num: 200, valid_loss: 0.5351610
# get new valid_loss:  0.5351610, saving model now!
# mid eval, global_step_num: 250, valid_loss: 0.5814825
# after epoch 0, global_step_num: 284 get new best_valid_loss: 0.51046, save the model now!
# epoch_n:  1
# mid eval, global_step_num: 300, valid_loss: 0.4970206
# get new valid_loss:  0.4970206, saving model now!
# mid eval, global_step_num: 350, valid_loss: 0.4771110
# get new valid_loss:  0.4771110, saving model now!
# mid eval, global_step_num: 400, valid_loss: 0.5300065
# mid eval, global_step_num: 450, valid_loss: 0.5006436
# mid eval, global_step_num: 500, valid_loss: 0.5250693
# mid eval, global_step_num: 550, valid_loss: 0.4652649
# get new valid_loss:  0.4652649, saving model now!
# epoch_n:  2
# mid eval, global_step_num: 600, valid_loss: 0.4700000
# mid eval, global_step_num: 650, valid_loss: 0.4708829
# mid eval, global_step_num: 700, valid_loss: 0.4619811
# get new valid_loss:  0.4619811, saving model now!
# mid eval, global_step_num: 750, valid_loss: 0.4615610
# get new valid_loss:  0.4615610, saving model now!
# mid eval, global_step_num: 800, valid_loss: 0.4591095
# get new valid_loss:  0.4591095, saving model now!
# mid eval, global_step_num: 850, valid_loss: 0.4576899
# get new valid_loss:  0.4576899, saving model now!
# after epoch 2, global_step_num: 852 get new best_valid_loss: 0.45769, save the model now!
# ok, global_best_train_loss: 0.3082862 global_best_valid_loss: 0.4576878

In [None]:
# Rebuild the architecture and restore the best checkpoint written during training,
# then confirm that its validation RMSE matches the best value recorded.
model = CLRP_model(cfg.model_name_or_path)
model.to(cfg.device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on GPU can also be restored in a CPU-only session.
model.load_state_dict(torch.load(os.path.join(cfg.model_output_dir, 'best_model.pth'),
                                 map_location=cfg.device))
valid_loss = evaluate(dl_valid, model, criterion)

print(f'final best model valid_loss: {valid_loss:.7f}, global_best_valid_loss: {global_best_valid_loss:.7f}')

In [None]:
# Validation RMSE recorded at every evaluation (mid-epoch and end-of-epoch),
# drawn through the Axes/Figure objects instead of the pyplot state machine.
figure, axes = plt.subplots(figsize=(15, 6))

axes.plot(list(range(len(valid_loss_history))), valid_loss_history, color='red', label='Valid_RMSE_Loss')
axes.set_title('valid_loss History')
axes.set_xlabel('step_num')
axes.set_ylabel('valid_loss')
axes.legend()
figure.savefig('loss_history_valid.png')
plt.show()

In [None]:
# Training RMSE per epoch. train_loss_history receives exactly one value per
# epoch, so the x-axis is labelled 'epoch' rather than 'step_num'.
figure, axes = plt.subplots(figsize=(15, 6))

axes.plot(list(range(len(train_loss_history))), train_loss_history, color='blue', label='Train_RMSE_Loss')
axes.set_title('train_loss History')
axes.set_xlabel('epoch')
axes.set_ylabel('train_loss')
axes.legend()
figure.savefig('loss_history_train.png')
plt.show()

print('finished, total cost time: ', time.time()-global_start_t)