In [None]:
# 参考： https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain

import sys
import os
import time
import random
import re
from math import sqrt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
from transformers import (AutoModel, AutoTokenizer, 
     get_linear_schedule_with_warmup, AdamW)

def seed_everything(seed=42):
    """Seed every random number generator this notebook touches.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's ``random``
    module, NumPy's global generator and PyTorch (CPU, current CUDA device and
    all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    torch.backends.cudnn.deterministic = True
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The seeders are independent of each other, so they can be applied in a loop.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

class CFG:
    """Static hyper-parameters, paths and device selection for the fine-tuning run."""

    # --- checkpoint / output -------------------------------------------------
    # Checkpoint to fine-tune. Other checkpoints that were previously listed here
    # as commented-out alternatives:
    #   'roberta-base'
    #   'bert-base-cased'
    #   '../input/huggingface-bert-variants/bert-large-cased/bert-large-cased'
    #   '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
    #   '../input/roberta-transformers-pytorch/roberta-large'
    model_name_or_path = '../input/roberta-transformers-pytorch/roberta-base'
    model_output_dir = './'

    # --- data / batching -----------------------------------------------------
    max_seq_length = 512
    batch_size = 8
    CV_fold_num = 5
    random_seed = 2021

    # --- optimisation --------------------------------------------------------
    learning_rate = 2.5e-5
    weight_decay = 1e-1
    use_lr_scheduler = True

    # --- validation inside an epoch ------------------------------------------
    mid_eval = True
    mid_eval_step_num = 50

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
# Single shared configuration instance; later cells read it through the global name `cfg`.
cfg = CFG()
# Seed all random number generators before any shuffling or model construction happens.
seed_everything(cfg.random_seed)

# When True, the data-loading cell keeps only the first 800 shuffled rows.
QUICK_CHECK = False

# Wall-clock start of the run; later cells use it for the time-limit check and the total-runtime report.
global_start_t = time.time()
print('ok')

In [None]:
# Load the competition files; only the text and the regression target are kept for training.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
train_data = train_data[['excerpt', 'target']]
print('000, train_data.shape: ', train_data.shape, 'test_data.shape: ', test_data.shape)

# Shuffle with an explicit seed. Without `random_state` the order depends on how much of
# the global NumPy RNG stream has already been consumed, so re-running this cell on its
# own would silently change the row order and therefore the fold membership.
data = train_data.sample(frac=1.0, random_state=cfg.random_seed).reset_index(drop=True)
if QUICK_CHECK:
    # Work on an independent copy so adding the `bins` column below never writes into a view.
    data = data.iloc[:800].copy()

# Bucket the continuous target (1 + log2(n) bins, Sturges-style) so the fold splitter
# can stratify on it.
num_bins = int(np.floor(1 + np.log2(len(data))))
data['bins'] = pd.cut(data['target'], bins=num_bins, labels=False)
bins = data['bins'].to_numpy()

print('111, data.shape: ', data.shape)

In [None]:
class CLRP_Dataset(Dataset):
    """Dataset that tokenizes one excerpt per item, padded to a fixed length.

    Args:
        data: frame providing an ``excerpt`` and a ``target`` column.
        tokenizer: callable invoked as
            ``tokenizer(text, padding="max_length", max_length=..., truncation=True)``
            and returning a mapping with ``input_ids`` and ``attention_mask``
            (and optionally ``token_type_ids``).
        max_seq_length: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_seq_length):
        self.df = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded excerpt at position ``index`` together with its target.

        The returned dict holds the tokenizer outputs unchanged, plus ``length``
        (the sum of the attention mask) and ``target``.
        """
        row = self.df.iloc[index]
        encoded = self.tokenizer(row['excerpt'],
                                 padding="max_length",
                                 max_length=self.max_seq_length,
                                 truncation=True)
        mask = encoded['attention_mask']

        # Build the item incrementally; `token_type_ids` is only present for
        # tokenizers that produce it.
        item = {"input_ids": encoded['input_ids']}
        if 'token_type_ids' in encoded:
            item["token_type_ids"] = encoded['token_type_ids']
        item["attention_mask"] = mask
        item["length"] = sum(mask)
        item["target"] = row['target']
        return item
    
def collate_fn(batch):
    """Collate dataset items into tensors, trimmed to the longest item in the batch.

    Every per-token list is cut to the largest ``length`` found in ``batch``
    before being stacked, so a batch is only as wide as its longest sequence.

    Args:
        batch: list of item dicts with ``input_ids``, ``attention_mask``,
            ``length``, ``target`` and optionally ``token_type_ids``.

    Returns:
        Dict with ``all_input_ids``, ``all_attention_mask``, ``all_targets``
        (float tensor) and, when the items carry them, ``all_token_type_ids``.
    """
    longest = max(item['length'] for item in batch)

    def _stack(field):
        # Trim each sequence to the batch-wide maximum, then stack into one tensor.
        return torch.tensor([item[field][:longest] for item in batch])

    collated = {
        "all_input_ids": _stack('input_ids'),
        "all_attention_mask": _stack('attention_mask'),
    }
    if 'token_type_ids' in batch[0]:
        collated["all_token_type_ids"] = _stack('token_type_ids')
    collated["all_targets"] = torch.tensor([item["target"] for item in batch]).float()
    return collated
    
# Load the tokenizer for the configured checkpoint and attach it to the shared config
# object, which is where get_dataloader() reads it from.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name_or_path)
cfg.tokenizer = tokenizer

def get_dataloader(train_data, valid_data):
    """Create the training and validation DataLoaders for one fold.

    The training loader shuffles and uses ``cfg.batch_size``; the validation
    loader keeps the row order and uses twice that batch size. Both read the
    tokenizer and the maximum sequence length from the global ``cfg``.

    Args:
        train_data: frame with the training rows of the fold.
        valid_data: frame with the validation rows of the fold.

    Returns:
        Tuple ``(dl_train, dl_valid)``.
    """
    def _make_loader(frame, batch_size, shuffle):
        # One dataset + loader pair; both loaders share tokenizer, collate function and workers.
        dataset = CLRP_Dataset(frame, cfg.tokenizer, cfg.max_seq_length)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                            collate_fn=collate_fn, num_workers=0)
        return dataset, loader

    ds_train, dl_train = _make_loader(train_data, cfg.batch_size, True)
    print('len of ds_train: ', len(ds_train), 'len of dl_train: ', len(dl_train))

    ds_valid, dl_valid = _make_loader(valid_data, 2*cfg.batch_size, False)
    print('len of ds_valid: ', len(ds_valid), 'len of dl_valid: ', len(dl_valid))

    return dl_train, dl_valid

print('ok')

In [None]:
class CLRP_model_V1(nn.Module):
    """Encoder followed by a two-layer head on mean- and max-pooled token states.

    Args:
        pretrained_model_path: name or path handed to ``AutoModel.from_pretrained``.
        embedding_dim: width of the hidden layer between the pooled features
            and the scalar output.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        # Take the encoder width from its config instead of guessing it from the
        # substring 'large' in the path, which breaks for any other naming scheme.
        hidden_size = self.bert.config.hidden_size
        self.drop_out = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size*2, embedding_dim)
        self.activation1 = nn.ReLU()
        self.fc2 = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per sequence as a 1-D tensor of length ``batch``.

        Padding positions (``attention_mask == 0``) are excluded from both the
        mean and the max pooling. When no mask is given, every position counts.
        """
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)
        last_hidden_state = self.drop_out(output['last_hidden_state'])

        if attention_mask is None:
            valid = torch.ones(last_hidden_state.shape[:2], dtype=torch.bool,
                               device=last_hidden_state.device)
        else:
            valid = attention_mask.bool()
        valid = valid.unsqueeze(-1)  # broadcast the per-token flag over the hidden dimension

        # Masked mean: sum over real tokens only; clamp avoids division by zero
        # for a row that has no attended token at all.
        token_count = valid.sum(dim=1).clamp(min=1)
        seq_avg = (last_hidden_state * valid).sum(dim=1) / token_count

        # Masked max: push padded positions to the lowest finite value so they never win.
        lowest = torch.finfo(last_hidden_state.dtype).min
        seq_max = last_hidden_state.masked_fill(~valid, lowest).max(dim=1)[0]

        concat_out = torch.cat((seq_avg, seq_max), dim=1)
        preds = self.fc2(self.activation1(self.fc1(concat_out)))
        # Drop only the trailing size-1 feature axis: (batch, 1) -> (batch,).
        # This keeps predictions aligned with (batch,) targets (otherwise an MSE loss
        # would broadcast to (batch, batch)). A second squeeze would also remove the
        # batch axis whenever the batch holds a single sample.
        preds = preds.squeeze(-1)

        return preds
    
class CLRP_model_V2(nn.Module):
    """Encoder followed by a single linear layer on the encoder's pooled output.

    Args:
        pretrained_model_path: name or path handed to ``AutoModel.from_pretrained``.
        embedding_dim: not used by this variant; kept so both model variants
            share the same constructor signature.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        self.drop_out = nn.Dropout(0.1)
        # Take the encoder width from its config instead of guessing it from the
        # substring 'large' in the path, which breaks for any other naming scheme.
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per sequence as a 1-D tensor of length ``batch``."""
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)

        pooled_output = self.drop_out(output['pooler_output'])
        preds = self.fc1(pooled_output)
        # Drop only the trailing size-1 feature axis: (batch, 1) -> (batch,).
        # This keeps predictions aligned with (batch,) targets (otherwise an MSE loss
        # would broadcast to (batch, batch)). A second squeeze would also remove the
        # batch axis whenever the batch holds a single sample.
        preds = preds.squeeze(-1)

        return preds
    
# Select which model variant the rest of the notebook instantiates.
CLRP_model = CLRP_model_V2
    
# Build one throw-away instance only to report its parameter counts.
model = CLRP_model(cfg.model_name_or_path)
model.to(cfg.device)

model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ', 
      model_trainable_param_num)
# Remove the name again; each fold constructs its own fresh model later.
del model
            
print('ok')

In [None]:
def evaluate(eval_iter, model, criterion, train_part=False):
    """Compute the RMSE of ``model`` over all batches of ``eval_iter``.

    The model is switched to eval mode for the duration of the call and then
    put back into whatever mode it was in before. Previously it was left in
    eval mode, so a training loop that called this function mid-epoch kept
    training with dropout disabled.

    Args:
        eval_iter: iterable of batch dicts with ``all_input_ids``,
            ``all_attention_mask``, ``all_targets`` and optionally
            ``all_token_type_ids`` (tensors).
        model: callable module returning one prediction per sequence.
        criterion: unused; kept so existing call sites stay valid.
        train_part: unused; kept so existing call sites stay valid.

    Returns:
        Root mean squared error between predictions and targets, as a float.

    Raises:
        ValueError: if ``eval_iter`` yields no samples.
    """
    was_training = model.training
    model.eval()

    predictions_lst, targets_lst = [], []
    try:
        with torch.no_grad():
            for batch in eval_iter:
                batch = {key: value.to(cfg.device) for key, value in batch.items()}
                model_inputs = {
                    'input_ids': batch['all_input_ids'],
                    'attention_mask': batch['all_attention_mask'],
                }
                # Only forward token type ids when the tokenizer produced them.
                if 'all_token_type_ids' in batch:
                    model_inputs['token_type_ids'] = batch['all_token_type_ids']
                predictions = model(**model_inputs)
                predictions_lst.extend(predictions.cpu().numpy().ravel())
                targets_lst.extend(batch['all_targets'].cpu().numpy().ravel())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    assert len(targets_lst)==len(predictions_lst), 'length should be equal'
    if not targets_lst:
        raise ValueError('evaluate() received no samples')

    # RMSE computed directly with NumPy: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated and absent from recent releases.
    errors = np.asarray(predictions_lst, dtype=np.float64) - np.asarray(targets_lst, dtype=np.float64)
    RMSE_val = float(np.sqrt(np.mean(np.square(errors))))
    return RMSE_val

def train(fold_num, train_iter, test_iter, model, optimizer, criterion, lr_scheduler=None):
    """Train ``model`` for one pass over ``train_iter``.

    Increments the global step counter once per batch. When ``cfg.mid_eval`` is
    set, the model is scored on ``test_iter`` every ``cfg.mid_eval_step_num``
    global steps and its weights are saved whenever the score improves on
    ``global_best_valid_loss``.

    Args:
        fold_num: fold index, used only in the checkpoint file name.
        train_iter: iterable of training batch dicts (tensors keyed
            ``all_input_ids``, ``all_attention_mask``, ``all_targets`` and
            optionally ``all_token_type_ids``).
        test_iter: iterable of validation batches passed to ``evaluate``.
        model: module being trained.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, targets)``.
        lr_scheduler: optional scheduler stepped once per batch.

    Returns:
        RMSE (float) between the predictions made during this pass and their targets.

    Raises:
        ValueError: if ``train_iter`` yields no samples.
    """
    global global_step_num, global_best_valid_loss
    model.train()

    predictions_lst, targets_lst = [], []
    for batch in train_iter:
        global_step_num += 1
        batch = {key: value.to(cfg.device) for key, value in batch.items()}
        model_inputs = {
            'input_ids': batch['all_input_ids'],
            'attention_mask': batch['all_attention_mask'],
        }
        # Only forward token type ids when the tokenizer produced them.
        if 'all_token_type_ids' in batch:
            model_inputs['token_type_ids'] = batch['all_token_type_ids']
        predictions = model(**model_inputs)

        loss = criterion(predictions, batch['all_targets'])
        model.zero_grad()
        loss.backward()
        optimizer.step()
        # Clear gradients again so none are carried into the optional evaluation below.
        model.zero_grad()
        if lr_scheduler is not None:
            lr_scheduler.step()

        predictions_lst.extend(predictions.detach().cpu().numpy().ravel())
        targets_lst.extend(batch['all_targets'].cpu().numpy().ravel())

        if cfg.mid_eval and global_step_num%cfg.mid_eval_step_num==0:
            valid_loss = evaluate(test_iter, model, criterion)
            # evaluate() switches the model to eval mode; re-enter train mode explicitly
            # so the rest of the epoch does not run with dropout disabled.
            model.train()
            print(f'mid eval, global_step_num: {global_step_num}, valid_loss: {valid_loss:.7f}')
            if valid_loss < global_best_valid_loss:
                global_best_valid_loss = valid_loss
                print(f'get new valid_loss: {valid_loss: .7f}, saving model now!')
                torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, f'best_model_fold{fold_num}.pth'))

    assert len(targets_lst)==len(predictions_lst), 'length should be equal'
    if not targets_lst:
        raise ValueError('train() received no samples')

    # RMSE computed directly with NumPy: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated and absent from recent releases.
    errors = np.asarray(predictions_lst, dtype=np.float64) - np.asarray(targets_lst, dtype=np.float64)
    RMSE_val = float(np.sqrt(np.mean(np.square(errors))))

    return RMSE_val

print('ok')

In [None]:
def run_whole_train(fold_num, dl_train, dl_valid):
    """Fine-tune a fresh model on one fold and return its best validation RMSE.

    Resets the global step counter and the global best train/validation losses,
    then trains for up to ``EPOCH_NUM`` epochs. An epoch is not started once more
    than 7.5 hours have passed since ``global_start_t``. The model weights are
    saved whenever the end-of-epoch validation RMSE beats the best value so far
    (``train`` may also save between epochs).

    Args:
        fold_num: fold index, used for the checkpoint file name.
        dl_train: training DataLoader of the fold.
        dl_valid: validation DataLoader of the fold.

    Returns:
        The best validation RMSE recorded for this fold.
    """
    global global_step_num, global_best_train_loss, global_best_valid_loss

    model = CLRP_model(cfg.model_name_or_path)
    model.to(cfg.device)
    criterion = nn.MSELoss()

    global_step_num = 0
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
    if cfg.use_lr_scheduler:
        # Linear warm-up over the first 10% of all optimisation steps, then linear decay.
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.1*EPOCH_NUM*len(dl_train)),
            num_training_steps=EPOCH_NUM*len(dl_train))
    else:
        lr_scheduler = None
    global_best_train_loss = 1e9
    global_best_valid_loss = 1e9

    time_budget_seconds = 60*60*7.5
    checkpoint_path = os.path.join(cfg.model_output_dir, f'best_model_fold{fold_num}.pth')

    for epoch_n in range(EPOCH_NUM):
        print('epoch_n: ', epoch_n)
        if time.time() - global_start_t > time_budget_seconds:
            break
        train_loss = train(fold_num, dl_train, dl_valid, model, optimizer, criterion, lr_scheduler=lr_scheduler)
        valid_loss = evaluate(dl_valid, model, criterion)

        global_best_train_loss = min(global_best_train_loss, train_loss)
        if valid_loss < global_best_valid_loss:
            global_best_valid_loss = valid_loss
            print(f'after epoch {epoch_n}, global_step_num: {global_step_num} get new best_valid_loss: {valid_loss:.5f}, save the model now!')
            torch.save(model.state_dict(), checkpoint_path)

    return global_best_valid_loss

print('ok')

In [None]:
# Number of epochs per fold, plus the module-level counters that train() and
# run_whole_train() update through `global` statements.
EPOCH_NUM = 3
global_step_num = 0
global_best_train_loss = 1e9
global_best_valid_loss = 1e9

# Best validation RMSE of each fold, in fold order.
valid_rmse_lst = []
# Stratify the folds on the binned target computed in the data-loading cell.
kfold = StratifiedKFold(cfg.CV_fold_num, shuffle=True, random_state=cfg.random_seed)
for fold_num, (train_index, valid_index) in enumerate(kfold.split(data, bins)):
# Unstratified alternative, kept for reference:
# kfold = KFold(cfg.CV_fold_num, shuffle=True, random_state=cfg.random_seed)
# for fold_num, (train_index, valid_index) in enumerate(kfold.split(data)):
    # NOTE(review): this rebinds `train_data`, the name the data-loading cell used for the full training frame.
    train_data, valid_data = data.iloc[train_index], data.iloc[valid_index]
    print('****'*20)
    print(f'run fold_num: {fold_num} now! train_data.shape: {train_data.shape} valid_data.shape: {valid_data.shape}')
    print('****'*20)
    dl_train, dl_valid = get_dataloader(train_data, valid_data)
    valid_rmse = run_whole_train(fold_num, dl_train, dl_valid)
    print(f'run fold_num: {fold_num} finished! get valid_rmse: {valid_rmse:7f}')
    valid_rmse_lst.append(valid_rmse)
    # Uncomment the next line to stop after the first fold.
    #break
    
print('ok, final valid_rmse_lst:', valid_rmse_lst, 'avg rmse: ', np.mean(valid_rmse_lst))
# Output recorded from three earlier runs:
# ok, final valid_rmse_lst: [0.5544281, 0.6582969, 0.5821244, 0.6942738, 0.70970976] avg rmse:  0.6397666
# ok, final valid_rmse_lst: [0.64655215, 0.6353341, 0.6530859, 0.64252084, 0.6323978] avg rmse:  0.64197814
# ok, final valid_rmse_lst: [0.6535841, 0.6094322, 0.77323335, 0.6097351, 0.5807594] avg rmse:  0.64534885

In [None]:
# learning_rate = 2.5e-5      weight_decay = 1e-1  torch.optim.AdamW
# ********************************************************************************
# run fold_num: 0 now! train_data.shape: (2267, 3) valid_data.shape: (567, 3)
# ********************************************************************************
# len of ds_train:  2267 len of dl_train:  284
# len of ds_valid:  567 len of dl_valid:  36
# epoch_n:  0
# mid eval, global_step_num: 50, valid_loss: 0.9053377
# get new valid_loss:  0.9053377, saving model now!
# mid eval, global_step_num: 100, valid_loss: 0.6626867
# get new valid_loss:  0.6626867, saving model now!
# mid eval, global_step_num: 150, valid_loss: 0.6542238
# get new valid_loss:  0.6542238, saving model now!
# mid eval, global_step_num: 200, valid_loss: 0.5725907
# get new valid_loss:  0.5725907, saving model now!
# mid eval, global_step_num: 250, valid_loss: 0.5422387
# get new valid_loss:  0.5422387, saving model now!
# epoch_n:  1
# mid eval, global_step_num: 300, valid_loss: 0.5664148
# mid eval, global_step_num: 350, valid_loss: 0.5028599
# get new valid_loss:  0.5028599, saving model now!
# mid eval, global_step_num: 400, valid_loss: 0.5335220
# mid eval, global_step_num: 450, valid_loss: 0.5373504
# mid eval, global_step_num: 500, valid_loss: 0.4964960
# get new valid_loss:  0.4964960, saving model now!
# mid eval, global_step_num: 550, valid_loss: 0.4986220
# after epoch 1, global_step_num: 568 get new best_valid_loss: 0.49409, save the model now!
# epoch_n:  2
# mid eval, global_step_num: 600, valid_loss: 0.5517622
# mid eval, global_step_num: 650, valid_loss: 0.4894424
# get new valid_loss:  0.4894424, saving model now!
# mid eval, global_step_num: 700, valid_loss: 0.4783250
# get new valid_loss:  0.4783250, saving model now!
# mid eval, global_step_num: 750, valid_loss: 0.4814345
# mid eval, global_step_num: 800, valid_loss: 0.4791949
# mid eval, global_step_num: 850, valid_loss: 0.4801228
# run fold_num: 0 finished! get valid_rmse: 0.478325
# ok, final valid_rmse_lst: [0.47832498] avg rmse:  0.47832498

In [None]:
# ********************************************************************************
# run fold_num: 0 now! train_data.shape: (2267, 3) valid_data.shape: (567, 3)
# ********************************************************************************
# len of ds_train:  2267 len of dl_train:  284
# len of ds_valid:  567 len of dl_valid:  36
# epoch_n:  0
# mid eval, global_step_num: 50, valid_loss: 0.9642584
# get new valid_loss:  0.9642584, saving model now!
# mid eval, global_step_num: 100, valid_loss: 0.6496056
# get new valid_loss:  0.6496056, saving model now!
# mid eval, global_step_num: 150, valid_loss: 0.6068502
# get new valid_loss:  0.6068502, saving model now!
# mid eval, global_step_num: 200, valid_loss: 0.6726950
# mid eval, global_step_num: 250, valid_loss: 0.5651695
# get new valid_loss:  0.5651695, saving model now!
# epoch_n:  1
# mid eval, global_step_num: 300, valid_loss: 0.5648718
# get new valid_loss:  0.5648718, saving model now!
# mid eval, global_step_num: 350, valid_loss: 0.5158271
# get new valid_loss:  0.5158271, saving model now!
# mid eval, global_step_num: 400, valid_loss: 0.5241506
# mid eval, global_step_num: 450, valid_loss: 0.5661162
# mid eval, global_step_num: 500, valid_loss: 0.5110499
# get new valid_loss:  0.5110499, saving model now!
# mid eval, global_step_num: 550, valid_loss: 0.5000905
# get new valid_loss:  0.5000905, saving model now!
# epoch_n:  2
# mid eval, global_step_num: 600, valid_loss: 0.6079005
# mid eval, global_step_num: 650, valid_loss: 0.4984870
# get new valid_loss:  0.4984870, saving model now!
# mid eval, global_step_num: 700, valid_loss: 0.4881012
# get new valid_loss:  0.4881012, saving model now!
# mid eval, global_step_num: 750, valid_loss: 0.4873407
# get new valid_loss:  0.4873407, saving model now!
# mid eval, global_step_num: 800, valid_loss: 0.4882061
# mid eval, global_step_num: 850, valid_loss: 0.4905744
# run fold_num: 0 finished! get valid_rmse: 0.487341
# ok, final valid_rmse_lst: [0.48734075] avg rmse:  0.48734075

In [None]:
# ********************************************************************************
# run fold_num: 0 now! train_data.shape: (2267, 3) valid_data.shape: (567, 3)
# ********************************************************************************
# len of ds_train:  2267 len of dl_train:  284
# len of ds_valid:  567 len of dl_valid:  36
# epoch_n:  0
# mid eval, global_step_num: 50, valid_loss: 0.9642584
# get new valid_loss:  0.9642584, saving model now!
# mid eval, global_step_num: 100, valid_loss: 0.7282062
# get new valid_loss:  0.7282062, saving model now!
# mid eval, global_step_num: 150, valid_loss: 0.6355429
# get new valid_loss:  0.6355429, saving model now!
# mid eval, global_step_num: 200, valid_loss: 0.6707624
# mid eval, global_step_num: 250, valid_loss: 0.5698632
# get new valid_loss:  0.5698632, saving model now!
# epoch_n:  1
# mid eval, global_step_num: 300, valid_loss: 0.5696163
# get new valid_loss:  0.5696163, saving model now!
# mid eval, global_step_num: 350, valid_loss: 0.5831050
# mid eval, global_step_num: 400, valid_loss: 0.5705826
# mid eval, global_step_num: 450, valid_loss: 0.5149580
# get new valid_loss:  0.5149580, saving model now!
# mid eval, global_step_num: 500, valid_loss: 0.6398769
# mid eval, global_step_num: 550, valid_loss: 0.6121566
# after epoch 1, global_step_num: 568 get new best_valid_loss: 0.50032, save the model now!
# epoch_n:  2
# mid eval, global_step_num: 600, valid_loss: 0.5818124
# mid eval, global_step_num: 650, valid_loss: 0.5793452
# mid eval, global_step_num: 700, valid_loss: 0.5918077
# mid eval, global_step_num: 750, valid_loss: 0.5861278
# mid eval, global_step_num: 800, valid_loss: 0.5341970
# mid eval, global_step_num: 850, valid_loss: 0.5541372
# run fold_num: 0 finished! get valid_rmse: 0.500319
# ok, final valid_rmse_lst: [0.5003194] avg rmse:  0.5003194

In [None]:
# Sanity check: reload the checkpoint saved for the most recently trained fold and
# re-score it on that same fold's validation loader.
model = CLRP_model(cfg.model_name_or_path)
model.to(cfg.device)
criterion = nn.MSELoss()

# `dl_valid`, `global_best_valid_loss` and `fold_num` all still refer to the last fold
# the cross-validation loop ran, so the checkpoint must come from that fold too.
# Loading fold 0's weights here would score them on rows fold 0 was trained on whenever
# more than one fold has run.
best_model_path = os.path.join(cfg.model_output_dir, f'best_model_fold{fold_num}.pth')
# map_location lets a checkpoint written on a GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load(best_model_path, map_location=cfg.device))
valid_loss = evaluate(dl_valid, model, criterion)

print(f'final best model valid_loss: {valid_loss:.7f}, global_best_valid_loss: {global_best_valid_loss:.7f}')
print('finished, total cost time: ', time.time()-global_start_t)