In [None]:
# Reference: https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain

import sys
import os
import time
import random
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
from transformers import (AutoModel, AutoModelForMaskedLM, AutoTokenizer, 
    LineByLineTextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, get_linear_schedule_with_warmup)

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Exports the seed as ``PYTHONHASHSEED``, seeds the ``random`` module,
    NumPy and PyTorch (CPU, current CUDA device, all CUDA devices), and
    switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same calls, same order as a hand-written sequence; each takes the seed as
    # its only argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

class CFG:
    """Static settings for this notebook; read through the ``cfg`` instance."""

    # --- runtime / reproducibility ---
    random_seed = 2021
    # Resolved once, when the class body executes: GPU if available, else CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # --- backbone checkpoint ---
    # Alternative backbone:
    #   '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased/'
    model_name_or_path = '../input/roberta-transformers-pytorch/roberta-base'
    model_output_dir = './'

    # --- batching / tokenization ---
    batch_size = 8
    max_seq_length = 512
    seq_length = 100

    # --- post-processing ---
    clip_by_train_range = False

    # --- training-time knobs (not referenced by the cells visible here) ---
    learning_rate = 2e-5
    use_lr_scheduler = True
    mid_eval = True
    mid_eval_step_num = 50
    
# Instantiate the settings and seed all RNGs before any model or data code runs.
cfg = CFG()
seed_everything(cfg.random_seed)

# Wall-clock start; the final cell subtracts it to report total runtime.
global_start_t = time.time()
print('ok')

In [None]:
# Load the competition train and test tables from the Kaggle input directory.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
# Smallest and largest value of the training `target` column.
train_target_min, train_target_max = train_data['target'].min(), train_data['target'].max()

print(f'train_data.shape: {train_data.shape}, test_data.shape: {test_data.shape}')
# Last expression of the cell: displays the (min, max) pair as the cell output.
train_target_min, train_target_max
# Value recorded from an earlier run: (-3.676267773, 1.7113898269999999)

In [None]:
# Preview the first rows of the test table.
test_data.head(10)

In [None]:
class CLRP_Dataset(Dataset):
    """Dataset that tokenizes the ``excerpt`` column of a frame on access.

    Each item asks the tokenizer to pad to and truncate at ``max_seq_length``,
    and additionally reports the sum of the attention mask as ``length``.
    """

    def __init__(self, data, tokenizer, max_seq_length, train=True):
        """Keep references to the inputs; nothing is tokenized here.

        Args:
            data: Frame with an ``excerpt`` column.
            tokenizer: Callable invoked as
                ``tokenizer(text, padding=..., max_length=..., truncation=...)``
                whose result is indexable by ``'input_ids'`` and
                ``'attention_mask'``.
            max_seq_length: Value passed to the tokenizer as ``max_length``.
            train: Stored on the instance; not read by this class.
        """
        self.df, self.tokenizer = data, tokenizer
        self.max_seq_length, self.train = max_seq_length, train

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its ids, mask and mask sum."""
        text = self.df['excerpt'].iloc[index]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_seq_length,
            truncation=True,
        )
        mask = encoded['attention_mask']
        return {
            "input_ids": encoded['input_ids'],
            "attention_mask": mask,
            "length": sum(mask),
        }

    
def collate_fn_test(batch):
    """Stack dataset items into tensors cut to the batch's largest ``length``.

    Args:
        batch: List of dicts with ``input_ids``, ``attention_mask`` and
            ``length``, and optionally ``token_type_ids``.

    Returns:
        Dict with ``all_input_ids`` and ``all_attention_mask`` tensors, plus
        ``all_token_type_ids`` when the first item carries ``token_type_ids``.
    """
    longest = max(item['length'] for item in batch)

    def _stack(field):
        # Keep only the first `longest` positions of every sequence.
        # NOTE(review): this presumes padding sits at the end of each sequence - confirm
        # against the tokenizer's padding side.
        return torch.tensor([item[field][:longest] for item in batch])

    collated = {
        "all_input_ids": _stack('input_ids'),
        "all_attention_mask": _stack('attention_mask'),
    }
    if 'token_type_ids' in batch[0]:
        collated["all_token_type_ids"] = _stack('token_type_ids')
    return collated
    
# Tokenizer loaded from the same path as the backbone; also stored on cfg.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name_or_path)
cfg.tokenizer = tokenizer

# Test-set loader: fixed order (shuffle=False), twice the configured batch size,
# batches assembled by collate_fn_test, no worker processes (num_workers=0).
ds_test = CLRP_Dataset(test_data, tokenizer, cfg.max_seq_length, train=False)
dl_test = DataLoader(ds_test, batch_size=2*cfg.batch_size, shuffle=False, collate_fn=collate_fn_test, num_workers=0)
print('len of ds_test: ', len(ds_test), 'len of dl_test: ', len(dl_test))

print('ok')

In [None]:
class CLRP_model_V1(nn.Module):
    """Regressor over mean- and max-pooled encoder token states.

    The encoder's ``last_hidden_state`` is averaged and max-reduced over
    dim 1, the two results are concatenated, and a two-layer MLP maps them
    to one value per example.

    Submodule names (``bert``, ``drop_out``, ``fc1``, ``activation1``,
    ``fc2``) are unchanged so existing state dicts keep loading.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        """Load the encoder and build the regression head.

        Args:
            pretrained_model_path: Path or name given to
                ``AutoModel.from_pretrained``.
            embedding_dim: Width of the hidden layer in the head.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        # Take the encoder width from its config rather than guessing it from a
        # 'large' substring in the path, which misfires when a directory name
        # happens to contain "large" or the checkpoint has another width.
        hidden_size = self.bert.config.hidden_size
        self.drop_out = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size * 2, embedding_dim)
        self.activation1 = nn.ReLU()
        self.fc2 = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per example as a 1-D tensor.

        Args:
            input_ids: Token ids forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder.
        """
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)
        assert len(output)==2

        last_hidden_state = self.drop_out(output['last_hidden_state'])
        # NOTE(review): both reductions run over every position of dim 1; the
        # attention mask is not applied here. Left unchanged so outputs stay
        # consistent with weights trained against this pooling.
        seq_avg = torch.mean(last_hidden_state, dim=1)
        seq_max = torch.max(last_hidden_state, dim=1)[0]
        concat_out = torch.cat((seq_avg, seq_max), dim=1)
        preds = self.fc2(self.activation1(self.fc1(concat_out)))
        # Drop only the trailing size-1 feature dim. Squeezing a second time would
        # also collapse the batch dim whenever the batch holds a single example.
        preds = preds.squeeze(-1)

        return preds
    
class CLRP_model_V2(nn.Module):
    """Regressor that maps the encoder's ``pooler_output`` to one value.

    Dropout is applied to the pooled vector, then a single linear layer
    produces the prediction.

    Submodule names (``bert``, ``drop_out``, ``fc1``) are unchanged so
    existing state dicts keep loading.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        """Load the encoder and build the linear head.

        Args:
            pretrained_model_path: Path or name given to
                ``AutoModel.from_pretrained``.
            embedding_dim: Accepted for signature parity with
                ``CLRP_model_V1``; not used by this head.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        # Take the encoder width from its config rather than guessing it from a
        # 'large' substring in the path, which misfires when a directory name
        # happens to contain "large" or the checkpoint has another width.
        hidden_size = self.bert.config.hidden_size
        self.drop_out = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per example as a 1-D tensor.

        Args:
            input_ids: Token ids forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder.
        """
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)
        assert len(output)==2

        pooled_output = self.drop_out(output['pooler_output'])
        preds = self.fc1(pooled_output)
        # Drop only the trailing size-1 feature dim. Squeezing a second time would
        # also collapse the batch dim whenever the batch holds a single example.
        preds = preds.squeeze(-1)

        return preds
    
# Architecture used by the rest of the notebook.
CLRP_model = CLRP_model_V2

model = CLRP_model(cfg.model_name_or_path)
model.to(cfg.device)

model_path = '../input/clrp-pytorch-roberta-base-my-first-model-train/best_model.pth'
# map_location remaps the stored tensors onto cfg.device, so a checkpoint saved
# from GPU tensors still loads when the notebook falls back to CPU.
model.load_state_dict(torch.load(model_path, map_location=cfg.device))

# Parameter counts: all parameters vs. those with requires_grad set.
model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ',
      model_trainable_param_num)

print('ok')

In [None]:
def evaluate(eval_iter, model):
    """Run ``model`` over every batch of ``eval_iter`` and collect predictions.

    The model is switched to eval mode and called without gradient tracking.
    Batches are dicts of tensors keyed ``all_input_ids`` and
    ``all_attention_mask``, optionally with ``all_token_type_ids``; every
    tensor is moved to ``cfg.device`` first.

    Args:
        eval_iter: Iterable of batch dicts (e.g. a DataLoader).
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments and returning a tensor.

    Returns:
        Flat list with one prediction per example, in iteration order.
    """
    predictions_lst = []

    model.eval()
    for batch in eval_iter:
        # Build a new dict rather than overwriting the loader's batch in place.
        on_device = {key: value.to(cfg.device) for key, value in batch.items()}
        with torch.no_grad():
            predictions = model(
                input_ids=on_device['all_input_ids'],
                attention_mask=on_device['all_attention_mask'],
                # Forward segment ids when the collate function supplied them;
                # None otherwise, which matches the model's default.
                token_type_ids=on_device.get('all_token_type_ids'))
        # ravel() also copes with a 0-d output; list() keeps the numpy scalar dtype.
        predictions_lst.extend(list(predictions.cpu().numpy().ravel()))

    print('len of predictions_lst: ', len(predictions_lst))
    return predictions_lst

# Disabled single-model inference path, kept for reference only: it predicts
# with the one checkpoint loaded above, optionally clips to the training target
# range, and writes submission.csv. The 5-fold cell below writes the submission.
# predictions = evaluate(dl_test, model)
# predictions = np.array(predictions)
# print('before predictions.shape is ', predictions.shape, 'target min: ', 
#       predictions.min(), 'target max: ', predictions.max())

# if cfg.clip_by_train_range:
#     predictions = np.clip(predictions, train_target_min, train_target_max)
# print('after predictions.shape is ', predictions.shape, 'target min: ', 
#       predictions.min(), 'target max: ', predictions.max())

# submission_df = test_data[['id']]
# submission_df['target'] = predictions
# submission_df.to_csv('./submission.csv', index=False)

# print('total cost time: ', time.time() - global_start_t)

print('ok')

In [None]:
# 5-fold ensemble: predict the test set with each fold's checkpoint, average the
# per-example predictions across folds, and write the submission file.
model_path = '../input/clrp-roberta-base-my-first-model-5fold-cv/'
predictions_lst = []
CV_FOLD_NUM = 5
for fold_num in range(CV_FOLD_NUM):
    model = CLRP_model(cfg.model_name_or_path)
    model.to(cfg.device)
    # map_location remaps the stored tensors onto cfg.device, so GPU-saved
    # checkpoints still load when the notebook falls back to CPU.
    fold_state = torch.load(model_path + f'best_model_fold{fold_num}.pth',
                            map_location=cfg.device)
    model.load_state_dict(fold_state)
    predictions = np.array(evaluate(dl_test, model))
    print('fold_num: ', fold_num, 'predictions.shape: ', predictions.shape)
    predictions_lst.append(predictions)
    # Drop the references before the next fold allocates its own model.
    del model, fold_state

# Rows are folds, columns are test examples; average over the fold axis.
predictions_array = np.array(predictions_lst)
predictions_avg = np.mean(predictions_array, axis=0)
print(f'predictions_array.shape: {predictions_array.shape}, predictions_avg.shape: {predictions_avg.shape}')

# Honour the config switch that the disabled single-model path used; it is False
# in CFG, so the averaged predictions are written unchanged by default.
if cfg.clip_by_train_range:
    predictions_avg = np.clip(predictions_avg, train_target_min, train_target_max)

# Copy the id column so adding `target` writes to an independent frame instead
# of assigning into a slice of test_data.
submission_df = test_data[['id']].copy()
submission_df['target'] = predictions_avg
submission_df.to_csv('./submission.csv', index=False)

print('total cost time: ', time.time() - global_start_t)

In [None]:
# Preview the first rows of the frame written to submission.csv.
submission_df.head(10)