In [None]:
import os
import random
import gc
from pprint import pprint
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig

In [None]:
# Competition CSV files.
TRAIN = '../input/commonlitreadabilityprize/train.csv'
TEST = '../input/commonlitreadabilityprize/test.csv'

# Locations of the pretrained checkpoints that can serve as backbone.
BERT = '../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'
DISTILBERT = '../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased'
ROBERTA = '../input/huggingface-roberta-variants/roberta-base/roberta-base'

# Backbone used by the tokenizer and the model.
ARCH_PATH = DISTILBERT

# Global configuration dictionary; further sections are added to it below.
cfg = {'train': {'n_folds': 5}}

# Random state handed to StratifiedKFold when the folds are built.
seed = 28

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print('Device: ', device.type)


In [None]:
def get_data_stratified(df,n_bins=20,n_splits=5):
    '''
    Add stratified cross-validation fold labels to ``df`` in place.

    The continuous ``target`` column is cut into ``n_bins`` equal-width bins
    (stored in a new ``bin`` column) and those bins are used as the
    stratification label for a shuffled ``StratifiedKFold``. The fold in which
    a row belongs to the validation split is stored in a new ``fold`` column
    (dtype int8).

    Args:
        df: DataFrame with ``id`` and ``target`` columns; modified in place.
        n_bins: number of equal-width target bins used for stratification.
        n_splits: number of folds.

    Returns:
        The same ``df`` object, for convenience (callers may ignore it).
    '''
    df['bin'] = pd.cut(df['target'], n_bins, labels=list(range(n_bins)))
    splitter = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)

    # split() yields *positional* indices, so collect the fold ids in a plain
    # array addressed by position; assigning through label-based .loc would
    # only be correct for a default RangeIndex.
    fold_of_row = np.full(len(df), -1, dtype='int8')
    for fold, (_, idx_val) in enumerate(splitter.split(df['id'], y=df['bin'])):
        fold_of_row[idx_val] = fold
    df['fold'] = fold_of_row
    return df

In [None]:
df = pd.read_csv(TRAIN)
# Build exactly as many folds as the plotting loop below iterates over, so the
# two stay in sync when cfg['train']['n_folds'] is changed.
get_data_stratified(df, n_splits=cfg['train']['n_folds'])
# Overlay the target histogram of every fold to check that the folds are
# similarly distributed.
for fold in range(cfg['train']['n_folds']):
    sns.histplot(data=df.loc[df.fold==fold],x='target',bins=10,hue='fold',label=f'fold{fold}')
plt.legend();

In [None]:
# Show the excerpt stored in the row with index label 106.
df.loc[106, 'excerpt']

In [None]:
# Excerpt of the row that sorts first when ordering by target, descending.
df.sort_values('target', ascending=False)['excerpt'].iloc[0]

In [None]:
# Excerpt of the row that sorts first when ordering by target, ascending.
df.sort_values('target')['excerpt'].iloc[0]

# 定义分词器Tokenizer，为蒸馏bert预训练模型

In [None]:
# Tokenizer settings: checkpoint to load and the fixed sequence length every
# excerpt is padded / truncated to.
cfg['tokenizer'] = dict(name=ARCH_PATH, max_length=210)
tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

# Dataset+Dataloader

In [None]:
class cldataset(Dataset):
    '''
    Dataset that tokenizes one excerpt per item.

    Indexing with a position returns a pair of dicts:
      * model inputs ``{'ids', 'mask', 'token_type_ids'}``
      * labels ``{'target'}`` (float tensor of shape ``(1,)``)

    Args:
        df: DataFrame with ``excerpt`` and ``target`` columns.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: length every sequence is padded / truncated to.
    '''
    def __init__(self,df,tokenizer,max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self,index):
        # Positional access: works for any DataFrame index and raises
        # IndexError past the end, which is what plain iteration over the
        # dataset relies on to stop.
        text = self.df['excerpt'].iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        # Use the segment ids whenever the tokenizer produces them. Comparing
        # the configured name with a literal model id never matched, because
        # the configured name is a filesystem path.
        if 'token_type_ids' in inputs:
            token_type_ids = inputs['token_type_ids']
        else:
            # Scalar placeholder so every item carries the same keys.
            token_type_ids = 1.
        target = float(self.df['target'].iloc[index])
        return {
            'ids': torch.LongTensor(ids),                     # token ids in the vocabulary
            'mask': torch.LongTensor(mask),                   # attention mask
            'token_type_ids': torch.tensor(token_type_ids)    # segment ids (or placeholder)
            },{
            'target': torch.tensor([target], dtype=torch.float32)
    }

In [None]:
# Build a dataset over the full training frame and pull its first item as an
# example of what the model receives.
ds = cldataset(df=df,tokenizer=tokenizer,max_len=cfg['tokenizer']['max_length'])
ds = iter(ds)  # note: from here on `ds` names the iterator, not the dataset
inputs, targets = next(ds)# example item

In [None]:
# DataLoader settings. Batch sizes are smaller when running on CPU; the number
# of worker processes is whatever os.cpu_count() reports.
_on_cpu = device.type == 'cpu'
_worker_count = os.cpu_count()

# Training loader: shuffled batches.
cfg['dl_train'] = dict(
    batch_size=8 if _on_cpu else 16,
    shuffle=True,
    num_workers=_worker_count,
    pin_memory=True,
)

# Validation / inference loader: fixed order.
cfg['dl_val'] = dict(
    batch_size=8 if _on_cpu else 64,
    shuffle=False,
    num_workers=_worker_count,
    pin_memory=True,
)

In [None]:
# Dataset and training-style loader over the full training frame; `dl` is used
# further down to pull a sample batch for a forward pass of the model.
ds = cldataset(df=df,tokenizer=tokenizer, 
                max_len=cfg['tokenizer']['max_length'])
dl = DataLoader(ds, **cfg['dl_train'])

# Model

In [None]:
class clbert(nn.Module):
    '''
    Pretrained transformer encoder followed by a single-output regression head.

    For the BERT and RoBERTa checkpoints the encoder's own pooled output is
    used. For every other checkpoint (DistilBERT included) the hidden state of
    the first token is passed through ``dense`` + ``tanh`` to obtain an
    equivalent pooled vector. The pooled vector goes through dropout and a
    linear layer producing one value per sample.

    Args:
        name: checkpoint path / identifier passed to ``AutoModel.from_pretrained``.
        dropout: when True (default) apply ``Dropout(p=0.2)`` before the final
            layer; when False the dropout step is an identity.
    '''
    def __init__(self,name,dropout=True):
        super(clbert, self).__init__()
        self.bert = AutoModel.from_pretrained(name)  # load the pretrained encoder
        self.name = name

        # Read the hidden width from the model config rather than from
        # architecture-specific sub-modules (the previous DistilBERT lookup
        # hard-coded layer index 5); 768 stays as the fallback.
        self.in_features = getattr(self.bert.config, 'hidden_size', 768)

        self.fc = nn.Linear(self.in_features, 1)
        self.dense = nn.Linear(self.in_features, self.in_features)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(p=0.2) if dropout else nn.Identity()

        torch.nn.init.kaiming_normal_(self.dense.weight)
        torch.nn.init.kaiming_normal_(self.fc.weight)

    def forward(self, ids, mask, token_type_ids):
        '''
        Args:
            ids: token id tensor fed to the encoder.
            mask: attention mask tensor.
            token_type_ids: segment ids; only forwarded to the encoder for the
                BERT checkpoint and ignored otherwise.

        Returns:
            Output of the final linear layer (one value per sample).
        '''
        if self.name == BERT:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  token_type_ids=token_type_ids,
                                  return_dict=False)
        elif self.name == ROBERTA:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  return_dict=False)
        else:
            # DistilBERT and any other checkpoint: build the pooled vector
            # from the first token. Without this catch-all branch an
            # unrecognised name left `output` undefined.
            last_hidden_state = self.bert(ids,
                                          attention_mask=mask,
                                          return_dict=False)[0]
            first_token_tensor = last_hidden_state[:, 0]
            output = self.activation(self.dense(first_token_tensor))

        output = self.dropout(output)
        output = self.fc(output)
        return output

In [None]:
# !pip install tensorwatch

In [None]:
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple torch==1.2

!pip install graphviz  # 安装graphviz
!pip install git+https://github.com/szagoruyko/pytorchviz  # 通过git安装torchviz

import torch
from torchvision.models import AlexNet
from torchviz import make_dot
 
model = clbert(name=cfg['model']['name'])
data=next(iter(dl))
inputs=data[0]
outputs=model(**inputs)

# 这三种方式都可以
g = make_dot(outputs)
 g=make_dot(y, params=dict(model.named_parameters()))
#g = make_dot(y, params=dict(list(model.named_parameters()) + [('x', x)]))
g.render('bert', view=True)

In [None]:
cfg['model'] = {'name': ARCH_PATH}# backbone checkpoint selected via ARCH_PATH

In [None]:
# Smoke test: instantiate the model and run one batch from `dl` through it.
model = clbert(name=cfg['model']['name'])
data=next(iter(dl))
inputs=data[0]  # first element of the batch is the dict of model inputs
outputs=model(**inputs)

# Criterion

In [None]:
def clmetric(y_pred, y_gt):
    '''
    Root-mean-squared error between predictions and ground truth.

    Both tensors must have exactly the same size; an AssertionError is raised
    otherwise. Returns a scalar tensor.
    '''
    assert y_pred.size() == y_gt.size()
    mse_fn = nn.MSELoss()
    return mse_fn(y_pred, y_gt).sqrt()

# Optimizer

In [None]:
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

from transformers import get_cosine_schedule_with_warmup
from transformers import AdamW

# Keyword arguments for the optimizer constructor.
cfg['optim'] = {'lr': 3e-5}
# Keyword arguments for get_cosine_schedule_with_warmup.
# NOTE(review): the training loop steps the scheduler once per epoch, so these
# counts are effectively in epochs while n_epochs is much larger than
# num_training_steps — confirm the learning rate after step 7 is as intended.
cfg['scheduler'] = {'num_warmup_steps':3, 
                    'num_training_steps':7, 
#                     'num_cycles': 1,
                   }

# Training/infer

In [None]:
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast
# Replaces the 'train' section set in the configuration cell at the top.
# n_epochs is the upper bound on epochs per fold; the early-stopping helper
# may end a fold sooner.
cfg['train'] ={
    'n_folds': 5,
    'n_epochs': 100
}

In [None]:
class storeloss:
    '''
    Per-fold history of the training and validation loss summaries.

    Every call to ``get_loss`` appends one entry to each of the four lists
    ``loss_train_mean``, ``loss_train_std``, ``loss_val_mean`` and
    ``loss_val_std``.
    '''
    def __init__(self, fold):
        self.loss_train_mean, self.loss_train_std = [], []
        self.loss_val_mean, self.loss_val_std = [], []
        self.fold = fold

    def get_loss(self, loss_train, loss_val):
        '''
        Record one epoch.

        Args:
            loss_train: indexable pair; element 0 goes to the *mean* history,
                element 1 to the *std* history of the training loss.
            loss_val: same layout for the validation loss.
        '''
        destinations = (
            (self.loss_train_mean, loss_train, 0),
            (self.loss_train_std, loss_train, 1),
            (self.loss_val_mean, loss_val, 0),
            (self.loss_val_std, loss_val, 1),
        )
        for history, pair, position in destinations:
            history.append(pair[position])

In [None]:
def train_fun(model,dl,cri,optim,scheduler):
    '''
    Train ``model`` for one pass over ``dl``.

    Mixed precision (autocast + gradient scaling) is enabled only when the
    global ``device`` is a CUDA device; on CPU both are explicitly disabled.

    Args:
        model: module called as ``model(**inputs)``.
        dl: loader yielding ``(inputs_dict, {'target': tensor})`` batches.
        cri: loss function called as ``cri(output, targets)``.
        optim: optimizer to step after every batch.
        scheduler: not used here; the caller steps it once per epoch.

    Returns:
        ``(mean, std)`` of the per-batch loss values.
    '''
    use_amp = device.type == 'cuda'
    scaler = GradScaler(enabled=use_amp)
    loss_train = []
    model.train()
    model.to(device)
    progress_bar = tqdm(dl, desc='训练')
    for data in progress_bar:
        optim.zero_grad()
        inputs = {key: value.to(device) for key, value in data[0].items()}
        targets = data[1]['target'].to(device)
        # Forward pass (model + loss) under autocast.
        with autocast(enabled=use_amp):
            output = model(**inputs)
            # Use the criterion that was passed in instead of a global.
            loss = cri(output, targets)

        # backward() runs outside the autocast context.
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        loss_train.append(loss.item())
    return np.mean(loss_train), np.std(loss_train)

def val_fun(model,dl):
    '''
    Evaluate ``model`` on ``dl`` without gradient tracking.

    The overall RMSE is reconstructed from the per-batch RMSE values, each
    weighted by the number of targets actually present in that batch, so a
    shorter final batch is handled correctly.

    Args:
        model: module called as ``model(**inputs)``.
        dl: loader yielding ``(inputs_dict, {'target': tensor})`` batches.

    Returns:
        ``(overall_rmse, std_of_batch_rmse)``; the RMSE is NaN when ``dl``
        yields no samples.
    '''
    loss_val = []
    squared_error_sum = 0.0
    n_seen = 0
    use_amp = device.type == 'cuda'
    model.eval()
    model.to(device)
    progress_bar = tqdm(dl, desc='测试')
    with torch.no_grad():
        for data in progress_bar:
            inputs = {key: value.to(device) for key, value in data[0].items()}
            targets = data[1]['target'].to(device)
            # Forward pass (model + loss) under autocast.
            with autocast(enabled=use_amp):
                output = model(**inputs)
                loss = clmetric(output, targets)
            batch_rmse = loss.item()
            loss_val.append(batch_rmse)
            # rmse**2 * n gives back the batch's summed squared error.
            batch_count = targets.numel()
            squared_error_sum += batch_rmse ** 2 * batch_count
            n_seen += batch_count

    rmse = np.sqrt(squared_error_sum / n_seen) if n_seen else float('nan')
    print('RMSE for validation set overall: ', rmse)

    return rmse, np.std(loss_val)

In [None]:
def run_one_epoch(model, train_dl, val_dl, criterion, optim, scheduler):
    '''
    Run one training pass followed by one validation pass.

    Returns:
        ``(loss_train, loss_val)`` exactly as returned by ``train_fun`` and
        ``val_fun`` respectively.
    '''
    loss_train = train_fun(
        model=model,
        dl=train_dl,
        cri=criterion,
        optim=optim,
        scheduler=scheduler,
    )
    loss_val = val_fun(model=model, dl=val_dl)
    return loss_train, loss_val

In [None]:
def get_dls_for_n_fold(df, fold, tokenizer):
    '''
    Build the training and validation loaders for one cross-validation fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set, all
    other rows the training set. Both subsets get a fresh 0..n-1 index.

    Returns:
        ``(train_dl, val_dl)`` built with ``cfg['dl_train']`` / ``cfg['dl_val']``.
    '''
    def _as_dataset(frame):
        # Same tokenizer and sequence length for both subsets.
        return cldataset(
            frame,
            tokenizer=tokenizer,
            max_len=cfg['tokenizer']['max_length'],
        )

    train_df = df.loc[df.fold != fold].reset_index(drop=True)
    val_df = df.loc[df.fold == fold].reset_index(drop=True)

    train_ds = _as_dataset(train_df)
    val_ds = _as_dataset(val_df)

    return (
        DataLoader(train_ds, **cfg['dl_train']),
        DataLoader(val_ds, **cfg['dl_val']),
    )

In [None]:
class earlystopping:
    '''
    Early stopping with checkpointing of the best model.

    Call the instance once per epoch with the validation loss. Whenever the
    loss is the first one seen or strictly lower than the best so far, the
    model is checkpointed and the patience counter is reset. Otherwise the
    counter is incremented, and ``stop`` becomes True once ``patience``
    consecutive epochs have passed without improvement.

    Args:
        patience: number of consecutive non-improving epochs that triggers
            the stop.
        seq: accepted but not used.
    '''
    def __init__(self, patience=2, seq=False):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.stop = False

    def __call__(self, loss, model, optim, cfg, path):
        improved = self.best_score is None or loss < self.best_score
        if improved:
            if self.best_score is not None:
                print(f'Loss decreased {self.best_score} -> {loss}.')
            self.best_score = loss
            self.counter = 0
            self.save_checkpoint(model, optim, cfg, path)
        else:
            self.counter += 1
            # Stop after `patience` epochs without improvement. The previous
            # `>` comparison needed one extra epoch compared to what the
            # accompanying note (stop after two rounds) described.
            if self.counter >= self.patience:
                self.stop = True

    def save_checkpoint(self, model, optim, cfg, path):
        '''Write the model weights and the configuration dict to ``path``.'''
        save_list = {'model': model.state_dict(),
#                      'optim': optim.state_dict(),
                     'cfg': cfg}
        torch.save(save_list, path)

In [None]:
# True: run the training cell below and use the checkpoints it writes.
# False: skip training and read checkpoints from '../input/clberttrainingoutputs/'.
train=True

In [None]:
if train:
    def main():
        '''
        Train one model per cross-validation fold.

        For every fold a fresh model, optimizer and scheduler are created and
        trained for at most ``cfg['train']['n_epochs']`` epochs; the early
        stopping helper writes the best checkpoint to ``clbert_fold{fold}.tar``
        and may end the fold sooner.
        '''
        n_folds = cfg['train']['n_folds']
        df = pd.read_csv(TRAIN)
        get_data_stratified(df, n_splits=n_folds)
        tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

        for fold in range(n_folds):
            history = storeloss(fold=fold)
            stopper = earlystopping()
            train_dl, val_dl = get_dls_for_n_fold(df, fold, tokenizer)

            model = clbert(name=cfg['model']['name'])
            optim = AdamW(model.parameters(), **cfg['optim'])
            scheduler = get_cosine_schedule_with_warmup(optim, **cfg['scheduler'])
            # If the schedule leaves the optimizer at a learning rate of 0,
            # advance it by one step before the first epoch
            # (presumably so the first epoch is not wasted — TODO confirm).
            if optim.param_groups[0]['lr'] == 0:
                optim.step()
                scheduler.step()

            checkpoint_path = f'clbert_fold{fold}.tar'
            for _ in range(cfg['train']['n_epochs']):
                loss_train, loss_val = run_one_epoch(
                    model=model,
                    train_dl=train_dl,
                    val_dl=val_dl,
                    criterion=clmetric,
                    optim=optim,
                    scheduler=scheduler,
                )
                history.get_loss(loss_train, loss_val)
                stopper(loss_val[0], model, optim, cfg, path=checkpoint_path)
                if stopper.stop:
                    print('Early Stop !')
                    print()
                    break

                # Scheduler advances once per completed epoch.
                scheduler.step()

            # Release the fold's model before the next one is created.
            del model, optim
            gc.collect()
    main()

In [None]:
def val_fn_cv(model, dl):
    '''
    Predict every batch of ``dl`` with ``model``.

    Only the input dict of each batch is used; targets are not read.
    Mixed precision is enabled only when the global ``device`` is CUDA, and
    outputs are converted to float32 before being collected so the returned
    predictions are not left in half precision.

    Args:
        model: module called as ``model(**inputs)``.
        dl: loader yielding batches whose first element is the inputs dict.

    Returns:
        numpy array with the model outputs of all batches concatenated along
        the first axis.
    '''
    use_amp = device.type == 'cuda'
    preds = []

    model.eval()
    model.to(device)
    progress_bar = tqdm(dl, desc='cv')

    with torch.no_grad():
        for data in progress_bar:
            inputs = {key: value.to(device) for key, value in data[0].items()}
            with autocast(enabled=use_amp):
                outputs = model(**inputs)
            preds.append(outputs.detach().float().cpu().numpy())

    preds = np.concatenate(preds)
    return preds

# Checkpoint filename prefix ('<prefix>_fold<k>.tar'): the working directory
# when the models were trained in this run, otherwise the input directory
# holding previously trained checkpoints.
MODEL_NAME = 'clbert'
if train==False:
    MODEL_NAME='../input/clberttrainingoutputs/clbert'
def main_cv():
    '''
    Compute out-of-fold predictions for the training set.

    The folds are rebuilt with the same procedure used for training; for each
    fold the corresponding checkpoint is loaded and used to predict that
    fold's validation rows. Predictions are stored in a new ``oof`` column.

    Returns:
        The training DataFrame including the ``bin``, ``fold`` and ``oof``
        columns.
    '''
    df = pd.read_csv(TRAIN)
    get_data_stratified(df, n_splits=cfg['train']['n_folds'])
    df['oof'] = np.nan

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

    for fold in range(cfg['train']['n_folds']):
        # Only the validation loader of the fold is needed here.
        _, val_dl = get_dls_for_n_fold(df, fold, tokenizer)

        model = clbert(name=cfg['model']['name'])
        checkpoint = torch.load(f'{MODEL_NAME}_fold{fold}.tar', map_location=device)
        model.load_state_dict(checkpoint['model'])

        # Show the configuration stored with the first checkpoint once.
        if fold == 0:
            print('Configuration for training:')
            print()
            pprint(checkpoint['cfg'])
            print()

        print('Fold:', fold)

        df.loc[df.fold == fold, 'oof'] = val_fn_cv(model=model, dl=val_dl)

    return df

In [None]:
from sklearn.metrics import mean_squared_error
# Out-of-fold predictions for the whole training set, saved for later analysis.
df = main_cv()
df.to_csv('oof_df.csv', index=False)

# Cross-validation score: RMSE between the targets and the out-of-fold predictions.
mse = mean_squared_error(df['target'], df['oof'])
rmse = np.sqrt(mse)
print('CV score: ', rmse)

In [None]:
def main_infer():
    '''
    Predict the test set with every fold model and average the predictions.

    Returns:
        The test DataFrame with a ``target`` column holding the mean of the
        per-fold predictions.
    '''
    df = pd.read_csv(TEST)
    # cldataset reads a 'target' column for every item, so the test frame
    # gets a dummy one.
    df['target'] = 0.

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])
    n_folds = cfg['train']['n_folds']

    # The test data is the same for every fold: build dataset and loader once
    # instead of once per fold.
    test_ds = cldataset(df, tokenizer=tokenizer, max_len=cfg['tokenizer']['max_length'])
    test_dl = DataLoader(test_ds, **cfg['dl_val'])

    pred_sum = np.zeros(len(df), dtype=np.float64)
    for fold in range(n_folds):
        print('Fold:', fold)
        # Load the model trained on this fold.
        model = clbert(name=cfg['model']['name'])
        PATH = f'{MODEL_NAME}_fold{fold}.tar'
        state_dict = torch.load(PATH, map_location=device)['model']
        model.load_state_dict(state_dict)

        preds = val_fn_cv(model=model, dl=test_dl)
        # Flatten explicitly to one value per row before accumulating.
        pred_sum += np.asarray(preds, dtype=np.float64).reshape(-1)

    df['target'] = pred_sum / n_folds
    return df
# Checkpoint filename prefix (same rule as for the out-of-fold evaluation above).
MODEL_NAME = 'clbert'
if train==False:
    MODEL_NAME='../input/clberttrainingoutputs/clbert'
# Predict the test set and write the submission file with the id / target columns.
df = main_infer()
df[['id', 'target']].to_csv('submission.csv', index=False)

In [None]:
# A cell only renders its last expression, so display both files explicitly.
display(pd.read_csv('submission.csv'))
display(pd.read_csv('oof_df.csv'))