In [None]:
!pip install ../input/pytorchlightning160/pytorch_lightning-1.6.0-py3-none-any.whl

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
print(f"pytorch_lightning.__version__: {pl.__version__}")

# import tokenizers
import transformers
# print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class CFG:
    """Static configuration namespace for the notebook.

    All settings are plain class attributes. The trailing ``if debug:`` block
    runs once, while the class body is being executed, and overrides a few of
    the attributes defined above it.
    """
    ## General settings

    data_dir = '../input/feedback-prize-effectiveness'
    output_dir = '.'
    model = "../input/debertav3small"

    debug = False
    debug_size = 0
    train = True
    #     seed = 42
    seed = 2022
    n_fold = 4
    trn_fold = [0]
    print_freq = 50

    ## Model settings
    fc_dropout = 0.2
    fgm = False
    fgm_epsilion = 0.1
    label_smooth = False
    smoothing = 0.05

    ## Optimizer settings
    scheduler = 'cosine'  # ['linear', 'cosine']
    #     batch_scheduler = True
    num_cycles = 0.5
    warmup_steps = 0.2
    encoder_lr = 2e-5
    decoder_lr_ratio = 1
    #     min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01

    ## Data settings
    d_padding = False  # Dynamic padding
    num_workers = 4
    batch_size = 16
    max_len = 512
    pin_memory = True
    target_size = 3

    ## Trainer settings
    # "amp_level" is used only when "precision = 16" and "amp_backend = 'apex' ",
    # i.e it's only relevant for only one type of configuration and is generally not required.
    apex = False
    apex_level = 'O1'
    max_epochs = 4
    gradient_accumulation_steps = 4
    precision = 32
    max_grad_norm = 1 if precision == 32 else 500
    fast_dev_run = 0  # quick check: run n train, val, test batches
    num_sanity_val_steps = 0  # run n val batches before training starts
    val_check_interval = 0.5

    # Debug overrides: evaluated once at class-creation time, so `debug` must be
    # edited above (not assigned after the class exists) for these to apply.
    if debug:
        d_padding = False
        debug_size = 1000
        max_epochs = 2
        num_workers = 0
        trn_fold = [0]
        fast_dev_run = 0
        num_sanity_val_steps = 0
        val_check_interval = 0.2


## Utils

In [None]:
def criterion(outputs, labels):
    """Return the cross-entropy loss of `outputs` (logits) against `labels`.

    A fresh `nn.CrossEntropyLoss` with default settings is built on every call.
    """
    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(outputs, labels)
    return loss

def get_score(outputs, labels):
    """Compute the multi-class log loss of raw logits against true labels.

    Args:
        outputs: array-like of logits; the last axis is treated as the class axis.
        labels: array-like of true class ids.

    Returns:
        The value returned by `sklearn.metrics.log_loss`.
    """
    # Explicit `dim`: calling softmax without it relies on a deprecated
    # implicit-dimension rule.
    probs = F.softmax(torch.as_tensor(outputs), dim=-1).numpy()
    # Spell out the full label set so scoring still works when the evaluated
    # split happens to miss one of the classes; otherwise log_loss raises on
    # the column-count mismatch.
    class_ids = list(range(probs.shape[1])) if probs.ndim == 2 else None
    return log_loss(labels, probs, labels=class_ids)

## DataModule

In [None]:
class FPEDataModule(pl.LightningDataModule):
    """LightningDataModule that reads the competition CSVs, assigns CV folds and
    builds the train / validation / test / predict dataloaders.

    Typical use (as in this notebook): construct, call `prepare_data()`, then
    `set_trn_fold(k)` before handing the module to `Trainer.fit`.
    """

    def __init__(self, config, tokenizer_path=None, prepare_train=True, prepare_test=True):
        """
        Args:
            config: object exposing seed, debug, debug_size, batch_size, pin_memory,
                num_workers, max_len, data_dir, output_dir, n_fold and model.
            tokenizer_path: optional tokenizer location; falls back to `config.model`.
            prepare_train: whether `prepare_data` should load and split train.csv.
            prepare_test: whether `prepare_data` should load test.csv and the
                sample submission.
        """
        super().__init__()
        self.prepare_data_per_node = False
        self.seed = config.seed
        self.debug = config.debug
        self.debug_size = config.debug_size if self.debug else 0
        self.shuffle = not self.debug
        self.batch_size = config.batch_size
        self.pin_memory = config.pin_memory
        self.num_workers = config.num_workers
        self.max_len = config.max_len
        self.data_dir = config.data_dir
        self.output_dir = config.output_dir
        self.n_fold = config.n_fold
        # Validation fold id; must be chosen with set_trn_fold() before fitting.
        self.trn_fold = None

        self.select_dataset()
        if tokenizer_path is not None:
            self.tokenizer = self.get_tokenizer(tokenizer_path)
        else:
            self.tokenizer = self.get_tokenizer(config.model)

        self.prepare_train = prepare_train
        self.prepare_test = prepare_test

    def get_tokenizer(self, tokenizer_path):
        """Load a tokenizer with `AutoTokenizer.from_pretrained`."""
        return AutoTokenizer.from_pretrained(tokenizer_path)

    def set_trn_fold(self, trn_fold):
        """Select which fold id is held out for validation."""
        self.trn_fold = trn_fold

    def select_dataset(self, dataset_name='structure_1'):
        """Pick the Dataset classes used for train and test.

        `dataset_name` is currently ignored (TODO in the original code).
        """
        # TODO
        self.TrainDataset = FeedBackDataset
        self.TestDataset = FeedBackDataset

    def CV_split(self, dataset, n_splits=5, debug=False, debug_size=1000, seed=0):
        """Add integer `label` and `fold` columns using a stratified K-fold split.

        Args:
            dataset: DataFrame with a `discourse_effectiveness` column.
            n_splits: number of folds.
            debug: if True, work on a random sample of at most `debug_size` rows.
            debug_size: sample size used when `debug` is True.
            seed: random state for the sampling and for the fold shuffle.

        Returns:
            The DataFrame (the sampled copy in debug mode) with `label` and `fold`.
        """
        # StratifiedKFold
        dataset['label'] = dataset['discourse_effectiveness'].map({'Ineffective': 0, 'Adequate': 1, 'Effective': 2})
        if debug:
            # Clamp so a small frame does not make `sample` raise.
            sample_size = min(debug_size, len(dataset))
            dataset = dataset.sample(n=sample_size, random_state=seed).reset_index(drop=True)

        # The splitter yields positional indices, so assign positionally; this
        # stays correct even when the frame does not carry a default RangeIndex.
        dataset['fold'] = -1
        fold_col = dataset.columns.get_loc('fold')
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for fold_id, (_, val_index) in enumerate(splitter.split(dataset, dataset['label'])):
            dataset.iloc[val_index, fold_col] = fold_id
        dataset['fold'] = dataset['fold'].astype(int)

        return dataset

    def prepare_data(self):
        """Read the CSV files requested at construction time.

        Each part is loaded at most once: its flag is cleared afterwards, so a
        repeated call does not re-read or re-split the data.
        """
        if self.prepare_train:
            train = pd.read_csv(Path(self.data_dir) / 'train.csv')
            # Split the data into n folds.
            # NOTE(review): self.seed is not forwarded, so the split uses
            # CV_split's default seed=0 -- confirm this is intended before
            # changing it, since it would alter existing fold assignments.
            train = self.CV_split(train, n_splits=self.n_fold, debug=self.debug,
                                  debug_size=self.debug_size)
            self.train = train
            self.prepare_train = False
            print('Train data prepared!')

        if self.prepare_test:
            self.test = pd.read_csv(Path(self.data_dir) / 'test.csv')
            self.submission = pd.read_csv(Path(self.data_dir) / 'sample_submission.csv')
            self.prepare_test = False
            print('Test data prepared!')

    def setup(self, stage='fit'):
        """Build the datasets needed for `stage` ('fit', 'test' or 'predict').

        Raises:
            RuntimeError: if `stage` is 'fit' and no fold was selected.
        """
        if stage == 'fit':
            if self.trn_fold is None:
                raise RuntimeError('Call set_trn_fold() before fitting: no validation fold selected.')
            self.build_fit_dataset(trn_fold=self.trn_fold)

        elif stage == 'test':
            self.build_test_dataset()

        elif stage == 'predict':
            self.build_predict_dataset()

    def build_fit_dataset(self, trn_fold=None):
        """Create train/val datasets, holding out rows whose `fold` equals `trn_fold`.

        Does nothing when `trn_fold` is None.
        """
        df = self.train
        if trn_fold is not None:
            self.train_df = df[df['fold'] != trn_fold].reset_index(drop=True)
            self.val_df = df[df['fold'] == trn_fold].reset_index(drop=True)
            self.train_dataset = self.TrainDataset(self.train_df, self.tokenizer, self.max_len)
            self.val_dataset = self.TrainDataset(self.val_df, self.tokenizer, self.max_len)

    def build_test_dataset(self):
        """Create the dataset served by `test_dataloader`."""
        self.test_dataset = self.TestDataset(self.test, self.tokenizer, self.max_len)

    def build_predict_dataset(self):
        """Create the dataset served by `predict_dataloader`."""
        self.predict_dataset = self.TestDataset(self.test, self.tokenizer, self.max_len)

    def train_dataloader(self):
        """Training loader; shuffled unless debug mode is on."""
        loader = DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers,
                            pin_memory=self.pin_memory, shuffle=self.shuffle, collate_fn=self.train_dataset.collate)
        return loader

    def val_dataloader(self):
        """Validation loader (4x the training batch size, no shuffling)."""
        loader = DataLoader(self.val_dataset, batch_size=self.batch_size * 4, num_workers=self.num_workers,
                            shuffle=False, collate_fn=self.val_dataset.collate)
        return loader

    def test_dataloader(self):
        """Test loader (4x the training batch size, no shuffling)."""
        # The dataset yields un-padded token lists, so the padding collate
        # function is required here just like for train/val.
        loader = DataLoader(self.test_dataset, batch_size=self.batch_size * 4, num_workers=self.num_workers,
                            shuffle=False, collate_fn=self.test_dataset.collate)
        return loader

    def predict_dataloader(self):
        """Predict loader (4x the training batch size, no shuffling)."""
        # Same padding requirement as test_dataloader.
        loader = DataLoader(self.predict_dataset, batch_size=self.batch_size * 4, num_workers=self.num_workers,
                            shuffle=False, collate_fn=self.predict_dataset.collate)
        return loader

## Dataset

In [None]:
class FeedBackDataset(Dataset):
    """Tokenizes one DataFrame row per item.

    The model input text is `discourse_text + '[SEP]' + discourse_type`.
    A `label` entry is included only when the row has a `label` column, so the
    same class serves both labeled (train/val) and unlabeled (test) frames.
    """

    def __init__(self, df, tokenizer, max_length):
        """
        Args:
            df: DataFrame with `discourse_text` and `discourse_type` columns,
                and optionally `label`.
            tokenizer: callable tokenizer, also handed to the padding collator.
            max_length: truncation length passed to the tokenizer.
        """
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return a dict with `input_ids`, `attention_mask` and, if available, `label`."""
        item = self.df.iloc[index]
        text = item['discourse_text'] + '[SEP]' + item['discourse_type']

        inputs = self.tokenizer(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )
        sample = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        # Test data has no target; reading it unconditionally would raise KeyError.
        if 'label' in item.index:
            sample['label'] = item['label']
        return sample

    def collate(self, batch):
        """Collate a list of items with `DataCollatorWithPadding` built on this tokenizer."""
        dcp = DataCollatorWithPadding(tokenizer=self.tokenizer)
        return dcp(batch)

## Model

In [None]:
class AverageMeter:
    """Running tracker holding the latest value and the weighted mean so far."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, mean, weighted sum and sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed over `n` samples and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
# MeanPooling trick
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over dim 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average `last_hidden_state` along dim 1, counting only masked-in positions.

        The mask gets a trailing axis, is broadcast to the hidden-state shape and
        cast to float. The per-row mask total is clamped to at least 1e-9 so an
        all-zero mask divides safely.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        mask_total = mask.sum(1).clamp(min=1e-9)
        return masked_total / mask_total

In [None]:
class FPEModel(pl.LightningModule):
    """Transformer encoder + mean pooling + linear head, trained as a classifier.

    The head is applied to five differently-dropped-out copies of the pooled
    feature and the resulting logits are averaged ("multi-sample dropout").
    """

    def __init__(self, config, model_config_path=None, pretrained=False, weight_path=None):
        """
        Args:
            config: configuration object (stored under `self.hparams.config`).
            model_config_path: optional location of the backbone config; defaults
                to `config.model`.
            pretrained: load backbone weights from `config.model` when True,
                otherwise build the backbone from its config only.
            weight_path: optional checkpoint to load into the whole module.
        """
        super().__init__()
        self.save_hyperparameters('config')

        config_source = model_config_path if model_config_path else config.model
        self.model_config = AutoConfig.from_pretrained(config_source, output_hidden_states=True)
        if pretrained:
            self.model = AutoModel.from_pretrained(config.model, config=self.model_config)
        else:
            self.model = AutoModel.from_config(self.model_config)

        self.pooler = MeanPooling()

        self.fc = nn.Linear(self.model_config.hidden_size, config.target_size)

        # TODO multi_dropout / layer norm
        self.dropout_0 = nn.Dropout(config.fc_dropout / 2.)
        self.dropout_1 = nn.Dropout(config.fc_dropout / 1.5)
        self.dropout_2 = nn.Dropout(config.fc_dropout)
        self.dropout_3 = nn.Dropout(config.fc_dropout * 1.5)
        self.dropout_4 = nn.Dropout(config.fc_dropout * 2.)

        self.init_weight(self.fc)
        self.set_metrics()

        # Defined up front so they can be read even if validation never ran or
        # never produced a best score.
        self.preds = None
        self.oof_states = None
        self.start_val_time = None

        if config.label_smooth:
            # NOTE(review): LabelSmoothLoss is not defined in the visible part of
            # this notebook -- confirm it is available before enabling.
            self.criterion = LabelSmoothLoss(smoothing=config.smoothing,
                                             loss_func=nn.BCEWithLogitsLoss(reduction="mean"))
        else:
            self.criterion = nn.CrossEntropyLoss(reduction="mean")

        if getattr(self.hparams.config, 'fgm', False):
            # NOTE(review): FGM is not defined in the visible part of this notebook,
            # and training_step performs no manual optimization -- confirm before enabling.
            self.automatic_optimization = False
            self.fgm = FGM(self, config.fgm_epsilion)

        if weight_path is not None:
            # torch.load unpickles the file: only load checkpoints from trusted sources.
            weight = torch.load(weight_path, map_location='cpu')
            if 'state_dict' in weight:
                weight = weight['state_dict']
            self.load_state_dict(weight)

    def forward(self, inputs):
        """Return class logits for a batch of tokenizer outputs.

        `inputs` is unpacked into the backbone and must contain `attention_mask`.
        """
        outputs = self.model(**inputs)
        # Only the first output is used; not every backbone returns a second,
        # pooled output, so it is not indexed.
        last_hidden_state = outputs[0]
        feature = self.pooler(last_hidden_state, inputs['attention_mask'])
        dropouts = (self.dropout_0, self.dropout_1, self.dropout_2, self.dropout_3, self.dropout_4)
        logits = [self.fc(dropout(feature)) for dropout in dropouts]
        return sum(logits) / len(logits)

    def set_metrics(self):
        """Create fresh running meters for train loss, val loss and val accuracy."""
        self.train_losses = AverageMeter()
        self.val_losses = AverageMeter()
        self.val_acc = AverageMeter()

    def init_weight(self, module):
        """Initialise Linear / Embedding / LayerNorm modules using the backbone's `initializer_range`."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log the running loss and learning rates.

        Note: the `labels` entry is removed from `batch` in place.
        """
        labels = batch.pop('labels')
        y_preds = self.forward(batch).float()
        loss = self.criterion(y_preds, labels)
        self.train_losses.update(loss.item(), len(labels))
        self.log('train/avg_loss', self.train_losses.avg)
        # The optimizer has 3 parameter groups, so get_last_lr() returns a
        # 3-element list: the first is an encoder group, the last the head group.
        last_lrs = self.trainer.lr_scheduler_configs[0].scheduler.get_last_lr()
        en_lr = last_lrs[0]
        de_lr = last_lrs[-1]
        self.log('train/en_lr', en_lr, prog_bar=True)
        self.log('train/de_lr', de_lr, prog_bar=True)

        if (self.trainer.global_step) % self.hparams.config.print_freq == 0 and batch_idx % self.trainer.accumulate_grad_batches == 0:
            self.print('Global step:{global_step}.'
                       'Train Loss: {loss.val:.4f}(avg: {loss.avg:.4f}) '
                       'Encoder LR: {en_lr:.3E}, Decoder LR: {de_lr:.3E}'
                       .format(global_step=self.trainer.global_step,
                               loss=self.train_losses,
                               en_lr=en_lr,
                               de_lr=de_lr))
        return loss

    def training_epoch_end(self, outs):
        """Release cached GPU memory at the end of each training epoch."""
        torch.cuda.empty_cache()

    def on_validation_start(self) -> None:
        """Remember when the validation run started."""
        self.start_val_time = time.time()

    def validation_step(self, batch, batch_idx):
        """Return `(logits, labels)` as numpy arrays and update the running val loss.

        Note: the `labels` entry is removed from `batch` in place.
        """
        labels = batch.pop('labels')
        y_preds = self.forward(batch).float()
        loss = self.criterion(y_preds, labels)
        self.val_losses.update(loss.item(), len(labels))
        self.log('val/avg_loss', self.val_losses.avg)
        return y_preds.cpu().numpy(), labels.cpu().numpy()

    def validation_epoch_end(self, outs):
        """Score the concatenated validation predictions and log the results."""
        preds = np.concatenate([item[0] for item in outs])
        val_labels = np.concatenate([item[1] for item in outs])

        val_loss_avg = self.val_losses.avg
        #  ======================== scoring ============================
        val_log_loss = get_score(preds, val_labels)
        self.log('val/loss_avg', val_loss_avg)
        self.log('val/log_loss', val_log_loss)
        self.print(f'Global step:{self.trainer.global_step}.\n Val loss avg: {val_loss_avg:.4f}, log_loss: {val_log_loss:.4f}')

        self.val_losses.reset()
        self.val_acc.reset()

        self.preds = preds  # keep this fold's predictions for CV scoring
        # self.trainer.checkpoint_callback.current_score = torch.tensor(round(log_loss,4))

    def on_validation_end(self) -> None:
        """Keep the latest predictions as OOF when this validation run is the best so far."""
        ckpt_callback = self.trainer.checkpoint_callback
        # Without a monitoring checkpoint callback, or before the monitored
        # metric has been logged, there is no "best" to compare against.
        if ckpt_callback is None or ckpt_callback.monitor not in self.trainer.callback_metrics:
            return
        current_score = self.trainer.callback_metrics[ckpt_callback.monitor]
        best_score = ckpt_callback.best_model_score
        if best_score is not None and current_score == best_score:
            self.oof_states = self.preds

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        """Return class probabilities for one batch as a numpy array."""
        # Targets, if the dataset supplied them, are not a backbone input.
        inputs = {key: value for key, value in batch.items() if key not in ('labels', 'label')}
        y_preds = self.forward(inputs).float()
        # The default loss is cross-entropy and validation is scored on softmax
        # probabilities, so predictions use softmax as well; an element-wise
        # sigmoid would not produce a distribution over the classes.
        return F.softmax(y_preds, dim=-1).cpu().numpy()

    def configure_optimizers(self):
        """Build AdamW with three parameter groups plus a per-step warmup scheduler.

        Groups: backbone weights with weight decay, backbone bias/LayerNorm
        parameters without weight decay, and every parameter whose name does not
        contain "model" (the head) at `decoder_lr`.

        Raises:
            ValueError: if `config.scheduler` is neither 'linear' nor 'cosine'.
        """
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        hp = self.hparams.config
        encoder_lr = hp.encoder_lr
        decoder_lr = hp.decoder_lr_ratio * encoder_lr
        num_cycles = hp.num_cycles
        # end_lr = self.hparams.config.min_lr
        weight_decay = hp.weight_decay
        eps = hp.eps
        betas = hp.betas
        optimizer_parameters = [
            {'params': [p for n, p in self.model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay,
             },
            {'params': [p for n, p in self.model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0,
             },
            {'params': [p for n, p in self.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0,
             }
        ]
        optimizer = AdamW(optimizer_parameters,
                          lr=encoder_lr, eps=eps, betas=betas)

        if self.trainer.max_steps is None or self.trainer.max_epochs is not None:
            # Number of optimizer steps: batches per epoch * epochs / accumulation.
            max_steps = (
                    len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs
                    // hp.gradient_accumulation_steps
            )
        else:
            max_steps = self.trainer.max_steps

        warmup_steps = hp.warmup_steps
        if isinstance(warmup_steps, float):
            # A float is interpreted as a fraction of the total step count.
            warmup_steps = int(warmup_steps * max_steps)

        print(f'====== Max steps: {max_steps},\t Warm up steps: {warmup_steps} =========')

        if hp.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps,
            )
        elif hp.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps,
                num_cycles=num_cycles
            )
        else:
            # Fail early and clearly instead of returning a scheduler config
            # that wraps None.
            raise ValueError(f"Unsupported scheduler {hp.scheduler!r}; expected 'linear' or 'cosine'.")
        sched = {
            'scheduler': scheduler, 'interval': 'step'
        }
        return ([optimizer], [sched])

## Train

In [None]:
# Seed the run, then build the data module. prepare_test=False means only
# train.csv is read and split into folds by prepare_data().
pl.seed_everything(CFG.seed)
dm = FPEDataModule(CFG, prepare_test=False)
dm.prepare_data()

# Echo the key hyper-parameters of this run.
print(f"train epochs:{CFG.max_epochs},\t batch_size:{CFG.batch_size} * {CFG.gradient_accumulation_steps}")
print(f"FGM:{CFG.fgm}_{CFG.fgm_epsilion}, \t label_smooth:{CFG.label_smooth}_{CFG.smoothing},")
print(f"encoder_lr:{CFG.encoder_lr},\t", f"decoder_lr:{CFG.encoder_lr * CFG.decoder_lr_ratio}")
print(f"precision:{CFG.precision}, grad_norm:{CFG.max_grad_norm}, \t apex:{CFG.apex}_{CFG.apex_level}")

In [None]:
# Train one model per requested fold, keeping the best checkpoint (lowest
# 'val/log_loss') and saving that checkpoint's out-of-fold predictions.
for train_fold in CFG.trn_fold:
    fgm_p = 'fgm_' if CFG.fgm else ''
    ls = 'ls_' if CFG.label_smooth else ''
    prefix = f'{fgm_p}{ls}fold{train_fold}'
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filename=prefix + 'step{step}-val_log_loss{val/log_loss:.4f}',
        auto_insert_metric_name=False,
        save_top_k=1, monitor='val/log_loss', mode='min', save_last=False, verbose=True, save_weights_only=True,
    )
    callbacks = [checkpoint_callback]
    dm.set_trn_fold(train_fold)

    model = FPEModel(CFG, model_config_path=None, pretrained=True)
    print('Load model!')
    trainer = pl.Trainer(
        # Use the first GPU when one is present; otherwise fall back to CPU
        # instead of failing at Trainer construction.
        gpus=[0] if torch.cuda.is_available() else None,
        default_root_dir=f"seed{CFG.seed}_fold{train_fold}_from_{CFG.model.split('/')[-1]}",
        log_every_n_steps=10,
        amp_backend="apex" if CFG.apex else "native",
        amp_level=CFG.apex_level if CFG.apex else None,
        precision=16 if CFG.apex else CFG.precision,
        max_epochs=CFG.max_epochs,
        callbacks=callbacks,
        gradient_clip_val=None if CFG.fgm else CFG.max_grad_norm,
        accumulate_grad_batches=None if CFG.fgm else CFG.gradient_accumulation_steps,
        fast_dev_run=CFG.fast_dev_run,
        num_sanity_val_steps=CFG.num_sanity_val_steps,
        val_check_interval=CFG.val_check_interval,
    )
    # "amp_level" is used only when "precision = 16" and "amp_backend = 'apex' ",
    # i.e it's only relevant for only one type of configuration and is generally not required.

    trainer.fit(model, datamodule=dm)

    # OOF predictions exist only if some validation run became the best one,
    # and the checkpoint directory exists only if checkpointing was active.
    oof_states = getattr(model, 'oof_states', None)
    checkpoint_dir = checkpoint_callback.dirpath
    if oof_states is not None and checkpoint_dir is not None:
        torch.save(oof_states, os.path.join(checkpoint_dir, f'fold{train_fold}.oof'))
    else:
        print(f'No OOF predictions saved for fold {train_fold}.')