In [None]:
# Install PyTorch Lightning 1.6.0 from a wheel file stored under ../input
# (a local path, so no package index is contacted for this wheel).
!pip install ../input/pytorchlightning160/pytorch_lightning-1.6.0-py3-none-any.whl
# !pip install pytorch-crf

In [None]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import shutil
from pathlib import Path

# Source (patched files) and destination (installed transformers package) folders.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Step 1: swap the package-level slow->fast tokenizer converter for the patched copy.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# Step 2: swap the DeBERTa-v2 tokenizer modules. A source file whose name starts
# with "deberta" is stored in the package with that word removed
# ("deberta__init__.py" -> "__init__.py"); the others keep their name.
for filename in ('tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py', "deberta__init__.py"):
    target_name = filename.replace("deberta", "") if filename.startswith("deberta") else filename
    filepath = deberta_v2_path / target_name
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
# from torchcrf import CRF
import pytorch_lightning as pl
from pathlib import Path

# os.system('pip uninstall -y transformers')
# os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class CFG:
    """
    Static configuration namespace for this notebook.

    The attributes are read directly off the class (it is never instantiated
    in the visible code): paths and loader settings are consumed by
    NBMEDataModule, model settings by NBMEModel, and `weight_path`, `seed`
    and `output_dir` by the inference cells.
    """
    ## General settings
#     data_dir = '/home/tzj/data/nbme-score-clinical-patient-notes'
    data_dir = '../input/nbme-score-clinical-patient-notes'
    output_dir = './'
#     weight_path = '../input/multi-drop'
    # Directories that are searched recursively for *.ckpt checkpoints to ensemble.
    weight_path = [
        '../input/nbme-pos-weight',
#         '../input/esemble-lb01',
#         '../input/fgm-fold4',
    ]

    debug = False

    seed = 6001
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

    ## Data settings
    num_workers = 4
    batch_size = 4
    max_len = 512
    pin_memory = True

    ## Model settings
    model = "../input/deberta-v3-large/deberta-v3-large"
    fgm = False
    label_smooth = False
    smoothing = 0.1

    # Base dropout rate; NBMEModel derives five dropout layers from it
    # (x0.5, /1.5, x1, x1.5, x2).
    fc_dropout = 0.2

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    F1 score computed over all instances pooled together (micro averaging).

    Every per-instance binary array is flattened into one long vector before
    scoring, so each element contributes equally regardless of which instance
    it came from.

    Args:
        preds (list of lists of ints): Binary predictions, one array per instance.
        truths (list of lists of ints): Binary ground truths, one array per instance.

    Returns:
        float: f1 score.
    """
    pooled_preds = np.concatenate(preds)
    pooled_truths = np.concatenate(truths)
    return f1_score(pooled_truths, pooled_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans as [start, end) pairs; `end`
            is exclusive because the span is written with slice assignment.
        length (int, optional): Size of the output array. When omitted, the
            largest value found in `spans` is used, or 0 if `spans` is empty.

    Returns:
        np array [length]: Binarized spans (1.0 inside a span, 0.0 elsewhere).
    """
    if length is None:
        # np.max raises ValueError on an empty sequence, so an empty span list
        # is mapped to an empty array instead of crashing.
        length = np.max(spans) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(truths, preds):
    """
    Micro f1 on spans.

    Both span lists are turned into 0/1 character masks (1 marks an annotated
    character) of a common length, and the f1 score is then computed on the
    pooled masks. Pairs in which both the prediction and the ground truth are
    empty are left out of the score.

    Args:
        truths (list of lists of two ints): Ground truth spans, one list per sample.
        preds (list of lists of two ints): Prediction spans, one list per sample.

    Returns:
        float: f1 score.
    """
    pred_masks, truth_masks = [], []
    for pred_spans, truth_spans in zip(preds, truths):
        has_pred = len(pred_spans) > 0
        has_truth = len(truth_spans) > 0
        if not (has_pred or has_truth):
            continue
        # Both masks must share one length: the furthest character index
        # mentioned by either side.
        pred_extent = np.max(pred_spans) if has_pred else 0
        truth_extent = np.max(truth_spans) if has_truth else 0
        n_chars = max(pred_extent, truth_extent)
        pred_masks.append(spans_to_binary(pred_spans, n_chars))
        truth_masks.append(spans_to_binary(truth_spans, n_chars))
    return micro_f1(pred_masks, truth_masks)

In [None]:
def create_labels_for_scoring_n(df):
    """
    Build the ground-truth character spans used for scoring.

    Side effect: adds (or overwrites) the column 'location_for_create_labels'
    on `df`. Each cell holds either an empty list or a one-element list with
    all of the row's location strings joined by ';'.

    The rows are processed positionally, so `df` does not need a 0..n-1 index.

    Args:
        df (pd.DataFrame): Frame with a 'location' column whose cells are
            lists of strings such as '70 91' or '79 94;100 116'.

    Returns:
        list: One entry per row; each entry is a list of [start, end] int pairs
        (empty when the row has no location).
    """
    # example: ['0 1', '3 4'] -> ['0 1;3 4']
    joined_locations = [[';'.join(lst)] if lst else [] for lst in df['location']]
    # An explicit object Series keeps every cell as a Python list; assigning a
    # nested list through .loc is unwrapped differently across pandas versions.
    df['location_for_create_labels'] = pd.Series(joined_locations, index=df.index, dtype=object)

    # create labels
    truths = []
    for location_list in joined_locations:
        truth = []
        if len(location_list) > 0:
            for loc in (s.split() for s in location_list[0].split(';')):
                truth.append([int(loc[0]), int(loc[1])])
        truths.append(truth)
    return truths

def create_labels_for_scoring(df):
    """
    Turn the dataset's 'location' column into the span labels used for scoring.

    Each cell of df['location'] is a list of strings; a string may hold
    several 'start end' pairs separated by ';'. Every pair becomes an
    [int, int] entry.

    The returned structure looks like:
        [[[696, 724]],
         [[668, 693]],
         [[203, 217]],
         [[70, 91], [176, 183]],
         [[222, 258]],
         [],
         [[321, 329], [404, 413], [652, 661]]]
    """
    truths = []
    for raw_locations in df['location']:
        # Some annotations contain ';', so flatten them into single 'start end' pieces.
        pieces = [piece for entry in raw_locations for piece in entry.split(';')]
        spans = [[int(start), int(end)] for start, end in (piece.split() for piece in pieces)]
        truths.append(spans)
    return truths

def get_char_probs(features, texts, predictions, tokenizer):
    """
    Spread token-level probabilities onto the characters of each text.

    Each (feature, text) pair is re-tokenized to recover the character offsets
    of its tokens. Every character covered by a token of the second sequence
    (sequence id 1, i.e. the text) receives that token's probability; all
    other characters stay at 0.

    Returns:
        list of np arrays: one array per text, of length len(text).
    """
    char_probs = [np.zeros(len(text)) for text in texts]
    for sample_idx, (feature, text, token_probs) in enumerate(zip(features, texts, predictions)):
        encoded = tokenizer(feature, text, add_special_tokens=True, return_offsets_mapping=True)
        sequence_ids = encoded.sequence_ids()
        # token_probs may be longer than the offsets because it includes
        # padding; zip silently drops the surplus.
        for token_idx, (offset, prob) in enumerate(zip(encoded['offset_mapping'], token_probs)):
            if sequence_ids[token_idx] == 1:
                # All characters of one token share that token's probability.
                char_probs[sample_idx][offset[0]:offset[1]] = prob
    return char_probs

def get_results(char_probs, th=0.5):
    """
    Convert per-character probabilities into span strings.

    For every sample, the indices of characters whose probability is >= th are
    shifted by +1, grouped into runs of consecutive values, and each run is
    written as "first last". Runs are joined with ';'.

    TODO: confirm the +1 / -1 convention for character positions.

    Returns a list with one string per sample, e.g.:
        ['2 3;11 14;30 32;44 46']
    """
    results = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1
        spans = []
        run_first = run_last = None
        # positions is ascending, so a run ends as soon as a value is not
        # exactly one more than the previous one.
        for pos in positions:
            if run_first is None:
                run_first = run_last = pos
            elif pos == run_last + 1:
                run_last = pos
            else:
                spans.append(f"{run_first} {run_last}")
                run_first = run_last = pos
        if run_first is not None:
            spans.append(f"{run_first} {run_last}")
        results.append(";".join(spans))
    return results

def get_predictions(results):
    """
    Parse span strings into integer pairs.

    Each input string holds 'start end' pairs separated by ';' (an empty
    string means no span). The output has one list per input string, e.g.:
        [[2, 3], [11, 14], [30, 32], [44, 46]]
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        fields_per_span = [chunk.split() for chunk in result.split(';')]
        predictions.append([[int(fields[0]), int(fields[1])] for fields in fields_per_span])
    return predictions

def get_score(y_true, y_pred):
    """Score predicted spans against ground-truth spans with the span-level micro F1."""
    return span_micro_f1(y_true, y_pred)

## Utils

In [None]:
def get_tokenizer(tokenizer_path):
    """
    Load the tokenizer stored at `tokenizer_path`.

    Paths containing 'deberta' are loaded with DebertaV2TokenizerFast; any
    other path goes through AutoTokenizer.
    """
    if 'deberta' not in tokenizer_path:
        return AutoTokenizer.from_pretrained(tokenizer_path)

    # Imported lazily: only needed for DeBERTa checkpoints.
    from transformers.models.deberta_v2 import DebertaV2TokenizerFast
    return DebertaV2TokenizerFast.from_pretrained(tokenizer_path)


In [None]:
class NBMEDataModule(pl.LightningDataModule):
    """
    Lightning data module that loads the competition CSV files, tokenizes them
    and serves train / validation / test / predict dataloaders.

    Args:
        config: Object exposing `debug`, `batch_size`, `pin_memory`,
            `num_workers`, `max_len`, `data_dir`, `n_fold` and `model`
            (and `debug_size` when `debug` is truthy).
        prepare_train (bool): Load and split the training data in `prepare_data`.
        prepare_test (bool): Load the test data in `prepare_data`.
    """

    def __init__(self, config, prepare_train=True, prepare_test=True):
        super().__init__()
        self.prepare_data_per_node = False
        self.debug = config.debug
        # debug_size is only read from the config when debugging is enabled.
        self.debug_size = config.debug_size if self.debug else 0
        # Training batches are shuffled unless debugging.
        self.shuffle = not self.debug
        self.batch_size = config.batch_size
        self.pin_memory = config.pin_memory
        self.num_workers = config.num_workers
        self.max_len = config.max_len
        self.data_dir = config.data_dir
        self.n_fold = config.n_fold

        self.tokenizer = get_tokenizer(config.model)
        self.prepare_train = prepare_train
        self.prepare_test = prepare_test

    def set_trn_fold(self, trn_fold):
        """Select the fold held out for validation; must be called before `setup('fit')`."""
        self.trn_fold = trn_fold

    def load_train(self):
        """
        Read train.csv and join it with the feature texts and patient notes.

        Returns:
            pd.DataFrame: Training rows with parsed `annotation` / `location`
            lists and an added `annotation_length` column.
        """
        data_dir = Path(self.data_dir)
        train = pd.read_csv(data_dir / 'train.csv')
        features = pd.read_csv(data_dir / 'features.csv')
        patient_notes = pd.read_csv(data_dir / 'patient_notes.csv')
        # Both columns are stored in the CSV as string representations of lists.
        train['annotation'] = train['annotation'].apply(ast.literal_eval)
        train['location'] = train['location'].apply(ast.literal_eval)

        # NOTE(review): `correcting` is not defined in the visible part of this
        # notebook; it must be provided elsewhere before training data is loaded.
        train, features, patient_notes = correcting(train, features, patient_notes)
        train = train.merge(features, on=['feature_num', 'case_num'], how='left')
        train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
        train['annotation_length'] = train['annotation'].apply(len)
        return train

    def load_test(self):
        """
        Read test.csv and join it with the feature texts and patient notes.

        Returns:
            tuple: (test DataFrame, sample-submission DataFrame).
        """
        data_dir = Path(self.data_dir)
        test = pd.read_csv(data_dir / 'test.csv')
        features = pd.read_csv(data_dir / 'features.csv')
        patient_notes = pd.read_csv(data_dir / 'patient_notes.csv')
        submission = pd.read_csv(data_dir / 'sample_submission.csv')

        # Manual override of the feature text at row label 27
        # (presumably a data correction; verify against features.csv).
        features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"

        test = test.merge(features, on=['feature_num', 'case_num'], how='left')
        test = test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
        return test, submission

    def calculate_max_len(self, dataset):
        """
        Measure token counts of the notes and feature texts.

        Missing values are treated as empty strings for the measurement;
        `dataset` itself is not modified.

        Returns:
            tuple: (feature_text token counts, pn_history token counts,
            max feature length + max note length + 3 special tokens).
        """
        # fillna returns a new Series, so the result has to be kept and used.
        pn_history = dataset['pn_history'].fillna('')
        feature_text = dataset['feature_text'].fillna('')

        def count_tokens(text):
            return len(self.tokenizer(text, add_special_tokens=False)['input_ids'])

        tqdm.pandas(desc="pn_history_lens")
        pn_history_lens = pn_history.progress_apply(count_tokens)
        tqdm.pandas(desc="feature_text_lens")
        feature_text_lens = feature_text.progress_apply(count_tokens)
        max_len_feat = feature_text_lens.max()
        max_len_pn = pn_history_lens.max()
        return (feature_text_lens, pn_history_lens, max_len_feat + max_len_pn + 3)  # cls & sep & sep

    def prepare_data(self):
        """Load the requested splits once; each flag is cleared after its split is loaded."""
        if self.prepare_train:
            train = self.load_train()
            # Split the data into n folds.
            # NOTE(review): `CV_group_split` is not defined in the visible part of this notebook.
            train = CV_group_split(train, self.n_fold, self.debug, self.debug_size)
            self.train_max_len = self.calculate_max_len(train)[2]
            self.train = train
            self.prepare_train = False
            print('Train data prepared!')

        if self.prepare_test:
            self.test, self.submission = self.load_test()
            self.prepare_test = False
            print('Test data prepared!')

    def setup(self, stage='fit'):
        """Build the datasets needed for the given stage ('fit', 'test' or 'predict')."""
        if stage == 'fit':
            self.build_fit_dataset(trn_fold=self.trn_fold)

        elif stage == 'test':
            self.build_test_dataset()

        elif stage == 'predict':
            self.build_predict_dataset()

    def build_fit_dataset(self, trn_fold=None):
        """
        Split `self.train` into training and validation parts by fold.

        Rows whose `fold` equals `trn_fold` form the validation set; nothing is
        built when `trn_fold` is None.
        """
        df = self.train
        if trn_fold is not None:
            self.train_df = df[df['fold'] != trn_fold].reset_index(drop=True)
            self.val_df = df[df['fold'] == trn_fold].reset_index(drop=True)
            # NOTE(review): `NBMEDataset` is not defined in the visible part of this notebook.
            self.train_dataset = NBMEDataset(self.train_df, self.tokenizer, self.train_max_len, split='train')
            self.val_dataset = NBMEDataset(self.val_df, self.tokenizer, self.train_max_len, split='val')

    def build_test_dataset(self):
        """Wrap the test frame in an inference dataset padded to `max_len`."""
        self.test_dataset = NBMEInferDataset(self.test, self.tokenizer, self.max_len)

    def build_predict_dataset(self):
        """Wrap the test frame in an inference dataset padded to `max_len`."""
        self.predict_dataset = NBMEInferDataset(self.test, self.tokenizer, self.max_len)

    def train_dataloader(self):
        """Training loader: configured batch size, shuffled unless debugging."""
        loader = DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers,
                            pin_memory=self.pin_memory, shuffle=self.shuffle)
        return loader

    def val_dataloader(self):
        """Validation loader: 4x the training batch size, fixed order."""
        loader = DataLoader(self.val_dataset, batch_size=self.batch_size * 4, num_workers=self.num_workers,
                            shuffle=False)
        return loader

    def test_dataloader(self):
        """Test loader: 4x the training batch size, fixed order."""
        loader = DataLoader(self.test_dataset, batch_size=self.batch_size * 4, num_workers=self.num_workers,
                            shuffle=False)
        return loader

    def predict_dataloader(self):
        """Prediction loader: 4x the training batch size, fixed order."""
        loader = DataLoader(self.predict_dataset, batch_size=self.batch_size * 4, num_workers=self.num_workers,
                            shuffle=False)
        return loader


In [None]:
def tokenize(tokenizer, singel_data, max_len, return_offsets_mapping=True):
    """
    Encode one row as a (feature text, patient note) pair.

    The pair is padded to `max_len` and every field of the encoding is
    converted in place to a `torch.long` tensor.

    Args:
        tokenizer: Callable tokenizer returning a dict-like encoding.
        singel_data: Row providing 'feature_text' and 'pn_history'.
        max_len (int): Length passed as `max_length` for padding.
        return_offsets_mapping (bool): Forwarded to the tokenizer.

    Returns:
        The tokenizer's encoding object, with tensor values.
    """
    question, paragraph = singel_data[['feature_text', 'pn_history']]
    encoding = tokenizer(
        question,
        paragraph,
        add_special_tokens=True,  # cls, sep
        max_length=max_len,
        padding='max_length',
        return_offsets_mapping=return_offsets_mapping)
    for field in list(encoding.keys()):
        encoding[field] = torch.tensor(encoding[field], dtype=torch.long)
    return encoding

def add_labels(tokenizer, singel_data, max_len, return_offsets_mapping=True):
    """
    Build the per-token training labels for one row.

    The patient note is tokenized on its own (padded to `max_len`) and every
    token position receives one of three values:
        -1.0  position whose sequence id is not 0 (ignored by the loss mask),
         1.0  token inside one of the annotated character spans,
         0.0  every other token.

    NOTE(review): the note is encoded alone here, while `tokenize` encodes the
    (feature_text, pn_history) pair with the feature first — confirm that the
    caller aligns these labels with the model inputs.

    Args:
        tokenizer: Tokenizer returning `offset_mapping` and `sequence_ids()`.
        singel_data: Row providing 'pn_history', 'location' and 'annotation_length'.
        max_len (int): Padding length.
        return_offsets_mapping (bool): Forwarded to the tokenizer; the offsets
            are read below, so it has to stay truthy.

    Returns:
        torch.FloatTensor of length len(offset_mapping).
    """
    text, location_list, annotation_length = singel_data[
        ['pn_history', 'location', 'annotation_length']]
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_offsets_mapping=return_offsets_mapping)

    offset_mapping = encoded['offset_mapping']
    # Positions that do not belong to sequence 0 are marked to be ignored.
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]

    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        # A feature may occur several times in the same patient note.
        for location in location_list:
            # Note: some locations contain ';' (e.g. row 8478), which means the
            # corresponding annotation is discontinuous.
            # ['79 94;100 116']  -> [['79 94'], ['100' '116']]
            for loc in [s.split() for s in location.split(';')]:
                # -1 is the "not found yet" sentinel for both boundaries.
                token_start_idx = -1
                token_end_idx = -1
                char_start_idx, char_end_idx = int(loc[0]), int(loc[1])
                ######## In offset_mapping, each (start, end) pair may include a leading space.
                # token_start_idx must stay in bounds, and the first char of its
                # token must not lie after the answer's start char.
                for idx in range(len(offset_mapping)):
                    # First token that begins after the span start: the span
                    # starts in the token just before it.
                    if (token_start_idx == -1) & (char_start_idx < offset_mapping[idx][0]):
                        token_start_idx = idx - 1
                    # First token whose end reaches the span end; +1 makes the
                    # slice below include that token.
                    if (token_end_idx == -1) & (char_end_idx <= offset_mapping[idx][1]):
                        token_end_idx = idx + 1
                # No start found: fall back to the end index (the slice is then empty).
                if token_start_idx == -1:
                    token_start_idx = token_end_idx
                if (token_start_idx != -1) & (token_end_idx != -1):
                    label[token_start_idx:token_end_idx] = 1.0
    return torch.tensor(label, dtype=torch.float)


In [None]:
class NBMEInferDataset(Dataset):
    """
    Inference-time dataset: yields tokenized inputs only (no labels).

    Args:
        df: Frame-like object supporting `len()` and `.iloc[index]`.
        tokenizer: Tokenizer forwarded to `tokenize`.
        max_len (int): Padding length forwarded to `tokenize`.
    """

    def __init__(self, df, tokenizer, max_len):
        super().__init__()
        self.df, self.tokenizer, self.max_len = df, tokenizer, max_len

    def __len__(self):
        """Number of rows available for inference."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at position `index`, without offset mappings."""
        row = self.df.iloc[index]
        return tokenize(self.tokenizer, row, self.max_len, return_offsets_mapping=False)


In [None]:
class AverageMeter(object):
    """
    Running-average tracker.

    Attributes:
        val: Most recent value passed to `update`.
        sum: Weighted sum of all values seen since the last reset.
        count: Total weight seen since the last reset.
        avg: `sum / count`.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
class NBMEModel(pl.LightningModule):
    """
    Token-level binary classifier: a transformer backbone followed by one
    linear layer, with five dropout rates averaged before the output
    (multi-sample dropout).

    Args:
        config: Configuration object (stored as `self.hparams.config`). The
            constructor reads `model`, `fc_dropout`, `label_smooth`,
            `smoothing` and, if present, `fgm`; the training methods read
            further optimizer / scheduler settings.
        model_config_path: Optional path to a saved backbone config. When
            omitted, the config is loaded from `config.model`.
        pretrained (bool): Load pretrained backbone weights instead of
            building the backbone from its config only.
        weight_path: Optional checkpoint to load into the whole module
            (either a raw state dict or a dict with a 'state_dict' key).
    """

    def __init__(self, config, model_config_path=None, pretrained=False, weight_path=None):
        super().__init__()
        self.save_hyperparameters('config')

        if model_config_path:
            # NOTE(review): torch.load unpickles the file; only load trusted configs.
            self.model_config = torch.load(model_config_path)
        else:
            self.model_config = AutoConfig.from_pretrained(config.model, output_hidden_states=True)
        if pretrained:
            self.model = AutoModel.from_pretrained(config.model, config=self.model_config)
        else:
            self.model = AutoModel.from_config(self.model_config)

        self.fc = nn.Linear(self.model_config.hidden_size, 1)

        # TODO multi_dropout / layer norm
        self.dropout_0 = nn.Dropout(config.fc_dropout / 2.)
        self.dropout_1 = nn.Dropout(config.fc_dropout / 1.5)
        self.dropout_2 = nn.Dropout(config.fc_dropout)
        self.dropout_3 = nn.Dropout(config.fc_dropout * 1.5)
        self.dropout_4 = nn.Dropout(config.fc_dropout * 2.)
        self.__init_weight(self.fc)
        self.__set_metrics()

        if config.label_smooth:
            # NOTE(review): `LabelSmoothLoss` is not defined in the visible part of this notebook.
            self.criterion = LabelSmoothLoss(smoothing=config.smoothing, loss_func=nn.BCEWithLogitsLoss(reduction="none"))
        else:
            self.criterion = nn.BCEWithLogitsLoss(reduction="none")

        # FGM needs manual optimization. The flag is optional in the config.
        if getattr(self.hparams.config, 'fgm', False):
            self.automatic_optimization = False
            # NOTE(review): `FGM` is not defined in the visible part of this notebook.
            self.fgm = FGM(self)

        if weight_path is not None:
            weight = torch.load(weight_path, map_location='cpu')
            if 'state_dict' in weight.keys():
                weight = weight['state_dict']
            self.load_state_dict(weight)

    def __set_metrics(self):
        """Create the running-average meters used for logging."""
        self.train_losses = AverageMeter()
        self.val_losses = AverageMeter()
        self.val_acc = AverageMeter()

        self.train_losses.reset()
        self.val_losses.reset()
        self.val_acc.reset()

    def __init_weight(self, module):
        """Initialise a Linear / Embedding / LayerNorm module using the backbone's initializer_range."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """
        Run the backbone and the classification head.

        Args:
            inputs: Mapping of keyword arguments for the backbone.

        Returns:
            Logits with a trailing dimension of size 1 (one value per token).
        """
        outputs = self.model(**inputs)
        # Only the first output (last hidden states) is used. Indexing further
        # outputs would fail for backbones that return nothing else.
        last_hidden_states = outputs[0]
        output_0 = self.fc(self.dropout_0(last_hidden_states))
        output_1 = self.fc(self.dropout_1(last_hidden_states))
        output_2 = self.fc(self.dropout_2(last_hidden_states))
        output_3 = self.fc(self.dropout_3(last_hidden_states))
        output_4 = self.fc(self.dropout_4(last_hidden_states))
        logits = (output_0 + output_1 + output_2 + output_3 + output_4) / 5
        return logits

    def training_step(self, batch, batch_idx):
        """Compute the masked BCE loss; with FGM enabled, also run the adversarial pass and step manually."""
        inputs, labels = batch
        y_preds = self.forward(inputs)
        loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        # Positions labelled -1 are excluded from the loss.
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        self.train_losses.update(loss.item(), len(labels))
        self.log('train/avg_loss', self.train_losses.avg)
        # The optimizer has 3 parameter groups, so get_last_lr() returns a
        # 3-element list: first = encoder group, last = decoder group.
        en_lr = self.trainer.lr_scheduler_configs[0].scheduler.get_last_lr()[0]
        de_lr = self.trainer.lr_scheduler_configs[0].scheduler.get_last_lr()[-1]
        self.log('train/en_lr', en_lr, prog_bar=True)
        self.log('train/de_lr', de_lr, prog_bar=True)

        if (self.trainer.global_step) % self.hparams.config.print_freq == 0:
            self.print('Global step:{global_step}.'
                       'Train Loss: {loss.val:.4f}(avg: {loss.avg:.4f}) '
                       'Encoder LR: {en_lr:.8f}, Decoder LR: {de_lr:.8f}'
                       .format(global_step=self.trainer.global_step,
                               loss=self.train_losses,
                               en_lr=en_lr,
                               de_lr=de_lr))
        # Without FGM the loss can simply be returned here. With FGM, the
        # backward pass and the optimizer update have to be done manually.
        if getattr(self.hparams.config, 'fgm', False):
            # Scaling the loss by the accumulation steps was tried, but results
            # were better without it.
            self.manual_backward(loss)
            # clip_grad_norm_ is the in-place replacement for the deprecated clip_grad_norm.
            torch.nn.utils.clip_grad_norm_(self.parameters(), self.hparams.config.max_grad_norm)
            # global_step cannot be used here: with automatic optimization
            # disabled it only advances after an optimizer step, so the
            # condition would never become true.
            if (batch_idx + 1) % self.hparams.config.gradient_accumulation_steps == 0:
                self.fgm.attack()
                y_preds_adv = self.forward(inputs)
                loss_adv = self.criterion(y_preds_adv.view(-1, 1), labels.view(-1, 1))
                loss_adv = torch.masked_select(loss_adv, labels.view(-1, 1) != -1).mean()
                self.manual_backward(loss_adv)
                self.fgm.restore()

                opt = self.optimizers()
                opt.step()
                opt.zero_grad()
                sch = self.lr_schedulers()
                sch.step()

        return loss

    def training_epoch_end(self, outs):
        """Release cached GPU memory at the end of every training epoch."""
        torch.cuda.empty_cache()

    def validation_step(self, batch, batch_idx):
        """Return the masked loss and the sigmoid probabilities (as a NumPy array) for one batch."""
        inputs, labels = batch
        y_preds = self.forward(inputs)
        loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        self.val_losses.update(loss.item(), len(labels))
        self.log('val/avg_loss', self.val_losses.avg)
        return loss, y_preds.sigmoid().cpu().numpy()

    def validation_epoch_end(self, outs):
        """Turn the collected token probabilities into spans and log the span-level F1."""
        val_df = self.trainer.datamodule.val_df
        val_labels = create_labels_for_scoring_n(val_df)
        valid_features, valid_texts = val_df['feature_text'], val_df['pn_history']
        preds = np.concatenate([item[1] for item in outs])
        val_loss_avg = self.val_losses.avg
        #  ======================== scoring ============================
        # get_char_probs re-tokenizes each (feature, text) pair, so it needs
        # the feature texts as well as the notes.
        char_probs = get_char_probs(valid_features, valid_texts, preds, self.trainer.datamodule.tokenizer)
        results = get_results(char_probs)
        predictions = get_predictions(results)
        score = get_score(val_labels, predictions)
        self.log('val/loss_avg', val_loss_avg)
        self.log('val/score', score)
        self.print(f'Global step:{self.trainer.global_step}.\n Val loss avg: {val_loss_avg}, score: {score}')

        self.val_losses.reset()
        self.val_acc.reset()

    def predict_step(self, batch, batch_idx, dataloader_idx= None):
        """Return the sigmoid probabilities for one batch as a NumPy array."""
        inputs = batch
        y_preds = self.forward(inputs)
        return y_preds.sigmoid().cpu().numpy()

    def configure_optimizers(self):
        """
        Build AdamW with three parameter groups and a per-step LR scheduler.

        Groups: backbone weights with weight decay, backbone bias / LayerNorm
        parameters without decay (both at `encoder_lr`), and all parameters
        whose name does not contain "model" at `decoder_lr` without decay.
        """
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        encoder_lr = self.hparams.config.encoder_lr
        decoder_lr = self.hparams.config.decoder_lr
        num_cycles = self.hparams.config.num_cycles
        weight_decay = self.hparams.config.weight_decay
        eps = self.hparams.config.eps
        betas = self.hparams.config.betas
        optimizer_parameters = [
            {'params': [p for n, p in self.model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay,
             },
            {'params': [p for n, p in self.model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0,
             },
            {'params': [p for n, p in self.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0,
             }
        ]
        optimizer = AdamW(optimizer_parameters,
                          lr=encoder_lr, eps=eps, betas=betas)

        if self.trainer.max_steps is None or self.trainer.max_epochs is not None:
            # FGM requires automatic optimization to be off, in which case the
            # trainer's accumulate_grad_batches is None. The step count is
            # therefore computed from the config value, not the trainer's.
            max_steps = (
                    len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs
                    // self.hparams.config.gradient_accumulation_steps
            )
        else:
            max_steps = self.trainer.max_steps

        warmup_steps = self.hparams.config.warmup_steps
        # A float warmup value is interpreted as a fraction of the total steps.
        if isinstance(warmup_steps, float):
            warmup_steps = int(warmup_steps * max_steps)

        print(f'====== Max steps: {max_steps},\t Warm up steps: {warmup_steps} =========')

        if self.hparams.config.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps,
            )
        elif self.hparams.config.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps,
                num_cycles=num_cycles
            )
        else:
            # NOTE(review): any other scheduler name leaves the scheduler as None,
            # while training_step still reads the scheduler's last LR.
            scheduler = None
        sched = {
            'scheduler': scheduler, 'interval': 'step'
        }
        return ([optimizer], [sched])



In [None]:
#     cfg = create_cfg('test')
# Inference only: the data module loads the test split and skips training data.
cfg = CFG
pl.seed_everything(cfg.seed)
dm = NBMEDataModule(cfg, prepare_train=False)
dm.prepare_data()

test_df = dm.test
valid_features = test_df['feature_text']
valid_texts = test_df['pn_history']

# Gather every checkpoint found recursively under each configured weight directory.
weight_paths = [ckpt for weight_dir in cfg.weight_path for ckpt in Path(weight_dir).rglob('*.ckpt')]
weight_paths

In [None]:
def _cv_score_from_name(checkpoint_path):
    """Read the number that follows 'score' in the checkpoint's file name (extension excluded)."""
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'score([\d.]*)', checkpoint_path.stem)
    if match is None:
        raise ValueError(f"No 'score<number>' found in checkpoint name: {checkpoint_path.name}")
    return float(match.group(1))


cv_score = torch.tensor([_cv_score_from_name(weight_path) for weight_path in weight_paths])
# NOTE(review): the score of the last checkpoint is overridden by hand; which
# checkpoint is "last" depends on the order returned by rglob.
cv_score[-1] = 0.8830
cv_score

In [None]:
# Ensemble weights: softmax over the CV scores after dividing them by 0.01,
# which magnifies score differences a hundredfold before normalisation.
scaled_scores = cv_score / 0.01
weights = F.softmax(scaled_scores, dim=0).float().numpy()
weights

In [None]:
# One entry per checkpoint; each entry is the list of per-character probability
# arrays produced by get_char_probs for the test rows.
predictions = []
for weight_path in weight_paths:
    weight_name = weight_path.name
    print(f"Using weight from {weight_name}.")

    # The backbone is built from its config only; all weights come from the checkpoint.
    model = NBMEModel(cfg, model_config_path=None, pretrained=False, weight_path=weight_path)
    trainer = pl.Trainer(gpus=[0], default_root_dir=cfg.output_dir)

    batch_outputs = trainer.predict(model, datamodule=dm)
    # Stack the batches and drop the trailing size-1 logit dimension.
    token_probs = np.concatenate(batch_outputs).squeeze(axis=-1)
    predictions.append(get_char_probs(valid_features, valid_texts, token_probs, dm.tokenizer))

    # Free memory before the next checkpoint is loaded.
    del model, trainer, batch_outputs, token_probs
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Weighted average of the per-character probabilities across checkpoints.
# The arrays have one length per patient note, so they are combined sample by
# sample rather than stacked into one (ragged) NumPy array. `predictions` is
# left untouched, which keeps this cell safe to re-run.
n_samples = len(predictions[0]) if len(predictions) else 0
ensembled_probs = [
    sum(w * model_probs[sample_idx] for w, model_probs in zip(weights, predictions))
    for sample_idx in range(n_samples)
]

res = get_results(ensembled_probs, th=0.5)
dm.submission['location'] = res
dm.submission[['id', 'location']].to_csv('submission.csv', index=False)




In [None]:
# Display the span strings that were stored in the submission's `location` column.
res

In [None]:
# NOTE(review): this cell displays `res` again, exactly like the preceding cell.
res