In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB_DISABLED"] = "true"

import shutil
from pathlib import Path
import pickle
import random
import uuid
import datetime
import json
import ast
import itertools

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

import torch
from torch.utils.data import Dataset
import torch.nn as nn

from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, DebertaV2Tokenizer
from transformers import AutoModel, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput

class cfg:
    # Experiment configuration, read everywhere as plain class attributes (cfg.lr, cfg.seed, ...).
    exp_id = "1002" # experiment ID
    seed = 42 # random seed
    data_path = "/home/xm/workspace/nbme-score-clinical-patient-notes/train_processed.pkl" # pickled DataFrame loaded with pd.read_pickle
    pretrained_checkpoint = '/home/xm/workspace/output/1001/checkpoint-7908' # checkpoint passed to the tokenizer and to NBMEModel
    lr = 1e-5 # learning rate
    batch_size = 32 # per-device batch size (training and evaluation)
    epochs = 10 # number of training epochs
    save_total_limit = 2 # maximum number of checkpoints to keep
    fold = 5 # number of cross-validation folds


def seed_everything(seed=42):
    """Seed every random number generator used by this script.

    Seeds Python's ``random``, NumPy and PyTorch (default generator plus the
    current CUDA device), exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All four seeders take the same integer, so apply them in one pass.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

# Seed all RNGs once, before any data loading or model construction.
seed_everything(cfg.seed)


def save_json(obj, path):
    """Serialize ``obj`` as JSON (4-space indent) into the file at ``path``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(path, mode='w') as handle:
        json.dump(obj, handle, indent=4)

def save_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path`` (binary write mode).

    Existing content at ``path`` is replaced.
    """
    with open(path, mode='wb') as handle:
        pickle.dump(obj, handle)

In [None]:
# Load the training DataFrame from the pickle at cfg.data_path.
# NOTE(review): pickle loading executes arbitrary code; only point data_path at trusted files.
df = pd.read_pickle(cfg.data_path)
# The tokenizer is loaded from the same checkpoint that NBMEModel uses for its backbone.
# NOTE(review): trim_offsets=False presumably keeps each token's leading whitespace inside its
# offset span (my_get_results strips spaces from predicted spans afterwards) - confirm against
# the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_checkpoint, trim_offsets=False)
# Bare expression: displays the DataFrame when run as a notebook cell.
df

In [None]:
def prepare_input(tokenizer, text, feature_text):
    """Tokenize a (patient note, feature description) pair as model input.

    Args:
        tokenizer: Callable tokenizer; invoked with the note as first segment
            and the feature text as second segment.
        text: Patient note text.
        feature_text: Feature description text.

    Returns:
        Whatever ``tokenizer`` returns for the pair, with special tokens
        requested and offset mappings not requested.
    """
    return tokenizer(
        text,
        feature_text,
        add_special_tokens=True,
        return_offsets_mapping=False,
    )

def create_label(tokenizer, text, feature_text, annotation_length, location_list):
    '''
    Build the per-token label array for one (note, feature) pair.

    tokenizer: callable tokenizer; its result must provide 'offset_mapping' and sequence_ids()
    text: patient note text (first segment)
    feature_text: feature description (second segment)
    annotation_length: number of annotations; when 0 no token is marked positive
    location_list: iterable of strings, each holding one or more "start end"
                   character spans separated by ';'

    Returns an int array with one entry per token: initialised to 0, set to -100
    wherever the token's sequence id is not 0, then set to 1 on every token slice
    mapped from an annotated character span.
    '''
    encoded = tokenizer(text, feature_text, # note and feature
                        add_special_tokens=True, # add special tokens such as [CLS], [SEP]
                        return_offsets_mapping=True # character-level (start, end) per token; per the original author, offsets restart from 0 after [SEP]
                       )
    offset_mapping = encoded['offset_mapping']
    # Per the original author, sequence_ids() is 0 on the note tokens and 1 on the feature tokens
    # of "[CLS] note [SEP] feature [SEP]"; presumably special tokens are None - confirm.
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0] # indices of every position that is not part of the note
    # Create the label array; every non-note position gets -100
    label = np.zeros(len(offset_mapping), dtype=int)
    label[ignore_idxes] = -100

    if annotation_length != 0: # sample has at least one annotation
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]: # loop over every annotated span
                start_idx = -1; end_idx = -1 # token-level bounds, -1 means "not found yet"
                start, end = int(loc[0]), int(loc[1]) # character-level bounds

                # Map the character-level bounds onto token-level bounds
                for idx in range(len(offset_mapping)): # scan every token (no early exit)
                    # NOTE(review): the scan also visits the feature-text tokens, whose offsets are
                    # presumably relative to feature_text - confirm they cannot match spuriously.

                    if (start_idx == -1) & (start < offset_mapping[idx][0]): # first token starting after `start` ...
                        start_idx = idx - 1 # ... so the span begins at the token before it
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]): # first token whose end reaches `end`
                        end_idx = idx + 1 # exclusive slice bound
                if start_idx == -1: # special case (original author: the annotation is the last word)
                    start_idx = end_idx - 1
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1 # mark the token slice as positive

    # label like:    
    # array([-100,    0,    0,     0,    0,    0,    0,    0,    0,    
    #         0,      1,    1,     1,    0,    0, -100, -100, -100,])
    return label



class NBMEDataset(Dataset):
    """Dataset of (patient note, feature) pairs with token-level labels.

    Each item is the tokenizer output for the pair merged with a ``'label'``
    entry produced by ``create_label``.
    """

    def __init__(self, tokenizer, df):
        """Store the tokenizer and extract the needed columns as arrays.

        Args:
            tokenizer: Tokenizer forwarded to ``prepare_input`` / ``create_label``.
            df: DataFrame with ``feature_text``, ``pn_history``,
                ``annotation_length`` and ``location`` columns.
        """
        self.tokenizer = tokenizer
        columns = ('feature_text', 'pn_history', 'annotation_length', 'location')
        (
            self.feature_texts,       # feature descriptions
            self.pn_historys,         # patient note texts
            self.annotation_lengths,  # number of annotations per row
            self.locations,           # annotation character spans per row
        ) = (df[column].values for column in columns)

    def __len__(self):
        """Return the number of rows."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Return the tokenized pair at position ``item`` plus its label array."""
        note = self.pn_historys[item]
        feature = self.feature_texts[item]

        encoded = prepare_input(self.tokenizer, note, feature)
        label = create_label(
            self.tokenizer,
            note,
            feature,
            self.annotation_lengths[item],
            self.locations[item],
        )

        sample = dict(encoded)
        sample['label'] = label
        return sample

In [None]:
# Model
class NBMEModel(nn.Module):
    """Pretrained backbone followed by dropout and a one-logit-per-token linear head."""

    def __init__(self, checkpoint):
        """Load config and backbone from ``checkpoint`` and build the head.

        Args:
            checkpoint: Path or identifier accepted by ``AutoConfig`` /
                ``AutoModel.from_pretrained``.
        """
        super().__init__()
        # The config supplies hidden_size and initializer_range for the head.
        self.config = AutoConfig.from_pretrained(checkpoint, output_hidden_states=True)
        self.backbone = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.config.hidden_size, 1)
        # Only the freshly created head is re-initialised.
        self._init_weights(self.classifier)

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its layer type.

        Linear/Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed; LayerNorm is
        reset to weight 1 / bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            # Keep the padding embedding at zero.
            module.weight.data[module.padding_idx].zero_()

    def forward(self, **inputs):
        """Run the backbone and head; compute the masked BCE loss if labelled.

        Args:
            **inputs: Keyword tensors for the backbone, optionally with a
                ``'label'`` tensor in which -100 marks positions to ignore.

        Returns:
            TokenClassifierOutput with ``logits`` from the head and ``loss``
            (``None`` when no ``'label'`` was passed).
        """
        backbone_kwargs = {name: value for name, value in inputs.items() if name != 'label'}
        # First backbone output; assumed to be the last hidden state [bs, seq_len, hidden_size].
        hidden_states = self.backbone(**backbone_kwargs)[0]
        logits = self.classifier(self.dropout(hidden_states))

        if 'label' not in inputs:
            return TokenClassifierOutput(loss=None, logits=logits)

        targets = inputs['label'].view(-1, 1)
        criterion = nn.BCEWithLogitsLoss(reduction="none")
        per_token_loss = criterion(logits.view(-1, 1), targets.float())
        # Average only over positions whose label is not the ignore value.
        loss = per_token_loss[targets != -100].mean()
        return TokenClassifierOutput(loss=loss, logits=logits)

In [None]:
def spans_to_binary(spans, length=None):
    """Convert character spans into a 0/1 array marking covered characters.

    Args:
        spans: Sequence of ``[start, end]`` pairs (end exclusive).
        length: Size of the output array. When ``None`` it defaults to the
            largest value found in ``spans`` (0 if ``spans`` is empty).

    Returns:
        np.ndarray of shape ``[length]`` with 1.0 inside any span, else 0.0.
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list maps to
        # an empty array instead of an error.
        length = np.max(spans) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1  # mark the characters covered by this span
    return binary

def get_score(preds, truths):
    """
    Micro f1 on spans.

    Every (prediction, truth) pair is binarized at character level, the
    binary arrays of all samples are concatenated, and a single F1 score is
    computed over the result. Pairs where both sides are empty are skipped.

    Args:
        preds (list of list of lists of two ints): Prediction spans.
        truths (list of list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score. 0.0 when no sample has any predicted or true span
        (F1 is undefined there; 0.0 matches scikit-learn's default for
        the no-positive case).
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            # Nothing predicted and nothing annotated: contributes no characters.
            continue
        # Both arrays must have the same length, so use the largest offset of either side.
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))

    if not bin_preds:
        # np.concatenate cannot handle an empty list.
        return 0.0

    # Concatenate once, after the loop, without rebinding the parameters.
    flat_preds = np.concatenate(bin_preds)
    flat_truths = np.concatenate(bin_truths)
    return f1_score(flat_truths, flat_preds)


def compute_metrics(eval_pred):
    '''
    Compute the token-level validation F1.

    eval_pred: (logits, labels) pair
        logits [n, max_len, 1], e.g. [2860, 417, 1]
        labels [n, max_len],    e.g. [2860, 417]; -100 marks positions to ignore

    Returns {'nbme_f1': F1 over all positions whose label is not -100},
    where a token is predicted positive when its logit is > 0.
    '''
    logits, labels = eval_pred
    labels = np.asarray(labels)
    logits = logits.reshape(logits.shape[0], -1)  # [n, max_len, 1] -> [n, max_len]
    assert logits.shape == labels.shape
    assert len(logits.shape) == 2
    predictions = (logits > 0).astype(int)  # logit > 0 <=> sigmoid > 0.5

    # A boolean mask selects the valid positions in the same row-major order
    # as an explicit double loop, without per-element Python overhead.
    valid = labels != -100
    return {
        'nbme_f1': f1_score(labels[valid], predictions[valid])
    }



def get_char_logits(texts, predictions, tokenizer):
    '''
    Spread token-level predictions onto the characters of each note.

    texts: note texts (may repeat, since one note pairs with several features)
    predictions: per-sample sequences of token-level prediction values
    tokenizer: callable tokenizer returning 'offset_mapping' for a note

    Returns a list with one float array per text (length = len(text)); each
    character holds the value of the last token whose offset span covers it,
    and characters covered by no token stay 0.
    '''
    char_scores = [np.zeros(len(note)) for note in texts]  # one zero array per note
    for scores, note, token_preds in zip(char_scores, texts, predictions):
        offsets = tokenizer(
            note,
            add_special_tokens=True,
            return_offsets_mapping=True,
        )['offset_mapping']
        # zip stops at the shorter sequence, so surplus prediction entries are ignored.
        for (begin, stop), value in zip(offsets, token_preds):
            scores[begin:stop] = value
    return char_scores



def my_get_results(char_logits, texts, th=0):
    '''
    Turn character-level scores into span strings, one string per sample.

    char_logits: per-sample arrays of character-level scores
    texts: the note text of each sample (same order as char_logits)
    th: a character is selected when its score is strictly greater than th

    Consecutive selected characters form one span. Space characters (' ') are
    trimmed from both ends of every span, and spans consisting only of spaces
    are dropped. Spans are written as "start end" (end exclusive) and joined
    with ';'.

    Returns e.g. ['0 5;64 72', '91 99', ''] ('' = no span for that sample).
    '''
    results = []
    for i, char_prob in enumerate(char_logits):
        text = texts[i]
        selected = np.where(np.asarray(char_prob) > th)[0]  # indices above the threshold
        spans = []
        if selected.size:
            # Split wherever two neighbouring indices are not consecutive, e.g.
            # [0, 1, 90, 91, 92, 628] -> [0, 1], [90, 91, 92], [628]
            breaks = np.where(np.diff(selected) != 1)[0] + 1
            for run in np.split(selected, breaks):
                s, e = int(run[0]), int(run[-1])
                # Bounded trimming: never walk outside the run, so an
                # all-space run can neither overrun the text nor invert the span.
                while s <= e and text[s] == ' ':  # strip spaces on the left
                    s += 1
                while e >= s and text[e] == ' ':  # strip spaces on the right
                    e -= 1
                if s > e:
                    continue  # the run contained only spaces
                spans.append(f"{s} {e + 1}")
        results.append(";".join(spans))

    return results

def get_predictions(results):
    '''
    Parse span strings into nested integer lists.

    from ['0 5;64 72', '91 99', '128 134']
    return [[[0, 5], [64, 72]], [[91, 99]], [[128, 134]]]

    An empty string yields an empty list for that sample.
    '''
    def parse(span_string):
        # One "start end" pair per ';'-separated chunk.
        pairs = (chunk.split() for chunk in span_string.split(';'))
        return [[int(fields[0]), int(fields[1])] for fields in pairs]

    return [parse(entry) if entry != "" else [] for entry in results]


def create_labels_for_scoring(df):
    '''
    Convert the 'location' column into ground-truth span lists.

    from [['0 5', '64 72'], ['91 99'], ['128 134']]
    return [[[0, 5], [64, 72]], [[91, 99]], [[128, 134]]]

    df: DataFrame with a 'location' column holding, per row, a list of
        "start end" strings (one string may hold several spans joined by ';').

    Side effect: adds/overwrites the column 'location_for_create_labels' on
    df, holding [] for rows without locations and ['s e;s e;...'] otherwise.
    '''
    # Build the helper column in one pass over the values. This does not
    # depend on the index labels, so it also works on frames whose index was
    # not reset, and needs no per-cell .loc assignment of nested lists.
    joined = [[';'.join(lst)] if lst else [] for lst in df['location']]
    df['location_for_create_labels'] = pd.Series(joined, index=df.index, dtype=object)

    truths = []
    for location_list in joined:
        truth = []
        if len(location_list) > 0:
            # Split on ';' into individual "start end" pairs.
            for loc in (chunk.split() for chunk in location_list[0].split(';')):
                truth.append([int(loc[0]), int(loc[1])])
        truths.append(truth)
    return truths

In [None]:
# Cross-validation training loop: one model per fold, out-of-fold (OOF)
# character-level predictions collected for every validation sample.
scores = []
oof_preds = {}

# Create the experiment directory up front so the final config/OOF dump does
# not depend on the loop body having run.
exp_dir = Path(f"./output/{cfg.exp_id}/")
exp_dir.mkdir(parents=True, exist_ok=True)

for fold in range(cfg.fold):
    train_df = df[df['fold'] != fold].reset_index(drop=True)
    val_df = df[df['fold'] == fold].reset_index(drop=True)
    val_dataset = NBMEDataset(tokenizer, val_df)
    name = f"{cfg.exp_id}_fold{fold}"  # experiment name for this fold
    args = TrainingArguments(
        output_dir=f"./output/{name}",  # checkpoint directory
        evaluation_strategy="steps",  # evaluate every N steps
        save_strategy="steps",  # save every N steps
        learning_rate=cfg.lr,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        load_best_model_at_end=True,  # reload the best checkpoint when training ends
        warmup_ratio=0.2,  # fraction of steps used for LR warm-up
        fp16=True,  # mixed precision
        dataloader_num_workers=4,  # data-loading worker processes
        group_by_length=True,  # batch samples of similar length to reduce padding
        run_name=name,
        metric_for_best_model="nbme_f1",  # metric returned by compute_metrics
        save_total_limit=cfg.save_total_limit,  # maximum number of checkpoints kept
        label_names=['label'],  # key under which the dataset yields labels
        seed=cfg.seed,
    )
    model = NBMEModel(cfg.pretrained_checkpoint)

    trainer = Trainer(
        model,
        args,
        train_dataset=NBMEDataset(tokenizer, train_df),
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Validation: token logits -> char logits -> spans -> span-level F1
    predictions = trainer.predict(val_dataset).predictions  # [n, maxlen, 1]
    predictions = predictions.reshape(len(val_df), -1)  # [n, maxlen]
    char_logits = get_char_logits(val_df['pn_history'].values, predictions, tokenizer)
    oof_preds.update({k: v for k, v in zip(val_df['id'], char_logits)})  # sample id -> char-level logits
    results = my_get_results(char_logits, val_df['pn_history'].values)  # span strings per sample
    preds = get_predictions(results)  # span strings -> nested int lists
    truths = create_labels_for_scoring(val_df)
    scores.append(get_score(preds, truths))  # get_score expects (preds, truths)
    print(f'fold {fold} score: {scores[-1]}')

    # Save the weights of this fold's model (the best checkpoint, given
    # load_best_model_at_end=True above).
    torch.save(model.state_dict(), exp_dir / f"{fold}.pt")
    # Intermediate checkpoints under ./output/{name} are kept on disk.

# Save the OOF predictions first: they are the expensive artefact and must not
# be lost if writing the config fails.
print(f'cv score: {np.mean(scores)}')
save_pickle(oof_preds, exp_dir / "oof.pkl")
# vars(cfg) also contains class internals (__module__, __dict__, ...) that are
# not JSON serializable, so only the plain configuration attributes are dumped.
config_dump = {key: value for key, value in vars(cfg).items() if not key.startswith('__')}
save_json({**config_dump, 'score': float(np.mean(scores))}, exp_dir / "config.json")