In [1]:
import numpy as np
import pandas as pd
import json

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score, performance_measure
from seqeval.scheme import IOB1, IOB2
# from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import KFold

from helpers import *

In [2]:
# Load the clinical notes and their annotations; both CSVs are indexed by note_id.
all_notes = pd.read_csv(
    'mimic-iv_notes_training_set.csv',
    index_col='note_id',
)
all_annotations = pd.read_csv(
    'train_annotations.csv',
    index_col='note_id',
)
# Report how many notes were loaded.
print("# of Notes:", all_notes.shape[0])

# of Notes: 204


In [3]:
# Share of the notes assigned to the train and eval splits; the test split
# receives whatever is left. With the 204 notes loaded above this yields the
# same 184 / 10 / 10 partition that was previously hard-coded as slice bounds.
TRAIN_FRACTION = 0.90
EVAL_FRACTION = 0.05

# Set the seed for train/eval/test split
rng = np.random.default_rng(seed=42)
shuffled_indices = rng.permutation(len(all_notes))

# Derive the slice bounds from the dataset size instead of magic numbers so
# the proportions stay the same if the number of notes changes.
n_train = round(TRAIN_FRACTION * len(all_notes))
n_eval = round(EVAL_FRACTION * len(all_notes))

# Split notes
train_notes = all_notes.iloc[shuffled_indices[:n_train]]
eval_notes = all_notes.iloc[shuffled_indices[n_train:n_train + n_eval]]
test_notes = all_notes.iloc[shuffled_indices[n_train + n_eval:]]

# An empty split would make training or scoring meaningless; fail early.
assert min(len(train_notes), len(eval_notes), len(test_notes)) > 0, \
    "train/eval/test split produced an empty partition"


def _with_annotations(notes):
    """Left-join `notes` with `all_annotations` on the shared note_id index.

    A left join keeps every note in `notes`, including notes that have no
    matching row in `all_annotations`.
    """
    return pd.merge(left=notes, right=all_annotations, how='left',
                    left_index=True, right_index=True)


# Add annotations to each dataset
train_notes_with_annotations = _with_annotations(train_notes)
eval_notes_with_annotations = _with_annotations(eval_notes)
test_notes_with_annotations = _with_annotations(test_notes)

print('Train notes:',len(train_notes),': # of Annotations:',train_notes_with_annotations.shape)
print('Eval notes:',len(eval_notes),': # of Annotations:',eval_notes_with_annotations.shape)
print('Test notes:',len(test_notes),': # of Annotations:',test_notes_with_annotations.shape)

Train notes: 184 : # of Annotations: (46955, 4)
Eval notes: 10 : # of Annotations: (2709, 4)
Test notes: 10 : # of Annotations: (1910, 4)


In [4]:
# Checkpoints to fine-tune. Alternative candidate list kept for reference:
# model_names = ['bert-base-cased',"dmis-lab/biobert-large-cased-v1.1","microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
#                "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"]
model_names = [
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
]

# Result table: one row per run (indexed by 'model'), one column per metric.
scores = pd.DataFrame(
    columns=['model', 'char_f1', 'char_f1_2', 'accuracy', 'accuracy_2'],
).set_index('model')

# Handed to the tokenizer as its model_max_length.
MAX_LEN = 512

# Number of training notes (not referenced by the other visible cells).
evaluation_range = train_notes.shape[0]
def _as_int_list(values):
    """Return `values` (tensor, array or sequence) as a list of plain Python numbers.

    Converting once up front avoids comparing 0-d tensors element by element,
    which forces a device synchronisation per token for GPU tensors.
    """
    return values.tolist() if hasattr(values, 'tolist') else list(values)


def _iter_label_spans(labels):
    """Yield `(start_token, end_token, closing_label)` for each labelled span.

    `labels` is a list of per-token label ids where 0 = 'O', 1 = 'B-TERM' and
    2 = 'I-TERM'. A span starts at the first token with a label > 0 and is
    closed when the next label is 0 or 1, or when the sequence ends.
    `closing_label` is the label of the last token inside the span.

    For label sequences containing only 0/1/2 this is exactly the inline logic
    it replaces. In addition, a span that is still open because it was followed
    by some other value (e.g. an ignore index) is no longer lost or merged with
    the next entity: it is closed at the next 0/1 token or at the end of the row.
    """
    start_token = -1
    end_token = -1
    last_index = len(labels) - 1
    for j, t in enumerate(labels):
        # Close a span that was left open by a label outside {0, 1, 2}.
        if start_token != -1 and t in (0, 1):
            yield start_token, end_token, labels[end_token]
            start_token = end_token = -1
        if t > 0:
            if start_token == -1:
                start_token = j
            end_token = j
            if j == last_index or labels[j + 1] in (0, 1):
                yield start_token, end_token, t
                start_token = end_token = -1
    # Flush a span that is still open once the row is exhausted.
    if start_token != -1:
        yield start_token, end_token, labels[end_token]


def _span_annotation(note_text, offsets, start_token, end_token):
    """Build the `{span text: [char_start, char_end]}` record for one span.

    `offsets[k]` is indexed as a `(start, end)` pair and used to slice `note_text`.
    """
    char_start = offsets[start_token][0]
    char_end = offsets[end_token][1]
    return {note_text[char_start:char_end]: [char_start, char_end]}


def _mark_span(token_level, char_level, offsets, start_token, end_token, label):
    """Write one span into the two per-character label arrays (in place).

    Both arrays get `label` over the span's character range. When the span
    closes on label 2 ('I-TERM') its beginning is re-marked with 1 ('B-TERM'):
    `token_level` marks every character of the first token, `char_level` marks
    only the first character.
    """
    char_start = offsets[start_token][0]
    char_end = offsets[end_token][1]
    token_level[char_start:char_end] = label
    char_level[char_start:char_end] = label
    if label == 2:
        char_level[char_start] = label - 1
        token_level[char_start:offsets[start_token][1]] = label - 1


def _mask_zero_width(labels, offsets):
    """Replace the label of every token that covers no characters with 0 ('O').

    A token whose offset pair is empty (end <= start) cannot be mapped onto
    the note text; a positive prediction on it would otherwise produce a span
    such as `[start:0]`. Presumably these are special/padding tokens -- TODO
    confirm against `tokenize_and_label_3label`.
    """
    return [
        label if k < len(offsets) and offsets[k][1] > offsets[k][0] else 0
        for k, label in enumerate(labels)
    ]


def _zero_char_arrays(notes):
    """Return `{note_id: int32 zeros with one entry per character of the note text}`."""
    return {note_id: np.zeros(len(notes.loc[note_id, 'text']), dtype=np.int32)
            for note_id in notes.index}


def _to_tag_sequences(char_arrays, id_to_tag):
    """Convert `{note_id: label-id array}` into a list of tag-string lists (dict order)."""
    return [[id_to_tag[y] for y in arr.tolist()] for arr in char_arrays.values()]


# Label scheme shared by every run.
label_list = ['O','B-TERM','I-TERM']
label_to_num = {label: idx for idx, label in enumerate(label_list)}
num_to_label = {idx: label for idx, label in enumerate(label_list)}
NUM_LABELS = len(label_list)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

for i in model_names:
    print('--------------------------')
    print(i)
    tokenizer = AutoTokenizer.from_pretrained(i, model_max_length=MAX_LEN)

    train_tokens, train_token_array, train_map_token_to_char, train_orig_char_array = tokenize_and_label_3label(train_notes,train_notes_with_annotations,tokenizer,use_overflow=True)
    eval_tokens, eval_token_array, eval_map_token_to_char, eval_orig_char_array = tokenize_and_label_3label(eval_notes,eval_notes_with_annotations,tokenizer,use_overflow=True)
    test_tokens, test_token_array, test_map_token_to_char, test_orig_char_array = tokenize_and_label_3label(test_notes,test_notes_with_annotations,tokenizer,use_overflow=True)

    # The datasets depend only on the tokenizer, so build them once per model.
    # 'offset_mapping' is excluded from what the Trainer sees; it is only used
    # below to map tokens back to character positions.
    cols = [c for c in train_tokens.columns if c != 'offset_mapping']
    train_data = Dataset.from_dict(train_tokens[cols])
    eval_data = Dataset.from_dict(eval_tokens[cols])

    # Spans recovered from the training labels (independent of the learning rate).
    train_annotations = {note_id: [] for note_id in train_notes.index}
    for note_id, row in train_tokens.iterrows():
        note_text = train_notes.loc[note_id, 'text']
        offsets = row['offset_mapping']
        for start_token, end_token, _ in _iter_label_spans(_as_int_list(row['labels'])):
            train_annotations[note_id].append(
                _span_annotation(note_text, offsets, start_token, end_token))

    test_ids = list(test_notes.index)
    test_texts = list(test_notes['text'].values)

    # learning_rates = [1e-4,5e-5,5e-6]
    learning_rates = [5e-5]
    for lr in learning_rates:
        # Evaluate and checkpoint every epoch, then reload the checkpoint with
        # the best evaluation loss once training finishes.
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            num_train_epochs=10,
            learning_rate=lr,
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='loss'
        )

        model = AutoModelForTokenClassification.from_pretrained(i, num_labels=NUM_LABELS)
        model = model.to(device)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=eval_data,
            tokenizer=tokenizer
        )

        print('Training...')
        trainer.train()

        print('Evaluating...')
        y_pred = []
        y_pred_overflow = []
        all_predicitions = {note_id: [] for note_id in test_ids}

        # Inference only: switch off dropout and skip building autograd graphs.
        model.eval()
        with torch.no_grad():
            for note_id, text in zip(test_ids, test_texts):
                one_input = tokenizer(text, padding='max_length',
                                      truncation=True, return_tensors="pt").to(device)
                res = model(**one_input).logits.argmax(-1)[0].cpu()
                y_pred.append(res)
                all_predicitions[note_id].extend(res.tolist())
                # Text beyond the first window is exposed by the tokenizer as
                # overflowing encodings; run the model on each of them as well.
                for o in one_input[0].overflowing:
                    overflow_input = {
                        'input_ids': torch.as_tensor([o.ids]).to(device),
                        'attention_mask': torch.as_tensor([o.attention_mask]).to(device),
                    }
                    if 'token_type_ids' in one_input.keys():
                        overflow_input['token_type_ids'] = torch.as_tensor([o.type_ids]).to(device)
                    res = model(**overflow_input).logits.argmax(-1)[0].cpu()
                    y_pred_overflow.append(res)
                    all_predicitions[note_id].extend(res.tolist())

        # NOTE(review): predictions are ordered "first window of every note,
        # then every overflow window" and are paired with the rows of
        # `test_tokens` purely by position below. This assumes
        # tokenize_and_label_3label(use_overflow=True) emits its rows in the
        # same order -- confirm against the helper.
        combined_model_res = y_pred + y_pred_overflow

        true_annotations = {note_id: [] for note_id in test_ids}
        pred_annotations = {note_id: [] for note_id in test_ids}
        true_char_array = _zero_char_arrays(test_notes)
        true_char_array_2 = _zero_char_arrays(test_notes)
        pred_char_array = _zero_char_arrays(test_notes)
        pred_char_array_2 = _zero_char_arrays(test_notes)

        for num, (note_id, row) in enumerate(test_tokens.iterrows()):
            note_text = test_notes.loc[note_id, 'text']
            offsets = row['offset_mapping']

            ## Get TRUE Annotations
            for start_token, end_token, label in _iter_label_spans(_as_int_list(row['labels'])):
                true_annotations[note_id].append(
                    _span_annotation(note_text, offsets, start_token, end_token))
                _mark_span(true_char_array[note_id], true_char_array_2[note_id],
                           offsets, start_token, end_token, label)

            ##Get PREDICTED Annotations
            predicted = _mask_zero_width(_as_int_list(combined_model_res[num]), offsets)
            for start_token, end_token, label in _iter_label_spans(predicted):
                pred_annotations[note_id].append(
                    _span_annotation(note_text, offsets, start_token, end_token))
                _mark_span(pred_char_array[note_id], pred_char_array_2[note_id],
                           offsets, start_token, end_token, label)

        #Evaluation
        # All four dicts were built from the same note order, so the resulting
        # tag sequences line up note by note.
        pred_char_f1 = _to_tag_sequences(pred_char_array, num_to_label)
        pred_char_f1_2 = _to_tag_sequences(pred_char_array_2, num_to_label)
        true_char_f1 = _to_tag_sequences(true_char_array, num_to_label)
        true_char_f1_2 = _to_tag_sequences(true_char_array_2, num_to_label)

        run_key = i+'('+str(lr)+')'
        scores.loc[run_key,'char_f1'] = f1_score(true_char_f1,pred_char_f1)
        scores.loc[run_key,'char_f1_2'] = f1_score(true_char_f1_2,pred_char_f1_2)
        scores.loc[run_key,'accuracy'] = accuracy_score(true_char_f1,pred_char_f1)
        scores.loc[run_key,'accuracy_2'] = accuracy_score(true_char_f1_2,pred_char_f1_2)

scores

--------------------------
microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext


Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training...


Epoch,Training Loss,Validation Loss
1,0.2287,0.166675
2,0.1397,0.15073
3,0.1137,0.147618
4,0.0927,0.150095
5,0.077,0.158598
6,0.0631,0.180432
7,0.0524,0.184288
8,0.0452,0.199561
9,0.0392,0.213233
10,0.0353,0.218792


Evaluating...


Unnamed: 0_level_0,Train_P(O)_Actual,P(O)_Actual,P(B)_Actual,P(O),P(B),f1,f1_strict,char_f1,char_f1_2,accuracy_2,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext(5e-05),,,,,,,,0.817887,0.784401,0.915484,0.915484


In [5]:
# Display the predicted annotations left over from the training/evaluation cell:
# a dict keyed by test note_id whose values are lists of {span text: [char_start, char_end]}.
# The variable is rebuilt for every model / learning-rate run, so this shows the last run only.
pred_annotations

{'14652764-DS-17': [{'No Known': [178, 186]},
  {'Adverse Drug Reactions': [199, 221]},
  {'ulcerative colitis': [259, 277]},
  {'ileostomy takedown': [322, 340]},
  {'abdominal colectomy': [406, 425]},
  {'laparoscopic proctectomy': [427, 451]},
  {'diverting\nloop ileostomy': [457, 481]},
  {'J pouch': [483, 490]},
  {'pouch': [504, 509]},
  {'normal': [520, 526]},
  {'infection': [579, 588]},
  {'bleeding': [590, 598]},
  {'leak': [600, 604]},
  {'procedures': [620, 630]},
  {'Ulcerative Colitis': [712, 730]},
  {'Lap colectomy': [737, 750]},
  {'ileostomy': [758, 767]},
  {'CAD': [815, 818]},
  {'HLD': [820, 823]},
  {'RA': [837, 839]},
  {'DM': [840, 842]},
  {'NAD': [888, 891]},
  {'CV': [892, 894]},
  {'RRR': [896, 899]},
  {'Resp': [900, 904]},
  {'nl': [906, 908]},
  {'breathing effort': [909, 925]},
  {'GI': [926, 928]},
  {'inc': [930, 933]},
  {'ND': [941, 943]},
  {'NT': [945, 947]},
  {'soft': [949, 953]},
  {'ileostomy takedown': [1066, 1084]},
  {'procedure': [1103, 111

In [6]:
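# Optional (disabled): write the predicted annotations to a JSON file.
# NOTE: json.dumps() already returns a string, so passing its result to json.dump()
# stores a JSON-encoded string rather than the dict; dump `pred_annotations` directly
# if this is re-enabled.
# data = json.dumps(pred_annotations)
# with open("3label_pred.json", "w") as file:
#     json.dump(data, file)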

In [7]:
# model.save_pretrained('3label_NER_Final')