In [1]:
pip install transformers[sentencepiece] datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m54.7 MB/s[0m eta [36m0:00:0

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import pandas as pd
import os
import random
from tqdm import tqdm

In [4]:
file_dir = '/content/drive/My Drive/Kaggle/nbme-score-clinical-patient-notes/'

In [5]:
import spacy

def display_ner(doc):
    spacy.displacy.render(doc, style="ent",manual=True, jupyter=True)

In [6]:
MODEL_NAME = 'bert-base-uncased'
MODEL_NAME = 'bert-large-uncased'
#MODEL_NAME = 'microsoft/deberta-v2-xlarge'
SEQUENCE_LENGTH = 512
BATCH_SIZE = 2
EPOCHS = 25

In [7]:
from pandas.core.groupby import groupby
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, AutoConfig,TFAutoModel, DataCollatorForLanguageModeling, TFAutoModelWithLMHead
#from transformers import DebertaV2TokenizerFast
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def is_overlapping(x1, x2, y1, y2):
    return max(x1,y1) <= min(x2,y2)

class NBMEDataset:
    def __init__(self):
        #Read the data files
        self.features = pd.read_csv(os.path.join(file_dir, 'features.csv'))
        self.patient_notes = pd.read_csv(os.path.join(file_dir, 'patient_notes.csv'))
        self.test = pd.read_csv(os.path.join(file_dir, 'test.csv'))
        self.train= pd.read_csv(os.path.join(file_dir, 'train.csv'))
        self.sample_submission= pd.read_csv(os.path.join(file_dir, 'sample_submission.csv'))
        
        #Merge patient notes and features into the train/test dataframe
        self.test = self.test.merge(self.patient_notes,on=['case_num','pn_num']).merge(self.features,on=['case_num','feature_num'])
        self.train = self.train.merge(self.patient_notes,on=['case_num','pn_num']).merge(self.features,on=['case_num','feature_num'])
    
    def sample_patient_notes(self,num=5):
        pn_unique = self.train['pn_num'].unique()
        for i in range(num):
            print('\n\n****************\n\n')
            self.pretty_print_sample(random.choice(pn_unique))
  
    def pretty_print_sample(self, pn_num):
        pn_df = self.train[self.train['pn_num']==pn_num]
        loc_arr = pn_df[['location','feature_text']].values
        ners = []
        pn_text = pn_df.iloc[0]['pn_history']
        for loc, ftext in loc_arr:
            locs = self.str_to_list(loc)
            for loc_i in locs:
                if(loc_i):
                    x,y = loc_i.split(' ')
                    ners.append({"start": int(x), "end": int(y), "label": ftext})
        doc = {"text": pn_text, "ents": sorted(ners, key=lambda i: i["start"])}
        display_ner(doc)

class NBMEforNER(NBMEDataset):
    def __init__(self):
        super().__init__()

    def print_predictions(self, text, loc_list):
        ners = []
        for loc in loc_list:
            locs = self.str_to_list(loc)
            for loc_i in locs:
                if(loc_i):
                    x,y = loc_i.split(' ')
                    ners.append({"start": int(x), "end": int(y), "label": text[int(x):int(y)]})
        doc = {"text": text, "ents": sorted(ners, key=lambda i: i["start"])}
        display_ner(doc)

    def str_to_list(self, sstring):
        for x in ["[","]","'"]:
            sstring = sstring.replace(x,'')
        sstring = sstring.replace(', ',',')
        sstring = sstring.replace(',',';')
        sstring = sstring.split(";")
        return sstring

    def build_nlp_dataset(self, mode='train'):
        #Use tokenizer from a pretrained model to split the input text into tokens, and attention masks
        #Contruct the target array using the "location" column, and the "feature_text" column
        
        #Initialize the tokenizer
        if('deberta' in MODEL_NAME):
            self.tokenizer = DebertaV2TokenizerFast.from_pretrained("microsoft/deberta-v2-xlarge")
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,normalization=True)
            self.config = AutoConfig.from_pretrained(MODEL_NAME)

        #Initialize the sequence lists
        if(mode=='train'):
            self.sequences = []
            self.att_masks = []
            self.targets = []
            self.offsets = []
        else:
            self.sequences_testset = []
            self.att_masks_testset = []
            self.offsets_testset = []

        #Label encode the feature_num column, along with an "background" class
        if(mode=='train'):
            le = LabelEncoder()
            self.train['feature_num_le'] = le.fit_transform(self.train['feature_num'])
            self.background_class_idx = np.max(self.train['feature_num_le']) + 1

            gpby = self.train.groupby(['pn_num'])
        else:
            gpby = self.test.groupby(['pn_num'])


        for pn_dfg in gpby:
            pn_df = pn_dfg[1]
            pn_text = pn_df.iloc[0]['pn_history']
            tokens = self.tokenizer.encode_plus(pn_text, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)
            offsets = tokens['offset_mapping']
            sequence = tokens['input_ids']
            att_mask = tokens['attention_mask']
            
            if(mode=='train'):
                #Should be same length as sequence, and be initialized to the index of the "background" class
                target = np.ones_like(sequence)*self.background_class_idx 

                #For each of the tokens generated for the input text,
                #check whether this token has a target associated with it,
                #and if so, which class.
                for index, row in pn_df.iterrows():
                    loc_str = row['location']
                    feature_idx = row['feature_num_le']
                    loc_list = self.str_to_list(loc_str)
                    for loc_i in loc_list:
                        if(loc_i):
                            x,y = loc_i.split(' ')
                            loc_s = int(x)
                            loc_e = int(y)
                            for noffset, offset in enumerate(offsets):
                                off_s = offset[0]
                                off_e = offset[1]
                                if(is_overlapping(loc_s, loc_e, off_s, off_e)):
                                    target[noffset] = feature_idx
                
                #Sanity test to verify the post process function
                
                if(len(self.sequences)==0):
                    print('SANITY CHECK of POSTPROCESS FUNCTION')
                    #Print the 'location' column from pn_df
                    print(pn_df[['feature_num_le','location']])

                    #Print the 'location' obtained by postprocessing the target/offset
                    print(self.postprocess_predictions(target, offsets))

                self.sequences.append(sequence)
                self.att_masks.append(att_mask)
                self.targets.append(target)
                self.offsets.append(offsets)
            else:
                self.sequences_testset.append(sequence)
                self.att_masks_testset.append(att_mask)
                self.offsets_testset.append(offsets)
        if(mode=='train'):
            self.sequences = np.array(self.sequences).astype(np.int32)
            self.att_masks = np.array(self.att_masks).astype(np.uint8)
            self.targets = np.array(self.targets).astype(np.uint32)
        else:
            self.sequences_testset = np.array(self.sequences_testset).astype(np.int32)
            self.att_masks_testset = np.array(self.att_masks_testset).astype(np.uint8)
        pass
    
    def test_train_split(self):
        self.seq_train, self.seq_test, self.mask_train, self.mask_test, self.target_train, self.target_test, self.offsets_train, self.offsets_test = train_test_split(self.sequences, self.att_masks, self.targets, self.offsets, test_size=0.20, random_state=42)
    
    def postprocess_predictions(self, pred, offset):
        #pred is expected to be an array of class labels
        chain_started = False
        prev_class = None
        prev_class_start = None
        prev_class_end = None
        class_locations = {}
        for n, pred_class in enumerate(pred):
            if(pred_class==self.background_class_idx):
                #Dump existing and Re-init chain parameters
                if(chain_started):
                    if(prev_class in class_locations):
                        class_locations[prev_class] = class_locations[prev_class][:-1] + ", '{} {}']".format(prev_class_start, prev_class_end)
                    else:
                        class_locations[prev_class] = "['{} {}']".format(prev_class_start, prev_class_end)
                chain_started = False
                prev_class = None
                prev_class_start = None
                prev_class_end = None
                continue
            else:
                if(not(chain_started)):
                    #Starting a new chain
                    prev_class = pred_class
                    prev_class_start = offset[n][0]
                    prev_class_end = offset[n][1]
                    chain_started = True
                else:
                    if(pred_class==prev_class):
                        #Continuing a chain
                        prev_class_end = offset[n][1]
                    else:
                        #Class different, Dump previous chain and start a new chain
                        if(prev_class in class_locations):
                            class_locations[prev_class] = class_locations[prev_class][:-1] + ", '{} {}']".format(prev_class_start, prev_class_end) 
                        else:
                            class_locations[prev_class] = "['{} {}']".format(prev_class_start, prev_class_end)
                        prev_class = pred_class
                        prev_class_start = offset[n][0]
                        prev_class_end = offset[n][1]
        return class_locations  
    
    def evaluation_metric_per_instance(self, true_str, pred_str):
        true_idx = []
        pred_idx = []
        true_locs = self.str_to_list(true_str)
        pred_locs = self.str_to_list(pred_str)
        
        for tc in true_locs:
            if(tc):
                x,y = tc.split(' ')
                for idx in range(int(x), int(y)):
                    true_idx.append(idx)
        for pc in pred_locs:
            if(pc):
                x,y = pc.split(' ')
                for idx in range(int(x), int(y)):
                    pred_idx.append(idx)
        
        true_idx = set(true_idx)
        pred_idx = set(pred_idx)

        #print(true_idx)
        #print(pred_idx)

        true_pos = [value for value in true_idx if value in pred_idx]
        false_neg = [value for value in true_idx if value not in pred_idx]
        false_pos = [value for value in pred_idx if value not in true_idx]

        return len(true_pos), len(false_neg), len(false_pos)
    
    def evaluation_metric(self,true_dict, pred_dict):
        tp_classes = [key for key in true_dict if key in pred_dict]
        fn_classes = [key for key in true_dict if key not in pred_dict]
        fp_classes = [key for key in pred_dict if key not in true_dict]

        tp = 0 
        fn = 0
        fp = 0

        for cls in tp_classes:
            #print(cls)
            tpi, fni, fpi = self.evaluation_metric_per_instance(true_dict[cls], pred_dict[cls])
            tp += tpi
            fn += fni
            fp += fpi
        
        for cls in fn_classes:
            #print(cls)
            tpi, fni, fpi = self.evaluation_metric_per_instance(true_dict[cls], "")
            tp += tpi
            fn += fni
            fp += fpi

        for cls in fp_classes:
            #print(cls)
            tpi, fni, fpi = self.evaluation_metric_per_instance("", pred_dict[cls])
            tp += tpi
            fn += fni
            fp += fpi
        
        return tp, fn, fp


class NBMEforMLM(NBMEDataset):

    def __init__(self):
        super().__init__()

    def build_mlm_dataset(self):
        
        #Tokenizers
        self.mlm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,normalization=True)
        self.mlm_config = AutoConfig.from_pretrained(MODEL_NAME)

        #DataCollator for MaskedLanguageMOdeling
        self.data_collator = DataCollatorForLanguageModeling(tokenizer=self.mlm_tokenizer, mlm=True, mlm_probability=0.15)

        #Tokenize input texts
        self.tokens = self.mlm_tokenizer.batch_encode_plus(self.patient_notes['pn_history'].to_list(), max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True)
        self.tokens = np.array(self.tokens['input_ids'])

        #Package data for MLM task training
        self.input_tokens, self.target_tokens = self.data_collator.numpy_mask_tokens(self.tokens)

    def test_train_split(self):
        self.input_tokens_train, self.input_tokens_test, self.target_tokens_train, self.target_tokens_test = train_test_split(self.input_tokens, self.target_tokens, test_size=0.20, random_state=42)


In [8]:
dataset_ner = NBMEforNER() 
dataset_ner.build_nlp_dataset()
dataset_ner.test_train_split()
#Sanity test the evaluation metric
true_str = "['404 413', '652 661']"
pred_str = "['321 329', '652 661']"
dataset_ner.evaluation_metric_per_instance(true_str, pred_str)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  for pn_dfg in gpby:


SANITY CHECK of POSTPROCESS FUNCTION
      feature_num_le                           location
0                  0                        ['696 724']
100                1                        ['668 693']
200                2                        ['203 217']
300                3               ['70 91', '176 183']
400                4                        ['222 258']
500                5                                 []
600                6  ['321 329', '404 413', '652 661']
700                7                                 []
800                8                                 []
900                9                ['26 38', '96 118']
1000              10                          ['56 69']
1100              11                            ['5 9']
1200              12                          ['10 11']
{11: "['5 9']", 12: "['10 11']", 9: "['26 39', '95 118']", 10: "['56 69']", 3: "['70 91', '176 184']", 2: "['203 217']", 4: "['222 258']", 6: "['321 330', '404 413', '652 661']", 

(9, 9, 8)

In [9]:
def eval_ner(nlp_model):
    pred_test = nlp_model.predict([dataset_ner.seq_test, dataset_ner.mask_test])
    pred_test_class = np.argmax(pred_test, axis=2)
    tp = 0
    fn = 0
    fp = 0
    for test_idx in range(pred_test_class.shape[0]):
        pred_dict = dataset_ner.postprocess_predictions(pred_test_class[test_idx], dataset_ner.offsets_test[test_idx])
        true_dict = dataset_ner.postprocess_predictions(dataset_ner.target_test[test_idx], dataset_ner.offsets_test[test_idx])
        tpi, fni, fpi = dataset_ner.evaluation_metric(true_dict, pred_dict)
        tp += tpi
        fn += fni
        fp += fpi
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = (2*precision*recall)/(precision+recall)
    print('Precision:{}, Recall:{}, F1:{}'.format(precision, recall, f1))

In [10]:
class NER_Callback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
      eval_ner(self.model)
      return

class NER_Model:
    def __init__(self, use_pretrained_mlm=False):
        #Initialize the pretrained model
        if(not(use_pretrained_mlm)):
            self.config = AutoConfig.from_pretrained(MODEL_NAME)
            self.backbone = TFAutoModel.from_pretrained(MODEL_NAME,config=self.config)
        else:
            self.config = AutoConfig.from_pretrained('/content/drive/My Drive/Kaggle/nbme-score-clinical-patient-notes/mlm_bert_large_uncased/')
            self.backbone = TFAutoModel.from_pretrained('/content/drive/My Drive/Kaggle/nbme-score-clinical-patient-notes/mlm_bert_large_uncased/',config=self.config)
    
    def build_model(self, num_classes):
        tokens = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'tokens', dtype=tf.int32)
        att_masks = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'attention', dtype=tf.int32)
        
        features = self.backbone(tokens, attention_mask=att_masks)[0]
        
        target = tf.keras.layers.Dropout(0.5)(features)
        target = tf.keras.layers.Dense(num_classes, activation='softmax')(target)
        
        self.model = tf.keras.Model([tokens,att_masks],target)

        self.model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                           loss=tf.keras.losses.sparse_categorical_crossentropy,
                           metrics=['accuracy'])

    def train_model(self, x_data_in, x_data_att, y_data, x_data_in_val, x_data_att_val, y_data_val, class_weights):
        history = self.model.fit(x = [x_data_in, x_data_att], y = y_data, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=([x_data_in_val, x_data_att_val], y_data_val), class_weight=class_weights, callbacks=[NER_Callback()])


def mlm_loss(y_true, y_pred):
        y_true_masked = tf.boolean_mask(y_true, tf.not_equal(y_true, -100))
        y_pred_masked = tf.boolean_mask(y_pred, tf.not_equal(y_true, -100))
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_masked,
                                                              y_pred_masked,
                                                              from_logits=True)
        #loss = tf.nn.compute_average_loss(loss)
        return loss




class MLM_Model:
    def __init__(self):
      self.config = AutoConfig.from_pretrained(MODEL_NAME)
      self.model = TFAutoModelWithLMHead.from_pretrained(MODEL_NAME, config=self.config)
    
      self.model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                            loss=mlm_loss,
                            metrics=['accuracy'])
    
    def train_model(self, x_data_in, y_data, x_data_in_val, y_data_val, epochs, callbacks):
        history = self.model.fit(x = x_data_in, y = y_data, batch_size=BATCH_SIZE, epochs=epochs, validation_data=(x_data_in_val, y_data_val))




In [11]:
def mlm_pretraining(epochs):
    dataset = NBMEforMLM()
    dataset.build_mlm_dataset()
    dataset.test_train_split()
    mlm_model = MLM_Model()
    mlm_model.train_model(dataset.input_tokens_train, dataset.target_tokens_train, dataset.input_tokens_test, dataset.target_tokens_test, epochs)
    mlm_model.model.save_pretrained('/content/drive/My Drive/Kaggle/nbme-score-clinical-patient-notes/mlm_bert_large_uncased/')

In [12]:
def ner_modeling(from_pretrained=False):
    unique, counts = np.unique(dataset_ner.target_train, return_counts=True)
    class_weights = {}
    total_samples = np.sum(counts)
    n_classes = np.max(dataset_ner.train['feature_num_le']) + 2
    for class_id, count in zip(unique, counts):
        class_weights[class_id] = (1./count)*(total_samples/n_classes)

    nlp_model = NER_Model(from_pretrained)
    nlp_model.build_model(n_classes)
    nlp_model.train_model(dataset_ner.seq_train, dataset_ner.mask_train, dataset_ner.target_train, dataset_ner.seq_test, dataset_ner.mask_test, dataset_ner.target_test, class_weights)
    
    #nlp_model.model.save_pretrained('/content/drive/My Drive/Kaggle/nbme-score-clinical-patient-notes/ner/')

In [None]:
#mlm_pretraining(3)

In [None]:
#ner_modeling(False)

In [None]:
ner_modeling(True)

Some layers from the model checkpoint at /content/drive/My Drive/Kaggle/nbme-score-clinical-patient-notes/mlm_bert_large_uncased/ were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at /content/drive/My Drive/Kaggle/nbme-score-clinical-patient-notes/mlm_bert_large_uncased/ and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and in

Epoch 1/25




Precision:0.1536050156739812, Recall:0.006145279422092907, F1:0.011817765236475894
Epoch 2/25
Precision:0.47873303167420816, Recall:0.013268786997090398, F1:0.0258218827032436
Epoch 3/25
Precision:0.6451411775184822, Recall:0.18167452593558744, F1:0.2835111067619141
Epoch 4/25
Precision:0.6960164926514597, Recall:0.5250326076050968, F1:0.5985530868433845
Epoch 5/25