In [None]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import  LabelEncoder
from tqdm.auto import tqdm
import random
import os
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import dill
import tensorflow.keras.backend as K
from tqdm.auto import tqdm
from tensorflow.keras import mixed_precision
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import AutoTokenizer, AutoConfig,TFAutoModel
import json

In [None]:
# Accelerator detection (TensorFlow 2.4+): one code path for TPU / GPU / multi-GPU.

try: # detect TPUs
    # connect() both attaches to the TPU cluster and initializes the TPU system,
    # so no extra experimental_connect_to_cluster / initialize_tpu_system calls
    # are needed (re-initializing would only reset the TPU a second time).
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Using TPU')
except ValueError: # no TPU available: fall back to GPU(s)
    tpu = None
    # MirroredStrategy covers single-GPU and multi-GPU machines.
    # Alternatives: tf.distribute.get_strategy() for the default CPU/single-GPU
    # strategy, or MultiWorkerMirroredStrategy for clusters of multi-GPU machines.
    strategy = tf.distribute.MirroredStrategy()

print("Number of accelerators: ", strategy.num_replicas_in_sync)


# tf.data.AUTOTUNE is the stable name of tf.data.experimental.AUTOTUNE.
AUTO = tf.data.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

In [None]:
# Seed every random number generator the notebook touches (Python, NumPy,
# TensorFlow) so repeated runs start from the same state.
seed=999
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
# Request TensorFlow's automatic mixed-precision rewrite, both via the
# environment variable and via the graph optimizer option.
# NOTE(review): the environment variable is set after TensorFlow has already
# been imported — confirm it still has an effect at this point.
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
print('Mixed precision enabled')

In [None]:
# Run mode: True fine-tunes the model and saves its weights; False rebuilds the
# model and loads previously saved weights for inference.
TRAIN = False 

# Load dataframes

In [None]:
# All competition CSVs live in one directory; keep it in a single constant so
# the location only has to be changed in one place.
INPUT_DIR = "../input/nbme-score-clinical-patient-notes"

features = pd.read_csv(f"{INPUT_DIR}/features.csv")
patient_notes = pd.read_csv(f"{INPUT_DIR}/patient_notes.csv")
test = pd.read_csv(f"{INPUT_DIR}/test.csv")
train = pd.read_csv(f"{INPUT_DIR}/train.csv")
sample_submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")

In [None]:
# Join each annotation row with its patient note and its feature definition.
# NOTE: `train` / `test` are rebound to the merged frames, so re-running this
# cell without reloading the CSVs merges the already-merged frames again.
_note_keys = ['case_num', 'pn_num']
_feature_keys = ['case_num', 'feature_num']

test = test.merge(patient_notes, on=_note_keys).merge(features, on=_feature_keys)
train = train.merge(patient_notes, on=_note_keys).merge(features, on=_feature_keys)

In [None]:
# Preview the merged training frame (head() shows 5 rows by default).
train.head()

# Tokenizer

In [None]:
# Hugging Face checkpoint name used for the tokenizer and the backbone.
MODEL_NAME = 'bert-base-uncased'
# Directory with artifacts saved by an earlier run of this notebook
# (tokenizer/config, label encoder, encoded arrays, model weights).
DATA_PATH = "../input/nbmebertv1"
# When True, later cells load those artifacts instead of rebuilding them.
DATA_EXISTS = os.path.exists(DATA_PATH)
# Every note is padded/truncated to this many tokens.
SEQUENCE_LENGTH = 512

In [None]:
# Pick where the tokenizer and model config come from: the cached artifact
# directory when it exists, otherwise the named checkpoint.
_tokenizer_source = DATA_PATH+"/my_tokenizer/" if DATA_EXISTS else MODEL_NAME
_config_source = DATA_PATH+"/my_tokenizer/config.json" if DATA_EXISTS else MODEL_NAME

tokenizer = AutoTokenizer.from_pretrained(_tokenizer_source, normalization=True)
config = AutoConfig.from_pretrained(_config_source)

if not DATA_EXISTS:
    # First run: persist both next to the notebook so they can be reused later.
    tokenizer.save_pretrained('my_tokenizer')
    config.save_pretrained('my_tokenizer')

# Encode the label

In [None]:
# Extra class for tokens that belong to no feature.
EMPTY =  'EMPTY'
# NOTE(review): this list mixes a string with the feature_num values; the
# encoder is fitted on it as-is — confirm the resulting class dtype is intended.
CLASSES = [EMPTY,]+features.feature_num.unique().tolist()

if DATA_EXISTS:
    # SECURITY: dill (like pickle) can execute arbitrary code while loading —
    # only load encoder files that come from a trusted source.
    with open(DATA_PATH+"/label_encoder.dill",'rb') as encoder_file:
        label_encoder = dill.load(encoder_file)
else:
    # Fit a fresh encoder and persist it so inference uses the same mapping.
    label_encoder = LabelEncoder()
    label_encoder.fit(CLASSES)
    with open('label_encoder.dill','wb') as encoder_file:
        dill.dump(label_encoder, encoder_file)

# Encoded class index of each row's feature.
train['TARGET']= label_encoder.transform(train['feature_num'])
test['TARGET']= label_encoder.transform(test['feature_num'])
N_CLASSES = len(label_encoder.classes_)
# Class index that marks "no feature" tokens.
EMPTY_IDX = label_encoder.transform([EMPTY,]) [0]

In [None]:
def decode_location(locations):
    """Parse a stringified list of character spans.

    Brackets and single quotes are stripped, commas are treated like
    semicolons, and every non-empty ``"start end"`` piece becomes an
    ``(int, int)`` tuple.

    Parameters
    ----------
    locations : str
        Text such as ``"['85 99', '126 138']"`` or ``"['85 99;126 138']"``.

    Returns
    -------
    list of (int, int)
        The spans ordered by their start value.
    """
    # Drop "[", "]" and "'" and turn "," into ";" in a single pass.
    cleanup = str.maketrans({"[": None, "]": None, "'": None, ",": ";"})
    spans = []
    for piece in locations.translate(cleanup).split(";"):
        if not piece:
            continue
        begin, finish = piece.split()
        spans.append((int(begin), int(finish)))
    spans.sort(key=lambda span: span[0])
    return spans
    

In [None]:
if DATA_EXISTS:
    # Reuse arrays from an earlier run. Passing the path (instead of an open
    # file object) lets NumPy open and close the file itself.
    sequences = np.load(DATA_PATH+"/sequences.npy")
    masks = np.load(DATA_PATH+"/masks.npy")
    labels = np.load(DATA_PATH+"/labels.npy")
else:
    sequences, labels, masks = [], [], []
    # One example per patient note: every annotation row of the note writes
    # its class into the same per-token label vector.
    for _, gdf in tqdm(train.groupby('pn_num')):
        pn_history = gdf.iloc[0].pn_history

        tokens = tokenizer.encode_plus(pn_history, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)
        sequence = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        # Start with every token marked as "no feature".
        label = np.full(SEQUENCE_LENGTH, EMPTY_IDX)

        # BUILD THE TARGET ARRAY
        offsets = tokens['offset_mapping']
        label_empty = True
        for _, row in gdf.iterrows():
            # Parse the annotation once per row rather than once per token.
            spans = decode_location(row.location)
            if not spans:
                continue
            for i, (w_start, w_end) in enumerate(offsets):
                # Tokens with an empty character range are never labelled.
                if w_start >= w_end:
                    continue
                # A token is labelled when it lies entirely inside a span.
                if any(w_start >= start and end >= w_end for start, end in spans):
                    label[i] = row.TARGET
                    label_empty = False
        # Notes without a single labelled token are left out of training.
        if not label_empty:
            sequences.append(sequence)
            masks.append(attention_mask)
            labels.append(label)

    sequences = np.array(sequences).astype(np.int32)
    masks = np.array(masks).astype(np.uint8)
    # One-hot encode the per-token class indices.
    labels = np.array(tf.keras.utils.to_categorical(labels,N_CLASSES)).astype(np.uint8)

    # Persist the arrays so later runs can skip the encoding step.
    np.save("sequences.npy", sequences)
    np.save("masks.npy", masks)
    np.save("labels.npy", labels)

# Define Model

In [None]:
def build_model():
    """Build the token-classification model.

    Two int32 inputs of length ``SEQUENCE_LENGTH`` ('tokens' and 'attention')
    feed a transformer backbone; its first output passes through dropout and a
    softmax ``Dense`` layer with ``N_CLASSES`` units.

    When ``DATA_EXISTS`` is true the backbone is created from the saved config
    only (no pretrained weights are loaded here); otherwise it is loaded from
    ``MODEL_NAME``.
    NOTE(review): with ``DATA_EXISTS`` and ``TRAIN`` both true, training would
    start from a config-initialised backbone — confirm that is intended.

    Returns
    -------
    tf.keras.Model
        Uncompiled model taking ``[tokens, attention]``.
    """
    token_ids = layers.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32, name='tokens')
    attention_mask = layers.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32, name='attention')

    if not DATA_EXISTS:
        backbone_config = AutoConfig.from_pretrained(MODEL_NAME)
        backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=backbone_config)
    else:
        backbone_config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
        backbone = TFAutoModel.from_config(backbone_config)

    # Index 0 of the backbone output is the tensor fed to the per-token head.
    hidden_states = backbone(token_ids, attention_mask=attention_mask)[0]
    regularized = layers.Dropout(0.2)(hidden_states)
    class_probabilities = layers.Dense(N_CLASSES, activation='softmax')(regularized)

    return keras.Model([token_ids, attention_mask], class_probabilities)

# Model training

In [None]:
if TRAIN:
    # Model, optimizer and training all happen inside the strategy scope.
    with strategy.scope():
        model = build_model()

        # Stop once the training loss has failed to improve for 3 epochs.
        callback = tf.keras.callbacks.EarlyStopping(
            monitor='loss',
            mode='min',
            patience=3,
        )

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            loss=tf.keras.losses.categorical_crossentropy,
            metrics=['acc'],
        )

        history = model.fit(
            x=(sequences, masks),
            y=labels,
            batch_size=12,
            epochs=10,
            callbacks=[callback],
        )

        # Persist the fine-tuned weights for the inference run.
        model.save_weights('model.h5')

# Submit

In [None]:
if not TRAIN:
    # Build and restore the model inside the distribution strategy, exactly
    # like the training cell does, so its variables are created on the
    # accelerator(s) that were detected at the top of the notebook.
    with strategy.scope():
        model = build_model()
        model.load_weights(DATA_PATH+"/model.h5")

In [None]:
# Per-note containers; position k in every list refers to the same note.
test_sequences, test_masks, test_offsets = [], [], []
row_ids = []
targets = []

for _, gdf in tqdm(test.groupby('pn_num')):
    # All rows of a group share the same note text; tokenize it once.
    encoded = tokenizer.encode_plus(gdf.iloc[0].pn_history, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)

    test_sequences.append(encoded['input_ids'])
    test_masks.append(encoded['attention_mask'])
    test_offsets.append(encoded['offset_mapping'])

    # Encoded feature classes requested for this note and their submission ids.
    targets.append(gdf['TARGET'].tolist())
    row_ids.append(gdf['id'].tolist())

test_sequences = np.array(test_sequences).astype(np.int32)
test_masks = np.array(test_masks).astype(np.uint8)
# For every note: encoded feature class -> submission row id.
targets_to_row_ids = [dict(zip(note_targets, note_ids)) for note_targets, note_ids in zip(targets, row_ids)]

In [None]:
# Per-token class probabilities reduced to the most likely class index.
preds = model.predict((test_sequences,test_masks),batch_size=16).argmax(axis=-1)

In [None]:
def decode_position(pos):
    """Serialise spans into the submission format.

    Parameters
    ----------
    pos : iterable of (start, end)
        Character spans.

    Returns
    -------
    str
        ``"start end"`` pieces joined by ``";"``; empty string for no spans.
    """
    pieces = []
    for span in pos:
        fields = np.array(span).astype(str)
        pieces.append(" ".join(fields))
    return ";".join(pieces)


def translate(preds,targets_to_row_ids,offsets):
    """Convert per-token class predictions into submission rows.

    Consecutive tokens predicted as the same requested class are merged into
    one character span running from the first token's start offset to the
    last token's end offset.

    Tokens with an empty character range (start == end, e.g. special or
    padding tokens) never contribute to a span. A run that touches such tokens
    keeps the part covered by real tokens, and a run made only of such tokens
    is ignored without affecting the rest of the note.

    Parameters
    ----------
    preds : sequence
        One sequence of predicted class indices per note.
    targets_to_row_ids : list of dict
        Per note, mapping from encoded class index to submission row id.
    offsets : sequence
        Per note, the ``(char_start, char_end)`` pair of every token.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` and ``location``; every row id from
        ``targets_to_row_ids`` appears, with an empty location when nothing
        was predicted for it.
    """
    all_ids = []
    all_pos = []

    for k in range(len(preds)):
        offset = offsets[k]
        pred = preds[k]
        targets_to_ids = targets_to_row_ids[k]

        prediction = {row_id: [] for row_id in targets_to_ids.values()}
        # Walk only over positions that have both a prediction and an offset.
        n_tokens = min(len(pred), len(offset))
        i = 0
        while i < n_tokens:
            label = pred[i]

            # Skip "no feature" tokens and classes not requested for this note.
            if label == EMPTY_IDX or label not in targets_to_ids:
                i += 1
                continue

            # Consume the whole run of identical labels, tracking the extent
            # of the real (non-empty) tokens inside it.
            start = end = None
            while i < n_tokens and pred[i] == label:
                tok_start, tok_end = offset[i]
                if tok_end > tok_start:
                    if start is None:
                        start = tok_start
                    end = tok_end
                i += 1

            if start is not None:
                prediction[targets_to_ids[label]].append((start, end))

        for key in prediction:
            all_ids.append(key)
            all_pos.append(decode_position(prediction[key]))
    df = pd.DataFrame({
        "id":all_ids,
        "location": all_pos
    })
    return df

In [None]:
# Turn the token-level predictions into one "id"/"location" row per test id.
sub = translate(preds,targets_to_row_ids,test_offsets)
# Write the submission file without the DataFrame index.
sub.to_csv('submission.csv',index=False)
# Preview the first rows of the submission.
sub.head(50)