In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.max_columns = 300
pd.options.display.max_colwidth = 80

from transformers import AutoTokenizer
from IPython.display import display

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


# Parameters

In [None]:
# Path handed to AutoTokenizer.from_pretrained when the tokenizer is loaded.
MDL_PATH = "../input/distil-roberta-base/"
# Token budget per encoded (feature, note) pair: used as max_length for
# tokenization and as the padded label length.
MAX_LEN = 512
# Passed as `nrows` when reading train.csv; None applies no row cap
# (set a small integer for quick experiments).
nrows_ = None


# Read data

In [None]:
# Load the four competition tables in one pass. Only train.csv honours the
# nrows_ cap; the other tables are always read in full (nrows=None).
train, notes, features, test = (
    pd.read_csv(f"/kaggle/input/nbme-score-clinical-patient-notes/{table}.csv",
                nrows=row_cap)
    for table, row_cap in (
        ("train", nrows_),
        ("patient_notes", None),
        ("features", None),
        ("test", None),
    )
)


In [None]:
# Preview the first rows of the training annotations table.
train.head()

In [None]:
# Preview the first rows of the patient notes table.
notes.head()

In [None]:
# Sanity check: row count of the features table vs. number of distinct feature ids.
print(features.shape)
print(features.feature_num.nunique())

# Replace hyphens in the feature descriptions with spaces.
# regex=False states explicitly that "-" is a literal string: the default of
# `regex` differs between pandas versions and pandas 1.x emits a FutureWarning
# when it is left unspecified.
features["feature_text"] = features.feature_text.str.replace("-", " ", regex=False)
features.head()


In [None]:
# Preview the first rows of the test table.
test.head()

# Merge data - Train with Notes with Features

In [None]:
# Row counts before merging, for comparison with the merged size printed below.
print(len(train), len(notes))

# Attach the note columns to every annotation row (inner join on case and
# note id), then add the feature description (left join on feature id).
train_merged = (
    train
    .merge(notes, how="inner", on=["case_num", "pn_num"])
    .merge(features[["feature_num", "feature_text"]], how="left", on=["feature_num"])
)

print(len(train_merged))
train_merged.head(3)

In [None]:
# Row counts before merging, for comparison with the merged size printed below.
print(len(test), len(notes))

# Same two-step join as for the training data: note columns via an inner join
# on case and note id, feature description via a left join on feature id.
test_merged = (
    test
    .merge(notes, how="inner", on=["case_num", "pn_num"])
    .merge(features[["feature_num", "feature_text"]], how="left", on=["feature_num"])
)

print(len(test_merged))
test_merged.head()

In [None]:
# Number of distinct feature ids present in train and in test, respectively.
print(train["feature_num"].nunique())
print(test["feature_num"].nunique())

# Helper - decode location

In [None]:
# https://www.kaggle.com/tchaye59/nbme-tensorflow-bert-baseline#Tokenizer

def decode_location(locations):
    """Parse a raw location string into a list of (start, end) integer tuples.

    The input is the textual form of a list of spans, e.g.
    ``"['70 91', '176 183']"`` or ``"['85 99;126 138']"``: brackets and single
    quotes are stripped, and both ``,`` and ``;`` separate individual
    ``"start end"`` pairs.

    Parameters
    ----------
    locations : str
        Raw location string. Any non-string value (e.g. None, or the NaN that
        pandas produces for an empty CSV cell) is treated as "no spans".

    Returns
    -------
    list of (int, int)
        The spans ordered by start offset; spans with equal starts keep their
        input order. Empty when there is nothing to decode.

    Raises
    ------
    ValueError
        If a non-blank segment does not consist of exactly two integers.
    """
    # A missing annotation arrives as NaN/None rather than a string.
    if not isinstance(locations, str):
        return []

    cleaned = locations
    for ch in ("[", "]", "'"):
        cleaned = cleaned.replace(ch, "")

    spans = []
    # Commas separate list items, semicolons separate parts of one item;
    # both delimit individual "start end" pairs.
    for segment in cleaned.replace(",", ";").split(";"):
        segment = segment.strip()
        if not segment:
            # Skip empty or whitespace-only segments (e.g. from "[]").
            continue
        start, end = segment.split()
        spans.append((int(start), int(end)))
    return sorted(spans, key=lambda span: span[0])

# Show one raw location string (index label 3 of train_merged) followed by
# its decoded form.
print(train_merged.location[3])
decode_location(train_merged.location[3])

In [None]:
# Decode every raw location string into its list of (start, end) tuples.
train_merged['location_'] = train_merged.location.apply(decode_location)

In [None]:
# Summary statistics of the note lengths, measured in characters.
notes.pn_history.str.len().describe()

# Tokenizer

In [None]:
# Load the tokenizer from MDL_PATH; it is used as a global by the cells below.
tokenizer = AutoTokenizer.from_pretrained(MDL_PATH)

In [None]:
# Tokenize one (feature, note) pair without padding to inspect the raw tokens.
# truncation=True is passed explicitly: giving max_length on its own makes the
# tokenizer log a warning and fall back to truncating implicitly.
sample_ = tokenizer.encode_plus((train_merged.feature_text[0], train_merged.pn_history[0]),
                                max_length=MAX_LEN,
                                truncation=True,
                                return_offsets_mapping=True)

# One column per token: the token id in the first row, its decoded text in the second.
pd.DataFrame([(s, tokenizer.decode(s)) for s in sample_['input_ids']]).T

# Data creation fn

### Prepare inputs as follows
- Tokenize each (feature_text, pn_history) pair, padding to `MAX_LEN` and returning the token offsets, which are used to create the labels
- For each token, check whether its offsets fall inside an annotated location span
- If yes, label the token 1; otherwise 0

In [None]:
def _label_tokens(token_offsets, sequence_ids, spans):
    """Return one 0/1 label per token: 1 if a note token lies fully inside any span."""
    token_labels = []
    for (tok_start, tok_end), seq_id in zip(token_offsets, sequence_ids):
        # Only tokens of the second sequence (the note) may be labelled.
        # Special and padding tokens have sequence id None; their offsets are
        # expected to be (0, 0), which would otherwise match any annotation
        # that starts at character 0 of the note.
        inside = seq_id == 1 and any(
            tok_start >= span_start and tok_end <= span_end
            for span_start, span_end in spans
        )
        token_labels.append(int(inside))
    return token_labels


def _show_example(i, tok_txt, label):
    """Display the tokens, offsets and generated labels of one encoded example."""
    sample_pd = pd.DataFrame({"text": tok_txt['input_ids']})
    sample_pd["decoded"] = sample_pd.text.apply(lambda x: tokenizer.decode(x))
    sample_pd["labels"] = label
    sample_pd["offset"] = tok_txt["offset_mapping"]
    print(f"Example {i}")
    # Hide padding tokens so the table stays readable.
    display(sample_pd[sample_pd["text"] != tokenizer.pad_token_id].T)
    print(f"Generated label for Example {i}")
    display(sample_pd.query("labels == 1")[['decoded']].T)
    print("\n")


def create_data(id_, pn_hist, feature_name, location, train=True, n_examples=6):
    """Tokenize (feature text, patient note) pairs and build per-token labels.

    Relies on the module-level ``tokenizer`` and ``MAX_LEN``. Every pair is
    padded/truncated to ``MAX_LEN`` tokens.

    Parameters
    ----------
    id_ : sequence
        Row identifiers, returned unchanged under the ``"ids"`` key.
    pn_hist : sequence of str
        Patient note texts (second sequence of each pair).
    feature_name : sequence of str
        Feature texts (first sequence of each pair).
    location : sequence of list of (start, end), or None
        Annotated spans per row, compared directly against the token offsets
        of the note (so both are assumed to be character offsets into the
        note text). Only read when ``train`` is True.
    train : bool, default True
        When True, build a 0/1 label for every token; when False each row
        gets an empty label list.
    n_examples : int, default 6
        Number of leading rows whose tokens and labels are displayed
        (training mode only). Pass 0 to silence the preview.

    Returns
    -------
    (dict, list)
        A dict with keys ``"input_ids"``, ``"attention_mask"``, ``"offsets"``
        and ``"ids"`` (one entry per row), and the list of per-row labels.

    Notes
    -----
    All inputs are consumed positionally, so pandas Series with any index
    can be passed.
    """
    ids = list(id_)
    note_texts = list(pn_hist)
    feature_texts = list(feature_name)
    row_spans = list(location) if train else None

    input_ids = []
    attention_mask = []
    offsets = []
    labels = []
    for i, (feature, note) in enumerate(zip(feature_texts, note_texts)):
        tok_txt = tokenizer.encode_plus((feature, note),
                                        max_length=MAX_LEN,
                                        return_offsets_mapping=True,
                                        padding='max_length',
                                        truncation=True)
        input_ids.append(tok_txt['input_ids'])
        attention_mask.append(tok_txt['attention_mask'])
        offsets.append(tok_txt["offset_mapping"])

        if not train:
            labels.append([])
            continue

        # sequence_ids() marks each token as None (special/padding),
        # 0 (feature text) or 1 (note).
        label = _label_tokens(tok_txt["offset_mapping"],
                              tok_txt.sequence_ids(),
                              row_spans[i])[:MAX_LEN]
        labels.append(label)
        if i < n_examples:
            _show_example(i, tok_txt, label)

    return {"input_ids": input_ids,
            "attention_mask": attention_mask,
            "offsets": offsets,
            "ids": ids}, labels
            
        

# Train data with examples of labels

In [None]:
# Show the first six merged rows, for comparison with the label examples
# displayed by create_data for the first six rows.
train_merged.head(6)

In [None]:
# Encode the training pairs and build their token labels from the decoded spans.
train_data, train_labels = create_data(
    id_=train_merged["id"],
    pn_hist=train_merged["pn_history"],
    feature_name=train_merged["feature_text"],
    location=train_merged["location_"],
)

# Create Test data

In [None]:
# Encode the test pairs; no locations are supplied, so every label list is empty.
test_data, test_labels = create_data(
    id_=test_merged["id"],
    pn_hist=test_merged["pn_history"],
    feature_name=test_merged["feature_text"],
    location=None,
    train=False,
)
#test_f_enc = f_num_enc.transform(test_merged.feature_num)
