In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random
import time
import os
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from ast import literal_eval

In [None]:
# Read the four competition CSV files in one statement; the order of the paths
# matches the order of the names on the left-hand side.
df_patient_notes, df_features, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv",
        "/kaggle/input/nbme-score-clinical-patient-notes/features.csv",
        "/kaggle/input/nbme-score-clinical-patient-notes/train.csv",
        "/kaggle/input/nbme-score-clinical-patient-notes/test.csv",
    )
)


In [None]:
def clean_string(text):
    """Return ``text`` prepared for character-level encoding.

    At the moment this is a deliberate pass-through: no normalisation is
    applied and the very same string object is handed back.

    TODO (ideas that were tried and are currently switched off):
      - lower-case the text and replace carriage returns / newlines;
      - normalise age spellings so that 17-yo, 17yo etc. all become 17-yo;
      - re-join the note from ``word_tokenize`` tokens.
    """
    cleaned = text
    return cleaned

In [None]:
%%time
# Run each raw text column through clean_string and store the result in a
# sibling "*_clean" column of the same frame.
for _frame, _raw_col, _clean_col in (
    (df_patient_notes, "pn_history", "pn_history_clean"),
    (df_train, "annotation", "annotation_clean"),
):
    _frame[_clean_col] = _frame[_raw_col].apply(clean_string)


In [None]:
# Length, in characters, of the longest note. It is measured on the cleaned
# text because that is the column the per-character label arrays and the padded
# model inputs are built from; measuring the raw column would silently go out
# of sync as soon as clean_string changes the length of a note.
# int(...) turns the numpy scalar into a plain Python int for later use as a
# sequence length / layer shape.
max_note_length = int(df_patient_notes["pn_history_clean"].str.len().max())
print(f"Max note length: {max_note_length}")

**PREPARE TARGET**

In [None]:
def _parse_list_literal(cell):
    """Turn a stringified Python list into a real list.

    Anything that is not a string (for example a list produced by an earlier
    run of this cell) is returned unchanged, so the cell can be re-executed
    without ``literal_eval`` raising on already-parsed values.
    """
    return literal_eval(cell) if isinstance(cell, str) else cell


# Rewriting ";" as "', '" splits a "a b;c d" entry into two separate quoted
# items, so that each "start end" pair becomes its own list element once the
# string is parsed. regex=False: ";" is meant literally.
df_train["location"] = df_train["location"].str.replace(";", "', '", regex=False)
df_train["location_list"] = df_train["location"].apply(_parse_list_literal)
df_train["annotation_clean"] = df_train["annotation_clean"].apply(_parse_list_literal)
df_train.head()

In [None]:
# Feature ids that actually occur in the training annotations.
features = df_train["feature_num"].unique()

# Number of feature classes used for the one-hot targets and the model output.
# Class indices written into the label arrays are positions in the unique
# feature_num list of features.csv, so the count has to come from that same
# list; counting the ids seen in train.csv would be too small whenever some
# feature never appears in an annotation.
num_of_features = len(df_features["feature_num"].unique())
print(f"Number of unique features: {num_of_features}")

In [None]:
%%time
pn_nums = df_train["pn_num"].unique()

# Default label for every note: -1 ("no feature") at each of the
# max_note_length character positions. All rows initially reference the same
# list object; annotated notes get their own array in the loop below.
empty_label = [-1 for _ in range(max_note_length)]
empty_labels = [empty_label for _ in range(len(df_patient_notes))]
df_patient_notes["label"] = empty_labels
labels = []

# Unique feature ids from features.csv; the position of an id in this array is
# the class index stored in the label arrays.
feature_mapping = df_features["feature_num"].unique()
feature_index = {feature_num: position for position, feature_num in enumerate(feature_mapping)}

# Row label of the first patient-note row for each pn_num, built once instead
# of boolean-filtering the whole notes frame for every annotated note.
note_row_by_pn = {}
for row_label, note_pn in zip(df_patient_notes.index, df_patient_notes["pn_num"]):
    note_row_by_pn.setdefault(note_pn, row_label)

# sort=False keeps the notes in order of first appearance in df_train.
for pn_num, features_for_pn in df_train.groupby("pn_num", sort=False):
    note_row = note_row_by_pn[pn_num]
    note_text = df_patient_notes.at[note_row, "pn_history_clean"]

    # One class index per character of the note; -1 means "no feature".
    label = np.full(len(note_text), -1)
    for feature_num, locations in zip(features_for_pn["feature_num"], features_for_pn["location_list"]):
        feature_label = feature_index[feature_num]
        for location in locations:
            # Each location is a "start end" pair of character offsets.
            start_text, end_text = location.split(" ")[:2]
            start, end = int(start_text), int(end_text)
            # End offset is exclusive and clipped to the note length. Spans
            # processed later overwrite earlier ones where they overlap.
            label[start:min(end, len(label))] = feature_label
    df_patient_notes.at[note_row, "label"] = label


In [None]:
# Characters that get their own id: the 26 lower-case letters followed by
# space, comma, hyphen and full stop.
alphabet = list("abcdefghijklmnopqrstuvwxyz") + [' ', ',', '-', '.']
alphabet_size = len(alphabet) + 2 # + 1 for oov + 1 for mask (=0)


def char_to_int(char):
    """Map a single character to its integer id, ignoring case.

    Characters listed in ``alphabet`` get their 1-based position in that list;
    every other character gets the out-of-vocabulary id ``alphabet_size - 1``.
    Id 0 is never returned here.
    """
    lowered = char.lower()
    try:
        return alphabet.index(lowered) + 1
    except ValueError:
        # Not part of the alphabet -> shared OOV id.
        return alphabet_size - 1


def list_of_chars_to_num(arr):
    """Encode an iterable of characters as a list of ids via ``char_to_int``."""
    return list(map(char_to_int, arr))
# Encode every cleaned note as a list of character ids, then pad it to
# max_note_length with pad_sequences' default fill value 0, the id that
# char_to_int never produces.
df_patient_notes["pn_history_padded"] = df_patient_notes["pn_history_clean"].apply(list).apply(list_of_chars_to_num)
df_patient_notes["pn_history_padded"] = sequence.pad_sequences(df_patient_notes["pn_history_padded"], maxlen=max_note_length).tolist()
# Labels are padded the same way so they stay aligned with the inputs, but the
# fill value must be -1 ("no feature"): 0 is a real feature class index, so the
# default fill would mark every padded position as belonging to that feature.
df_patient_notes["label"] = sequence.pad_sequences(df_patient_notes["label"], maxlen=max_note_length, value=-1).tolist()

In [None]:
def get_2D_target_from_label_array(label_list):
    """One-hot encode a sequence of class indices.

    Returns a float array with one row per entry of ``label_list`` and
    ``num_of_features + 1`` columns. A label of -1 selects the last column
    through negative indexing, so that extra column acts as the
    "no feature" class.
    """
    class_ids = np.array(label_list)
    n_positions = class_ids.size
    one_hot = np.zeros((n_positions, num_of_features + 1))
    row_ids = np.arange(n_positions)
    one_hot[row_ids, class_ids] = 1
    return one_hot

def get_2D_input_from_char_integers(integer_list):
    """One-hot encode a sequence of character ids.

    Returns a float array with one row per entry of ``integer_list`` and
    ``alphabet_size`` columns; row ``i`` has a single 1 in the column given by
    the i-th id.
    """
    char_ids = np.array(integer_list)
    n_positions = char_ids.size
    one_hot = np.zeros((n_positions, alphabet_size))
    row_ids = np.arange(n_positions)
    one_hot[row_ids, char_ids] = 1
    return one_hot

In [None]:
# Keep only the notes that have at least one row in df_train, working on an
# independent copy so the new columns do not touch df_patient_notes.
trainable_cases = df_train["pn_num"].unique()
_has_annotations = df_patient_notes["pn_num"].isin(trainable_cases)
df_notes_train = df_patient_notes.loc[_has_annotations].copy()

# Derived columns: one-hot targets, the padded ids as numpy arrays, and a
# one-hot version of those ids.
df_notes_train["label_2D"] = df_notes_train["label"].apply(get_2D_target_from_label_array)
df_notes_train["pn_history_padded"] = df_notes_train["pn_history_padded"].apply(np.array)
df_notes_train["pn_history_padded_2D"] = df_notes_train["pn_history_padded"].apply(
    get_2D_input_from_char_integers
)

**MODEL**

In [None]:
def get_model():
    """Build the character-level convolutional tagger (uncompiled).

    Input: a sequence of ``max_note_length`` character ids. The ids are
    embedded into ``alphabet_size`` dimensions, passed through four
    Conv1D(elu) + Dropout(0.2) stages, and finally through a Conv1D with
    ``num_of_features + 1`` filters and a softmax activation. All convolutions
    use 'same' padding, so the sequence length is kept and the output holds
    one class distribution per character position.

    NOTE(review): the embedding is created with ``mask_zero=True``; whether
    the Conv1D layers make any use of that mask is not visible here - confirm.
    """
    char_ids = layers.Input(shape=(max_note_length,))
    hidden = layers.Embedding(
        input_dim=alphabet_size,
        output_dim=alphabet_size,
        input_length=max_note_length,
        mask_zero=True,
    )(char_ids)

    # (filters, kernel_size) of each hidden convolution, in order.
    for n_filters, kernel_width in ((512, 7), (256, 7), (64, 3), (64, 3)):
        hidden = layers.Conv1D(
            filters=n_filters, kernel_size=kernel_width, padding='same', activation='elu'
        )(hidden)
        hidden = layers.Dropout(0.2)(hidden)

    per_char_probs = layers.Conv1D(
        filters=num_of_features+1, kernel_size=3, padding='same', activation='softmax'
    )(hidden)

    return tf.keras.Model(char_ids, per_char_probs)

In [None]:
# Build a fresh (uncompiled) network and print its summary.
model = get_model()
model.summary()

In [None]:

# Single compile call. A second compile() replaces the optimizer and loss set
# by an earlier one, so only one configuration can ever reach fit(); this is
# the one that was effective (Adam, lr 1e-3, mean squared error).
# NOTE(review): the network ends in a per-position softmax and is trained on
# one-hot targets; tf.keras.losses.CategoricalCrossentropy(from_logits=False)
# is the usual pairing for that - confirm mean squared error is intended.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.MeanSquaredError(),
)

In [None]:
# Stack the per-note arrays into dense batches: X holds the padded character
# ids, Y the matching one-hot targets.
X, Y = (
    np.array(df_notes_train[column].tolist())
    for column in ("pn_history_padded", "label_2D")
)

In [None]:
# Short training run: 2 epochs, batches of 256 notes, 20% of the samples held
# out for validation.
model.fit(
    X,
    Y,
    validation_split=0.2,
    shuffle=True,
    epochs=2,
    batch_size=256,
)

In [None]:
# Draw ten annotated notes at random and run them through the model.
samples = df_notes_train.sample(n=10)
samples_x = np.array(samples["pn_history_padded"].tolist())
samples_y_gt = np.array(samples["label_2D"].tolist())
samples_y = model.predict(samples_x)

# Most likely class per character position, for the predictions and for the
# ground truth (last axis = classes).
_predicted_classes = np.argmax(samples_y, axis=-1)
_true_classes = np.argmax(samples_y_gt, axis=-1)

for i, note_text in enumerate(samples["pn_history_clean"]):
    print(f"------- SAMPLE {i}")
    print(note_text)
    print(_predicted_classes[i])
    # Ground-truth classes of the same note; kept for inspection, not printed.
    sample_labels = _true_classes[i]
