In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Vocabulary

In [None]:
# Vocabulary tables read from the "nbme-creating-vocabulary" input dataset.
# NOTE(review): neither frame is referenced again in the visible part of this
# notebook — confirm they are still needed before keeping these loads.
total_vocabulary_df = pd.read_csv('/kaggle/input/nbme-creating-vocabulary/total_vocabulary.csv')

total_annotation_vocabulary_df = pd.read_csv('/kaggle/input/nbme-creating-vocabulary/total_annotation_vocabulary.csv')

# Load Input Data

In [None]:
# Competition tables, all read from the "nbme-score-clinical-patient-notes" dataset.
# The helper functions below look these frames up as module-level globals.

# Patient notes; the helpers below read its `pn_num`, `case_num` and `pn_history` columns.
patient_notes_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')

# Feature catalogue; the helpers below read its `case_num` and `feature_num` columns.
features_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')

# Training rows; the helpers below read `case_num`, `pn_num`, `feature_num` and `annotation`.
train_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')

# Test rows; the helpers below read its `case_num` and `pn_num` columns.
test_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/test.csv')

# Data Functions

In [None]:
import ast

def take_pacient_note(pacient_note_num: int) -> str:
    """Return the `pn_history` text of the patient note with the given number.

    Args:
        pacient_note_num: Value to match against the `pn_num` column of
            `patient_notes_df`.

    Returns:
        The `pn_history` value of the first matching row.

    Raises:
        KeyError: If no row of `patient_notes_df` has that `pn_num`.
    """
    matching_histories = patient_notes_df.loc[
        patient_notes_df['pn_num'] == pacient_note_num, 'pn_history'
    ]
    if matching_histories.empty:
        # Same exception type as the former `...reset_index()[...][0]` lookup,
        # but with a message that names the missing note.
        raise KeyError(f"No patient note found with pn_num={pacient_note_num!r}")
    return matching_histories.iloc[0]

def get_training_patient_note_numbers(case_number):
    """Return the `pn_num` values of every `train_df` row of the given case.

    The result is a NumPy array with one entry per training row; it is not
    de-duplicated.
    """
    in_case = train_df['case_num'] == case_number
    return train_df.loc[in_case, 'pn_num'].to_numpy()

def get_all_feature_numbers(case_number):
    """Return, as a NumPy array, the `feature_num` of every `features_df` row of the case."""
    case_rows = features_df['case_num'] == case_number
    feature_numbers = features_df.loc[case_rows, 'feature_num']
    return feature_numbers.to_numpy()

def get_all_test_patient_note_numbers(case_number):
    """Return the distinct `pn_num` values of the `test_df` rows of the given case.

    The result is a list built from a set, so each number appears once and the
    order is whatever the set iteration yields.
    """
    case_note_numbers = test_df.loc[test_df['case_num'] == case_number, 'pn_num'].to_numpy()
    unique_note_numbers = set(case_note_numbers)
    return list(unique_note_numbers)

def get_all_patient_note_numbers(case_number):
    """Return, as a NumPy array, the `pn_num` of every `patient_notes_df` row of the case."""
    is_requested_case = patient_notes_df['case_num'] == case_number
    return patient_notes_df.loc[is_requested_case, 'pn_num'].to_numpy()

def get_feature_annotations(patient_note_num, feature_number):
    """Return the parsed `annotation` of one (patient note, feature) training row.

    The `annotation` cell of the first `train_df` row matching both numbers is
    evaluated with `ast.literal_eval`. A missing row raises `KeyError`.
    """
    row_matches = (train_df['pn_num'] == patient_note_num) & (train_df['feature_num'] == feature_number)
    matching_annotations = train_df.loc[row_matches, 'annotation'].reset_index(drop=True)
    raw_annotation = matching_annotations[0]
    return ast.literal_eval(raw_annotation)

def get_all_training_annotations(case_number):
    """Return the parsed `annotation` value of every `train_df` row of the given case.

    Each cell is evaluated with `ast.literal_eval`; the result keeps row order.
    """
    raw_annotations = train_df.loc[train_df['case_num'] == case_number, 'annotation'].to_numpy()
    return list(map(ast.literal_eval, raw_annotations))

In [None]:
# Distinct test patient-note numbers for case 0. The helper already returns a
# de-duplicated list, so no extra list(set(...)) wrapper is needed here.
get_all_test_patient_note_numbers(0)

# Preprocessing Open Data

In [None]:
def preprocessing(X_batch):
    """Lower-case and tokenize text with TensorFlow string ops.

    Steps, in order: lower-case; replace "\\n\\r" and "\\r\\n" pairs with a space;
    replace every character other than letters, digits, "-" and "'" with a
    space; surround each "-" with spaces so it becomes its own token; split on
    whitespace.

    Returns:
        The result of `tf.strings.split` on the cleaned text.
    """
    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (b"\n\r", b" "),
        (b"\r\n", b" "),
        (b"[^a-zA-Z0-9-']", b" "),
        (b"-", b" - "),
    )

    cleaned = tf.strings.lower(X_batch)
    for pattern, replacement in substitutions:
        cleaned = tf.strings.regex_replace(cleaned, pattern, replacement)
    return tf.strings.split(cleaned)

# Defining Truncated Vocabulary

In [None]:
import pickle

# Load the pickled truncated vocabulary from the "nbme-truncated-vocabulary" input dataset.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load this file if the dataset that provides it is trusted.
with open('/kaggle/input/nbme-truncated-vocabulary/truncated.vocabulary', 'rb') as truncated_vocabulary_file:
    truncated_vocabulary = pickle.load(truncated_vocabulary_file)

# Generate Lookup Table

In [None]:
import tensorflow as tf

# Number of out-of-vocabulary buckets handed to the StaticVocabularyTable.
num_oov_buckets = 1000

def create_lookup_table(truncated_vocabulary):
    """Build the token -> id lookup table and the reverse id -> token mapping.

    Vocabulary entries receive the ids 0 .. len(truncated_vocabulary) - 1 in
    list order. The table is created with the module-level `num_oov_buckets`.

    Returns:
        A tuple `(vocabulary_size, id_to_token, table)` where `id_to_token` is a
        dict built from the NumPy views of the id and token tensors, and
        `table` is the `tf.lookup.StaticVocabularyTable`.
    """
    vocabulary_size = len(truncated_vocabulary)
    token_tensor = tf.constant(truncated_vocabulary)
    id_tensor = tf.range(vocabulary_size, dtype=tf.int64)

    table = tf.lookup.StaticVocabularyTable(
        tf.lookup.KeyValueTensorInitializer(token_tensor, id_tensor),
        num_oov_buckets,
    )

    id_to_token = dict(zip(id_tensor.numpy(), token_tensor.numpy()))

    return vocabulary_size, id_to_token, table

len_vocabulary, vocab, lookup_table = create_lookup_table(truncated_vocabulary)

In [None]:
# Sanity check: show the token stored under the highest in-vocabulary id.
vocab[len_vocabulary - 1]

In [None]:
# Display the lookup table object created above.
lookup_table

# From Text And To Text

In [None]:
def add_padding(input_string, padding_size):
    """Surround `input_string` with `padding_size` copies of " <pad> " on each side."""
    pad_block = " <pad> " * padding_size
    return "".join((pad_block, input_string, pad_block))
    

In [None]:
# Number of context tokens taken on each side of the centre token of a window.
padding_size = 5

def get_X(coded_input_tensor):
    """Cut a 1-D sequence of token ids into overlapping fixed-size windows.

    Every window holds `2 * padding_size + 1` consecutive ids, and window `i`
    starts at position `i`, so its centre is position `i + padding_size`.

    Args:
        coded_input_tensor: Object exposing `.numpy()` that returns the id
            sequence (the caller passes the result of `lookup_table.lookup`).

    Returns:
        A NumPy array with one row per window. When the sequence is too short
        to form a window, the result is an empty array of shape `(0,)`.
    """
    window_size = 2 * padding_size + 1

    # Convert to NumPy once. Slicing the tensor and calling .numpy() for every
    # window repeated a tensor op and a host copy per token for no benefit.
    coded_values = coded_input_tensor.numpy()
    window_count = len(coded_values) - 2 * padding_size

    windows = [coded_values[start:start + window_size] for start in range(window_count)]

    return np.array(windows)

In [None]:
padding_size = 5

def get_input_from_text(input_string):
    """Turn raw note text into the array of token-id windows built by `get_X`.

    The text is padded with `add_padding`, tokenized with `preprocessing`,
    mapped to ids through `lookup_table`, and finally windowed with `get_X`.
    """
    padded_text = add_padding(input_string, padding_size)
    token_ids = lookup_table.lookup(preprocessing(padded_text))
    return get_X(token_ids)

In [None]:
def get_word(word_id):
    """Return the UTF-8 decoded token for `word_id`, or "" when the id is not below `len_vocabulary`."""
    return vocab[word_id].decode("utf-8") if word_id < len_vocabulary else ""

# Generate Submission File

In [None]:
def load_model(case_number, feature_number):
    """Load the saved Keras model for one (case, feature) pair from the input dataset."""
    model_path = f"/kaggle/input/nbme-train-all-models-from-trainning/model_for_case_{case_number}_and_feature_{feature_number}"
    return keras.models.load_model(model_path)

In [None]:
def get_sequences(y_pred, X, cut_prob, input_text):
    sequences = []
    sequence = []
    started_interval = False
    act_place = 0
    lower_input_text = tf.strings.lower(input_text).numpy().decode("utf-8")
    
    for i, y_value in enumerate(y):
        word = get_word(X[i][padding_size])
        place = lower_input_text.find(word)
        lower_input_text = lower_input_text[place:]
        act_place = act_place + place 
        
        if y_value > cut_prob:
            if started_interval == False:
                started_interval = True
                start_place = act_place
            end_place = act_place + len(word)
            sequence = [start_place, end_place]
        else:
            started_interval = False
            if sequence:
                sequences.append(sequence)
            sequence = []

    if sequence:
        sequences.append(sequence)
    
    return sequences

In [None]:
def sequences_to_text(sequences):
    """Format spans as "start end" pairs joined by ";" (empty string for no spans)."""
    return ";".join(f"{span[0]} {span[1]}" for span in sequences)

In [None]:
def text_id(patient_note_number, feature_number):
    """Build the row id "<pn>_<feature>".

    The note number is left-padded with zeros to at least 5 characters and the
    feature number to at least 3; longer values are kept unchanged.
    """
    note_part = str(patient_note_number).rjust(5, "0")
    feature_part = str(feature_number).rjust(3, "0")
    return f"{note_part}_{feature_part}"

In [None]:
# Probability threshold above which a token counts as part of an annotation.
cut_prob = 0.5

# Rows are collected in a plain list and turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 (under the old bare `except:` that
# silently dropped every row), and growing a frame row by row is quadratic.
submission_rows = []
for case_number in range(10):
    patient_note_numbers = get_all_test_patient_note_numbers(case_number)
    if not patient_note_numbers:
        # No test notes for this case, so none of its models need to be loaded.
        continue

    for feature_number in get_all_feature_numbers(case_number):
        print(f"Loading model for case {case_number} and feature {feature_number}")
        try:
            model = load_model(case_number, feature_number)
        except Exception as error:
            # Stay best-effort, but report the failure instead of hiding it.
            print(f"Could not load model for case {case_number} and feature {feature_number}: {error!r}")
            model = None

        for patient_note_number in patient_note_numbers:
            ident = text_id(patient_note_number, feature_number)
            # Fallback: an empty location, so the id is still present in the
            # submission when the model or the prediction fails.
            text_sequences = ""

            if model is not None:
                try:
                    print(f"Predicting for patient_note_number {patient_note_number}")
                    input_text = take_pacient_note(patient_note_number)
                    X = get_input_from_text(input_text)

                    y = model.predict(X)

                    sequences = get_sequences(y, X, cut_prob, input_text)
                    text_sequences = sequences_to_text(sequences)
                except Exception as error:
                    print(f"Prediction failed for {ident}: {error!r}")
                    text_sequences = ""

            print(f"Writing df for patient_note_number {patient_note_number} and feature_number {feature_number}")
            submission_rows.append({"id": ident, "location": text_sequences})

print("Saving dataframe to csv - making submission file")
submission = pd.DataFrame(submission_rows, columns=["id", "location"]).sort_values("id")
submission.to_csv("submission.csv", index=False)