In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the read-only input directory and print the full path of every file found,
# so the dataset paths used by the cells below can be checked against what is mounted.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Vocabulary

In [None]:
# Vocabulary tables read from the "nbme-creating-vocabulary" input dataset.
# NOTE(review): neither DataFrame is referenced again in the cells visible here — confirm they are still needed.
total_vocabulary_df = pd.read_csv('/kaggle/input/nbme-creating-vocabulary/total_vocabulary.csv')

total_annotation_vocabulary_df = pd.read_csv('/kaggle/input/nbme-creating-vocabulary/total_annotation_vocabulary.csv')

# Load Input Data

In [None]:
# Patient notes; the helper functions below read its 'pn_num', 'case_num' and 'pn_history' columns.
patient_notes_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')

# Feature catalogue; the helper functions below read its 'case_num' and 'feature_num' columns.
features_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')

# Training rows; the helper functions below read 'case_num', 'pn_num', 'feature_num' and 'annotation'.
train_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')

# Data Functions

In [None]:
import ast

def take_pacient_note(pacient_note_num: int) -> str:
    """Return the history text of a single patient note.

    Looks up ``pacient_note_num`` in the ``pn_num`` column of the module-level
    ``patient_notes_df`` and returns the ``pn_history`` value of the first
    matching row.

    Args:
        pacient_note_num: Value to match against ``patient_notes_df['pn_num']``.

    Returns:
        The ``pn_history`` entry of the first matching row (expected to be the
        note text, since the column is read from a CSV file).

    Raises:
        KeyError: If no row of ``patient_notes_df`` has the requested ``pn_num``.
    """
    matching_histories = patient_notes_df.loc[
        patient_notes_df['pn_num'] == pacient_note_num, 'pn_history'
    ]
    if matching_histories.empty:
        # Same exception type as the previous positional lookup raised on a
        # miss, but the message now names the note that could not be found.
        raise KeyError(f"No patient note found with pn_num={pacient_note_num}")
    return matching_histories.iloc[0]

def get_training_patient_note_numbers(case_number):
    """Return the ``pn_num`` values of the ``train_df`` rows belonging to a case.

    One value is returned per matching row, so a note number appears as many
    times as it has rows in ``train_df``.

    Args:
        case_number: Value to match against ``train_df['case_num']``.

    Returns:
        NumPy array with the ``pn_num`` of every matching row, in row order.
    """
    belongs_to_case = train_df['case_num'] == case_number
    return train_df.loc[belongs_to_case, 'pn_num'].to_numpy()

def get_all_feature_numbers(case_number):
    """Return the ``feature_num`` values that ``features_df`` lists for a case.

    Args:
        case_number: Value to match against ``features_df['case_num']``.

    Returns:
        NumPy array with the ``feature_num`` of every matching row, in row order.
    """
    belongs_to_case = features_df['case_num'] == case_number
    return features_df.loc[belongs_to_case, 'feature_num'].to_numpy()

def get_all_patient_note_numbers(case_number):
    """Return the ``pn_num`` of every note that ``patient_notes_df`` holds for a case.

    Args:
        case_number: Value to match against ``patient_notes_df['case_num']``.

    Returns:
        NumPy array with the ``pn_num`` of every matching row, in row order.
    """
    belongs_to_case = patient_notes_df['case_num'] == case_number
    return patient_notes_df.loc[belongs_to_case, 'pn_num'].to_numpy()

def get_feature_annotations(patient_note_num, feature_number):
    """Return the parsed annotation stored for one (note, feature) pair.

    Filters ``train_df`` by ``pn_num`` and then by ``feature_num``, takes the
    ``annotation`` cell of the first remaining row and evaluates it as a
    Python literal.

    Args:
        patient_note_num: Value to match against ``train_df['pn_num']``.
        feature_number: Value to match against ``train_df['feature_num']``.

    Returns:
        The object produced by ``ast.literal_eval`` on the annotation string.

    Raises:
        KeyError: If no row matches both values.
    """
    note_rows = train_df.loc[train_df['pn_num'] == patient_note_num]
    feature_annotations = note_rows.loc[
        note_rows['feature_num'] == feature_number, 'annotation'
    ].reset_index(drop=True)
    # Label 0 is the first match after renumbering the rows
    raw_annotation = feature_annotations[0]
    return ast.literal_eval(raw_annotation)

def get_all_training_annotations(case_number):
    """Return the parsed annotations of every ``train_df`` row belonging to a case.

    Args:
        case_number: Value to match against ``train_df['case_num']``.

    Returns:
        List with one entry per matching row, each being the result of
        ``ast.literal_eval`` on that row's ``annotation`` string.
    """
    case_annotations = train_df.loc[train_df['case_num'] == case_number, 'annotation']
    return list(map(ast.literal_eval, case_annotations.to_numpy()))

# Preprocessing Entry Data

In [None]:
def preprocessing(X_batch):
    """Normalise raw text and split it into tokens.

    The text is lower-cased, then a fixed sequence of regex substitutions is
    applied, and finally the result is split with ``tf.strings.split``.

    Args:
        X_batch: String or string tensor accepted by ``tf.strings.lower``.

    Returns:
        The result of ``tf.strings.split`` on the normalised text.
    """
    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (b"\n\r", b" "),             # newline + carriage return pair -> space
        (b"\r\n", b" "),             # carriage return + newline pair -> space
        (b"[^a-zA-Z0-9-']", b" "),   # anything but letters, digits, '-' and "'" -> space
        (b"-", b" - "),              # pad hyphens with spaces so they split into their own token
    )

    normalised = tf.strings.lower(X_batch)
    for pattern, replacement in substitutions:
        normalised = tf.strings.regex_replace(normalised, pattern, replacement)

    return tf.strings.split(normalised)

# Defining Truncated Vocabulary

In [None]:
import pickle

# Load the pickled truncated vocabulary from the "nbme-truncated-vocabulary" input dataset.
# NOTE: pickle.load can execute arbitrary code while unpickling — only load files from a trusted source.
with open('/kaggle/input/nbme-truncated-vocabulary/truncated.vocabulary', 'rb') as truncated_vocabulary_file:
    truncated_vocabulary = pickle.load(truncated_vocabulary_file)

# Generate Lookup Table

In [None]:
import tensorflow as tf

def create_lookup_table(truncated_vocabulary, num_oov_buckets=1000):
    """Build a static word -> id lookup table from a vocabulary.

    Each word receives its position in ``truncated_vocabulary`` as its id
    (``0 .. len(truncated_vocabulary) - 1``, dtype ``int64``).

    Args:
        truncated_vocabulary: Sequence of words accepted by ``tf.constant``;
            its order defines the ids.
        num_oov_buckets: Number of out-of-vocabulary buckets passed to
            ``tf.lookup.StaticVocabularyTable``. Defaults to 1000, the value
            that was previously hard-coded.

    Returns:
        A ``tf.lookup.StaticVocabularyTable`` initialised with the vocabulary.
    """
    words = tf.constant(truncated_vocabulary)
    word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
    vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

    return tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

# Notebook-level table shared by the input-mapping functions defined further down (used via lookup_table.lookup).
lookup_table = create_lookup_table(truncated_vocabulary)

In [None]:
# Bare expression: the notebook displays the table's repr as this cell's output.
lookup_table

# Input Map Functions

In [None]:
def add_padding(input_string, padding_size):
    """Surround a string with repeated ``" <pad> "`` markers.

    Args:
        input_string: Text to wrap.
        padding_size: How many ``" <pad> "`` markers to put on each side.

    Returns:
        ``input_string`` with ``padding_size`` markers prepended and the same
        number appended.
    """
    pad_run = " <pad> " * padding_size
    return pad_run + input_string + pad_run

In [None]:
def input_map_generation(input_string, padding_size):
    """Turn a raw string into a tensor of vocabulary ids.

    The string is padded with ``add_padding``, tokenised with
    ``preprocessing`` and each token is mapped through the notebook-level
    ``lookup_table``.

    Args:
        input_string: Text to encode.
        padding_size: Number of padding markers added on each side.

    Returns:
        The result of ``lookup_table.lookup`` on the token tensor.
    """
    # NOTE(review): preprocessing() replaces '<' and '>' with spaces, so each
    # "<pad>" marker reaches the lookup as the token "pad" — confirm the
    # vocabulary was built with the same convention.
    padded_text = add_padding(input_string, padding_size)
    return lookup_table.lookup(preprocessing(padded_text))

In [None]:
def map_input_pacient_note(patient_note_num, padding_size):
    """Encode the history text of one patient note as vocabulary ids.

    Args:
        patient_note_num: Note number passed to ``take_pacient_note``.
        padding_size: Number of padding markers added on each side of the text.

    Returns:
        The tensor produced by ``input_map_generation`` for that note's text.
    """
    note_text = take_pacient_note(patient_note_num)
    return input_map_generation(note_text, padding_size)

In [None]:
def map_all_no_training_inputs(case_number, padding_size):
    """Encode every note of a case that does not appear in the training rows.

    The note numbers are those returned by ``get_all_patient_note_numbers``
    minus those returned by ``get_training_patient_note_numbers``, processed
    in ascending order.

    Args:
        case_number: Case whose notes are encoded.
        padding_size: Number of padding markers added on each side of each note.

    Returns:
        Dict mapping each selected note number to the tensor returned by
        ``map_input_pacient_note``.
    """
    every_note = set(get_all_patient_note_numbers(case_number))
    training_notes = set(get_training_patient_note_numbers(case_number))
    non_training_notes = sorted(every_note - training_notes)

    return {
        note_num: map_input_pacient_note(note_num, padding_size)
        for note_num in non_training_notes
    }

In [None]:
# dicionario = map_all_no_training_inputs(2, 5)

# dicionario

# Generating and Saving Output Maps

In [None]:
import pickle

def save_input_map(input_map, case, output_dir='/kaggle/working'):
    """Pickle an input map to ``<output_dir>/input.map.case.<case>``.

    Args:
        input_map: Object to serialise with ``pickle.dump``.
        case: Case identifier, interpolated into the file name.
        output_dir: Directory the file is written to. Defaults to
            ``/kaggle/working``, the location that was previously hard-coded.

    Raises:
        OSError: If the file cannot be opened for writing (for example when
            ``output_dir`` does not exist).
    """
    output_path = os.path.join(output_dir, f'input.map.case.{case}')
    with open(output_path, 'wb') as input_case_file:
        pickle.dump(input_map, input_case_file)

In [None]:
# Number of " <pad> " markers added on each side of every note before it is encoded.
defined_padding = 5
# The loop below visits case numbers 0 .. number_of_cases - 1.
number_of_cases = 10

for case_num in range(number_of_cases):
    print(f"Creating input map for case {case_num}")
    # Encode every note of this case that has no rows in train_df.
    input_map = map_all_no_training_inputs(case_num, defined_padding)
    
    print(f"Saving maps for case {case_num}")
    # Pickle the {note number: encoded tensor} dict to /kaggle/working/input.map.case.<case_num>.
    save_input_map(input_map, case_num)
    print(f"Saved maps for case {case_num}")