In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file_path = os.path.join(dirname, filename)
        print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Vocabulary

In [None]:
# Paths of the two vocabulary CSV files shipped with the "nbme-creating-vocabulary" input.
total_vocabulary_path = '/kaggle/input/nbme-creating-vocabulary/total_vocabulary.csv'
total_annotation_vocabulary_path = '/kaggle/input/nbme-creating-vocabulary/total_annotation_vocabulary.csv'

# Each file is read into its own DataFrame.
total_vocabulary_df = pd.read_csv(total_vocabulary_path)
total_annotation_vocabulary_df = pd.read_csv(total_annotation_vocabulary_path)

# Load Input Data

In [None]:
# Paths of the three competition CSV files used by the helper functions below.
patient_notes_path = '/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv'
features_path = '/kaggle/input/nbme-score-clinical-patient-notes/features.csv'
train_path = '/kaggle/input/nbme-score-clinical-patient-notes/train.csv'

# One DataFrame per file; the helpers read these as module-level globals.
patient_notes_df = pd.read_csv(patient_notes_path)
features_df = pd.read_csv(features_path)
train_df = pd.read_csv(train_path)

# Data Functions

In [None]:
import ast

def take_pacient_note(pacient_note_num: int) -> str:
    """Return the ``pn_history`` value of the patient note with the given number.

    The note is looked up in the module-level ``patient_notes_df`` by its
    ``pn_num`` column. If several rows match, the first one is returned.

    Args:
        pacient_note_num: value to match against the ``pn_num`` column.

    Returns:
        The ``pn_history`` cell of the first matching row (presumably the
        note text as read from the CSV -- confirm the column dtype).

    Raises:
        KeyError: if no row of ``patient_notes_df`` has this ``pn_num``.
    """
    matching_histories = patient_notes_df.loc[
        patient_notes_df['pn_num'] == pacient_note_num, 'pn_history'
    ]
    if matching_histories.empty:
        # Keep the KeyError the positional ``[0]`` lookup used to raise,
        # but say which note number was missing.
        raise KeyError(f"No patient note found with pn_num={pacient_note_num}")
    return matching_histories.iloc[0]

def get_training_patient_note_numbers(case_number):
    """Return the ``pn_num`` values of the ``train_df`` rows that belong to a case.

    One entry is returned per matching row, so a note number is repeated for
    every training row that references it.
    """
    belongs_to_case = train_df['case_num'] == case_number
    return train_df.loc[belongs_to_case, 'pn_num'].to_numpy()

def get_all_feature_numbers(case_number):
    """Return the ``feature_num`` values of the ``features_df`` rows of a case as a NumPy array."""
    belongs_to_case = features_df['case_num'] == case_number
    return features_df.loc[belongs_to_case, 'feature_num'].to_numpy()

def get_all_patient_note_numbers(case_number):
    """Return the ``pn_num`` values of the ``patient_notes_df`` rows of a case as a NumPy array."""
    belongs_to_case = patient_notes_df['case_num'] == case_number
    return patient_notes_df.loc[belongs_to_case, 'pn_num'].to_numpy()

def get_feature_annotations(patient_note_num, feature_number):
    """Return the parsed ``annotation`` cell for one (patient note, feature) pair.

    The first ``train_df`` row matching both ``pn_num`` and ``feature_num`` is
    used. Its ``annotation`` string is parsed with ``ast.literal_eval``.
    A ``KeyError`` is raised when no row matches.
    """
    is_note = train_df['pn_num'] == patient_note_num
    is_feature = train_df['feature_num'] == feature_number
    matching_rows = train_df[is_note & is_feature].reset_index()
    raw_annotation = matching_rows['annotation'][0]
    return ast.literal_eval(raw_annotation)

def get_all_training_annotations(case_number):
    """Return the parsed ``annotation`` cells of every ``train_df`` row of a case.

    Each cell is a string that is parsed with ``ast.literal_eval``.
    The result is a list with one parsed value per matching row.
    """
    belongs_to_case = train_df['case_num'] == case_number
    raw_annotations = train_df[belongs_to_case]['annotation'].to_numpy()
    return list(map(ast.literal_eval, raw_annotations))

# Defining Truncated Vocabulary

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
truncated_vocabulary_path = '/kaggle/input/nbme-truncated-vocabulary/truncated.vocabulary'

with open(truncated_vocabulary_path, 'rb') as truncated_vocabulary_file:
    truncated_vocabulary = pickle.load(truncated_vocabulary_file)

# Generate Lookup Table

In [None]:
import tensorflow as tf

# Number of hash buckets reserved for words that are not in the vocabulary.
num_oov_buckets = 1000

def create_lookup_table(truncated_vocabulary):
    """Build a static word -> id table for the given vocabulary.

    Each word is mapped to its position in ``truncated_vocabulary`` (int64 ids
    ``0 .. len - 1``). The table is created with ``num_oov_buckets``
    out-of-vocabulary buckets.
    """
    vocabulary_length = len(truncated_vocabulary)
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(truncated_vocabulary),
        values=tf.range(vocabulary_length, dtype=tf.int64),
    )
    return tf.lookup.StaticVocabularyTable(initializer, num_oov_buckets)

lookup_table = create_lookup_table(truncated_vocabulary)

In [None]:
# Bare expression: the notebook shows the table's repr as this cell's output.
lookup_table

# Load Maps

In [None]:
import pickle

def load_input_output_trainning_maps(case_number):
    """Load the pickled input and output maps of one case.

    Both files are read from the ``nbme-map-trainning-inputs-and-outputs``
    input directory, the input map first and the output map second.

    NOTE(review): ``pickle.load`` can execute arbitrary code; the files must
    come from a trusted source.

    Returns:
        A tuple ``(input_map, output_map)``.
    """
    input_map_path = f'/kaggle/input/nbme-map-trainning-inputs-and-outputs/input.map.case.{case_number}'
    output_map_path = f'/kaggle/input/nbme-map-trainning-inputs-and-outputs/output.map.case.{case_number}'

    loaded_maps = []
    for map_path in (input_map_path, output_map_path):
        with open(map_path, 'rb') as map_file:
            loaded_maps.append(pickle.load(map_file))

    return tuple(loaded_maps)

# Creating Batch

In [None]:
# Number of positions taken on each side of a window; windows span 2 * defined_padding + 1 items.
defined_padding = 5

def get_batch(input_map, output_map, feature_number):
    """Build the sliding-window samples and labels of one feature.

    For every patient note in ``input_map`` and every position ``i`` of that
    note's labels in ``output_map[(patient_note_number, feature_number)]``,
    one sample is produced. The sample is the slice
    ``[i : i + 2 * defined_padding + 1]`` of the note's input sequence, and
    its label is the value at position ``i``.

    Args:
        input_map: mapping from patient note number to a sliceable sequence
            whose slices expose ``.numpy()`` (presumably a tensor of token
            ids, already padded so every window has the same length --
            TODO confirm against the notebook that produced the maps).
        output_map: mapping from ``(patient_note_number, feature_number)`` to
            an iterable of per-position labels exposing ``.numpy()``.
        feature_number: feature whose labels are selected from ``output_map``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` stacks the windows and ``y``
        holds one single-element row per window.

    Raises:
        KeyError: if ``output_map`` has no entry for a note/feature pair.
    """
    window_size = 2 * defined_padding + 1

    X = []
    y = []
    for patient_note_number in input_map:
        # Look both sequences up once per note instead of once per position.
        note_inputs = input_map[patient_note_number]
        note_outputs = output_map[(patient_note_number, feature_number)]
        for i, out in enumerate(note_outputs):
            X.append(note_inputs[i:i + window_size].numpy())
            y.append([out.numpy()])

    X = np.array(X)
    y = np.array(y)

    return (X, y)

# Model Training

In [None]:
# Embedding table size is the vocabulary plus the out-of-vocabulary buckets (see create_model).
vocab_size = len(truncated_vocabulary)
embed_size = 5

def create_model():
    """Create and compile the binary classifier used for each (case, feature) pair.

    The network is an embedding layer, two stacked bidirectional GRU layers
    with ``2 * defined_padding + 1`` units each, and a single sigmoid output
    unit. It is compiled with binary cross-entropy loss, the Adam optimizer
    and the accuracy metric.
    """
    gru_units = 2*defined_padding + 1

    embedding_layer = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None])
    # The first recurrent layer keeps the whole sequence so the second one can consume it.
    sequence_layer = keras.layers.Bidirectional(keras.layers.GRU(gru_units, return_sequences=True))
    summary_layer = keras.layers.Bidirectional(keras.layers.GRU(gru_units))
    output_layer = keras.layers.Dense(1, activation="sigmoid")

    model = keras.models.Sequential([embedding_layer, sequence_layer, summary_layer, output_layer])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    return model

In [None]:
def find_high(X,y,cut_value, mark_decision):
    """Select the samples whose label is strictly greater than ``cut_value``.

    Args:
        X: indexable collection of samples, aligned with ``y``.
        y: iterable of labels, one per sample.
        cut_value: exclusive lower bound a label must exceed to be kept.
        mark_decision: when true, every kept label is replaced by ``[1]``;
            otherwise the original label is kept.

    Returns:
        A tuple ``(X_choosed, y_choosed)`` of NumPy arrays holding the kept
        samples and their labels, in the original order.
    """
    kept_positions = [position for position, y_value in enumerate(y) if y_value > cut_value]

    if mark_decision:
        kept_labels = [[1] for _ in kept_positions]
    else:
        kept_labels = [y[position] for position in kept_positions]

    X_choosed = np.array([X[position] for position in kept_positions])
    y_choosed = np.array(kept_labels)

    return X_choosed, y_choosed

In [None]:
def find_low(X,y,cut_value, mark_decision):
    """Select the samples whose label is strictly less than ``cut_value``.

    Args:
        X: indexable collection of samples, aligned with ``y``.
        y: iterable of labels, one per sample.
        cut_value: exclusive upper bound a label must stay below to be kept.
        mark_decision: when true, every kept label is replaced by ``[0]``;
            otherwise the original label is kept.

    Returns:
        A tuple ``(X_choosed, y_choosed)`` of NumPy arrays holding the kept
        samples and their labels, in the original order.
    """
    kept_positions = [position for position, y_value in enumerate(y) if y_value < cut_value]

    if mark_decision:
        kept_labels = [[0] for _ in kept_positions]
    else:
        kept_labels = [y[position] for position in kept_positions]

    X_choosed = np.array([X[position] for position in kept_positions])
    y_choosed = np.array(kept_labels)

    return X_choosed, y_choosed

In [None]:
def shuffle_batch(X,y):
    """Return ``X`` and ``y`` reordered by the same random permutation.

    A single permutation of ``len(X)`` positions is drawn from NumPy's global
    random state and applied to both arrays, so samples stay aligned with
    their labels.
    """
    shuffled_order = np.random.permutation(len(X))
    shuffled_X = X[shuffled_order]
    shuffled_y = y[shuffled_order]
    return shuffled_X, shuffled_y

In [None]:
from tensorflow.keras.utils import Sequence

class EquilibriumSequence(Sequence):
    """Keras ``Sequence`` that serves one class-balanced training batch.

    The samples are split into a "high" group (label > ``high_cut``) and a
    "low" group (label < ``low_cut``). A batch combines the whole smaller
    group with an equally sized slice of the larger group.

    Args:
        full_X: array of samples.
        full_y: array of labels aligned with ``full_X``.
        low_cut: labels strictly below this value form the low group.
        high_cut: labels strictly above this value form the high group.
        mark_decision: passed on to ``find_high`` / ``find_low``; when true
            the labels are replaced by ``[1]`` / ``[0]`` respectively.
    """

    def __init__(self, full_X, full_y, low_cut=0.5, high_cut=0.5, mark_decision=False):
        # Newer Keras versions expect data-sequence subclasses to run the base
        # initializer; on older versions this is a no-op.
        super().__init__()

        # Split the arrays handed to the constructor. The previous code read the
        # notebook-level globals ``X`` / ``y`` and ignored these parameters.
        X_high, y_high  = find_high(full_X, full_y, high_cut, mark_decision)
        X_low,  y_low   = find_low(full_X, full_y, low_cut, mark_decision)

        len_high = len(X_high)
        len_low  = len(X_low)

        # "big" is the majority group and "small" the minority one; ties go to low.
        if len_high > len_low:
            self.X_big, self.y_big     = X_high, y_high
            self.X_small, self.y_small = X_low,  y_low
        else:
            self.X_big, self.y_big     = X_low,  y_low
            self.X_small, self.y_small = X_high, y_high

        # Shuffle once so the slice taken from the big group is a random subset.
        self.X_big, self.y_big     = shuffle_batch(self.X_big,   self.y_big)
        self.X_small, self.y_small = shuffle_batch(self.X_small, self.y_small)

    def __len__(self):
        """Return the number of batches per epoch, which is fixed at one."""
        # Only the first balanced batch is served, so the part of the big group
        # beyond the first ``len(self.X_small)`` samples is never requested.
        return 1

    def __getitem__(self, idx):
        """Return batch ``idx`` as a shuffled ``(X, y)`` pair.

        When the small group is empty the whole big group is returned.
        Otherwise the batch is slice ``idx`` of the big group, of the small
        group's size, concatenated with the entire small group.
        """
        small_size = len(self.X_small)

        if small_size == 0:
            X = self.X_big
            y = self.y_big
        else:
            big_slice = slice(idx * small_size, (idx + 1) * small_size)
            X = np.concatenate((self.X_big[big_slice], self.X_small), axis=0)
            y = np.concatenate((self.y_big[big_slice], self.y_small), axis=0)

        # Mix the two groups so they are not served as two contiguous runs.
        X, y = shuffle_batch(X, y)

        return X, y

In [None]:
# Train, evaluate and save one model per (case, feature) pair.
for case_number in range(10):
    feature_numbers = get_all_feature_numbers(case_number)
    if len(feature_numbers) == 0:
        # No feature to train for this case, so its maps are not needed.
        continue

    # The pickled maps depend only on the case: load them once per case
    # instead of once per feature.
    input_map, output_map = load_input_output_trainning_maps(case_number)

    for feature_number in feature_numbers:
        print(f"Getting data for case {case_number} and feature {feature_number}")
        X, y = get_batch(input_map, output_map, feature_number)

        print(f"Fitting model for case {case_number} and feature {feature_number}")
        model = create_model()

        # Training uses the class-balanced batch served by EquilibriumSequence.
        model.fit(EquilibriumSequence(X,y),epochs=4)

        # Evaluation runs on the full data the balanced batch was drawn from,
        # so it is not a held-out score.
        print(f"Evaluation model for case {case_number} and feature {feature_number}")
        model.evaluate(X,y)

        print(f"Saving model for case {case_number} and feature {feature_number}")
        model.save(f"/kaggle/working/model_for_case_{case_number}_and_feature_{feature_number}")
        print(f"Saved model for case {case_number} and feature {feature_number}")