In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found,
# so the attached datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Vocabulary

In [None]:
# Vocabulary table from the nbme-creating-vocabulary dataset. It is parsed further
# down with load_voc_counter, which reads its 'word' and 'count' columns.
total_vocabulary_df = pd.read_csv('/kaggle/input/nbme-creating-vocabulary/total_vocabulary.csv')

# Annotation vocabulary table from the same dataset; its 'word' and 'count' columns
# are read later to build total_annotation_vocabulary_counter.
total_annotation_vocabulary_df = pd.read_csv('/kaggle/input/nbme-creating-vocabulary/total_annotation_vocabulary.csv')

In [None]:
def load_vocabulary(case_number):
    """Read the vocabulary CSV of a single case into a DataFrame.

    Args:
        case_number: value interpolated into the file name
            ``case_vocabulary_<case_number>.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = f'/kaggle/input/nbme-creating-vocabulary/case_vocabulary_{case_number}.csv'
    case_vocabulary = pd.read_csv(csv_path)
    return case_vocabulary

def load_annot_vocabulary(case_number):
    """Read the annotation-vocabulary CSV of a single case into a DataFrame.

    Args:
        case_number: value interpolated into the file name
            ``case_annotation_vocabulary_<case_number>.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = f'/kaggle/input/nbme-creating-vocabulary/case_annotation_vocabulary_{case_number}.csv'
    case_annotation_vocabulary = pd.read_csv(csv_path)
    return case_annotation_vocabulary

# Load Input Data

In [None]:
# Patient notes; the helpers below read its 'pn_num', 'case_num' and 'pn_history' columns.
patient_notes_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')

# Feature table of the competition dataset (not referenced by the helpers visible in this notebook section).
features_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')

# Training rows; the helpers below read its 'case_num', 'pn_num' and 'annotation' columns.
train_df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')

# Data Functions

In [None]:
import ast

def take_pacient_note(pacient_note_num: int) -> str:
    """Return the history text of the patient note with the given number.

    Looks ``pacient_note_num`` up in the ``pn_num`` column of the module-level
    ``patient_notes_df`` and returns the ``pn_history`` value of the first
    matching row.

    Args:
        pacient_note_num: value to match against ``patient_notes_df['pn_num']``.

    Returns:
        The ``pn_history`` entry of the first matching row.

    Raises:
        KeyError: if no row of ``patient_notes_df`` has that ``pn_num``.
    """
    matching_histories = patient_notes_df.loc[
        patient_notes_df['pn_num'] == pacient_note_num, 'pn_history'
    ]
    if matching_histories.empty:
        # Same exception type the positional [0] lookup used to raise, but with
        # a message that names the missing note instead of "KeyError: 0".
        raise KeyError(f"no patient note with pn_num={pacient_note_num!r}")
    # iloc is positional, so no reset_index() copy is needed to reach the first match.
    return matching_histories.iloc[0]

def get_training_patient_note_numbers(case_number):
    """Return the ``pn_num`` values of all ``train_df`` rows of one case.

    Args:
        case_number: value compared against ``train_df['case_num']``.

    Returns:
        NumPy array with the ``pn_num`` of every matching row, in row order.
    """
    is_requested_case = train_df['case_num'].eq(case_number)
    return train_df.loc[is_requested_case, 'pn_num'].to_numpy()

def get_all_patient_note_numbers(case_number):
    """Return the ``pn_num`` values of all ``patient_notes_df`` rows of one case.

    Args:
        case_number: value compared against ``patient_notes_df['case_num']``.

    Returns:
        NumPy array with the ``pn_num`` of every matching row, in row order.
    """
    is_requested_case = patient_notes_df['case_num'].eq(case_number)
    return patient_notes_df.loc[is_requested_case, 'pn_num'].to_numpy()

def get_all_training_annotations(case_number):
    """Return the parsed ``annotation`` values of all ``train_df`` rows of one case.

    Each ``annotation`` cell is text holding a Python literal; it is parsed with
    ``ast.literal_eval``.

    Args:
        case_number: value compared against ``train_df['case_num']``.

    Returns:
        List with one parsed annotation per matching row, in row order.
    """
    raw_annotations = train_df.loc[train_df['case_num'] == case_number, 'annotation'].to_numpy()
    return list(map(ast.literal_eval, raw_annotations))

# Preprocessing Input Data

In [None]:
def preprocessing(X_batch):
    """Lower-case, clean and tokenize a batch of strings with ``tf.strings`` ops.

    Steps, in order:
      1. lower-case the text;
      2. replace "\\n\\r" and "\\r\\n" sequences with a space;
      3. replace every character other than letters, digits, '-' and "'" with a space;
      4. surround each '-' with spaces so it is split off from neighbouring text;
      5. split with ``tf.strings.split`` (default separator).

    Args:
        X_batch: string input accepted by ``tf.strings.lower``.

    Returns:
        The result of ``tf.strings.split`` on the cleaned text.
    """
    # (pattern, replacement) pairs; they must be applied in exactly this order.
    substitutions = (
        (b"\n\r", b" "),
        (b"\r\n", b" "),
        (b"[^a-zA-Z0-9-']", b" "),
        (b"-", b" - "),
    )
    cleaned = tf.strings.lower(X_batch)
    for pattern, replacement in substitutions:
        cleaned = tf.strings.regex_replace(cleaned, pattern, replacement)
    return tf.strings.split(cleaned)

# Defining Truncated Vocabulary

In [None]:
from collections import Counter
import ast

def load_voc_counter(voc_df):
    """Build a word -> count ``Counter`` from a vocabulary DataFrame.

    Args:
        voc_df: DataFrame with a 'word' column whose entries are Python literals
            stored as text (each is parsed with ``ast.literal_eval``) and a
            'count' column.

    Returns:
        ``collections.Counter`` keyed by the parsed words. If a word occurs in
        several rows, the count of the last such row is kept.
    """
    # Iterate the two columns directly instead of DataFrame.iterrows(), which
    # builds a Series object for every row just to read two fields from it.
    return Counter({
        ast.literal_eval(word): count
        for word, count in zip(voc_df['word'], voc_df['count'])
    })

In [None]:
def truncate_vocabulary(tot_voc_df, annot_voc_df, count_cut=5):
    """Build a truncated vocabulary from a full and an annotation vocabulary.

    The result contains every word of ``tot_voc_df`` whose count is strictly
    greater than ``count_cut``, every word of ``annot_voc_df`` regardless of its
    count, and the padding token ``b'<pad>'``.

    The list order is deterministic: frequent words by descending count, then
    annotation words by descending count, then ``b'<pad>'``, with duplicates
    dropped. Building the list from a set would instead follow hash order,
    which Python randomizes per process for bytes/str, so the word indices
    would change from one kernel run to the next.

    Args:
        tot_voc_df: vocabulary DataFrame accepted by ``load_voc_counter``.
        annot_voc_df: annotation vocabulary DataFrame accepted by
            ``load_voc_counter``.
        count_cut: words of ``tot_voc_df`` need a count above this value to be kept.

    Returns:
        Tuple ``(truncated_vocabulary, vocab_size, num_oov_buckets)`` where
        ``truncated_vocabulary`` is a list of unique words, ``vocab_size`` its
        length and ``num_oov_buckets`` is ``vocab_size // 10``.
    """
    voc_counter = load_voc_counter(tot_voc_df)
    annot_voc_counter = load_voc_counter(annot_voc_df)

    # Apply the threshold to the counts themselves. Counting qualifying rows and
    # slicing most_common() by that number can let in words at or below the cut
    # when the CSV lists a word more than once.
    frequent_words = [word for word, count in voc_counter.most_common() if count > count_cut]
    annotation_words = [word for word, _ in annot_voc_counter.most_common()]

    # dict.fromkeys removes duplicates while keeping first-seen order.
    truncated_vocabulary = list(dict.fromkeys([*frequent_words, *annotation_words, b'<pad>']))

    vocab_size = len(truncated_vocabulary)
    num_oov_buckets = vocab_size // 10

    return truncated_vocabulary, vocab_size, num_oov_buckets

In [None]:
import pickle

def save_truncated_vocabulary(case_number, truncated_vocabulary, vocab_size, num_oov_buckets):
    """Pickle one case's truncated vocabulary and its size information.

    Writes a dict with the keys "truncated_vocabulary", "vocab_size" and
    "num_oov_buckets" to ``/kaggle/working/truncated.vocabulary.<case_number>``.

    Args:
        case_number: suffix of the output file name.
        truncated_vocabulary: vocabulary object to store.
        vocab_size: value stored under "vocab_size".
        num_oov_buckets: value stored under "num_oov_buckets".
    """
    output_path = f'/kaggle/working/truncated.vocabulary.{case_number}'
    payload = {
        "truncated_vocabulary": truncated_vocabulary,
        "vocab_size": vocab_size,
        "num_oov_buckets": num_oov_buckets,
    }
    with open(output_path, 'wb') as sink:
        pickle.dump(payload, sink)

In [None]:
# Build and save one truncated vocabulary per case.
# NOTE(review): range(10) presumably matches the case numbers 0-9 of the dataset — confirm.
for case_number in range(10):
    # Per-case vocabulary and annotation-vocabulary tables.
    voc_df = load_vocabulary(case_number)
    annot_voc_df = load_annot_vocabulary(case_number)
    
    # Uses the default count_cut of truncate_vocabulary.
    truncated_vocabulary, vocab_size, num_oov_buckets = truncate_vocabulary(voc_df, annot_voc_df)
    
    print(f"Case {case_number}: vocab_size - {vocab_size}")
    
    # Written to /kaggle/working/truncated.vocabulary.<case_number>.
    save_truncated_vocabulary(case_number, truncated_vocabulary, vocab_size, num_oov_buckets)

In [None]:
# Parse the corpus-wide vocabulary table into a word -> count Counter.
total_vocabulary_counter = load_voc_counter(total_vocabulary_df)

# Maximum number of most frequent corpus words kept in the global vocabulary.
size_total_vocabulary = 10000

# Start the global vocabulary (a set at this stage) from the top-N corpus words.
# This rebinds the truncated_vocabulary name left over from the per-case loop.
truncated_vocabulary = {
    word for word, _ in total_vocabulary_counter.most_common(size_total_vocabulary)
}

In [None]:
# Preview the first five rows of the annotation vocabulary table.
total_annotation_vocabulary_df.head(5)

In [None]:
from collections import Counter

# Reuse load_voc_counter (defined earlier in this notebook) instead of repeating
# its parsing loop inline, so the annotation vocabulary is parsed exactly like
# total_vocabulary_counter: 'word' through ast.literal_eval, 'count' as the value.
total_annotation_vocabulary_counter = load_voc_counter(total_annotation_vocabulary_df)

In [None]:
# Number of distinct words in the annotation vocabulary counter.
len(total_annotation_vocabulary_counter)

In [None]:
# Maximum number of most frequent annotation words added to the global vocabulary.
size_annotation_vocabulary = 1000

# Extend the corpus words with the top annotation words and the padding token.
# truncated_vocabulary must still be a set here (it is turned into a list further down).
truncated_vocabulary = truncated_vocabulary.union(
    {word for word, _ in total_annotation_vocabulary_counter.most_common(size_annotation_vocabulary)},
    {b'<pad>'},
)

In [None]:
# Sort while freezing the set into a list. Iterating a set of bytes/str follows
# hash order, which Python randomizes per process, so list(set) would give each
# word a different index (and lookup-table id) on every kernel restart.
# NOTE(review): assumes all words share one comparable type (bytes, like b'<pad>') — confirm.
truncated_vocabulary = sorted(truncated_vocabulary)

In [None]:
# Size of the global vocabulary list; this rebinds the vocab_size name that the
# per-case loop above last assigned.
vocab_size = len(truncated_vocabulary)
vocab_size

In [None]:
import pickle

# Pickle the global vocabulary list (only the list, unlike the per-case files,
# which store a dict) to the Kaggle working directory.
with open('/kaggle/working/truncated.vocabulary', 'wb') as truncated_vocabulary_file:
    pickle.dump(truncated_vocabulary, truncated_vocabulary_file)

# Generate Lookup Table

In [None]:
import tensorflow as tf

def create_lookup_table(truncated_vocabulary, num_oov_buckets=1000):
    """Build a static word -> integer-id lookup table from a vocabulary list.

    The word at position ``i`` of ``truncated_vocabulary`` is mapped to id ``i``.

    Args:
        truncated_vocabulary: sequence of vocabulary words accepted by
            ``tf.constant``.
        num_oov_buckets: number of out-of-vocabulary buckets passed to
            ``tf.lookup.StaticVocabularyTable``. Defaults to 1000, the value
            that used to be hard-coded, so existing one-argument calls behave
            as before.

    Returns:
        The ``tf.lookup.StaticVocabularyTable`` built from the vocabulary.
    """
    words = tf.constant(truncated_vocabulary)
    # One consecutive int64 id per word, in list order.
    word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
    vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

    return tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

# Build the lookup table for the global vocabulary list assembled in the cells above.
lookup_table = create_lookup_table(truncated_vocabulary)

In [None]:
# Show the table object's repr as the cell output.
lookup_table