In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Input Data

In [None]:
# Directory holding the competition CSV files; defined once so the location
# can be changed in a single place.
INPUT_DIR = '/kaggle/input/nbme-score-clinical-patient-notes'

# Each file is read into its own DataFrame; later cells use these three
# names as module-level globals.
patient_notes_df = pd.read_csv(f'{INPUT_DIR}/patient_notes.csv')

features_df = pd.read_csv(f'{INPUT_DIR}/features.csv')

train_df = pd.read_csv(f'{INPUT_DIR}/train.csv')

# Data Functions

In [None]:
import ast

def take_pacient_note(pacient_note_num: int) -> str:
    """Return the history text of a single patient note.

    Looks up ``pacient_note_num`` in the ``pn_num`` column of the module-level
    ``patient_notes_df`` and returns the ``pn_history`` value of the first
    matching row.

    Args:
        pacient_note_num: Value to match against the ``pn_num`` column.

    Returns:
        The ``pn_history`` cell of the first matching row (presumably a
        ``str``; the column dtype is not enforced here).

    Raises:
        KeyError: If no row has the requested ``pn_num``.
    """
    matches = patient_notes_df.loc[
        patient_notes_df['pn_num'] == pacient_note_num, 'pn_history'
    ]
    if matches.empty:
        # Same exception type the positional lookup on an empty selection
        # used to raise, but with a message that names the missing note.
        raise KeyError(f"no patient note with pn_num={pacient_note_num!r}")
    # Positional access: the first match regardless of the original index.
    return matches.iloc[0]

def get_training_patient_note_numbers(case_number):
    """Return the ``pn_num`` values of ``train_df`` rows for one case.

    Args:
        case_number: Value to match against the ``case_num`` column.

    Returns:
        A NumPy array with one entry per matching row (duplicates are kept).
    """
    in_case = train_df['case_num'] == case_number
    note_numbers = train_df.loc[in_case, 'pn_num']
    return note_numbers.to_numpy()

def get_all_patient_note_numbers(case_number):
    """Return the ``pn_num`` values of ``patient_notes_df`` rows for one case.

    Args:
        case_number: Value to match against the ``case_num`` column.

    Returns:
        A NumPy array with one entry per matching row.
    """
    in_case = patient_notes_df['case_num'] == case_number
    note_numbers = patient_notes_df.loc[in_case, 'pn_num']
    return note_numbers.to_numpy()

def get_all_training_annotations(case_number):
    """Return the parsed ``annotation`` cells of ``train_df`` rows for one case.

    Each cell is a string holding a Python literal; it is parsed with
    ``ast.literal_eval``.

    Args:
        case_number: Value to match against the ``case_num`` column.

    Returns:
        A list with one parsed value per matching row, in row order.
    """
    in_case = train_df['case_num'] == case_number
    raw_annotations = train_df.loc[in_case, 'annotation'].to_numpy()
    return list(map(ast.literal_eval, raw_annotations))

# Looking at the Data

In [None]:
# Print each frame's name followed by the plain-text repr of its first 5 rows.
frames_to_preview = (
    ("patient_notes_df", patient_notes_df),
    ("features_df", features_df),
    ("train_df", train_df),
)
for frame_name, frame in frames_to_preview:
    print(frame_name)
    print(repr(frame.head(5)))

In [None]:
# The data holds 10 different cases, hence range(10).
# For every case, report how many distinct note numbers appear in the
# training table versus in the full patient-notes table.
for case_id in range(10):
    training_notes = set(get_training_patient_note_numbers(case_id))
    print(f"Case {case_id} trainning: {len(training_notes)}")
    all_notes = set(get_all_patient_note_numbers(case_id))
    print(f"Case {case_id} total    : {len(all_notes)}")
    print("\n")

# Preprocessing the Input Data

In [None]:
def preprocessing(X_batch):
    """Lower-case, clean and whitespace-tokenize text with TensorFlow string ops.

    The steps run in this order:
      1. lower-case the input;
      2. replace newline + carriage-return pairs (either order) with a space;
      3. replace every character that is not an ASCII letter, a digit,
         a hyphen or an apostrophe with a space;
      4. surround each hyphen with spaces so it becomes a separate token;
      5. split on whitespace.

    Args:
        X_batch: A string or string tensor accepted by ``tf.strings.lower``.

    Returns:
        The result of ``tf.strings.split`` applied to the cleaned text.
    """
    # NOTE: stop words are not removed by this function.
    # (pattern, replacement) pairs, applied sequentially in this order.
    substitutions = (
        (b"\n\r", b" "),
        (b"\r\n", b" "),
        (b"[^a-zA-Z0-9-']", b" "),
        (b"-", b" - "),
    )
    cleaned = tf.strings.lower(X_batch)
    for pattern, replacement in substitutions:
        cleaned = tf.strings.regex_replace(cleaned, pattern, replacement)
    return tf.strings.split(cleaned)

# Creating Our Vocabularies
 
To build a bag-of-words vocabulary, we will use the entire input dataset.

## From patient_notes_df

In [None]:
from collections import Counter
from IPython.display import clear_output

# Note numbers whose text could not be fetched or tokenized; printed in a
# later cell.
patient_note_number_fail = []

# Token counts over every case, plus one Counter per case written to its own
# CSV file under /kaggle/working/.
total_vocabulary = Counter()
for case_number in range(10):  # the data holds 10 cases, numbered 0-9
    case_vocabulary = Counter()
    case_patient_note_numbers = get_all_patient_note_numbers(case_number)
    case_patient_note_numbers_len = len(case_patient_note_numbers)

    # start=1 so the progress line ends at "N of N" instead of "N-1 of N".
    for pn_count, pacient_note_number in enumerate(case_patient_note_numbers, start=1):
        try:
            X = take_pacient_note(pacient_note_number)
            # Tokenize once and reuse the result for both counters.
            tokens = preprocessing(X).numpy()
        except Exception:
            # Best effort: remember the failing note and carry on. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during this long loop.
            patient_note_number_fail.append(pacient_note_number)
        else:
            case_vocabulary.update(tokens)
            total_vocabulary.update(tokens)
        finally:
            # Overwrite the previous progress line rather than flooding output.
            clear_output(wait=True)
            print(f"Case {case_number}: {pn_count} of {case_patient_note_numbers_len} patient note numbers")

    # One row per token: the Counter keys become 'word', the counts 'count'.
    case_vocabulary_df = pd.DataFrame.from_dict(case_vocabulary, orient='index').reset_index()
    case_vocabulary_df = case_vocabulary_df.rename(columns={'index':'word',0:'count'})
    case_vocabulary_df.to_csv(f"/kaggle/working/case_vocabulary_{case_number}.csv")

total_vocabulary_df = pd.DataFrame.from_dict(total_vocabulary, orient='index').reset_index()
total_vocabulary_df = total_vocabulary_df.rename(columns={'index':'word',0:'count'})
total_vocabulary_df.to_csv("/kaggle/working/total_vocabulary.csv")

In [None]:
print(f"Failed note numbers: {patient_note_number_fail}")

## From train_df

In [None]:
from collections import Counter
from IPython.display import clear_output

# (case_number, annotation) pairs that could not be tokenized. They used to be
# dropped silently; keeping them here makes failures inspectable afterwards.
case_annotation_fail = []

# Token counts over the training annotations of every case, plus one Counter
# per case written to its own CSV file under /kaggle/working/.
total_annotation_vocabulary = Counter()
for case_number in range(10):  # the data holds 10 cases, numbered 0-9
    case_annotation_vocabulary = Counter()
    case_annotations = get_all_training_annotations(case_number)
    case_annotations_len = len(case_annotations)

    # start=1 so the progress line ends at "N of N" instead of "N-1 of N".
    for ca_count, case_annotation in enumerate(case_annotations, start=1):
        # Each parsed training row is itself iterated, one annotation at a time.
        for annotation in case_annotation:
            try:
                # Tokenize once and reuse the result for both counters.
                tokens = preprocessing(annotation).numpy()
            except Exception:
                # Best effort: record the failure and carry on. Catching
                # Exception (not a bare except) keeps Ctrl-C / kernel
                # interrupt working during this long loop.
                case_annotation_fail.append((case_number, annotation))
            else:
                case_annotation_vocabulary.update(tokens)
                total_annotation_vocabulary.update(tokens)
            finally:
                # Overwrite the previous progress line rather than flooding output.
                clear_output(wait=True)
                print(f"Case {case_number}: {ca_count} of {case_annotations_len} case annotation numbers")

    # One row per token: the Counter keys become 'word', the counts 'count'.
    case_annotation_vocabulary_df = pd.DataFrame.from_dict(case_annotation_vocabulary, orient='index').reset_index()
    case_annotation_vocabulary_df = case_annotation_vocabulary_df.rename(columns={'index':'word',0:'count'})
    case_annotation_vocabulary_df.to_csv(f"/kaggle/working/case_annotation_vocabulary_{case_number}.csv")

total_annotation_vocabulary_df = pd.DataFrame.from_dict(total_annotation_vocabulary, orient='index').reset_index()
total_annotation_vocabulary_df = total_annotation_vocabulary_df.rename(columns={'index':'word',0:'count'})
total_annotation_vocabulary_df.to_csv("/kaggle/working/total_annotation_vocabulary.csv")

# Looking at Our Vocabulary

In [None]:
# Number of distinct tokens counted across all patient notes.
len(total_vocabulary)

In [None]:
# The 50 most frequent patient-note tokens, highest count first.
total_vocabulary.most_common(50)

In [None]:
# Number of distinct tokens counted across all training annotations.
len(total_annotation_vocabulary)

In [None]:
# The 50 most frequent annotation tokens, highest count first.
total_annotation_vocabulary.most_common(50)