# Create Dataset Statistics

In [1]:
import json
import os
import statistics

from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import AutoTokenizer

In [7]:
# Load dataset
# mimic4_path = '/home/s_hegs02/mimic-iv-note-di/dataset/all.json'
# Short references (BHC)
mimic4_path = '/home/s_hegs02/mimic-iv-note-di-bhc/dataset/all.json'

# The file is parsed line by line, one JSON document per line.
# Decode explicitly as UTF-8 instead of relying on the platform locale, and
# skip blank lines so a stray empty (e.g. trailing) line does not abort the
# whole load with a JSONDecodeError.
with open(mimic4_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Print total entries
print(f"Total entries: {len(dataset)}")

Total entries: 100175


In [12]:
# Optionally keep only the first 1000 notes for a quick run (uncomment to enable)
# dataset = dataset[:1000]

In [8]:
# Load Llama 2 tokenizer to determine number of tokens
model_name = 'meta-llama/Llama-2-7b-hf'
# Read the Hugging Face access token from the environment instead of pasting
# it into the notebook, so the secret never ends up in a saved/shared file.
# When HF_TOKEN is unset or empty, fall back to None so that from_pretrained
# uses the credentials stored by `huggingface-cli login` (if any).
hf_token = os.environ.get('HF_TOKEN') or None

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

In [9]:
# Fields of each dataset entry to analyse, and the placeholder string whose
# occurrences are counted as de-identified fields.
keys = ['text', 'summary']
deidentified_field = '___'

for k in keys:

    # Per-entry counts for the current field
    num_sentences = []
    num_words = []
    num_chars = []
    num_tokens = []
    num_deidentified_fields = []

    for note in dataset:
        entry = note[k]
        num_sentences.append(len(sent_tokenize(entry)))
        num_words.append(len(word_tokenize(entry)))
        num_chars.append(len(entry))
        num_tokens.append(len(tokenizer.tokenize(entry)))
        num_deidentified_fields.append(entry.count(deidentified_field))

    # (label used in the printed line, collected values), in output order
    metrics = [
        ('sentences', num_sentences),
        ('words', num_words),
        ('chars', num_chars),
        ('tokens', num_tokens),
        ('deidentified fields', num_deidentified_fields),
    ]

    # Determine average and standard deviation using statistics module
    # (statistics.stdev is the sample standard deviation and needs at least
    # two entries). Round to one decimal place.
    for label, values in metrics:
        print(f"{k} - mean number of {label}: {statistics.mean(values):.1f}")

    for label, values in metrics:
        print(f"{k} - std of number of {label}: {statistics.stdev(values):.1f}")
    print()

text - mean number of sentences: 33.0
text - mean number of words: 552.0
text - mean number of chars: 3029.9
text - mean number of tokens: 858.6
text - mean number of deidentified fields: 11.5
text - std of number of sentences: 19.0
text - std of number of words: 314.0
text - std of number of chars: 1736.4
text - std of number of tokens: 498.3
text - std of number of deidentified fields: 9.7

summary - mean number of sentences: 6.5
summary - mean number of words: 113.2
summary - mean number of chars: 604.4
summary - mean number of tokens: 145.4
summary - mean number of deidentified fields: 1.1
summary - std of number of sentences: 2.6
summary - std of number of words: 47.4
summary - std of number of chars: 251.0
summary - std of number of tokens: 61.4
summary - std of number of deidentified fields: 1.7

