## **Data Anonymization**

In [None]:
%pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import spacy
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### Reading dataset

In [2]:
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True)

In [3]:
df.head()

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...


In [4]:
df.dropna(inplace=True)

In [5]:
# Remove unwanted characters from the 'note' and 'full note' columns
df['note'] = df['note'].astype(str).str.replace(r'(\\n|\n|\r|\[|\]|\(|\))', '', regex=True)
df['full_note'] = df['full_note'].astype(str).str.replace(r'(\\n|\n|\r|\[|\]|\(|\))', '', regex=True)

In [6]:
len(df)

30000

In [7]:
import tiktoken

# tokenizer GPT-3.5/GPT-4
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

df["token_count_note"] = df["note"].apply(lambda x: len(enc.encode(x)))
df["word_count_note"] = df["note"].apply(lambda x: len(x.split()))

df["token_count_full_note"] = df["full_note"].apply(lambda x: len(enc.encode(x)))
df["word_count_full_note"] = df["full_note"].apply(lambda x: len(x.split()))

max_row_note = df.loc[df["token_count_note"].idxmax()]
longest_note = max_row_note["note"]
longest_tokens_note = max_row_note["token_count_note"]
longest_words_note = max_row_note["word_count_note"]

max_row_full_note = df.loc[df["token_count_full_note"].idxmax()]
longest_full_note = max_row_note["full_note"]
longest_tokens_full_note = max_row_note["token_count_full_note"]
longest_words_full_note = max_row_note["word_count_full_note"]

print("# Max tokens note:", longest_tokens_note)
print("# Max words note:", longest_words_note)

print("# Max tokens full note:", longest_tokens_full_note)
print("# Max words full note:", longest_words_full_note)

# Max tokens note: 524
# Max words note: 328
# Max tokens full note: 1042
# Max words full note: 659


In [8]:
import tiktoken

# tokenizer GPT-3.5/GPT-4
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

mean_tokens_note = df["token_count_note"].mean()
mean_words_note = df["word_count_note"].mean()
mean_tokens_full_note = df["token_count_full_note"].mean()
mean_words_full_note = df["word_count_full_note"].mean()

print("# Mean tokens note:", round(mean_tokens_note, 2))
print("# Mean words note:", round(mean_words_note, 2))

print("# Mean tokens full note:", round(mean_tokens_full_note, 2))
print("# Mean words full note:", round(mean_words_full_note, 2))

# Mean tokens note: 428.68
# Mean words note: 315.0
# Mean tokens full note: 725.2
# Mean words full note: 524.88


In [19]:
from collections import Counter

all_notes = " ".join(df["full_note"].tolist())

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
tokens = enc.encode(all_notes)
token_counts = Counter(tokens)

print("\nMost 10 token frequencies:")
for tok, freq in token_counts.most_common(10):
    print(enc.decode([tok]), "→", freq)

words = all_notes.split()
word_counts = Counter(words)

print("\nMost 10 words frequencies:")
for word, freq in word_counts.most_common(10):
    print(repr(word), "→", freq)


Most 10 token frequencies:
 the → 887493
. → 717873
, → 680847
 and → 576315
 of → 569023
 was → 508831
  → 412772
 a → 311006
 to → 290373
 with → 250820

Most 10 words frequencies:
'the' → 887302
'and' → 574528
'of' → 568885
'was' → 508564
'a' → 294743
'to' → 289914
'with' → 250717
'in' → 214066
'The' → 163937
'for' → 127999
