## **Data Randomization**

In [None]:
# !pip install word2number

In [None]:
import random
import re
import nltk
import pandas as pd
import calendar
import inflect
import spacy

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
from word2number import w2n


# Fetch the NLTK resources this notebook relies on (tokenizer models and WordNet data).
for resource in ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

#### **Reading dataset**

In [None]:
# Load the augmented clinical notes dataset from an hf:// path. The file is in
# JSON Lines format (one JSON record per line), hence lines=True.
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True)

In [None]:
df.head()

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...


In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
len(df)

30000

#### **Util functions**

In [None]:
# Articles and demonstratives that the augmentation helpers never alter.
exclude_tokens = "a an the this that these those".split()

# Gendered nouns and pronouns to mask.
gender_terms = set(
    "male female man woman boy girl "
    "he she his her him hers himself herself".split()
)

# synonym replacement
def random_synonym_replacement(text, prob=0.3):
    """Replace tokens of ``text`` with a lemma drawn from WordNet.

    The text is tokenized with ``word_tokenize``. Each token that is not in
    ``exclude_tokens`` is, with probability ``prob``, replaced by a randomly
    chosen lemma of its first WordNet synset (underscores become spaces).
    Tokens without any synset are kept unchanged.

    Args:
        text: Input string.
        prob: Per-token probability of attempting a replacement.

    Returns:
        The resulting tokens joined with single spaces.
    """
    replaced = []
    for token in word_tokenize(text):
        candidates = []
        # Protected tokens never trigger the random draw.
        if token.lower() not in exclude_tokens and random.random() < prob:
            synsets = wordnet.synsets(token)
            if synsets:
                # Only the first synset is consulted.
                candidates = [lemma.name().replace('_', ' ') for lemma in synsets[0].lemmas()]
        replaced.append(random.choice(candidates) if candidates else token)
    return ' '.join(replaced)


# word shuffle
def random_word_shuffle(text, prob=0.2):
    """Swap two tokens of ``text`` with probability ``prob``.

    Only tokens that are not in ``exclude_tokens`` can be swapped, and a swap
    happens only when at least two such tokens exist. At most one swap is
    performed per call.

    Args:
        text: Input string.
        prob: Probability of performing the swap.

    Returns:
        The tokens (swapped or not) joined with single spaces.
    """
    tokens = word_tokenize(text)
    movable = [pos for pos, token in enumerate(tokens) if token.lower() not in exclude_tokens]

    # The random draw is made on every call, before the size check.
    roll = random.random()
    if roll < prob and len(movable) > 1:
        first = random.choice(movable)
        second = random.choice([pos for pos in movable if pos != first])
        tokens[first], tokens[second] = tokens[second], tokens[first]

    return ' '.join(tokens)


# typo injection
def typo_injection(text, prob=0.2):
    """Inject typos into ``text`` by transposing adjacent characters.

    Each token that is not in ``exclude_tokens`` is selected with probability
    ``prob``. A selected token longer than three characters gets two
    neighbouring characters swapped at a random position; shorter tokens are
    left as they are.

    Args:
        text: Input string.
        prob: Per-token probability of being selected for a typo.

    Returns:
        The tokens joined with single spaces.
    """
    corrupted = []
    for token in word_tokenize(text):
        selected = token.lower() not in exclude_tokens and random.random() < prob
        if selected and len(token) > 3:
            pos = random.randint(0, len(token) - 2)
            # Transpose the characters at pos and pos + 1.
            token = token[:pos] + token[pos + 1] + token[pos] + token[pos + 2:]
        corrupted.append(token)
    return ' '.join(corrupted)


# random deletion
def random_deletion(text, prob=0.2):
    """Randomly drop tokens from ``text``.

    Tokens in ``exclude_tokens`` are always kept; every other token is kept
    only when a random draw exceeds ``prob``. Inputs with at most one token
    are returned unchanged. If every token ends up dropped, a single random
    token from the input is returned instead.

    Args:
        text: Input string.
        prob: Deletion threshold applied to each non-excluded token.

    Returns:
        The surviving tokens joined with single spaces.
    """
    tokens = word_tokenize(text)
    if len(tokens) < 2:
        return text

    survivors = []
    for token in tokens:
        if token.lower() not in exclude_tokens and random.random() <= prob:
            continue  # token is deleted
        survivors.append(token)

    if not survivors:
        return random.choice(tokens)
    return ' '.join(survivors)


# randomize note
def randomize_note(text):
    """Run ``text`` through the full augmentation chain.

    The stages are applied in order, each on the output of the previous one:
    synonym replacement (0.8), word shuffle (0.6), typo injection (0.8) and
    random deletion (0.6).

    Args:
        text: Input string.

    Returns:
        The perturbed string.
    """
    stages = (
        (random_synonym_replacement, 0.8),
        (random_word_shuffle, 0.6),
        (typo_injection, 0.8),
        (random_deletion, 0.6),
    )
    for augment, stage_prob in stages:
        text = augment(text, prob=stage_prob)
    return text

In [None]:
def replace_month_in_date(text):
    """Replace every month name in ``text`` with a different, random month.

    Full names ("January") and three-letter abbreviations ("Jan") are matched
    case-insensitively as whole words. Only the month itself is substituted,
    so adjacent punctuation ("January,", "Feb.", "(March") and the original
    whitespace, including newlines, are left untouched.

    An abbreviated month is replaced by an abbreviation and a full name by a
    full name. "May" is both a full name and an abbreviation and is handled
    as an abbreviation. Title-case and all-caps spellings keep their casing;
    any other spelling yields a lower-case month.

    Args:
        text: String that may contain month names.

    Returns:
        ``text`` with each month swapped for another one; unchanged when no
        month is present.
    """
    full_months = list(calendar.month_name)[1:]  # ['January', ..., 'December']
    abbr_months = list(calendar.month_abbr)[1:]  # ['Jan', ..., 'Dec']

    # mapping from abbreviation to full name (both lower-cased)
    abbr_to_full = {abbr.lower(): full.lower() for abbr, full in zip(abbr_months, full_months)}

    # Whole-word matcher; longer alternatives come first so a full name is
    # always preferred over its own abbreviation.
    alternatives = sorted(set(full_months + abbr_months), key=len, reverse=True)
    month_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(name) for name in alternatives) + r")\b",
        flags=re.IGNORECASE,
    )

    def swap_month(match):
        original = match.group(0)
        lowered = original.lower()
        current_full = abbr_to_full.get(lowered, lowered)

        # pick any month except the current one
        new_month = random.choice([m for m in full_months if m.lower() != current_full])

        # if the original is abbreviated, the replacement is abbreviated too
        if lowered in abbr_to_full:
            new_month = new_month[:3]

        # maintain the original capitalization
        if original.istitle():
            return new_month
        if original.isupper():
            return new_month.upper()
        return new_month.lower()

    return month_pattern.sub(swap_month, text)

In [None]:
# inflect engine used to spell integers out as English words (number -> words)
p = inflect.engine()

# Matches a run of one or more English number words ("zero" .. "ninety",
# "hundred", "thousand", plus the connector "and"), case-insensitively.
# Each word may be followed by whitespace and/or hyphens, so a match can
# include a trailing separator as well as the words themselves.
number_words_pattern = re.compile(
    r'\b(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|'
    r'eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|'
    r'twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|and)'
    r'(?:[\s-]+)?)+\b',
    flags=re.IGNORECASE
)

# Matches a standalone run of digits delimited by word boundaries.
digit_pattern = re.compile(r'\b\d+\b')

# Function to round to the nearest multiple of `base`
def round_to_nearest(num, base=5):
    """Round ``num`` to the nearest multiple of ``base``, never below ``base``.

    Values that would round to zero (or to a negative multiple) are clamped
    to ``base`` itself, so the result is always a positive multiple of
    ``base``. With the default ``base=5`` this means 0, 1 and 2 map to 5.

    Args:
        num: Number to round.
        base: Positive step to round to.

    Returns:
        The rounded value as an ``int``.
    """
    rounded = int(base * round(float(num) / base))
    # Clamp using `base` rather than a literal 5 so other bases behave consistently.
    return rounded if rounded >= base else base


# Function that converts word -> num -> round -> word
def get_replace_word_number(entity_type):
    """Build a ``re.sub`` callback that rounds numbers written as words.

    The callback converts the matched words to a number with ``w2n``, rounds
    it with ``round_to_nearest`` and spells the result out again with the
    ``inflect`` engine ``p``. For ``entity_type == "TIME"`` any value of 23
    or more becomes "twenty-four".

    The separator (whitespace or hyphen) that ``number_words_pattern`` may
    capture after the last number word is put back exactly as it was
    matched, so "sixteen-year" keeps its hyphen and "four." does not gain a
    space before the period.

    Args:
        entity_type: spaCy entity label of the span being processed.

    Returns:
        A function taking a regex match and returning its replacement string.
        If the match cannot be converted it is returned unchanged.
    """
    def replace_word_number(match):
        matched = match.group()
        # Split off whatever whitespace/hyphens trail the number words.
        trailing = re.search(r'[\s-]*\Z', matched).group()
        words = matched[:len(matched) - len(trailing)]
        try:
            val = w2n.word_to_num(words)

            if entity_type == "TIME" and val >= 23:
                return "twenty-four" + trailing

            rounded = round_to_nearest(val)

            return p.number_to_words(rounded) + trailing
        except Exception:
            # Best effort: e.g. a lone "and" is not a number; keep the text as is.
            return matched
    return replace_word_number


# Function for replace digit number
def get_replace_digit_number(entity_type):
    """Build a ``re.sub`` callback that rounds numbers written with digits.

    The callback parses the matched digits as an integer and returns the
    value produced by ``round_to_nearest`` as a string. For
    ``entity_type == "TIME"`` any value of 23 or more becomes "24".

    Args:
        entity_type: spaCy entity label of the span being processed.

    Returns:
        A function taking a regex match and returning its replacement string.
        If anything fails, the matched text is returned unchanged.
    """
    is_time = entity_type == "TIME"

    def replace_digit_number(match):
        original = match.group()
        try:
            number = int(original)
            capped = is_time and number >= 23
            return "24" if capped else str(round_to_nearest(number))
        except Exception:
            return original

    return replace_digit_number

### **CLINICAL-BERT NER**

In [None]:
model_name = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Entity groups whose surface text gets perturbed.
CLINICAL_ENTITY_GROUPS = {"problem", "treatment"}


def _randomize_clinical_entities(text):
    """Perturb every "problem"/"treatment" entity found in ``text``.

    Entities are located with ``ner_pipeline`` and replaced in place using
    their character offsets (``start``/``end``). Splicing by offset is needed
    because ``entity["word"]`` is the tokenizer's decoded text (lower-cased
    for this uncased model), which often differs from the original substring
    or matches an earlier, unrelated occurrence when used with
    ``str.replace``.

    NOTE(review): inputs longer than the model's maximum sequence length are
    presumably truncated by the pipeline, leaving the tail unprocessed.
    Confirm, and consider chunking if so.

    Args:
        text: Clinical note to process.

    Returns:
        The note with the selected entities passed through ``randomize_note``.
    """
    entities = [
        entity for entity in ner_pipeline(text)
        if entity["entity_group"] in CLINICAL_ENTITY_GROUPS
    ]

    # Offsets are unavailable with some tokenizers; fall back to text search.
    if any(entity.get("start") is None or entity.get("end") is None for entity in entities):
        random_text = text
        for entity in entities:
            original = entity["word"]
            random_text = random_text.replace(original, randomize_note(original), 1)
        return random_text

    pieces = []
    cursor = 0
    for entity in sorted(entities, key=lambda e: e["start"]):
        start, end = entity["start"], entity["end"]
        if start < cursor:
            continue  # overlaps a span that was already rewritten
        pieces.append(text[cursor:start])
        pieces.append(randomize_note(text[start:end]))
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


df_random_text = {'index': [], 'note': [], 'random_note': [], 'full_note': [], 'random_full_note': []}

# Iterate over all dataset
for index, row in tqdm(df.iterrows(), total=len(df), desc="Random dataset"):
    note = row["note"]
    full_note = row["full_note"]

    df_random_text['index'].append(index)
    df_random_text['note'].append(note)
    df_random_text['full_note'].append(full_note)
    df_random_text['random_note'].append(_randomize_clinical_entities(note))
    df_random_text['random_full_note'].append(_randomize_clinical_entities(full_note))

Device set to use cpu
Random dataset:   0%|          | 2/30000 [00:08<37:18:04,  4.48s/it]


In [None]:
df_random = pd.DataFrame(df_random_text)

In [None]:
df_random['random_note'][0]

'A a sixteen year-old girl, presented to our Outpatient department with the complaints of nekc berht abck as well as of motoin. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustaiend of contratcion enck umscle. There was crawbise a of in the lumbar region. To counter unnatuarl the lcoation, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.\nShe had been experiencing thees for the past four months since when she was introduced to loanzapine tablets for the control of exacebrate lilness. This was not her first experience with durg over the past seven years since she 

### **SPACY NER**

In [None]:
SPACY_MODEL = "en_core_web_lg"

# Download the model only when it is not installed yet, so that re-running
# the notebook does not fetch the large package every time.
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

# spaCy model
nlp = spacy.load(SPACY_MODEL)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Trial run of the spaCy-based randomization: the loop stops after the row
# with index 0.
# NOTE(review): the result is written back into df_random, so the
# full-dataset loop that follows processes this row a second time. Confirm
# this is intended.
for index, row in tqdm(df_random.iterrows(), total=len(df_random), desc="Randomizing sensitive info"):
    for field in ("random_note", "random_full_note"):
        text = row[field]
        doc = nlp(text)

        randomized_text = ""
        i = 0  # end offset of the last entity already copied

        for ent in sorted(doc.ents, key=lambda e: e.start_char):
            start, end = ent.start_char, ent.end_char
            # copy the untouched text between the previous entity and this one
            randomized_text += text[i:start]
            ent_text = ent.text

            if ent.label_ in {"CARDINAL", "QUANTITY", "DATE", "TIME"}:
                # round numbers written as words first, then those written as digits
                replace_word_fn = get_replace_word_number(ent.label_)
                replace_digit_fn = get_replace_digit_number(ent.label_)

                ent_text = number_words_pattern.sub(replace_word_fn, ent_text)
                ent_text = digit_pattern.sub(replace_digit_fn, ent_text)

                if ent.label_ == "DATE":
                    ent_text = replace_month_in_date(ent_text)

            elif ent.label_ in {"PERSON", "NORP", "FAC", "ORG",
                                "GPE", "LOC", "LANGUAGE"}:
                ent_text = randomize_note(ent_text)

            # entities with any other label are copied unchanged
            randomized_text += ent_text
            i = end

        randomized_text += text[i:]
        df_random.at[index, field] = randomized_text

    if index == 0:
        break

In [None]:
# Case-insensitive whole-word matcher for the terms listed in `gender_terms`.
gender_pattern = re.compile(
    r"\b(?:" + "|".join(sorted(map(re.escape, gender_terms), key=len, reverse=True)) + r")\b",
    flags=re.IGNORECASE,
)

_NUMERIC_LABELS = {"CARDINAL", "QUANTITY", "DATE", "TIME"}
_IDENTITY_LABELS = {"PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "LANGUAGE"}


def _mask_gender_terms(segment):
    """Pass every gendered term found in ``segment`` through ``randomize_note``."""
    return gender_pattern.sub(lambda m: randomize_note(m.group(0)), segment)


def _randomize_sensitive_info(text):
    """Randomize the sensitive spans of ``text`` found by spaCy.

    Numeric and temporal entities are rounded (and the month of a DATE is
    swapped), identity-like entities go through ``randomize_note``, and
    entities with any other label are copied unchanged. Gendered terms are
    masked in all text lying outside the entities, not only after the last
    one.

    Args:
        text: Note to process.

    Returns:
        The rewritten note, with every entity kept in its original position.
    """
    doc = nlp(text)
    pieces = []
    cursor = 0  # end offset of the last entity already handled

    for ent in sorted(doc.ents, key=lambda e: e.start_char):
        start, end = ent.start_char, ent.end_char
        pieces.append(_mask_gender_terms(text[cursor:start]))
        ent_text = ent.text

        if ent.label_ in _NUMERIC_LABELS:
            # round numbers written as words first, then those written as digits
            ent_text = number_words_pattern.sub(get_replace_word_number(ent.label_), ent_text)
            ent_text = digit_pattern.sub(get_replace_digit_number(ent.label_), ent_text)

            if ent.label_ == "DATE":
                ent_text = replace_month_in_date(ent_text)

        elif ent.label_ in _IDENTITY_LABELS:
            ent_text = randomize_note(ent_text)

        # Append in every branch; otherwise rewritten entities would be dropped.
        pieces.append(ent_text)
        cursor = end

    pieces.append(_mask_gender_terms(text[cursor:]))
    return "".join(pieces)


for index, row in tqdm(df_random.iterrows(), total=len(df_random), desc="Randomizing sensitive info"):
    for field in ("random_note", "random_full_note"):
        df_random.at[index, field] = _randomize_sensitive_info(row[field])

In [None]:
df_random['random_note'][0]

A a fifteen year-old girl, presented to our Outpatient department with the complaints of nekc berht abck as well as of motoin. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustaiend of contratcion enck umscle. There was crawbise a of in the lumbar region. To counter nunatuarl the lcoation, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.
She had been experiencing thees for the past five months since when she was introduced to loanzapine tablets for the control of exacebrate lilness. This was not her first experience with durg over the past five years since she had