## **Data Randomization**

In [1]:
!pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=681d857ea01602cf4aea5eb57b84c0e98d21501b9ad0382a61837bb9276a3b37
  Stored in directory: /root/.cache/pip/wheels/cd/ef/ae/073b491b14d25e2efafcffca9e16b2ee6d114ec5c643ba4f06
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [2]:
import random
import re
import nltk
import pandas as pd
import calendar
import inflect
import spacy
import string

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
from word2number import w2n


# Fetch the NLTK resources this notebook relies on.
# 'punkt' / 'punkt_tab': tokenizer data (presumably what word_tokenize needs — verify
#   against the installed NLTK version).
# 'wordnet': corpus queried through nltk.corpus.wordnet for synonym lookup.
# 'omw-1.4': presumably the multilingual WordNet companion data — TODO confirm it is required.
# The last call is a bare expression, so its return value is what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

#### **Reading dataset**

In [3]:
# Location of the JSON-lines file holding the augmented clinical notes
NOTES_JSONL_URL = "hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl"

# One JSON document per line -> one DataFrame row
df = pd.read_json(NOTES_JSONL_URL, lines=True)

In [4]:
df.head()

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...


In [5]:
# Discard every row that has at least one missing value
df = df.dropna()

In [6]:
# Strip literal "\n" sequences, real newlines, carriage returns, square brackets
# and parentheses from the 'note' and 'full_note' columns
unwanted_chars = r'(\\n|\n|\r|\[|\]|\(|\))'

for column in ('note', 'full_note'):
    as_text = df[column].astype(str)
    df[column] = as_text.str.replace(unwanted_chars, '', regex=True)

In [7]:
len(df)

30000

#### **Util functions**

In [8]:
# Tokens that the randomization helpers leave untouched.
# Every lookup is done with `word.lower()`, so entries MUST be lowercase:
# the former "I" entry could never match and has been replaced by "i".
exclude_tokens = ["the", "be", "to", "of", "and", "a", "an", "in", "that", "these",
                  "those", "i", "it", "for", "not", "on", "with", "as", "you",
                  "do", "at"]

# Gendered words that get randomized (compared against lowercased words)
gender_terms = {
    "male", "female", "man", "woman", "boy", "girl",
    "he", "she", "his", "her", "him", "hers", "himself", "herself"
}

# synonym replacement
def random_synonym_replacement(text, prob=1.0):
    """Swap words of `text` for a lemma of their first WordNet synset.

    The text is tokenized with `word_tokenize`. Tokens whose lowercase form is
    in `exclude_tokens` are kept. Each remaining token is, with probability
    `prob`, replaced by a randomly chosen lemma name (underscores turned into
    spaces) of its first synset; tokens without any synset are kept.

    Returns the tokens joined with single spaces.
    """
    def substitute(token):
        # Protected tokens are never touched (and consume no random draw)
        if token.lower() in exclude_tokens:
            return token

        # Probability gate
        if not random.random() < prob:
            return token

        synsets = wordnet.synsets(token)
        if not synsets:
            return token

        candidates = [lemma.name().replace('_', ' ') for lemma in synsets[0].lemmas()]
        return random.choice(candidates) if candidates else token

    return ' '.join(substitute(token) for token in word_tokenize(text))


# word shuffle
def random_word_shuffle(text, prob=1.0):
    """Swap two non-excluded tokens of `text`, or scramble the only one.

    Tokens whose lowercase form is in `exclude_tokens` never move. With
    probability `prob`:
      * exactly one eligible token -> its characters are shuffled;
      * several eligible tokens    -> two distinct ones exchange positions.

    Returns the tokens joined with single spaces.
    """
    tokens = word_tokenize(text)
    movable = [pos for pos, token in enumerate(tokens)
               if token.lower() not in exclude_tokens]

    # Nothing eligible, or the probability gate says "leave it alone"
    if not movable or not random.random() < prob:
        return ' '.join(tokens)

    if len(movable) == 1:
        # shuffle characters in the single word
        only = movable[0]
        letters = list(tokens[only])
        if len(letters) > 1:
            random.shuffle(letters)
            tokens[only] = ''.join(letters)
    else:
        first = random.choice(movable)
        second = random.choice([pos for pos in movable if pos != first])
        tokens[first], tokens[second] = tokens[second], tokens[first]

    return ' '.join(tokens)


# typo injection
def typo_injection(text, prob=1.0):
    """Inject character-level typos into the words of `text`.

    The text is tokenized with `word_tokenize`. Every token whose lowercase
    form is not in `exclude_tokens` is, with probability `prob`, passed through
    four successive edits: swap of two adjacent characters, duplication of one
    character, deletion of one character and insertion of a random lowercase
    ASCII letter.

    The probability is applied exactly once per token. Previously it was
    checked a second time inside the helper, which returned None when that
    second check failed and made the final join raise TypeError for prob < 1.

    Returns the tokens joined with single spaces.
    """
    def introduce_typo(word):
        # Always returns a string; the probability gate lives in the caller.
        if len(word) == 0:
            return word

        # Swap two adjacent characters
        if len(word) >= 2:
            i = random.randint(0, len(word) - 2)
            word = word[:i] + word[i+1] + word[i] + word[i+2:]

        # Duplicate one character
        if len(word) > 0:
            i = random.randint(0, len(word) - 1)
            word = word[:i+1] + word[i] + word[i+1:]

        # Delete one character
        if len(word) >= 2:
            i = random.randint(0, len(word) - 1)
            word = word[:i] + word[i+1:]

        # Insert a random lowercase letter at any position (including the end)
        i = random.randint(0, len(word))
        c = random.choice(string.ascii_lowercase)
        word = word[:i] + c + word[i:]

        return word

    words = word_tokenize(text)

    noisy_words = [
        introduce_typo(word) if word.lower() not in exclude_tokens and random.random() < prob else word
        for word in words
    ]

    return ' '.join(noisy_words)



# random deletion
def random_deletion(text, prob=0.2):
    """Randomly drop words from `text`.

    Tokens whose lowercase form is in `exclude_tokens` are always kept; every
    other token is kept only when a random draw exceeds `prob`. If every token
    gets dropped, one randomly chosen original token is returned instead.

    Empty or whitespace-only input (and input that tokenizes to nothing) is
    returned unchanged; previously it reached `random.choice([])` and raised
    IndexError.

    Returns the surviving tokens joined with single spaces.
    """
    # Nothing to delete from: bail out before tokenizing
    if not text or not text.strip():
        return text

    words = word_tokenize(text)
    if not words:
        return text

    new_words = [
        word for word in words
        if word.lower() in exclude_tokens or random.random() > prob
    ]

    return ' '.join(new_words) if new_words else random.choice(words)


# randomize note
def randomize_note(text):
    """Run `text` through the full randomization chain and return the result.

    Stages, in order: synonym replacement (prob 1.0), word shuffle (prob 1.0),
    typo injection (prob 1.0) and random deletion (prob 0.2). Each stage
    receives the output of the previous one.
    """
    stages = (
        (random_synonym_replacement, 1.0),
        (random_word_shuffle, 1.0),
        (typo_injection, 1.0),
        (random_deletion, 0.2),
    )
    for stage, stage_prob in stages:
        text = stage(text, prob=stage_prob)
    return text

In [9]:
def replace_month_in_date(text):
    """Replace every month name in `text` with a different, random month.

    Both full names and three-letter abbreviations from the `calendar` module
    are recognised (case-insensitively, ignoring leading/trailing ',' and '.').
    An abbreviated month is replaced by an abbreviated one, a full name by a
    full name. Title-case and all-uppercase spellings are preserved; any other
    casing yields a lowercase replacement.

    Punctuation attached to the month token ("March," / "jan.") is kept
    around the replacement; previously it was silently dropped.

    Text without any month is returned untouched. Otherwise the words are
    re-joined with single spaces.
    """
    full_months = list(calendar.month_name)[1:]  # ['January', ..., 'December']
    abbr_months = list(calendar.month_abbr)[1:]  # ['Jan', ..., 'Dec']

    # abbreviation -> full name (both lowercase). An abbreviation that equals
    # its full name (e.g. "may" in the default locale) lands here too, so such
    # a month follows the abbreviated branch below.
    abbr_to_full = {abbr.lower(): full.lower() for abbr, full in zip(abbr_months, full_months)}
    all_months = {m.lower() for m in full_months + abbr_months}

    # Cheap substring pre-check: skip the split/join when no month can be present
    lowered_text = text.lower()
    if not any(month in lowered_text for month in all_months):
        return text

    new_words = []

    for word in text.split():
        core = word.strip(",.")
        lowered = core.lower()

        if lowered not in all_months:
            new_words.append(word)
            continue

        # Punctuation that surrounded the month, to be put back afterwards
        leading = word[:len(word) - len(word.lstrip(",."))]
        trailing = word[len(word.rstrip(",.")):]

        # Full lowercase name of the month being replaced
        current_full = abbr_to_full.get(lowered, lowered)

        # Pick any month except the current one
        new_month_full = random.choice([m for m in full_months if m.lower() != current_full])

        # Abbreviated in, abbreviated out
        new_month = new_month_full[:3] if lowered in abbr_to_full else new_month_full

        # Maintain the original capitalization style
        if core.isupper():
            new_month = new_month.upper()
        elif not core.istitle():
            new_month = new_month.lower()

        new_words.append(leading + new_month + trailing)

    return " ".join(new_words)

In [10]:
# inflect engine; used below through p.number_to_words to spell a number out as words
p = inflect.engine()

# Matches a run of one or more spelled-out number words ("zero" .. "thousand",
# plus the connector "and"), case-insensitively. Words in the run may be
# separated by whitespace and/or hyphens, and a separator that follows the last
# word can be part of the match (e.g. "sixteen " in "sixteen year-old").
# Note that a standalone "and" also matches this pattern.
number_words_pattern = re.compile(
    r'\b(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|'
    r'eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|'
    r'twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|and)'
    r'(?:[\s-]+)?)+\b',
    flags=re.IGNORECASE
)

# Matches a run of digits delimited by word boundaries on both sides
digit_pattern = re.compile(r'\b\d+\b')

# Function to round to the nearest multiple of `base`
def round_to_nearest(num, base=5):
    """Round `num` to the nearest multiple of `base`, never going below `base`.

    The result is floored at `base` so that small values are not rounded down
    to zero. With the default `base=5` integer inputs give the same results as
    before (0, 1, 2 and negatives -> 5); the floor now follows `base` instead
    of being the hard-coded constant 5 with a threshold only valid for base 5.

    Returns an int when `base` is an int.
    """
    rounded = int(base * round(float(num) / base))
    return max(rounded, base)


# Function that converts word -> num -> round -> word
def get_replace_word_number(entity_type):
    """Build a `re.sub` callback that perturbs spelled-out numbers.

    The callback parses the matched words with `w2n.word_to_num`, rounds the
    value with `round_to_nearest` and spells it out again with
    `p.number_to_words`. For `entity_type == "TIME"` any value >= 23 becomes
    "twenty-four".

    The separator (whitespace / hyphens) that the pattern may have captured
    after the last number word is put back unchanged. Previously a single
    space was always appended, which turned "sixteen-year-old" into
    "fifteen year-old" and added a stray space when the match had no trailing
    separator.

    If anything fails (e.g. the match is not a parsable number such as a lone
    "and"), the matched text is returned unchanged.
    """
    def replace_word_number(match):
        matched = match.group()

        # Trailing separator swallowed by the pattern, to be restored as-is
        separator = re.search(r'[\s-]*$', matched).group()

        try:
            val = w2n.word_to_num(matched)

            if entity_type == "TIME" and val >= 23:
                return "twenty-four" + separator

            rounded = round_to_nearest(val)

            return p.number_to_words(rounded) + separator
        except Exception:
            # Best effort: leave text we cannot interpret untouched
            return matched
    return replace_word_number


# Function for replace digit number
def get_replace_digit_number(entity_type):
    """Build a `re.sub` callback that perturbs numbers written with digits.

    The callback converts the matched text to an int and returns the value
    rounded by `round_to_nearest`, as a string. When `entity_type` is "TIME",
    values of 23 or more are replaced by "24". On any error the matched text
    is returned as it was.
    """
    clamp_to_day = entity_type == "TIME"

    def replace_digit_number(match):
        original = match.group()
        try:
            number = int(original)
            if clamp_to_day and number >= 23:
                return "24"
            return str(round_to_nearest(number))
        except Exception:
            return original

    return replace_digit_number

### **CLINICAL-BERT NER**

In [11]:
model_name = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


def _randomize_clinical_entities(text, entities):
    """Return `text` with every "problem"/"treatment" entity randomized.

    Entities are rewritten at the character span reported by the pipeline
    (`start`/`end`), so the exact detected occurrence is replaced with its
    original casing. Replacing by `entity["word"]` is unreliable: the model is
    uncased and the aggregated word is detokenized, so it may not occur
    verbatim in the text, and the first occurrence is not necessarily the
    detected one.

    If the pipeline did not provide offsets, the previous first-occurrence
    string replacement is used as a fallback.
    """
    targets = [e for e in entities if e["entity_group"] in {"problem", "treatment"}]

    # Fallback: no character offsets available for at least one entity
    if any(e.get("start") is None or e.get("end") is None for e in targets):
        random_text = text
        for entity in targets:
            original = entity["word"]
            random_text = random_text.replace(original, randomize_note(original), 1)
        return random_text

    pieces = []
    cursor = 0
    for entity in sorted(targets, key=lambda e: e["start"]):
        start, end = entity["start"], entity["end"]
        if start < cursor:
            # Overlaps a span that was already rewritten; skip it
            continue
        pieces.append(text[cursor:start])
        pieces.append(randomize_note(text[start:end]))
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)


df_random_text = {'index': [], 'note': [], 'random_note': [], 'full_note': [], 'random_full_note': []}

# Iterate over all dataset
for index, row in tqdm(df.iterrows(), total=len(df), desc="Random dataset"):
    note = row["note"]
    full_note = row["full_note"]

    df_random_text['index'].append(index)
    df_random_text['note'].append(note)
    df_random_text['full_note'].append(full_note)

    # Randomize both text fields; each result goes to its own output column
    for source_text, target_column in ((note, 'random_note'), (full_note, 'random_full_note')):
        ner_results = ner_pipeline(source_text)
        df_random_text[target_column].append(
            _randomize_clinical_entities(source_text, ner_results)
        )

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cuda:0
Random dataset:   0%|          | 0/30000 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)
Random dataset:   0%|          | 5/30000 [00:05<5:09:07,  1.62it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Random dataset: 100%|██████████| 30000/30000 [58:41<00:00,  8.52it/s]


In [12]:
# Turn the dict of equally long lists collected in df_random_text into a DataFrame
# (one column per key: index, note, random_note, full_note, random_full_note)
df_random = pd.DataFrame(df_random_text)

In [13]:
df_random['random_note'][0]

'A a sixteen year-old girl, presented to our Outpatient department with the complaints of dtssicomfor in the ccerixn and bcvkk olwerm as well as imitatioonm of motlslity srrcwuture orgnnzci. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the mssuculms cnotaccction of the. There was a sidwyasss of the tebnddng in the lumbar region. To counter the aabounrml of the ddoqusm and paatlcment, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.She had been experiencing these olbfbrtu for the past four months since when she was introduced to nlzznaapine tablets for the control of e

### **SPACY NER**

In [14]:
# spaCy model: load it if it is already installed and download it only when it
# is missing, instead of re-downloading the package on every run.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [15]:
def _randomize_sensitive_info(text):
    """Return `text` with spaCy entities and gendered words randomized.

    * CARDINAL / QUANTITY / DATE / TIME entities: spelled-out and digit numbers
      are perturbed via `get_replace_word_number` / `get_replace_digit_number`;
      DATE entities additionally get their month replaced.
    * PERSON / NORP / FAC / ORG / GPE / LOC / LANGUAGE entities: passed through
      `randomize_note`.
    * Any other entity is copied unchanged.

    Afterwards every word found in `gender_terms` (case-insensitive) is passed
    through `randomize_note`.
    """
    doc = nlp(text)

    pieces = []   # collected fragments; joined once at the end
    cursor = 0    # end offset of the last entity already handled

    for ent in sorted(doc.ents, key=lambda e: e.start_char):
        pieces.append(text[cursor:ent.start_char])
        label = ent.label_

        if label in {"CARDINAL", "QUANTITY", "DATE", "TIME"}:
            ent_text = number_words_pattern.sub(get_replace_word_number(label), ent.text)
            ent_text = digit_pattern.sub(get_replace_digit_number(label), ent_text)
            if label == "DATE":
                ent_text = replace_month_in_date(ent_text)
        elif label in {"PERSON", "NORP", "FAC", "ORG",
                       "GPE", "LOC", "LANGUAGE"}:
            ent_text = randomize_note(ent.text)
        else:
            ent_text = ent.text

        pieces.append(ent_text)
        cursor = ent.end_char

    pieces.append(text[cursor:])
    randomized_text = "".join(pieces)

    return re.sub(
        r'\b\w+\b',
        lambda m: randomize_note(m.group()) if m.group().lower() in gender_terms else m.group(),
        randomized_text
    )


# NOTE: the randomized columns are overwritten in place, so re-running this
# cell randomizes text that has already been randomized.
for index, row in tqdm(df_random.iterrows(), total=len(df_random), desc="Randomizing sensitive info"):
    for field in ("random_note", "random_full_note"):
        df_random.at[index, field] = _randomize_sensitive_info(row[field])

Randomizing sensitive info: 100%|██████████| 30000/30000 [1:22:22<00:00,  6.07it/s]


In [16]:
df_random['random_note'][0]

'A a fifteen year-old ilgrw, presented to our Outpatient department with the complaints of dtssicomfor in the ccerixn and bcvkk olwerm as well as imitatioonm of motlslity srrcwuture orgnnzci. hqhS was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. Seeu would keep hyre head turned to the right and upwards due to the mssuculms cnotaccction of the. There was a sidwyasss of the tebnddng in the lumbar region. To counter the aabounrml of the ddoqusm and mnapcztlaae, erhh would keep rjre limbs in a specific position to allow hrcr body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, hesy would require assistance in standing and walking. hfhS would require eeor parents to help yhhe with daily chores, including all activities of self-care.ehhj had been experiencing these olbfbrtu for the past five months since when sehg was introduced to nlzznaapine tablets for th

In [17]:
# Write the CSV first so the result is saved on any runtime; previously the
# Colab-only import came first and aborted the cell before anything was written.
df_random.to_csv('randomized_dataset.csv', index=False)

# Browser download is a Colab convenience; skip it when not running on Colab.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('randomized_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>