## **Data Masking**

In [None]:
# %pip install transformers torch

In [8]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import spacy

#### Reading dataset

In [None]:
# Load the augmented clinical notes dataset (JSON Lines: one record per line) from the Hugging Face Hub.
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True)

In [None]:
# Preview the first rows to check which columns are available.
df.head()

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...


In [None]:
# Drop rows containing any missing value; rebind `df` rather than mutating it in place.
df = df.dropna()

In [None]:
# Number of rows remaining after dropping records with missing values.
len(df)

30000

### **CLINICAL-BERT NER**

In [1]:
def mask_word(word):
    """Mask a word while keeping its first two characters visible.

    Every character after the second is replaced by ``*`` so the result has
    the same length as the input (e.g. ``"neck"`` -> ``"ne**"``). Words of two
    characters or fewer are returned unchanged.
    """
    visible = 2
    hidden = len(word) - visible
    return word if hidden <= 0 else word[:visible] + "*" * hidden

In [None]:
model_name = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Number of overlapping tokens between consecutive chunks of a long text.
NER_STRIDE = 128

# Without `stride` the pipeline only processes the first `model_max_length` tokens
# of each text, so entities further into a long note would never be detected (and
# therefore never masked). With `stride` the whole text is processed in
# overlapping chunks.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    stride=NER_STRIDE,
)
df_masked_text = {'index': [], 'note': [], 'masked_note': [], 'full_note': [], 'masked_full_note': []}

# Mask PROBLEM/TREATMENT entities in both text columns of every row.
for index, row in tqdm(df.iterrows(), total=len(df), desc="Masked dataset"):
    note = row["note"]
    full_note = row["full_note"]

    df_masked_text['index'].append(index)
    df_masked_text['note'].append(note)
    df_masked_text['full_note'].append(full_note)

    for target_column, text in (("masked_note", note), ("masked_full_note", full_note)):
        ner_results = ner_pipeline(text)

        # Character intervals (start, end) of the entities to mask, in text order.
        to_mask = sorted(
            (entity["start"], entity["end"])
            for entity in ner_results
            if entity["entity_group"] in {"problem", "treatment"}
        )

        # Rebuild the text piece by piece: untouched gaps are copied verbatim,
        # entity spans are masked word by word (first two letters stay visible).
        pieces = []
        i = 0
        for start, end in to_mask:
            # Clip spans that overlap an already-emitted region so no character
            # is emitted twice; this keeps the masked text the same length as
            # the original.
            start = max(start, i)
            if start >= end:
                continue
            pieces.append(text[i:start])
            pieces.append(re.sub(r'\b\w+\b', lambda m: mask_word(m.group()), text[start:end]))
            i = end
        pieces.append(text[i:])

        df_masked_text[target_column].append("".join(pieces))

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cuda:0
Masked dataset:   0%|          | 4/30000 [00:00<1:26:03,  5.81it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Masked dataset:   0%|          | 8/30000 [00:01<55:43,  8.97it/s]  

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Masked dataset: 100%|██████████| 30000/30000 [49:09<00:00, 10.17it/s]


In [None]:
# Assemble the per-row lists collected during the Clinical-BERT pass into a DataFrame.
df_masked = pd.DataFrame(df_masked_text)

### **SPACY NER**

In [9]:
# spaCy model used for general-purpose NER (persons, dates, places, ...)
SPACY_MODEL = "en_core_web_lg"

# Only download the model when it is not already installed: `spacy.load` raises
# OSError for a missing package, so re-running this cell does not re-download it.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Entity types to mask
entities_to_mask = {"PERSON", "NORP", "DATE", "CARDINAL", "QUANTITY", "FAC", "ORG",
                    "GPE", "LOC", "LANGUAGE", "TIME"}
# Entity types replaced entirely by a single '*' instead of keeping the first two characters
entities_to_collapse = {"CARDINAL", "QUANTITY", "TIME"}

# Apply the spaCy masking on top of the masking already stored in df_masked.
for index, row in tqdm(df_masked.iterrows(), total=len(df_masked), desc="Masked dataset"):
    for raw_column, masked_column in (("note", "masked_note"), ("full_note", "masked_full_note")):
        text = row[raw_column]
        base = row[masked_column]
        if not isinstance(base, str):
            # No previous masking available for this cell: start from the raw text.
            base = text

        # Start from the already-masked text so the earlier PROBLEM/TREATMENT masks
        # are kept instead of being overwritten. mask_word() preserves word length,
        # so while both strings have the same length the entity offsets found on
        # the raw text are valid in `base`. If the lengths differ (e.g. this cell
        # is re-run after a previous spaCy pass), detect entities on `base` itself
        # so the offsets still match.
        ner_input = text if len(base) == len(text) else base
        doc = nlp(ner_input)

        # Character intervals to mask, and the subset collapsed to a single '*'.
        to_mask = []
        to_collapse = set()
        for ent in doc.ents:
            if ent.label_ in entities_to_mask:
                span = (ent.start_char, ent.end_char)
                to_mask.append(span)
                if ent.label_ in entities_to_collapse:
                    to_collapse.add(span)

        # Rebuild the text: gaps are copied verbatim, spans are collapsed or masked.
        pieces = []
        i = 0
        for start, end in sorted(to_mask):
            pieces.append(base[i:start])
            if (start, end) in to_collapse:
                pieces.append('*')
            else:
                pieces.append(re.sub(r'\b\w+\b', lambda m: mask_word(m.group()), base[start:end]))
            i = end
        pieces.append(base[i:])

        df_masked.at[index, masked_column] = "".join(pieces)

In [None]:
from google.colab import files

# Export the masked DataFrame. The raw `df` has no masked_note / masked_full_note
# columns, so saving it would produce a CSV without any masked text.
df_masked.to_csv('masked_dataset.csv', index=False)
files.download('masked_dataset.csv')

#### Reading masked dataset

In [6]:
import pandas as pd
import gdown

# Google Drive file ID of the exported masked dataset, and its direct-download URL
file_id = "1_9VFIupqIF22FrAaFUDIfq26oZmCEjqF"
url = f"https://drive.google.com/uc?id={file_id}"

# Local path the CSV is downloaded to
output = 'masked_dataset.csv'
gdown.download(url, output, quiet=False)

df_masked = pd.read_csv(output)
# Show the first masked note as a quick sanity check
df_masked.loc[0, 'masked_note']

Downloading...
From (original): https://drive.google.com/uc?id=1_9VFIupqIF22FrAaFUDIfq26oZmCEjqF
From (redirected): https://drive.google.com/uc?id=1_9VFIupqIF22FrAaFUDIfq26oZmCEjqF&confirm=t&uuid=6b2f671e-e727-45e4-8ccc-81708b753a57
To: /content/masked_dataset.csv
100%|██████████| 330M/330M [00:08<00:00, 37.7MB/s]


'A a sixteen year-old girl, presented to our Outpatient department with the complaints of di******** in th* ne** an* lo*** ba** as well as re********* of bo** mo*******. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to th* su******* co********* of th* ne** mu*****. There was a si****** be***** of th* ba** in the lumbar region. To counter th* ab****** po********* of th* ba** an* ne**, she would keep her limbs in a specific position to allow her body weight to be supported. Due to th* restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.\nShe had been experiencing th*** di********** for the past four months since when she was introduced to ol****pi** ta***** for the control of 

In [None]:
# Inspect the first masked full note. The masked columns live in `df_masked`
# (the raw `df` never has a masked_full_note column).
df_masked.loc[0, 'masked_full_note']

'A a sixteen year-old girl, presented to our Outpatient department with the complaints of di******** in th* ne** an* lo*** ba** as well as re********* of bo** mo*******. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to th* su******* co********* of th* ne** mu*****. There was a si****** be***** of th* ba** in the lumbar region. To counter th* ab****** po********* of th* ba** an* ne**, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.\\nShe had been experiencing th*** di********** for the past four months since when she was introduced to ol****pi** ta***** for the control of