## **Data Masking**

In [None]:
# %pip install transformers torch

In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import spacy
import torch

#### Reading dataset

In [None]:
# Load the clinical-notes dataset; lines=True parses the file as JSON Lines
# (one JSON record per line).
# NOTE(review): the hf:// URL presumably requires `huggingface_hub` to be installed — confirm.
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True)

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

In [None]:
# Row count of df at this point (after the dropna cell above).
len(df)

### **CLINICAL-BERT NER**

In [None]:
exclude_tokens = ["a", "an", "the", "this", "that", "these", "those"]


def mask_word(word):
    """Partially mask a word, keeping only its first two characters visible.

    Example: "neck" -> "ne**".

    The word is returned unchanged when it has at most two characters or when
    it appears in ``exclude_tokens`` (compared case-insensitively).
    """
    visible = 2
    if len(word) <= visible or word.lower() in exclude_tokens:
        return word
    # Pad the visible prefix with "*" up to the original length.
    return word[:visible].ljust(len(word), "*")

In [None]:
model_name = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# GPU if available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Make sure the tokenizer's window is not larger than what the model can
# actually process, so that long notes are split into usable chunks.
max_positions = getattr(model.config, "max_position_embeddings", None)
if max_positions and tokenizer.model_max_length > max_positions:
    tokenizer.model_max_length = max_positions

# Clinical notes are frequently longer than a single model window. With
# `stride`, the pipeline is applied to the whole text in overlapping chunks
# (here 128 tokens of overlap) instead of only the part that fits in one
# window, so entities late in a long note are detected and masked as well.
# `stride` only works with fast tokenizers and an aggregation strategy.
# NOTE(review): requires a transformers release whose token-classification
# pipeline accepts `stride` — confirm the installed version.
ner_stride = 128 if tokenizer.is_fast else None

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    stride=ner_stride,
    device=device,
)
df_masked_text = {'index': [], 'note': [], 'masked_note': [], 'full_note': [], 'masked_full_note': []}

# Walk through every row; for each of the two text columns, store the original
# text and a copy in which PROBLEM / TREATMENT entities are partially masked.
for index, row in tqdm(df.iterrows(), total=len(df), desc="Masked dataset"):
    df_masked_text['index'].append(index)

    for source_col, masked_col in (('note', 'masked_note'), ('full_note', 'masked_full_note')):
        text = row[source_col]
        df_masked_text[source_col].append(text)

        # Character spans (start, end) of the entities to mask, in text order
        spans = sorted(
            (entity["start"], entity["end"])
            for entity in ner_pipeline(text)
            if entity["entity_group"] in {"problem", "treatment"}
        )

        # Rebuild the text: untouched segments are copied as they are, while
        # every word inside a span goes through mask_word()
        pieces = []
        cursor = 0
        for start, end in spans:
            pieces.append(text[cursor:start])
            pieces.append(re.sub(r'\b\w+\b', lambda m: mask_word(m.group()), text[start:end]))
            cursor = end
        pieces.append(text[cursor:])

        df_masked_text[masked_col].append(''.join(pieces))

In [None]:
# Turn the collected lists into a DataFrame with the columns
# index, note, masked_note, full_note, masked_full_note.
df_masked = pd.DataFrame(df_masked_text)

In [None]:
# Show the first masked note. `[0]` is a label lookup on the Series; it works
# because df_masked was built from a dict and so has the default 0..n-1 index.
print(df_masked['masked_note'][0])

### **SPACY NER**

In [None]:
SPACY_MODEL = "en_core_web_lg"

# Download the model only when it is not installed yet, so that re-running
# the notebook does not trigger a new download/installation every time.
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

# spaCy model
nlp = spacy.load(SPACY_MODEL)

In [None]:
# Entity types to mask
entities_to_mask = {"PERSON", "NORP", "DATE", "CARDINAL", "QUANTITY", "FAC", "ORG",
                    "GPE", "LOC", "LANGUAGE", "TIME"}

# Entity types whose whole span is replaced by a single "*" instead of being
# masked word by word
collapsed_entities = {"CARDINAL", "QUANTITY", "TIME"}

# Gender terms to mask
gender_terms = {
    "male", "female", "man", "woman", "boy", "girl",
    "he", "she", "his", "her", "him", "hers", "himself", "herself"
}


def mask_gender_terms(match):
    """Return the masked form of a matched word if it is a gender term.

    Words that are not in ``gender_terms`` (case-insensitive) are returned
    unchanged. Gender terms are masked with ``mask_word``; since ``mask_word``
    leaves words of one or two characters untouched, such short terms ("he")
    keep only their first letter instead ("h*").
    """
    word = match.group()
    if word.lower() not in gender_terms:
        return word
    if len(word) <= 2:
        # A short match directly followed by "*" is the visible prefix of an
        # already-masked word (e.g. "he" in "he****"), not a pronoun: keep it.
        if match.string.startswith("*", match.end()):
            return word
        return word[:1] + "*" * (len(word) - 1)
    return mask_word(word)


# NOTE: this loop overwrites masked_note / masked_full_note in df_masked, so
# running the cell a second time masks the already-masked text again.
for index, row in tqdm(df_masked.iterrows(), total=len(df_masked), desc="Masked dataset"):
    for col in ["masked_note", "masked_full_note"]:
        text = row[col]

        # (start, end, label) of every entity to mask, in text order
        spans = sorted(
            (ent.start_char, ent.end_char, ent.label_)
            for ent in nlp(text).ents
            if ent.label_ in entities_to_mask
        )

        pieces = []
        cursor = 0
        for start, end, label in spans:
            pieces.append(text[cursor:start])
            if label in collapsed_entities:
                # Numbers, quantities and times disappear entirely
                pieces.append('*')
            else:
                pieces.append(re.sub(r'\b\w+\b', lambda m: mask_word(m.group()), text[start:end]))
            cursor = end
        pieces.append(text[cursor:])

        # Second pass: mask gender terms anywhere in the text
        masked_text_new = re.sub(r'\b\w+\b', mask_gender_terms, ''.join(pieces))

        df_masked.at[index, col] = masked_text_new

In [None]:
import os

# Write the masked dataset to disk, creating the target folder if needed.
output_path = "data/masked_dataset.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_masked.to_csv(output_path, index=False)

In [None]:
# from google.colab import files
#
# df_masked.to_csv('masked_dataset.csv', index=False)
# files.download('masked_dataset.csv')

#### Reading masked dataset

In [None]:
# import pandas as pd
# import gdown
#
# # File ID Google Drive
# file_id = "1_9VFIupqIF22FrAaFUDIfq26oZmCEjqF"
# # URL download
# url = f"https://drive.google.com/uc?id={file_id}"
#
# output = 'masked_dataset.csv'
# gdown.download(url, output, quiet=False)
#
# df_masked = pd.read_csv(output)

In [None]:
# Show the first masked note after both masking passes.
# `[0]` is a label lookup; this assumes df_masked has the default integer index.
print(df_masked['masked_note'][0])

In [None]:
# Show the first masked full note after both masking passes.
# `[0]` is a label lookup; this assumes df_masked has the default integer index.
print(df_masked['masked_full_note'][0])