## **Data Anonymization**

In [1]:
%pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import spacy

  from .autonotebook import tqdm as notebook_tqdm


#### Reading dataset

In [3]:
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
df.head()

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...


In [5]:
df.dropna(inplace=True)

In [6]:
# Remove unwanted characters from the 'note' and 'full note' columns
df['note'] = df['note'].astype(str).str.replace(r'(\\n|\n|\r|\[|\]|\(|\))', '', regex=True)
df['full_note'] = df['full_note'].astype(str).str.replace(r'(\\n|\n|\r|\[|\]|\(|\))', '', regex=True)

In [7]:
len(df)

30000

In [12]:
# gender to mask
gender_terms = {
    "male", "female", "man", "woman", "boy", "girl",
    "he", "she", "his", "her", "him", "hers", "himself", "herself"
}

def clean_duplicate_labels(text):
    # Remove consecutive [PROBLEM] or [TREATMENT]
    text = re.sub(r'(?:\s*\[PROBLEM\]\s*){2,}', ' [PROBLEM] ', text)
    text = re.sub(r'(?:\s*\[TREATMENT\]\s*){2,}', ' [TREATMENT] ', text)

    # Remove multiple spaced
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

### **CLINICAL-BERT NER**

In [9]:
model_name = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Output structure
df_anonym_text = {'index': [], 'note': [], 'anonymized_note': [], 'full_note': [], 'anonymized_full_note': []}

# Iterate over all dataset
for index, row in tqdm(df.iterrows(), total=len(df), desc="Anonymized dataset"):
    note = row["note"]
    full_note = row["full_note"]

    df_anonym_text['index'].append(index)
    df_anonym_text['note'].append(note)
    df_anonym_text['full_note'].append(full_note)

    for idx, text in enumerate([note, full_note]):
        ner_results = ner_pipeline(text)

        # Build masked text by replacing spans
        anonym_text = ""
        i = 0
        for entity in sorted(ner_results, key=lambda x: x['start']):
            start, end = entity['start'], entity['end']
            label = entity['entity_group']

            if label in {"problem", "treatment"}:
                # Append non-entity text
                anonym_text += text[i:start]
                # Append label placeholder
                anonym_text += f"[{label.upper()}]"
                i = end

        anonym_text += text[i:]

        anonym_text = clean_duplicate_labels(anonym_text)

        if idx == 0:
            df_anonym_text['anonymized_note'].append(anonym_text)
        else:
            df_anonym_text['anonymized_full_note'].append(anonym_text)

    if index == 0: break

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cpu

  return forward_call(*args, **kwargs)
Anonymized dataset:   0%|          | 0/30000 [00:07<?, ?it/s]


In [10]:
df_anonym = pd.DataFrame(df_anonym_text)

In [11]:
df_anonym['anonymized_note'][0]

'A a sixteen year-old girl, presented to our Outpatient department with the complaints of [PROBLEM] as well as [PROBLEM]. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to [PROBLEM]. There was [PROBLEM] in the lumbar region. To counter [PROBLEM], she would keep her limbs in a specific position to allow her body weight to be supported. Due to [PROBLEM] restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.She had been experiencing [PROBLEM] for the past four months since when she was introduced to [TREATMENT] for the control of [PROBLEM]. This was not her first experience with [TREATMENT] over the past seven years since she had been diagnosed with [PROBLEM]. Her first episode of

### **SPACY NER**

In [10]:
df_anonym = pd.read_csv("../datasets/anonymized_dataset.csv", )

In [8]:
spacy.cli.download("en_core_web_lg")

# spaCy model
nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
import re
from tqdm import tqdm

# Dictionary mapping short entity labels to full descriptive names in English
entity_labels_full = {
    "PERSON": "PERSON",
    "NORP": "NATIONALITY/RELIGIOUS/POLITICAL GROUP",
    "DATE": "DATE",
    "CARDINAL": "CARDINAL NUMBER",
    "QUANTITY": "QUANTITY",
    "FAC": "FACILITY",
    "ORG": "ORGANIZATION",
    "GPE": "GEO-POLITICAL ENTITY",
    "LOC": "LOCATION",
    "LANGUAGE": "LANGUAGE",
    "TIME": "TIME"
}

entities_to_mask = set(entity_labels_full.keys())

for index, row in tqdm(df_anonym.iterrows(), total=len(df_anonym), desc="Anonymized dataset"):
    for col in ["anonymized_note", "anonymized_full_note"]:
        text = row[col]
        doc = nlp(text)

        # Collect entities to replace with their full label in square brackets
        to_replace = [(ent.start_char, ent.end_char, f"[{entity_labels_full[ent.label_]}]")
                      for ent in doc.ents if ent.label_ in entities_to_mask]

        # Replace entities from the end to avoid messing up indices
        replaced_text = text
        for start, end, full_label in sorted(to_replace, key=lambda x: x[0], reverse=True):
            replaced_text = replaced_text[:start] + full_label + replaced_text[end:]

        # Gender
        replaced_text = re.sub(
            r'\b\w+\b',
            lambda m: "[GENDER]" if m.group().lower() in gender_terms else m.group(),
            replaced_text
        )

        df_anonym.at[index, col] = replaced_text

Anonymized dataset:   0%|          | 111/30000 [00:15<1:11:37,  6.95it/s]


KeyboardInterrupt: 

In [15]:
df_anonym['anonymized_note'][0]

'A a [DATE] [GENDER], presented to our [DATE] department with the complaints of [PROBLEM] as well as [PROBLEM]. [GENDER] was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. [GENDER] would keep [GENDER] head turned to the right and upwards due to [PROBLEM]. There was [PROBLEM] in the lumbar region. To counter [PROBLEM], [GENDER] would keep [GENDER] limbs in a specific position to allow [GENDER] body weight to be supported. Due to [PROBLEM] restrictions with the body movements at the neck and in the lumbar region, [GENDER] would require assistance in standing and walking. [GENDER] would require [GENDER] parents to help [GENDER] with daily chores, including all activities of self-care.[GENDER] had been experiencing [PROBLEM] for [DATE] since when [GENDER] was introduced to [TREATMENT] for the control of [PROBLEM]. This was not [GENDER] first experience with [TREATMENT] over [DATE] since [GENDER] had been diagnosed with

In [None]:
df_anonym.to_csv("datasets/anonymized_dataset_2.csv", index=False)

In [16]:
from google.colab import files

df_anonym.to_csv('anonymized_dataset.csv', index=False)
files.download('anonymized_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>