## **Data Encryption**

In [None]:
%pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [6]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import spacy
import torch

#### Reading dataset

In [2]:
# Load the augmented clinical notes (JSON Lines, one record per line) from the Hugging Face Hub
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True)

In [None]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...


In [None]:
# Drop every row that has a missing value in any column (mutates df in place).
# The index is not reset, so row labels may have gaps after this step.
df.dropna(inplace=True)

In [3]:
# Strip escaped ("\\n") and real line breaks, carriage returns, and square/round brackets
# from the 'note' and 'full_note' columns. Matches are replaced by the empty string,
# so text on either side of a removed character is joined without a space.
unwanted_chars = r'(\\n|\n|\r|\[|\]|\(|\))'
for text_col in ('note', 'full_note'):
    df[text_col] = df[text_col].astype(str).str.replace(unwanted_chars, '', regex=True)

In [None]:
# Number of rows left in df
len(df)

30000

### **CLINICAL-BERT NER**

In [4]:
# Gender-revealing words to encrypt; compared against the lower-cased token
gender_terms = {
    # nouns / adjectives
    "male", "female",
    "man", "woman",
    "boy", "girl",
    # pronouns
    "he", "she",
    "his", "her",
    "him", "hers",
    "himself", "herself",
}


from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.backends import default_backend
import base64
import hashlib

# AES secret key (must be 16, 24 or 32 bytes): the SHA-256 digest of a passphrase gives
# a 32-byte key. The passphrase is read from the NOTE_ENCRYPTION_KEY environment variable
# so the real secret does not have to be stored in the notebook. The previous hard-coded
# passphrase is kept only as a fallback so tokens generated earlier stay reproducible;
# set the variable before encrypting real data.
import os

SECRET_KEY = hashlib.sha256(
    (os.environ.get("NOTE_ENCRYPTION_KEY") or "my-secret-key").encode()
).digest()

# Deterministic AES-based token for a single word
def encrypt_word_aes(word, key=SECRET_KEY):
    """Return a short, deterministic token derived from ``word`` with AES.

    The word is lower-cased, PKCS7-padded to the 128-bit AES block size and
    encrypted with AES in ECB mode, so the same word and key always give the
    same token. The ciphertext is URL-safe base64 encoded and only the first
    10 characters are returned.

    Args:
        word: The text to encrypt.
        key: AES key bytes; defaults to the value of ``SECRET_KEY`` at the
            time this function was defined.

    Returns:
        The first 10 characters of the base64 text.

    NOTE(review): 10 base64 characters carry 60 bits, fewer than one AES
    block, so the token cannot be decrypted back to the word. The URL-safe
    alphabet also includes '-', which is not a ``\\w`` character.
    """
    plaintext = word.lower().encode()

    # Pad up to a whole AES block (128 bit)
    pkcs7 = padding.PKCS7(128).padder()
    padded = pkcs7.update(plaintext) + pkcs7.finalize()

    # ECB keeps the mapping word -> ciphertext deterministic
    aes_ecb = Cipher(algorithms.AES(key), modes.ECB(), backend=default_backend()).encryptor()
    ciphertext = aes_ecb.update(padded) + aes_ecb.finalize()

    # Base64 makes the bytes printable; keep only a short prefix for readability
    token = base64.urlsafe_b64encode(ciphertext).decode()
    return token[:10]

# Core masking function
def encrypt_word(word):
    """Encrypt a single word with ``encrypt_word_aes`` and return the token.

    Every word is handled the same way. The earlier digit/gender-term check
    selected between two identical calls, so it has been removed without
    changing the result.

    Args:
        word: The word to encrypt.

    Returns:
        The token produced by ``encrypt_word_aes(word)``.
    """
    return encrypt_word_aes(word)

In [7]:
# Clinical NER model and its tokenizer
model_name = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Prefer the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# "simple" aggregation groups sub-word tokens into whole entity spans.
# NOTE(review): notes longer than the model's maximum sequence length may be
# truncated or rejected by the pipeline. Confirm, and consider the pipeline's
# `stride` option if the tail of long notes must also be scanned.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

# Column-oriented accumulator; one list entry per dataset row
df_encrypt_text = {
    'index': [],
    'note': [], 'encrypt_note': [], 'sensitive_entity_note': [],
    'full_note': [], 'encrypt_full_note': [], 'sensitive_entity_full_note': [],
}

# Walk the whole dataset and encrypt every "problem"/"treatment" entity found by the clinical NER
for index, row in tqdm(df.iterrows(), total=len(df), desc="Encrypt dataset"):
    df_encrypt_text['index'].append(index)

    for col, encrypted_key, entities_key in [
        ("note", 'encrypt_note', 'sensitive_entity_note'),
        ("full_note", 'encrypt_full_note', 'sensitive_entity_full_note'),
    ]:
        text = row[col]

        # Collect the entity strings and the character ranges that must be encrypted
        found_entities = set()
        spans = []
        for entity in ner_pipeline(text):
            if entity["entity_group"] not in {"problem", "treatment"}:
                continue
            found_entities.add(entity["word"])
            spans.append((entity["start"], entity["end"]))

        # Rebuild the text: untouched gaps are copied, words inside a range are encrypted
        pieces = []
        cursor = 0
        for start, end in sorted(spans):
            pieces.append(text[cursor:start])
            pieces.append(re.sub(r'\b\w+\b', lambda m: encrypt_word(m.group()), text[start:end]))
            cursor = end
        pieces.append(text[cursor:])
        encrypt_text = "".join(pieces)

        df_encrypt_text[col].append(text)
        df_encrypt_text[encrypted_key].append(encrypt_text)
        df_encrypt_text[entities_key].append(found_entities)

Device set to use cuda
  return forward_call(*args, **kwargs)
Encrypt dataset:   0%|          | 0/30000 [00:00<?, ?it/s]


In [8]:
# Turn the accumulated lists into a DataFrame (default 0..n-1 index; source labels are in the 'index' column)
df_encrypt = pd.DataFrame(df_encrypt_text)

In [9]:
# Inspect the first note after the clinical-NER encryption pass
print(df_encrypt['encrypt_note'][0])

A a sixteen year-old girl, presented to our Outpatient department with the complaints of oW0rmunCQo okdVVRgc1- 0NRdXTHwsy jH-m6AIIYd 4R72SxIbI1 8Fep4XoCQH 3wYsXFPPXd as well as 7W_ekE_y9e O8mbpBC7UG UMKHQWk52S 6LqK8wobRq. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to 0NRdXTHwsy OumdYBy91J QhmKbiIHnV O8mbpBC7UG 0NRdXTHwsy jH-m6AIIYd kNMKyrorwl. There was nMubxnGZ3r bympupTB6X _TUtNj-9u0 O8mbpBC7UG 0NRdXTHwsy 3wYsXFPPXd in the lumbar region. To counter 0NRdXTHwsy HaRCGlbzwE UjIgU4tf-- O8mbpBC7UG 0NRdXTHwsy 3wYsXFPPXd 4R72SxIbI1 jH-m6AIIYd, she would keep her limbs in a specific position to allow her body weight to be supported. Due to 0NRdXTHwsy restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all 

In [11]:
# "problem"/"treatment" entities recorded for the first note
df_encrypt['sensitive_entity_note'][0]

{'##pine tablets',
 'a sideways bending of the back',
 'a sustained and abnormal contraction of the neck muscles',
 'any other psychotropic medication',
 'bipolar affective disorder',
 'discomfort in her neck',
 'discomfort in the neck and lower back',
 'distress',
 'dysfunction',
 'her exacerbated mental illness',
 'her illness',
 'intensity',
 'mania',
 'olanza',
 'olanzapine',
 'olanzapine tablets',
 'pain and',
 'restriction of body movements',
 'rigidity in her upper limbs',
 'tablet olanzapine',
 'tablet trihexyphenidyl',
 'the',
 'the abnormal positioning of the back and neck',
 'the affective disorder',
 'the rigidity',
 'the sustained contraction of the neck muscles',
 'these difficulties',
 'these features',
 'this drug',
 'this medication'}

### **SPACY NER**

In [12]:
# spaCy model: reuse an installed copy and download it only when it is missing,
# so re-running the notebook does not fetch the large model again.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load raises OSError when the model package is not installed
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
# spaCy entity labels that are recorded as sensitive
spacy_sensitive_labels = {
    "CARDINAL", "QUANTITY", "DATE", "TIME", "PERSON",
    "NORP", "FAC", "ORG", "GPE", "LOC", "LANGUAGE"
}

# Iterate df_encrypt itself: it already holds the cleaned 'note'/'full_note' text, and
# its own index labels are the ones the .at lookups below need. Iterating df instead
# raises KeyError (or updates the wrong row) whenever the two frames differ in length
# or index labels, and makes the progress-bar total wrong.
for index, row in tqdm(df_encrypt.iterrows(), total=len(df_encrypt), desc="Search sensitive info"):
    for text_col, entities_col in (
        ("note", "sensitive_entity_note"),
        ("full_note", "sensitive_entity_full_note"),
    ):
        text = row[text_col]
        # .at hands back the set stored in the cell, so updating it edits df_encrypt in place
        sensitive_entities = df_encrypt.at[index, entities_col]

        # NER spaCy
        sensitive_entities.update(
            ent.text for ent in nlp(text).ents if ent.label_ in spacy_sensitive_labels
        )

        # gender terms (matched case-insensitively, stored with their original casing)
        sensitive_entities.update(
            word for word in re.findall(r'\b\w+\b', text) if word.lower() in gender_terms
        )

Search sensitive info:   0%|          | 0/1 [00:00<?, ?it/s]


In [15]:
# Same set after the spaCy entities and gender terms were added
df_encrypt['sensitive_entity_note'][0]

{'##pine tablets',
 '2.5–10 mg',
 '4 mg',
 '5 mg',
 'Her',
 'Outpatient',
 'She',
 'a sideways bending of the back',
 'a sustained and abnormal contraction of the neck muscles',
 'any other psychotropic medication',
 'around three weeks',
 'bipolar affective disorder',
 'discomfort in her neck',
 'discomfort in the neck and lower back',
 'distress',
 'dysfunction',
 'girl',
 'her',
 'her exacerbated mental illness',
 'her illness',
 'intensity',
 'mania',
 'olanza',
 'olanzapine',
 'olanzapine tablets',
 'pain and',
 'restriction of body movements',
 'rigidity in her upper limbs',
 'she',
 'sixteen year-old',
 'tablet olanzapine',
 'tablet trihexyphenidyl',
 'the',
 'the abnormal positioning of the back and neck',
 'the affective disorder',
 'the age of eleven',
 'the first three years',
 'the past four months',
 'the past seven years',
 'the rigidity',
 'the second week',
 'the sustained contraction of the neck muscles',
 'these difficulties',
 'these features',
 'this drug',
 'this m

In [16]:
# spaCy entity labels whose words get encrypted
entities_to_encrypt = {"PERSON", "NORP", "DATE", "CARDINAL", "QUANTITY", "FAC", "ORG",
                    "GPE", "LOC", "LANGUAGE", "TIME"}

# Second pass over the already partially encrypted columns: encrypt the words of every
# selected spaCy entity, then every remaining gender term, and store the result back.
# NOTE(review): spaCy runs on text that already contains tokens from the first pass, so
# any such token that falls inside a detected entity is encrypted a second time.
# Confirm whether that is intended.
for index, row in tqdm(df_encrypt.iterrows(), total=len(df_encrypt), desc="Encrypt dataset"):
    for col in ["encrypt_note", "encrypt_full_note"]:
        text = row[col]

        # Character ranges of the entities to encrypt, in reading order
        spans = sorted(
            (ent.start_char, ent.end_char)
            for ent in nlp(text).ents
            if ent.label_ in entities_to_encrypt
        )

        # Copy the gaps unchanged and encrypt each word inside a range
        pieces = []
        cursor = 0
        for start, end in spans:
            pieces.append(text[cursor:start])
            pieces.append(re.sub(r'\b\w+\b', lambda m: encrypt_word(m.group()), text[start:end]))
            cursor = end
        pieces.append(text[cursor:])
        with_entities_encrypted = "".join(pieces)

        # Gender terms are encrypted wherever they appear, inside an entity or not
        df_encrypt.at[index, col] = re.sub(
            r'\b\w+\b',
            lambda m: encrypt_word(m.group()) if m.group().lower() in gender_terms else m.group(),
            with_entities_encrypted,
        )

Encrypt dataset: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]


In [None]:
# Persist the encrypted dataset as CSV in the working directory
df_encrypt.to_csv('encrypt_dataset.csv', index=False)

# Offer the file as a browser download when running on Google Colab. Elsewhere the
# google.colab package is not importable, so the CSV simply stays on disk.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('encrypt_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Reading the encrypted dataset

In [17]:
# First note after both encryption passes (clinical NER, then spaCy entities and gender terms)
print(df_encrypt['encrypt_note'][0])

A a wWsbMiy__S Seor8VO9yQ-zR9BxL4QEb zKBSYREgGq, presented to our RaxhORD2AF department with the complaints of oW0rmunCQo okdVVRgc1- trMGughzM0 jH-m6AIIYd 4R72SxIbI1 8Fep4XoCQH 3wYsXFPPXd as well as Ozmo-oSFxe O8mbpBC7UG UMKHQWk52S 6LqK8wobRq. Kcg24XvmAz was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. Kcg24XvmAz would keep yerD4tQZHH head turned to the right and upwards due to trMGughzM0 OumdYBy91J QhmKbiIHnV O8mbpBC7UG trMGughzM0 jH-m6AIIYd kNMKyrorwl. There was nMubxnGZ3r a5z2O3uVDN _TUtNj-9u0 O8mbpBC7UG trMGughzM0 GV-cLbRWUy in the lumbar region. To counter trMGughzM0 HaRCGlbzwE UjIgU4tf-- O8mbpBC7UG trMGughzM0 GV-cLbRWUy 7P_j3E4hfb jH-m6AIIYd, Kcg24XvmAz would keep yerD4tQZHH limbs in a specific position to allow yerD4tQZHH body weight to be supported. Due to trMGughzM0 restrictions with the body movements at the neck and in the lumbar region, Kcg24XvmAz would require assistance in standing and walking. Kcg2

In [None]:
# First full note after both encryption passes
print(df_encrypt['encrypt_full_note'][0])