## **Data Encryption**

In [1]:
%pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import spacy

#### Reading dataset

In [3]:
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
df.head()

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...


In [5]:
df.dropna(inplace=True)

In [6]:
# Remove unwanted characters from the 'note' and 'full note' columns
df['note'] = df['note'].astype(str).str.replace(r'(\\n|\n|\r|\[|\]|\(|\))', '', regex=True)
df['full_note'] = df['full_note'].astype(str).str.replace(r'(\\n|\n|\r|\[|\]|\(|\))', '', regex=True)

In [7]:
len(df)

30000

### **CLINICAL-BERT NER**

In [8]:
exclude_tokens = ["the", "be", "to", "of", "and", "a", "an", "in", "that", "these",
                  "those", "I", "it", "for", "not", "on", "with", "as", "you",
                  "do", "at"]

# gender to mask
gender_terms = {
    "male", "female", "man", "woman", "boy", "girl",
    "he", "she", "his", "her", "him", "hers", "himself", "herself"
}


from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.backends import default_backend
import base64
import hashlib

# Secret Key AES (16, 24 o 32 byte)
SECRET_KEY = hashlib.sha256(b"my-secret-key").digest()

# AES setup
def encrypt_word_aes(word, key=SECRET_KEY):
    word = word.lower()

    # Pad the word to AES block size (128 bit)
    padder = padding.PKCS7(128).padder()
    padded_data = padder.update(word.encode()) + padder.finalize()

    # ECB mode (deterministic; for individual words)
    cipher = Cipher(algorithms.AES(key), modes.ECB(), backend=default_backend())
    encryptor = cipher.encryptor()
    ct = encryptor.update(padded_data) + encryptor.finalize()

    # Return as base64 to make it readable
    return base64.urlsafe_b64encode(ct).decode()[:10]  # truncate for readability

# Core masking function
def encrypt_word(word):
    if word.lower() in exclude_tokens:
        return word
    if word.isdigit() or word.lower() in gender_terms:
        return encrypt_word_aes(word)
    return encrypt_word_aes(word)

In [None]:
model_name = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
df_encrypt_text = {'index': [], 'note': [], 'encrypt_note': [], 'full_note': [], 'encrypt_full_note': []}

# Iterate over all dataset
for index, row in tqdm(df.iterrows(), total=len(df), desc="Encrypt dataset"):
    note = row["note"]
    full_note = row["full_note"]

    df_encrypt_text['index'].append(index)
    df_encrypt_text['note'].append(note)
    df_encrypt_text['full_note'].append(full_note)

    for idx, text in enumerate([note, full_note]):
        ner_results = ner_pipeline(text)

        # Map of masked intervals
        to_mask = []
        for entity in ner_results:
            if entity["entity_group"] in {"problem", "treatment"}:
                to_mask.append((entity["start"], entity["end"]))

        # Mask words in specif ranges
        encrypt_text = ""
        i = 0
        for start, end in sorted(to_mask):
            encrypt_text += text[i:start]
            original = text[start:end]
            encrypt = re.sub(r'\b\w+\b', lambda m: encrypt_word(m.group()), original)
            encrypt_text += encrypt
            i = end

        encrypt_text += text[i:]

        if idx == 0:
            df_encrypt_text['encrypt_note'].append(encrypt_text)
        else:
            df_encrypt_text['encrypt_full_note'].append(encrypt_text)

        # print("\nMasked text (first two letter are visible for PROBLEM/TREATMENT):\n")
        # print(masked_text)

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cuda:0

Encrypt dataset:   0%|          | 0/30000 [00:00<?, ?it/s][A

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Encrypt dataset:   1%|          | 271/30000 [00:46<54:40,  9.06it/s][A
Encrypt dataset:   1%|          | 273/30000 [00:46<51:04,  9.70it/s][A
Encrypt dataset:   1%|          | 275/30000 [00:47<49:13, 10.06it/s][A
Encrypt dataset:   1%|          | 277/30000 [00:47<47:47, 10.37it/s][A
Encrypt dataset:   1%|          | 279/30000 [00:47<46:29, 10.65it/s][A
Encrypt dataset:   1%|          | 281/30000 [00:47<46:26, 10.66it/s][A
Encrypt dataset:   1%|          | 283/30000 [00:47<46:16, 10.70it/s][A
Encrypt dataset:   1%|          | 285/30000 [00:47<46:28, 10.66it/s][A
Encrypt dataset:   1%|          | 287/30000 [00:48<46:33, 10.64it/s][A
Encrypt dataset:   1%|          | 289/30000 [00:48<46:24, 10.67it/s][A
Encrypt dataset:   1%|          | 291/30000 [00:48<46:24, 10.67it/s][A
Encrypt dataset:   1%|          | 293/30000 [00:48<46:07, 10.74it/s][A
Encrypt dataset:   1%|          | 295/30000 [00:48<47:42, 10.38it/s][A


In [None]:
df_encrypt = pd.DataFrame(df_encrypt_text)

In [None]:
print(df_encrypt['encrypt_note'][0])

A a sixteen year-old girl, presented to our Outpatient department with the complaints of oW0rmunCQo in the jH-m6AIIYd and 8Fep4XoCQH 3wYsXFPPXd as well as 7W_ekE_y9e of UMKHQWk52S 6LqK8wobRq. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the OumdYBy91J QhmKbiIHnV of the jH-m6AIIYd kNMKyrorwl. There was a bympupTB6X _TUtNj-9u0 of the 3wYsXFPPXd in the lumbar region. To counter the HaRCGlbzwE UjIgU4tf-- of the 3wYsXFPPXd and jH-m6AIIYd, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.She had been experiencing these luXp999Jcj for the past four months since when she was i

### **SPACY NER**

In [None]:
spacy.cli.download("en_core_web_lg")

# spaCy model
nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Entity type to mask
entities_to_encrypt = {"PERSON", "NORP", "DATE", "CARDINAL", "QUANTITY", "FAC", "ORG",
                    "GPE", "LOC", "LANGUAGE", "TIME"}

# gender to mask
gender_terms = {"male", "female", "man", "woman", "boy", "girl", "he", "she", "his", "her"}

for index, row in tqdm(df_encrypt.iterrows(), total=len(df_encrypt), desc="Encrypt dataset"):
    for col in ["encrypt_note", "encrypt_full_note"]:
        text = row[col]
        to_encrypt = []
        doc = nlp(text)

        for ent in doc.ents:
            if ent.label_ in entities_to_encrypt:
                to_encrypt.append((ent.start_char, ent.end_char))

        encrypt_text_new = ""
        i = 0
        for start, end in sorted(to_encrypt):
            encrypt_text_new += text[i:start]
            original = text[start:end]
            encrypt = re.sub(r'\b\w+\b', lambda m: encrypt_word(m.group()), original)
            encrypt_text_new += encrypt
            i = end

        encrypt_text_new += text[i:]

        encrypt_text_new = re.sub(
          r'\b\w+\b',
          lambda m: encrypt_word(m.group()) if m.group().lower() in gender_terms else m.group(),
          encrypt_text_new
        )

        df_encrypt.at[index, col] = encrypt_text_new

Encrypt dataset:   4%|▍         | 1314/30000 [03:33<1:41:53,  4.69it/s]

In [None]:
from google.colab import files

df_encrypt.to_csv('encrypt_dataset.csv', index=False)
files.download('encrypt_dataset.csv')

Reading masked dataset

In [None]:
print(df_encrypt['encrypt_note'][0])

In [None]:
print(df_encrypt['encrypt_full_note'][0])