### **GPT4 Data Masking**

In [1]:
from openai import OpenAI
import pandas as pd
import json
from tqdm import tqdm

from config import API_TOKEN

#### Configure prompt

In [2]:
def process_output(id_note: int, output_split: list) -> dict:

    anonym_text = output_split[1]
    obscured_entities = output_split[3].split(',')

    # Remove [
    if obscured_entities[0].strip().startswith('['):
        obscured_entities[0] = obscured_entities[0].strip()[1:]

    # Remove ]
    if obscured_entities[-1].strip().endswith(']'):
        obscured_entities[-1] = obscured_entities[-1].strip()[:-1]

    data = {
        "id_note": id_note,
        "anonym_text": anonym_text,
        "obscured_entities": set(obscured_entities)
    }

    # Remove ' '
    data['anonym_text'] = data['anonym_text'].lstrip()

    # Remove ' '
    data['obscured_entities'] = {entity.lstrip() for entity in data['obscured_entities']}

    return data

In [3]:
example_note = "A a sixteen year-old girl, presented to our Outpatient department with the complaints of discomfort in the neck and lower back as well as restriction of body movements. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustained contraction of the neck muscles. There was a sideways bending of the back in the lumbar region. To counter the abnormal positioning of the back and neck, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.She had been experiencing these difficulties for the past four months since when she was introduced to olanzapine tablets for the control of her exacerbated mental illness. This was not her first experience with this drug over the past seven years since she had been diagnosed with bipolar affective disorder. Her first episode of the affective disorder was that of mania at the age of eleven which was managed with the use of olanzapine tablets in 2.5–10 mg doses per day at different times. The patient developed pain and discomfort in her neck within the second week of being put on tablet olanzapine at a dose of 5 mg per day. This was associated with a sustained and abnormal contraction of the neck muscles that would pull her head to the right in an upward direction. These features had persisted for the first three years of her illness with a varying intensity, distress, and dysfunction which would tend to correlate with the dose of olanzapine. Apart from a brief period of around three weeks when she was given tablet trihexyphenidyl 4 mg per day for rigidity in her upper limbs, she was not prescribed any other psychotropic medication. The rigidity showed good response to this medication which was subsequently"

example_masked_note = "An adolescent patient presented to our Outpatient department with complaints of di******** in th* ne** an* lo*** ba** as well as re********* of bo** mo*******. The patient was un**** to ma****** an er*** po***** and would tend to f*** on ei**** si** while st****** up fr** a si***** po******. The head was kept tu**** to th* ri*** an* up***** due to a su******* co********* of the ne** mu******. There was a si****** be***** of the ba** in the lu**** re****. To counter the ab****** po********* of the ba** an* ne**, the limbs were kept in a sp****** po****** to support the body's weight. Due to re********* wi** bo** mo******* at the ne** an* in the lu**** re****, the patient required as******** in st****** and wa******, including help with all activities of self-care. These difficulties had been experienced for several months since being introduced to ol******** tablets for an exacerbated mental illness. This was not the first experience with this drug over several years, since being diagnosed with an affective disorder. The first episode of the disorder was manic in nature, occurring in early adolescence, and was managed with the use of ol******** at varying doses. The patient developed pa** an* di******** in the ne** early in the treatment with ol******** at a specific dose. This was associated with a su******* an* ab****** co********* of the ne** mu****** that would p*** the he** to the ri*** in an up**** di*******. These features had persisted for a prolonged period with varying intensity, distress, and dysfunction, which tended to correlate with the dose of ol********. Apart from a brief period when the patient was given tr************* at a therapeutic dose for ri****** in th* up*** li***, no other psychotropic medication was prescribed. The ri****** showed a good response to this medication which was subsequently"

# Create prompt for evaluating note.
def create_prompt(note: str) -> str:
    prompt = f"""
        You are an AI assistant specializing in the anonymization of sensitive medical information, adhering to the standards set by GDPR, specifically Articles 4 and 9. You must apply either Data Masking (partially obscuring sensitive entities) or Generalization (replacing specific details with broader categories) to all potentially identifiable entities.

        The sensitive entities to be anonymized are:
        - Patient's age, name, and gender
        - Psychiatric diagnoses and detailed clinical history
        - Specific pharmacological therapies (e.g., olanzapine, trihexyphenidyl) and their dosages
        - Side effects and neurological/motor symptoms
        - Functional data and personal autonomy details
        - Detailed clinical and therapeutic timelines

        For example, given the following clinical note:
        \"\"\"
        {example_note}
        \"\"\"

        Becomes:
        \"\"\"
        {example_masked_note}
        \"\"\"

        Given the following original medical text:
        \"\"\"
        {note}
        \"\"\"

        Your output must be formatted as follows:

        Anonymized Text:
        [Anonymized version of the note]

        Obscured Entities from Original Medical Text:
        [original sensitive entity 1, original sensitive entity 2, ...]

        Do not explain or comment on the output.
        """
    return prompt

#### Reading sample masked dataset

In [4]:
df = pd.read_csv("../../datasets/sample/masked_dataset_sample.csv")

In [19]:
def make_inference():

    gpt4_masked_sample_text = {'index': [], 'note': [], 'masked_note': [], 'sensitive_entity_note': []}

    for i in tqdm(range(0, len(df)), desc="GPT4 Requests"):

        client = OpenAI(api_key=API_TOKEN, base_url="https://api.gpt4-all.xyz/v1")

        # Read original note
        note = df['note'][i]
        index = int(df['index'][i])

        # Create formatted prompt to send to GPT4.
        prompt = create_prompt(note=note)

        try:

            # Request to API.
            api_response = client.chat.completions.create(
                model="gpt-4.1",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                top_p=0.95,
                stream=False,
            )

            # Output of model.
            output = api_response.choices[0].message.content

            # Process output.
            output_split = output.split('\n')
            if '' in output_split: output_split.remove('')
            formatted_data = process_output(id_note=index, output_split=output_split)

            # Append text
            gpt4_masked_sample_text['index'].append(index)
            gpt4_masked_sample_text['note'].append(note)
            gpt4_masked_sample_text['masked_note'].append(formatted_data['anonym_text'])
            gpt4_masked_sample_text['sensitive_entity_note'].append(formatted_data['obscured_entities'])

        except Exception as e:

            # Append text
            gpt4_masked_sample_text['index'].append(index)
            gpt4_masked_sample_text['note'].append(note)
            gpt4_masked_sample_text['masked_note'].append('')
            gpt4_masked_sample_text['sensitive_entity_note'].append('')

    pd.DataFrame(gpt4_masked_sample_text).to_csv("../../datasets/sample/gpt4_masked_dataset_sample.csv", index=False)

In [20]:
make_inference()

GPT4 Requests: 100%|██████████| 5000/5000 [10:12:13<00:00,  7.35s/it]  
