### **Llama3 70b Data Generalization**

In [2]:
import ast

from openai import OpenAI
import pandas as pd
import os
from tqdm import tqdm

from config import API_TOKEN

#### Configure prompt

In [3]:
def process_output(id_note: int, output_split: list) -> dict:
    """Extract the anonymized text and the obscured entities from a model reply.

    Parsing is positional: ``output_split[1]`` is taken as the anonymized
    text and ``output_split[3]`` as the comma-separated entity list,
    optionally wrapped in ``[...]``. Positions 0 and 2 are ignored; with the
    output format requested by ``create_prompt`` they hold the section headers.

    Args:
        id_note: Identifier of the note, copied unchanged into the result.
        output_split: Lines of the model reply.

    Returns:
        A dict with keys ``id_note``, ``anonym_text`` (text with surrounding
        whitespace removed) and ``obscured_entities`` (a set of entity
        strings, each stripped of surrounding whitespace; empty entries are
        dropped).

    Raises:
        IndexError: If ``output_split`` has fewer than four elements.
    """
    anonym_text = output_split[1].strip()

    # Drop the optional enclosing brackets once, on the whole line, before
    # splitting it into individual entities.
    raw_entities = output_split[3].strip()
    if raw_entities.startswith('['):
        raw_entities = raw_entities[1:]
    if raw_entities.endswith(']'):
        raw_entities = raw_entities[:-1]

    # Strip both sides so "a" and "a " collapse into one entity, and discard
    # the empty strings produced by "[]" or by a trailing comma.
    obscured_entities = {entity.strip() for entity in raw_entities.split(',')}
    obscured_entities.discard('')

    return {
        "id_note": id_note,
        "anonym_text": anonym_text,
        "obscured_entities": obscured_entities,
    }

In [4]:
# Build the anonymization prompt for a single medical note.
def create_prompt(note: str) -> str:
    """Return the Data Generalization instruction prompt with ``note`` embedded.

    The prompt lists the categories of sensitive entities to generalize,
    inserts the note between triple double quotes, and asks for a reply made
    of an "Anonymized Text:" section followed by an "Obscured Entities from
    Original Medical Text:" section.

    Args:
        note: Original medical note to anonymize.

    Returns:
        The complete prompt string, including the leading newline and the
        indentation of the template.
    """
    # Single-quoted delimiters let the triple double quotes around the note
    # appear without escaping; the resulting text is unchanged.
    return f'''
        You are an AI assistant specialized in anonymizing sensitive medical information according to GDPR regulations, with specific attention to Articles 4 and 9.  Your task is to anonymize the provided medical note by applying Data Generalization techniques: replace any identifiable or sensitive details with broader, non-identifiable categories.

        The sensitive entities to be anonymized are:
        - Patient's age, name, and gender
        - Psychiatric diagnoses and detailed clinical history
        - Specific pharmacological therapies (e.g., olanzapine, trihexyphenidyl) and their dosages
        - Side effects and neurological/motor symptoms
        - Functional data and personal autonomy details
        - Detailed clinical and therapeutic timelines

        Given the following original medical text:
        """{note}"""

        Your output must be formatted as follows:

        Anonymized Text:
        [Anonymized version of the note]

        Obscured Entities from Original Medical Text:
        [original sensitive entity 1, original sensitive entity 2, ...]

        Do not explain or comment on the output.
        '''

#### Reading sample masked dataset

In [5]:
# Load the sample of masked notes; make_inference reads its 'index' and 'note' columns.
df = pd.read_csv(filepath_or_buffer="../../datasets/sample/masked_dataset_sample.csv")

In [5]:
def make_inference():
    """Anonymize each note in the module-level ``df`` and append the results to a CSV.

    For every row of ``df`` whose ``index`` value is not already present in
    the output CSV, the note is wrapped with ``create_prompt``, sent to the
    ``llama-3.3-70b`` chat model, and the reply is parsed with
    ``process_output``. One row (``index``, ``note``, ``anonym_note``,
    ``sensitive_entity_note``) is appended to the CSV right after each
    request, so an interrupted run can be resumed by calling this function
    again.

    If the request or the parsing raises, the exception is printed and the
    row is written with empty ``anonym_note`` and ``sensitive_entity_note``.

    Returns:
        None. The results are written to the CSV file.
    """
    csv_path = "../../datasets/sample/llama3_data_generalization.csv"

    # Indexes already processed
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        processed_indices = set(existing_df['index'].tolist())
    else:
        processed_indices = set()

    # The client does not depend on the note, so build it once rather than
    # once per row.
    client = OpenAI(api_key=API_TOKEN, base_url="https://api.gpt4-all.xyz/v1")

    for i in tqdm(range(0, len(df)), desc="Llama3 Requests"):
        index = int(df.loc[i, 'index'])
        if index in processed_indices:
            continue  # skip rows that were already processed

        note = df.loc[i, 'note']
        prompt = create_prompt(note=note)

        try:
            api_response = client.chat.completions.create(
                model="llama-3.3-70b",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                top_p=0.95,
                stream=False,
            )

            output = api_response.choices[0].message.content
            # process_output parses by position, so drop every blank line,
            # not only the first one: a leading or extra blank line would
            # otherwise shift the header/text/entity positions.
            output_split = [line for line in output.split('\n') if line.strip()]

            formatted_data = process_output(id_note=index, output_split=output_split)
            anonym_note = formatted_data['anonym_text']
            sensitive_entity_note = formatted_data['obscured_entities']

        except Exception as e:
            # Best effort: report the failure and keep going with the next note.
            # NOTE(review): the failed row is still written below with empty
            # fields, so its index counts as processed on the next run and the
            # note is not retried -- confirm this is intended.
            print(e)
            anonym_note = ''
            sensitive_entity_note = ''

        # Create row
        new_row = {
            'index': index,
            'note': note,
            'anonym_note': anonym_note,
            'sensitive_entity_note': sensitive_entity_note
        }

        new_df = pd.DataFrame([new_row])

        # Append to the CSV; the header is written only when the file is created.
        if os.path.exists(csv_path):
            new_df.to_csv(csv_path, mode='a', header=False, index=False)
        else:
            new_df.to_csv(csv_path, index=False)

In [None]:
# Run the anonymization pass; notes whose index is already in the output CSV are skipped.
make_inference()