## Prompt Injection

### Mock data maken

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from dotenv import load_dotenv
from faker import Faker
import pandas as pd
import random
import json
import os 




In [2]:
fake = Faker()
Faker.seed(42)
random.seed(42)

In [3]:
def generate_patient_data(num_patients=100):
    patient_data = []
    for _ in range(num_patients):
        patient = {
            "PatientName": fake.name(),
            "Age": random.randint(1, 100),
            "Gender": random.choice(["Male", "Female", "Other"]),
            "Diagnosis": random.choice(["Hypertension", "Diabetes", "Asthma", "Heart Failure", "Depression"]),
            "Medications": random.choice(["Metformin", "Paracetamol", "Lisinopril", "Salbutamol", "Sertraline"]),
            "AdmissionDate": fake.date_this_year().strftime("%Y-%m-%d"),
            "DischargeDate": fake.date_this_year().strftime("%Y-%m-%d"),
            "Department": random.choice(["Cardiology", "Psychiatry", "Endocrinology", "Pulmonology"]),
            "BloodPressure": f"{random.randint(90, 180)}/{random.randint(60, 120)}",
            "Lifestyle": {
                "Smoking": random.choice(["Yes", "No"]),
                "Alcohol": random.choice(["Yes", "No"]),
                "PhysicalActivity": random.choice(["Low", "Moderate", "High"])
            },
            "Symptoms": random.choice(["Chest pain", "Shortness of breath", "Headache", "Fatigue", "Nausea"]),
            "Address": fake.address() 
        }
        patient_data.append(patient)
    return patient_data

In [4]:
patient_df = generate_patient_data(100)

### Mock data omzetten naar ander format

Llama werkt het best met een dataset dat bestaat uit teksten in plaats van rijen en kolommen.

In [5]:
def generate_instructions():
    return random.choice([
        "Summarize the patient's medical profile.",
        "What is the diagnosis for this patient?",
        "What medications are prescribed to the patient?",
        "Suggest lifestyle changes for the patient based on their medical history.",
        "Assess the patient's risk level based on their vitals and medical history.",
        "What symptoms are reported by the patient?",
        "Explain the patient's condition in simple terms for their family.",
        "What department is the patient admitted to?",
        "When was the patient last admitted to the hospital?"
    ])

In [6]:
def convert_to_llama_format(patient_data):
    data = []
    for patient in patient_data:
        instruction = generate_instructions()
        if "Summarize" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Age: {patient['Age']}, Gender: {patient['Gender']}, "
                f"Blood Pressure: {patient['BloodPressure']}, Medications: {patient['Medications']}, Department: {patient['Department']}"
            )
            output_data = (
                f"{patient['PatientName']} is a {patient['Age']}-year-old {patient['Gender'].lower()} "
                f"treated in the {patient['Department']} department. "
                f"The patient has a blood pressure reading of {patient['BloodPressure']} and is prescribed {patient['Medications']}."
            )
        elif "What is the diagnosis" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Symptoms: {patient['Symptoms']}, Blood Pressure: {patient['BloodPressure']}, Department: {patient['Department']}"
            )
            output_data = f"The patient is diagnosed with {patient['Diagnosis']}."
        elif "What medications" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Age: {patient['Age']}, Medications: {patient['Medications']}"
            )
            output_data = f"The patient is prescribed {patient['Medications']}."
        elif "Suggest lifestyle changes" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Age: {patient['Age']}, Lifestyle: Smoking: {patient['Lifestyle']['Smoking']}, "
                f"Alcohol: {patient['Lifestyle']['Alcohol']}, Physical Activity: {patient['Lifestyle']['PhysicalActivity']}"
            )
            output_data = (
                f"The patient should quit smoking, reduce alcohol consumption, and increase physical activity to a moderate level."
                if patient['Lifestyle']['Smoking'] == "Yes" or patient['Lifestyle']['Alcohol'] == "Yes" else
                "The patient's lifestyle is healthy, no significant changes needed."
            )
        elif "Assess the patient's risk level" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Age: {patient['Age']}, Blood Pressure: {patient['BloodPressure']}, "
                f"Medical History: {patient['Diagnosis']}"
            )
            output_data = (
                f"The patient has a high risk of complications due to {patient['Diagnosis'].lower()} and elevated blood pressure."
                if "Hypertension" in patient['Diagnosis'] else
                f"The patient's risk level is moderate based on their medical history."
            )
        elif "What symptoms" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Age: {patient['Age']}, Symptoms: {patient['Symptoms']}"
            )
            output_data = f"The patient reports {patient['Symptoms']}."
        elif "Explain the patient's condition" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Age: {patient['Age']}, Diagnosis: {patient['Diagnosis']}"
            )
            output_data = (
                f"{patient['PatientName']} has been diagnosed with {patient['Diagnosis'].lower()}, which requires ongoing management and treatment."
            )
        elif "What department" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Department: {patient['Department']}"
            )
            output_data = f"The patient is admitted to the {patient['Department']} department."
        elif "When was the patient last admitted" in instruction:
            input_data = (
                f"Name: {patient['PatientName']}, Admission Date: {patient['AdmissionDate']}"
            )
            output_data = f"The patient was last admitted on {patient['AdmissionDate']}."

        example = {
            "instruction": instruction,
            "input": input_data,
            "output": output_data
        }
        data.append(example)
    return data

In [7]:
llama_data = convert_to_llama_format(patient_df)

In [8]:
output_file = "mock_patient_data.jsonl"
with open(output_file, "w") as f:
    for entry in llama_data:
        f.write(json.dumps(entry) + "\n")

print(f"Dataset opgeslagen in {output_file}")

Dataset opgeslagen in mock_patient_data.jsonl


# LLaMA toepassen


## Huggingface 

Om Llama te gebruiken via Huggingface moet er toegang geven worden aan je huggingface account. Daarom moet er ingelogd worden op huggingface

In [9]:
load_dotenv("HF_TOKEN.env")
hf_token = os.getenv("HF_TOKEN")

### Model ophalen

Het model wordt opgehaald en gedownload

In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token=hf_token)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", token=hf_token)

### Data transformeren voor trainen

In [17]:
dataset = load_dataset("json", data_files={"train": "mock_patient_data.jsonl"})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 100
    })
})


In [19]:
print(dataset.column_names)
print(dataset['train'][0])  # Bekijk de eerste rij


{'train': ['instruction', 'input', 'output']}
{'instruction': "Summarize the patient's medical profile.", 'input': 'Name: Allison Hill, Age: 82, Gender: Male, Blood Pressure: 118/68, Medications: Lisinopril, Department: Psychiatry', 'output': 'Allison Hill is a 82-year-old male treated in the Psychiatry department. The patient has a blood pressure reading of 118/68 and is prescribed Lisinopril.'}


### Tokenizen

De tekst in iedere row wordt vervangen door cijfers (tokens) die een stuk tekst representeren. 

In [48]:
def preprocess_function(example):
    try:
        # Combine instruction and input
        input_text = example["instruction"] + " " + example["input"]
        # Tokenize input
        model_inputs = tokenizer(input_text, padding="max_length", truncation=True, max_length=512)

        # Tokenize output (labels)
        labels = tokenizer(example["output"], padding="max_length", truncation=True, max_length=512).input_ids
        model_inputs["labels"] = labels

        return model_inputs
    except Exception as e:
        print("Error processing:", example)
        print("Error message:", e)
        return {}


In [51]:
tokenized_datasets = dataset["train"].map(preprocess_function, batched=False)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [52]:
print(tokenized_datasets)
print(tokenized_datasets[0])


Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})
{'instruction': "Summarize the patient's medical profile.", 'input': 'Name: Allison Hill, Age: 82, Gender: Male, Blood Pressure: 118/68, Medications: Lisinopril, Department: Psychiatry', 'output': 'Allison Hill is a 82-year-old male treated in the Psychiatry department. The patient has a blood pressure reading of 118/68 and is prescribed Lisinopril.', 'input_ids': [128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004

### Llama Trainen

Na het tokenizen wordt Llama getraind

In [10]:
training_args = TrainingArguments(
    output_dir="./llama_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    max_steps=60,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=True,  # Mixed precision voor snelheid
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

trainer.train()

### Model opslaan

In [56]:
trainer.save_model("./llama_finetuned")
tokenizer.save_pretrained("/content/llama_finetuned")

### Model testen

In [None]:
patient_input = "Naam: John Doe, Leeftijd: 45, Geslacht: Man, Medicatie: Lisinopril, OpnameDatum: 2024-04-10, Afdeling: Cardiologie"

# Tokeniseer de invoer
inputs = tokenizer(patient_input, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Voorspel de output
outputs = model.generate(inputs["input_ids"], max_length=150)

# Decodeer de output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Voorspelde diagnose: {generated_text}")
