<a href="https://colab.research.google.com/github/safaabuzaid/segmentation-prompt-generator/blob/main/Prompt_driven.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Prompt Generator for Radiology Segmentation tasks from Synthetic Clinical Notes**

**Note:** This dataset is synthetically generated using ChatGPT for educational and demonstration purposes only. It does not represent real patient data and should not be used for clinical decision-making or real-world applications.  
The goal is to create a prompt generator that turns clinical notes into precise prompts that can later be used for segmentation tasks.

In [None]:
import pandas as pd

# Read the synthetic clinical-notes CSV, then show its schema and first rows.
notes_csv = '/content/clinical_notes.csv'
df = pd.read_csv(notes_csv)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   note       15 non-null     object
 1   organ      15 non-null     object
 2   diagnosis  15 non-null     object
 3   stage      15 non-null     object
 4   prompt     15 non-null     object
dtypes: object(5)
memory usage: 732.0+ bytes


Unnamed: 0,note,organ,diagnosis,stage,prompt
0,CT scan reveals a 3.2 cm irregular mass in the...,left lung,adenocarcinoma,stage II,Segment tumor in left lung based on stage II a...
1,MRI brain indicates a ring-enhancing lesion in...,right frontal lobe,glioblastoma multiforme,ungraded,Segment tumor in right frontal lobe based on g...
2,Ultrasound shows a solitary hepatic lesion mea...,liver,hepatocellular carcinoma,stage I,Segment tumor in liver based on stage I hepato...
3,CT chest shows multiple nodules in the right l...,right lung,metastatic carcinoma,ungraded,Segment nodules in right lung based on metasta...
4,MRI demonstrates a low-grade astrocytoma in th...,left temporal lobe,astrocytoma,low-grade,Segment tumor in left temporal lobe based on l...


In [None]:
# Per-column count of missing values.
df.isna().sum()

Unnamed: 0,0
note,0
organ,0
diagnosis,0
stage,0
prompt,0


In [None]:
# Illustrative templates for the (input, target) text pair fed to the model.
# The actual strings are assembled per example in preprocess_function.
input_text, target_text = "Clinical Note: [note]", "Prompt: [prompt]"

In [None]:
from datasets import Dataset

# Build a Hugging Face Dataset holding only the (note, prompt) pairs.
# Plain Python lists are passed so the column types do not depend on how
# `datasets` converts pandas Series.
data_dict = {'note': df['note'].tolist(), 'prompt': df['prompt'].tolist()}

dataset = Dataset.from_dict(data_dict)

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across "Restart & Run All", so metrics are comparable between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset


DatasetDict({
    train: Dataset({
        features: ['note', 'prompt'],
        num_rows: 12
    })
    test: Dataset({
        features: ['note', 'prompt'],
        num_rows: 3
    })
})

In [None]:
# Show the first raw note followed by its reference segmentation prompt.
for column in ('note', 'prompt'):
    print(df[column][0])

CT scan reveals a 3.2 cm irregular mass in the upper lobe of the left lung; biopsy confirms stage II adenocarcinoma.
Segment tumor in left lung based on stage II adenocarcinoma


# Preprocessing the data



In [None]:
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, pipeline

# Opens the Hugging Face login widget in the notebook.
notebook_login()

# Token limits applied when truncating the source notes and target prompts.
max_input_length, max_target_length = 512, 128

# Tokenizer for the google/flan-t5-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

def preprocess_function(examples):
    """Tokenize a batch of clinical notes and their target prompts.

    Args:
        examples: Batch mapping with "note" and "prompt" keys, each holding a
            list of strings (the layout passed by `Dataset.map(batched=True)`).

    Returns:
        The tokenizer output for the instruction-prefixed notes, with the
        token ids of the "Prompt: "-prefixed targets stored under "labels".
    """
    inputs = [
        "Generate a segmentation prompt from the following Clinical Note: " + note
        for note in examples["note"]
    ]
    targets = ["Prompt: " + prompt for prompt in examples["prompt"]]

    # Only truncate here; do not pad. Padding labels to max_length with the pad
    # token id makes the loss count every padding position. Leaving sequences
    # unpadded lets DataCollatorForSeq2Seq pad each batch dynamically and fill
    # label padding with its ignore index instead.
    model_inputs = tokenizer(inputs, truncation=True, max_length=max_input_length)

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, truncation=True, max_length=max_target_length)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches, then display the resulting DatasetDict.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)
tokenized_dataset


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]



Map:   0%|          | 0/3 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['note', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12
    })
    test: Dataset({
        features: ['note', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})

# Load the Model

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained FLAN-T5 base checkpoint as a sequence-to-sequence model.
checkpoint = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Set Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # The string "none" disables every logging integration. The Python value
    # None does not: it falls back to the library default, which is why the
    # run stopped to ask for a Weights & Biases API key.
    report_to="none",
    output_dir="./finetuned-flan-t5",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    num_train_epochs=5,
    predict_with_generate=True,  # use generate() when producing predictions
    fp16=False,
)

# Fine-Tuning the Model

In [None]:


from transformers import Seq2SeqTrainer , DataCollatorForSeq2Seq

# Collator that assembles the tokenized examples into batches for the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire the model, data splits, tokenizer and collator into one trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)

# Run the fine-tuning loop configured in training_args.
trainer.train()

  trainer = Seq2SeqTrainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

In [None]:
# Score the fine-tuned model on the held-out test split.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

{'eval_loss': 31.465248107910156,
 'eval_runtime': 0.2581,
 'eval_samples_per_second': 11.624,
 'eval_steps_per_second': 3.875,
 'epoch': 5.0}

In [None]:
# Build the same instruction-prefixed input format used during fine-tuning.
# NOTE: row 0 of df may have landed in the training split, so this is a smoke
# test of the pipeline rather than a held-out evaluation.
input_text = "Generate a segmentation prompt from the following Clinical Note: " + df['note'][0]

# A single sequence needs no padding; only truncate to the input limit used
# during preprocessing.
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=max_input_length,
).to(model.device)

# Switch to inference mode explicitly so dropout is disabled regardless of
# which cells were run before this one.
model.eval()
generated_ids = model.generate(**inputs, max_new_tokens=50, num_beams=4, early_stopping=True)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Training targets carry a "Prompt: " prefix; strip it from the output if the
# model reproduced it (case-insensitive match).
target_prefix = "prompt:"
if generated_text.lower().startswith(target_prefix):
    generated_text = generated_text[len(target_prefix):].strip()

print(input_text)
print(generated_text)

Generate a segmentation prompt from the following Clinical Note: CT scan reveals a 3.2 cm irregular mass in the upper lobe of the left lung; biopsy confirms stage II adenocarcinoma.
adenocarcinoma
