<a href="https://colab.research.google.com/github/safaabuzaid/segmentation-prompt-generator/blob/main/rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prompt Generator for Radiology Segmentation Tasks from Synthetic Clinical Notes

**Note:** This dataset is synthetically generated using ChatGPT for educational and demonstration purposes only. It does not represent real patient data and should not be used for clinical decision-making or real-world applications.
The goal is to create a prompt generator that turns clinical notes into precise prompts that can later be used for segmentation tasks.

In [None]:
import pandas as pd

# Load the clinical-notes table. The path is absolute under /content
# (presumably the Colab working directory -- adjust when running elsewhere).
# Later cells read the 'note' and 'prompt' columns of this frame.
df = pd.read_csv('/content/clinical_notes.csv')
# Print the column summary, then show the first rows as the cell's output.
df.info()
df.head()

In [None]:
from datasets import Dataset

# Create a dictionary holding the 'note' and 'prompt' columns
data_dict = {'note': df['note'], 'prompt': df['prompt']}

# Wrap the two columns in a `Dataset` (imported from `datasets`).
# NOTE(review): `dataset` is not referenced by the later cells visible here.
dataset = Dataset.from_dict(data_dict)

# Optional train/test split, currently disabled
#dataset = dataset.train_test_split(test_size=0.2)
# Last expression of the cell: displays the dataset summary
dataset

Output:


```
Dataset({
    features: ['note', 'prompt'],
    num_rows: 15
})
```


# Compute the embeddings for the notes

In [None]:
from sentence_transformers import SentenceTransformer
import pickle
from huggingface_hub import login


# Load the sentence-embedding model; the retrieval function reuses this
# same `model` to embed incoming queries.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode notes. Pass a plain list instead of the pandas Series: `encode` is
# documented for a string or a list of strings, and integer lookups on a
# Series go through its index labels, which can fail or misalign whenever the
# index is not the default 0..n-1 (e.g. after filtering rows).
embeddings = model.encode(df['note'].tolist(), convert_to_tensor=False)

# Save notes, prompts and embeddings together in one file so that the
# retrieval step can reload all three with a single pickle.load.
with open('embeddings.pkl', 'wb') as f:
    pickle.dump((df['note'].tolist(), df['prompt'].tolist(), embeddings), f)

# Create a RAG pipeline

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import torch

def retrieve_top_k(input_text, k=3):
  """Return the stored (note, prompt) pairs most similar to ``input_text``.

  The query is embedded with the module-level ``model`` and compared, by
  cosine similarity, against the embeddings saved in ``embeddings.pkl``.

  Args:
    input_text: The clinical note (string) to search with.
    k: Maximum number of pairs to return. Values <= 0 yield an empty list.

  Returns:
    A list of ``(note, prompt)`` tuples ordered from most to least similar.
    It holds at most ``k`` items (fewer if fewer notes are stored).
  """
  # Guard first: with k == 0 the slice ``[-k:]`` is ``[0:]`` and would select
  # *every* stored note, and negative k would produce arbitrary slices.
  if k <= 0:
    return []

  input_embedding = model.encode([input_text], convert_to_tensor=False)

  # NOTE: pickle.load can execute arbitrary code. Only load an
  # embeddings.pkl that was produced by this notebook / a trusted source.
  with open('embeddings.pkl', 'rb') as f:
    notes, prompts, embeddings = pickle.load(f)

  # Compute cosine similarity between the input and every stored note;
  # [0] picks the single row that corresponds to our one query.
  similarities = cosine_similarity(input_embedding, embeddings)[0]
  # Ascending argsort reversed -> best match first; keep the first k.
  top_k_idx = similarities.argsort()[::-1][:k]

  return [(notes[i], prompts[i]) for i in top_k_idx]


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model (both from the
# "google/flan-t5-base" checkpoint) that generate_prompt uses below.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model_gen = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

def generate_prompt(input_text, k=3, max_new_tokens=50):
  """Generate a segmentation prompt for a clinical note via few-shot retrieval.

  The ``k`` most similar stored notes (with their prompts) are retrieved and
  laid out as "Note: ... / Prompt: ..." examples, followed by the new note
  with an open "Prompt:" for the seq2seq model to complete.

  Args:
    input_text: The clinical note to turn into a prompt.
    k: Number of retrieved examples to place in the context (default 3).
    max_new_tokens: Upper bound on the number of generated tokens (default 50).

  Returns:
    The decoded model output as a string, with special tokens removed.
  """
  top_k = retrieve_top_k(input_text, k=k)

  context = "\n".join([f"Note: {n} \nPrompt: {p}" for n, p in top_k])
  query = f"Note: {input_text}\nPrompt:"

  # Without retrieved examples, send the bare query rather than a prompt
  # that starts with an empty line.
  full_prompt = f"{context}\n{query}" if context else query

  # NOTE(review): truncation presumably cuts from the end of the sequence
  # (default truncation side), i.e. the query itself would be dropped first
  # if the examples are very long -- TODO confirm / shorten context instead.
  inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True)
  # Forward the whole tokenizer output so the attention mask is passed along
  # with the input ids instead of being left for generate() to infer.
  output = model_gen.generate(**inputs, max_new_tokens=max_new_tokens)

  return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the pipeline

In [None]:
# Smoke test: print the note stored under index label 2, then the prompt
# generated for it.
# NOTE(review): this note is part of the corpus that was encoded and saved to
# embeddings.pkl, so retrieval presumably returns the note itself together
# with its reference prompt. This is a sanity check, not a held-out
# evaluation -- try an unseen note to judge generalization.
print(df['note'][2])
print(generate_prompt(df['note'][2]))

Output:
Ultrasound shows a solitary hepatic lesion measuring 2.1 cm; diagnosis: stage I hepatocellular carcinoma.
Segment tumor in liver based on stage I hepatocellular carcinoma