In [1]:
!pip install sentence-transformers numpy

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torch>=1.11.0
  Downloading torch-2.2.2-cp39-none-macosx_10_9_x86_64.whl (150.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.8/150.8 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.2/536.2 KB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: mpmath, sympy, torch, sentence-transformers
Successfully installed mp

In [4]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import os # Already imported, but good to keep track of dependencies

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the generated knowledge-graph (KG) sentences file, one sentence per line.
kg_sentences_file = 'kg_sentences.txt'


def load_kg_sentences(path):
    """Read KG sentences from a UTF-8 text file, one sentence per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped so that blank lines in the file never
    become empty "facts" that get embedded and retrieved later.

    Args:
        path (str): Path of the text file to read.

    Returns:
        list[str]: The non-empty, stripped lines in file order.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    with open(path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [sentence for sentence in stripped_lines if sentence]


# Always bind `kg_sentences`, so later cells can reference it even if loading fails.
kg_sentences = []
try:
    kg_sentences = load_kg_sentences(kg_sentences_file)
    print(f"Successfully loaded {len(kg_sentences)} sentences from '{kg_sentences_file}'")
except FileNotFoundError:
    print(f"Error: The file '{kg_sentences_file}' was not found. Please ensure it was created in the previous step.")
    kg_sentences = [] # Initialize as empty list to prevent further errors

Successfully loaded 19 sentences from 'kg_sentences.txt'


In [5]:
# Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model and bind it to
# the notebook-level name `model`, which later cells use to encode text.
# This might take a moment the first time the model is downloaded.
print("Loading SentenceTransformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

Loading SentenceTransformer model...
Model loaded successfully.


In [6]:
# Encode every KG sentence with the currently bound `model`. The result is kept in
# the notebook-level `kg_sentence_embeddings`, which the retrieval cells compare
# query embeddings against. It must be recomputed whenever `model` is rebound.
print("Embedding KG sentences...")
kg_sentence_embeddings = model.encode(kg_sentences, convert_to_tensor=True)
print(f"Generated embeddings for {len(kg_sentence_embeddings)} sentences.")


Embedding KG sentences...
Generated embeddings for 19 sentences.


In [7]:
def retrieve_relevant_kg_facts(query_text, top_k=5):
    """
    Retrieves the most relevant KG facts (sentences) based on a query text.

    The query is embedded with the notebook-level `model` and compared by
    cosine similarity against the notebook-level `kg_sentence_embeddings`.
    A header line and one "Score ... - Fact ..." line per retrieved fact are
    printed as a side effect.

    Args:
        query_text (str): The patient summary or specific question.
        top_k (int): The number of top relevant facts to retrieve. Values
            larger than the number of available facts are clamped to that
            number; zero or negative values retrieve nothing.

    Returns:
        list: Up to top_k KG sentences, ordered from most to least similar.
            Empty when no KG sentences are loaded or top_k <= 0.
    """
    print(f"\n--- Top {top_k} Retrieved Facts for Query: '{query_text}' ---")

    # Nothing to rank: avoid calling cos_sim on an empty corpus and avoid the
    # `[-0:]` slice pitfall, which would select every fact when top_k == 0.
    if top_k <= 0 or len(kg_sentences) == 0:
        return []

    # Embed the query text
    query_embedding = model.encode(query_text, convert_to_tensor=True)

    # Cosine similarity between the query and all KG sentence embeddings,
    # moved to a NumPy array once so it is not converted repeatedly.
    similarity = util.cos_sim(query_embedding, kg_sentence_embeddings)[0].cpu().numpy()

    # np.argpartition raises if kth is out of bounds, so never ask for more
    # results than there are scores.
    num_scores = similarity.shape[0]
    k = min(int(top_k), num_scores)

    if k < num_scores:
        # Unordered indices of the k largest scores.
        candidate_idx = np.argpartition(similarity, -k)[-k:]
    else:
        candidate_idx = np.arange(num_scores)

    # Order the selected indices by score, highest first.
    ranked_idx = candidate_idx[np.argsort(similarity[candidate_idx])[::-1]]

    retrieved_facts = []
    for idx in ranked_idx:
        score = float(similarity[idx])
        fact = kg_sentences[idx]
        retrieved_facts.append(fact)
        print(f"  Score: {score:.4f} - Fact: {fact}")

    return retrieved_facts


In [8]:
# Example free-text patient summary used as the retrieval query
# (described by the author as coming from a previous step).
patient_summary_inoperable = "A 75-year-old male with Stage I Non-Small Cell Lung Cancer. He has severe heart conditions, making him medically inoperable for surgery."

# Retrieve the 5 most similar KG facts; the function also prints each score and fact.
relevant_facts = retrieve_relevant_kg_facts(patient_summary_inoperable, top_k=5)

# Show the facts as the bullet list that would be placed into an LLM prompt.
print("\nRetrieved facts ready for LLM prompt:")
for fact in relevant_facts:
    print(f"- {fact}")


--- Top 5 Retrieved Facts for Query: 'A 75-year-old male with Stage I Non-Small Cell Lung Cancer. He has severe heart conditions, making him medically inoperable for surgery.' ---
  Score: 0.5950 - Fact: Non-Small Cell Lung Cancer Stage I is a type of Non-Small Cell Lung Cancer.
  Score: 0.5582 - Fact: Surgical Resection is the initial treatment for Non-Small Cell Lung Cancer Stage I.
  Score: 0.5247 - Fact: Diagnosis of Non-Small Cell Lung Cancer requires a Biopsy.
  Score: 0.4983 - Fact: Lobectomy is the preferred surgical approach for Non-Small Cell Lung Cancer Stage I.
  Score: 0.4830 - Fact: Segmentectomy or Wedge Resection is considered for patients with Small tumors or compromised lung function.

Retrieved facts ready for LLM prompt:
- Non-Small Cell Lung Cancer Stage I is a type of Non-Small Cell Lung Cancer.
- Surgical Resection is the initial treatment for Non-Small Cell Lung Cancer Stage I.
- Diagnosis of Non-Small Cell Lung Cancer requires a Biopsy.
- Lobectomy is the pref

In [14]:
# User question, hard-coded here (could be read with input() instead).
query = "How do you treat early-stage NSCLC in inoperable patients?"
# Encode the question with the current `model`; the next cell searches with
# this notebook-level `query_embedding`.
query_embedding = model.encode(query, convert_to_tensor=True)


In [15]:
# Search the KG sentence embeddings for the 5 entries closest to `query_embedding`.
# `[0]` selects the hit list for the first (and here only) query.
hits = util.semantic_search(query_embedding, kg_sentence_embeddings, top_k=5)[0]

# Map each hit's 'corpus_id' back to the sentence text at that index in `kg_sentences`.
top_matches = [kg_sentences[hit['corpus_id']] for hit in hits]

# Print the matches as a 1-based numbered list.
print("🔍 Top matching KG facts:")
for i, match in enumerate(top_matches):
    print(f"{i+1}. {match}")

🔍 Top matching KG facts:
1. Molecular Markers guide targeted therapy selection for Advanced NSCLC (and sometimes adjuvant).
2. Surgical Resection is a form of Local Therapy.
3. Surgical Resection is recommended for patients who are Medically Operable.
4. Stereotactic Ablative Radiotherapy (SABR/SBRT) is recommended for patients who are Medically Inoperable.
5. Local Control is a primary goal of Non-Small Cell Lung Cancer Stage I Treatment.


In [16]:
# Sample clinical questions used to spot-check retrieval quality.
queries = [
    "How do you treat Stage I lung cancer if the patient is medically inoperable?",
    "What should be done after resection in early-stage NSCLC?",
    "What tests are required to diagnose NSCLC?",
    "What's the preferred surgery for operable NSCLC Stage I?",
    "How is local control achieved in lung cancer treatment?"
]

# For each question: encode it with the current `model`, fetch the 3 closest KG
# sentences, and print them as a numbered list.
# Note: the loop rebinds the notebook-level `query_embedding`, `hits` and `top_matches`.
for q in queries:
    query_embedding = model.encode(q, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, kg_sentence_embeddings, top_k=3)[0]
    top_matches = [kg_sentences[hit['corpus_id']] for hit in hits]

    print(f"\n🔍 Query: {q}")
    for i, match in enumerate(top_matches):
        print(f"{i+1}. {match}")



🔍 Query: How do you treat Stage I lung cancer if the patient is medically inoperable?
1. Surgical Resection is the initial treatment for Non-Small Cell Lung Cancer Stage I.
2. Non-Small Cell Lung Cancer Stage I is a type of Non-Small Cell Lung Cancer.
3. Lobectomy is the preferred surgical approach for Non-Small Cell Lung Cancer Stage I.

🔍 Query: What should be done after resection in early-stage NSCLC?
1. Molecular Markers guide targeted therapy selection for Advanced NSCLC (and sometimes adjuvant).
2. Surgical Resection is recommended for patients who are Medically Operable.
3. Surgical Resection is a form of Local Therapy.

🔍 Query: What tests are required to diagnose NSCLC?
1. Molecular Markers guide targeted therapy selection for Advanced NSCLC (and sometimes adjuvant).
2. Non-Small Cell Lung Cancer requires testing for Molecular Markers (e.g., EGFR, ALK, PD-L1).
3. Multidisciplinary Evaluation is recommended for All Non-Small Cell Lung Cancer Patients.

🔍 Query: What's the pref

In [None]:
# Rebind the notebook-level `model` to 'all-mpnet-base-v2'. The KG sentence
# embeddings are re-computed with this model in the next cell before searching.
model = SentenceTransformer('all-mpnet-base-v2')  # More accurate (author's note; not measured in this notebook)

In [19]:
# Re-encode the KG sentences with the currently bound `model`, overwriting the
# earlier `kg_sentence_embeddings` so that queries and corpus share one model.
print("Embedding KG sentences...")
kg_sentence_embeddings = model.encode(kg_sentences, convert_to_tensor=True)
print(f"Generated embeddings for {len(kg_sentence_embeddings)} sentences.")


Embedding KG sentences...
Generated embeddings for 19 sentences.


In [20]:
# Same sample questions as the earlier spot-check, re-run against the current
# `model` and `kg_sentence_embeddings` so the two sets of results can be compared.
queries = [
    "How do you treat Stage I lung cancer if the patient is medically inoperable?",
    "What should be done after resection in early-stage NSCLC?",
    "What tests are required to diagnose NSCLC?",
    "What's the preferred surgery for operable NSCLC Stage I?",
    "How is local control achieved in lung cancer treatment?"
]

# For each question: encode it, fetch the 3 closest KG sentences, and print them
# as a numbered list.
for q in queries:
    query_embedding = model.encode(q, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, kg_sentence_embeddings, top_k=3)[0]
    top_matches = [kg_sentences[hit['corpus_id']] for hit in hits]

    print(f"\n🔍 Query: {q}")
    for i, match in enumerate(top_matches):
        print(f"{i+1}. {match}")


🔍 Query: How do you treat Stage I lung cancer if the patient is medically inoperable?
1. Surgical Resection is the initial treatment for Non-Small Cell Lung Cancer Stage I.
2. Local Control is a primary goal of Non-Small Cell Lung Cancer Stage I Treatment.
3. Lobectomy is the preferred surgical approach for Non-Small Cell Lung Cancer Stage I.

🔍 Query: What should be done after resection in early-stage NSCLC?
1. Surgical Resection is recommended for patients who are Medically Operable.
2. Adjuvant Chemotherapy may be considered after Surgical Resection.
3. Surgical Resection aims to achieve Local Control.

🔍 Query: What tests are required to diagnose NSCLC?
1. Molecular Markers guide targeted therapy selection for Advanced NSCLC (and sometimes adjuvant).
2. Multidisciplinary Evaluation is recommended for All Non-Small Cell Lung Cancer Patients.
3. Non-Small Cell Lung Cancer requires testing for Molecular Markers (e.g., EGFR, ALK, PD-L1).

🔍 Query: What's the preferred surgery for oper

In [None]:
# Never hard-code the API key in the notebook. If OPENAI_API_KEY is not already
# set in the environment, prompt for it without echoing it to the cell output.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [None]:
from openai import OpenAI
import os

import openai

# Read the OpenAI API key from the OPENAI_API_KEY environment variable.
# `api_key` is None when the variable is not set.
api_key = os.getenv("OPENAI_API_KEY")

In [None]:
from openai import OpenAI

# Create the notebook-level OpenAI client used by the grounding cells below.
# The key is taken from the OPENAI_API_KEY environment variable rather than
# being written into the notebook, where it would be saved and shared.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Smoke test: one small request to confirm the client and key work.
response = client.responses.create(
    model="gpt-4o-mini",
    input="write a haiku about ai",
    store=True,
)

print(response.output_text)


Lines of code entwined,  
Thoughts emerge from silent dreams,  
Mind of circuits shines.


In [48]:

def answer_with_grounding(question, top_facts, llm_model="gpt-4o-mini", llm_client=None):
    """Answer a question with an LLM, restricted to the supplied KG facts.

    The facts are rendered as a "- fact" bullet list and placed in a prompt
    that tells the model to use only those facts and to say so when the
    answer is not present.

    Args:
        question (str): The user's question.
        top_facts (iterable of str): Facts to ground the answer in.
        llm_model (str): Model name passed to `responses.create`.
            Defaults to "gpt-4o-mini".
        llm_client: Object exposing `responses.create(model=..., input=...)`
            whose result has an `output_text` attribute. When None, the
            notebook-level `client` is used.

    Returns:
        str: The `output_text` of the model response.
    """
    # Format the context block from retrieved KG facts
    fact_block = "\n".join(f"- {fact}" for fact in top_facts)

    # Construct a strong, role-anchored prompt
    prompt = f"""You are a clinical assistant trained in early-stage lung cancer treatment.
Use the facts below to answer the user's question truthfully and clearly.
Only use information from the provided facts. If the answer is not present, say so.

Facts:
{fact_block}

User's Question:
{question}

Answer:"""

    # Fall back to the notebook-level client only when none was injected, so the
    # global is looked up lazily and is not required when a client is passed in.
    active_client = llm_client if llm_client is not None else client

    # Make the API call to GPT
    response = active_client.responses.create(
        model=llm_model,
        input=prompt,
    )

    return response.output_text


In [50]:
# Clinical question plus a hand-written list of supporting facts. The facts are
# hard-coded in this cell rather than taken from the retrieval cells' output.
question = "How do you treat Stage I lung cancer if the patient is medically inoperable?"
top_facts = [
    "Stereotactic Ablative Radiotherapy (SABR/SBRT) is recommended for patients who are Medically Inoperable.",
    "Stereotactic Ablative Radiotherapy (SABR/SBRT) is the initial treatment for Non-Small Cell Lung Cancer Stage I.",
    "SABR aims to achieve Local Control."
]

# Render the facts as a "- fact" bullet list, one per line.
fact_block = "\n".join(f"- {fact}" for fact in top_facts)

# Build the grounding prompt inline. Its wording differs from the prompt used by
# answer_with_grounding (this one asks for a literal "I don't know." fallback).
prompt = f"""You are a clinical assistant trained in early-stage lung cancer.
Use only the facts below to answer the user's question. If the answer is not present in the facts, say "I don't know."

Facts:
{fact_block}

Question:
{question}

Answer:"""

# Send the prompt through the Responses API (`client.responses.create`) using
# the notebook-level `client`.
response = client.responses.create(
    model="gpt-4o-mini",  # or gpt-3.5-turbo if you're not using 4o
    input=prompt,
    store=True  # store=True is optional but useful for tracking
)

# Output the answer
print(response.output_text)

For Stage I lung cancer in a patient who is medically inoperable, the treatment recommended is Stereotactic Ablative Radiotherapy (SABR/SBRT).


In [51]:
# Out-of-domain check: the question is about breast cancer, while the supplied
# facts only mention lung cancer / SABR. This shows how answer_with_grounding
# responds when the facts do not cover the question.
question = "How do you treat Stage I breast cancer if the patient is medically inoperable?"
top_facts = [
    "Stereotactic Ablative Radiotherapy (SABR/SBRT) is recommended for patients who are Medically Inoperable.",
    "Stereotactic Ablative Radiotherapy (SABR/SBRT) is the initial treatment for Non-Small Cell Lung Cancer Stage I.",
    "SABR aims to achieve Local Control."
]

print(answer_with_grounding(question, top_facts))


I'm sorry, but I do not have information on the treatment of Stage I breast cancer. I can only provide information related to lung cancer treatment.
