In [None]:
# =============================================================================
# GEMINI JUDGING
# =============================================================================

# Prompt template for the fluency judge.
# The `{generated_soap}` placeholder is filled in with str.format by
# judge_fluency. The prompt instructs the model to answer with a single
# integer between 0 and 10, which parse_score then extracts from the reply.
FLUENCY_PROMPT = """
You are an experienced physician and medical editor.

Evaluate the FLUENCY of the following SOAP note.

Consider:
- grammatical correctness
- clarity and coherence
- professional medical tone
- readability for clinical documentation

Score on a scale from 0 to 10:
- 0 = incoherent, ungrammatical, unusable
- 10 = perfectly fluent, natural, indistinguishable from a human physician

SOAP NOTE:
<<<
{generated_soap}
>>>

Return ONLY a single integer score between 0 and 10.
"""



# Prompt template for the consistency judge.
# The `{reference_soap}` and `{generated_soap}` placeholders are filled in with
# str.format by judge_consistency. The prompt instructs the model to answer
# with a single integer between 0 and 10 comparing the generated note against
# the reference note.
CONSISTENCY_PROMPT = """
You are an experienced physician.

Evaluate the CONSISTENCY of the generated SOAP note
with respect to the reference physician SOAP note.

Consider:
- whether key clinical facts match
- whether important information is missing
- whether incorrect or fabricated details are introduced
- whether assessment and plan align with the reference

Score on a scale from 0 to 10:
- 0 = completely inconsistent or incorrect
- 10 = fully consistent and clinically aligned

REFERENCE SOAP:
<<<
{reference_soap}
>>>

GENERATED SOAP:
<<<
{generated_soap}
>>>

Return ONLY a single integer score between 0 and 10.
"""

In [None]:
!pip install -q google-generativeai

import os
import re
os.environ["GEMINI_API_KEY"] = "AIzaSyDmKkVhVFIuBcaxPdBpiZgOBqgmmVPbE_o"



import google.generativeai as genai
import os

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

GEMINI_MODEL = "gemini-2.0-flash-lite"
judge_model = genai.GenerativeModel(GEMINI_MODEL)

def parse_score(text: str, default: int = 0) -> int:
    """Extract the first standalone integer between 0 and 10 from ``text``.

    Args:
        text: Raw reply from the judge model. ``None`` or an empty string is
            treated as unparseable instead of raising.
        default: Value returned when no score can be found. Defaults to 0 to
            preserve the original behaviour; pass a sentinel (e.g. ``-1``) to
            tell a parse failure apart from a genuine score of 0.

    Returns:
        The first whole-number token in the range 0-10, or ``default`` when
        the text contains none.
    """
    if not text:
        return default
    # \b on both sides rejects digits embedded in longer numbers such as "15"
    # or "100", so only a free-standing 0-10 token is accepted.
    match = re.search(r"\b([0-9]|10)\b", text)
    return int(match.group(1)) if match else default


In [None]:
def judge_fluency(generated_soap: str, model: str = GEMINI_MODEL) -> int:
    """Ask the Gemini judge to rate the fluency of a generated SOAP note.

    Args:
        generated_soap: Text of the SOAP note to evaluate.
        model: Name of the Gemini model to judge with. The default reuses the
            shared ``judge_model`` instance; any other name gets its own
            ``genai.GenerativeModel``.

    Returns:
        The integer that ``parse_score`` extracts from the model's reply.
    """
    prompt = FLUENCY_PROMPT.format(generated_soap=generated_soap)

    # Honour the `model` argument: previously it was accepted but ignored and
    # the global judge_model was always used.
    judge = judge_model if model == GEMINI_MODEL else genai.GenerativeModel(model)

    response = judge.generate_content(
        prompt,
        generation_config={
            # Temperature 0 and a tiny token budget: the reply should be a
            # bare integer score.
            "temperature": 0.0,
            "max_output_tokens": 5,
        },
    )

    output = response.text.strip()
    return parse_score(output)



In [None]:
def judge_consistency(
    reference_soap: str,
    generated_soap: str,
    model: str = GEMINI_MODEL,
) -> int:
    """Ask the Gemini judge to rate a generated SOAP note against a reference.

    Args:
        reference_soap: Text of the reference SOAP note.
        generated_soap: Text of the generated SOAP note to evaluate.
        model: Name of the Gemini model to judge with. The default reuses the
            shared ``judge_model`` instance; any other name gets its own
            ``genai.GenerativeModel``.

    Returns:
        The integer that ``parse_score`` extracts from the model's reply.
    """
    prompt = CONSISTENCY_PROMPT.format(
        reference_soap=reference_soap,
        generated_soap=generated_soap,
    )

    # Honour the `model` argument: previously it was accepted but ignored and
    # the global judge_model was always used.
    judge = judge_model if model == GEMINI_MODEL else genai.GenerativeModel(model)

    response = judge.generate_content(
        prompt,
        generation_config={
            # Temperature 0 and a tiny token budget: the reply should be a
            # bare integer score.
            "temperature": 0.0,
            "max_output_tokens": 5,
        },
    )

    output = response.text.strip()
    return parse_score(output)

In [None]:
fluency_scores = []
consistency_scores = []
MAX_JUDGE_SAMPLES = 20


def _mean_std(scores):
    """Return (mean, std) of ``scores`` rounded to 3 decimals, or (nan, nan) if empty."""
    if not scores:
        # np.mean / np.std on an empty list emit RuntimeWarnings and yield NaN;
        # return NaN explicitly so an empty run is reported quietly.
        return float("nan"), float("nan")
    return round(float(np.mean(scores)), 3), round(float(np.std(scores)), 3)


# Judge at most MAX_JUDGE_SAMPLES (reference, prediction) pairs.
judge_pairs = list(zip(refs, preds))[:MAX_JUDGE_SAMPLES]

for i, (ref, pred) in enumerate(judge_pairs):
    try:
        fluency = judge_fluency(pred)
        consistency = judge_consistency(ref, pred)
    except Exception as e:
        # Stop at the first failure; the samples judged so far are still summarised.
        print(f"[Judge failed at sample {i}]: {e}")
        break

    # Append only after both judges succeeded so the two lists stay aligned.
    fluency_scores.append(fluency)
    consistency_scores.append(consistency)


fluency_mean, fluency_std = _mean_std(fluency_scores)
consistency_mean, consistency_std = _mean_std(consistency_scores)

results = {
    "fluency_mean": fluency_mean,
    "fluency_std": fluency_std,
    "consistency_mean": consistency_mean,
    "consistency_std": consistency_std,
    # Make early termination visible: the statistics above cover only
    # n_judged of the n_requested samples.
    "n_judged": len(fluency_scores),
    "n_requested": len(judge_pairs),
}

print(results)