In [1]:
## Cell 1: Install Dependencies
!pip install --upgrade pip
!pip install transformers accelerate huggingface_hub
!pip install torch    # if not already installed


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cud

In [2]:
## Cell 2: Imports & GPU Check
import os
import torch
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Verify T4 GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cuda


In [3]:
## Cell 3: Load & Inspect Data
df = pd.read_csv('open_ave_data.csv')  # adjust path as needed
print("Columns:", df.columns.tolist())
df.head()


Columns: ['Unnamed: 0', 'ReportText', 'findings', 'clinicaldata', 'ExamName', 'impression']


Unnamed: 0.1,Unnamed: 0,ReportText,findings,clinicaldata,ExamName,impression
0,0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormali...
2,2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,3,Exam: - CHEST-PORTABLE History: Chest pain Com...,Findings: Heart size appears normal. Lungs cle...,History: Chest pain \n\n,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Lungs clear
4,4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.


In [4]:
## Cell 4: Data Cleaning
# 1. Drop duplicates & fully empty rows
df_clean = df.drop_duplicates().dropna(how='all').copy()

# 2. Trim whitespace in string cols
for c in df_clean.select_dtypes(include='object').columns:
    df_clean[c] = df_clean[c].str.strip()

# 3. Fill numeric NaNs with median
for c in df_clean.select_dtypes(include='number').columns:
    df_clean[c] = df_clean[c].fillna(df_clean[c].median())

print("Cleaned shape:", df_clean.shape)
df_clean.head()


Cleaned shape: (954, 6)


Unnamed: 0.1,Unnamed: 0,ReportText,findings,clinicaldata,ExamName,impression
0,0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough.,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN.,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormality.
2,2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN.,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,3,Exam: - CHEST-PORTABLE History: Chest pain Com...,Findings: Heart size appears normal. Lungs clear.,History: Chest pain,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Lungs clear
4,4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.


In [12]:
# ── Cell 5: Load Qwen-Instruct as a CausalLM on GPU ──
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# 1. Load the tokenizer (allow custom code)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# 2. Load the model as a CausalLM (half-precision, device_map for TPU/GPU)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

# 3. Build a text-generation pipeline on device 0
extractor = pipeline(
    "text-generation",      # causal generation
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,     # how many tokens to generate
    temperature=0.0,        # deterministic
    do_sample=False,         # greedy decoding
    return_full_text=False

)

print("✅ Qwen-Instruct loaded as CausalLM on", "cuda" if torch.cuda.is_available() else "cpu")


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Qwen-Instruct loaded as CausalLM on cuda


In [22]:
# ── Cell 6: Prompt Template & Few-Shot Examples (with escaped braces) ──

import json

# 1. Define the exact JSON schema
json_schema = {
    "Title": "string",
    "Clinical Indication": "string",
    "Findings": "string",
    "Impression": "string",
    "Recommendations": "string"
}

# 2. Two concrete few-shot examples (Instruction/Input/Output)
few_shot = """
### Example 1

## Instruction
Extract the five fields from the radiology report and output exactly one JSON object—no commentary, no code.

## Input
EXAM: CHEST XRAY; CLINICAL HISTORY: Shortness of breath; FINDINGS: Mild cardiomegaly; IMPRESSION: Mild enlargement of the heart; RECOMMENDATIONS: Follow-up in 6 months.

## Output
{"Title":"Chest X-ray","Clinical Indication":"Shortness of breath","Findings":"Mild cardiomegaly","Impression":"Mild enlargement of the heart","Recommendations":"Follow-up in 6 months"}

### Example 2

## Instruction
Extract the five fields from the radiology report and output exactly one JSON object—no commentary, no code.

## Input
EXAM: ABDOMEN CT; CLINICAL HISTORY: Abdominal pain; FINDINGS: No free fluid or masses; IMPRESSION: Normal CT abdomen; RECOMMENDATIONS: None.

## Output
{"Title":"Abdomen CT","Clinical Indication":"Abdominal pain","Findings":"No free fluid or masses","Impression":"Normal CT abdomen","Recommendations":""}
"""

# 3. Master prompt template with literal {report}
prompt_template = f"""## Instruction
You are an expert medical data-extraction assistant. Extract the following five fields from a free-text radiology report and output EXACTLY one JSON object—no commentary, no code.

## JSON Schema
{json.dumps(json_schema, indent=2)}

{few_shot}

## Input
{{report}}

## Output"""


In [23]:
import json

# 1. Pick a report
report_col = "ReportText" if "ReportText" in df_clean.columns else df_clean.columns[0]
sample = df_clean[report_col].iloc[0]

# 2. Build the prompt with our Instruction / Input / Output template
prompt = prompt_template.replace("{report}", sample)
print("----- Prompt -----\n", prompt)

# 3. Call the model (only supported call-time flags)
resp = extractor(
    prompt,
    max_new_tokens=512,
    do_sample=False,
    return_full_text=False
)[0]["generated_text"].strip()

print("\n----- Raw Model Output -----\n", resp)

# 4. Safely isolate and parse the JSON
start, end = resp.find("{"), resp.rfind("}")
json_str = resp[start:end+1] if (start != -1 and end != -1) else resp

try:
    parsed = json.loads(json_str)
    print("\nParsed JSON:\n", json.dumps(parsed, indent=2))
except json.JSONDecodeError as e:
    print("\nJSON parse error:", e)
    print("Raw JSON fragment:", json_str)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


----- Prompt -----
 ## Instruction
You are an expert medical data-extraction assistant. Extract the following five fields from a free-text radiology report and output EXACTLY one JSON object—no commentary, no code.

## JSON Schema
{
  "Title": "string",
  "Clinical Indication": "string",
  "Findings": "string",
  "Impression": "string",
  "Recommendations": "string"
}


### Example 1

## Instruction
Extract the five fields from the radiology report and output exactly one JSON object—no commentary, no code.

## Input
EXAM: CHEST XRAY; CLINICAL HISTORY: Shortness of breath; FINDINGS: Mild cardiomegaly; IMPRESSION: Mild enlargement of the heart; RECOMMENDATIONS: Follow-up in 6 months.

## Output
{"Title":"Chest X-ray","Clinical Indication":"Shortness of breath","Findings":"Mild cardiomegaly","Impression":"Mild enlargement of the heart","Recommendations":"Follow-up in 6 months"}

### Example 2

## Instruction
Extract the five fields from the radiology report and output exactly one JSON obj

In [29]:
# ── Cell 8: Sample-based Extraction & Eval Prep ──

import json
import pandas as pd
from difflib import SequenceMatcher
from sklearn.metrics import accuracy_score

# 1. Sample a random subset (e.g. 200 reports)
N = 200
sample_df = df_clean.sample(n=N, random_state=42).reset_index(drop=True)

# 2. Extract on that subset
report_col = "ReportText" if "ReportText" in df_clean.columns else df_clean.columns[0]
outputs = []
for rpt in sample_df[report_col].astype(str):
    # build prompt
    prompt = prompt_template.replace("{report}", rpt)

    # run the model
    out = extractor(
        prompt,
        max_new_tokens=512,
        do_sample=False,
        return_full_text=False
    )
    # out may be a list of strings or list of dicts or nested; normalize to string:
    if isinstance(out, list):
        raw = out[0] if isinstance(out[0], str) else (
              out[0]["generated_text"] if isinstance(out[0], dict) else str(out[0])
        )
    elif isinstance(out, dict):
        raw = out.get("generated_text", "")
    else:
        raw = str(out)

    # isolate JSON
    start, end = raw.find("{"), raw.rfind("}")
    js = raw[start:end+1] if (start!=-1 and end!=-1) else raw

    # parse or flag
    try:
        parsed = json.loads(js)
    except json.JSONDecodeError:
        parsed = {"error":"parse_failed", "raw": raw}

    outputs.append(parsed)

# 3. Build predictions DataFrame
preds_df = pd.DataFrame(outputs)

# 4. (Optional) peek at inputs vs outputs
display(pd.concat([
    sample_df[[report_col]].rename(columns={report_col:"Input Report"}),
    preds_df.rename(columns=lambda c: f"Pred_{c}")
], axis=1).head())


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

Unnamed: 0,Input Report,Pred_error,Pred_raw,Pred_Title,Pred_Clinical Indication,Pred_Findings,Pred_Impression,Pred_Recommendations,Pred_Technician,Pred_Technique,...,Pred_Infiltrate,Pred_Effusion,Pred_Masses,Pred_Cardiomediastinal Silhouette,Pred_Regional Osseous Structures,Pred_Signed By,Pred_PersonalName,Pred_SignedBy,Pred_Date/Time,Pred_Complaints
0,CHEST PA AND LATERAL CLINICAL INFORMATION: Che...,parse_failed,"\n{""Title"":""Chest Tightness and Shortness of B...",,,,,,,,...,,,,,,,,,,
1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 02/20/2016 ...,,,Chest Radiography,CHEST PAIN,,NORMAL SINGLE VIEW CHEST,,,,...,,,,,,,,,,
2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/18/2021 ...,,,Chest Radiography,"Cough, shortness of breath",Lungs/Pleura: No focal opacities evident.,Normal single view chest,,,,...,,,,,,,,,,
3,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/28/2021 ...,,,Radiography Report,Shortness of breath,COUGH,Normal single view chest,,,,...,,,,,,,,,,
4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/12/2021 ...,,,Chest Radiography,Shortness of breath,Lungs/Pleura: Mild pulmonary vascular congestion,Mild pulmonary edema,,,,...,,,,,,,,,,


In [31]:
print("Ground truth columns:", sample_df.columns.tolist())


Ground truth columns: ['Unnamed: 0', 'ReportText', 'findings', 'clinicaldata', 'ExamName', 'impression']


In [35]:
# ── Cell 10: Simple String‐Extraction Evaluation ──

!pip install -q evaluate rouge-score

import pandas as pd
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import evaluate

# Fields to evaluate
fields = ["Title", "Clinical Indication", "Findings", "Impression", "Recommendations"]

# Prepare ground/preds (assuming 'ground' and 'preds' from Cell 9 are in scope)
# ground[f], preds[f] are lists/Series of strings

# 1. Exact‐match accuracy
print("🔹 Exact-Match Accuracy")
for f in fields:
    gt = ground[f].astype(str).str.strip()
    pr = preds[f].astype(str).str.strip()
    acc = accuracy_score(gt, pr)
    print(f"  {f.ljust(20)} : {acc:.2%}")

# 2. Average SequenceMatcher similarity
print("\n🔹 Avg Character-Similarity")
for f in fields:
    ratios = [
        SequenceMatcher(None, g, p).ratio()
        for g,p in zip(ground[f].astype(str), preds[f].astype(str))
    ]
    print(f"  {f.ljust(20)} : {sum(ratios)/len(ratios):.2f}")

# 3. ROUGE scores
rouge = evaluate.load("rouge")
print("\n🔹 ROUGE Scores")
for f in fields:
    results = rouge.compute(
        predictions=preds[f].astype(str).tolist(),
        references= ground[f].astype(str).tolist()
    )
    print(f"  {f}: ROUGE-1={results['rouge1']:.3f}, ROUGE-2={results['rouge2']:.3f}, ROUGE-L={results['rougeL']:.3f}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
🔹 Exact-Match Accuracy
  Title                : 0.00%
  Clinical Indication  : 0.00%
  Findings             : 0.00%
  Impression           : 0.00%
  Recommendations      : 79.50%

🔹 Avg Character-Similarity
  Title                : 0.16
  Clinical Indication  : 0.43
  Findings             : 0.23
  Impression           : 0.50
  Recommendations      : 0.80


Downloading builder script: 0.00B [00:00, ?B/s]


🔹 ROUGE Scores
  Title: ROUGE-1=0.235, ROUGE-2=0.157, ROUGE-L=0.231
  Clinical Indication: ROUGE-1=0.515, ROUGE-2=0.355, ROUGE-L=0.515
  Findings: ROUGE-1=0.243, ROUGE-2=0.203, ROUGE-L=0.243
  Impression: ROUGE-1=0.571, ROUGE-2=0.523, ROUGE-L=0.570
  Recommendations: ROUGE-1=0.000, ROUGE-2=0.000, ROUGE-L=0.000


In [36]:
# ── Cell 11: Token-Level Precision, Recall, F1 ──

from statistics import mean

# 1. Helper to compute token‐level P/R/F1
def token_metrics(ref: str, pred: str):
    ref_tokens  = ref.split()
    pred_tokens = pred.split()
    # true positives = overlapping tokens
    tp = len(set(ref_tokens) & set(pred_tokens))
    precision = tp / len(pred_tokens) if pred_tokens else 0.0
    recall    = tp / len(ref_tokens)  if ref_tokens  else 0.0
    f1        = (2 * precision * recall / (precision + recall)
                 if (precision + recall) > 0 else 0.0)
    return precision, recall, f1

# 2. Compute per‐field metrics over your sample
token_results = {}
for f in fields:
    precs, recs, f1s = [], [], []
    for ref, pred in zip(ground[f].astype(str).str.strip(),
                         preds[f].astype(str).str.strip()):
        p, r, f1 = token_metrics(ref, pred)
        precs.append(p); recs.append(r); f1s.append(f1)
    token_results[f] = {
        "precision": mean(precs),
        "recall":    mean(recs),
        "f1":        mean(f1s)
    }

# 3. Display
print("🔹 Token‐Level Precision / Recall / F1 (macro‐averaged):")
for f, m in token_results.items():
    print(f"  {f.ljust(20)} : P={m['precision']:.2%}, R={m['recall']:.2%}, F1={m['f1']:.2%}")


🔹 Token‐Level Precision / Recall / F1 (macro‐averaged):
  Title                : P=20.73%, R=7.54%, F1=10.71%
  Clinical Indication  : P=42.57%, R=23.73%, F1=29.85%
  Findings             : P=49.29%, R=13.22%, F1=20.16%
  Impression           : P=55.39%, R=32.17%, F1=38.53%
  Recommendations      : P=0.00%, R=0.00%, F1=0.00%
