In [1]:
import csv
import json
import os
import re
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd

import dspy
from dspy.retrieve.qdrant_rm import QdrantRM
from qdrant_client import QdrantClient

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sample chest radiology reports used as classifier inputs (15 entries, dates
# masked as "XXXX"). Each report is one multi-line string with Exam / Date /
# Indication / Findings / IMPRESSION fields.
# Order matters: `ground_truth` is index-aligned with this list.
reports = [
"""RADIOLOGY REPORT
Exam: Chest X-ray (2 views)
Date: XXXX, XXXX
Indication: Fever, cough
Findings: There is a focal area of consolidation in the right lower lobe. No pleural effusion or pneumothorax. The heart size is normal.
IMPRESSION: Right lower lobe pneumonia.
""",

"""RADIOLOGY REPORT
Exam: Chest radiograph
Date: XXXX, XXXX
Indication: Dyspnea
Findings: Bilateral interstitial opacities consistent with pulmonary edema. The cardiac silhouette is enlarged. No pneumothorax.
IMPRESSION: Cardiomegaly with pulmonary edema.
""",

"""RADIOLOGY REPORT
Exam: Chest X-ray
Date: XXXX, XXXX
Indication: Chest pain
Findings: No acute cardiopulmonary abnormality. Lungs are clear. Heart size and mediastinal contours are within normal limits.
IMPRESSION: Normal chest X-ray.
""",

"""RADIOLOGY REPORT
Exam: Chest radiograph
Date: XXXX, XXXX
Indication: Trauma
Findings: Large left pneumothorax with partial lung collapse. No rib fractures visualized.
IMPRESSION: Left pneumothorax, likely tension pneumothorax given the degree of lung collapse.
""",

"""RADIOLOGY REPORT
Exam: Chest X-ray
Date: XXXX, XXXX
Indication: Post-operative evaluation
Findings: Small bilateral pleural effusions, left greater than right. Mild pulmonary edema. Heart size is at the upper limits of normal.
IMPRESSION: Bilateral pleural effusions with mild pulmonary edema.
""",

"""RADIOLOGY REPORT
Exam: Portable chest X-ray
Date: XXXX, XXXX
Indication: Follow-up pneumonia
Findings: Persistent right lower lobe consolidation, improved from prior. No pneumothorax or pleural effusion.
IMPRESSION: Improving right lower lobe pneumonia.
""",

"""RADIOLOGY REPORT
Exam: Chest radiograph
Date: XXXX, XXXX
Indication: Shortness of breath
Findings: Diffuse bilateral airspace opacities consistent with pulmonary edema. The cardiac silhouette is markedly enlarged.
IMPRESSION: Severe cardiomegaly with pulmonary edema.
""",

"""RADIOLOGY REPORT
Exam: Chest X-ray
Date: XXXX, XXXX
Indication: COPD exacerbation
Findings: Hyperinflated lung fields. No focal consolidation, pneumothorax, or pleural effusion. Heart size is normal.
IMPRESSION: Changes consistent with COPD. No acute cardiopulmonary process.
""",

"""RADIOLOGY REPORT
Exam: Chest radiograph
Date: XXXX, XXXX
Indication: Fever, productive cough
Findings: Patchy consolidation in the right upper lobe. Small right pleural effusion. Heart size is normal.
IMPRESSION: Right upper lobe pneumonia with parapneumonic effusion.
""",

"""RADIOLOGY REPORT
Exam: Portable chest X-ray
Date: XXXX, XXXX
Indication: Assess ET tube placement
Findings: ET tube is in proper position. Diffuse bilateral opacities consistent with ARDS. No pneumothorax.
IMPRESSION: ARDS pattern. Properly positioned ET tube.
""",

"""RADIOLOGY REPORT
Exam: Chest X-ray
Date: XXXX, XXXX
Indication: Annual check-up
Findings: Clear lung fields. Normal heart size and mediastinal contours. No pleural effusion or pneumothorax.
IMPRESSION: Normal chest X-ray.
""",

"""RADIOLOGY REPORT
Exam: Chest radiograph
Date: XXXX, XXXX
Indication: Chest pain, shortness of breath
Findings: Large left pleural effusion with associated atelectasis. The heart is shifted to the right. No pneumothorax.
IMPRESSION: Large left pleural effusion with mediastinal shift.
""",

"""RADIOLOGY REPORT
Exam: Chest X-ray
Date: XXXX, XXXX
Indication: Follow-up cardiomegaly
Findings: Persistent cardiomegaly with no change from prior studies. No pulmonary edema, pleural effusion, or pneumothorax.
IMPRESSION: Stable cardiomegaly without acute cardiopulmonary process.
""",

"""RADIOLOGY REPORT
Exam: Portable chest radiograph
Date: XXXX, XXXX
Indication: Evaluate lung collapse
Findings: Complete opacification of the left hemithorax with rightward mediastinal shift, consistent with total left lung collapse. Small left pneumothorax noted.
IMPRESSION: Left lung collapse with small pneumothorax.
""",

"""RADIOLOGY REPORT
Exam: Chest X-ray
Date: XXXX, XXXX
Indication: Cough, fever
Findings: Patchy bilateral consolidations, more prominent in the lower lobes. No pleural effusion or pneumothorax. Heart size is normal.
IMPRESSION: Bilateral pneumonia, likely viral.
"""
]

In [5]:
# Expected labels for each report, index-aligned with `reports`
# (entry i annotates reports[i]). An empty list means none of the
# candidate abnormalities is present.
ground_truth = [
    ["consolidation"],
    ["pulmonary edema", "cardiomegaly"],
    [],
    ["pneumothorax"],
    ["pleural effusion", "pulmonary edema"],
    ["consolidation"],
    ["pulmonary edema", "cardiomegaly"],
    [],
    ["consolidation", "pleural effusion"],
    [],
    [],
    ["pleural effusion"],
    ["cardiomegaly"],
    ["pneumothorax"],
    ["consolidation"],
]

# Candidate abnormality labels the classifier chooses from.
classes = ["pulmonary edema", "consolidation", "pleural effusion", "pneumothorax", "cardiomegaly"]

# Fail fast on a misspelled annotation: a label that is not in `classes` can
# never be predicted, so exact-match accuracy for that report would silently be 0.
_unknown_labels = {label for labels in ground_truth for label in labels} - set(classes)
assert not _unknown_labels, f"ground_truth uses labels missing from classes: {sorted(_unknown_labels)}"

In [6]:
# DSPy signature for multi-label classification of a single radiology report.
# NOTE: the docstring below is not just documentation; DSPy uses a signature's
# docstring as the task instructions sent to the LM, so edit it as a prompt.
# NOTE(review): "Classify the radiology into ..." probably means "radiology
# report"; left unchanged here because altering the text alters the prompt.
class ClassifyText(dspy.Signature):
    """Classify the radiology into multiple labels from the given candidates. You should return the 
    extracted information as a single JSON string with a key for each candidate label and a value of
    1 if the report indicates the presence of the abnormality and 0 otherwise. There should be no 
    text or explanation, only the JSON. For example if there 
    were 3 candidates you could have the following output:

    {
        "label_1": 1,
        "label_2": 0,
        "label_3": 1
    }"""
    # Input: the report text to classify.
    text = dspy.InputField()
    # Input: candidate labels; RAGMultiLabelClassifier.forward passes them as one
    # comma-joined string.
    label_candidates = dspy.InputField(desc="List of candidate labels for the text")
    # Output: JSON text mapping each candidate label to 1 or 0; downstream code
    # parses it with json.loads.
    rad_labels = dspy.OutputField(desc="Dictionary of candidate labels, 1 or 0, for the text")

class RAGMultiLabelClassifier(dspy.Module):
    """Two-step classifier: retrieve candidate labels, then let the LM mark each one.

    The retriever supplies ``num_candidates`` passages for the report text; these
    are joined into a comma-separated string and handed to the ``ClassifyText``
    predictor together with the report.
    """

    def __init__(self, num_candidates=3):
        """Create the retriever (top ``num_candidates`` hits) and the predictor."""
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_candidates)
        self.classify = dspy.Predict(ClassifyText)

    def forward(self, text):
        """Classify ``text`` and return the predictor's raw ``rad_labels`` output."""
        passages = self.retrieve(text).passages
        candidate_labels = ','.join(passages)
        prediction = self.classify(text=text, label_candidates=candidate_labels)
        return prediction.rad_labels
    
def build_retriever_client(labels: List[str], collection_name: str, k: int, vectorizer: str = None) -> QdrantRM:
    """Index ``labels`` in an in-memory Qdrant collection and wrap it for DSPy.

    Args:
        labels: Documents to store; each gets its list position as its id.
        collection_name: Name of the Qdrant collection to add the documents to.
        k: Passed through to ``QdrantRM`` as its ``k`` argument.
        vectorizer: Optional model name; when truthy it is applied to the
            client with ``set_model`` before the documents are added.

    Returns:
        A ``QdrantRM`` bound to the populated collection.
    """
    qdrant = QdrantClient(":memory:")

    if vectorizer:
        qdrant.set_model(vectorizer)

    qdrant.add(
        collection_name=collection_name,
        documents=labels,
        ids=[position for position, _ in enumerate(labels)],
    )
    return QdrantRM(collection_name, qdrant, k=k)

In [7]:
# Matches a Markdown code fence, optionally tagged "json" in any letter case,
# and captures its contents without the surrounding whitespace.
_CODE_FENCE_RE = re.compile(r'```(?:json)?\s*(.*?)\s*```', re.DOTALL | re.IGNORECASE)


def clean_json_string(json_str: str) -> str:
    """Strip a Markdown code fence from around a JSON payload.

    Handles fences tagged ``json`` in any case, untagged fences, and prose
    before or after the fence; in those cases only the fenced content is
    returned. Input without a complete fence falls back to removing the
    literal opening and closing markers, so unfenced text is returned unchanged.

    Args:
        json_str: Raw model output that may wrap JSON in a code fence.

    Returns:
        The text with fence markers removed.
    """
    fenced = _CODE_FENCE_RE.search(json_str)
    if fenced:
        return fenced.group(1)
    # No complete fence, e.g. truncated output with only an opening marker:
    # drop whichever literal marker is present.
    return json_str.replace('```json\n', '').replace('\n```', '')

def parse_ollama_output(output_str: str, clean_values: bool = True) -> List[str]:
    """Return the labels whose value is 1 in the model's JSON output.

    Args:
        output_str: JSON object text mapping label names to 1 or 0.
        clean_values: When true, pass the text through ``clean_json_string``
            first to drop code-fence markers.

    Returns:
        Label names with a value equal to 1, in the order they appear.
    """
    raw_json = clean_json_string(output_str) if clean_values else output_str
    label_flags = json.loads(raw_json)

    positives = []
    for label, flag in label_flags.items():
        if flag == 1:
            positives.append(label)
    return positives

In [14]:
# Model name handed to dspy.OllamaLocal as `model`.
ollama_model = 'gemma2'
# Model name handed to build_retriever_client as `vectorizer`.
vectorizer = "intfloat/multilingual-e5-large"

In [15]:
# Index the candidate labels and build the retriever that returns the top 3 of them.
retriever_model = build_retriever_client(
    labels=classes,
    collection_name="rad",
    k=3,
    vectorizer=vectorizer,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fetching 6 files: 100%|█████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 77912.77it/s]


In [16]:
# Build the dspy.OllamaLocal client used as the language model.
# NOTE(review): `ollama_model` is read here as the model-name string and then
# rebound to the client object. Re-running this cell without first re-running
# the cell that sets the name would pass the client object as `model`; a
# separate variable name for the client would make the cell re-runnable.
# NOTE(review): temperature=0 and format='json' presumably aim for repeatable,
# JSON-only output; confirm against the OllamaLocal documentation.
ollama_model = dspy.OllamaLocal(
    model=ollama_model, 
    model_type='text',
    max_tokens=512,
    temperature=0,
    top_p=1,
    frequency_penalty=0,
    top_k=5,
    format='json'
)

In [20]:
# Register the LM client and the retriever with DSPy's settings.
dspy.settings.configure(lm=ollama_model, rm=retriever_model)
# num_candidates=3 equals the k=3 used when the retriever was built; presumably
# the two are meant to stay in sync.
classifier = RAGMultiLabelClassifier(num_candidates=3)

In [26]:
# Show the report used for the spot check below (index 1).
reports[1]

'RADIOLOGY REPORT\nExam: Chest radiograph\nDate: XXXX, XXXX\nIndication: Dyspnea\nFindings: Bilateral interstitial opacities consistent with pulmonary edema. The cardiac silhouette is enlarged. No pneumothorax.\nIMPRESSION: Cardiomegaly with pulmonary edema.\n'

In [29]:
# Expected labels for the same report.
ground_truth[1]

['pulmonary edema', 'cardiomegaly']

In [28]:
# Run the classifier on that report; forward() returns the raw `rad_labels` output.
result_str = classifier(text=reports[1])

In [30]:
# Inspect the raw model output before parsing.
result_str

'{\n"cardiomegaly": 1,\n"pleural effusion": 0,\n"pulmonary edema": 1\n}'

In [31]:
# Parse the JSON output into the list of labels whose value is 1.
predicted_classes = parse_ollama_output(result_str)

In [32]:
# Compare by eye with ground_truth[1]; label order may differ.
predicted_classes

['cardiomegaly', 'pulmonary edema']

## DSPy Optimizer

In [None]:
def accuracy(pred, gold):
    """Exact-match score for one example.

    Args:
        pred: Iterable of predicted labels.
        gold: Iterable of expected labels.

    Returns:
        1 if both contain the same set of labels (order and duplicates
        ignored), otherwise 0.
    """
    return int(set(pred) == set(gold))


def metric(example, pred, trace=None):
    """DSPy-style metric: exact-match of predicted labels against the example.

    DSPy optimizers call a metric as ``metric(example, pred, trace)``, so this
    is a plain function with that signature.

    Args:
        example: Mapping-like item with a ``'ground_truth'`` entry.
        pred: The program's output: the raw JSON string, an object with a
            ``rad_labels`` attribute, or an already-parsed list of labels.
        trace: Accepted for compatibility with the optimizer call; unused.

    Returns:
        True when the predicted label set equals the expected one.
    """
    raw_output = getattr(pred, 'rad_labels', pred)
    if isinstance(raw_output, str):
        try:
            predicted_labels = parse_ollama_output(raw_output)
        except (ValueError, AttributeError):
            # Output that is not a JSON object cannot match any annotation,
            # so score it as wrong instead of aborting the optimisation run.
            return False
    else:
        predicted_labels = raw_output
    return bool(accuracy(predicted_labels, example['ground_truth']))

In [None]:
# DSPy optimizers take the program to optimise as the `student` argument of
# compile(); the Teleprompter base class is not a wrapper around a module.
# The name `teleprompter` is kept so cells that reference it keep working,
# but it now refers to the program itself.
teleprompter = classifier

In [None]:
def evaluate_model(model, dataset):
    """Mean exact-match accuracy of ``model`` over ``dataset``.

    Args:
        model: Callable invoked as ``model(text=...)`` that returns the JSON
            output string understood by ``parse_ollama_output``.
        dataset: Sized collection of items with ``'text'`` and
            ``'ground_truth'`` entries.

    Returns:
        Fraction of examples whose predicted label set equals the expected
        one, or 0.0 for an empty dataset.
    """
    if not dataset:
        # Avoid ZeroDivisionError when there is nothing to evaluate.
        return 0.0

    correct = sum(
        accuracy(parse_ollama_output(model(text=example['text'])), example['ground_truth'])
        for example in dataset
    )
    return correct / len(dataset)

In [None]:
# DSPy optimizers expect dspy.Example items with the input fields marked via
# with_inputs(); plain dicts do not provide that. dspy.Example still supports
# example['text'] / example['ground_truth'], so evaluate_model is unaffected.
dataset = [
    dspy.Example(text=reports[idx], ground_truth=ground_truth[idx]).with_inputs('text')
    for idx in range(2)  # widen the range to add more examples as needed
]

In [None]:
# Optimizers are imported from dspy.teleprompt.
from dspy.teleprompt import BootstrapFewShot

# Optimizers need dspy.Example items with the input field marked. Building the
# list here works whether `dataset` holds plain dicts or dspy.Example objects.
trainset = [
    dspy.Example(text=example['text'], ground_truth=example['ground_truth']).with_inputs('text')
    for example in dataset
]

# Evaluate before compilation
print("Accuracy before compilation:", evaluate_model(classifier, dataset))

# Bootstrap Few-Shot optimizer
# BootstrapFewShot has no num_samples/num_trials arguments; the closest
# settings are max_bootstrapped_demos (demos to collect) and max_rounds.
bootstrap_optimizer = BootstrapFewShot(metric=metric, max_bootstrapped_demos=5, max_rounds=2)
# compile() takes the program itself as the student and returns an optimised copy.
compiled_bootstrap = bootstrap_optimizer.compile(classifier, trainset=trainset)

print("Accuracy after BootstrapFewShot:", evaluate_model(compiled_bootstrap, dataset))

# Bayesian Signature Optimizer
# NOTE(review): later DSPy releases deprecate this name in favour of MIPRO;
# confirm against the installed version.
from dspy.teleprompt import BayesianSignatureOptimizer

# n = number of candidate instructions; optuna_trials_num = search trials.
bayesian_optimizer = BayesianSignatureOptimizer(metric=metric, n=5)
compiled_bayesian = bayesian_optimizer.compile(
    classifier,
    devset=trainset,
    optuna_trials_num=2,
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
    eval_kwargs=dict(num_threads=1, display_progress=False),
)

print("Accuracy after BayesianSignatureOptimizer:", evaluate_model(compiled_bayesian, dataset))