In [1]:
import pandas as pd

df = pd.read_csv('data/patient-presentations-combined.csv')

df.head()

Unnamed: 0,Patient Presentation code,Patient Presentation name,combined_contexts
0,PRES001,Abdominal mass,Primary care context\nA palpable abdominal mas...
1,PRES002,Abdominal pain,Primary care context\nIdentifying an underlyin...
2,PRES003,Abdominal swelling,Primary care context\nPatients may describe a ...
3,PRES004,Abnormal development/developmental delay,Primary care context\nWhilst the primary care ...
4,PRES005,Abnormal eating or exercise behaviour,Primary care context\nGeneral practice is ofte...


In [2]:
# get first 10 rows
df_reduced = df.head(10)

In [2]:
care_contexts = [
    'Primary care context',
    'Secondary care context', 
    'Paediatric care context',
    'Geriatric care context',
]

In [3]:
# Add new column for each care context and populate with an empty string
for context in care_contexts:
    df[context] = ''



In [4]:
df['Flagged'] = ''

In [5]:
df.head()

Unnamed: 0,Patient Presentation code,Patient Presentation name,combined_contexts,Primary care context,Secondary care context,Paediatric care context,Geriatric care context,Flagged
0,PRES001,Abdominal mass,Primary care context\nA palpable abdominal mas...,,,,,
1,PRES002,Abdominal pain,Primary care context\nIdentifying an underlyin...,,,,,
2,PRES003,Abdominal swelling,Primary care context\nPatients may describe a ...,,,,,
3,PRES004,Abnormal development/developmental delay,Primary care context\nWhilst the primary care ...,,,,,
4,PRES005,Abnormal eating or exercise behaviour,Primary care context\nGeneral practice is ofte...,,,,,


In [6]:
df_reduced = df.head(10)

In [7]:
import re

def extract_care_contexts(text):
    """Extract care contexts from combined text and return as dictionary."""
    
    # Initialize all contexts as empty strings
    contexts = {
        'primary': '',
        'secondary': '',
        'paediatric': '',
        'geriatric': ''
    }
    
    # Find all context sections using regex with lookahead
    patterns = {
        'primary': r'Primary care context\n(.*?)(?=\n(?:Secondary|Paediatric|Geriatric) care context|\Z)',
        'secondary': r'Secondary care context\n(.*?)(?=\n(?:Primary|Paediatric|Geriatric) care context|\Z)',
        'paediatric': r'Paediatric care context\n(.*?)(?=\n(?:Primary|Secondary|Geriatric) care context|\Z)',
        'geriatric': r'Geriatric care context\n(.*?)(?=\n(?:Primary|Secondary|Paediatric) care context|\Z)'
    }
    
    for context_key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        if match:
            contexts[context_key] = match.group(1).strip()
    
    return contexts

In [7]:
example = extract_care_contexts(df['combined_contexts'][0])

In [8]:
from openai import OpenAI
from utils import TemplateManager

from dotenv import load_dotenv
load_dotenv()

import os
api_key = os.getenv('OPENAI_API_KEY')

In [9]:
import asyncio
from openai import AsyncOpenAI

class Reformatron:
    def __init__(self, api_key):
        self.client = AsyncOpenAI(api_key=api_key)
        self.prompt_manager = TemplateManager('./prompts/')
        self.user_prompt = self.prompt_manager.get_template('input-prompt.jinja')
        self.system_prompt = self.prompt_manager.get_template('system-prompt.jinja')

    async def reformat(self, args):
        """Reformat the care contexts using OpenAI API."""
        
        response = await self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": self.system_prompt.render()},
                {"role": "user", "content": self.user_prompt.render(**args)}
            ],
            max_tokens=2048,
            temperature=0.0,
        )
        
        return response.choices[0].message.content

In [10]:
reformatron = Reformatron(api_key)
output = reformatron.reformat(example)

# Evaluation

In [10]:
from difflib import SequenceMatcher

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'(\d)-([a-z])', r'\1§\2', text)
    text = re.sub(r'[^a-z0-9\s§]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('§', '-')
    text = text.strip()
    
    return text

def evaluate(original, reformatted):
    cleaned_original = clean_text(original)
    cleaned_output = clean_text(reformatted)

    original_words = cleaned_original.split()
    output_words = cleaned_output.split()

    word_ratio = SequenceMatcher(None, original_words, output_words).ratio()
    character_ratio = SequenceMatcher(None, cleaned_original, cleaned_output).ratio()

    return {
        'word_ratio': word_ratio,
        'character_ratio': character_ratio,
    }

In [12]:
evaluate(df['combined_contexts'][0], output)

{'word_ratio': 0.980225988700565, 'character_ratio': 0.3238216021982668}

In [13]:
print(output)

# Primary care context
A palpable abdominal mass may be apparent to a patient/parent or identified by the GP when a patient presents with other symptoms such as a change in bowel habit, urinary symptoms, weight loss, vomiting or pain. 

Think about the symptoms you would ask about, and the key features that are of importance in different groups such as:

- children
- patients who ovulate
- patients who have undergone surgery 
- patients with underlying diagnoses such as inflammatory bowel disease or prostatism.

Practice abdominal examination where you can, learning to identify and characterize the location, extent and consistency of abdominal masses including organomegaly. 

Speak to your GP tutor about what investigations you might utilise including:

- bloods
- FIT samples
- urinalysis
- imaging.

Review the 2WW criteria for referral directly to specialities such as Gynaecology, Gastroenterology, Urology and USS.

# Secondary care context
Presentations of abdominal masses in seconda

In [126]:
evaluate(df['combined_contexts'][1], output)

{'word_ratio': 0.9938837920489296, 'character_ratio': 0.993006993006993}

In [11]:

def add_entry(df, entry, index, evaluation_score, threshold=0.9):
    # get content
    parsed_content = {}
    current_heading_key = None
    current_lines = []

    for line in entry.splitlines():
        # Check if the line is a markdown header (e.g., "# Header Name")
        # This regex matches lines starting with one or more '#' followed by a space
        match = re.match(r'^(#+)\s+(.*)', line)
        if match:
            # If we were accumulating content for a previous header, store it
            if current_heading_key:
                parsed_content[current_heading_key] = "\n".join(current_lines).strip()

            # Set the new header key (text after '# ' and stripped)
            current_heading_key = match.group(2).strip()
            current_lines = []  # Reset lines for the new section
        elif current_heading_key:
            # If it's not a header line and we are under a heading, accumulate the line
            current_lines.append(line)

    # Store content for the last section after the loop finishes
    if current_heading_key:
        parsed_content[current_heading_key] = "\n".join(current_lines).strip()

    # Populate df
    for heading, content_text in parsed_content.items():
        if heading in df.columns:
            df.loc[index, heading] = content_text
        else:
            print(f"Warning: Header '{heading}' from text not found in DataFrame columns.")

    if evaluation_score['word_ratio'] >= threshold:
        df.loc[index, 'Flagged'] = 'No'
    else:
        print(f"Entry at index {index} did not meet the threshold. Word ratio: {evaluation_score['word_ratio']}")
        df.loc[index, 'Flagged'] = 'Yes'

In [12]:
async def process_single_entry(df, reformatron, index, row):
    """Process a single entry asynchronously."""
    combined_contexts = row['combined_contexts']
    if pd.isna(combined_contexts) or not combined_contexts.strip():
        return
    
    # Extract care contexts
    contexts = extract_care_contexts(combined_contexts)
    
    # Reformat using OpenAI API
    reformatted_text = await reformatron.reformat(contexts)

    evaluation_score = evaluate(combined_contexts, reformatted_text)
    
    # Add entry to DataFrame
    add_entry(df, reformatted_text, index, evaluation_score)

async def async_pipeline(df, reformatron, max_concurrent=10):
    """
    Process DataFrame entries asynchronously with controlled concurrency.
    
    Args:
        df: DataFrame to process
        reformatron: Reformatron instance
        max_concurrent: Maximum number of concurrent API calls (default: 10)
    """
    semaphore = asyncio.Semaphore(max_concurrent)
    
    async def process_with_semaphore(index, row):
        async with semaphore:
            await process_single_entry(df, reformatron, index, row)
    
    # Create tasks for all entries
    tasks = [
        process_with_semaphore(index, row) 
        for index, row in df.iterrows()
    ]
    
    # Execute all tasks concurrently
    await asyncio.gather(*tasks)
    
    return df



In [18]:

reformatron = Reformatron(api_key=api_key)

result_df = await async_pipeline(df, reformatron, max_concurrent=200)
    



  def __init__(self, callback, args, loop, context=None):


Entry at index 39 did not meet the threshold. Word ratio: 0.7894201424211598
Entry at index 51 did not meet the threshold. Word ratio: 0.8773006134969326


In [19]:
result_df.head()

Unnamed: 0,Patient Presentation code,Patient Presentation name,combined_contexts,Primary care context,Secondary care context,Paediatric care context,Geriatric care context,Flagged
0,PRES001,Abdominal mass,Primary care context\nA palpable abdominal mas...,A palpable abdominal mass may be apparent to a...,Presentations of abdominal masses in secondary...,,Older patients with abdominal masses are commo...,No
1,PRES002,Abdominal pain,Primary care context\nIdentifying an underlyin...,Identifying an underlying cause in patients wh...,Abdominal pain is one of the most common prese...,Abdominal pain is one of the most frequent com...,"Again, the usual differentials apply, but comm...",No
2,PRES003,Abdominal swelling,Primary care context\nPatients may describe a ...,Patients may describe a sensation of swelling ...,Abdominal swelling is a common and potentially...,,"AAA; ascites; malignancy, heart failure;",No
3,PRES004,Abnormal development/developmental delay,Primary care context\nWhilst the primary care ...,Whilst the primary care team are involved in c...,'Childhood Development’ is usually defined as ...,It is important to understand the normal patte...,,No
4,PRES005,Abnormal eating or exercise behaviour,Primary care context\nGeneral practice is ofte...,General practice is often the first place that...,Eating disorders are common in children and yo...,,"In older people, consider the following differ...",No


In [19]:
def pipeline(df, reformatron):
    for index, row in df.iterrows():
        combined_contexts = row['combined_contexts']
        if pd.isna(combined_contexts) or not combined_contexts.strip():
            continue
        
        # Extract care contexts
        contexts = extract_care_contexts(combined_contexts)
        
        # Reformat using OpenAI API
        reformatted_text = reformatron.reformat(contexts)

        evaluation_score = evaluate(combined_contexts, reformatted_text)
        
        # Add entry to DataFrame
        add_entry(df, reformatted_text, index, evaluation_score)
    
    return df

In [16]:
print(df['Primary care context'][0])

A palpable abdominal mass may be apparent to a patient/parent or identified by the GP when a patient presents with other symptoms such as a change in bowel habit, urinary symptoms, weight loss, vomiting or pain. 

Think about the symptoms you would ask about, and the key features that are of importance in different groups such as:

- children
- patients who ovulate
- patients who have undergone surgery 
- patients with underlying diagnoses such as inflammatory bowel disease or prostatism.

Practice abdominal examination where you can, learning to identify and characterize the location, extent and consistency of abdominal masses including organomegaly. 

Speak to your GP tutor about what investigations you might utilise including:

- bloods
- FIT samples
- urinalysis
- imaging.

Review the 2WW criteria for referral directly to specialities such as Gynaecology, Gastroenterology, Urology and USS.


# Visualize potential differences

In [83]:
from rouge_score import rouge_scorer
import html
from IPython.display import HTML
from typing import List, Tuple, Set

def find_ngram_positions(text: str, n: int) -> List[Tuple[str, int]]:
    """Find all n-grams and their starting positions in the text."""
    words = text.split()
    ngrams = []
    for i in range(len(words) - n + 1):
        ngram = " ".join(words[i:i + n])
        ngrams.append((ngram, i))
    return ngrams

def get_matching_ngrams(reference: str, candidate: str, n: int) -> Set[str]:
    """Find all matching n-grams between reference and candidate texts."""
    ref_ngrams = set(ngram for ngram, _ in find_ngram_positions(reference, n))
    cand_ngrams = set(ngram for ngram, _ in find_ngram_positions(candidate, n))
    return ref_ngrams.intersection(cand_ngrams)

def highlight_matches(text: str, matches: Set[str], n: int) -> str:
    """Highlight matching n-grams in the text using HTML spans."""
    words = text.split()
    highlighted = words.copy()
    
    # Find positions of matching n-grams
    ngram_positions = find_ngram_positions(text, n)
    matching_positions = []
    
    for ngram, pos in ngram_positions:
        if ngram in matches:
            matching_positions.extend(range(pos, pos + n))
    
    # Apply highlighting
    for i in range(len(words)):
        if i in matching_positions:
            highlighted[i] = f'<span style="background-color: yellow">{html.escape(words[i])}</span>'
        else:
            highlighted[i] = html.escape(words[i])
    
    return " ".join(highlighted)

def visualize_rouge_matches(reference: str, candidate: str, n: int = 1):
    """
    Visualize matching n-grams between reference and candidate texts.
    
    Args:
        reference: Reference text
        candidate: Candidate text to compare against reference
        n: Size of n-grams to compare (default: 1 for unigrams)
    
    Returns:
        IPython.display.HTML object with highlighted matching sequences
    """
    # Find matching n-grams
    matches = get_matching_ngrams(reference, candidate, n)
    
    # Highlight matches in both texts
    highlighted_ref = highlight_matches(reference, matches, n)
    highlighted_cand = highlight_matches(candidate, matches, n)
    
    # Calculate Rouge score
    scorer = rouge_scorer.RougeScorer([f'rouge{n}'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    rouge_score = scores[f'rouge{n}'].fmeasure
    
    # Create HTML output
    html_output = f"""
    <div style="font-family: monospace; white-space: pre-wrap;">
        <h3>Rouge-{n} Score: {rouge_score:.3f}</h3>
        <div style="margin: 10px 0;">
            <strong>Reference:</strong><br>
            {highlighted_ref}
        </div>
        <div style="margin: 10px 0;">
            <strong>Candidate:</strong><br>
            {highlighted_cand}
        </div>
    </div>
    """
    
    return HTML(html_output)

In [123]:
display(visualize_rouge_matches(cleaned_output, cleaned_original, n=3))

In [90]:
similarity

0.8450644683999154

In [91]:
def word_level_similarity(text1, text2):
    """More forgiving of spelling corrections."""
    words1 = text1.split()
    words2 = text2.split()
    return SequenceMatcher(None, words1, words2).ratio()

word_level_similarity(cleaned_output, cleaned_original)

0.980225988700565

In [93]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def bleu_similarity(original, reformatted):
    """BLEU score for content preservation."""
    reference = [original.split()]
    candidate = reformatted.split()
    
    # Use smoothing to handle short texts better
    smoothie = SmoothingFunction().method1
    return sentence_bleu(reference, candidate, smoothing_function=smoothie)

bleu_score = bleu_similarity(cleaned_original, cleaned_output)
print(f"BLEU score: {bleu_score:.4f}")

BLEU score: 0.9464


In [None]:
from difflib import SequenceMatcher

text1 = cleaned_original.split()
text2 = cleaned_output.split()

ratio = SequenceMatcher(None, text1, text2).ratio()
print(f"Similarity ratio: {ratio}")

Similarity ratio: 0.980225988700565


In [None]:
if len(text1) == len(text2):
    for i in range(len(text2)):
        if text1[i] != text2[i]:
            print(f"Word at index {i} changed: '{text1[i]}' -> '{text2[i]}'")
else:
    print("Word lists have different lengths, direct comparison might be misleading.")
    # Consider using difflib in this case (see below)

Word at index 69 changed: 'or' -> 'patients'
Word at index 70 changed: 'have' -> 'with'
Word at index 117 changed: 'and' -> 'imaging'
Word at index 118 changed: 'imaging' -> 'review'
Word at index 119 changed: 'review' -> 'the'
Word at index 120 changed: 'the' -> '2ww'
Word at index 121 changed: '2ww' -> 'criteria'
Word at index 122 changed: 'criteria' -> 'for'
Word at index 123 changed: 'for' -> 'referral'
Word at index 124 changed: 'referral' -> 'directly'
Word at index 125 changed: 'directly' -> 'to'
Word at index 126 changed: 'to' -> 'specialities'
Word at index 127 changed: 'specialities' -> 'such'
Word at index 128 changed: 'such' -> 'as'
Word at index 129 changed: 'as' -> 'gynaecology'
Word at index 130 changed: 'gynaecology' -> 'gastroenterology'
Word at index 131 changed: 'gastroenterology' -> 'urology'
Word at index 132 changed: 'urology' -> 'and'
Word at index 133 changed: 'and' -> 'uss'
Word at index 134 changed: 'uss' -> 'secondary'
Word at index 135 changed: 'secondary' -