<a href="https://colab.research.google.com/github/profitter261/Healthcare-AI-ML-App/blob/main/Report_analysis_clinicalbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================================
# INSTALLATION (Run these first)
# ============================================================================

!pip install transformers
!pip install nltk
!pip install rouge-score
!pip install bert-score
!pip install sacrebleu
!pip install torch
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# ============================================================================
# IMPORTS
# ============================================================================
import pandas as pd
import numpy as np
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

# NLP and Metrics
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import sacrebleu


# Transformers for models
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BartForConditionalGeneration,
    pipeline
)

# Fetch the NLTK resources (tokenizer models and stopword list) quietly,
# one download call per resource.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)


print("✓ All libraries imported successfully!")

✓ All libraries imported successfully!


In [None]:
# ============================================================================
# SAMPLE CLINICAL NOTES DATA
# ============================================================================

# Sample clinical notes with reference summaries.
# Each entry is a dict with two string keys:
#   "note"      - the full discharge-style note fed to every summarizer
#   "reference" - the target summary the generated text is scored against
sample_data = [
    {
        "note": """Patient: John Doe, Age: 65, Male
Admission Date: 01/15/2024, Discharge Date: 01/20/2024
Chief Complaint: Chest pain and shortness of breath
Diagnosis: Acute Myocardial Infarction (AMI), Coronary Artery Disease
Hospital Course: Patient presented to ED with severe chest pain radiating to left arm.
ECG showed ST-segment elevation. Patient underwent emergency cardiac catheterization with
placement of drug-eluting stent in LAD. Post-procedure recovery was uneventful.
Medications: Aspirin 81mg daily, Clopidogrel 75mg daily, Atorvastatin 80mg daily,
Metoprolol 50mg BID, Lisinopril 10mg daily
Follow-up: Cardiology clinic in 2 weeks, repeat echocardiogram in 1 month
Discharge Instructions: Strict cardiac diet, cardiac rehabilitation referral,
avoid strenuous activity for 4 weeks.""",
        "reference": "65-year-old male admitted with acute MI, treated with emergency cardiac catheterization and stent placement in LAD. Discharged on aspirin, clopidogrel, statin, beta-blocker, and ACE inhibitor. Follow-up with cardiology in 2 weeks."
    },
    {
        "note": """Patient: Jane Smith, Age: 45, Female
Admission Date: 02/10/2024, Discharge Date: 02/14/2024
Chief Complaint: Severe abdominal pain
Diagnosis: Acute Appendicitis
Hospital Course: Patient presented with RLQ pain, fever, and elevated WBC.
CT scan confirmed acute appendicitis. Laparoscopic appendectomy performed on 02/11/2024
without complications. Patient tolerated regular diet and ambulating well.
Medications: Ciprofloxacin 500mg BID for 7 days, Acetaminophen PRN for pain
Follow-up: Surgical clinic in 10 days for wound check
Discharge Instructions: Rest, avoid heavy lifting for 2 weeks, wound care instructions provided.""",
        "reference": "45-year-old female with acute appendicitis successfully treated with laparoscopic appendectomy. Discharged on antibiotics with surgical follow-up in 10 days. Advised to avoid heavy lifting."
    },
    {
        "note": """Patient: Robert Johnson, Age: 72, Male
Admission Date: 03/05/2024, Discharge Date: 03/12/2024
Chief Complaint: Weakness and confusion
Diagnosis: Pneumonia, Sepsis, Type 2 Diabetes Mellitus
Hospital Course: Patient admitted with fever, productive cough, and altered mental status.
Chest X-ray showed right lower lobe infiltrate. Blood cultures positive for Streptococcus pneumoniae.
Treated with IV antibiotics and fluid resuscitation. Blood glucose control optimized with insulin.
Medications: Levofloxacin 750mg daily (5 more days), Metformin 1000mg BID, Insulin glargine 20 units at bedtime
Follow-up: Primary care in 1 week, repeat chest X-ray in 4 weeks
Discharge Instructions: Complete antibiotic course, monitor blood glucose, increased fluid intake.""",
        "reference": "72-year-old male with pneumonia and sepsis, treated with IV antibiotics. Diabetes management optimized. Discharged on oral antibiotics with PCP follow-up in 1 week and repeat imaging in 4 weeks."
    }
]

# Report how many note/reference pairs were defined above.
print(f"✓ Loaded {len(sample_data)} sample clinical notes")

✓ Loaded 3 sample clinical notes


In [None]:
# ============================================================================
# MODEL IMPLEMENTATIONS
# ============================================================================

class ClinicalSummarizer:
    """Base class for clinical note summarization models.

    Subclasses pass a human-readable ``model_name`` to ``__init__`` and
    override :meth:`summarize`.
    """

    def __init__(self, model_name: str):
        # Display name of the concrete model.
        self.model_name = model_name

    def summarize(self, text: str) -> str:
        """Return a summary of ``text``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in the base class. The message names
                the concrete class so that callers which log ``str(exc)`` get
                a useful line instead of an empty string.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement summarize()"
        )

class ExtractiveModel(ClinicalSummarizer):
    """Extractive summarization using keyword-based sentence scoring.

    Every sentence returned by ``nltk.sent_tokenize`` is scored by how many
    of the configured keywords occur in it (case-insensitive substring
    match). The highest-scoring sentences are joined with single spaces,
    best first; sentences with equal scores keep their document order.
    """

    DEFAULT_KEYWORDS = ('diagnosis', 'treatment', 'medication', 'discharge',
                        'patient', 'condition', 'procedure', 'follow-up',
                        'admitted', 'presented')

    def __init__(self, num_sentences: int = 3, keywords=None):
        """
        Args:
            num_sentences: maximum number of sentences kept in the summary
                (defaults to 3).
            keywords: optional iterable of keywords replacing
                ``DEFAULT_KEYWORDS``; they are lower-cased before matching.

        Raises:
            ValueError: if ``num_sentences`` is smaller than 1.
        """
        super().__init__("Extractive")
        if num_sentences < 1:
            raise ValueError("num_sentences must be at least 1")
        self.num_sentences = num_sentences
        if keywords is None:
            self.keywords = self.DEFAULT_KEYWORDS
        else:
            self.keywords = tuple(keyword.lower() for keyword in keywords)

    def summarize(self, text: str) -> str:
        """Return the top-scoring sentences of ``text`` joined by spaces.

        Sentences that contain none of the keywords are never selected, so
        the result is an empty string when nothing matches.
        """
        scored_sentences = []
        for sentence in nltk.sent_tokenize(text):
            # Lower-case once per sentence instead of once per keyword.
            lowered = sentence.lower()
            score = sum(1 for keyword in self.keywords if keyword in lowered)
            if score > 0:
                scored_sentences.append((sentence, score))

        # list.sort is stable, so ties stay in document order.
        scored_sentences.sort(key=lambda pair: pair[1], reverse=True)
        top_sentences = scored_sentences[:self.num_sentences]

        return ' '.join(sentence for sentence, _ in top_sentences)

class BARTModel(ClinicalSummarizer):
    """BART-based abstractive summarization (``facebook/bart-large-cnn``)."""

    def __init__(self, device: int = -1):
        """
        Args:
            device: device index passed to the Hugging Face pipeline;
                -1 for CPU (default), 0 for the first GPU.
        """
        super().__init__("BART-Large-CNN")
        print("Loading BART model...")
        self.summarizer = pipeline("summarization",
                                   model="facebook/bart-large-cnn",
                                   device=device)

    def summarize(self, text: str) -> str:
        """Return an abstractive summary of ``text`` (50-150 generated tokens).

        The input is truncated by the pipeline's tokenizer
        (``truncation=True``). Counting whitespace-separated words, as this
        method used to do, does not bound the number of tokens, so long notes
        could still exceed the model's input limit.
        """
        result = self.summarizer(text,
                                 max_length=150,
                                 min_length=50,
                                 do_sample=False,
                                 truncation=True)
        return result[0]['summary_text']

class T5Model(ClinicalSummarizer):
    """Abstractive summarizer backed by the ``t5-base`` checkpoint."""

    def __init__(self):
        super().__init__("T5-Base")
        print("Loading T5 model...")
        checkpoint = "t5-base"
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    def summarize(self, text: str) -> str:
        """Summarize ``text`` using the "summarize: " task prefix.

        The prompt is truncated to 512 tokens; generation uses 4 beams and
        produces between 50 and 150 tokens.
        """
        prompt = "summarize: " + text
        encoded = self.tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )

        generation_settings = dict(
            max_length=150,
            min_length=50,
            length_penalty=2.0,
            num_beams=4,
        )
        generated = self.model.generate(encoded.input_ids, **generation_settings)

        # Only one input was supplied, so the first sequence is the summary.
        best_sequence = generated[0]
        return self.tokenizer.decode(best_sequence, skip_special_tokens=True)

class BioGPTModel(ClinicalSummarizer):
    """Stand-in for a clinical model, reported as "Clinical-BART".

    NOTE: despite the class name and label, this loads
    ``sshleifer/distilbart-cnn-12-6`` (a distilbart checkpoint used as a
    proxy); it is not BioGPT.
    """

    def __init__(self, device: int = -1):
        """
        Args:
            device: device index passed to the Hugging Face pipeline;
                -1 for CPU (default), 0 for the first GPU.
        """
        super().__init__("Clinical-BART")
        print("Loading Clinical BART model...")
        self.summarizer = pipeline("summarization",
                                   model="sshleifer/distilbart-cnn-12-6",
                                   device=device)

    def summarize(self, text: str) -> str:
        """Return an abstractive summary of ``text`` (40-130 generated tokens).

        The input is truncated by the pipeline's tokenizer
        (``truncation=True``). Counting whitespace-separated words, as this
        method used to do, does not bound the number of tokens.
        """
        result = self.summarizer(text,
                                 max_length=130,
                                 min_length=40,
                                 do_sample=False,
                                 truncation=True)
        return result[0]['summary_text']

In [None]:
# ============================================================================
# MODEL IMPLEMENTATIONS (re-defines the classes from the previous cell; these later definitions shadow the earlier ones)
# ============================================================================

class ClinicalSummarizer:
    """Base class for clinical note summarization models.

    Subclasses pass a human-readable ``model_name`` to ``__init__`` and
    override :meth:`summarize`.
    """

    def __init__(self, model_name: str):
        # Display name of the concrete model.
        self.model_name = model_name

    def summarize(self, text: str) -> str:
        """Return a summary of ``text``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in the base class. The message names
                the concrete class so that callers which log ``str(exc)`` get
                a useful line instead of an empty string.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement summarize()"
        )

class ExtractiveModel(ClinicalSummarizer):
    """Extractive summarization using keyword-based sentence scoring.

    Every sentence returned by ``nltk.sent_tokenize`` is scored by how many
    of the configured keywords occur in it (case-insensitive substring
    match). The highest-scoring sentences are joined with single spaces,
    best first; sentences with equal scores keep their document order.
    """

    DEFAULT_KEYWORDS = ('diagnosis', 'treatment', 'medication', 'discharge',
                        'patient', 'condition', 'procedure', 'follow-up',
                        'admitted', 'presented')

    def __init__(self, num_sentences: int = 3, keywords=None):
        """
        Args:
            num_sentences: maximum number of sentences kept in the summary
                (defaults to 3).
            keywords: optional iterable of keywords replacing
                ``DEFAULT_KEYWORDS``; they are lower-cased before matching.

        Raises:
            ValueError: if ``num_sentences`` is smaller than 1.
        """
        super().__init__("Extractive")
        if num_sentences < 1:
            raise ValueError("num_sentences must be at least 1")
        self.num_sentences = num_sentences
        if keywords is None:
            self.keywords = self.DEFAULT_KEYWORDS
        else:
            self.keywords = tuple(keyword.lower() for keyword in keywords)

    def summarize(self, text: str) -> str:
        """Return the top-scoring sentences of ``text`` joined by spaces.

        Sentences that contain none of the keywords are never selected, so
        the result is an empty string when nothing matches.
        """
        scored_sentences = []
        for sentence in nltk.sent_tokenize(text):
            # Lower-case once per sentence instead of once per keyword.
            lowered = sentence.lower()
            score = sum(1 for keyword in self.keywords if keyword in lowered)
            if score > 0:
                scored_sentences.append((sentence, score))

        # list.sort is stable, so ties stay in document order.
        scored_sentences.sort(key=lambda pair: pair[1], reverse=True)
        top_sentences = scored_sentences[:self.num_sentences]

        return ' '.join(sentence for sentence, _ in top_sentences)

class BARTModel(ClinicalSummarizer):
    """BART-based abstractive summarization (``facebook/bart-large-cnn``)."""

    def __init__(self, device: int = -1):
        """
        Args:
            device: device index passed to the Hugging Face pipeline;
                -1 for CPU (default), 0 for the first GPU.
        """
        super().__init__("BART-Large-CNN")
        print("Loading BART model...")
        self.summarizer = pipeline("summarization",
                                   model="facebook/bart-large-cnn",
                                   device=device)

    def summarize(self, text: str) -> str:
        """Return an abstractive summary of ``text`` (50-150 generated tokens).

        The input is truncated by the pipeline's tokenizer
        (``truncation=True``). Counting whitespace-separated words, as this
        method used to do, does not bound the number of tokens, so long notes
        could still exceed the model's input limit.
        """
        result = self.summarizer(text,
                                 max_length=150,
                                 min_length=50,
                                 do_sample=False,
                                 truncation=True)
        return result[0]['summary_text']

class T5Model(ClinicalSummarizer):
    """Abstractive summarizer backed by the ``t5-base`` checkpoint."""

    def __init__(self):
        super().__init__("T5-Base")
        print("Loading T5 model...")
        checkpoint = "t5-base"
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    def summarize(self, text: str) -> str:
        """Summarize ``text`` using the "summarize: " task prefix.

        The prompt is truncated to 512 tokens; generation uses 4 beams and
        produces between 50 and 150 tokens.
        """
        prompt = "summarize: " + text
        encoded = self.tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )

        generation_settings = dict(
            max_length=150,
            min_length=50,
            length_penalty=2.0,
            num_beams=4,
        )
        generated = self.model.generate(encoded.input_ids, **generation_settings)

        # Only one input was supplied, so the first sequence is the summary.
        best_sequence = generated[0]
        return self.tokenizer.decode(best_sequence, skip_special_tokens=True)

class BioGPTModel(ClinicalSummarizer):
    """Stand-in for a clinical model, reported as "Clinical-BART".

    NOTE: despite the class name and label, this loads
    ``sshleifer/distilbart-cnn-12-6`` (a distilbart checkpoint used as a
    proxy); it is not BioGPT.
    """

    def __init__(self, device: int = -1):
        """
        Args:
            device: device index passed to the Hugging Face pipeline;
                -1 for CPU (default), 0 for the first GPU.
        """
        super().__init__("Clinical-BART")
        print("Loading Clinical BART model...")
        self.summarizer = pipeline("summarization",
                                   model="sshleifer/distilbart-cnn-12-6",
                                   device=device)

    def summarize(self, text: str) -> str:
        """Return an abstractive summary of ``text`` (40-130 generated tokens).

        The input is truncated by the pipeline's tokenizer
        (``truncation=True``). Counting whitespace-separated words, as this
        method used to do, does not bound the number of tokens.
        """
        result = self.summarizer(text,
                                 max_length=130,
                                 min_length=40,
                                 do_sample=False,
                                 truncation=True)
        return result[0]['summary_text']

In [None]:
# ============================================================================
# EVALUATION METRICS
# ============================================================================

class MetricsCalculator:
    """Scores generated summaries against references.

    Provides sentence-level BLEU (NLTK and SacreBLEU), ROUGE-1/2/L F1 and a
    batch BERTScore.
    """

    def __init__(self):
        rouge_variants = ['rouge1', 'rouge2', 'rougeL']
        self.rouge = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
        self.smoothing = SmoothingFunction()

    def calculate_bleu(self, reference: str, hypothesis: str) -> float:
        """NLTK sentence BLEU on lower-cased, word-tokenized text."""
        tokenized_reference, tokenized_hypothesis = (
            nltk.word_tokenize(text.lower()) for text in (reference, hypothesis)
        )

        # method1 smoothing keeps short sentences from scoring exactly zero.
        return sentence_bleu(
            [tokenized_reference],
            tokenized_hypothesis,
            smoothing_function=self.smoothing.method1,
        )

    def calculate_sacrebleu(self, reference: str, hypothesis: str) -> float:
        """SacreBLEU sentence score rescaled from 0-100 to 0-1."""
        return sacrebleu.sentence_bleu(hypothesis, [reference]).score / 100.0

    def calculate_rouge(self, reference: str, hypothesis: str) -> Dict:
        """ROUGE-1, ROUGE-2 and ROUGE-L F-measures keyed as ``<variant>_f1``."""
        per_variant = self.rouge.score(reference, hypothesis)
        return {
            f'{variant}_f1': per_variant[variant].fmeasure
            for variant in ('rouge1', 'rouge2', 'rougeL')
        }

    def calculate_bertscore(self, references: List[str],
                           hypotheses: List[str]) -> Dict:
        """Mean BERTScore precision, recall and F1 over all pairs.

        The whole batch is scored in one call; hypotheses are passed to
        ``bert_score`` first, references second.
        """
        per_pair_scores = bert_score(hypotheses, references,
                                     lang="en", verbose=False)
        labels = ('precision', 'recall', 'f1')

        return {
            label: values.mean().item()
            for label, values in zip(labels, per_pair_scores)
        }

In [None]:
# ============================================================================
# MAIN EVALUATION PIPELINE
# ============================================================================

def evaluate_models():
    """Evaluate all models on sample data.

    Loads the four summarizers, generates one summary per model for every
    entry of ``sample_data`` and scores it against the reference with BLEU,
    SacreBLEU and ROUGE. BERTScore is computed once per model over all notes;
    the resulting model-level mean is copied onto every row of that model, so
    the BERTScore columns do not vary between notes of the same model.

    Returns:
        tuple: ``(df_results, all_references, all_hypotheses)``.
        ``df_results`` has one row per successfully scored (note, model)
        pair and always contains the three BERTScore columns (NaN where
        BERTScore was not computed). ``all_references`` is the list of
        reference summaries. ``all_hypotheses`` maps each model name to its
        generated summaries (``""`` where generation failed), aligned
        index-by-index with ``all_references``.
    """

    print("\n" + "="*70)
    print("INITIALIZING MODELS")
    print("="*70)

    # Initialize models
    models = {
        'Extractive': ExtractiveModel(),
        'BART-Large': BARTModel(),
        'T5-Base': T5Model(),
        'Clinical-BART': BioGPTModel()
    }

    print("\n✓ All models loaded successfully!\n")

    # Initialize metrics calculator
    metrics_calc = MetricsCalculator()

    # Store results
    results = []
    all_references = []
    all_hypotheses = {model_name: [] for model_name in models.keys()}

    print("="*70)
    print("GENERATING SUMMARIES")
    print("="*70 + "\n")

    # Generate summaries for each note
    for idx, item in enumerate(sample_data, 1):
        print(f"Processing Note {idx}/{len(sample_data)}...")
        note = item['note']
        reference = item['reference']
        all_references.append(reference)

        for model_name, model in models.items():
            # Generation and scoring are guarded separately so that exactly
            # one hypothesis is recorded per note. With a single try block a
            # metric failure appended "" after the summary had already been
            # stored, leaving the hypotheses misaligned with the references.
            try:
                summary = model.summarize(note)
            except Exception as e:
                print(f"  Error with {model_name}: {str(e)}")
                all_hypotheses[model_name].append("")
                continue

            all_hypotheses[model_name].append(summary)

            try:
                # Calculate individual metrics
                bleu = metrics_calc.calculate_bleu(reference, summary)
                # Named *_score so the local does not shadow the sacrebleu module.
                sacrebleu_score = metrics_calc.calculate_sacrebleu(reference, summary)
                rouge_scores = metrics_calc.calculate_rouge(reference, summary)
            except Exception as e:
                print(f"  Error with {model_name}: {str(e)}")
                continue

            results.append({
                'Note': idx,
                'Model': model_name,
                'BLEU': bleu,
                'SacreBLEU': sacrebleu_score,
                'ROUGE-1 F1': rouge_scores['rouge1_f1'],
                'ROUGE-2 F1': rouge_scores['rouge2_f1'],
                'ROUGE-L F1': rouge_scores['rougeL_f1'],
                'Summary': summary
            })

    print("\n✓ All summaries generated!\n")

    # Calculate BERTScore for all models
    print("="*70)
    print("CALCULATING BERTSCORE")
    print("="*70 + "\n")

    bertscore_results = {}
    for model_name, hypotheses in all_hypotheses.items():
        # Skip models with a missing/empty summary for any note.
        if all(h for h in hypotheses):
            bert_scores = metrics_calc.calculate_bertscore(all_references, hypotheses)
            bertscore_results[model_name] = bert_scores
            print(f"✓ {model_name}: BERTScore F1 = {bert_scores['f1']:.4f}")

    # Create results dataframe; explicit columns keep the schema stable even
    # when no (note, model) pair could be scored.
    result_columns = ['Note', 'Model', 'BLEU', 'SacreBLEU', 'ROUGE-1 F1',
                      'ROUGE-2 F1', 'ROUGE-L F1', 'Summary']
    df_results = pd.DataFrame(results, columns=result_columns)

    # Pre-create the BERTScore columns so downstream cells can always select
    # them, even if BERTScore was skipped for every model.
    for column in ('BERTScore F1', 'BERTScore Precision', 'BERTScore Recall'):
        df_results[column] = float('nan')

    # Add BERTScore (model-level mean) to every row of the matching model
    for model_name, scores in bertscore_results.items():
        model_mask = df_results['Model'] == model_name
        df_results.loc[model_mask, 'BERTScore F1'] = scores['f1']
        df_results.loc[model_mask, 'BERTScore Precision'] = scores['precision']
        df_results.loc[model_mask, 'BERTScore Recall'] = scores['recall']

    return df_results, all_references, all_hypotheses

In [None]:
# ============================================================================
# RUN EVALUATION
# ============================================================================

# Banner, then the full (slow) evaluation: loads every model and scores
# every sample note.
print("\n" + "=" * 70,
      "CLINICAL NOTES SUMMARIZATION - MODEL COMPARISON",
      "=" * 70,
      sep="\n")

df_results, references, hypotheses = evaluate_models()


CLINICAL NOTES SUMMARIZATION - MODEL COMPARISON

INITIALIZING MODELS
Loading BART model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Loading T5 model...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loading Clinical BART model...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu



✓ All models loaded successfully!

GENERATING SUMMARIES

Processing Note 1/3...
Processing Note 2/3...
Processing Note 3/3...

✓ All summaries generated!

CALCULATING BERTSCORE



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Extractive: BERTScore F1 = 0.8425


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ BART-Large: BERTScore F1 = 0.8666


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ T5-Base: BERTScore F1 = 0.8422


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Clinical-BART: BERTScore F1 = 0.8577


In [None]:
# ============================================================================
# RESULTS ANALYSIS
# ============================================================================

print("\n" + "=" * 70, "DETAILED RESULTS BY NOTE", "=" * 70 + "\n", sep="\n")

# Per-(note, model) scores; the generated summary text is left out of the table.
print(
    df_results[
        ['Note', 'Model', 'BLEU', 'SacreBLEU', 'ROUGE-1 F1',
         'ROUGE-2 F1', 'ROUGE-L F1', 'BERTScore F1']
    ].to_string(index=False)
)

print("\n" + "=" * 70, "AVERAGE SCORES BY MODEL", "=" * 70 + "\n", sep="\n")

# Mean of every metric per model, rounded to four decimals.
avg_scores = df_results.groupby('Model').agg({
    metric_column: 'mean'
    for metric_column in ('BLEU', 'SacreBLEU', 'ROUGE-1 F1', 'ROUGE-2 F1',
                          'ROUGE-L F1', 'BERTScore F1',
                          'BERTScore Precision', 'BERTScore Recall')
}).round(4)

print(avg_scores)


DETAILED RESULTS BY NOTE

 Note         Model     BLEU  SacreBLEU  ROUGE-1 F1  ROUGE-2 F1  ROUGE-L F1  BERTScore F1
    1    Extractive 0.023744   0.030920    0.323529    0.089552    0.176471      0.842457
    1    BART-Large 0.039663   0.056258    0.305882    0.096386    0.258824      0.866621
    1       T5-Base 0.051427   0.070488    0.285714    0.088235    0.228571      0.842182
    1 Clinical-BART 0.061811   0.086035    0.352941    0.151515    0.323529      0.857704
    2    Extractive 0.021747   0.026149    0.291262    0.118812    0.252427      0.842457
    2    BART-Large 0.034408   0.042463    0.405797    0.149254    0.347826      0.866621
    2       T5-Base 0.012820   0.021366    0.212121    0.031250    0.151515      0.842182
    2 Clinical-BART 0.019974   0.028209    0.280702    0.072727    0.245614      0.857704
    3    Extractive 0.020702   0.025891    0.336283    0.090090    0.265487      0.842457
    3    BART-Large 0.063929   0.065124    0.305556    0.085714    0.2500

In [None]:
# ============================================================================
# MODEL RANKING
# ============================================================================

print("\n" + "=" * 70, "MODEL RANKING", "=" * 70 + "\n", sep="\n")

# Metrics for which a best-to-worst ordering of the models is printed.
metrics_to_rank = ['BLEU', 'ROUGE-1 F1', 'ROUGE-L F1', 'BERTScore F1']

for metric in metrics_to_rank:
    print(f"\n{metric} Rankings:")
    # Highest average score first.
    ranked = avg_scores[metric].sort_values(ascending=False)
    for i, (model, score) in enumerate(ranked.items(), start=1):
        print(f"  {i}. {model:20s} {score:.4f}")


MODEL RANKING


BLEU Rankings:
  1. Clinical-BART        0.0539
  2. BART-Large           0.0460
  3. T5-Base              0.0418
  4. Extractive           0.0221

ROUGE-1 F1 Rankings:
  1. BART-Large           0.3391
  2. Clinical-BART        0.3362
  3. Extractive           0.3170
  4. T5-Base              0.2356

ROUGE-L F1 Rankings:
  1. BART-Large           0.2855
  2. Clinical-BART        0.2835
  3. Extractive           0.2315
  4. T5-Base              0.1963

BERTScore F1 Rankings:
  1. BART-Large           0.8666
  2. Clinical-BART        0.8577
  3. Extractive           0.8425
  4. T5-Base              0.8422


In [None]:
# ============================================================================
# SAMPLE SUMMARIES COMPARISON
# ============================================================================

print("\n" + "=" * 70,
      "SAMPLE SUMMARIES COMPARISON (Note 1)",
      "=" * 70 + "\n",
      sep="\n")

# Reference summary of the first note, followed by a dashed separator.
print("REFERENCE:", references[0], "\n" + "-" * 70 + "\n", sep="\n")

# Each model's summary of the same first note.
for model_name in hypotheses:
    print(f"{model_name}:", hypotheses[model_name][0], "\n" + "-" * 70 + "\n", sep="\n")


SAMPLE SUMMARIES COMPARISON (Note 1)

REFERENCE:
65-year-old male admitted with acute MI, treated with emergency cardiac catheterization and stent placement in LAD. Discharged on aspirin, clopidogrel, statin, beta-blocker, and ACE inhibitor. Follow-up with cardiology in 2 weeks.

----------------------------------------------------------------------

Extractive:
Patient: John Doe, Age: 65, Male
Admission Date: 01/15/2024, Discharge Date: 01/20/2024
Chief Complaint: Chest pain and shortness of breath
Diagnosis: Acute Myocardial Infarction (AMI), Coronary Artery Disease
Hospital Course: Patient presented to ED with severe chest pain radiating to left arm. Medications: Aspirin 81mg daily, Clopidogrel 75mg daily, Atorvastatin 80mg daily,
Metoprolol 50mg BID, Lisinopril 10mg daily
Follow-up: Cardiology clinic in 2 weeks, repeat echocardiogram in 1 month
Discharge Instructions: Strict cardiac diet, cardiac rehabilitation referral,
avoid strenuous activity for 4 weeks. Patient underwent emer

**Model Performance Summary based on Evaluation Metrics:**

Based on the evaluation performed on the sample clinical notes, here's a summary of the performance of the four pre-trained models:

*   **BART-Large:** Generally performed well across most metrics, achieving the highest average ROUGE-1 F1, ROUGE-L F1, and BERTScore F1 scores. It also ranked second in average BLEU score. This suggests BART-Large is quite effective at generating abstractive summaries that are relevant and capture the main points.

*   **Clinical-BART:** Despite its label, this model is `sshleifer/distilbart-cnn-12-6`, a smaller (distilled) version of BART fine-tuned on CNN/DailyMail news articles; it is used here only as a stand-in for a model fine-tuned on medical data. It showed promising results, with the highest average BLEU score and second place in average ROUGE-1 F1, ROUGE-L F1, and BERTScore F1, each only slightly behind BART-Large. Because the checkpoint was not trained on clinical text, these results say nothing yet about the benefit of clinical fine-tuning.

*   **Extractive:** This simple rule-based model had the lowest average BLEU and SacreBLEU scores, and the second lowest BERTScore F1. It ranked third in average ROUGE-1 F1 and ROUGE-L F1, ahead of T5-Base and not far behind the two BART models on ROUGE-1. This suggests that while it includes relevant keywords (contributing to ROUGE-1), the overall fluency and structure of its summaries (reflected in lower BLEU and ROUGE-L) are not as good as those of the BART-based abstractive models.

*   **T5-Base:** This model generally had the lowest average scores across most metrics, including the lowest ROUGE-1 F1, ROUGE-2 F1, ROUGE-L F1, and BERTScore F1 (the last only marginally below the Extractive model); its average BLEU ranked third. This suggests that T5-Base, in its base form without specific fine-tuning, might not be as suitable for this clinical summarization task compared to the BART-based models.

In summary, **BART-Large** and **Clinical-BART** appear to be the most promising models for this task based on these sample evaluations, with BART-Large performing slightly better overall across the abstractive metrics, and Clinical-BART showing strength with BLEU and competitive performance otherwise. The **Extractive** model is a simpler baseline but its ROUGE-1 score suggests it can capture important terms. **T5-Base** performed the least well in this comparison.

It's important to remember that these conclusions are based on a very small sample size, and a more comprehensive evaluation on a larger, diverse dataset of clinical notes would be needed to draw more definitive conclusions.

In [None]:
from transformers import pipeline, AutoTokenizer
import nltk

# Download the punkt sentence-splitting data if it is not available yet.
# nltk.data.find() raises LookupError for a missing resource. The previous
# `except nltk.downloader.DownloadError` named an attribute that does not
# exist, so a missing resource ended in AttributeError instead of a download.
# 'punkt_tab' is checked as well, matching the resources fetched by the
# notebook's import cell.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{tokenizer_resource}')
    except LookupError:
        nltk.download(tokenizer_resource, quiet=True)

class HybridSummarizer:
    """Chunk-then-summarize wrapper around a Hugging Face summarization pipeline.

    Short inputs are summarized directly. Inputs longer than the tokenizer's
    ``model_max_length`` are split into sentence-aligned chunks, each chunk is
    summarized, and the concatenated chunk summaries are summarized once more.
    """

    # Tokens held back from every chunk for the special tokens the tokenizer adds.
    _SPECIAL_TOKEN_MARGIN = 10

    def __init__(self, model_name="sshleifer/distilbart-cnn-12-6"):
        """
        Initializes the summarization pipeline and tokenizer.
        Uses a smaller BART-like model for efficiency in this example.
        """
        print(f"Loading summarization model: {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Use the pipeline for simplicity in a backend service
        self.abstractive_pipeline = pipeline(
            "summarization",
            model=model_name,
            tokenizer=self.tokenizer,
            device=-1  # Use CPU (-1) or GPU (0, 1, etc.)
        )
        # Input limit as reported by the tokenizer (1024 for the default model)
        self.max_input_length = self.tokenizer.model_max_length
        print(f"Model loaded with max input length: {self.max_input_length}")

    def chunk_text_by_sentences(self, text, max_len):
        """
        Splits text into chunks of whole sentences that fit the token limit.

        Sentences are packed greedily; a new chunk is started when adding the
        next sentence (plus the reserved margin) would exceed ``max_len``.
        A single sentence that does not fit into one chunk is no longer
        dropped: it is cut into consecutive token windows, each emitted as
        its own chunk.

        Args:
            text: the document to split.
            max_len: maximum number of tokens per chunk, including the
                margin reserved for special tokens.

        Returns:
            list of chunk strings, in document order.

        Raises:
            ValueError: if ``max_len`` leaves no room after the margin.
        """
        budget = max_len - self._SPECIAL_TOKEN_MARGIN
        if budget <= 0:
            raise ValueError(
                f"max_len must be greater than {self._SPECIAL_TOKEN_MARGIN}"
            )

        chunks = []
        current_chunk_sentences = []
        current_chunk_length = 0

        for sent in nltk.sent_tokenize(text):
            token_ids = self.tokenizer.encode(sent, add_special_tokens=False)
            sent_len = len(token_ids)

            if sent_len >= budget:
                # Over-long sentence: close the running chunk, then emit the
                # sentence as fixed-size token windows so no text is lost.
                if current_chunk_sentences:
                    chunks.append(" ".join(current_chunk_sentences))
                    current_chunk_sentences = []
                    current_chunk_length = 0
                for start in range(0, sent_len, budget):
                    window = token_ids[start:start + budget]
                    chunks.append(
                        self.tokenizer.decode(window, skip_special_tokens=True)
                    )
                continue

            # If adding the next sentence exceeds the limit, finalize the
            # current chunk and start a new one.
            if (current_chunk_length + sent_len + self._SPECIAL_TOKEN_MARGIN > max_len
                    and current_chunk_sentences):
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = []
                current_chunk_length = 0

            current_chunk_sentences.append(sent)
            current_chunk_length += sent_len + 1  # +1 for the joining space

        if current_chunk_sentences:
            chunks.append(" ".join(current_chunk_sentences))

        return chunks

    def summarize(self, text, max_summary_length=150, min_summary_length=30):
        """
        Hybrid summarization function.
        1. Chunks long text into sentence-aligned pieces.
        2. Abstractively summarizes chunks individually.
        3. Re-summarizes the concatenated summaries if multiple chunks exist.

        Args:
            text: the document to summarize.
            max_summary_length: ``max_length`` passed to every pipeline call.
            min_summary_length: ``min_length`` of the final summary; divided
                by the number of chunks for the per-chunk calls.

        Returns:
            the summary string, or ``""`` for empty/whitespace-only input.
        """
        if not text or not text.strip():
            return ""

        # Token length of the original document; a plain id list is enough,
        # no tensor is needed just to count tokens.
        original_length = len(self.tokenizer.encode(text, truncation=False))

        if original_length <= self.max_input_length:
            print("Text is short enough, performing direct abstractive summarization.")
            chunks = [text]
        else:
            print(f"Text too long ({original_length} tokens). Chunking for hybrid approach.")
            # Step 1: split the long text into manageable chunks
            chunks = self.chunk_text_by_sentences(text, self.max_input_length)

        # Step 2: abstractively summarize each chunk
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            summary = self.abstractive_pipeline(
                chunk,
                max_length=max_summary_length,
                min_length=min_summary_length // len(chunks),  # Adjust min length per chunk
                do_sample=False,  # no sampling
                truncation=True,  # safety net if a chunk re-tokenizes longer
            )[0]['summary_text']
            chunk_summaries.append(summary)

        final_summary = " ".join(chunk_summaries)

        # Step 3: re-summarize if multiple summaries were generated
        if len(chunks) > 1:
            print("Re-summarizing combined chunks for final coherence.")
            # With many chunks the joined summaries can themselves exceed the
            # input limit, hence truncation=True.
            final_summary = self.abstractive_pipeline(
                final_summary,
                max_length=max_summary_length,
                min_length=min_summary_length,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']

        return final_summary

# Instantiate the summarizer with its default model
# (this loads the tokenizer and the summarization pipeline).
summarizer = HybridSummarizer()

# Example clinical note. The text is repeated so that it exceeds the model's
# input limit and the chunking path of summarize() is exercised.
clinical_note = """
Patient John Doe, a 68-year-old male with a history of COPD, T2DM, and chronic renal insufficiency,
was admitted on 2024-10-06 with increasing shortness of breath and fever for 3 days.
Initial vitals: T 101.5F, HR 105, BP 110/70, RR 24, SpO2 88% on room air.
Chest X-ray showed a new left lower lobe infiltrate, consistent with pneumonia.
Blood work revealed leukocytosis (WBC 18.5) and elevated C-reactive protein (CRP 150).
The patient was started empirically on Ceftriaxone and Azithromycin after blood cultures were drawn.
Respiratory status initially worsened, requiring high-flow nasal cannula.
On hospital day 3, the fever resolved, and oxygen requirements began to trend down.
Repeat chest X-ray on day 4 showed improvement in the infiltrate.
The patient is tolerating oral intake and ambulating without difficulty in the room.
Pending cultures were negative. He is transitioning to oral antibiotics for discharge.
His blood glucose has been managed with a sliding scale insulin protocol.
He requires a follow-up appointment with his primary care physician in one week.
Education on new medications and 'red flag' symptoms for readmission was provided.
""" * 5 # Repeat the note 5 times to simulate a very long document

# Generate the summary (final summary between 40 and 150 generated tokens)
final_summary = summarizer.summarize(
    clinical_note,
    max_summary_length=150,
    min_summary_length=40
)

# Show the result
print("\n--- Final Summary ---")
print(final_summary)

Loading summarization model: sshleifer/distilbart-cnn-12-6...


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1417 > 1024). Running this sequence through the model will result in indexing errors


Model loaded with max input length: 1024
Text too long (1417 tokens). Chunking for hybrid approach.
Summarizing chunk 1/2...
Summarizing chunk 2/2...
Re-summarizing combined chunks for final coherence.

--- Final Summary ---
 Patient John Doe, a 68-year-old male with a history of COPD, T2DM, and chronic renal insufficiency, was admitted on 2024-10-06 with increasing shortness of breath and fever for 3 days . On hospital day 3, fever resolved, and oxygen requirements began to trend down . Chest X-ray showed a new left lower lobe infiltrate, consistent with pneumonia . Pending cultures were negative . He is transitioning to oral antibiotics for discharge .


In [None]:
import nltk
from typing import List

class ClinicalSummarizer:
    """Base class for clinical note summarization models.

    The base class only records a model name; subclasses are expected to
    override :meth:`summarize` with an actual summarization strategy.
    """

    def __init__(self, model_name: str):
        """Store the name that identifies this summarizer.

        Args:
            model_name: Label for the model; kept on ``self.model_name``.
        """
        self.model_name = model_name

    def summarize(self, text: str) -> str:
        """Return a summary of ``text``.

        Args:
            text: The clinical note to summarize.

        Returns:
            The summary string (in subclasses).

        Raises:
            NotImplementedError: Always, in this base class. The message
                names the concrete class so a missing override is easy to
                locate.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement summarize()"
        )

class ExtractiveModel(ClinicalSummarizer):
    """Extractive summarization using keyword-based sentence scoring.

    A sentence's score is the number of ``KEYWORDS`` that occur in it as
    case-insensitive substrings. The summary is the highest-scoring
    sentences joined with single spaces.
    """

    # Terms whose presence marks a sentence as a summary candidate.
    KEYWORDS = ('diagnosis', 'treatment', 'medication', 'discharge',
                'patient', 'condition', 'procedure', 'follow-up',
                'admitted', 'presented')

    def __init__(self):
        """Register the summarizer under the model name ``"Extractive"``."""
        super().__init__("Extractive")

    def summarize(self, text: str, num_sentences: int = 3) -> str:
        """Build a summary from the top keyword-scoring sentences of ``text``.

        Args:
            text: The clinical note to summarize. Empty or ``None`` input
                yields an empty summary.
            num_sentences: Maximum number of sentences to keep (default 3,
                matching the previous hard-coded limit). Values below zero
                are treated as zero.

        Returns:
            The selected sentences, highest score first, joined by spaces.
            Sentences that contain no keyword are never selected, so the
            result may be shorter than ``num_sentences`` or empty.
        """
        if not text:
            return ''

        scored_sentences = []
        seen = set()
        for sentence in nltk.sent_tokenize(text):
            # Lowercase once per sentence instead of once per keyword.
            lowered = sentence.lower()

            # Skip exact repeats (ignoring case) so a note with duplicated
            # content cannot fill the summary with copies of one sentence.
            if lowered in seen:
                continue
            seen.add(lowered)

            score = sum(1 for keyword in self.KEYWORDS if keyword in lowered)
            if score > 0:
                scored_sentences.append((sentence, score))

        # list.sort is stable, so sentences with equal scores keep their
        # original document order.
        scored_sentences.sort(key=lambda item: item[1], reverse=True)
        limit = max(num_sentences, 0)

        return ' '.join(sentence for sentence, _ in scored_sentences[:limit])

# To use this locally, you would instantiate the class:
# extractive_summarizer = ExtractiveModel()
# summary = extractive_summarizer.summarize("Your clinical note text here...")
# print(summary)

In [None]:
# Load the summarization pipeline and persist its model and tokenizer to disk.
# NOTE(review): the messages below say "Clinical-BART", while the checkpoint
# loaded is "sshleifer/distilbart-cnn-12-6" -- confirm the intended naming.
try:
    # A fresh pipeline gives access to the model and tokenizer objects to save
    clinical_bart_model_pipeline = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=-1,  # -1 selects CPU; use 0 for GPU
    )

    output_dir = "./clinical_bart_saved_model"

    # Both the model and the tokenizer are written to the same directory
    clinical_bart_model_pipeline.model.save_pretrained(output_dir)
    clinical_bart_model_pipeline.tokenizer.save_pretrained(output_dir)

except Exception as e:
    # Best-effort: report the failure and let the notebook continue
    print(f"Error saving Clinical-BART model: {e}")

else:
    print(f"Clinical-BART model saved to {output_dir}")

Device set to use cpu


Clinical-BART model saved to ./clinical_bart_saved_model
