In [1]:
!pip install transformers
!pip install rouge_score
import pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
from rouge_score import rouge_scorer  # For evaluating ROUGE score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... done
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=f47f89740fe48ce7cf64d5e4d6b4132f48712597d3a3393e2c99a7647e472e54
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
# Pegasus checkpoint used for summarization; the tokenizer and the model are
# both fetched from the same Hugging Face Hub identifier.
model_name = "google/pegasus-xsum"
tokenizer, model = (
    PegasusTokenizer.from_pretrained(model_name),
    PegasusForConditionalGeneration.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [3]:
# Function to summarize a given text using Pegasus
def summarize_text(text, max_length=60, num_beams=5):
    """Generate an abstractive summary of ``text`` with the loaded Pegasus model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The document to summarize. Over-long inputs are truncated by the
            tokenizer (``truncation=True``).
        max_length: Upper bound passed to ``model.generate`` for the length of
            the generated summary, in tokens.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The decoded summary string with special tokens removed. A blank or
        whitespace-only string yields ``""`` without invoking the model, since
        there is nothing to summarize and the model would otherwise make up text.
    """
    # Nothing to summarize: skip the model rather than let it produce output
    # from an effectively empty prompt.
    if isinstance(text, str) and not text.strip():
        return ""

    # Tokenize the input text and place the tensors on the model's device so the
    # function keeps working if the model has been moved (e.g. to a GPU).
    inputs = tokenizer(
        text, truncation=True, padding="longest", return_tensors="pt"
    ).to(model.device)

    # Generate the summary; the attention mask is passed explicitly so padded
    # positions are never attended to.
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode the first (and, for a single input string, only) generated sequence
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [4]:
# Step 1: Read the evaluation dataset from disk into a DataFrame
csv_file_path = 'medical_text_summarization_detailed_500.csv'  # Adjust if the file lives elsewhere
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [5]:
# Peek at the last five rows to sanity-check the loaded columns
df.tail(n=5)

Unnamed: 0,Original Text,Summarized Text
495,60-year-old female presented with a diagnosis ...,"60-year-old female with stroke, initial vitals..."
496,70-year-old female presented with a diagnosis ...,"70-year-old female with rheumatoid arthritis, ..."
497,80-year-old female presented with a diagnosis ...,"80-year-old female with heart failure, initial..."
498,75-year-old female presented with a diagnosis ...,"75-year-old female with Alzheimer's disease, i..."
499,40-year-old male presented with a diagnosis of...,"40-year-old male with Alzheimer's disease, ini..."


In [6]:
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)


In [7]:
# Per-example F-measures, one list per ROUGE variant
rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

# Walk the (source text, reference summary) pairs column-wise; both column
# names are assumed to exist in the CSV.
for original_text, reference_summary in zip(df['Original Text'], df['Summarized Text']):
    # Summarize the source text with Pegasus
    generated_summary = summarize_text(original_text)

    # Score the generated summary against the reference (reference goes first)
    scores = scorer.score(reference_summary, generated_summary)

    # Record the F-measure of each ROUGE variant in its own list
    for metric, collected in (
        ('rouge1', rouge1_scores),
        ('rouge2', rouge2_scores),
        ('rougeL', rougeL_scores),
    ):
        collected.append(scores[metric].fmeasure)

In [8]:
# Step 3: Arithmetic mean of the per-example F-measures for each ROUGE variant
avg_rouge1, avg_rouge2, avg_rougeL = (
    sum(per_example) / len(per_example)
    for per_example in (rouge1_scores, rouge2_scores, rougeL_scores)
)

# Step 4: Report the three averages, one per line, to four decimal places
print(
    f"Average ROUGE-1 Score: {avg_rouge1:.4f}",
    f"Average ROUGE-2 Score: {avg_rouge2:.4f}",
    f"Average ROUGE-L Score: {avg_rougeL:.4f}",
    sep="\n",
)

Average ROUGE-1 Score: 0.2050
Average ROUGE-2 Score: 0.1087
Average ROUGE-L Score: 0.1852


In [9]:
# Step 5: User Input Text Summarization
# NOTE: input() blocks until the user types something, so this cell needs
# interaction when the notebook is run top to bottom.
user_input = input("Enter text to summarize: ")

if user_input.strip():
    generated_summary = summarize_text(user_input)
    print(f"Summarized Text: {generated_summary}")
else:
    # Blank input would otherwise be sent to the model, which would return a
    # summary unrelated to anything the user wrote.
    print("No text entered; nothing to summarize.")

Enter text to summarize: A 65-year-old female with a history of chronic obstructive pulmonary disease (COPD), hypertension, and type 2 diabetes was admitted to the hospital with shortness of breath and chest tightness. She was diagnosed with acute exacerbation of COPD and treated with intravenous steroids, bronchodilators, and oxygen therapy. During her stay, her blood pressure was managed with medication, and her diabetes was controlled through insulin therapy. She showed gradual improvement and was discharged after 7 days with a plan for follow-up with her pulmonologist. The patient was advised to continue her medications and use a home nebulizer.
Summarized Text: Here is a case report of a COPD patient who was discharged from the hospital with gradual improvement.
