In [1]:
!pip install pandas transformers torch rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=55a8223a4094dd753d11b48e2e71ab67992a287ab12e2c4c79ec810f05defcc8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
import pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from rouge_score import rouge_scorer
import torch


In [3]:
# Load the Pegasus tokenizer and model
# Both objects come from the same checkpoint name and are kept as notebook-level
# globals: summarize_text() reads `tokenizer` and `model` directly.
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# NOTE(review): the load log reports the encoder/decoder `embed_positions` weights as
# "newly initialized" — presumably harmless for this checkpoint, but confirm before
# trusting the generated summaries.
model = PegasusForConditionalGeneration.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [4]:
# Function to summarize a given text using Pegasus
def summarize_text(text):
    """Generate an abstractive summary of ``text`` with the loaded Pegasus model.

    Args:
        text: The document to summarize. Anything that is not a non-blank
            string (e.g. ``None`` or a NaN read from an empty CSV cell) is
            treated as "nothing to summarize".

    Returns:
        The decoded summary string, or ``""`` when there is no usable input.
    """
    # Missing CSV cells arrive as float NaN and blank strings carry no content;
    # skip the model instead of crashing the tokenizer or summarizing nothing.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Tokenize the input text (truncated to the tokenizer's maximum length)
    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    # Keep the tensors on the same device as the model so this also works
    # after the model has been moved to a GPU.
    inputs = inputs.to(model.device)

    # Generate the summary; pass the attention mask explicitly so padded
    # positions are never attended to.
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=60,
        num_beams=5,
        early_stopping=True,
    )

    # Decode the first (only) sequence back to text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [5]:
# Load the CSV file
csv_file_path = 'medical_text_summarization_detailed_1000.csv'  # Provide the correct path to the file
df = pd.read_csv(csv_file_path)

# Fail fast with a clear message if the file does not have the columns the
# rest of the notebook relies on, rather than erroring during summarization.
missing_columns = {'Original Text', 'Summarized Text'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{csv_file_path} is missing required column(s): {sorted(missing_columns)}"
    )


In [6]:
# Preview the first rows to check which columns were read from the CSV.
df.head()


Unnamed: 0,Original Text,Summarized Text
0,80-year-old female presented with a diagnosis ...,"80-year-old female with asthma, initial vitals..."
1,75-year-old female presented with a diagnosis ...,"75-year-old female with diabetes, initial vita..."
2,60-year-old female presented with a diagnosis ...,"60-year-old female with hypertension, initial ..."
3,55-year-old male presented with a diagnosis of...,"55-year-old male with stroke, initial vitals: ..."
4,30-year-old male presented with a diagnosis of...,"30-year-old male with stroke, initial vitals: ..."


In [7]:
# Summarize texts from the CSV
# 'Summarized Text' initially holds the dataset's own reference summaries and is
# about to be overwritten with model output. Keep a copy first; the guard stops a
# re-run of this cell from replacing the references with generated text.
if 'Reference Summary' not in df.columns:
    df['Reference Summary'] = df['Summarized Text']
df['Summarized Text'] = df['Original Text'].apply(summarize_text)


In [8]:
# Function to calculate ROUGE scores
_rouge_scorer_cache = None  # built on first use, then shared by every call


def calculate_rouge(reference, generated):
    """Score ``generated`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: The text treated as ground truth (the scorer's ``target``).
        generated: The candidate text being evaluated (the scorer's ``prediction``).

    Returns:
        A dict keyed by ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` holding the
        score object the scorer produces for each metric.
    """
    global _rouge_scorer_cache
    # The scorer configuration never changes, so build it once instead of
    # re-creating it (and its stemmer) for every row that gets scored.
    if _rouge_scorer_cache is None:
        _rouge_scorer_cache = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
        )
    # Calculate the ROUGE scores
    return _rouge_scorer_cache.score(reference, generated)


In [10]:
# Calculate ROUGE scores for each row
# NOTE(review): the source document is passed as the ROUGE reference here, so the
# scores measure overlap with the input text rather than with a gold summary —
# confirm this is the intended evaluation.
rouge_per_row = [
    calculate_rouge(source_text, summary_text)
    for source_text, summary_text in zip(df['Original Text'], df['Summarized Text'])
]
df['ROUGE Scores'] = rouge_per_row

# The score objects are written to CSV as an opaque text repr, so also expose
# the F-measure of each metric as a plain numeric column that can be read back.
for metric in ('rouge1', 'rouge2', 'rougeL'):
    df[f'{metric}_f1'] = [scores[metric].fmeasure for scores in rouge_per_row]


In [11]:
# Save the updated dataframe with summaries and ROUGE scores to a new CSV
# The path is defined once so the file written and the message printed cannot drift apart.
output_csv_path = 'medical_text_summarization_detailed_with_rouge.csv'
df.to_csv(output_csv_path, index=False)

print(f"Summaries and ROUGE scores have been generated and saved to '{output_csv_path}'.")


Summaries and ROUGE scores have been generated and saved to 'medical_text_summarization_detailed_with_rouge.csv'.


In [12]:
# Step 3: Allow user to input text for summarization
def user_input_summarization():
    """Prompt for a medical text, then print its summary and ROUGE scores.

    Reads one line from standard input. Blank input is reported and skipped
    instead of being sent to the model. Returns ``None``; all results are printed.
    """
    user_text = input("Enter the medical text you want to summarize: ").strip()
    if not user_text:
        # Nothing meaningful to summarize or score.
        print("No text entered; nothing to summarize.")
        return

    summary = summarize_text(user_text)
    print(f"Summary: {summary}")

    # Calculate and display ROUGE scores using the original input as reference
    scores = calculate_rouge(user_text, summary)
    print(f"\nROUGE Scores: {scores}")

# Call the user input function at the end
# (waits for text typed at the input() prompt, then prints the summary and its ROUGE scores)
user_input_summarization()


Enter the medical text you want to summarize: A 68-year-old male patient with a history of chronic heart failure, hypertension, and type 2 diabetes was admitted to the hospital with complaints of worsening shortness of breath, fatigue, and swelling in the lower extremities over the past week. On examination, the patient was noted to have bilateral pitting edema, elevated jugular venous pressure, and rales in the lower lung fields. Echocardiography revealed a reduced ejection fraction of 35%, indicating a decline in cardiac function. The patient was started on intravenous diuretics and low-dose beta-blockers to manage fluid overload and improve cardiac output. He was also given dietary counseling to reduce sodium intake. Over the next five days, his symptoms improved, with reduced edema and improved breathing. The patient was discharged on oral medications including diuretics, ACE inhibitors, and insulin, with instructions for close follow-up with his cardiologist and primary care provi