BART Approach with CNN/DailyMail dataset

In [None]:
!pip install transformers
!pip install datasets

# Import required libraries
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset

# Load pre-trained BART model and tokenizer
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Load the test split of the CNN/DailyMail dataset (version 3.0.0)
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
texts = dataset['article'] #Extract the 'article' column from the dataset, which contains news articles

# Get the number of articles
num_articles = len(texts)
print(f"\nNumber of articles in the dataset: {num_articles}")

# Function to generate a summary using the BART model
def summarize(text):
    """Return a summary of ``text`` generated by the module-level BART model.

    Args:
        text: The article text to summarize.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    # Encode the article as a batch of one, truncated to at most 1024 tokens
    encoded = tokenizer(
        [text],
        truncation=True,
        max_length=1024,
        return_tensors='pt',
    )

    # Generation settings: 4 beams, output length bounded to 40..250 tokens
    generation_options = {
        "max_length": 250,
        "min_length": 40,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(encoded['input_ids'], **generation_options)

    # Only one sequence was submitted, so the first result is the summary
    return tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Function to print the selected article together with its generated summary
def pick_article(index):
    """Print the article at ``index`` followed by its generated summary.

    Args:
        index: Zero-based position of the article in the module-level ``texts``.

    Returns:
        None. When ``index`` is outside ``0 .. len(texts) - 1`` an
        out-of-range message is printed and nothing is summarized.
    """
    # Validate explicitly instead of relying on IndexError: with a list-like
    # ``texts`` a negative index would wrap around to the end and be labelled
    # "Original Article 0" or lower.
    if not 0 <= index < len(texts):
        print(f"Article index {index} is out of range. Please select a valid index.")
        return

    article = texts[index]
    # Not wrapped in try/except: an error raised while summarizing should
    # surface as-is rather than be reported as a bad article index.
    summary = summarize(article)

    # Print the original article (numbered from 1) and the generated summary
    print(f"\nOriginal Article {index + 1}:\n", article)
    print("\nGenerated Summary:\n", summary)
    print("\n" + "-"*80 + "\n")

# Prompt the user to pick an article index
# Ask for the article index; pressing Enter without typing anything selects index 0
raw_index = input(f"Enter the index of the article you want to summarize (0 to {num_articles - 1}) or press Enter to use default (0): ")
try:
    # Only the conversion is guarded, so a ValueError raised later while
    # summarizing is not misreported as invalid user input
    index = int(raw_index) if raw_index.strip() != "" else 0
except ValueError:
    print("Invalid input. Please enter a valid number.")
else:
    pick_article(index)




Number of articles in the dataset: 11490
Enter the index of the article you want to summarize (0 to 11489) or press Enter to use default (0): 1

Original Article 2:
 (CNN)Never mind cats having nine lives. A stray pooch in Washington State has used up at least three of her own after being hit by a car, apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field -- only to survive. That's according to Washington State University, where the dog -- a friendly white-and-black bully breed mix now named Theia -- has been receiving care at the Veterinary Teaching Hospital. Four days after her apparent death, the dog managed to stagger to a nearby farm, dirt-covered and emaciated, where she was found by a worker who took her to a vet for help. She was taken in by Moses Lake, Washington, resident Sara Mellado. "Considering everything that she's been through, she's incredibly gentle and loving," Mellado said, according to WSU News. "She's a true miracle 

Google T5 Approach with CNN/DailyMail dataset


In [None]:
!pip install transformers
!pip install datasets

# Import required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset

# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Load the test split of the CNN/DailyMail dataset (version 3.0.0)
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
texts = dataset['article'] #Extract the 'article' column from the dataset, which contains news articles

# Get the number of articles
num_articles = len(texts)
print(f"\nNumber of articles in the dataset: {num_articles}")

# Function to generate a summary using the T5 model
def summarize(text):
    """Return a summary of ``text`` generated by the module-level T5 model.

    Args:
        text: The article text to summarize.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    # T5 is driven by a task prefix, so the request is spelled out in the input
    prefixed_text = "summarize: " + text

    # Encode the prefixed text, truncated to at most 512 tokens
    encoded = tokenizer.encode(
        prefixed_text,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )

    # Generation settings: 4 beams, output length bounded to 40..200 tokens
    generation_options = {
        "max_length": 200,
        "min_length": 40,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(encoded, **generation_options)

    # Only one sequence was submitted, so the first result is the summary
    return tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Function to print the selected article together with its generated summary
def pick_article(index):
    """Print the article at ``index`` followed by its generated summary.

    Args:
        index: Zero-based position of the article in the module-level ``texts``.

    Returns:
        None. When ``index`` is outside ``0 .. len(texts) - 1`` an
        out-of-range message is printed and nothing is summarized.
    """
    # Validate explicitly instead of relying on IndexError: with a list-like
    # ``texts`` a negative index would wrap around to the end and be labelled
    # "Original Article 0" or lower.
    if not 0 <= index < len(texts):
        print(f"Article index {index} is out of range. Please select a valid index.")
        return

    article = texts[index]
    # Not wrapped in try/except: an error raised while summarizing should
    # surface as-is rather than be reported as a bad article index.
    summary = summarize(article)

    # Print the original article (numbered from 1) and the generated summary
    print(f"\nOriginal Article {index + 1}:\n", article)
    print("\nGenerated Summary:\n", summary)
    print("\n" + "-"*80 + "\n")

# Prompt the user to pick an article index
# Ask for the article index; pressing Enter without typing anything selects index 0
raw_index = input(f"Enter the index of the article you want to summarize (0 to {num_articles - 1}) or press Enter to use default (0): ")
try:
    # Only the conversion is guarded, so a ValueError raised later while
    # summarizing is not misreported as invalid user input
    index = int(raw_index) if raw_index.strip() != "" else 0
except ValueError:
    print("Invalid input. Please enter a valid number.")
else:
    pick_article(index)



Number of articles in the dataset: 11490
Enter the index of the article you want to summarize (0 to 11489) or press Enter to use default (0): 3

Original Article 4:
 (CNN)Five Americans who were monitored for three weeks at an Omaha, Nebraska, hospital after being exposed to Ebola in West Africa have been released, a Nebraska Medicine spokesman said in an email Wednesday. One of the five had a heart-related issue on Saturday and has been discharged but hasn't left the area, Taylor Wilson wrote. The others have already gone home. They were exposed to Ebola in Sierra Leone in March, but none developed the deadly virus. They are clinicians for Partners in Health, a Boston-based aid group. They all had contact with a colleague who was diagnosed with the disease and is being treated at the National Institutes of Health in Bethesda, Maryland. As of Monday, that health care worker is in fair condition. The Centers for Disease Control and Prevention in Atlanta has said the last of 17 patients

LLM Approach using LangChain with CNN/DailyMail dataset

In [None]:
%pip install --upgrade --quiet tiktoken langchain langgraph beautifulsoup4 langchain langchain-google-genai langchain-huggingface datasets
import os
os.environ["GOOGLE_API_KEY"] = "<Google API key>"  # Replace <Google API key> with your Google api key
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

# Load the test split of the CNN/DailyMail dataset (version 3.0.0)
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
texts = dataset['article'] #Extract the 'article' column from the dataset, which contains news articles

def load_llm(model="gemini-1.5-flash"):
    """Create the Gemini chat model identified by ``model``.

    Args:
        model: Either "gemini-1.5-pro" or "gemini-1.5-flash" (the default).

    Returns:
        A ``ChatGoogleGenerativeAI`` instance created with temperature 0,
        ``max_tokens`` and ``timeout`` left as None, and ``max_retries`` of 2.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.
    """
    supported_models = ("gemini-1.5-pro", "gemini-1.5-flash")
    if model not in supported_models:
        raise ValueError("Invalid model name")

    # Both supported models use the same settings; only the name differs
    return ChatGoogleGenerativeAI(
        model=model,
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

def get_prompt_template():
    """Build the chat prompt used for summarization.

    Returns:
        A ``ChatPromptTemplate`` made of a system message that asks for a
        concise summary in ``{num_words}`` words and a human message that
        carries the text to summarize as ``{context}``.
    """
    # The word count is a template variable, filled in when the chain is invoked
    system_message = (
        "system",
        "Write a concise summary of the following in {num_words} words:\n\n",
    )
    human_message = ("human", "{context}")
    return ChatPromptTemplate.from_messages([system_message, human_message])

def summarize_text(text, num_words=50, model="gemini-1.5-flash"):
    """Summarize ``text`` with a Gemini chat model.

    Args:
        text: The text to summarize; fills the prompt's ``context`` variable.
        num_words: Word count requested in the prompt (default 50).
        model: Model name forwarded to ``load_llm``.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    llm = load_llm(model)
    # Pipe the prompt template into the model to form the chain
    chain = get_prompt_template() | llm
    prompt_values = {"context": text, "num_words": num_words}
    response = chain.invoke(prompt_values)
    return response.content

# Get the number of articles; the input prompt further down also uses it to show the valid index range
num_articles = len(texts)
print(f"\nNumber of articles in the dataset: {num_articles}")

# Function to print the selected article together with its generated summary
def pick_article(index):
    """Print the article at ``index`` followed by its generated summary.

    Args:
        index: Zero-based position of the article in the module-level ``texts``.

    Returns:
        None. When ``index`` is outside ``0 .. len(texts) - 1`` an
        out-of-range message is printed and nothing is summarized.
    """
    # Validate explicitly instead of relying on IndexError: with a list-like
    # ``texts`` a negative index would wrap around to the end and be labelled
    # "Original Article 0" or lower.
    if not 0 <= index < len(texts):
        print(f"Article index {index} is out of range. Please select a valid index.")
        return

    article = texts[index]
    # Not wrapped in try/except: an error raised while summarizing should
    # surface as-is rather than be reported as a bad article index.
    summary = summarize_text(article, num_words=50, model="gemini-1.5-flash")

    # Print the original article (numbered from 1) and the generated summary
    print(f"\nOriginal Article {index + 1}:\n", article)
    print("\nGenerated Summary:\n", summary)
    print("\n" + "-"*80 + "\n")

# Prompt the user to pick an article index
# Ask for the article index; pressing Enter without typing anything selects index 0
raw_index = input(f"Enter the index of the article you want to summarize (0 to {num_articles - 1}) or press Enter to use default (0): ")
try:
    # Only the conversion is guarded, so a ValueError raised later while
    # summarizing (for example by load_llm) is not misreported as invalid user input
    index = int(raw_index) if raw_index.strip() != "" else 0
except ValueError:
    print("Invalid input. Please enter a valid number.")
else:
    pick_article(index)




Number of articles in the dataset: 11490
Enter the index of the article you want to summarize (0 to 11489) or press Enter to use default (0): 0

Original Article 1:
 (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neit