In [None]:
!pip install evaluate

In [None]:
from google.colab import userdata
import os
# Copy the Gemini key from google.colab's `userdata` store into the process environment.
# Presumably ChatGoogleGenerativeAI picks GOOGLE_API_KEY up from there, since no key is passed
# to it explicitly anywhere in this notebook — confirm against the langchain-google-genai docs.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
# Disabled: nothing visible in this notebook reads a Hugging Face Hub token.
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HUGGINGFACEHUB_API_TOKEN')

In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
import evaluate

In [None]:
%pip install --upgrade --quiet tiktoken langchain langgraph beautifulsoup4 langchain langchain-google-genai langchain-huggingface

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [None]:
from google.colab import files
# Upload the dataset through google.colab's file-upload helper; a later cell reads 'CNNnews.csv'.
# NOTE(review): the recorded output of this cell shows the upload was saved as
# 'CNNnews (1).csv', so the later read may pick up an older copy — confirm which file is intended.
uploaded = files.upload()

Saving CNNnews.csv to CNNnews (1).csv


In [None]:
!pip install rouge_score
!pip install sacrebleu

In [None]:
from langchain_core.prompts import ChatPromptTemplate

In [None]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer, BartForConditionalGeneration, BartTokenizer
import evaluate

# Checkpoint identifiers used for both the model weights and the matching tokenizer
T5_CHECKPOINT = 't5-small'
BART_CHECKPOINT = 'facebook/bart-large-cnn'

# Read the articles and their reference highlights
# NOTE(review): the recorded upload output says the file was saved as 'CNNnews (1).csv';
# this path may therefore point at an older copy — confirm.
df = pd.read_csv('CNNnews.csv')

# T5 model + tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)
t5_tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)

# BART model + tokenizer
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)

# Metric objects shared by every scoring call
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

# Define functions for T5 and BART summarizations
def generate_summary_t5(text, max_input_length=512, max_output_length=150):
    """Summarize `text` with the module-level T5 model.

    The text is prefixed with "summarize: ", encoded with truncation to
    `max_input_length` tokens, and decoded from the first generated sequence.

    Args:
        text: Article text to summarize.
        max_input_length: Token limit applied when encoding the prompt.
        max_output_length: `max_length` passed to generation.

    Returns:
        The decoded summary with special tokens removed.
    """
    prompt = "summarize: " + text
    input_ids = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Beam search settings (4 beams, at least 30 tokens, length penalty 2.0)
    generation_options = dict(
        max_length=max_output_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(input_ids, **generation_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

def generate_summary_bart(text, max_input_length=1024, max_output_length=150):
    """Summarize `text` with the module-level BART model.

    The raw text (no task prefix) is encoded with truncation to
    `max_input_length` tokens, and the first generated sequence is decoded.

    Args:
        text: Article text to summarize.
        max_input_length: Token limit applied when encoding the article.
        max_output_length: `max_length` passed to generation.

    Returns:
        The decoded summary with special tokens removed.
    """
    input_ids = bart_tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Beam search settings (4 beams, at least 30 tokens, length penalty 2.0)
    generation_options = dict(
        max_length=max_output_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = bart_model.generate(input_ids, **generation_options)
    return bart_tokenizer.decode(generated[0], skip_special_tokens=True)

# Define functions for Gemini model
def load_llm(model="gemini-1.5-pro"):
    """Build a ChatGoogleGenerativeAI client for one of the supported Gemini models.

    Args:
        model: Either "gemini-1.5-pro" or "gemini-1.5-flash".

    Returns:
        A client configured with temperature 0, no token limit, no timeout
        and up to 2 retries.

    Raises:
        ValueError: If `model` is not one of the supported names.
    """
    supported_models = ("gemini-1.5-pro", "gemini-1.5-flash")
    for model_name in supported_models:
        if model == model_name:
            # Both models share the same client settings; only the name differs
            return ChatGoogleGenerativeAI(
                model=model_name,
                temperature=0,
                max_tokens=None,
                timeout=None,
                max_retries=2,
            )
    raise ValueError("Invalid model name")

def get_prompt_template():
    """Return the chat prompt used for summarization.

    The template has two placeholders: `{num_words}` in the system message
    and `{context}` as the whole human message.
    """
    system_message = ("system", "Write a concise summary of the following in {num_words} words:\n\n")
    human_message = ("human", "{context}")
    return ChatPromptTemplate.from_messages([system_message, human_message])

def summarize_text(text, num_words=50, model="gemini-1.5-pro"):
    """Summarize `text` with a Gemini model through a prompt-to-LLM chain.

    Args:
        text: Article text, substituted for `{context}` in the prompt.
        num_words: Word count substituted for `{num_words}` in the prompt.
        model: Model name forwarded to `load_llm`.

    Returns:
        The `content` attribute of the chain's response.
    """
    llm = load_llm(model)
    pipeline = get_prompt_template() | llm
    payload = {"context": text, "num_words": num_words}
    return pipeline.invoke(payload).content

# Function to calculate ROUGE and BLEU scores
def calculate_scores(summary, reference):
    """Score one generated summary against its reference with ROUGE and BLEU.

    Args:
        summary: Generated summary text.
        reference: Reference summary text.

    Returns:
        A 4-tuple `(rouge1, rouge2, rougeL, bleu)`. When either text is None,
        empty or whitespace-only, all four scores are 0.0.
    """
    # A text with no tokens cannot overlap with anything. Short-circuit instead of calling
    # the metrics: BLEU's brevity penalty divides by the length ratio, so a blank input is
    # expected to raise ZeroDivisionError there and abort the whole evaluation run.
    if any(text is None or not str(text).strip() for text in (summary, reference)):
        return 0.0, 0.0, 0.0, 0.0

    rouge_result = rouge_metric.compute(predictions=[summary], references=[reference])
    # BLEU takes a list of references per prediction, hence the extra nesting
    bleu_result = bleu_metric.compute(predictions=[summary], references=[[reference]])
    return rouge_result['rouge1'], rouge_result['rouge2'], rouge_result['rougeL'], bleu_result['bleu']

# Dictionary to store results for each model
# One independent list per field, per model, filled in as articles are processed
_result_fields = ("summaries", "rouge1_scores", "rouge2_scores", "rougeL_scores", "bleu_scores")
results = {
    model_name: {field: [] for field in _result_fields}
    for model_name in ("T5", "BART", "Gemini")
}

# Loop through each article and generate summaries and scores
# Summarizers keyed by their entry in `results`. Dict order fixes the per-article call order
# (T5, then BART, then Gemini), replacing three copy-pasted sections with one loop.
summarizers = {
    "T5": generate_summary_t5,
    "BART": generate_summary_bart,
    "Gemini": lambda text: summarize_text(text, num_words=50),
}
# Result lists, in the same order as the tuple returned by calculate_scores
score_fields = ("rouge1_scores", "rouge2_scores", "rougeL_scores", "bleu_scores")

for index, row in df.iterrows():
    article_text = str(row['article'])  # Convert to string to avoid TypeError
    reference_summary = str(row['highlights'])  # Convert to string as well

    # A missing cell becomes the string 'nan' after str(); skip such rows entirely
    if article_text.lower() == 'nan' or reference_summary.lower() == 'nan':
        continue

    for model_name, summarize in summarizers.items():
        model_summary = summarize(article_text)
        model_scores = calculate_scores(model_summary, reference_summary)

        model_results = results[model_name]
        model_results["summaries"].append(model_summary)
        for field, score in zip(score_fields, model_scores):
            model_results[field].append(score)

# Create DataFrames for each model with summaries and scores
def _was_skipped(value):
    """Return True for a cell the summarization loop treats as missing (str(value) is 'nan')."""
    return str(value).lower() == 'nan'


# The loop skips rows whose article or highlights are missing, so the stored summaries belong
# to the remaining rows, not to the first N rows of df. Taking df.head(N) would pair summaries
# with the wrong articles as soon as one row is skipped; select the processed rows instead.
_skipped_mask = (
    df['article'].map(_was_skipped).astype(bool)
    | df['highlights'].map(_was_skipped).astype(bool)
)
_scored_rows = df[~_skipped_mask]


def _build_model_frame(model_name):
    """Return the scored rows plus `<model>_summary` and `<model>_<metric>_score` columns."""
    model_results = results[model_name]
    # Guard against pairing results with the wrong rows if the two ever get out of sync
    assert len(model_results["summaries"]) == len(_scored_rows), (
        f"{model_name}: number of summaries does not match number of processed rows"
    )
    frame = _scored_rows.copy()
    frame[f'{model_name}_summary'] = model_results["summaries"]
    for metric in ("rouge1", "rouge2", "rougeL", "bleu"):
        frame[f'{model_name}_{metric}_score'] = model_results[f"{metric}_scores"]
    return frame


t5_df = _build_model_frame("T5")
bart_df = _build_model_frame("BART")
gemini_df = _build_model_frame("Gemini")

# Save the results to separate sheets in an Excel file
output_path = '/content/cnn-news-with-summaries-and-scores.xlsx'

# One worksheet per model, written in this order
sheets = (("T5", t5_df), ("BART", bart_df), ("Gemini", gemini_df))
with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Results saved to {output_path}")




Results saved to /content/cnn-news-with-summaries-and-scores.xlsx
