In [None]:
!pip install torch transformers datasets sumy scikit-learn nltk rouge-score openpyxl
!pip install torch transformers datasets
!pip install torch transformers datasets nltk sumy scikit-learn sentence-transformers rouge-score openpyxl
!pip install sumy
!pip install sacrebleu
!pip install rouge_score




In [None]:
import os
import pandas as pd
from transformers import pipeline
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import openai




In [None]:

def load_dataset(dataset_path="./cnn_dailymail.csv"):
    """Read the dataset CSV into a pandas DataFrame.

    Args:
        dataset_path: Location of the CSV file on disk.

    Returns:
        The DataFrame produced by ``pd.read_csv(dataset_path)``.

    Raises:
        FileNotFoundError: If nothing exists at ``dataset_path``.
    """
    if os.path.exists(dataset_path):
        return pd.read_csv(dataset_path)
    raise FileNotFoundError(f"Dataset not found at {dataset_path}. Please ensure the file exists.")



In [None]:

def _summary_to_text(output):
    """Unwrap a Hugging Face style ``[{"summary_text": ...}]`` result to its string.

    Any other value (in particular a plain string) is returned unchanged.
    """
    if isinstance(output, (list, tuple)) and output:
        first = output[0]
        if isinstance(first, dict) and "summary_text" in first:
            return first["summary_text"]
    return output


def generate_summaries(text, summarizers):
    """Run every summarizer on ``text`` and collect the results by name.

    Args:
        text: The article to summarize.
        summarizers: Mapping of method name -> callable taking the article text.
            Callables may return a plain string, or the list-of-dicts structure
            that ``transformers`` summarization pipelines produce.

    Returns:
        Dict mapping each method name to its summary. Pipeline-style outputs are
        reduced to the ``summary_text`` string so every value can be scored and
        written out the same way.
    """
    return {
        name: _summary_to_text(summarizer(text))
        for name, summarizer in summarizers.items()
    }

def summarize_with_llm(text, model="gemini-1.5-pro", temperature=0.7, max_tokens=200):
    """Summarize ``text`` with a chat-completion model via the ``openai`` package.

    Uses the legacy ``openai.ChatCompletion`` interface (openai < 1.0).

    Args:
        text: The article to summarize.
        model: Model identifier sent with the request.
        temperature: Sampling temperature sent with the request.
        max_tokens: Upper bound on the length of the generated summary.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace ("" if the response carries no content).
    """
    # NOTE(review): "gemini-1.5-pro" is not an OpenAI model id; this default only
    # works if the client is pointed at an OpenAI-compatible endpoint that serves
    # it - confirm, or pass an OpenAI model name via `model`.
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        # Only override when the variable is set, so a key configured elsewhere
        # (e.g. `openai.api_key = ...` in another cell) is not replaced by None.
        openai.api_key = api_key

    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant specialized in summarization."},
            {"role": "user", "content": f"Summarize this article: {text}"}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    content = response['choices'][0]['message']['content']
    return (content or "").strip()


def calculate_scores(reference, candidate):
    """Score a candidate summary against its reference with ROUGE and BLEU.

    Args:
        reference: The reference (gold) summary text.
        candidate: The generated summary text to evaluate.

    Returns:
        A ``(rouge_scores, bleu)`` tuple: the result of ``RougeScorer.score``
        for rouge1/rouge2/rougeL (stemming enabled), and the result of
        ``BLEU().corpus_score`` for the single candidate/reference pair.
    """
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    rouge_result = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True).score(reference, candidate)
    # corpus_score takes a list of hypotheses and a list of reference lists.
    bleu_result = BLEU().corpus_score([candidate], [[reference]])
    return rouge_result, bleu_result


def process_and_save(dataset_path, output_excel="summaries_scores.xlsx", max_articles=11):
    """Summarize articles with every method, score them, and save to Excel.

    The dataset must provide an ``article`` column (text to summarize) and a
    ``highlights`` column (reference summary). For each processed article, one
    output row is written per summarization method with its ROUGE-1/2/L
    F-measures and BLEU score.

    Args:
        dataset_path: CSV file handed to ``load_dataset``.
        output_excel: Path of the Excel workbook to write.
        max_articles: Number of leading rows to process; ``None`` processes the
            whole dataset. The default of 11 covers rows 0 through 10.

    Raises:
        FileNotFoundError: Propagated from ``load_dataset`` when the CSV is missing.
    """
    df = load_dataset(dataset_path)

    def _as_text_summarizer(hf_pipeline):
        # A summarization pipeline returns [{"summary_text": ...}], not a string,
        # and fails on inputs longer than the model's limit unless truncation is
        # requested. Wrap it so it behaves like the plain text->text functions.
        def _summarize(text):
            return hf_pipeline(text, truncation=True)[0]["summary_text"]
        return _summarize

    # Insertion order determines the order of rows written for each article.
    summarizers = {
        "T5": _as_text_summarizer(pipeline("summarization", model="t5-small")),
        "BART": _as_text_summarizer(pipeline("summarization", model="facebook/bart-large-cnn")),
        "Luhn": summarize_with_luhn,
        "KMeans": summarize_with_kmeans,
        "LexRank": summarize_with_lexrank,
        "LLM-GPT": summarize_with_llm,
    }

    # Limit by position rather than by comparing index labels inside the loop.
    subset = df if max_articles is None else df.head(max_articles)

    results = []
    for _, row in subset.iterrows():
        text = row["article"]
        reference = row["highlights"]
        summaries = generate_summaries(text, summarizers)

        for method, summary in summaries.items():
            rouge_scores, bleu_score = calculate_scores(reference, summary)
            results.append({
                "Article": text,
                "Reference": reference,
                "Method": method,
                "Summary": summary,
                "Rouge-1": rouge_scores['rouge1'].fmeasure,
                "Rouge-2": rouge_scores['rouge2'].fmeasure,
                "Rouge-L": rouge_scores['rougeL'].fmeasure,
                "BLEU": bleu_score.score
            })

    results_df = pd.DataFrame(results)
    results_df.to_excel(output_excel, index=False)



In [None]:

def summarize_with_luhn(text, sentence_count=3):
    """Build an extractive summary of ``text`` with sumy's ``LuhnSummarizer``.

    Args:
        text: The article to summarize.
        sentence_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    selected = LuhnSummarizer()(document, sentence_count)
    return " ".join(str(sentence) for sentence in selected)

def summarize_with_kmeans(text, sentence_count=3):
    """Pick the sentences of ``text`` closest to the TF-IDF centroid.

    Sentences are obtained by splitting on ". ", vectorized with TF-IDF
    (English stop words removed), and scored by their dot product with the
    single KMeans cluster centre.

    Args:
        text: The article to summarize.
        sentence_count: Maximum number of sentences to return.

    Returns:
        The highest-scoring sentences, best first, joined with ". ".
        Returns "" when ``text`` contains no non-blank sentence.
    """
    # Blank fragments carry no terms; drop them so they cannot end up in the
    # summary and so empty input does not reach the vectorizer.
    sentences = [fragment for fragment in text.split(". ") if fragment.strip()]
    if not sentences:
        return ""

    tfidf = TfidfVectorizer(stop_words="english").fit_transform(sentences)
    # Fixed seed keeps repeated runs reproducible.
    kmeans = KMeans(n_clusters=1, random_state=0)
    kmeans.fit(tfidf)
    centroid = kmeans.cluster_centers_[0]

    scores = tfidf.dot(centroid)
    # sorted() is stable, so equally scored sentences keep document order.
    ranked = sorted(range(len(sentences)), key=lambda idx: scores[idx], reverse=True)
    return ". ".join(sentences[idx] for idx in ranked[:sentence_count])

def summarize_with_lexrank(text, sentence_count=3):
    """Build an extractive summary of ``text`` with sumy's ``LexRankSummarizer``.

    Args:
        text: The article to summarize.
        sentence_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    selected = LexRankSummarizer()(document, sentence_count)
    return " ".join(str(sentence) for sentence in selected)



In [None]:
# Entry point: run the full summarization benchmark when this cell is executed
# as the main module.
if __name__ == "__main__":
    # CSV handed to process_and_save, which reads its "article" and "highlights" columns.
    dataset_path = "./cnn_dailymail.csv"
    # Scores every summarization method and writes the results to summary.xlsx.
    process_and_save(dataset_path, output_excel="summary.xlsx")
