In [None]:
pip install sumy

In [None]:
pip install rouge-score

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from nltk.corpus import stopwords
import nltk

In [None]:
# Resources needed by word_tokenize / sent_tokenize and stopwords.words.
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource;
# fetching both keeps the notebook runnable on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Colab-specific step: upload the dataset into the runtime. The recorded output
# of this cell shows the file being saved as CNNnews.csv, which is the name
# passed to pd.read_csv later in the notebook.
from google.colab import files
# The return value is kept in `uploaded`; nothing else in the visible notebook
# reads it.
uploaded = files.upload()


Saving CNNnews.csv to CNNnews.csv


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [None]:
# Define summarization and scoring functions
def frequency(text, lines):
    """Build an extractive summary from the sentences with the highest word-frequency score.

    Every token that is not an English stop word is counted case-insensitively
    across the whole text. A sentence's score is the sum of the counts of its
    tokens, and the `lines` best-scoring sentences are joined with spaces.

    Args:
        text: Document to summarize.
        lines: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by a single space, ordered from highest
        to lowest score (not document order). Sentences with equal scores keep
        their document order.
    """
    from collections import Counter

    stop_words = set(stopwords.words("english"))

    # Lower-case BEFORE counting so that "Apple" and "apple" share one total.
    # Counting the original-case token while keying on the lower-cased one
    # would let the last-seen casing overwrite the others, and calling
    # list.count() per token is quadratic in the document length.
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    freq_table = Counter(token for token in lowered_tokens if token not in stop_words)
    # NOTE(review): tokens are not filtered for punctuation, so any punctuation
    # tokens produced by word_tokenize also contribute to the scores — confirm
    # this is intended.

    # Keyed by sentence text, so a sentence that occurs twice is scored once.
    sentence_scores = {
        sentence: sum(freq_table.get(token.lower(), 0) for token in word_tokenize(sentence))
        for sentence in sent_tokenize(text)
    }

    # sorted() is stable, so ties stay in first-appearance order.
    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    return ' '.join(ranked[:lines])

def lsa(text, lines):
    """Summarize `text` with sumy's LsaSummarizer.

    The text is parsed as plain text using sumy's "english" tokenizer. The
    summarizer is created with no arguments, so no stemmer or stop-word list
    is configured here.

    Args:
        text: Document to summarize.
        lines: Number of sentences requested from the summarizer.

    Returns:
        The sentences returned by the summarizer, converted to strings and
        joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    chosen_sentences = LsaSummarizer()(document, lines)
    return ' '.join(map(str, chosen_sentences))

def luhn(text, lines):
    """Summarize `text` with sumy's LuhnSummarizer.

    The text is parsed as plain text using sumy's "english" tokenizer. The
    summarizer is created with no arguments, so no stemmer or stop-word list
    is configured here.

    Args:
        text: Document to summarize.
        lines: Number of sentences requested from the summarizer.

    Returns:
        The sentences returned by the summarizer, converted to strings and
        joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    chosen_sentences = LuhnSummarizer()(document, lines)
    return ' '.join(map(str, chosen_sentences))

def lexrank(text, lines):
    """Summarize `text` with sumy's LexRankSummarizer.

    The text is parsed as plain text using sumy's "english" tokenizer. The
    summarizer is created with no arguments, so no stemmer or stop-word list
    is configured here.

    Args:
        text: Document to summarize.
        lines: Number of sentences requested from the summarizer.

    Returns:
        The sentences returned by the summarizer, converted to strings and
        joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    chosen_sentences = LexRankSummarizer()(document, lines)
    return ' '.join(map(str, chosen_sentences))

def calculate_bleu(reference_text, summary):
    """Score `summary` against `reference_text` with smoothed unigram BLEU.

    Both texts are lower-cased and split with word_tokenize before scoring.
    The weights (1, 0, 0, 0) put all weight on 1-gram precision, and
    SmoothingFunction().method1 is applied as the smoothing function.

    Args:
        reference_text: Reference (gold) summary text.
        summary: Candidate summary text to evaluate.

    Returns:
        The value returned by nltk's sentence_bleu for this pair.
    """
    # sentence_bleu expects a list of tokenized references; there is only one.
    reference = [word_tokenize(reference_text.lower())]
    candidate = word_tokenize(summary.lower())

    return sentence_bleu(
        reference,
        candidate,
        weights=(1, 0, 0, 0),
        smoothing_function=SmoothingFunction().method1,
    )


def calculate_rouge(reference_text, summary):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for a summary.

    A RougeScorer with stemming enabled scores `summary` against
    `reference_text` (reference passed first, candidate second).

    Args:
        reference_text: Reference (gold) summary text.
        summary: Candidate summary text to evaluate.

    Returns:
        A dict with keys 'rouge1', 'rouge2' and 'rougeL', each mapped to the
        `fmeasure` of the corresponding score.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    per_metric = rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(reference_text, summary)
    return {metric: per_metric[metric].fmeasure for metric in metric_names}

# Summarization methods to evaluate. The key is reused as the column prefix
# and as the Excel sheet name for that method's results.
methods = {
    "frequency": frequency,
    "lsa": lsa,
    "luhn": luhn,
    "lexrank": lexrank
}

# Number of sentences requested from every summarizer.
SUMMARY_SENTENCES = 5

# Load the dataset
cnn_data = pd.read_csv("CNNnews.csv")


def _as_text(value):
    """Return `value` as a string, mapping missing values (NaN/None/NA) to ''.

    A plain str() on a missing cell would yield the literal text "nan", which
    would then be summarized and scored as if it were real content.
    """
    return "" if pd.isna(value) else str(value)


# One result DataFrame per method; each becomes a separate sheet below.
summary_results = {}

for method_name, summarization_function in methods.items():
    summaries, bleu_scores, rouge_scores = [], [], []

    # Walk the two text columns directly; this avoids building a Series per
    # row the way iterrows() does.
    for article, highlights in zip(cnn_data['article'], cnn_data['highlights']):
        article_text = _as_text(article)
        reference_text = _as_text(highlights)

        # Generate summary
        summary = summarization_function(article_text, lines=SUMMARY_SENTENCES)
        summaries.append(summary)

        # Score the summary against the reference highlights
        bleu_scores.append(calculate_bleu(reference_text, summary))
        rouge_scores.append(calculate_rouge(reference_text, summary))

    # Work on a copy so the loaded DataFrame is never modified.
    cnn_copy = cnn_data.copy()
    cnn_copy[f'{method_name}_summary'] = summaries
    cnn_copy[f'{method_name}_bleu'] = bleu_scores
    cnn_copy[f'{method_name}_rouge1'] = [score['rouge1'] for score in rouge_scores]
    cnn_copy[f'{method_name}_rouge2'] = [score['rouge2'] for score in rouge_scores]
    cnn_copy[f'{method_name}_rougeL'] = [score['rougeL'] for score in rouge_scores]

    # Store each method's result in the dictionary
    summary_results[method_name] = cnn_copy

# Save the results in separate sheets in a single Excel file
output_path = '/content/CNNnews_with_summaries_and_scores.xlsx'
with pd.ExcelWriter(output_path) as writer:
    for method, df in summary_results.items():
        df.to_excel(writer, sheet_name=method, index=False)

print(f"Results saved to {output_path}")

Results saved to /content/CNNnews_with_summaries_and_scores.xlsx
