In [None]:
# Import required libraries
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
train_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')
validation_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv')
test_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv')

# Sample the training data for computational efficiency.
# A fixed seed makes Restart & Run All pick the same rows every time, and the
# size is capped at the frame length so a smaller CSV does not raise.
train_df = (
    train_df
    .sample(n=min(30000, len(train_df)), random_state=42)
    .reset_index(drop=True)
)

# Define preprocessing functions
def clean_text(text):
    """
    Normalize raw text for sentence tokenization.

    Lowercases `text`, removes every character that is not a lowercase ASCII
    letter, a digit, a period or whitespace, then collapses each whitespace
    run to a single space and trims the ends.

    - `text`: String to clean
    Returns the cleaned string.
    """
    text = text.lower()
    # Strip special characters first; whitespace is kept here so that line
    # breaks still separate words when they are collapsed below.
    text = re.sub(r'[^a-z0-9.\s]', '', text)
    # Collapse whitespace AFTER the removal above: deleting a token that sat
    # between two spaces (e.g. "--") would otherwise leave a double space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def preprocess_and_tokenize(text):
    """Return the list of sentences found in `text` after it has been normalized by `clean_text`."""
    return sent_tokenize(clean_text(text))

# Turn every training article into its list of cleaned sentences
train_df['sentences'] = train_df['article'].map(preprocess_and_tokenize)

# Define a function for extractive summarization
def extractive_summary(article_sentences, top_n=5):
    """
    Perform extractive summarization using TF-IDF to score sentences.

    Every sentence is vectorized as its own document (English stop words
    excluded) and scored by the sum of its TF-IDF weights. The `top_n`
    highest-scoring sentences are returned in their original order.

    - `article_sentences`: List of sentences in the article
    - `top_n`: Number of sentences to extract for the summary

    Returns the selected sentences joined by single spaces. An empty
    `article_sentences` yields ''. If the vectorizer cannot build a
    vocabulary, the first `top_n` sentences are returned instead.
    """
    # Local import keeps this function usable regardless of cell order.
    import numpy as np

    # Nothing to rank: avoid handing an empty corpus to the vectorizer.
    if len(article_sentences) == 0:
        return ''

    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(article_sentences)
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty (for
        # example, only stop words remain). Fall back to the leading
        # sentences so one degenerate article does not abort a whole batch.
        return ' '.join(article_sentences[:top_n])

    # Sentence score = sum of the TF-IDF weights in its row. np.asarray
    # accepts both np.matrix and ndarray, unlike the matrix-only `.A`.
    sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    # Rank by (score, index) descending; ties resolve toward the later sentence.
    ranked_sentences = sorted(
        ((score, idx) for idx, score in enumerate(sentence_scores)),
        reverse=True
    )

    # Keep the top N, then restore document order for readability.
    top_sentence_indices = sorted(idx for _, idx in ranked_sentences[:top_n])
    return ' '.join(article_sentences[idx] for idx in top_sentence_indices)

# Take the first few tokenized articles as a quick visual check
sample_articles = train_df['sentences'].iloc[:5]

# Summarize each sampled article
summaries = [extractive_summary(sentences, top_n=5) for sentences in sample_articles]

# Print every sampled article next to its extractive summary
for idx, (original, summary) in enumerate(zip(sample_articles, summaries)):
    print(f"Article {idx+1}:\n{' '.join(original)}\n")
    print(f"Extractive Summary {idx+1}:\n{summary}\n")
    print("="*80)

# Summarize the whole training sample and write article/summary pairs to CSV
train_df['summary'] = train_df['sentences'].map(
    lambda sentences: extractive_summary(sentences, top_n=5)
)
train_df.loc[:, ['article', 'summary']].to_csv('train_summaries.csv', index=False)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Article 1:
cnn  americans have been repeatedly shocked by school violence this year first in nevada where in october a student shot and killed a teacher and wounded two students before taking his own life then days later by the news that the body of a young teacher was found behind her school in massachusetts. the images of traumatized parents and a campus surrounded by police tape shake us profoundly  our hearts break for the families of those who died. for them this is the beginning of an unwanted journey. in my education research i have focused on the question of what happens in the lives of the people still connected to a school that has endured such a trauma long after the media and law enforcement move on. school shootings affect teachers schoo

In [None]:
pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=02c0973366b62c63af4a1c991435fc44d0df4edab75e3cc0b0ffb5892bb7836d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
from rouge_score import rouge_scorer

In [None]:
# Uninstall the previous installed nltk library

!pip install -U nltk



# This upgraded nltkto version 3.5 in which meteor_score is there.

!pip install nltk==3.5

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.9.1 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.9.1
Collecting nltk==3.5
  Downloading nltk-3.5.zip (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... 

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
from rouge_score import rouge_scorer

from nltk.translate.meteor_score import meteor_score

from bert_score import score as bert_score

In [None]:
from nltk.tokenize import sent_tokenize

# Lowercase and sentence-split both text columns. Each cell ends up as a
# one-element list wrapping the sentence list, which is the shape the
# evaluation cell iterates over.
for source_col, target_col in (("article", "clean_article"), ("highlights", "clean_highlights")):
    test_df[target_col] = test_df[source_col].map(lambda raw: [sent_tokenize(raw.lower())])

In [None]:
import nltk
# Fetch the WordNet corpus into NLTK's default data location
# (presumably needed by the METEOR metric used later; confirm).
nltk.download('wordnet')
nltk.download('punkt')  # Also fetch the Punkt models for tokenization; this call's return value is what the cell displays

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import os

import nltk



# Set the directory for NLTK data
nltk_data_dir = '/kaggle/working/nltk_data'
os.makedirs(nltk_data_dir, exist_ok=True)

# Point NLTK at the new directory. The NLTK_DATA environment variable is only
# consulted when nltk is imported, and nltk is already imported here, so the
# live search path has to be extended as well. The variable is still set for
# any process started later.
os.environ['NLTK_DATA'] = nltk_data_dir
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download the required NLTK packages
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('omw-1.4', download_dir=nltk_data_dir)  # Optional for better word matching
nltk.download('punkt', download_dir=nltk_data_dir)

# Report which directory the packages were downloaded into
print("NLTK data directory set to:", nltk_data_dir)

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
NLTK data directory set to: /kaggle/working/nltk_data


In [None]:
import zipfile



# Where the downloaded WordNet archive lives and where it should be unpacked
wordnet_extract_path = '/kaggle/working/nltk_data/corpora/'
wordnet_zip_path = '/kaggle/working/nltk_data/corpora/wordnet.zip'

# Unpack the archive next to itself, or report that it is missing
if not os.path.exists(wordnet_zip_path):
    print("WordNet zip file not found.")
else:
    with zipfile.ZipFile(wordnet_zip_path, 'r') as archive:
        archive.extractall(wordnet_extract_path)
    print("Extracted WordNet corpus.")

Extracted WordNet corpus.


In [None]:
import os



# Directory that should exist once the WordNet archive has been unpacked
wordnet_path = '/kaggle/working/nltk_data/corpora/wordnet'

# Report whether the directory is present
if not os.path.exists(wordnet_path):
    print("WordNet corpus not found. Please check the directory structure.")
else:
    print("WordNet corpus found in:", wordnet_path)

WordNet corpus found in: /kaggle/working/nltk_data/corpora/wordnet


In [None]:
# Verify the WordNet corpus

# Import the corpus reader in this cell so the check does not rely on a name
# left over in kernel state. Without it a fresh kernel raises NameError,
# which the LookupError handler below would not catch.
from nltk.corpus import wordnet

try:
    # Force the lazily loaded corpus to resolve now; a missing corpus
    # surfaces as LookupError.
    wordnet.ensure_loaded()
    print("WordNet loaded successfully.")
except LookupError as e:
    print("Failed to load WordNet:", e)

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
WordNet loaded successfully.


In [None]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from evaluate import load
import numpy as np
import re

# Make sure the Punkt tokenizer data is available
nltk.download('punkt')

# Load the three evaluation metrics, in the same order as before
rouge_metric, meteor_metric, bertscore_metric = (
    load(metric_name) for metric_name in ("rouge", "meteor", "bertscore")
)


# Summaries come from `extractive_summary`, the TF-IDF sentence ranker defined
# earlier in this notebook. It is a plain function that needs the article's
# sentences, so there is no summarizer object to initialize here.

# Define evaluation function
def evaluate_performance(dataset, top_n=5):
    """
    Score TF-IDF extractive summaries against the reference highlights.

    - `dataset`: DataFrame with `clean_article` and `clean_highlights`
      columns. Each cell is expected to hold a list of sentence lists, the
      shape produced by the test-set preprocessing cell
      (`[sent_tokenize(...)]`).
    - `top_n`: Number of sentences `extractive_summary` keeps per article.

    Returns a dict with "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR" and
    "BERTScore (F1)", each multiplied by 100 and rounded to two decimals.
    """
    references = []
    generated_summaries = []
    for article_groups, highlight_groups in zip(dataset["clean_article"], dataset["clean_highlights"]):
        # Convert the tokenized highlights back to a single reference string
        references.append(' '.join(' '.join(group) for group in highlight_groups))

        # Flatten the nested sentence lists into the flat list the summarizer expects
        article_sentences = [sentence for group in article_groups for sentence in group]
        generated_summaries.append(extractive_summary(article_sentences, top_n=top_n))

    # Compute ROUGE Scores
    rouge_scores = rouge_metric.compute(predictions=generated_summaries, references=references)

    # Compute METEOR Score (named so it cannot be confused with nltk's meteor_score function)
    meteor_scores = meteor_metric.compute(predictions=generated_summaries, references=references)

    # Compute BERTScore; the metric returns one F1 per pair, averaged below
    bertscore_result = bertscore_metric.compute(predictions=generated_summaries, references=references, lang="en")

    return {
        "ROUGE-1": round(rouge_scores["rouge1"] * 100, 2),
        "ROUGE-2": round(rouge_scores["rouge2"] * 100, 2),
        "ROUGE-L": round(rouge_scores["rougeL"] * 100, 2),
        "METEOR": round(meteor_scores["meteor"] * 100, 2),
        "BERTScore (F1)": round(np.mean(bertscore_result["f1"]) * 100, 2)
    }

# Score the summarizer on the test split.
# test_df must already carry the 'clean_article' and 'clean_highlights' columns.
results = evaluate_performance(dataset=test_df)
print("Evaluation Results:", results)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Results: {'ROUGE-1': 35.82, 'ROUGE-2': 16.7, 'ROUGE-L': 23.16, 'METEOR': 37.53, 'BERTScore (F1)': 86.69}
