### **Requirements Installation**

In [None]:
pip install numpy pandas scikit-learn nltk transformers torch datasets rouge-score sacrebleu

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (fro

### **Loading the Dataset - Using Kaggle**

In [None]:
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns {filename: file bytes}, so a bare call echoes the
# contents of kaggle.json (username and API key) into the saved cell output.
uploaded = files.upload()  # Upload the kaggle.json file
print("Uploaded:", ", ".join(uploaded))  # file names only, never the contents


Saving kaggle (1).json to kaggle (1).json


{'kaggle (1).json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
!pip install kaggle  # Install Kaggle API client

# Make the .kaggle directory and move the kaggle.json file to the directory
import os
os.makedirs("/root/.kaggle", exist_ok=True)
!cp kaggle.json /root/.kaggle/

# Set permissions to make the file readable by the API client
!chmod 600 /root/.kaggle/kaggle.json


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [None]:
# Download the CNN/DailyMail summarization dataset archive with the Kaggle CLI
# (requires the API credentials to be configured beforehand).
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail

Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 98% 493M/503M [00:05<00:00, 98.9MB/s]
100% 503M/503M [00:05<00:00, 100MB/s] 


In [None]:
# Unzip the downloaded dataset
!unzip /content/newspaper-text-summarization-cnn-dailymail.zip -d /content/cnn_dailymail # Extract to a subdirectory

# Load the CSV files (train, test, and validation data) using pandas
import pandas as pd

# Adjust the file paths to point to the subdirectory
train_data = pd.read_csv('/content/cnn_dailymail/train.csv')
test_data = pd.read_csv('/content/cnn_dailymail/test.csv')
validation_data = pd.read_csv('/content/cnn_dailymail/validation.csv')

Archive:  /content/newspaper-text-summarization-cnn-dailymail.zip
  inflating: /content/cnn_dailymail/cnn_dailymail/test.csv  
  inflating: /content/cnn_dailymail/cnn_dailymail/train.csv  
  inflating: /content/cnn_dailymail/cnn_dailymail/validation.csv  


### **Exploratory Data Analysis**

In [None]:
# Preview the first rows of each split, one labelled block per split
for heading, frame in (
    ("Train Data:", train_data),
    ("Test Data:", test_data),
    ("Validation Data:", validation_data),
):
    print(heading)
    print(frame.head())

Train Data:
                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   
2  00027e965c8264c35cc1bc55556db388da82b07f   
3  0002c17436637c4fe1837c935c04de47adb18e9a   
4  0003ad6ef0c37534f80b55b4235108024b407f0b   

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   

                                          highlights  
0  Bishop John Folda, of North Dakota, is taking ...  
1  Criminal complaint: Cop used his role to help ...  
2  Craig Eccleston-Todd, 27, had drunk at least t...  
3  Nina dos Santos says Europe must be ready to a...  
4  Fleetwood top of League One after 2-0 win at S...  
Test Data:
                           

### **Data Preprocessing**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used below: the punkt / punkt_tab tokenizer data
# and the stop-word lists.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Preprocess function: cleaning text
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Lower-case `text`, tokenize it, and drop stop words and punctuation.

    Args:
        text: The raw document. Anything that is not a `str` (for example the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The surviving tokens joined by single spaces; '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    # Built once and cached: rebuilding the set for every row is pure overhead
    # when this function is applied across a whole DataFrame column.
    stop_words = _english_stop_words()
    # `word not in string.punctuation` is a substring test against the
    # punctuation constant, so it removes single punctuation characters (and
    # any token that appears verbatim inside that constant).
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word not in stop_words and word not in string.punctuation
    ]
    return ' '.join(tokens)

# Apply preprocessing to the 'article' column of the train and test frames
# (validation_data is not processed here).
# NOTE(review): both assignments overwrite the raw text in place, so re-running
# this cell preprocesses already-preprocessed text, and the punctuation filter
# in preprocess_text drops tokens such as '.', '?' and '!'. Any later step that
# needs sentence boundaries or the original wording no longer has them —
# consider writing to a separate column instead; confirm against the consumers.
train_data['article'] = train_data['article'].apply(preprocess_text)
test_data['article'] = test_data['article'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### **TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def tfidf_summarization(text, num_sentences=3):
    """Extractive summary: keep the sentences most similar to the rest of the text.

    Every sentence is turned into a TF-IDF vector; a sentence's score is the
    sum of its cosine similarities to all sentences (itself included). The
    `num_sentences` highest-scoring sentences are kept.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces, in the order in which
        they appear in `text`. Returns '' when `text` is empty / not a string
        or when `num_sentences` is not positive.
    """
    # Guard first: a slice of [-0:] would select *every* sentence, and an
    # empty document would make the vectorizer raise.
    if num_sentences <= 0 or not isinstance(text, str) or not text.strip():
        return ''
    # Split into sentences
    sentences = nltk.sent_tokenize(text)
    # Nothing to rank when the whole text already fits in the summary.
    if len(sentences) <= num_sentences:
        return ' '.join(sentences)
    # Calculate TF-IDF
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    # Calculate sentence similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # Rank sentences based on similarity scores (stable sort keeps ties
    # deterministic)
    sentence_scores = similarity_matrix.sum(axis=1)
    top_indices = np.argsort(sentence_scores, kind='stable')[-num_sentences:]
    # Return the summary in document order rather than ascending-score order,
    # so the result reads like an excerpt of the original text.
    return ' '.join(sentences[i] for i in sorted(top_indices))

# Try the TF-IDF summarizer on the first article of the test split
sample_text = test_data['article'].iloc[0]
tfidf_summary = tfidf_summarization(sample_text)
print(f"TF-IDF Summary: {tfidf_summary}")


TF-IDF Summary: ever noticed plane seats appear getting smaller smaller increasing numbers people taking skies experts questioning packed planes putting passengers risk say shrinking space aeroplanes uncomfortable 's putting health safety danger squabbling arm rest shrinking space planes putting health safety danger week u.s consumer advisory group set department transportation said public hearing government happy set standards animals flying planes n't stipulate minimum amount space humans 'in world animals rights space food humans said charlie leocha consumer representative committee 'it time dot faa take stand humane treatment passengers could crowding planes lead serious issues fighting space overhead lockers crashing elbows seat back kicking tests conducted faa use planes 31 inch pitch standard airlines decreased many economy seats united airlines 30 inches room airlines offer little 28 inches cynthia corbertt human factors researcher federal aviation administration conducts tests

### **LSA**

In [None]:
from sklearn.decomposition import TruncatedSVD

def lsa_summarization(text, num_sentences=3):
    """Extractive summary via a one-component SVD of the sentence TF-IDF matrix.

    Each sentence is projected onto the first latent component; the
    `num_sentences` sentences with the largest projection magnitude are kept.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces, in the order in which
        they appear in `text`. Returns '' when `text` is empty / not a string
        or when `num_sentences` is not positive.
    """
    # Guard first: a slice of [-0:] would select *every* sentence, and an
    # empty document would make the vectorizer raise.
    if num_sentences <= 0 or not isinstance(text, str) or not text.strip():
        return ''
    sentences = nltk.sent_tokenize(text)
    # Nothing to rank when the whole text already fits in the summary; this
    # also avoids running the decomposition on a one-row matrix.
    if len(sentences) <= num_sentences:
        return ' '.join(sentences)
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    # Decompose TF-IDF matrix with SVD; random_state pins the randomized
    # solver so repeated runs pick the same sentences.
    svd = TruncatedSVD(n_components=1, n_iter=100, random_state=0)
    svd_matrix = svd.fit_transform(tfidf_matrix)
    # The sign of a singular vector is arbitrary, so rank by magnitude of the
    # projection rather than by its signed value.
    component_weight = np.abs(svd_matrix[:, 0])
    top_indices = np.argsort(component_weight, kind='stable')[-num_sentences:]
    # Emit in document order rather than ascending-score order.
    return ' '.join(sentences[i] for i in sorted(top_indices))

# Try the LSA summarizer on the same sample article
lsa_summary = lsa_summarization(sample_text)
print(f"LSA Summary: {lsa_summary}")


LSA Summary: ever noticed plane seats appear getting smaller smaller increasing numbers people taking skies experts questioning packed planes putting passengers risk say shrinking space aeroplanes uncomfortable 's putting health safety danger squabbling arm rest shrinking space planes putting health safety danger week u.s consumer advisory group set department transportation said public hearing government happy set standards animals flying planes n't stipulate minimum amount space humans 'in world animals rights space food humans said charlie leocha consumer representative committee 'it time dot faa take stand humane treatment passengers could crowding planes lead serious issues fighting space overhead lockers crashing elbows seat back kicking tests conducted faa use planes 31 inch pitch standard airlines decreased many economy seats united airlines 30 inches room airlines offer little 28 inches cynthia corbertt human factors researcher federal aviation administration conducts tests qu

  self.explained_variance_ratio_ = exp_var / full_var


### **BART** (`facebook/bart-large-cnn`, referred to as "BERT" in the code)

In [None]:
from transformers import pipeline

# Load the Hugging Face summarization pipeline. Note that the checkpoint,
# facebook/bart-large-cnn, is a BART model rather than BERT; the "bert" naming
# used for this section's function and variables is kept as-is.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def bert_summarization(text, max_length=130, min_length=30):
    """Summarize `text` with the module-level `summarizer` pipeline.

    Args:
        text: Document to summarize.
        max_length: Upper bound on the generated summary length, forwarded to
            the pipeline.
        min_length: Lower bound on the generated summary length, forwarded to
            the pipeline.

    Returns:
        The 'summary_text' of the pipeline's first result, or '' when `text`
        is empty / not a string (the model is not called in that case).
    """
    if not isinstance(text, str) or not text.strip():
        return ''
    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Clip inputs longer than the model's maximum input length instead of
        # letting an over-long article fail inside the model.
        truncation=True,
    )
    return summary[0]['summary_text']

# Try the pipeline-based summarizer on the same sample article
bert_summary = bert_summarization(sample_text)
print(f"BERT Summary: {bert_summary}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

BERT Summary: Ever noticed plane seats appear getting smaller smaller increasing numbers people taking skies? Experts questioning packed planes putting passengers risk say shrinking space aeroplanes uncomfortable 's putting health safety danger squabbling arm rest.


### **T5**

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pretrained "t5-small" tokenizer and conditional-generation model.
# Both are module-level objects that t5_summarization below reads directly.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

def t5_summarization(text, max_length=150):
    """Summarize `text` with the module-level T5 tokenizer and model.

    The text is prefixed with "summarize: ", encoded (truncated to 512
    tokens), and decoded from a 4-beam search.

    Args:
        text: Document to summarize.
        max_length: Upper bound on the generated sequence length.

    Returns:
        The first generated sequence decoded to a string, special tokens
        removed.
    """
    prompt = "summarize: " + text
    encoded_prompt = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generation_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated_sequences = t5_model.generate(encoded_prompt, **generation_options)
    first_sequence = generated_sequences[0]
    return t5_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Try the T5 summarizer on the same sample article
t5_summary = t5_summarization(sample_text)
print(f"T5 Summary: {t5_summary}")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5 Summary: faa use planes 31 inches row seats standard airlines decreased many economy seats united airlines 30 inches room airlines offer little 28 inches cynthia corbertt human factors researcher faa administration conducts tests quickly passengers leave plane tests conducted using planes 31 inches row seats standard airlines decreased reported detroit news distance two seats one point seat seat behind known pitch airlines stick pitch 31 inches fall united airlines 30 inches space gulf air economy seats 29 32 inches air asia offers 29 inches spirit airlines offers 28 inches brit


### **Evaluation**

In [None]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet') # Download the 'wordnet' dataset

from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

# Function to evaluate summaries
def evaluate_summary(reference, summary):
    """Score a generated `summary` against a `reference` summary.

    Args:
        reference: The gold / human-written summary text.
        summary: The generated summary text to evaluate.

    Returns:
        A tuple `(rouge_scores, bleu_score, meteor)`:
        rouge_scores is the dict returned by `RougeScorer.score` for
        'rouge1', 'rouge2' and 'rougeL'; bleu_score is the `.score` of
        sacreBLEU's corpus-level result; meteor is NLTK's METEOR score.
    """
    # Tokenize once; METEOR takes pre-tokenized input.
    summary_tokens = word_tokenize(summary)
    reference_tokens = word_tokenize(reference)

    # ROUGE evaluation (target first, prediction second)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, summary)
    # BLEU evaluation. sacreBLEU's signature is
    # corpus_bleu(hypotheses, references), where `references` is a list of
    # reference streams, each a list with one entry per hypothesis. Passing
    # ([reference], [summary]) swaps the roles and makes sacreBLEU read the
    # summary string character by character as references.
    bleu_score = corpus_bleu([summary], [[reference]]).score
    # METEOR evaluation: list of tokenized references, then the tokenized hypothesis
    meteor = meteor_score([reference_tokens], summary_tokens)
    return rouge_scores, bleu_score, meteor

# Score each generated summary of the first test article against that
# article's reference highlights.
reference_summary = test_data['highlights'].iloc[0]


def _score_against_reference(candidate_summary):
    """Evaluate one generated summary against `reference_summary`."""
    return evaluate_summary(reference_summary, candidate_summary)


print("TF-IDF Evaluation:", _score_against_reference(tfidf_summary))
print("LSA Evaluation:", _score_against_reference(lsa_summary))
print("BERT Evaluation:", _score_against_reference(bert_summary))
print("T5 Evaluation:", _score_against_reference(t5_summary))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


TF-IDF Evaluation: ({'rouge1': Score(precision=0.10648148148148148, recall=0.6764705882352942, fmeasure=0.184), 'rouge2': Score(precision=0.037209302325581395, recall=0.24242424242424243, fmeasure=0.06451612903225806), 'rougeL': Score(precision=0.09259259259259259, recall=0.5882352941176471, fmeasure=0.16)}, 0.0, 0.27773289009957375)
LSA Evaluation: ({'rouge1': Score(precision=0.10648148148148148, recall=0.6764705882352942, fmeasure=0.184), 'rouge2': Score(precision=0.037209302325581395, recall=0.24242424242424243, fmeasure=0.06451612903225806), 'rougeL': Score(precision=0.09259259259259259, recall=0.5882352941176471, fmeasure=0.16)}, 0.0, 0.27773289009957375)
BERT Evaluation: ({'rouge1': Score(precision=0.36363636363636365, recall=0.35294117647058826, fmeasure=0.35820895522388063), 'rouge2': Score(precision=0.0625, recall=0.06060606060606061, fmeasure=0.06153846153846154), 'rougeL': Score(precision=0.30303030303030304, recall=0.29411764705882354, fmeasure=0.29850746268656714)}, 0.0, 0