                                                            * Downloading libraries *                                                                               

In [None]:
!pip3 install nltk rouge-score
!pip3 install torch==2.1.0 torchtext==0.16.0
!pip3 install pandas
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install bert-extractive-summarizer
!pip3 install numpy==1.22.4
!pip3 install bert-extractive-summarizer transformers
!pip3 install bert_score
!pip3 install numpy

In [None]:

#!pip3 install huggingface_hub

In [None]:
import numpy
print(numpy.__version__)

In [None]:
!pip3 cache purge

                                                                Importing libraries                                                                                 

In [None]:
import torch
import torchtext
import sentencepiece as spm
import torch.nn as nn
import torch.optim as optim
import nltk
import random
import numpy as np
import pandas as pd
from summarizer import Summarizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from transformers import pipeline, BertTokenizer, BertModel
from bert_score import score as bert_score


In [None]:
nltk.download('punkt')
nltk.download('punkt_tab')

There are 2 types of summarization : 
1. Abstractive text summarization: The summary usually uses different words and phrases to concisely convey the same meaning as the original text.

2. Extractive summarization: The summary contains the most important sentences from the original input text sentences without any paraphrasing or changes. The sentences deemed unnecessary are discarded.

Models tried for Use Case 3 : 

1. BART (Bidirectional and Auto-Regressive Transformers) Model 
2. T5 (Text-to-Text Transfer Transformer) Model 
3. BERT (Bidirectional Encoder Representations from Transformers)
5. PEGASUS (Pre-training with Extracted Gap-sentences for Abstractive Summarization)

Examples of models for Extractive Text Summarization include : 

1. BERT : bert-base-uncased
2. distilbert-base-uncased
3. Sentence-BERT (extractive)

                                                            Extractive BERT based pre-trained model                                                                 

                                            METRICS EVALUATION FOR TEXT SUMMARIZATION                                                                                           

In [None]:
# Loading the dataset from csv
def load_data(file_name):
    df=pd.read_csv(file_name)
    return df;

def print_table(df):
    # Determine the max length for each column
    col_widths = {col: max(df[col].apply(lambda x: len(str(x)))) for col in df.columns}
    
    # Print the table header with padded columns
    header = " | ".join([col.ljust(col_widths[col]) for col in df.columns])
    print(header)
    print("-" * len(header))  # Add a separator line
    
    # Print each row with padded columns
    for index, row in df.iterrows():
        row_str = " | ".join([str(value).ljust(col_widths[col]) for col, value in row.items()])
        print(row_str)
    
# Load the dataset from CSV
df = load_data('test.csv')


# Summarization and Evaluation function
def summarize_and_evaluate(text, expected_summary,predicted_summary):

    
    # Ensure neither summary is None or empty
    if not predicted_summary or not expected_summary:
        print("One of the summaries is empty!")
        return 0  # or some default value

    # Evaluate model performance
    P, R, F1 = bert_score([predicted_summary], [expected_summary], lang="en", rescale_with_baseline=True)
    
    bert_score_value = F1.mean().item()
    #factual_consistency_score = calculate_factual_consistency(predicted_summary, expected_summary)
    return bert_score_value


def main():
        
        results=[]
      
        # Iterate over the rows of the dataframe
        for index, row in df.iterrows():
                text = row['text']
                expected_summary = row['summary']

                # Step 1: Initialize the Summarizer with a BERT-based model, which is 'bert-base-uncased'        
                distilbert_model = Summarizer('distilbert-base-uncased')
                
                # Step 2: Perform Extractive Summarization
                predicted_summary = distilbert_model(text, ratio=0.2)
                #bert_score_val, factcc_val = summarize_and_evaluate(text, expected_summary,predicted_summary)
                #bert_score_val,factcc_val = summarize_and_evaluate(text, expected_summary,predicted_summary)
                bert_score_val = summarize_and_evaluate(text, expected_summary,predicted_summary)


                '''
                results.append({'S.No':index+1,'Extractive Summary': predicted_summary,'Original Text': text,
                                'BERTScore': round(bert_score_val, 2), 'FactCC': round(factcc_val, 2), 'SummaC': round(summac_val, 2)
                                })
                '''   
                results.append({'S.No':index+1,'Extractive Summary': predicted_summary,
                                'BERTScore': round(bert_score_val, 2),
                                'Original Text': text
                                })
                

        # Convert the results list into a DataFrame
        results_df = pd.DataFrame(results)

        # Print the results in a padded tabular format
        print_table(results_df)
    
if __name__ == "__main__":
    main()


                                                            MODEL CREATED FOR EXTRACTIVE SUMMARIZATION                                                            

In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from tqdm import tqdm
import pandas as pd
from bert_score import score


In [38]:
# Load the dataset from CSV
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    return df['text'].tolist(), df['summary'].tolist()

# Define the Dataset class
class SummarizationDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=512):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]

        # Tokenize text and summary
        text_encoding = self.tokenizer(
            text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )
        summary_encoding = self.tokenizer(
            summary, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )

        return {
            "input_ids": text_encoding["input_ids"].squeeze(0),
            "attention_mask": text_encoding["attention_mask"].squeeze(0),
            "labels": summary_encoding["input_ids"].squeeze(0),
        }

# Custom collate function for dynamic padding
def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence([item['input_ids'] for item in batch], batch_first=True, padding_value=1)
    attention_mask = torch.nn.utils.rnn.pad_sequence([item['attention_mask'] for item in batch], batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence([item['labels'] for item in batch], batch_first=True, padding_value=-100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Train function
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

# Evaluation function
def evaluate_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

    return total_loss / len(dataloader)

# Main training loop
def main():
    # Load dataset
    train_texts, train_summaries = load_dataset("test.csv")
    val_texts, val_summaries = load_dataset("test.csv")

    # Initialize tokenizer
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

    # Create datasets and dataloaders
    train_dataset = SummarizationDataset(train_texts, train_summaries, tokenizer)
    val_dataset = SummarizationDataset(val_texts, val_summaries, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

    # Initialize model
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Training
    epochs = 3
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        # Train
        train_loss = train_epoch(model, train_dataloader, optimizer, device)
        print(f"Train Loss: {train_loss:.4f}")

        # Validate
        val_loss = evaluate_epoch(model, val_dataloader, device)
        print(f"Validation Loss: {val_loss:.4f}")

    # Save the model
    model.save_pretrained("summarization_model")
    tokenizer.save_pretrained("summarization_model")

if __name__ == "__main__":
    main()




Epoch 1/3


Training: 100%|██████████| 1/1 [00:12<00:00, 12.25s/it]


Train Loss: 15.5183


Evaluating: 100%|██████████| 1/1 [00:03<00:00,  3.09s/it]


Validation Loss: 12.8229
Epoch 2/3


Training: 100%|██████████| 1/1 [00:12<00:00, 12.31s/it]


Train Loss: 13.0396


Evaluating: 100%|██████████| 1/1 [00:03<00:00,  3.13s/it]


Validation Loss: 11.9617
Epoch 3/3


Training: 100%|██████████| 1/1 [00:12<00:00, 12.15s/it]


Train Loss: 12.3155


Evaluating: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it]


Validation Loss: 11.6287
