In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
!pip install nltk rouge-score bert-score

In [3]:
import nltk
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score

In [4]:
import warnings

# Register an "ignore" filter for warnings whose text starts with the
# "Your max_length is set to" notice.
warnings.filterwarnings(
    action="ignore",
    message="Your max_length is set to *",
)

## Reading the Dataset

In [7]:
# Load the "samsum" dataset; the result holds the train, test and validation splits.
dataset = load_dataset("samsum")

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [8]:
# Inspect the available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:
# Convert the test split to a pandas DataFrame; all evaluation below works on this frame.
df= dataset['test'].to_pandas()

## Analysing the dataset

In [None]:
# Evaluate on a random subset of (at most) 150 dialogues to keep inference time manageable.
# A fixed random_state makes the subset, and therefore every reported metric, reproducible.
# Skip this cell to proceed with the complete test data.
df = df.sample(n=min(150, len(df)), replace=False, random_state=42).reset_index(drop=True)

In [10]:
# Preview the first rows of the evaluation frame.
df.head()

Unnamed: 0,id,dialogue,summary
0,13862856,"Hannah: Hey, do you have Betty's number?\nAman...",Hannah needs Betty's number but Amanda doesn't...
1,13729565,Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric:...,Eric and Rob are going to watch a stand-up on ...
2,13680171,"Lenny: Babe, can you help me with something?\r...",Lenny can't decide which trousers to buy. Bob ...
3,13729438,"Will: hey babe, what do you want for dinner to...",Emma will be home soon and she will let Will k...
4,13828600,"Ollie: Hi , are you in Warsaw\r\nJane: yes, ju...",Jane is in Warsaw. Ollie and Jane has a party....


In [11]:
# Check column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 819 entries, 0 to 818
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        819 non-null    object
 1   dialogue  819 non-null    object
 2   summary   819 non-null    object
dtypes: object(3)
memory usage: 19.3+ KB


In [12]:
# Show the first dialogue next to its reference summary.
print('Dialogue:\n', df.loc[0, 'dialogue'])
print('\nSummary:\n', df.loc[0, 'summary'])

Dialogue:
 Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
 Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


## Model Metric Evaluation

In [13]:
def calculate_redundancy(summaries):
    """Return the share of repeated tokens across all summaries.

    Tokens are obtained by whitespace splitting and pooled over every summary.
    The score is ``1 - unique_tokens / total_tokens``: 0 means no token is
    ever repeated, values close to 1 mean heavy repetition.

    Parameters
    ----------
    summaries : iterable of str
        The summaries to analyse.

    Returns
    -------
    float
        The redundancy score, or ``0.0`` when there are no tokens at all
        (empty input or only empty/whitespace summaries).
    """
    # Pool the tokens once so that one-shot iterables (generators) also work.
    tokens = [token for summary in summaries for token in summary.split()]

    # Nothing to repeat -> no redundancy; also avoids a division by zero.
    if not tokens:
        return 0.0

    return 1 - (len(set(tokens)) / len(tokens))

In [14]:
def calc_metrics(actual_summaries, pred_summaries):
    """Score predicted summaries against their reference summaries.

    Parameters
    ----------
    actual_summaries : sequence of str
        Reference summaries, one per example.
    pred_summaries : sequence of str
        Generated summaries, aligned index-by-index with ``actual_summaries``.

    Returns
    -------
    tuple of float
        ``(bleu, bert_f1, rouge_1_f1, rouge_2_f1, rouge_L_f1, redundancy)``.
        BLEU is computed at corpus level, the BERTScore and ROUGE values are
        F1 scores averaged over all examples, and redundancy is computed on
        the predictions only.

    Raises
    ------
    ValueError
        If the two inputs differ in length or are empty.
    """
    # Bound locally so that rebinding the notebook-level name `score`
    # (e.g. by using it as a loop variable) cannot break this function.
    from bert_score import score as compute_bert_score

    actual_summaries = list(actual_summaries)
    pred_summaries = list(pred_summaries)
    if len(actual_summaries) != len(pred_summaries):
        raise ValueError(
            f"Got {len(actual_summaries)} reference summaries but "
            f"{len(pred_summaries)} predicted summaries; they must be aligned."
        )
    if not pred_summaries:
        raise ValueError("At least one summary pair is required to compute metrics.")

    # BLEU: corpus_bleu expects a list of reference lists plus one hypothesis per example.
    references_tokenized = [[reference.split()] for reference in actual_summaries]
    hypotheses_tokenized = [prediction.split() for prediction in pred_summaries]
    bleu_score = corpus_bleu(references_tokenized, hypotheses_tokenized)

    # BERTScore: the API takes (candidates, references) in that order.
    # Only F1 is used; .item() converts the averaged tensor into a Python float.
    _, _, F1 = compute_bert_score(pred_summaries, actual_summaries, lang='en', verbose=False)
    bert_f1 = F1.mean().item()

    # ROUGE-1 / ROUGE-2 / ROUGE-L with stemming; RougeScorer.score takes (target, prediction).
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    per_example = [
        rouge.score(actual, pred)
        for actual, pred in zip(actual_summaries, pred_summaries)
    ]
    n_examples = len(per_example)
    rouge_1_f1 = sum(result['rouge1'].fmeasure for result in per_example) / n_examples
    rouge_2_f1 = sum(result['rouge2'].fmeasure for result in per_example) / n_examples
    rouge_L_f1 = sum(result['rougeL'].fmeasure for result in per_example) / n_examples

    # Redundancy of the generated text itself (no references involved).
    redundancy_score = calculate_redundancy(pred_summaries)

    return bleu_score, bert_f1, rouge_1_f1, rouge_2_f1, rouge_L_f1, redundancy_score

## Testing on different models

#### 1. facebook/bart-large-cnn

In [1]:
from transformers import pipeline

# Build the summarization pipeline for facebook/bart-large-cnn and try it on the first dialogue.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
summarizer(
    df['dialogue'][0],
    max_length=130,
    min_length=30,
    truncation=True,
)


: 

In [None]:
# Summarize every dialogue of the evaluation frame.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
predictions = [
    summarizer(dialogue, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for dialogue in df['dialogue']
]

In [None]:
# Compare the predictions with the reference summaries, which SAMSum stores under 'summary'
# (the frame has no 'highlights' column).
BLEU, BERT, Rouge_1, Rouge_2, Rouge_L, Redundancy = calc_metrics(df['summary'].tolist(), predictions)

In [None]:
# Per-metric histories; every evaluated model appends one value to each list.
BLEU_scores, BERT_scores = [], []
Rouge_1_scores, Rouge_2_scores, Rouge_L_scores = [], [], []
Redundancy_scores = []

# Report the metrics of the current model ...
for metric_label, metric_value in (
    ('BLEU Score: ', BLEU),
    ('BERT Score: ', BERT),
    ('Rouge-1 Score: ', Rouge_1),
    ('Rouge-2 Score: ', Rouge_2),
    ('Rouge-L Score: ', Rouge_L),
    ('Redundancy Score: ', Redundancy),
):
    print(metric_label, metric_value)

# ... and store them for the later model comparison.
for metric_history, metric_value in (
    (BLEU_scores, BLEU),
    (BERT_scores, BERT),
    (Rouge_1_scores, Rouge_1),
    (Rouge_2_scores, Rouge_2),
    (Rouge_L_scores, Rouge_L),
    (Redundancy_scores, Redundancy),
):
    metric_history.append(metric_value)

#### 2. sshleifer/distilbart-cnn-12-6

In [None]:
from transformers import pipeline

# Load sshleifer/distilbart-cnn-12-6 and try it on the first dialogue.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
pipe(df['dialogue'][0], max_length=130, min_length=30, truncation=True)[0]['summary_text']       # setting max output length to 130


In [None]:
# Summarize every dialogue of the evaluation frame with the current pipeline.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
predictions = [
    pipe(dialogue, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for dialogue in df['dialogue']
]

In [None]:
# Compare the predictions with the reference summaries, which SAMSum stores under 'summary'
# (the frame has no 'highlights' column).
BLEU, BERT, Rouge_1, Rouge_2, Rouge_L, Redundancy = calc_metrics(df['summary'].tolist(), predictions)

In [None]:
# Report the metrics of the current model, one line per metric.
for metric_label, metric_value in (
    ('BLEU Score: ', BLEU),
    ('BERT Score: ', BERT),
    ('Rouge-1 Score: ', Rouge_1),
    ('Rouge-2 Score: ', Rouge_2),
    ('Rouge-L Score: ', Rouge_L),
    ('Redundancy Score: ', Redundancy),
):
    print(metric_label, metric_value)

In [None]:
# Store the metrics of the current model for the later model comparison.
for metric_history, metric_value in (
    (BLEU_scores, BLEU),
    (BERT_scores, BERT),
    (Rouge_1_scores, Rouge_1),
    (Rouge_2_scores, Rouge_2),
    (Rouge_L_scores, Rouge_L),
    (Redundancy_scores, Redundancy),
):
    metric_history.append(metric_value)

#### 3. philschmid/bart-large-cnn-samsum

In [None]:
# Load philschmid/bart-large-cnn-samsum and print the raw pipeline output for the first dialogue.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
pipe = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
print(pipe(df['dialogue'][0], max_length=130, min_length=30, truncation=True))


In [None]:
# Extract just the summary text for the first dialogue ('dialogue' is the SAMSum input column).
pipe(df['dialogue'][0], max_length=130, min_length=30, truncation=True)[0]['summary_text']

In [None]:
# Summarize every dialogue of the evaluation frame with the current pipeline.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
predictions = [
    pipe(dialogue, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for dialogue in df['dialogue']
]

In [None]:
# Compare the predictions with the reference summaries, which SAMSum stores under 'summary'
# (the frame has no 'highlights' column).
BLEU, BERT, Rouge_1, Rouge_2, Rouge_L, Redundancy = calc_metrics(df['summary'].tolist(), predictions)

In [None]:
# Report the metrics of the current model ...
for metric_label, metric_value in (
    ('BLEU Score: ', BLEU),
    ('BERT Score: ', BERT),
    ('Rouge-1 Score: ', Rouge_1),
    ('Rouge-2 Score: ', Rouge_2),
    ('Rouge-L Score: ', Rouge_L),
    ('Redundancy Score: ', Redundancy),
):
    print(metric_label, metric_value)

# ... and store them for the later model comparison.
for metric_history, metric_value in (
    (BLEU_scores, BLEU),
    (BERT_scores, BERT),
    (Rouge_1_scores, Rouge_1),
    (Rouge_2_scores, Rouge_2),
    (Rouge_L_scores, Rouge_L),
    (Redundancy_scores, Redundancy),
):
    metric_history.append(metric_value)

#### 4. google/pegasus-cnn_dailymail

In [None]:
# Load google/pegasus-cnn_dailymail and try it on the first dialogue.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe(df['dialogue'][0], max_length=130, min_length=30, truncation=True)[0]['summary_text']

In [None]:
# Summarize every dialogue of the evaluation frame with the current pipeline.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
predictions = [
    pipe(dialogue, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for dialogue in df['dialogue']
]

In [None]:
# Compare the predictions with the reference summaries, which SAMSum stores under 'summary'
# (the frame has no 'highlights' column).
BLEU, BERT, Rouge_1, Rouge_2, Rouge_L, Redundancy = calc_metrics(df['summary'].tolist(), predictions)

In [None]:
# Report the metrics of the current model ...
for metric_label, metric_value in (
    ('BLEU Score: ', BLEU),
    ('BERT Score: ', BERT),
    ('Rouge-1 Score: ', Rouge_1),
    ('Rouge-2 Score: ', Rouge_2),
    ('Rouge-L Score: ', Rouge_L),
    ('Redundancy Score: ', Redundancy),
):
    print(metric_label, metric_value)

# ... and store them for the later model comparison.
for metric_history, metric_value in (
    (BLEU_scores, BLEU),
    (BERT_scores, BERT),
    (Rouge_1_scores, Rouge_1),
    (Rouge_2_scores, Rouge_2),
    (Rouge_L_scores, Rouge_L),
    (Redundancy_scores, Redundancy),
):
    metric_history.append(metric_value)

#### 5. knkarthick/MEETING_SUMMARY

In [None]:
# Load knkarthick/MEETING_SUMMARY and try it on the first dialogue.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
pipe = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
pipe(df['dialogue'][0], max_length=130, min_length=30, truncation=True)[0]['summary_text']

In [None]:
# Summarize every dialogue of the evaluation frame with the current pipeline.
# The SAMSum frame stores the input text under 'dialogue' (it has no 'article' column).
predictions = [
    pipe(dialogue, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for dialogue in df['dialogue']
]

In [None]:
# Compare the predictions with the reference summaries, which SAMSum stores under 'summary'
# (the frame has no 'highlights' column).
BLEU, BERT, Rouge_1, Rouge_2, Rouge_L, Redundancy = calc_metrics(df['summary'].tolist(), predictions)

# Report the metrics of the current model ...
print('BLEU Score: ', BLEU)
print('BERT Score: ', BERT)
print('Rouge-1 Score: ', Rouge_1)
print('Rouge-2 Score: ', Rouge_2)
print('Rouge-L Score: ', Rouge_L)
print('Redundancy Score: ', Redundancy)

# ... and store them for the later model comparison.
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

## TOPSIS to find the best model

In [None]:
# Model identifiers, in the same order in which the models were evaluated above.
models = [
    'facebook/bart-large-cnn',
    'sshleifer/distilbart-cnn-12-6',
    'philschmid/bart-large-cnn-samsum',
    'google/pegasus-cnn_dailymail',
    'knkarthick/MEETING_SUMMARY',
]

In [None]:
# Round every collected metric to 3 decimals, in place, so the per-metric lists keep their identity.
# The loop variable is deliberately NOT named `score`: that would rebind the `score`
# function imported from bert_score, which calc_metrics calls.
scores = [BLEU_scores, BERT_scores, Rouge_1_scores, Rouge_2_scores, Rouge_L_scores, Redundancy_scores]
for metric_scores in scores:
    metric_scores[:] = [np.round(value, 3) for value in metric_scores]

In [None]:
# Decision matrix for TOPSIS: one row per model, one column per metric.
df_topsis = pd.DataFrame(dict([
    ('Model', models),
    ('BLEU', BLEU_scores),
    ('BERT', BERT_scores),
    ('Rouge-1', Rouge_1_scores),
    ('Rouge-2', Rouge_2_scores),
    ('Rouge-L', Rouge_L_scores),
    ('Redundancy', Redundancy_scores),
]))

In [None]:
# Equal weights for all six metrics (adjust them to reflect your own priorities).
weights = [1] * 6
# '+' marks a metric to maximise, '-' one to minimise: only redundancy should be low.
impacts = ['+'] * 5 + ['-']

In [None]:
def normalize(matrix):
    """Scale every column of the decision matrix by its Euclidean norm.

    Each entry is divided by the square root of the sum of squares of its
    column (axis 0).
    """
    column_norms = np.sqrt(np.sum(np.square(matrix), axis=0))
    return matrix / column_norms

def weighted_normalize(norm_matrix, weights):
    """Apply the criterion weights to the normalized matrix.

    ``weights`` holds one weight per column and is multiplied onto
    ``norm_matrix`` element-wise.
    """
    return norm_matrix * weights

def ideal_best_worst(weighted_norm_matrix, impacts):
    """Determine the ideal-best and ideal-worst value of every criterion.

    For a benefit criterion (impact ``+1`` or ``'+'``) the best value is the
    column maximum and the worst value the column minimum. For a cost
    criterion (impact ``-1`` or ``'-'``) the roles are swapped: the minimum
    is best and the maximum is worst.

    Parameters
    ----------
    weighted_norm_matrix : 2-D array-like
        Weighted normalized decision matrix (rows = alternatives,
        columns = criteria).
    impacts : sequence
        One entry per criterion: a positive number or ``'+'`` for a benefit
        criterion, anything else (e.g. ``-1`` or ``'-'``) for a cost criterion.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ideal_solution, ideal_worst_solution)``, one value per criterion.
    """
    is_benefit = np.array([
        (impact.strip() == '+') if isinstance(impact, str) else (impact > 0)
        for impact in impacts
    ])

    column_max = np.asarray(np.max(weighted_norm_matrix, axis=0))
    column_min = np.asarray(np.min(weighted_norm_matrix, axis=0))

    # Select max/min per column instead of multiplying by +/-1: a sign flip
    # would produce values that do not exist in the matrix for cost criteria.
    ideal_solution = np.where(is_benefit, column_max, column_min)
    ideal_worst_solution = np.where(is_benefit, column_min, column_max)
    return ideal_solution, ideal_worst_solution

def euclidean_distances(weighted_norm_matrix, ideal_solution, ideal_worst_solution):
    """Measure how far every alternative lies from the two reference points.

    Returns ``(dist_to_ideal, dist_to_ideal_worst)``: for each row of
    ``weighted_norm_matrix`` the Euclidean distance to ``ideal_solution`` and
    to ``ideal_worst_solution`` respectively.
    """
    offset_from_best = weighted_norm_matrix - ideal_solution
    offset_from_worst = weighted_norm_matrix - ideal_worst_solution

    # Row-wise (axis=1) root of the summed squared offsets.
    dist_to_ideal = np.sqrt(np.sum(offset_from_best ** 2, axis=1))
    dist_to_ideal_worst = np.sqrt(np.sum(offset_from_worst ** 2, axis=1))
    return dist_to_ideal, dist_to_ideal_worst

def performance_score(dist_to_ideal, dist_to_ideal_worst):
    """Compute the TOPSIS score of every alternative.

    The score is the distance to the ideal-worst solution divided by the sum
    of both distances.
    """
    total_distance = dist_to_ideal + dist_to_ideal_worst
    return dist_to_ideal_worst / total_distance

def topsis(matrix, weights, impacts):
    """Run the full TOPSIS pipeline on a decision matrix.

    Parameters
    ----------
    matrix : 2-D array-like
        Decision matrix with one row per alternative/model and one column
        per criterion.
    weights : sequence
        One weight per criterion.
    impacts : sequence
        One impact indicator per criterion, forwarded unchanged to
        ``ideal_best_worst``.

    Returns
    -------
    tuple
        ``(score, rankings)``: the performance score of every alternative and
        its rank, where rank 1 belongs to the highest score.
    """
    # Steps 1-2: normalize the decision matrix, then apply the weights.
    weighted = weighted_normalize(normalize(matrix), weights)

    # Step 3: reference points (ideal best / ideal worst).
    best_point, worst_point = ideal_best_worst(weighted, impacts)

    # Step 4: distance of every alternative to both reference points.
    to_best, to_worst = euclidean_distances(weighted, best_point, worst_point)

    # Step 5: performance score per alternative.
    score = performance_score(to_best, to_worst)

    # Step 6: convert scores to ranks. `descending` lists the alternatives from
    # highest to lowest score; writing 1..n at those positions yields the ranks.
    descending = np.argsort(score)[::-1]
    rankings = np.empty_like(descending)
    rankings[descending] = np.arange(1, len(score) + 1)

    return score, rankings

In [None]:
# Keep only the numeric metric columns and translate the '+'/'-' impacts to +1/-1.
df_metrics = df_topsis.drop(columns='Model')
impacts_as_integers = [{'+': 1}.get(impact, -1) for impact in impacts]

In [None]:
# Score and rank all models on the metric matrix.
topsis_score, rankings = topsis(
    matrix=df_metrics,
    weights=weights,
    impacts=impacts_as_integers,
)

In [None]:
# Round the TOPSIS scores to 3 decimals, element by element, in place.
for idx, raw_score in enumerate(topsis_score):
    topsis_score[idx] = np.round(raw_score, 3)

In [None]:
# Attach the TOPSIS results to the comparison table as two new columns.
for column_name, column_values in (
    ('TOPSIS Score', topsis_score),
    ('TOPSIS Rank', rankings),
):
    df_topsis[column_name] = column_values

In [None]:
# Display the metric table together with each model's TOPSIS score and rank.
df_topsis