In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
!pip install nltk rouge-score bert-score



In [4]:
import nltk
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score

In [5]:
import warnings

# Suppress the warning
warnings.filterwarnings("ignore", message="Your max_length is set to *")

## Reading the Dataset

In [6]:
pip install py7zr

Note: you may need to restart the kernel to use updated packages.


In [7]:
dataset = load_dataset("samsum")

  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:
df= dataset['test'].to_pandas()

In [10]:
df= df.sample(n= 150, replace= False).reset_index(drop= True)  

## Analysing the dataset

In [11]:
df.head()

Unnamed: 0,id,dialogue,summary
0,13816051,Harriette: Have you ever gone ghost hunting? ;...,Jamie has never gone ghost hunting but Harriet...
1,13681603,Ella: so? \r\nMolly: ?\r\nElla: come on! pics ...,Chuck told Ella that he'd met up with Molly. H...
2,13729101,"Brenda: Hello, is this Sandra Donovan?\r\nSand...",Sandra and Brenda used to work together in the...
3,13863098,Cristina: Hey\nCristina: Do you think we can m...,Gaya and Cristina are going to meet at Gaya's ...
4,13681441,Joanna: They are sending emails about Lewandow...,Lewandowska has measles. There are vaccination...


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        150 non-null    object
 1   dialogue  150 non-null    object
 2   summary   150 non-null    object
dtypes: object(3)
memory usage: 3.6+ KB


In [13]:
df.describe()

Unnamed: 0,id,dialogue,summary
count,150,150,150
unique,150,150,150
top,13816051,Harriette: Have you ever gone ghost hunting? ;...,Jamie has never gone ghost hunting but Harriet...
freq,1,1,1


In [14]:
print('Dialogue:\n',df['dialogue'][0])
print('\nSummary:\n',df['summary'][0])

Dialogue:
 Harriette: Have you ever gone ghost hunting? ;o
Jamie: Ghost hunting? Nah, not really... Have you?
Harriette: Yeah, once when I was in high school! There was a run-down building in the neighbourhood and we went to investigate it with my friends
Jamie: How was it? Did you find something?
Harriette: We didn't see any ghosts, haha
Harriette: But let me tell you that I never thought I'd freak out this much at hearing a cat meow
Harriette: There's just something about the atmosphere... that makes you overreact and find normal but unexpected things really creepy
Jamie: I guess that's part of the experience? :p
Harriette: Yeah, if I could choose again, I'd probably still decide to go - I don't regret it! But I definitely wouldn't try something like that alone ^^;

Summary:
 Jamie has never gone ghost hunting but Harriette did with her friends once in high school. They did not see any ghosts and she only got frightened by a cat's miaowing.


## Model Metric Evaluation

In [15]:
def calculate_redundancy(summaries):
    
    total_tokens = sum(len(summary.split()) for summary in summaries)
    unique_tokens = len(set(token for summary in summaries for token in summary.split()))
    redundancy_score = 1 - (unique_tokens / total_tokens)
    
    return redundancy_score

In [16]:
def calc_metrics(actual_summaries, pred_summaries):
    
    # Calculate BLEU Score
    actual_summaries_tokenized = [[ref.split()] for ref in actual_summaries]         # tokenizing the actual summary
    pred_summaries_tokenized = [output.split() for output in pred_summaries]         # tokenizing the predicted summary
    bleu_score = corpus_bleu(actual_summaries_tokenized, pred_summaries_tokenized)   # comparing the tokens to calculate BLEU score
    
    
    # Calculate BERT Score
    P, R, F1 = score(actual_summaries, pred_summaries, lang='en', verbose=False)     # returns Precision, Recall and F1 score
    bert_score = F1.mean().item()                                                    # takes the mean of F1 scores across all examples (.item() used to convert PyTorch tensor into scalar value) 
    
    
    # Calculate ROUGE Scores
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)          # creates an object 'rouge' that will be used to compute ROUGE scores with ROUGE-1, ROUGE-2, and ROUGE-L metrics, using stemming
             
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_L_scores = []
    for pred, actual in zip(pred_summaries, actual_summaries):
        rouge_scores = rouge.score(pred, actual)                                                # returns a dictionary of mentioned ROUGE scores each of which contain precison, recall and F1 score
        rouge_1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge_2_scores.append(rouge_scores['rouge2'].fmeasure)
        rouge_L_scores.append(rouge_scores['rougeL'].fmeasure)
    
    rouge_1_f1 = sum(rouge_1_scores) / len(rouge_1_scores)                                      # calculating the average rouge scores (considering F1 score)
    rouge_2_f1 = sum(rouge_2_scores) / len(rouge_2_scores)
    rouge_L_f1 = sum(rouge_L_scores) / len(rouge_L_scores)
    
    # Calculate Redundancy Score
    redundancy_score = calculate_redundancy(pred_summaries)
    
    
    return bleu_score, bert_score, rouge_1_f1, rouge_2_f1, rouge_L_f1, redundancy_score        # returning all the calculated metrics

## Testing on different models

##### 1. facebook/bart-large-cnn

In [17]:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summarizer(df['dialogue'][0], max_length= 130, min_length=30, truncation= True)   

2024-02-04 06:37:07.022931: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-04 06:37:07.023046: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-04 06:37:07.026859: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[{'summary_text': "Harriette went ghost hunting with her friends when she was in high school. They investigated a run-down building in the neighbourhood and didn't see any ghosts. Harriette says she doesn't regret going ghost hunting, but she wouldn't do it alone."}]

In [18]:
summarizer(df['dialogue'][0], max_length= 30, min_length=10, truncation= True)[0]['summary_text']

'Harriette went ghost hunting with her friends when she was in high school. They investigated a run-down building in the neighbourhood.'

In [19]:
predictions= []
for i in range(0,len(df)):
    pred= summarizer(df['dialogue'][i], max_length=30, min_length=10, truncation= True)[0]['summary_text']
    predictions.append(pred)

Your max_length is set to 30, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max

In [20]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['summary'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
BLEU_scores= []
BERT_scores= []
Rouge_1_scores= []
Rouge_2_scores= []
Rouge_L_scores= []
Redundancy_scores= []
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

BLEU Score:  0.06819376766320177
BERT Score:  0.8777881860733032
Rouge-1 Score:  0.3537191342343953
Rouge-2 Score:  0.11681459839731756
Rouge-L Score:  0.2743427144308629
Redundancy Score:  0.5372358850017319


#### 2. sshleifer/distilbart-cnn-12-6

In [22]:
pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
pipe(df['dialogue'][0], max_length= 30, min_length=10, truncation= True)[0]['summary_text']       


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

' Harriette: Have you ever gone ghost hunting? ;o\xa0\xa0 Jamie: Ghost hunting? Nah, not really... Have you?'

In [23]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['dialogue'][i], max_length=30, min_length=10, truncation= True)[0]['summary_text']
    predictions.append(pred)

Your max_length is set to 30, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max

In [24]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['summary'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)

BLEU Score:  0.04917266108914351
BERT Score:  0.8652205467224121
Rouge-1 Score:  0.31109114879000405
Rouge-2 Score:  0.09968754622698285
Rouge-L Score:  0.24437495396197187
Redundancy Score:  0.539986559139785


In [26]:
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

#### 3. philschmid/bart-large-cnn-samsum

In [27]:
pipe = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
print(pipe(df['dialogue'][0], max_length= 30, min_length=10, truncation= True)) 


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

[{'summary_text': 'Harriette went ghost hunting once when she was in high school with her friends in a run-down building in the neighbourhood'}]


In [28]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['dialogue'][i], max_length=30, min_length=10, truncation= True)[0]['summary_text']
    predictions.append(pred)

Your max_length is set to 30, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max

In [29]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['summary'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

BLEU Score:  0.1716887652393266
BERT Score:  0.9205511212348938
Rouge-1 Score:  0.5321249492047778
Rouge-2 Score:  0.2916850985556861
Rouge-L Score:  0.44859674661969223
Redundancy Score:  0.5589910496338486


#### 4. google/pegasus-cnn_dailymail

In [31]:
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe(df['dialogue'][0], max_length= 30, min_length=10, truncation= True)[0]['summary_text'] 

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

"Harriette went ghost hunting with her friends in high school .<n>She never thought she'd freak out at hearing a cat meow "

In [32]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['dialogue'][i], max_length=30, min_length=10, truncation= True)[0]['summary_text']
    predictions.append(pred)

Your max_length is set to 30, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 30, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max

In [33]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['summary'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

BLEU Score:  0.050751401318902996
BERT Score:  0.8598005175590515
Rouge-1 Score:  0.31919036232745535
Rouge-2 Score:  0.09517286377370723
Rouge-L Score:  0.2538947214047014
Redundancy Score:  0.5382047477744807


#### 5. knkarthick/MEETING_SUMMARY

In [35]:
pipe = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
pipe(df['dialogue'][0], max_length= 30, min_length=10, truncation= True)[0]['summary_text'] 

config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

"Harriette went ghost hunting once when she was in high school. She didn't find any ghosts but she did find it"

In [36]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['dialogue'][i], max_length=30, min_length=10, truncation= True)[0]['summary_text']
    predictions.append(pred)

Your max_length is set to 30, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max

In [37]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['summary'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

BLEU Score:  0.16257379211637224
BERT Score:  0.9168060421943665
Rouge-1 Score:  0.5114647009961298
Rouge-2 Score:  0.2738689357642878
Rouge-L Score:  0.43198515492010814
Redundancy Score:  0.5599679101484156


## Topsis to find best model

In [39]:
models= ['facebook/bart-large-cnn','sshleifer/distilbart-cnn-12-6','philschmid/bart-large-cnn-samsum','google/pegasus-cnn_dailymail','knkarthick/MEETING_SUMMARY']

In [40]:
scores= [BLEU_scores,BERT_scores,Rouge_1_scores,Rouge_2_scores,Rouge_L_scores,Redundancy_scores]
for score in scores:
    for i in range(len(score)):
        score[i]= np.round(score[i],3)

In [41]:
df_topsis= pd.DataFrame({
    'Model': models,
    'BLEU': BLEU_scores,
    'BERT': BERT_scores,
    'Rouge-1': Rouge_1_scores,
    'Rouge-2': Rouge_2_scores,
    'Rouge-L': Rouge_L_scores,
    'Redundancy': Redundancy_scores
})

In [42]:
weights= [1,1,1,1,1,1]             # assuming equal weights (you may choose weights according to your priorities)        
impacts= ['+','+','+','+','+','-']

In [43]:
def normalize(matrix):
    norm_matrix = matrix / np.sqrt(np.sum(matrix**2, axis=0))                    # normalize the matrix
    return norm_matrix

def weighted_normalize(norm_matrix, weights):
    weighted_norm_matrix = norm_matrix * weights                                 # calculate the weighted normalized matrix
    return weighted_norm_matrix

def ideal_best_worst(weighted_norm_matrix, impacts):
    ideal_solution = np.max(weighted_norm_matrix, axis=0) * impacts              # calculate the ideal_best and ideal_worst solutions
    ideal_worst_solution = np.min(weighted_norm_matrix, axis=0) * impacts
    return ideal_solution, ideal_worst_solution

def euclidean_distances(weighted_norm_matrix, ideal_solution, ideal_worst_solution):
    dist_to_ideal = np.sqrt(np.sum((weighted_norm_matrix - ideal_solution)**2, axis=1))           # Calculate the Euclidean distances to the ideal_best and ideal_worst solutions.
    dist_to_ideal_worst = np.sqrt(np.sum((weighted_norm_matrix - ideal_worst_solution)**2, axis=1))
    return dist_to_ideal, dist_to_ideal_worst

def performance_score(dist_to_ideal, dist_to_ideal_worst):
    score = dist_to_ideal_worst / (dist_to_ideal + dist_to_ideal_worst)            # calculate the topsis score for each model
    return score

def topsis(matrix, weights, impacts):                                              # perform TOPSIS analysis
    # Step 1: Normalize the decision matrix
    norm_matrix = normalize(matrix)
    
    # Step 2: Calculate the weighted normalized decision matrix
    weighted_norm_matrix = weighted_normalize(norm_matrix, weights)
    
    # Step 3: Determine the ideal_best and ideal_worst solutions
    ideal_solution, ideal_worst_solution = ideal_best_worst(weighted_norm_matrix, impacts)
    
    # Step 4: Calculate the Euclidean distances to the ideal_best and ideal_worst solutions
    dist_to_ideal, dist_to_ideal_worst = euclidean_distances(weighted_norm_matrix, ideal_solution, ideal_worst_solution)
    
    # Step 5: Calculate the performance score for each alternative/model
    score = performance_score(dist_to_ideal, dist_to_ideal_worst)
    
    # Step 6: Rank the alternatives/models based on their performance scores
    sorted_indices = np.argsort(score)[::-1]                                       # Indices of scores sorted in descending order
    rankings = np.empty_like(sorted_indices)                                       # Create an empty array to store rankings
    rankings[sorted_indices] = np.arange(len(score)) + 1                           # Assign ranks
    
    return score, rankings

In [44]:
df_metrics= df_topsis.drop('Model',axis=1)
impacts_as_integers = [1 if impact == '+' else -1 for impact in impacts]

In [45]:
topsis_score, rankings = topsis(df_metrics, weights, impacts_as_integers)

In [46]:
for i in range(len(topsis_score)):
    topsis_score[i] = np.round(topsis_score[i], 3)

In [47]:
df_topsis['TOPSIS Score'] = topsis_score
df_topsis['TOPSIS Rank'] = rankings

In [48]:
df_topsis

Unnamed: 0,Model,BLEU,BERT,Rouge-1,Rouge-2,Rouge-L,Redundancy,TOPSIS Score,TOPSIS Rank
0,facebook/bart-large-cnn,0.068,0.878,0.354,0.117,0.274,0.537,0.445,3
1,sshleifer/distilbart-cnn-12-6,0.049,0.865,0.311,0.1,0.244,0.54,0.43,5
2,philschmid/bart-large-cnn-samsum,0.172,0.921,0.532,0.292,0.449,0.559,0.561,1
3,google/pegasus-cnn_dailymail,0.051,0.86,0.319,0.095,0.254,0.538,0.431,4
4,knkarthick/MEETING_SUMMARY,0.163,0.917,0.511,0.274,0.432,0.56,0.552,2
