In [None]:
pip install transformers torch



In [12]:
!pip install transformers datasets scikit-learn pandas numpy



In [13]:
import time
import numpy as np
import pandas as pd

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [14]:
dataset = load_dataset("sst2")


In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [16]:
model_names = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base",
    "albert-base-v2",
    "cardiffnlp/twitter-roberta-base-sentiment"
]


In [18]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    save_strategy="no",
    report_to="none"
)

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [19]:
results = []

for model_name in model_names:
    print(f"\nEvaluating {model_name}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        ignore_mismatched_sizes=True
    )

    def tokenize_function(batch):
        return tokenizer(
            batch["sentence"],
            padding="max_length",
            truncation=True,
            max_length=128
        )

    tokenized_eval = dataset["validation"].map(
        tokenize_function,
        batched=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_eval,
        compute_metrics=compute_metrics
    )

    start = time.time()
    metrics = trainer.evaluate()
    end = time.time()

    inference_time = end - start
    model_size = sum(p.numel() for p in model.parameters()) * 4 / (1024**2)

    results.append({
        "Model": model_name,
        "Accuracy": metrics["eval_accuracy"],
        "Precision": metrics["eval_precision"],
        "Recall": metrics["eval_recall"],
        "F1": metrics["eval_f1"],
        "Inference_Time": inference_time,
        "Model_Size_MB": model_size
    })



Evaluating distilbert-base-uncased...


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Map:   0%|          | 0/872 [00:00<?, ? examples/s]




Evaluating bert-base-uncased...


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Map:   0%|          | 0/872 [00:00<?, ? examples/s]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Evaluating roberta-base...


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Map:   0%|          | 0/872 [00:00<?, ? examples/s]




Evaluating albert-base-v2...


Loading weights:   0%|          | 0/25 [00:00<?, ?it/s]

AlbertForSequenceClassification LOAD REPORT from: albert-base-v2
Key                          | Status     | 
-----------------------------+------------+-
predictions.LayerNorm.weight | UNEXPECTED | 
predictions.dense.weight     | UNEXPECTED | 
predictions.LayerNorm.bias   | UNEXPECTED | 
predictions.decoder.bias     | UNEXPECTED | 
predictions.bias             | UNEXPECTED | 
predictions.dense.bias       | UNEXPECTED | 
classifier.bias              | MISSING    | 
classifier.weight            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Map:   0%|          | 0/872 [00:00<?, ? examples/s]




Evaluating cardiffnlp/twitter-roberta-base-sentiment...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment
Key                             | Status     |                                                                                     
--------------------------------+------------+-------------------------------------------------------------------------------------
roberta.embeddings.position_ids | UNEXPECTED |                                                                                     
classifier.out_proj.weight      | MISMATCH   | Reinit due to size mismatch ckpt: torch.Size([3, 768]) vs model:torch.Size([2, 768])
classifier.out_proj.bias        | MISMATCH   | Reinit due to size mismatch ckpt: torch.Size([3]) vs model:torch.Size([2])          

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISMATCH	:ckpt weights were loaded, but they did not match the original empty weight shapes.


Map:   0%|          | 0/872 [00:00<?, ? examples/s]



In [20]:
df = pd.DataFrame(results)
df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Inference_Time,Model_Size_MB
0,distilbert-base-uncased,0.474771,0.488372,0.662162,0.562141,238.796822,255.413094
1,bert-base-uncased,0.490826,0.0,0.0,0.0,376.536411,417.647469
2,roberta-base,0.508028,0.523364,0.378378,0.439216,366.512647,475.491219
3,albert-base-v2,0.508028,0.508834,0.972973,0.668213,414.392729,44.575203
4,cardiffnlp/twitter-roberta-base-sentiment,0.440367,0.47284,0.862613,0.610845,370.891386,475.491219


In [21]:
criteria_cols = [
    "Accuracy",
    "Precision",
    "Recall",
    "F1",
    "Inference_Time",
    "Model_Size_MB"
]

decision_matrix = df[criteria_cols].values


In [22]:
weights = np.array([0.25, 0.15, 0.15, 0.25, 0.10, 0.10])

# +1 = benefit, -1 = cost
impacts = np.array([1, 1, 1, 1, -1, -1])


In [23]:
# Step 1: Normalize
norm = np.sqrt((decision_matrix ** 2).sum(axis=0))
normalized = decision_matrix / norm

# Step 2: Weighted normalized matrix
weighted = normalized * weights

# Step 3: Ideal best & worst
ideal_best = np.max(weighted * impacts, axis=0)
ideal_worst = np.min(weighted * impacts, axis=0)

# Step 4: Distance measures
dist_best = np.sqrt(((weighted - ideal_best) ** 2).sum(axis=1))
dist_worst = np.sqrt(((weighted - ideal_worst) ** 2).sum(axis=1))

# Step 5: TOPSIS score
topsis_score = dist_worst / (dist_best + dist_worst)

df["TOPSIS_Score"] = topsis_score
df["Rank"] = df["TOPSIS_Score"].rank(ascending=False)

df.sort_values("Rank")


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Inference_Time,Model_Size_MB,TOPSIS_Score,Rank
3,albert-base-v2,0.508028,0.508834,0.972973,0.668213,414.392729,44.575203,0.733218,1.0
0,distilbert-base-uncased,0.474771,0.488372,0.662162,0.562141,238.796822,255.413094,0.71139,2.0
4,cardiffnlp/twitter-roberta-base-sentiment,0.440367,0.47284,0.862613,0.610845,370.891386,475.491219,0.693662,3.0
2,roberta-base,0.508028,0.523364,0.378378,0.439216,366.512647,475.491219,0.614191,4.0
1,bert-base-uncased,0.490826,0.0,0.0,0.0,376.536411,417.647469,0.406477,5.0
