In [1]:
!pip install transformers==4.46.2 datasets==3.1.0 torch==2.5.1+cu121 sklearn-pandas==2.2.0 numpy==1.26.4

Collecting transformers==4.46.2
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     -------------------------- ----------- 30.7/44.1 kB 660.6 kB/s eta 0:00:01
     -------------------------------------- 44.1/44.1 kB 721.1 kB/s eta 0:00:00
Collecting datasets==3.1.0
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)


ERROR: Could not find a version that satisfies the requirement torch==2.5.1+cu121 (from versions: 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1)
ERROR: No matching distribution found for torch==2.5.1+cu121


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import numpy as np
import datasets
import time
import os

from torch.utils.data import DataLoader
import torch

The WANDB_API_KEY is nessesary to use the Trainer class from HuggingFace and is what's used in the example notebook. However, I think we may all have to put in our own api key secret into google colab for it to work.

In [6]:
from google.colab import userdata
os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')

In [8]:
langs = ['java', 'python', 'pharo']

# langs = ['java'] # Using Java as the only language for testing purposes.
# langs = ['python']
# langs = ['pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = datasets.load_dataset('NLBSE/nlbse25-code-comment-classification')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

In [9]:
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [10]:
ds['java_test'][0]

{'index': 5,
 'class': 'AbstractContractGetFileStatusTest.java',
 'comment_sentence': 'accept everything.',
 'partition': 1,
 'combo': 'accept everything. | AbstractContractGetFileStatusTest.java',
 'labels': [0, 0, 1, 0, 0, 0, 0]}

In [11]:
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

The next few functions are to preprocess the trainng and validation sets, and to let the Trainer class to evaluate how well the trainng is going after each epoch.

In [12]:
# To tokenize the text in the 'combo' column of the training dataset of each language.

def tokenize_dataset(examples):
  return tokenizer(examples['combo'], truncation=True, padding="max_length", max_length=128)

In [13]:
# Tokenizes the text 'combo' column of the dataset, changes the values of the
# labels column to float instead of int, and sets to format of the lables column
# to torch tesnsors, which is required by the Trainer class.

def preprocess_dataset(input_dataset):
  processed_dataset = input_dataset.map(tokenize_dataset, batched=True, load_from_cache_file=False)
  processed_dataset = processed_dataset.cast_column("labels", datasets.features.Sequence(datasets.features.Value("float32")))
  processed_dataset.set_format('pt')
  return processed_dataset

In [14]:
# To compute eveluation metrics during the training process.

def compute_metrics(pred):
    logits, labels = pred
    preds = torch.sigmoid(torch.tensor(logits)) > 0.5
    labels = torch.tensor(labels)

    precision = precision_score(labels, preds, average='micro')
    recall = recall_score(labels, preds, average='micro')
    f1 = f1_score(labels, preds, average='micro')

    return {"precision": precision, "recall": recall, "f1": f1}

In [None]:
# Training the models for each language and saving the the models and thier
# tokenizers after training.

# trying different pre-trained models such as roberta-base, distilbert-base, or codebert-large

learningRates = [5e-5, 5e-7]
epochs = [3, 5, 10]
batch_sizes = [8, 16]
weights_of_decay = [0.01, 0.001]

for lr in learningRates:
  for epoch in epochs:
    for bs in batch_sizes:
      for wd in weights_of_decay:
        print(f'------------------ Starting model ==> learning rate: {lr}, epochs: {epoch}, batch size: {bs}, weights of decay: {wd} ---------------------')

        for lang in langs:
          num_labels = len(labels[lang])
          model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, problem_type="multi_label_classification")

          dataset = preprocess_dataset(ds[f'{lang}_train'])
          train_valalidation_split = dataset.train_test_split(test_size=0.2, seed=27)

          train_dataset =  train_valalidation_split['train']
          validation_dataset = train_valalidation_split['test']

          training_args = TrainingArguments(
            output_dir=f'./results_{lang}_lr-{lr}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}',
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir=f'./logs_{lang}_lr-{lr}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}',
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=epoch,
            weight_decay=wd,
            learning_rate=lr,
            logging_steps=10,
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
          )

          trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset= validation_dataset,
            processing_class=tokenizer,
            compute_metrics=compute_metrics,
          )

          trainer.train()
          metrics = trainer.evaluate()

          print("Evaluation Metrics:", metrics)

          trainer.model.save_pretrained(f'./models/{lang}_lr-{lr}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}')
          tokenizer.save_pretrained(f'./tokenizers/{lang}_lr-{lr}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}')

------------------ Starting model ==> learning rate: 5e-05, epochs: 3, batch size: 8, weights of decay: 0.01 ---------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1029,0.101885,0.930526,0.844586,0.885476
2,0.0342,0.085496,0.925,0.895541,0.910032
3,0.0169,0.077915,0.928712,0.904459,0.916425


Evaluation Metrics: {'eval_loss': 0.07791507989168167, 'eval_precision': 0.9287115761935906, 'eval_recall': 0.9044585987261147, 'eval_f1': 0.9164246531139078, 'eval_runtime': 9.9919, 'eval_samples_per_second': 152.424, 'eval_steps_per_second': 19.116, 'epoch': 3.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3787,0.329733,0.820717,0.496386,0.618619
2,0.2384,0.271413,0.80597,0.650602,0.72
3,0.1829,0.261462,0.809798,0.677108,0.737533


Evaluation Metrics: {'eval_loss': 0.2614622712135315, 'eval_precision': 0.8097982708933718, 'eval_recall': 0.6771084337349398, 'eval_f1': 0.7375328083989501, 'eval_runtime': 2.5163, 'eval_samples_per_second': 149.822, 'eval_steps_per_second': 19.075, 'epoch': 3.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1298 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2653,0.298666,0.853448,0.325658,0.471429
2,0.2265,0.252382,0.85567,0.546053,0.666667
3,0.143,0.240018,0.835616,0.601974,0.699809


Evaluation Metrics: {'eval_loss': 0.24001796543598175, 'eval_precision': 0.8356164383561644, 'eval_recall': 0.6019736842105263, 'eval_f1': 0.6998087954110899, 'eval_runtime': 1.76, 'eval_samples_per_second': 147.729, 'eval_steps_per_second': 18.75, 'epoch': 3.0}
------------------ Starting model ==> learning rate: 5e-05, epochs: 3, batch size: 8, weights of decay: 0.001 ---------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1316,0.111189,0.920447,0.840127,0.878455
2,0.0457,0.089475,0.932478,0.879618,0.905277
3,0.0199,0.080256,0.921875,0.901911,0.911784


Evaluation Metrics: {'eval_loss': 0.08025563508272171, 'eval_precision': 0.921875, 'eval_recall': 0.9019108280254777, 'eval_f1': 0.9117836445589182, 'eval_runtime': 10.1129, 'eval_samples_per_second': 150.6, 'eval_steps_per_second': 18.887, 'epoch': 3.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3777,0.335848,0.79845,0.496386,0.612184
2,0.231,0.279782,0.773639,0.650602,0.706806
3,0.1811,0.266165,0.809659,0.686747,0.743155


Evaluation Metrics: {'eval_loss': 0.266165167093277, 'eval_precision': 0.8096590909090909, 'eval_recall': 0.6867469879518072, 'eval_f1': 0.7431551499348109, 'eval_runtime': 2.5027, 'eval_samples_per_second': 150.634, 'eval_steps_per_second': 19.179, 'epoch': 3.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1298 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2653,0.298663,0.853448,0.325658,0.471429
2,0.2264,0.252377,0.85567,0.546053,0.666667
3,0.143,0.240034,0.835616,0.601974,0.699809


Evaluation Metrics: {'eval_loss': 0.2400340735912323, 'eval_precision': 0.8356164383561644, 'eval_recall': 0.6019736842105263, 'eval_f1': 0.6998087954110899, 'eval_runtime': 1.7416, 'eval_samples_per_second': 149.291, 'eval_steps_per_second': 18.948, 'epoch': 3.0}
------------------ Starting model ==> learning rate: 5e-05, epochs: 3, batch size: 16, weights of decay: 0.01 ---------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0953,0.10092,0.921089,0.840127,0.878748
2,0.0413,0.083624,0.931011,0.88535,0.907607
3,0.0303,0.074778,0.931417,0.90828,0.919703


Evaluation Metrics: {'eval_loss': 0.074777752161026, 'eval_precision': 0.9314173742651861, 'eval_recall': 0.9082802547770701, 'eval_f1': 0.9197033215091905, 'eval_runtime': 9.8646, 'eval_samples_per_second': 154.39, 'eval_steps_per_second': 9.732, 'epoch': 3.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4128,0.355853,0.846473,0.491566,0.621951
2,0.3444,0.309186,0.867925,0.554217,0.676471
3,0.2786,0.287646,0.80805,0.628916,0.707317


Evaluation Metrics: {'eval_loss': 0.28764623403549194, 'eval_precision': 0.8080495356037152, 'eval_recall': 0.6289156626506024, 'eval_f1': 0.7073170731707317, 'eval_runtime': 2.4408, 'eval_samples_per_second': 154.455, 'eval_steps_per_second': 9.833, 'epoch': 3.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1298 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3278,0.319282,0.863636,0.3125,0.458937
2,0.2599,0.268762,0.883562,0.424342,0.573333
3,0.234,0.24936,0.860697,0.569079,0.685149


Evaluation Metrics: {'eval_loss': 0.24935965240001678, 'eval_precision': 0.8606965174129353, 'eval_recall': 0.569078947368421, 'eval_f1': 0.6851485148514852, 'eval_runtime': 1.7216, 'eval_samples_per_second': 151.022, 'eval_steps_per_second': 9.875, 'epoch': 3.0}
------------------ Starting model ==> learning rate: 5e-05, epochs: 3, batch size: 16, weights of decay: 0.001 ---------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0899,0.104214,0.923131,0.833758,0.876171


Evaluating the predictions.

In [None]:
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, texts):
      self.texts = texts

  def __len__(self):
      return len(self.texts)

  def __getitem__(self, idx):
      return self.texts[idx]

In [16]:
models = []

for lr in learningRates:
  for epoch in epochs:
    for bs in batch_sizes:
      for wd in weights_of_decay:
        models.append((AutoModelForSequenceClassification.from_pretrained(f'./models/java_lr-{lr}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}'),
                      AutoTokenizer.from_pretrained(f'./tokenizers/java_lr-{lr}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}')))

SyntaxError: expected ':' (<ipython-input-16-df6e85c9b76b>, line 4)

In [None]:
def get_predictions(input_text_list, tokenizer, model, threshold=0.5, batch_size=8):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    dataset = TextDataset(input_text_list)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            encoding = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

            input_ids = encoding["input_ids"].to(device)
            attention_mask = encoding["attention_mask"].to(device)

            # Forward pass through the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            probs = torch.sigmoid(logits)

            # Create binary predictions based on the threshold
            binary_predictions = (probs >= threshold).int()

            # If no label meets the threshold, pick the one with the highest probability
            for i, row in enumerate(binary_predictions):
                if row.sum() == 0: # If no lable is chosen
                    max_idx = torch.argmax(probs[i])
                    binary_predictions[i, max_idx] = 1

            all_predictions.append(binary_predictions.cpu().numpy())

    # Collect all predictions into a single matrix of size (num labels, num input samples)
    prediction_matrix = np.vstack(all_predictions)
    return prediction_matrix

In [None]:
results = get_predictions(ds['java_test']['combo'], tokenizer, model, threshold=0.5)

In [None]:
results[0:20]

SequenceClassifierOutput(loss=None, logits=tensor([[-4.3638, -5.7541, -0.2655, -3.3848, -0.9917, -3.4193, -1.4983]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
ds['java_test']['labels'][0]

[0, 0, 1, 0, 0, 0, 0]

Below is non-functional code I still haven't finished. We need to implement a way for the logits returned from the model to be converted into binary (0 - 1) predictions for the labels, and for those labels to be compaired to the ground truth labels of the testing datasets for each language. The competition also wants us to complete this step using profiling that they have set up, and which is partially shown below. The link to the reference notebook for the competition is [here](https://colab.research.google.com/drive/1RULzasjO_nrqiXLrGze-PznFlHKtGQ4s?usp=sharing). Also, another notebook that is fairly close to what I believe we are trying to do can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb).

In [None]:
total_flops = 0
total_time = 0
scores = []

for model, tokenizer in models:
    print(f'------------------ Profiling Model: {model} ---------------------')
    for lan in langs:
        # to load trained models:
        # model = AutoModelForSequenceClassification.from_pretrained(f'./models/{lan}_lr-{lr}_epoch-{epoch}')
        # to load pretrained models from Hub:
        # model = SetFitModel.from_pretrained(f"NLBSE/nlbse25_{lan}")
        with torch.profiler.profile(with_flops=True) as p:
            begin = time.time()
            for i in range(10):
              y_pred = model(ds[f'{lan}_test']['combo']).numpy().T
            total = time.time() - begin
            total_time = total_time + total
        total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)
        y_true = np.array(ds[f'{lan}_test']['labels']).T
        for i in range(len(y_pred)):
            assert(len(y_pred[i]) == len(y_true[i]))
            tp = sum([true == pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
            tn = sum([true == pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
            fp = sum([true == 0 and pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
            fn = sum([true == 1 and pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = (2*tp) / (2*tp + fp + fn)
            scores.append({'lan': lan, 'cat': labels[lan][i],'precision': precision,'recall': recall,'f1': f1})
    print("Compute in GFLOPs:", total_flops/10)
    print("Avg runtime in seconds:", total_time/10)
    scores = pd.DataFrame(scores)
    print(f'Scores: {scores}')
    max_avg_runtime = 5
    max_avg_flops = 5000

    # s𝑢𝑏𝑚𝑖𝑠𝑠𝑖𝑜𝑛_𝑠𝑐𝑜𝑟𝑒(𝑚𝑜𝑑𝑒𝑙)=(𝑎𝑣𝑔. 𝐹1)×0.60+((𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)/𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)×0.2+((𝑚𝑎𝑥_GFLOPs−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_GFLOPs)/𝑚𝑎𝑥_GFLOPs)×0.2
    def score(avg_f1, avg_runtime, avg_flops):
        return (0.6 * avg_f1 +
          0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime) +
          0.2 * ((max_avg_flops - avg_flops) / max_avg_flops))

    avg_f1 = scores.f1.mean()
    avg_runtime = total_time/10
    avg_flops = total_flops/10

    round(score(avg_f1, avg_runtime, avg_flops), 2)