In [1]:
!pip install transformers==4.46.2 datasets==3.1.0 torch==2.5.1+cu121 sklearn-pandas==2.2.0 numpy==1.26.4 fvcore==0.1.5.post20221221

Collecting transformers==4.46.2
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==3.1.0
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting fvcore==0.1.5.post20221221
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.1.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==3.1.0)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from data

In [None]:
# !pip list | grep fvcore

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score
from fvcore.nn import FlopCountAnalysis
import numpy as np
import pandas as pd
import datasets
import torch
import time
import os

The WANDB_API_KEY is nessesary to use the Trainer class from HuggingFace and is what's used in the example notebook. However, I think we may all have to put in our own api key secret into google colab for it to work.

In [4]:
from google.colab import userdata
os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
langs = ['java', 'python', 'pharo']

# langs = ['java'] # Using Java as the only language for testing purposes.
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = datasets.load_dataset('NLBSE/nlbse25-code-comment-classification')

training = True

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

In [7]:
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [8]:
ds['java_test'][0]

{'index': 5,
 'class': 'AbstractContractGetFileStatusTest.java',
 'comment_sentence': 'accept everything.',
 'partition': 1,
 'combo': 'accept everything. | AbstractContractGetFileStatusTest.java',
 'labels': [0, 0, 1, 0, 0, 0, 0]}

In [9]:
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The next few functions are to preprocess the trainng and validation sets, and to let the Trainer class to evaluate how well the trainng is going after each epoch.

In [10]:
# To tokenize the text in the 'combo' column of the training dataset of each language.

def tokenize_dataset(examples):
    return tokenizer(examples['combo'], truncation=True, padding="max_length", max_length=128)

In [11]:
# Tokenizes the text 'combo' column of the dataset, changes the values of the
# labels column to float instead of int, and sets to format of the lables column
# to torch tesnsors, which is required by the Trainer class.

def preprocess_dataset(input_dataset):
  processed_dataset = input_dataset.map(tokenize_dataset, batched=True, load_from_cache_file=False)
  processed_dataset = processed_dataset.cast_column("labels", datasets.features.Sequence(datasets.features.Value("float32")))
  processed_dataset.set_format('pt')
  return processed_dataset

Evaluating the predictions.

Below is non-functional code I still haven't finnished. We need to implement a way for the logits returned from the model to be converted into binary (0 - 1) predictions for the labels, and for those labels to be compaired to the ground truth labels of the testing datasets for each language. The competition also wants us to complete this step using profiling that they have set up, and which is partially shown below. The link to the reference notebook for the competition is [here](https://colab.research.google.com/drive/1RULzasjO_nrqiXLrGze-PznFlHKtGQ4s?usp=sharing). Also, another notebook that is fairly close to what I believe we are trying to do can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb).

In [12]:
# scores must be global so that it can later be referenced for printing the results
scores = pd.DataFrame(columns=['lan', 'cat', 'precision', 'recall', 'f1'])

In [13]:
def compute_metrics(eval_pred, lang, categories):
    global scores
    scores.drop(scores[scores['lan'] == lang].index, inplace=True)

    predictions, true_labels = eval_pred
    predictions = (predictions > 0.5).astype(int)
    num_classes = len(categories)

    metrics_list = []

    for i in range(num_classes):
        y_pred = predictions[:, i]
        y_true = true_labels[:, i]

        tp = np.sum((y_true == 1) & (y_pred == 1))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics_list.append({
            'lan': lang,
            'cat': categories[i],
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

    temp_scores = pd.DataFrame(metrics_list)
    scores = pd.concat([scores, temp_scores], ignore_index=True)

    avg_f1 = temp_scores['f1'].mean()
    avg_precision = temp_scores['precision'].mean()
    avg_recall = temp_scores['recall'].mean()

    return {'f1': avg_f1, 'precision': avg_precision, 'recall': avg_recall}

In [14]:
def compute_metrics_validate(eval_pred, lang, categories):

    predictions, true_labels = eval_pred
    predictions = (predictions > 0.5).astype(int)
    num_classes = len(categories)

    metrics_list = []

    for i in range(num_classes):
        y_pred = predictions[:, i]
        y_true = true_labels[:, i]

        tp = np.sum((y_true == 1) & (y_pred == 1))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics_list.append({
            'lan': lang,
            'cat': categories[i],
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

    temp_scores = pd.DataFrame(metrics_list)

    avg_f1 = temp_scores['f1'].mean()
    avg_precision = temp_scores['precision'].mean()
    avg_recall = temp_scores['recall'].mean()

    return {'f1': avg_f1, 'precision': avg_precision, 'recall': avg_recall}

In [15]:
def measure_runtime_and_flops(trainer, test_dataset):
  print(f' \n** Evaluating runtime and flops for {lang} test set. **\n')
  total_time = 0
  total_flops = 0

  for i in range(10):
    for batch in test_dataset:
        inputs = {key: val.unsqueeze(0).to(trainer.model.device) for key, val in batch.items() if key in ['input_ids', 'attention_mask']}
        with torch.profiler.profile(with_flops=True, activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]) as prof:
            start_time = time.time()
            _ = trainer.model(**inputs)
            end_time = time.time()

        total_time += (end_time - start_time)
        total_flops += sum(k.flops for k in prof.key_averages()) / 1e9

    avg_runtime = total_time / 10
    avg_flops = total_flops / 10
    return avg_runtime, avg_flops, total_flops, total_time

In [16]:
# Wrapper is needed because model can only take eval_pred for compute_metrics, but lang and labels are necessary for output
def compute_metrics_wrapper(eval_pred):
  if training:
    return compute_metrics_validate(eval_pred, lang, labels[lang])
  else:
    print(f'\n ** Evaluating metrics for {lang} test set. ** \n')
    return compute_metrics(eval_pred, lang, labels[lang])


In [17]:
hyperparams = {
    'java': {
        'epochs': 20,
        'lr': 5e-5,
        'weight_decay': 0.01,
        'batch_size': 8
    },
    'python': {
        'epochs': 20,
        'lr': 5e-5,
        'weight_decay': 0.01,
        'batch_size': 8
    },
    'pharo': {
        'epochs': 20,
        'lr': 5e-5,
        'weight_decay': 0.01,
        'batch_size': 8
    }
}

In [18]:
total_flops = 0
total_time = 0
metrics_dict = {}

# Main Training Loop
for lang in langs:

    training = True

    num_labels = len(labels[lang])
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    dataset = preprocess_dataset(ds[f'{lang}_train'])
    test_dataset = preprocess_dataset(ds[f'{lang}_test'])

    train_val_split = dataset.train_test_split(test_size=0.2, seed=27)
    train_dataset = train_val_split['train']
    validation_dataset = train_val_split['test']

    training_args = TrainingArguments(
        output_dir=f'./results_{lang}',
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f'./logs_{lang}',
        per_device_train_batch_size=hyperparams[lang]['batch_size'],
        per_device_eval_batch_size=hyperparams[lang]['batch_size'],
        num_train_epochs=hyperparams[lang]['epochs'],
        weight_decay=hyperparams[lang]['weight_decay'],
        learning_rate=hyperparams[lang]['lr'],
        logging_steps=10,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        seed=27
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics_wrapper,
    )

    print(f'\n** Training {lang} model. **\n')
    trainer.train()
    trainer.evaluate()

    training = False

    test_trainer = Trainer(
        model=trainer.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics_wrapper,
    )

    test_trainer.evaluate()

    avg_runtime, avg_flops, total_flops, total_time = measure_runtime_and_flops(test_trainer, test_dataset)

    metrics_dict[lang] = {
        'avg_runtime': avg_runtime,
        'avg_flops': avg_flops,
        'total_flops': total_flops,
        'total_time': total_time
    }

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7614 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1725 [00:00<?, ? examples/s]




** Training java model. **



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjhegartydrafting[0m ([33mjhegartydrafting-colorado-state-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.0799,0.107004,0.660209,0.676564,0.645816
2,0.0658,0.07747,0.81544,0.868526,0.778412
3,0.0681,0.094319,0.836832,0.863544,0.813449
4,0.0069,0.10126,0.859357,0.90118,0.82607
5,0.0064,0.110703,0.825567,0.852982,0.802278
6,0.0053,0.113511,0.85145,0.876829,0.829869
7,0.0154,0.11878,0.851013,0.888679,0.827004
8,0.0116,0.137463,0.849689,0.884586,0.825432
9,0.002,0.147468,0.851478,0.848551,0.85967
10,0.0009,0.141837,0.858746,0.872011,0.848818



 ** Evaluating metrics for java test set. ** 

 
** Evaluating runtime and flops for java test set. **



  scores = pd.concat([scores, temp_scores], ignore_index=True)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1884 [00:00<?, ? examples/s]

Map:   0%|          | 0/406 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/406 [00:00<?, ? examples/s]


** Training python model. **



Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3687,0.355785,0.299211,0.55487,0.253492
2,0.2894,0.304751,0.56516,0.832371,0.457645
3,0.1596,0.330568,0.593131,0.818109,0.548481
4,0.1197,0.355309,0.686322,0.736592,0.650651
5,0.1116,0.373666,0.691917,0.728655,0.684211
6,0.0493,0.390442,0.724131,0.762269,0.696417
7,0.0231,0.430667,0.718768,0.751187,0.697528
8,0.0273,0.42539,0.730551,0.763755,0.703783
9,0.0138,0.487791,0.72144,0.75922,0.690643
10,0.03,0.464477,0.730703,0.771713,0.698659



 ** Evaluating metrics for python test set. ** 

 
** Evaluating runtime and flops for python test set. **



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1298 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1298 [00:00<?, ? examples/s]

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/289 [00:00<?, ? examples/s]


** Training pharo model. **



Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.2827,0.286494,0.196784,0.37614,0.163384
2,0.2304,0.225211,0.460665,0.612485,0.379572
3,0.1471,0.224179,0.490177,0.595676,0.428364
4,0.0912,0.203777,0.635372,0.814955,0.557491
5,0.0732,0.216843,0.632299,0.845316,0.555604
6,0.0448,0.224984,0.61966,0.82022,0.560091
7,0.0288,0.253507,0.626361,0.818251,0.568126
8,0.0237,0.247107,0.667984,0.784686,0.63498
9,0.0226,0.257101,0.656699,0.813992,0.601905
10,0.0105,0.253498,0.681161,0.819165,0.617008



 ** Evaluating metrics for pharo test set. ** 

 
** Evaluating runtime and flops for pharo test set. **



In [19]:
scores

Unnamed: 0,lan,cat,precision,recall,f1
0,java,summary,0.877419,0.914798,0.895719
1,java,Ownership,1.0,1.0,1.0
2,java,Expand,0.520548,0.372549,0.434286
3,java,usage,0.922481,0.828306,0.872861
4,java,Pointer,0.769565,0.961957,0.855072
5,java,deprecation,0.9,0.6,0.72
6,java,rational,0.314286,0.323529,0.318841
7,python,Usage,0.782178,0.652893,0.711712
8,python,Parameters,0.816,0.796875,0.806324
9,python,DevelopmentNotes,0.4,0.292683,0.338028


In [21]:
try:
  scores.to_csv(f'/content/drive/MyDrive/CS_440_Experiment_Results/distilbert_scores.csv')
  print('Saved scores.')
except:
  print('Could not save scores.')

Saved scores.


In [22]:
for lang in metrics_dict:
  print(f'{lang}: {metrics_dict[lang]}')

java: {'avg_runtime': 1.822890329360962, 'avg_flops': 1875.7829952000163, 'total_flops': 18757.829952000164, 'total_time': 18.22890329360962}
python: {'avg_runtime': 0.428382420539856, 'avg_flops': 441.48851066880087, 'total_flops': 4414.885106688009, 'total_time': 4.28382420539856}
pharo: {'avg_runtime': 0.3034268856048584, 'avg_flops': 314.26161484799815, 'total_flops': 3142.6161484799813, 'total_time': 3.034268856048584}


In [23]:
total_flops = 0
total_time = 0
avg_runtime = 0
avg_flops = 0

for lang in metrics_dict:
  total_flops += metrics_dict[lang]['total_flops']
  total_time += metrics_dict[lang]['total_time']
  avg_runtime += metrics_dict[lang]['avg_runtime']
  avg_flops += metrics_dict[lang]['avg_flops']

In [28]:
print(f'Total Flops: {total_flops}')
print(f'Total Time: {total_time}')
print(f'Average Flops: {avg_flops}')
print(f'Average Time: {avg_runtime}')

print()
for lang in langs:
  f1_ave_lang = scores[scores['lan'] == lang]['f1'].mean()
  print(f'Average F1 for {lang}: {f1_ave_lang}')

  precision_ave_lang = scores[scores['lan'] == lang]['precision'].mean()
  print(f'Average Precision for {lang}: {precision_ave_lang}')

  recall_ave_lang = scores[scores['lan'] == lang]['recall'].mean()
  print(f'Average Recall for {lang}: {recall_ave_lang}')

print()
f1_mean = scores['f1'].mean()
print(f'Average F1 for across all languages: {f1_mean}')
precision_mean = scores['precision'].mean()
print(f'Average Precision for across all languages: {precision_mean}')
recall_mean = scores['recall'].mean()
print(f'Average Recall for across all languages: {recall_mean}')

Total Flops: 26315.331207168154
Total Time: 25.546996355056763
Average Flops: 2631.5331207168156
Average Time: 2.5546996355056764

Average F1 for java: 0.728111197654506
Average Precision for java: 0.7577569788394637
Average Recall for java: 0.7144484891272666
Average F1 for python: 0.5963488595575148
Average Precision for python: 0.6306583222121956
Average Recall for python: 0.5683833904454747
Average F1 for pharo: 0.634779850365692
Average Precision for pharo: 0.6767557447695799
Average Recall for pharo: 0.609101601284521

Average F1 for across all languages: 0.6590516649436294
Average Precision for across all languages: 0.6944674040170675
Average Recall for across all languages: 0.6371982939531521


In [27]:
max_avg_runtime = 5
max_avg_flops = 5000

# s𝑢𝑏𝑚𝑖𝑠𝑠𝑖𝑜𝑛_𝑠𝑐𝑜𝑟𝑒(𝑚𝑜𝑑𝑒𝑙)=(𝑎𝑣𝑔. 𝐹1)×0.60+((𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)/𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)×0.2+((𝑚𝑎𝑥_GFLOPs−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_GFLOPs)/𝑚𝑎𝑥_GFLOPs)×0.2
def score(avg_f1, avg_runtime, avg_flops):
    return (0.6 * avg_f1 +
      0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime) +
      0.2 * ((max_avg_flops - avg_flops) / max_avg_flops))

avg_f1 = scores['f1'].mean()
print(f'Average F1: {avg_f1}')

print(f'score: {round(score(avg_f1, avg_runtime, avg_flops), 2)}')

Average F1: 0.6590516649436294
score: 0.59
