In [None]:
# !pip install transformers==4.46.2 datasets==3.1.0 numpy==1.26.4 sklearn-pandas==2.2.0 torch==2.5.1+cu121

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import numpy as np
import datasets
import time
import wandb
import random
# import os
# from fvcore.nn import FlopCountAnalysis
# from torch.utils.data import DataLoader
import torch

key_file = r'C:\Development\TactitalTensorsFinalProject\WANDB_API_KEY.txt' 

with open(key_file, "r") as f:
    api_key = f.read().strip()

# Log into WandB with the API key
wandb.login(key=api_key)

# Initialize WandB (no need to manually set the WANDB_API_KEY env variable again)


  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mayoungren94[0m ([33mayoungren-colostate[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayoun\_netrc


True

The WANDB_API_KEY is nessesary to use the Trainer class from HuggingFace and is what's used in the example notebook. However, I think we may all have to put in our own api key secret into google colab for it to work.

In [2]:
# from google.colab import userdata
# os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')


In [2]:
langs = ['java', 'python', 'pharo']

# langs = ['java'] # Using Java as the only language for testing purposes.
# langs = ['python']
# langs = ['pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = datasets.load_dataset('NLBSE/nlbse25-code-comment-classification')

In [3]:
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [4]:
ds['java_test'][0]

{'index': 5,
 'class': 'AbstractContractGetFileStatusTest.java',
 'comment_sentence': 'accept everything.',
 'partition': 1,
 'combo': 'accept everything. | AbstractContractGetFileStatusTest.java',
 'labels': [0, 0, 1, 0, 0, 0, 0]}

In [5]:
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The next few functions are to preprocess the trainng and validation sets, and to let the Trainer class to evaluate how well the trainng is going after each epoch.

In [6]:
# To tokenize the text in the 'combo' column of the training dataset of each language.

def tokenize_dataset(examples):
  return tokenizer(examples['combo'], truncation=True, padding="max_length", max_length=128)

In [7]:
# Tokenizes the text 'combo' column of the dataset, changes the values of the
# labels column to float instead of int, and sets to format of the lables column
# to torch tesnsors, which is required by the Trainer class.

def preprocess_dataset(input_dataset):
  processed_dataset = input_dataset.map(tokenize_dataset, batched=True, load_from_cache_file=False)
  processed_dataset = processed_dataset.cast_column("labels", datasets.features.Sequence(datasets.features.Value("float32")))
  processed_dataset.set_format('pt')
  return processed_dataset

Evaluating the predictions.

Below is non-functional code I still haven't finnished. We need to implement a way for the logits returned from the model to be converted into binary (0 - 1) predictions for the labels, and for those labels to be compaired to the ground truth labels of the testing datasets for each language. The competition also wants us to complete this step using profiling that they have set up, and which is partially shown below. The link to the reference notebook for the competition is here. Also, another notebook that is fairly close to what I believe we are trying to do can be found here_for_multi_label_text_classification.ipynb).

In [8]:
# scores must be global so that it can later be referenced for printing the results
scores = pd.DataFrame(columns=['model', 'lan', 'cat', 'precision', 'recall', 'f1'])

def compute_metrics(eval_pred, lang, categories, mod_name):
    global scores
    #scores.drop(scores[scores['lan'] == lang].index, inplace=True)

    predictions, true_labels = eval_pred
    predictions = (predictions > 0.5).astype(int)
    num_classes = len(categories)

    metrics_list = []

    for i in range(num_classes):
        y_pred = predictions[:, i]
        y_true = true_labels[:, i]

        tp = np.sum((y_true == 1) & (y_pred == 1))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics_list.append({
            'model': mod_name,
            'lan': lang,
            'cat': categories[i],
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

    temp_scores = pd.DataFrame([m for m in metrics_list if m['model'] == mod_name])
    scores = pd.concat([scores, temp_scores], ignore_index=True)

    avg_f1 = temp_scores['f1'].mean()
    avg_precision = temp_scores['precision'].mean()
    avg_recall = temp_scores['recall'].mean()

    return {'f1': avg_f1, 'precision': avg_precision, 'recall': avg_recall}

In [9]:
def measure_runtime_and_flops(total_time, total_flops, trainer, validation_dataset):

    for batch in validation_dataset:
        inputs = {key: val.unsqueeze(0).to(trainer.model.device) for key, val in batch.items() if key in ['input_ids', 'attention_mask']}
        with torch.profiler.profile(with_flops=True, activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]) as prof:
            start_time = time.time()
            _ = trainer.model(**inputs)
            end_time = time.time()

        total_time += (end_time - start_time)
        total_flops += sum(k.flops for k in prof.key_averages()) / 1e9 # Convert Flops to GFLOPs

    avg_runtime = total_time / len(validation_dataset)
    avg_flops = (total_flops / len(validation_dataset))
    
    return avg_runtime, avg_flops, total_flops, total_time

In [10]:
def compute_metrics_wrapper(mod_name):
    def inner(eval_pred):
        return compute_metrics(eval_pred, lang, labels[lang], mod_name)
    return inner


In [11]:
max_avg_runtime = 5
max_avg_flops = 5000

def score(avg_f1, avg_runtime, avg_flops):
    return (0.6 * avg_f1 +
            0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime) +
            0.2 * ((max_avg_flops - avg_flops) / max_avg_flops))

In [12]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for CUDA devices
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Disable to ensure deterministic operations

In [None]:
# Training the models for each language and saving the the models and thier
# tokenizers after training.

# trying different pre-trained models such as roberta-base, distilbert-base, or codebert-large

seed = 44
set_seed(seed)

# learning_rates = [5e-5, 5e-7]
lr = 5e-5
epochs = [5, 10, 15, 20]
batch_sizes = [4, 8]
weights_of_decay = [0.01] #, 0.001]
final_model_stats = {}
lang_model_stats = {}


for epoch in epochs:
  for bs in batch_sizes:
    for wd in weights_of_decay:
      print(f'------------------ Starting model ==> epochs: {epoch}, batch size: {bs}, weights of decay: {wd} ---------------------')
      fin_mod_name = f'epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}'
      total_flops = 0
      total_time = 0
      total_avg_runtime = 0
      total_avg_flops = 0

      for lang in langs:

        lang_mod_name = f'{lang}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}'
        wandb.init(
            project="NBSE2025GridSearch2", 
            entity="ayoungren-colostate",
            name=lang_mod_name,
            config={  # Log hyperparameters for each run
                "epochs": epoch,
                "batch_size": bs,
                "weight_decay": wd,
                "language": lang
            }
        )
        
        num_labels = len(labels[lang])
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, problem_type="multi_label_classification")

        dataset = preprocess_dataset(ds[f'{lang}_train'])
        train_valalidation_split = dataset.train_test_split(test_size=0.2, seed=seed)

        train_dataset =  train_valalidation_split['train']
        validation_dataset = train_valalidation_split['test']

        training_args = TrainingArguments(
          output_dir=f'./results_{lang_mod_name}',
          eval_strategy="epoch",
          save_strategy="epoch",
          logging_dir=f'./logs_{lang_mod_name}',
          per_device_train_batch_size=bs,
          per_device_eval_batch_size=bs,
          num_train_epochs=epoch,
          weight_decay=wd,
          learning_rate=lr,
          logging_steps=10000,
          save_total_limit=2,
          load_best_model_at_end=True,
          metric_for_best_model="f1",
          seed=seed
        )

        trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=train_dataset,
          eval_dataset= validation_dataset,
          processing_class=tokenizer,
          compute_metrics=compute_metrics_wrapper(fin_mod_name),
        )

        trainer.train()
        metrics = trainer.evaluate()

        # print("Evaluation Metrics:", metrics)

        avg_runtime, avg_flops, total_flops, total_time = measure_runtime_and_flops(total_time, total_flops, trainer, validation_dataset)

        #store each model and succession score by individual language
        lang_mod_scores = scores[(scores['model'] == fin_mod_name) & (scores['lan'] == lang)]
        lang_avg_f1 = lang_mod_scores['f1'].mean()
        lang_model_stats[lang_mod_name] = round(score(lang_avg_f1, avg_runtime, avg_flops), 2)

        total_avg_runtime += avg_runtime
        total_avg_flops += avg_flops

        trainer.model.save_pretrained(f'./models/{lang_mod_name}')
        tokenizer.save_pretrained(f'./tokenizers/{lang_mod_name}')

      # store combined average succession score of all languages trained on. (Average Combined score of Java, Python, and Pharo)
      model_stats = scores[scores['model'] == fin_mod_name]
      fin_avg_f1 = model_stats['f1'].mean()
      final_model_stats[fin_mod_name] = round(score(fin_avg_f1, total_avg_runtime, total_avg_flops), 2)

------------------ Starting model ==> epochs: 5, batch size: 4, weights of decay: 0.01 ---------------------


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 18683.97 examples/s]
 20%|██        | 1523/7615 [01:35<06:34, 15.45it/s]

Evaluating the predictions.

In [15]:
scores

Unnamed: 0,model,lan,cat,precision,recall,f1
0,java_epoch-25_batchSize-8_weightsOfDecay-0.001,java,summary,0.949224,0.957326,0.953258
1,java_epoch-25_batchSize-8_weightsOfDecay-0.001,java,Ownership,0.964912,1.0,0.982143
2,java_epoch-25_batchSize-8_weightsOfDecay-0.001,java,Expand,0.681818,0.735294,0.707547
3,java_epoch-25_batchSize-8_weightsOfDecay-0.001,java,usage,0.964953,0.94508,0.954913
4,java_epoch-25_batchSize-8_weightsOfDecay-0.001,java,Pointer,0.988506,0.910053,0.947658
5,java_epoch-25_batchSize-8_weightsOfDecay-0.001,java,deprecation,0.933333,0.933333,0.933333
6,java_epoch-25_batchSize-8_weightsOfDecay-0.001,java,rational,0.733333,0.611111,0.666667
7,python_epoch-25_batchSize-8_weightsOfDecay-0.001,python,Usage,0.883929,0.818182,0.849785
8,python_epoch-25_batchSize-8_weightsOfDecay-0.001,python,Parameters,0.85,0.910714,0.87931
9,python_epoch-25_batchSize-8_weightsOfDecay-0.001,python,DevelopmentNotes,0.783784,0.725,0.753247


In [None]:
lang_model_stats

In [16]:
final_model_stats

{'epoch-10_batchSize-4_weightsOfDecay-0.01': nan,
 'epoch-10_batchSize-4_weightsOfDecay-0.001': nan,
 'epoch-10_batchSize-8_weightsOfDecay-0.01': nan,
 'epoch-10_batchSize-8_weightsOfDecay-0.001': nan,
 'epoch-15_batchSize-4_weightsOfDecay-0.01': nan,
 'epoch-15_batchSize-4_weightsOfDecay-0.001': nan,
 'epoch-15_batchSize-8_weightsOfDecay-0.01': nan,
 'epoch-15_batchSize-8_weightsOfDecay-0.001': nan,
 'epoch-20_batchSize-4_weightsOfDecay-0.01': nan,
 'epoch-20_batchSize-4_weightsOfDecay-0.001': nan,
 'epoch-20_batchSize-8_weightsOfDecay-0.01': nan,
 'epoch-20_batchSize-8_weightsOfDecay-0.001': nan,
 'epoch-25_batchSize-4_weightsOfDecay-0.01': nan,
 'epoch-25_batchSize-4_weightsOfDecay-0.001': nan,
 'epoch-25_batchSize-8_weightsOfDecay-0.01': nan,
 'epoch-25_batchSize-8_weightsOfDecay-0.001': nan}